[pocl] 02/02: Imported Upstream version 0.13

Andreas Beckmann anbe at moszumanska.debian.org
Mon Apr 25 09:11:37 UTC 2016


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch upstream
in repository pocl.

commit d916dbafead0035f3e9f2e926e061dacb7cc1cc9
Author: Andreas Beckmann <anbe at debian.org>
Date:   Fri Apr 22 22:48:21 2016 +0200

    Imported Upstream version 0.13
---
 .bzrignore                                         |    52 +
 .gitattributes                                     |    41 +
 .gitignore                                         |    95 +
 .mailmap                                           |    44 +
 CHANGES                                            |    38 +
 CMakeLists.txt                                     |   482 +-
 CREDITS                                            |     1 +
 INSTALL                                            |   430 +-
 Makefile.am                                        |     4 +-
 Makefile.in                                        |    11 +-
 README.Cell                                        |    94 -
 README.FreeBSD                                     |     6 +
 TODO.piglit                                        |    29 +
 aclocal.m4                                         |   211 +-
 android/CLONE_POCL_PREBUILTS_HERE                  |     1 +
 android/build-arm.sh                               |   161 +
 autogen.sh                                         |  1578 ++
 cmake/LLVM.cmake                                   |   207 +-
 cmake/bitcode_rules.cmake                          |    70 +-
 cmake/kernellib_hash.cmake                         |    30 +-
 cmake/run_test.cmake                               |    79 +-
 config.h.in                                        |    82 +-
 config.h.in.cmake                                  |    51 +-
 configure                                          |   586 +-
 configure.ac                                       |   222 +-
 doc/luxmark.txt                                    |    13 +
 doc/sphinx/source/conf.py                          |     4 +-
 doc/sphinx/source/env_variables.rst                |    55 +-
 doc/sphinx/source/faq.rst                          |     7 +-
 doc/sphinx/source/features.rst                     |    18 +
 doc/sphinx/source/hsa.rst                          |    65 +-
 doc/sphinx/source/index.rst                        |     2 +-
 doc/sphinx/source/install.rst                      |   139 +
 doc/sphinx/source/using.rst                        |     2 +
 examples/AMD/CMakeLists.txt                        |   115 +
 examples/AMD/Makefile.in                           |     5 +-
 examples/AMDSDK2.9/CMakeLists.txt                  |   123 +
 examples/AMDSDK2.9/Makefile.in                     |     5 +-
 examples/AMDSDK3.0/CMakeLists.txt                  |   152 +
 examples/AMDSDK3.0/Makefile.am                     |   417 +
 examples/AMDSDK3.0/Makefile.in                     |   939 ++
 examples/AMDSDK3.0/amdsdk3_0.patch                 |    50 +
 examples/ASL/CMakeLists.txt                        |   188 +
 examples/CMakeLists.txt                            |    78 +-
 examples/CloverLeaf/CMakeLists.txt                 |    82 +
 examples/CloverLeaf/Makefile.in                    |     5 +-
 .../{example2 => EinsteinToolkit}/CMakeLists.txt   |    24 +-
 examples/EinsteinToolkit/Makefile.in               |     5 +-
 examples/Halide/CMakeLists.txt                     |  1076 ++
 examples/Halide/Makefile.in                        |     5 +-
 examples/IntelSVM/CMakeLists.txt                   |    68 +
 examples/IntelSVM/Makefile.am                      |    70 +
 examples/{piglit => IntelSVM}/Makefile.in          |    87 +-
 examples/IntelSVM/README                           |    18 +
 .../intelsvm_CMakeLists.txt}                       |    26 +-
 examples/Makefile.am                               |     9 +-
 examples/Makefile.in                               |    19 +-
 examples/OpenCV/CMakeLists.txt                     |   332 +
 examples/OpenCV/Makefile.in                        |     5 +-
 examples/OpenCV/opencv.patch                       |    13 +
 examples/Parboil/CMakeLists.txt                    |   149 +
 examples/Parboil/Makefile.in                       |     5 +-
 examples/PyOpenCL/README                           |     2 +-
 examples/Rodinia/CMakeLists.txt                    |   115 +
 examples/Rodinia/Makefile.in                       |     5 +-
 examples/Rodinia/Rodinia.patch                     |    29 +-
 examples/VexCL/CMakeLists.txt                      |   113 +
 examples/VexCL/Makefile.in                         |     5 +-
 examples/ViennaCL/CMakeLists.txt                   |   298 +
 examples/ViennaCL/Makefile.in                      |     5 +-
 examples/ViennaCL/vienna_170.patch                 |    11 +
 examples/arrayfire/CMakeLists.txt                  |   697 +
 examples/clBLAS/CMakeLists.txt                     |   266 +
 examples/clFFT/CMakeLists.txt                      |    87 +
 examples/example1-spir32/CMakeLists.txt            |     2 +-
 examples/example1-spir32/Makefile.in               |     5 +-
 examples/example1-spir32/example1.c                |     8 +-
 examples/example1-spir32/generate_spir32.sh        |    10 +
 examples/example1-spir64/CMakeLists.txt            |     2 +-
 examples/example1-spir64/Makefile.in               |     5 +-
 examples/example1-spir64/example1.c                |     8 +-
 examples/example1-spir64/generate_spir.sh          |    10 +
 examples/example1/CMakeLists.txt                   |     6 +-
 examples/example1/Makefile.in                      |     5 +-
 examples/example2/CMakeLists.txt                   |     6 +-
 examples/example2/Makefile.in                      |     5 +-
 examples/example2a/CMakeLists.txt                  |     6 +-
 examples/example2a/Makefile.in                     |     5 +-
 examples/opencl-book-samples/CMakeLists.txt        |   105 +
 examples/opencl-book-samples/Makefile.in           |     5 +-
 examples/piglit/CMakeLists.txt                     |   220 +
 examples/piglit/Makefile.in                        |     5 +-
 examples/pocl-android-sample/.cproject             |    86 +
 examples/pocl-android-sample/.project              |    49 +
 examples/pocl-android-sample/AndroidManifest.xml   |    28 +
 examples/pocl-android-sample/jni/Android.mk        |    12 +
 examples/pocl-android-sample/jni/Application.mk    |     2 +
 .../jni/CLONE_LIBOPENCL_STUB_HERE                  |     1 +
 examples/pocl-android-sample/jni/vectorAdd.cpp     |   137 +
 examples/pocl-android-sample/jni/vectorAdd.h       |    49 +
 examples/pocl-android-sample/project.properties    |    14 +
 .../res/drawable-hdpi/ic_launcher.png              |   Bin 0 -> 5319 bytes
 .../res/drawable-mdpi/ic_launcher.png              |   Bin 0 -> 3228 bytes
 .../res/drawable-xhdpi/ic_launcher.png             |   Bin 0 -> 7224 bytes
 .../res/drawable-xxhdpi/ic_launcher.png            |   Bin 0 -> 11609 bytes
 .../pocl-android-sample/res/values-v11/styles.xml  |    11 +
 .../pocl-android-sample/res/values-v14/styles.xml  |    12 +
 .../res/values-w820dp/dimens.xml                   |    10 +
 examples/pocl-android-sample/res/values/dimens.xml |     7 +
 .../pocl-android-sample/res/values/strings.xml     |     6 +
 examples/pocl-android-sample/res/values/styles.xml |    20 +
 .../src/org/pocl/sample1/MainActivity.java         |   101 +
 examples/scalarwave/CMakeLists.txt                 |     6 +-
 examples/scalarwave/Makefile.in                    |     5 +-
 examples/standalone/CMakeLists.txt                 |     6 +
 examples/standalone/Makefile.in                    |     5 +-
 examples/trig/Makefile.in                          |     5 +-
 examples/trig/trig.c                               |     6 +-
 fix-include/CL/cl_platform.h                       |     4 +
 include/CL/Makefile.in                             |     5 +-
 include/CL/cl.h                                    |   391 +-
 include/CL/cl_ext.h                                |   115 +-
 include/CL/cl_gl.h                                 |     4 +-
 include/CL/cl_gl_ext.h                             |     2 +-
 include/CL/cl_platform.h                           |   411 +-
 include/CMakeLists.txt                             |     2 +-
 include/Makefile.am                                |     2 +-
 include/Makefile.in                                |     7 +-
 include/OpenCL/Makefile.in                         |     5 +-
 include/_kernel.h                                  |   196 +-
 include/_kernel_c.h                                |    11 +-
 include/pocl.h                                     |    62 +-
 include/pocl_cache.h                               |     6 +-
 include/pocl_features.h                            |    37 -
 include/pocl_types.h                               |     2 -
 include/vccompat.hpp                               |     3 +
 lib/CL/CMakeLists.txt                              |    23 +-
 lib/CL/Makefile.am                                 |    24 +-
 lib/CL/Makefile.in                                 |   296 +-
 lib/CL/clBuildProgram.c                            |    64 +-
 lib/CL/clCreateBuffer.c                            |     7 +-
 lib/CL/clCreateCommandQueue.c                      |     8 +-
 lib/CL/clCreateCommandQueueWithProperties.c        |   107 +
 lib/CL/clCreateContext.c                           |     2 +
 lib/CL/clCreateContextFromType.c                   |     5 +-
 lib/CL/clCreateKernel.c                            |     8 +-
 lib/CL/clCreateKernelsInProgram.c                  |     6 +-
 lib/CL/clCreateProgramWithBinary.c                 |     3 +
 lib/CL/clCreateProgramWithSource.c                 |     3 +
 lib/CL/clEnqueueCopyBuffer.c                       |     2 +-
 lib/CL/clEnqueueCopyBufferRect.c                   |   101 +-
 lib/CL/clEnqueueCopyImage.c                        |    85 +-
 lib/CL/clEnqueueFillBuffer.c                       |   108 +
 lib/CL/clEnqueueMapBuffer.c                        |     8 +-
 lib/CL/clEnqueueMapImage.c                         |     2 +-
 lib/CL/clEnqueueNDRangeKernel.c                    |    14 +-
 lib/CL/clEnqueueReadBufferRect.c                   |     2 +-
 lib/CL/clEnqueueSVMFree.c                          |    86 +
 lib/CL/clEnqueueSVMMap.c                           |    91 +
 lib/CL/clEnqueueSVMMemFill.c                       |    93 +
 lib/CL/clEnqueueSVMMemcpy.c                        |    83 +
 lib/CL/clEnqueueSVMUnmap.c                         |    78 +
 lib/CL/clEnqueueUnmapMemObject.c                   |     2 +-
 lib/CL/clEnqueueWriteBuffer.c                      |     2 +-
 lib/CL/clFinish.c                                  |    83 +
 lib/CL/clGetDeviceInfo.c                           |    41 +-
 lib/CL/clGetPlatformIDs.c                          |   192 +-
 lib/CL/clGetPlatformInfo.c                         |     3 +-
 lib/CL/clGetProgramBuildInfo.c                     |     4 +-
 lib/CL/clGetProgramInfo.c                          |    74 +-
 lib/CL/clReleaseCommandQueue.c                     |     7 +-
 lib/CL/clReleaseContext.c                          |     7 +-
 lib/CL/clReleaseMemObject.c                        |    23 +-
 lib/CL/clReleaseProgram.c                          |     6 +-
 lib/CL/clSVMAlloc.c                                |   110 +
 lib/{kernel/hsail64/log1p.cl => CL/clSVMFree.c}    |    30 +-
 lib/CL/clSetKernelArgSVMPointer.c                  |    59 +
 .../{clReleaseContext.c => clSetKernelExecInfo.c}  |    50 +-
 lib/CL/devices/CMakeLists.txt                      |     7 -
 lib/CL/devices/Makefile.am                         |     5 -
 lib/CL/devices/Makefile.in                         |    20 +-
 lib/CL/devices/basic/Makefile.in                   |     5 +-
 lib/CL/devices/basic/basic.c                       |   294 +-
 lib/CL/devices/bufalloc.c                          |    76 +-
 lib/CL/devices/bufalloc.h                          |    46 +-
 lib/CL/devices/cellspu/Makefile.am                 |    32 -
 lib/CL/devices/cellspu/Makefile.in                 |   713 -
 lib/CL/devices/cellspu/cellspu.c                   |   649 -
 lib/CL/devices/cellspu/cellspu.h                   |    61 -
 lib/CL/devices/common.c                            |   156 +
 lib/CL/devices/common.h                            |    21 +
 lib/CL/devices/devices.c                           |    10 +-
 lib/CL/devices/hsa/Makefile.in                     |     5 +-
 lib/CL/devices/hsa/pocl-hsa.c                      |   290 +-
 lib/CL/devices/prototypes.inc                      |     8 +-
 lib/CL/devices/pthread/Makefile.in                 |     5 +-
 lib/CL/devices/pthread/pthread.c                   |   254 +-
 lib/CL/devices/tce/CMakeLists.txt                  |     7 +-
 lib/CL/devices/tce/Makefile.in                     |     5 +-
 lib/CL/devices/tce/tce_common.cc                   |    29 +-
 lib/CL/devices/tce/tta_device_main_dthread.c       |   221 +
 lib/CL/devices/tce/ttasim/CMakeLists.txt           |     3 +-
 lib/CL/devices/tce/ttasim/Makefile.in              |     5 +-
 lib/CL/devices/tce/ttasim/todo.txt                 |    12 +
 lib/CL/devices/tce/ttasim/tta.txt                  |    36 +
 lib/CL/devices/tce/ttasim/ttasim.cc                |     8 +-
 lib/CL/devices/topology/Makefile.in                |     5 +-
 lib/CL/dummy.c                                     |     1 +
 lib/CL/pocl_cache.c                                |    42 +-
 lib/CL/pocl_cl.h                                   |    60 +-
 lib/CL/pocl_debug.c                                |    92 +-
 lib/CL/pocl_debug.h                                |   137 +-
 lib/CL/pocl_icd.h                                  |   120 +-
 lib/CL/pocl_img_buf_cpy.c                          |   217 +
 ...{clReleaseCommandQueue.c => pocl_img_buf_cpy.h} |    39 +-
 lib/CL/pocl_intfn.h                                |    11 +
 lib/CL/pocl_llvm.h                                 |     6 +-
 lib/CL/pocl_llvm_api.cc                            |   450 +-
 lib/CL/pocl_queue_util.c                           |     2 +
 lib/CL/pocl_timing.c                               |   153 +
 lib/CL/pocl_timing.h                               |    26 +
 lib/CL/pocl_util.c                                 |    49 +-
 lib/CL/pocl_util.h                                 |    76 +-
 lib/CMakeLists.txt                                 |    13 +-
 lib/Makefile.in                                    |     5 +-
 lib/kernel/CMakeLists.txt                          |    42 +-
 lib/kernel/Makefile.am                             |     3 +-
 lib/kernel/Makefile.in                             |     8 +-
 lib/kernel/cellspu/CMakeLists.txt                  |    50 -
 lib/kernel/cellspu/Makefile                        |   751 -
 lib/kernel/cellspu/Makefile.in                     |   751 -
 lib/kernel/host/CMakeLists.txt                     |    95 +-
 lib/kernel/host/Makefile.am                        |     9 +
 lib/kernel/host/Makefile.in                        |   224 +-
 lib/kernel/hsail64/CMakeLists.txt                  |    24 +-
 lib/kernel/hsail64/Makefile.am                     |     3 +-
 lib/kernel/hsail64/Makefile.in                     |   222 +-
 lib/kernel/hsail64/erf.cl                          |   181 +
 lib/kernel/hsail64/{log1p.cl => erfc.cl}           |    11 +-
 lib/kernel/hsail64/expm1.cl                        |    28 +-
 lib/kernel/hsail64/fast_length.cl                  |    85 +
 lib/kernel/hsail64/{log1p.cl => fast_normalize.cl} |    14 +-
 lib/kernel/hsail64/hypot.cl                        |    29 +-
 lib/kernel/hsail64/{expm1.cl => length.cl}         |    42 +-
 lib/kernel/hsail64/lgamma.cl                       |   113 +
 lib/kernel/hsail64/log1p.cl                        |    35 +-
 lib/kernel/hsail64/svm_atomics_hsail.cl.ll         |  8078 +++++++++
 lib/kernel/hsail64/tgamma.cl                       |    94 +
 lib/kernel/printf.c                                |    24 +-
 lib/kernel/printf_constant.c                       |    13 +-
 lib/kernel/rules.mk                                |    23 +-
 lib/kernel/sources-vml.mk                          |     2 +
 lib/kernel/sources.mk                              |     5 +-
 lib/kernel/svm_atomics.cl                          |   424 +
 lib/kernel/svm_atomics.h                           |   169 +
 lib/kernel/svm_atomics_host.cl                     |   253 +
 lib/kernel/svm_atomics_x86_64.ll                   |  8075 +++++++++
 lib/kernel/tce/CMakeLists.txt                      |    19 +-
 lib/kernel/tce/Makefile                            |   758 -
 lib/kernel/tce/Makefile.am                         |     6 +-
 lib/kernel/tce/Makefile.in                         |   224 +-
 lib/kernel/vecmathlib/floatbuiltins.h              |   464 +-
 lib/kernel/vecmathlib/floatprops.h                 |   571 +-
 lib/kernel/vecmathlib/floattypes.h                 |   313 +-
 lib/kernel/vecmathlib/loop.cc                      |   290 +
 lib/kernel/vecmathlib/mathfuncs.h                  |     2 +-
 lib/kernel/vecmathlib/mathfuncs_asin.h             |   363 +-
 lib/kernel/vecmathlib/mathfuncs_asinh.h            |    53 +-
 lib/kernel/vecmathlib/mathfuncs_base.h             |   243 +-
 lib/kernel/vecmathlib/mathfuncs_convert.h          |   358 +-
 lib/kernel/vecmathlib/mathfuncs_exp.h              |   237 +-
 lib/kernel/vecmathlib/mathfuncs_fabs.h             |   305 +-
 lib/kernel/vecmathlib/mathfuncs_int.h              |   243 +-
 lib/kernel/vecmathlib/mathfuncs_log.h              |   137 +-
 lib/kernel/vecmathlib/mathfuncs_pow.h              |    43 +-
 lib/kernel/vecmathlib/mathfuncs_rcp.h              |   117 +-
 lib/kernel/vecmathlib/mathfuncs_sin.h              |   419 +-
 lib/kernel/vecmathlib/mathfuncs_sinh.h             |    37 +-
 lib/kernel/vecmathlib/mathfuncs_sqrt.h             |   124 +-
 lib/kernel/vecmathlib/selftest.cc                  |  1724 ++
 lib/kernel/vecmathlib/vec_altivec_float4.h         |  1139 +-
 lib/kernel/vecmathlib/vec_avx_double4.h            |  1386 +-
 lib/kernel/vecmathlib/vec_avx_float8.h             |  1419 +-
 lib/kernel/vecmathlib/vec_avx_fp16_16.h            |  1155 +-
 lib/kernel/vecmathlib/vec_avx_fp8_32.h             |  1223 +-
 lib/kernel/vecmathlib/vec_base.h                   |  1179 +-
 lib/kernel/vecmathlib/vec_builtin.h                |  2619 ++-
 lib/kernel/vecmathlib/vec_mask.h                   |   121 +-
 lib/kernel/vecmathlib/vec_mic_double8.h            |  1236 +-
 lib/kernel/vecmathlib/vec_neon_float2.h            |  1085 +-
 lib/kernel/vecmathlib/vec_neon_float4.h            |  1131 +-
 lib/kernel/vecmathlib/vec_pseudo.h                 |  3052 ++--
 lib/kernel/vecmathlib/vec_qpx_double4.h            |  1399 +-
 lib/kernel/vecmathlib/vec_sse_double1.h            |  1002 +-
 lib/kernel/vecmathlib/vec_sse_double2.h            |  1237 +-
 lib/kernel/vecmathlib/vec_sse_float1.h             |   998 +-
 lib/kernel/vecmathlib/vec_sse_float4.h             |  1286 +-
 lib/kernel/vecmathlib/vec_test.h                   |  2690 ++-
 lib/kernel/vecmathlib/vec_vsx_double2.h            |  1215 +-
 lib/kernel/vecmathlib/vecmathlib.h                 |   248 +-
 lib/kernel/write_image.cl                          |     5 +-
 lib/llvmopencl/AllocasToEntry.cc                   |    15 +-
 lib/llvmopencl/AllocasToEntry.h                    |     6 +-
 lib/llvmopencl/AutomaticLocals.cc                  |    32 +-
 lib/llvmopencl/Barrier.h                           |     7 +-
 lib/llvmopencl/BarrierBlock.cc                     |    13 +-
 lib/llvmopencl/BarrierBlock.h                      |     5 +-
 lib/llvmopencl/BarrierTailReplication.cc           |    47 +-
 lib/llvmopencl/BarrierTailReplication.h            |    18 +-
 lib/llvmopencl/BreakConstantGEPs.cpp               |    30 +-
 lib/llvmopencl/BreakConstantGEPs.h                 |     8 +-
 lib/llvmopencl/CanonicalizeBarriers.cc             |    39 +-
 lib/llvmopencl/CanonicalizeBarriers.h              |     8 +-
 lib/llvmopencl/DebugHelpers.cc                     |    31 +-
 lib/llvmopencl/DebugHelpers.h                      |    14 +-
 lib/llvmopencl/Flatten.cc                          |    41 +-
 lib/llvmopencl/GenerateHeader.cc                   |    58 +-
 lib/llvmopencl/ImplicitConditionalBarriers.cc      |    30 +-
 lib/llvmopencl/ImplicitConditionalBarriers.h       |     6 +-
 lib/llvmopencl/ImplicitLoopBarriers.cc             |    24 +-
 lib/llvmopencl/ImplicitLoopBarriers.h              |     3 +-
 lib/llvmopencl/IsolateRegions.cc                   |    20 +-
 lib/llvmopencl/Kernel.cc                           |    23 +-
 lib/llvmopencl/Kernel.h                            |     7 +-
 lib/llvmopencl/LLVMFileUtils.cc                    |    93 +-
 lib/llvmopencl/LLVMUtils.cc                        |    19 -
 lib/llvmopencl/LLVMUtils.h                         |    13 +-
 lib/llvmopencl/LoopBarriers.cc                     |    27 +-
 lib/llvmopencl/LoopBarriers.h                      |     3 +-
 lib/llvmopencl/Makefile.in                         |     5 +-
 lib/llvmopencl/PHIsToAllocas.cc                    |    26 +-
 lib/llvmopencl/PHIsToAllocas.h                     |     5 +-
 lib/llvmopencl/ParallelRegion.cc                   |   128 +-
 lib/llvmopencl/ParallelRegion.h                    |    19 +-
 lib/llvmopencl/TargetAddressSpaces.cc              |   371 +-
 lib/llvmopencl/TargetAddressSpaces.h               |     6 +-
 lib/llvmopencl/VariableUniformityAnalysis.cc       |    47 +-
 lib/llvmopencl/VariableUniformityAnalysis.h        |     6 +-
 lib/llvmopencl/WorkItemAliasAnalysis.cc            |   136 +-
 lib/llvmopencl/Workgroup.cc                        |   116 +-
 lib/llvmopencl/Workgroup.h                         |     5 +-
 lib/llvmopencl/WorkitemHandler.cc                  |    48 +-
 lib/llvmopencl/WorkitemHandler.h                   |    14 +-
 lib/llvmopencl/WorkitemHandlerChooser.cc           |    11 +-
 lib/llvmopencl/WorkitemLoops.cc                    |    95 +-
 lib/llvmopencl/WorkitemLoops.h                     |    12 +-
 lib/llvmopencl/WorkitemReplication.cc              |    72 +-
 lib/llvmopencl/WorkitemReplication.h               |    13 +-
 lib/llvmopencl/linker.cpp                          |   138 +-
 lib/llvmopencl/linker.h                            |     5 +-
 lib/poclu/Makefile.in                              |     5 +-
 scripts/CMakeLists.txt                             |     6 +
 scripts/Makefile.am                                |     3 +-
 scripts/Makefile.in                                |     8 +-
 scripts/pocl-standalone.in                         |     6 +-
 scripts/pocl-standalone.in.cmake                   |    38 +-
 tests/CMakeLists.txt                               |    77 +-
 tests/Makefile.am                                  |     2 +-
 tests/Makefile.in                                  |     7 +-
 tests/cell/Makefile.am                             |    25 -
 tests/cell/Makefile.in                             |   730 -
 tests/cell/hello/Makefile.am                       |    35 -
 tests/cell/hello/Makefile.in                       |   721 -
 tests/cell/hello/host.cpp                          |   215 -
 tests/kernel/CMakeLists.txt                        |   140 +-
 tests/kernel/Makefile.in                           |     5 +-
 tests/kernel/test_as_type.cl                       |    10 +-
 tests/kernel/test_shuffle.cc                       |   607 +-
 tests/package.m4                                   |    13 -
 tests/regression/CMakeLists.txt                    |   211 +-
 tests/regression/Makefile.in                       |     5 +-
 tests/regression/README.txt                        |     4 +
 tests/runtime/CMakeLists.txt                       |    54 +-
 tests/runtime/Makefile.in                          |     5 +-
 tests/runtime/macro_test.cl                        |     0
 tests/runtime/test_clBuildProgram.c                |    14 +
 tests/runtime/test_clBuildProgram_macros.cl        |    15 +
 tests/runtime/test_version.c                       |     3 +-
 .../devices/cellspu => tests/tce}/CMakeLists.txt   |    16 +-
 tests/tce/Makefile.in                              |     5 +-
 .../example1 => tests/tce/fp16}/CMakeLists.txt     |    47 +-
 tests/tce/fp16/Makefile.in                         |     5 +-
 tests/tce/fp16/expected_out.txt                    |    17 +
 .../example2 => tests/tce/tcemc}/CMakeLists.txt    |    30 +-
 tests/tce/tcemc/Makefile.in                        |     5 +-
 .../standalone => tests/tce/ttasim}/CMakeLists.txt |    25 +-
 tests/tce/ttasim/Makefile.in                       |     5 +-
 tests/testsuite                                    | 16385 -------------------
 tests/testsuite-amdsdk2_9.at                       |     2 +-
 ...stsuite-amdsdk2_9.at => testsuite-amdsdk3_0.at} |   399 +-
 tests/testsuite-regression.at                      |     4 +-
 tests/testsuite.at                                 |    22 +-
 tests/viennacl.at                                  |     7 +
 tests/workgroup/CMakeLists.txt                     |   190 +-
 tests/workgroup/Makefile.in                        |     5 +-
 tools/scripts/run_all_tests                        |    75 +
 tools/scripts/run_hsa_tests                        |     6 +-
 .../Makefile.am => tools/scripts/run_tta_tests     |    28 +-
 398 files changed, 51295 insertions(+), 43983 deletions(-)

diff --git a/.bzrignore b/.bzrignore
new file mode 100644
index 0000000..b3dddc2
--- /dev/null
+++ b/.bzrignore
@@ -0,0 +1,52 @@
+# srcdir
+
+Makefile.in
+Makefile.*.in
+./aclocal.m4
+./autom4te.cache
+./configure
+config/config.guess
+config/config.sub
+config/depcomp
+config/install-sh
+config/ltmain.sh
+config/missing
+config/ylwrap
+./config.h.in
+./m4/*
+
+tests/package.m4
+tests/testsuite
+
+# builddir
+
+Makefile
+pocl.pc
+.deps
+.libs
+*.lo
+*.o
+*.la
+
+./libtool
+./clconfig.h
+./config.h
+./config.log
+./config.status
+./stamp-h1
+
+examples/barriers/barriers
+examples/example1/example1
+examples/example2/example2
+examples/example2a/example2a
+examples/forloops/forloops
+examples/loopbarriers/loopbarriers
+examples/standalone/standalone.bc
+examples/standalone/standalone.h
+examples/trig/trig
+examples/scalarwave/scalarwave
+examples/kernel/kernel
+scripts/pocl-build
+scripts/pocl-kernel
+scripts/pocl-standalone
+scripts/pocl-workgroup
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..141801b
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,41 @@
+examples/Rodinia/pathfinder.stdout	export-ignore
+doc/benchmark_results/			export-ignore
+doc/binary_format.txt			export-ignore
+doc/buildbot/				export-ignore
+doc/handling_loops.txt			export-ignore
+doc/LAUNDRY				export-ignore
+doc/notes*.txt				export-ignore
+doc/spir-todo.txt			export-ignore
+doc/ttasim_kernel_capturer.txt		export-ignore
+doc/www/				export-ignore
+
+examples/piglit/sorted_ref		export-ignore
+examples/piglit/sorted_ref_llvm_3.5	export-ignore
+
+# this one is ~20M
+examples/Rodinia/pathfinder.stdout	export-ignore
+
+lib/kernel/amdgcn			export-ignore
+lib/kernel/convert_type.py		export-ignore
+lib/kernel/vecmathlib/bench.cc		export-ignore
+lib/kernel/vecmathlib/coeffs.out	export-ignore
+lib/kernel/vecmathlib/example.cc	export-ignore
+lib/kernel/vecmathlib/example_float.cc	export-ignore
+lib/kernel/vecmathlib/find-coeffs.m	export-ignore
+lib/kernel/vecmathlib/IDEAS		export-ignore
+lib/kernel/vecmathlib/instantiations.cc	export-ignore
+lib/kernel/vecmathlib/interp.cc		export-ignore
+
+scripts/pocl-build.in			export-ignore
+scripts/pocl-kernel.in			export-ignore
+scripts/pocl-workgroup.in		export-ignore
+
+tests/amdsdk.at				export-ignore
+tests/kernel/test_convert_type.py	export-ignore
+tests/kernel/test_convert_type.sh	export-ignore
+
+tools/gdb-breakpoints			export-ignore
+tools/patches/clang-3.4-no-forced-64bit-doubles.patch		export-ignore
+tools/scripts/benchmark_barchart.py	export-ignore
+tools/scripts/benchmark.py		export-ignore
+tools/scripts/devel-configure		export-ignore
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0b0e206
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,95 @@
+**/.deps
+**/.libs
+*.bc
+*.la
+*.lo
+*.o
+
+Makefile
+Makefile.in
+
+aclocal.m4
+autom4te.cache
+
+config.h
+config.h.in
+config.log
+config.status
+config/ar-lib
+config/compile
+config/config.guess
+config/config.sub
+config/depcomp
+config/install-sh
+config/ltmain.sh
+config/missing
+configure
+
+doc/sphinx/build/
+
+examples/EinsteinToolkit/EinsteinToolkit
+examples/example1/example1
+examples/example1-spir32/example1-spir32
+examples/example1-spir64/example1-spir
+examples/example2/example2
+examples/example2a/example2a
+examples/scalarwave/scalarwave
+examples/standalone/standalone.h
+examples/trig/trig
+
+include/CL/cl.hpp
+include/arm/types.h
+include/cellspu/types.h
+include/powerpc/types.h
+include/powerpc64/types.h
+include/x86_64/types.h
+include/CL/cl.hpp.patched
+
+install-paths.h
+
+libtool
+
+lib/CL/kernellib_hash.*
+
+m4/libtool.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/ltversion.m4
+m4/lt~obsolete.m4
+
+ocl-vendors/pocl-tests.icd
+
+pocl.icd
+pocl.pc
+
+scripts/pocl-build
+scripts/pocl-kernel
+scripts/pocl-standalone
+scripts/pocl-workgroup
+
+stamp-h1
+
+tests/atconfig
+tests/atlocal
+tests/kernel/kernel
+tests/kernel/image_query_funcs
+tests/kernel/sampler_address_clamp
+tests/package.m4
+tests/*/test_*
+!tests/runtime/*.c
+tests/testsuite
+tests/testsuite.dir
+tests/testsuite.log
+tests/workgroup/run_kernel
+
+
+
+# these are created by Qt Creator
+
+pocl.config
+pocl.creator
+pocl.creator.user
+pocl.files
+pocl.includes
+CMakeLists.txt.includes
+CMakeLists.txt.user
diff --git a/.mailmap b/.mailmap
new file mode 100644
index 0000000..53e20f4
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,44 @@
+Carlos Sánchez de La Lama <csanchezdll at gmail.com> <carlos.delalama at urjc.es>
+Carlos Sánchez de La Lama <csanchezdll at gmail.com> <csanchez at csanchez-desktop>
+Carlos Sánchez de La Lama <csanchezdll at gmail.com> Carlos Sanchez de La Lama <carlos.delalama at urjc.es>
+
+Daniel Sanders <daniel.sanders at imgtec.com> Daniel Sanders META COSY <daniel.sanders at imgtec.com>
+
+Erik Schnetter <schnetter at gmail.com> <eschnetter at perimeterinstitute.ca>
+Erik Schnetter <schnetter at gmail.com> Erik Schnetter <>
+
+Heikki Kultala <heikki.kultala at tut.fi> <hkultala at iki.fi>
+Heikki Kultala <heikki.kultala at tut.fi> <hkultala at trurl.(none)>
+Heikki Kultala <heikki.kultala at tut.fi> heikki-llvm-svn-testing <hkultala26 at tiberias>
+Heikki Kultala <heikki.kultala at tut.fi> hkultala at cs.tut.fi <>
+
+Hugo van der Wijst <hugo at wij.st> <hugwijst at gmail.com>
+
+Kalle Raiskila <kraiskil at ovi.com> <kalle.raiskila at nokia.com>
+Kalle Raiskila <kraiskil at ovi.com> <kraiski at ovi.com>
+Kalle Raiskila <kraiskil at ovi.com> Kalle <kraiskil at debian>
+Kalle Raiskila <kraiskil at ovi.com> Kalle Raiskila <>
+Kalle Raiskila <kraiskil at ovi.com> kraiskil at debian <>
+Kalle Raiskila <kraiskil at ovi.com> kraiskil <kraiskil at marvin>
+Kalle Raiskila <kraiskil at ovi.com> kraiskil <kraiskil at users.noreply.github.com>
+
+Krishnaraj Raghavendra Bhat <krrishnarraj at gmail.com> Krishnaraj Bhat <krrishnarraj at gmail.com>
+Krishnaraj Raghavendra Bhat <krrishnarraj at gmail.com> Krishnaraj R Bhat <krrishnarraj at gmail.com>
+
+Matias Koskela <matias.koskela at tut.fi> <koskel29 at kaliforniankeiju.cs.tut.fi>
+
+Michal Babej <michal.babej at tut.fi> <Franz.Netykafka at runbox.com>
+Michal Babej <michal.babej at tut.fi> <franz at users.noreply.github.com>
+
+Pekka Jääskeläinen <pekka.jaaskelainen at tut.fi> <pekka.jaaskelainen at gmail.com>
+Pekka Jääskeläinen <pekka.jaaskelainen at tut.fi> <pjaaskel at haikara>
+Pekka Jääskeläinen <pekka.jaaskelainen at tut.fi> <pjaaskel at users.noreply.github.com>
+Pekka Jääskeläinen <pekka.jaaskelainen at tut.fi> <visit0r at debian>
+Pekka Jääskeläinen <pekka.jaaskelainen at tut.fi> Pekka Jaaskelainen <pekka.jaaskelainen at tut.fi>
+
+Ville Korhonen <ville.t.korhonen at tut.fi> <korhone5 at nokitikka.cs.tut.fi>
+Ville Korhonen <ville.t.korhonen at tut.fi> vkorhonen <ville.t.korhonen at tut.fi>
+
+Vincent Danjean <vdanjean at eyak.imag.fr> <Vincent.Danjean at ens-lyon.org>
+
+Vladimir Guzma <vladimir.guzma at tut.fi> <vg at punarastas>
diff --git a/CHANGES b/CHANGES
index 0d2e7f2..7848373 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,41 @@
+0.13 April 2016
+===============
+
+Highlights
+-----------
+- Support for LLVM/Clang 3.8
+- Initial (partial) OpenCL 2.0 support
+  (only Shared Virtual Memory and Atomics are supported at the moment)
+- CMake build system almost on parity with autotools
+  (TCE, all external testsuites)
+- CMake build is now able to build multiple kernel libraries
+  for different CPUs and let pocl select a suitable one at runtime
+
+Bugfixes
+---------
+- clEnqueueCopyImage() now works properly
+- improved file locking (much less disk access to kernel cache)
+- Address spaces of structs are handled properly
+
+Other
+------
+- removed custom buffer alloc from pthread device
+- removed IBM Cell support
+- removed support for older LLVM versions (before 3.7)
+- significantly higher performance with a lot of small kernel enqueues
+  (due to improved file locking)
+- vecmathlib now supports AVX2
+- a few more HSA kernel library implementations: l/tgamma, erf(c), hypot
+- implemented OpenCL 2.0 API calls: clEnqueueSVM*, clSVMAlloc/clSVMFree,
+  clEnqueueFillBuffer, clSetKernelExecInfo, clSetKernelArgSVMPointer,
+  clCreateCommandQueueWithProperties - no device side queues yet
+- OpenCL 2.0 atomics (C11 atomics subset) for x86-64 and HSA
+- new testsuites: AMD SDK 3.0, Intel SVM
+- New CMake-only testsuites: ASL, clBLAS, clFFT, arrayfire
+- more debugging info (timing, mem stats)
+- ansi colors with POCL_DEBUG=1 if the output is a terminal
+
+
 0.12 October 2015
 ===============
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8fe68ce..df52f5d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,15 +27,61 @@ cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
 
 project(pocl)
 set(MAJOR_VERSION 0)
-set(MINOR_VERSION 12)
+set(MINOR_VERSION 13)
 set(VERSION_SUFFIX "")
 set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION}${VERSION_SUFFIX})
 set(POCL_VERSION ${VERSION_STRING})
 
+##################################################################################
 
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc") # classify the target CPU family; exactly one of the variables below is set
+  set(POWERPC 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips")
+  set(MIPS 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv7")
+  set(ARMV7 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv6")
+  set(ARMV6 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(i[3-6]86|AMD64|x86_64)") # also accept i486/i586/i686, not just i386
+  if(CMAKE_SIZEOF_VOID_P EQUAL 4) # POCL_DEVICE_ADDRESS_BITS is only set further down, so it cannot be tested here
+    set(I386 1)
+  else()
+    set(X86_64 1)
+  endif()
+endif()
+
+######################################################################################
+
+macro(set_expr VAR) # set_expr(<var> <condition...>): store 1 in <var> if the condition holds, otherwise 0
+  if(${ARGN}) # the remaining macro arguments are expanded and evaluated as an if() condition
+    set(${VAR} 1)
+  else()
+    set(${VAR} 0)
+  endif()
+endmacro()
+
+function(rename_if_different SRC DST) # rename SRC to DST unless DST already exists with the same MD5
+  if(EXISTS "${DST}")
+    file(MD5 "${SRC}" OLD_MD5) # note: despite the name, OLD_MD5 is the hash of SRC
+    file(MD5 "${DST}" NEW_MD5) # note: despite the name, NEW_MD5 is the hash of the existing DST
+    if(NOT OLD_MD5 STREQUAL NEW_MD5)
+      file(RENAME "${SRC}" "${DST}") # hashes differ: replace DST with SRC
+    endif() # hashes equal: DST is left untouched and SRC is not removed
+  else()
+    file(RENAME "${SRC}" "${DST}") # DST does not exist yet: plain rename
+  endif()
+endfunction()
 
 ######################################################################################
 
+# Recent versions of CMake can make use of Ninja's console pool to avoid
+# buffering the output of particular commands.
+if(CMAKE_VERSION VERSION_LESS 3.2.0)
+  set(COMMAND_USES_TERMINAL)
+else()
+  set(COMMAND_USES_TERMINAL USES_TERMINAL)
+endif()
+
 if(UNIX)
   include(GNUInstallDirs)
 else()
@@ -57,7 +103,7 @@ set(POCL_INSTALL_PUBLIC_LIBDIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}"
 set(POCL_INSTALL_PRIVATE_LIBDIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pocl" CACHE PATH "POCL private libdir")
 
 # for pocl.icd
-if (UNIX AND NOT CMAKE_CROSSCOMPILING)
+if(UNIX AND NOT CMAKE_CROSSCOMPILING)
   set(POCL_INSTALL_ICD_VENDORDIR "/etc/OpenCL/vendors" CACHE PATH "POCL ICD file destination")
 else()
   set(POCL_INSTALL_ICD_VENDORDIR "${CMAKE_INSTALL_PREFIX}/etc/OpenCL/vendors" CACHE PATH "POCL ICD file destination")
@@ -82,6 +128,7 @@ set(POCL_INSTALL_CMAKE_CONFIG_DIR "${POCL_INSTALL_PRIVATE_LIBDIR}" CACHE PATH
 set(POCL_INSTALL_PKGCONFIG_DIR "${POCL_INSTALL_PUBLIC_LIBDIR}/pkgconfig" CACHE PATH "Destination for pocl.pc")
 
 if(APPLE)
+  set(CMAKE_MACOSX_RPATH ON)
   set(POCL_INSTALL_OPENCL_HEADER_DIR "${POCL_INSTALL_PUBLIC_HEADER_DIR}/OpenCL" CACHE PATH "POCL header dir for OpenCL headers")
 else()
   set(POCL_INSTALL_OPENCL_HEADER_DIR "${POCL_INSTALL_PUBLIC_HEADER_DIR}/CL" CACHE PATH "POCL header dir for OpenCL headers")
@@ -89,7 +136,6 @@ endif()
 
 option(BUILD_SHARED_LIBS "ON=Build shared libs, OFF=static libs" ON)
 
-# TODO measure slowdown with compiled but disabled debug, see if it can default to always compile in
 option(
   POCL_DEBUG_MESSAGES
   "Enable debug messages from pocl (useful for OpenCL developers), must be enabled at runtime, with env var POCL_DEBUG"
@@ -104,22 +150,23 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 option(ENABLE_HSA "Enable the HSA device driver for AMD GCN devices" OFF)
 
 ######################################################################################
-# AC_CONFIG_TESTDIR([tests])
+
 enable_testing()
 
 ######################################################################################
 
-if (UNIX)
+if(UNIX)
   find_package(PkgConfig MODULE REQUIRED)
 endif()
 
 ######################################################################################
+
 set(ANDROID_COMPILER 0)
-if (CMAKE_C_COMPILER MATCHES "android")
+if(CMAKE_C_COMPILER MATCHES "android")
   set(ANDROID_COMPILER 1)
   add_definitions(-DPOCL_ANDROID)
   add_definitions(-DPOCL_ANDROID_PREFIX="/data/data/org.pocl.libs/files")
-endif ()
+endif()
 
 ######################################################################################
 
@@ -143,22 +190,22 @@ message(STATUS "Hwloc_CFLAGS ${Hwloc_CFLAGS}")
 # Find executables to few tools required during build 
 #
 
-find_program (PATCH_EXEC 
-  NAMES patch patch.exe
+find_program(PATCH_EXEC
+  NAMES patch${CMAKE_EXECUTABLE_SUFFIX}
   HINTS ENV PATH
 )
 
-find_program (XARGS_EXEC 
-  NAMES xargs xargs.exe
+find_program(XARGS_EXEC
+  NAMES xargs${CMAKE_EXECUTABLE_SUFFIX}
   HINTS ENV PATH
 )
 
-find_program (CAT_EXEC 
-  NAMES cat cat.exe
+find_program(CAT_EXEC
+  NAMES cat${CMAKE_EXECUTABLE_SUFFIX}
   HINTS ENV PATH
 )
 
-if (!PATCH_EXEC)
+if(NOT PATCH_EXEC)
   message(FATAL_ERROR "Could not find patch command.")
 endif()
 
@@ -170,8 +217,8 @@ if(NOT RES)
   message(FATAL_ERROR "Could not load LLVM.cmake")
 endif()
 
-if("${LLVM_VERSION}" VERSION_LESS "3.2")
-  message(FATAL_ERROR "POCL requires LLVM version >= 3.2 !")
+if("${LLVM_VERSION}" VERSION_LESS "3.6")
+  message(FATAL_ERROR "POCL requires LLVM version >= 3.6 !")
 endif()
 
 ######################################################################################
@@ -233,29 +280,16 @@ int main() {
 }
   " HAVE_CLOCK_GETTIME)
 else()
-  if(WIN32)
-    message(STATUS "Using GetSystemTime...() on Win32")
-    set(HAVE_CLOCK_GETTIME 1)
-  else()
-    set(HAVE_CLOCK_GETTIME 0)
-  endif()
+  set(HAVE_CLOCK_GETTIME 0)
 endif()
 
-######################################################################################
-
-set(NEW_PRINTF_WORKS 1)
-
-if("${LLVM_VERSION}" VERSION_LESS "3.4")
-  message(STATUS "Turning off new printf()")
-  set(NEW_PRINTF_WORKS 0)
-else()
-  message(STATUS "Enabling new printf()")
-endif()
+include(CheckFunctionExists)
+check_function_exists(fork HAVE_FORK)
+check_function_exists(vfork HAVE_VFORK)
 
 ######################################################################################
 
 if(NOT DEFINED DEFAULT_USE_VECMATHLIB)
-  # vecmathlib
   if(CLANGXX_WORKS AND EXISTS "${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib/vecmathlib.h")
     set(DEFAULT_USE_VECMATHLIB 1 CACHE INTERNAL "vecmathlib availability")
   else()
@@ -267,6 +301,13 @@ setup_cached_var(USE_VECMATHLIB "Vecmathlib use"
   "Requested enabling vecmathlib use, but either clang++ doesnt work or vecmathlib sources are missing.. -> disabling vecmathlib use"
   "Vecmathlib is usable, but requested disabling it")
 
+# vecmathlib does not compile with fp16 currently
+if(USE_VECMATHLIB AND (NOT CL_DISABLE_HALF))
+  message(STATUS "Half available, but disabling half support since vecmathlib is enabled.")
+  set(CL_DISABLE_HALF 1)
+  set(CL_DISABLE_HALF 1 CACHE BOOL "Disable cl_khr_fp16 because fp16 is not supported")
+endif()
+
 ######################################################################################
 
 if(UNIX)
@@ -289,22 +330,20 @@ endif()
 
 option(USE_VECMATHLIB_BUILTINS_ONLY "Use only __builtin_* functions in the kernel library." OFF)
 
+set(DEFAULT_KERNEL_CL_FLAGS "-Xclang -cl-std=CL2.0 -D__OPENCL_C_VERSION__=200")
 if(USE_VECMATHLIB)
   set(DEFAULT_KERNEL_CLANGXX_FLAGS "-DVML_NO_IOSTREAM ${CLANGXX_STDLIB}")
+  if(USE_VECMATHLIB_BUILTINS_ONLY)
+    set(DEFAULT_KERNEL_CL_FLAGS "${DEFAULT_KERNEL_CL_FLAGS} -DPOCL_VECMATHLIB_BUILTIN")
+    set(DEFAULT_KERNEL_CLANGXX_FLAGS "${DEFAULT_KERNEL_CLANGXX_FLAGS} -DPOCL_VECMATHLIB_BUILTIN")
+  endif()
 endif()
-set(DEFAULT_KERNEL_CL_FLAGS "-D__OPENCL_VERSION__=120")
 
-if(USE_VECMATHLIB_BUILTINS_ONLY AND USE_VECMATHLIB)
-  set(DEFAULT_KERNEL_CL_FLAGS "${DEFAULT_KERNEL_CL_FLAGS} -DPOCL_VECMATHLIB_BUILTIN")
-  set(DEFAULT_KERNEL_CLANGXX_FLAGS "${DEFAULT_KERNEL_CLANGXX_FLAGS} -DPOCL_VECMATHLIB_BUILTIN")
-endif()
+set(EXTRA_KERNEL_CL_FLAGS "" CACHE STRING "Extra arguments to kernel CL compiler (defaults to empty)")
+set(EXTRA_KERNEL_CXX_FLAGS "" CACHE STRING "Extra arguments to kernel CXX compiler (defaults to empty)")
 
-if(NOT DEFINED KERNEL_CLANGXX_FLAGS)
-  set(KERNEL_CLANGXX_FLAGS "${DEFAULT_KERNEL_CLANGXX_FLAGS}")
-endif()
-if(NOT DEFINED KERNEL_CL_FLAGS)
-  set(KERNEL_CL_FLAGS "${DEFAULT_KERNEL_CL_FLAGS}")
-endif()
+set(KERNEL_CLANGXX_FLAGS "${DEFAULT_KERNEL_CLANGXX_FLAGS} ${EXTRA_KERNEL_CXX_FLAGS}")
+set(KERNEL_CL_FLAGS "${DEFAULT_KERNEL_CL_FLAGS} ${EXTRA_KERNEL_CL_FLAGS}")
 
 message(STATUS "Clang++ flags for compiling kernel library: ${KERNEL_CLANGXX_FLAGS}")
 message(STATUS "OpenCL flags for compiling kernel library: ${KERNEL_CL_FLAGS}")
@@ -312,7 +351,6 @@ message(STATUS "OpenCL flags for compiling kernel library: ${KERNEL_CL_FLAGS}")
 
 ######################################################################################
 
-# DONE
 option(STATIC_LLVM "Link LLVM statically. Default is dynamic." OFF)
 
 if(STATIC_LLVM)
@@ -331,18 +369,6 @@ else()
   endif()
 endif()
 
-if (MSVC)
-  string(REPLACE "-L${LLVM_LIBDIR}" "" LLVM_LDFLAGS "${LLVM_LDFLAGS}")
-  string(STRIP "${LLVM_LDFLAGS}" LLVM_LDFLAGS)
-endif()
-
-######################################################################################
-
-option(CUSTOM_BUFFER_ALLOCATOR "Use a custom OpenCL optimized region-based memory allocator instead of allocating buffers with malloc directly" ON)
-
-######################################################################################
-
-# examples are in their own cmakelists.txt
 
 ######################################################################################
 
@@ -368,8 +394,7 @@ endif()
 
 ######################################################################################
 
-if (MSVC)
-  set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
+if(MSVC)
   find_package( PthreadsWin32 )
   if(NOT Pthreads_FOUND)
     message(FATAL_ERROR "Could not find pthreads-win32 libs!")
@@ -378,7 +403,8 @@ if (MSVC)
 else()
   include(FindThreads)
   if(Threads_FOUND)
-    set(LD_FLAGS_BIN ${CMAKE_THREAD_LIBS_INIT})
+    set(PTHREAD_LDFLAGS ${CMAKE_THREAD_LIBS_INIT})
+    set(PTHREAD_CFLAGS "")
   else()
     message(FATAL_ERROR "Could not find threading library for this system")
   endif()
@@ -486,9 +512,6 @@ endif()
 
 
 ######################################################################################
-# DONE
-
-# OpenCL headers
 
 if(DEFINED INSTALL_OPENCL_HEADERS)
   message(STATUS "Install POCL's OpenCL headers: ${INSTALL_OPENCL_HEADERS}")
@@ -504,17 +527,6 @@ else() # Undefined = auto -> check
   set(INSTALL_OPENCL_HEADERS ${IOH} CACHE BOOL "Install POCL's OpenCL headers. (Ones from Kronos should be installed instead)")
 endif()
 
-######################################################################################
-# DONE
-
-# TODO Check cl.hpp usability. It is broken on a few platforms.
-
-if(DEFINED HAVE_OPENCL_HPP)
-  message(STATUS "cl2.hpp (cached): ${HAVE_OPENCL_HPP}")
-else()
-  message(STATUS "tests will use pocl's own cl2.hpp")
-  set(HAVE_OPENCL_HPP ${_OPENCL_HPP} CACHE INTERNAL "cl2.hpp found & usable")
-endif()
 
 ######################################################################################
 # TODO check if this works!
@@ -525,9 +537,8 @@ if(PEDANTIC)
 endif()
 
 ######################################################################################
-# DONE
 
-if (MSVC)
+if(MSVC)
   message(STATUS "Finding libGLEW binary distribution (http://glew.sourceforge.net/index.html) for Widows not implemented. Consider writing FindGlew.cmake macro.")
   message(STATUS "libGLEW not found. A few tests will not work")
 else()
@@ -542,20 +553,18 @@ else()
 endif()
 
 ######################################################################################
-# DONE
 
-option(KERNEL_CACHE "Enable the kernel compile cache." ON)
-if(KERNEL_CACHE)
-  set(POCL_BUILD_KERNEL_CACHE 1)
+option(KERNEL_CACHE_DEFAULT "Default value for the kernel compile cache. If disabled, pocl will still use the kernel cache, but will delete cachefiles on exit. You can still enable keeping the files it at runtime with an env var." ON)
+if(KERNEL_CACHE_DEFAULT)
+  set(POCL_KERNEL_CACHE_DEFAULT 1)
 else()
-  set(POCL_BUILD_KERNEL_CACHE 0)
+  set(POCL_KERNEL_CACHE_DEFAULT 0)
 endif()
 
 string(TIMESTAMP POCL_BUILD_TIMESTAMP "%d%m%Y%H%M%S")
 file(WRITE "${CMAKE_BINARY_DIR}/pocl_build_timestamp.h" "#define POCL_BUILD_TIMESTAMP \"${POCL_BUILD_TIMESTAMP}\"")
 
 ######################################################################################
-# DONE
 
 if("${LLVM_CLANG_VERSION}" MATCHES "SPIR")
   set(CLANG_SPIR 1)
@@ -592,14 +601,8 @@ endif()
 
 # TODO some more work required here
 
-set(DEFAULT_HOST_LLC_FLAGS "-relocation-model=pic")
-# TODO host clang flags / llc flags - properly add_compile_options etc
-set(DEFAULT_HOST_CLANG_FLAGS "${DEFAULT_HOST_CLANG_FLAGS} ${CLANG_TARGET_OPTION}${LLC_TRIPLE}")
-set(DEFAULT_HOST_LLC_FLAGS "${DEFAULT_HOST_LLC_FLAGS} -mtriple=${LLC_TRIPLE}")
-if(NOT LLC_HOST_CPU MATCHES "unknown")
-  set(DEFAULT_HOST_CLANG_FLAGS "${DEFAULT_HOST_CLANG_FLAGS} ${CLANG_MARCH_FLAG}${LLC_HOST_CPU}")
-  set(DEFAULT_HOST_LLC_FLAGS "${DEFAULT_HOST_LLC_FLAGS} -mcpu=${LLC_HOST_CPU}")
-endif()
+set(DEFAULT_HOST_CLANG_FLAGS "${CLANG_TARGET_OPTION}${LLC_TRIPLE}")
+set(DEFAULT_HOST_LLC_FLAGS "-relocation-model=pic -mtriple=${LLC_TRIPLE}")
 
 if(LLC_TRIPLE MATCHES "^arm")
   if(LLC_TRIPLE MATCHES "gnueabihf")
@@ -623,34 +626,55 @@ if(CL_DISABLE_HALF)
   set(DEFAULT_HOST_CLANG_FLAGS "${DEFAULT_HOST_CLANG_FLAGS} -D_CL_DISABLE_HALF")
 endif()
 
-####################################################################
+# define it here, b/c we'll need these both at runtime and buildtime
+set(HOST_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_spir")
+if(NOT CL_DISABLE_HALF)
+  set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_fp16")
+endif()
+if(NOT CL_DISABLE_LONG)
+  set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_int64 cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics")
+endif()
 
-set(HOST_AS_FLAGS "${DEFAULT_HOST_AS_FLAGS}" CACHE STRING "Parameters to as for code generation in the host.")
-set(HOST_LD_FLAGS "${DEFAULT_HOST_LD_FLAGS}" CACHE STRING "Parameter to compiler to generate loadable module.")
-set(HOST_CLANG_FLAGS "${DEFAULT_HOST_CLANG_FLAGS}" CACHE STRING "Parameters to clang for host compilation.")
-set(HOST_LLC_FLAGS "${DEFAULT_HOST_LLC_FLAGS}" CACHE STRING "Parameters to llc for code generation in the host.")
+set(TEMP_EXT "${HOST_DEVICE_EXTENSIONS}")
+set(HOST_DEVICE_EXTENSION_DEFINES "")
+separate_arguments(TEMP_EXT)
+foreach(EXT ${TEMP_EXT})
+  set(HOST_DEVICE_EXTENSION_DEFINES "${HOST_DEVICE_EXTENSION_DEFINES} -D${EXT}")
+endforeach()
 
-####################################################################
+set(HOST_DEVICE_CL_VERSION "200")
 
-#line 760
+if(NOT DEFINED KERNELLIB_HOST_CPU_VARIANTS)
+  set(KERNELLIB_HOST_CPU_VARIANTS "native")
+# else TODO test cpu list for unknown values
+endif()
 
-set(OCL_TARGETS "host")
+set(KERNELLIB_HOST_DISTRO_VARIANTS 0)
+if(KERNELLIB_HOST_CPU_VARIANTS STREQUAL "distro")
+  if(X86_64 OR I386)
+    set(KERNELLIB_HOST_CPU_VARIANTS sse2 ssse3 sse41 avx avx_fma4 avx2 avx512)
+  else()
+    message(FATAL_ERROR "Don't know what CPU variants to use for kernel library on this platform.")
+  endif()
+  set(KERNELLIB_HOST_DISTRO_VARIANTS 1)
+endif()
 
+####################################################################
+
+set(EXTRA_HOST_AS_FLAGS "" CACHE STRING "Extra parameters to as for code generation in the host. (default: empty)")
+set(EXTRA_HOST_LD_FLAGS "" CACHE STRING "Extra parameter to compiler to generate loadable module. (default: empty)")
+set(EXTRA_HOST_CLANG_FLAGS "" CACHE STRING "Extra parameters to clang for host compilation. (default: empty)")
+set(EXTRA_HOST_LLC_FLAGS "" CACHE STRING "Extra parameters to llc for code generation in the host. (default: empty)")
 
-# THESE are only used in makefile.am & scripts/pocl*
-set(TCE_TARGET_CLANG_FLAGS "" CACHE STRING "Extra parameters to Clang for TCE compilation.")
-set(CELL_TARGET_CLANG_FLAGS "" CACHE STRING "Extra parameters to Clang for CELL compilation.")
-# TODO HOST_CLANG_FLAGS -> CPU_TARGET_CLANG_FLAGS
-#set(HOST_CLANG_FLAGS "" CACHE STRING "Extra parameters to Clang for cpu target compilation.")
-#AC_ARG_VAR([TARGET_CLANG_FLAGS],
-#           [Parameters to for target compilation.])
+####################################################################
 
-set(TCE_TARGET_LLC_FLAGS "" CACHE STRING "Extra parameters to LLVM's llc for TCE compilation.")
-set(CELL_TARGET_LLC_FLAGS "" CACHE STRING "Extra parameters to LLVM's llc for CELL compilation.")
-# TODO HOST_LLC_FLAGS -> CPU_TARGET_LLC_FLAGS
-#set(HOST_LLC_FLAGS "" CACHE STRING "Extra parameters to LLVM's llc for cpu target compilation.")
-#AC_DEFINE_UNQUOTED([TARGET_LLC_FLAGS],
-#                   [Parameters to llc for code generation in the target.])
+set(HOST_AS_FLAGS "${DEFAULT_HOST_AS_FLAGS} ${EXTRA_HOST_AS_FLAGS}")
+set(HOST_LD_FLAGS "${DEFAULT_HOST_LD_FLAGS} ${EXTRA_HOST_LD_FLAGS}" )
+set(HOST_CLANG_FLAGS "${DEFAULT_HOST_CLANG_FLAGS} ${EXTRA_HOST_CLANG_FLAGS}")
+set(HOST_LLC_FLAGS "${DEFAULT_HOST_LLC_FLAGS} ${EXTRA_HOST_LLC_FLAGS}")
+
+set(OCL_TARGETS "host")
+set(OCL_DRIVERS "basic pthreads")
 
 # TODO OCL_KERNEL_TARGET -> CPU_TARGET_TRIPLE
 # TODO OCL_KERNEL_TARGET_CPU -> OCL_KERNEL_TARGET_CPU
@@ -660,54 +684,80 @@ set(CELL_TARGET_LLC_FLAGS "" CACHE STRING "Extra parameters to LLVM's llc for CE
 set(OCL_KERNEL_TARGET "${LLC_TRIPLE}") #The kernel target triplet.
 set(OCL_KERNEL_TARGET_CPU "${LLC_HOST_CPU}") #The kernel target CPU variant.
 
-# TODO this is required
-# mostly used in tests to skip based on cpu
-#AC_SUBST([HOST_CPU], [$host_cpu])
-#AC_DEFINE_UNQUOTED([HOST_CPU], ["$host_cpu"], [The host CPU type.])
-
-
 ####################################################################
 
 # Determine which device drivers to build.
 
-set(OCL_DRIVERS "basic pthreads")
-
 if(NOT DEFINED DEFAULT_ENABLE_TCE)
 
-  find_program(TCE_CONFIG NAMES "tce-config")
-  if(TCE_CONFIG)
-
-    find_program(TCECC NAMES "tcecc")
-    set(TCE_USABLE 1)
+  set(HAVE_TCE 0)
+  set(HAVE_TCEMC 0)
 
+  if (NOT WITH_TCE)
+    set(WITH_TCE ENV PATH)
   endif()
 
-  if(TCE_CONFIG AND TCECC AND TCE_USABLE)
+  # THESE are only used in makefile.am & scripts/pocl*
+  set(TCE_TARGET_CLANG_FLAGS "" CACHE STRING "Extra parameters to Clang for TCE compilation.")
+  set(TCE_TARGET_LLC_FLAGS "" CACHE STRING "Extra parameters to LLVM's llc for TCE compilation.")
 
-    execute_process(COMMAND "${TCE_CONFIG}" --libs OUTPUT_VARIABLE TCE_LIBS RESULT_VARIABLE RESV)
-    string(STRIP "${TCE_LIBS}" TCE_LIBS)
-    set(TCE_LIBS "${TCE_LIBS}" CACHE INTERNAL "tce-config --libs")
-    # TODO
-    set(LD_FLAGS_BIN ${LD_FLAGS_BIN} ${TCE_LIBS})
+  find_program(TCE_CONFIG NAMES "tce-config" HINTS ${WITH_TCE})
+  find_program(TCECC NAMES "tcecc" HINTS ${WITH_TCE})
+  find_program(TTASIM NAMES "ttasim" HINTS ${WITH_TCE})
+
+  if(TCE_CONFIG AND TCECC AND TTASIM)
 
-    execute_process(COMMAND "${TCE_CONFIG}" --includes OUTPUT_VARIABLE TCE_INCLUDES RESULT_VARIABLE RESV)
+    message(STATUS "Found tcecc + tce-config + ttasim, testing setup")
+
+    get_filename_component(TCE_BASEDIR "${TCE_CONFIG}" DIRECTORY)
+    find_library(TCE_LIBS "tce" HINTS "${TCE_BASEDIR}/../lib" ENV PATH)
+    if(NOT TCE_LIBS)
+      execute_process(COMMAND "${TCE_CONFIG}" --libs OUTPUT_VARIABLE TCE_LIBS RESULT_VARIABLE RESV1)
+    endif()
+    execute_process(COMMAND "${TCE_CONFIG}" --includes OUTPUT_VARIABLE TCE_INCLUDES RESULT_VARIABLE RESV2)
+    execute_process(COMMAND "${TCE_CONFIG}" --version OUTPUT_VARIABLE TCE_VERSION RESULT_VARIABLE RESV3)
+    execute_process(COMMAND "${TCE_CONFIG}" --cxxflags OUTPUT_VARIABLE TCE_CXXFLAGS RESULT_VARIABLE RESV4)
+    execute_process(COMMAND "${TCE_CONFIG}" --prefix OUTPUT_VARIABLE TCE_PREFIX RESULT_VARIABLE RESV5)
+    execute_process(COMMAND "${TTASIM}" --help OUTPUT_VARIABLE TTASIM_HELP RESULT_VARIABLE RESV9)
+
+    if (RESV1 OR RESV2 OR RESV3 OR RESV4 OR RESV5)
+      message(WARNING "tce-config: Nonzero exit status, disabling TCE")
+    elseif (RESV9)
+      message(WARNING "ttasim: Nonzero exit status, disabling TCE")
+    else()
+
+    string(STRIP "${TCE_LIBS}" TCE_LIBS)
+    separate_arguments(TCE_LIBS)
     string(STRIP "${TCE_INCLUDES}" TCE_INCLUDES)
-    set(TCE_INCLUDES "${TCE_INCLUDES}" CACHE INTERNAL "tce-config --includes")
+    separate_arguments(TCE_INCLUDES)
+    string(STRIP "${TCE_CXXFLAGS}" TCE_CXXFLAGS)
+    separate_arguments(TCE_CXXFLAGS)
+    string(STRIP "${TCE_VERSION}" TCE_VERSION)
+    string(STRIP "${TCE_PREFIX}" TCE_PREFIX)
 
-    execute_process(COMMAND "${TCE_CONFIG}" --version OUTPUT_VARIABLE TCE_VERSION RESULT_VARIABLE RESV)
+    set(TCE_LIBS "${TCE_LIBS}" CACHE INTERNAL "tce-config --libs")
+    set(TCE_INCLUDES "${TCE_INCLUDES}" CACHE INTERNAL "tce-config --includes")
     set(TCE_VERSION "${TCE_VERSION}" CACHE INTERNAL "tce-config --version")
+    set(TCE_CXXFLAGS "${TCE_CXXFLAGS}" CACHE INTERNAL "tce-config --cxxflags")
+    set(TCE_PREFIX "${TCE_PREFIX}" CACHE INTERNAL "tce-config --prefix")
 
-    set(DEFAULT_ENABLE_TCE 1 CACHE INTERNAL "TCE available")
+    # TODO
+    set(LD_FLAGS_BIN ${LD_FLAGS_BIN} ${TCE_LIBS})
 
+    set(HAVE_TCE 1)
     if(TCE_VERSION MATCHES "trunk")
-      set(DEFAULT_ENABLE_TCEMC 1 CACHE INTERNAL "TCEMC available")
+      set(HAVE_TCEMC 1)
+    endif()
+
     endif()
 
   else()
-    set(DEFAULT_ENABLE_TCE 0 CACHE INTERNAL "TCE available")
-    set(DEFAULT_ENABLE_TCEMC 0 CACHE INTERNAL "TCEMC available")
+    message(STATUS "Failed to find tcecc, tce-config or ttasim, disabling TCE") # all three programs are required by the enclosing if()
   endif()
 
+  set(DEFAULT_ENABLE_TCE ${HAVE_TCE} CACHE INTERNAL "TCE available")
+  set(DEFAULT_ENABLE_TCEMC ${HAVE_TCEMC} CACHE INTERNAL "TCEMC available")
+
 endif()
 
 setup_cached_var(ENABLE_TCE "TCE"
@@ -721,38 +771,18 @@ if(ENABLE_TCE)
     set(ENABLE_TCEMC 1)
     set(OCL_DRIVERS "${OCL_DRIVERS} tcemc") # TCEMC is a "superset" of TCE (lp:tce) features.
   endif()
-else()
-  set(ENABLE_TCEMC 0)
-endif()
-
-##########################################################
-
-# Check if CellSPU support is found
-
-# The libspe version requirement is not strict. This is the only one tested.
-# SPU backend was removed in LLVM 3.3 (and we don't accept LLVM 3.1 anymore in pocl)
-# so compile the spu backend only when LLVM 3.2 is found
-
-if(NOT DEFINED DEFAULT_ENABLE_SPU)
-  set(_SPU 0)
-  if(LLVM_VERSION MATCHES "3.2")
-    pkg_check_modules(LIBSPE "libspe2>=2.2.80")
-    if(LIBSPE_FOUND)
-      set(_SPU 1)
-    endif()
-  else()
-    message(STATUS "Skipping cellspu target, needs llvm 3.2")
-  endif()
-  set(DEFAULT_ENABLE_SPU ${_SPU} CACHE INTERNAL "Build cell SPU")
-endif()
+  set(TCE_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16")
+  set(TEMP_EXT "${TCE_DEVICE_EXTENSIONS}")
+  set(TCE_DEVICE_EXTENSION_DEFINES "")
+  separate_arguments(TEMP_EXT)
+  foreach(EXT ${TEMP_EXT})
+    set(TCE_DEVICE_EXTENSION_DEFINES "${TCE_DEVICE_EXTENSION_DEFINES} -D${EXT}")
+  endforeach()
 
-setup_cached_var(ENABLE_SPU "Cell SPU"
-  "Requested enabling Cell SPU, but no usable libspe found !"
-  "Cell SPU available, but requested disabling it")
+  set(TCE_DEVICE_CL_VERSION "120")
 
-if(ENABLE_SPU)
-  set(OCL_DRIVERS "${OCL_DRIVERS} spu")
-  set(OCL_TARGETS "${OCL_TARGETS} cellspu")
+else()
+  set(ENABLE_TCEMC 0)
 endif()
 
 ##########################################################
@@ -763,6 +793,10 @@ if(ENABLE_HSA)
   # this is for config.h
   # TODO unify with autotools
   set(BUILD_HSA 1)
+
+  set(HSA_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64 cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics")
+  set(HSA_DEVICE_CL_VERSION "200")
+  find_path(HAVE_HSA_EXT_AMD_H "hsa_ext_amd.h" HINTS "${HSA_INCLUDEDIR}" ENV PATH)
 endif()
 
 ##########################################################
@@ -790,22 +824,6 @@ endif()
 
 ##########################################################
 
-# TODO
-
-#AC_MSG_NOTICE([Checking host compiler characteristics])
-#message(STATUS "Checking host compiler characteristics")
-
-#CFLAGS="-h $host"
-# -> TRIPLE = LLC_TRIPLE
-
-# HOST_SIZEOF_* is actually unused; some code still uses SIZEOF_* though.
-
-#AC_CHECK_SIZEOF([long])
-#AC_SUBST([HOST_SIZEOF_LONG], )
-CHECK_SIZEOF("__fp16" SIZEOF___FP16 ${LLC_TRIPLE})
-CHECK_SIZEOF("void*" SIZEOF_VOID_P ${LLC_TRIPLE})
-
-
 CHECK_ALIGNOF("float16" "typedef float float16  __attribute__((__ext_vector_type__(16)));" ALIGNOF_FLOAT16 ${LLC_TRIPLE})
 
 CHECK_ALIGNOF("double16" "typedef double double16  __attribute__((__ext_vector_type__(16)));" ALIGNOF_DOUBLE16 ${LLC_TRIPLE})
@@ -819,10 +837,10 @@ endif()
 ##########################################################
 
 # POCL_DEVICE_ADDRESS_BITS
-# Value based on host processor, for basic and pthreads devices
-if("${SIZEOF_VOID_P}" STREQUAL "8")
+# TODO rename to HOST address bits
+if(CMAKE_SIZEOF_VOID_P EQUAL 8)
   set(POCL_DEVICE_ADDRESS_BITS 64)
-elseif("${SIZEOF_VOID_P}" STREQUAL "4")
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
   set(POCL_DEVICE_ADDRESS_BITS 32)
 else()
   message(FATAL_ERROR "Cannot figure out POCL_DEVICE_ADDRESS_BITS")
@@ -869,12 +887,12 @@ endif()
 # 4:0:3 == 0.10 (currently backwards compatible with 0.7, thus age = 3).
 # 5:0:4 == 0.11 (currently backwards compatible with 0.7, thus age = 4).
 # 6:0:5 == 0.12 (currently backwards compatible with 0.7, thus age = 5).
+# 7:0:6 == 0.13 (currently backwards compatible with 0.7, thus age = 6).
 
-set(LIB_CURRENT_VERSION 6)
+set(LIB_CURRENT_VERSION 7)
 set(LIB_REVISION_VERSION 0)
-set(LIB_AGE_VERSION 5)
+set(LIB_AGE_VERSION 6)
 
-# LIB_FIRST_VERSION=$(($LIB_CURRENT_VERSION - $LIB_AGE_VERSION))
 math(EXPR LIB_FIRST_VERSION "${LIB_CURRENT_VERSION} - ${LIB_AGE_VERSION}")
 
 # libtool takes "c:r:a" arguments, but the result is "<lib>.so.(c-a).a.r"
@@ -888,28 +906,13 @@ set(LIB_API_VERSION "${LIB_FIRST_VERSION}")
 # drastically. Let's try to follow the similar 'current' numbering as
 # the pocl host API library and perhaps tune the 'revision' and 'age' later.
 
-# DONE - lib/llvmopencl..
-# AC_SUBST([KERNEL_COMPILER_LIB_VERSION], ["3:0:0"])
 set(KERNEL_COMPILER_LIB_VERSION "${LIB_CURRENT_VERSION}.0.0")
 
 ##########################################################
 
-function(rename_if_different SRC DST)
-  if(EXISTS "${DST}")
-    file(MD5 "${SRC}" OLD_MD5)
-    file(MD5 "${DST}" NEW_MD5)
-    if(NOT OLD_MD5 STREQUAL NEW_MD5)
-      file(RENAME "${SRC}" "${DST}")
-    endif()
-  else()
-    file(RENAME "${SRC}" "${DST}")
-  endif()
-endfunction()
-
 #TODO
 # these vars are copies b/c tons of sources use BUILD_ICD etc
 set(BUILD_ICD ${ENABLE_ICD})
-set(BUILD_SPU ${ENABLE_SPU})
 set(TCE_AVAILABLE ${ENABLE_TCE})
 set(TCEMC_AVAILABLE ${ENABLE_TCEMC})
 set(_CL_DISABLE_LONG ${CL_DISABLE_LONG})
@@ -970,30 +973,13 @@ message(STATUS "POCLU LINK OPTS: ${POCLU_LINK_OPTIONS}")
 # DONE - just pocl-standalone script
 add_subdirectory("scripts")
 
-# for tests & examples
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc")
-  set(POWERPC 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv7")
-  set(ARMV7 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv6")
-  set(ARMV6 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(i386|AMD64|x86_64)")
-  if(POCL_DEVICE_ADDRESS_BITS MATCHES "32")
-    set(I386 1)
-  else()
-    set(X86_64 1)
-  endif()
-endif()
-
-
 # TODO In progress
 add_subdirectory("tests")
 
 # TODO In progress
-set(ALL_TESTSUITES "AMD;AMDSDK2.9;opencl-book-samples;Parboil;Piglit;Rodinia;VexCL;ViennaCL;Halide;OpenCV;CloverLeaf;hsa")
 add_subdirectory("examples")
 
-add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND})
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} ${COMMAND_USES_TERMINAL})
 
 ##########################################################
 
@@ -1019,27 +1005,23 @@ MESSAGE(STATUS "******* Enabled features:")
 MESSAGE(STATUS " ")
 
 MESSAGE(STATUS "CLANG_SPIR: ${CLANG_SPIR}")
-MESSAGE(STATUS "CL_DISABLE_LONG (Disable cl_khr_int64): ${CL_DISABLE_LONG}")
 MESSAGE(STATUS "DIRECT_LINKAGE: ${DIRECT_LINKAGE}")
 MESSAGE(STATUS "ENABLE_DOCS: ${ENABLE_DOCS}")
 MESSAGE(STATUS "ENABLE_ICD: ${ENABLE_ICD}")
-MESSAGE(STATUS "ENABLE_SPU (Cell SPU enabled): ${ENABLE_SPU}")
 MESSAGE(STATUS "ENABLE_TCE: ${ENABLE_TCE}")
 MESSAGE(STATUS "ENABLE_TCEMC: ${ENABLE_TCEMC}")
 MESSAGE(STATUS "ENABLE_HSA: ${ENABLE_HSA}")
-MESSAGE(STATUS "HAVE_CLOCK_GETTIME: ${HAVE_CLOCK_GETTIME}")
-MESSAGE(STATUS "HAVE_GLEW: ${HAVE_GLEW}")
-MESSAGE(STATUS "HAVE_OPENCL_HPP: ${HAVE_OPENCL_HPP}")
 MESSAGE(STATUS "INSTALL_OPENCL_HEADERS (Install our headers): ${INSTALL_OPENCL_HEADERS}")
-MESSAGE(STATUS "NEW_PRINTF_WORKS: ${NEW_PRINTF_WORKS}")
 MESSAGE(STATUS "OCL_DRIVERS (Drivers built): ${OCL_DRIVERS}")
 MESSAGE(STATUS "OCL_TARGETS (Targets built): ${OCL_TARGETS}")
 MESSAGE(STATUS "STATIC_LLVM: ${STATIC_LLVM}")
 MESSAGE(STATUS "TESTS_USE_ICD: ${TESTS_USE_ICD}")
 MESSAGE(STATUS "USE_VECMATHLIB: ${USE_VECMATHLIB}")
 MESSAGE(STATUS "Available testsuites: ${ALL_TESTSUITES}")
-MESSAGE(STATUS "Enabled testsuites: ${ENABLED_TESTSUITES}")
-MESSAGE(STATUS "Kernel caching: ${KERNEL_CACHE}")
+MESSAGE(STATUS "Enabled testsuites: ${ACTUALLY_ENABLED_TESTSUITES}")
+MESSAGE(STATUS "Kernel caching: ${KERNEL_CACHE_DEFAULT}")
+MESSAGE(STATUS "Kernel library CPU variants: ${KERNELLIB_HOST_CPU_VARIANTS}")
+MESSAGE(STATUS "Kernel library distro build: ${KERNELLIB_HOST_DISTRO_VARIANTS}")
 
 MESSAGE(STATUS " ")
 MESSAGE(STATUS "******* Programs:")
@@ -1047,7 +1029,7 @@ MESSAGE(STATUS " ")
 
 MESSAGE(STATUS "LLVM_CONFIG: ${LLVM_CONFIG}")
 MESSAGE(STATUS "LLVM_OPT: ${LLVM_OPT}")
-MESSAGE(STATUS "LLC: ${LLC}")
+MESSAGE(STATUS "LLVM_LLC: ${LLVM_LLC}")
 MESSAGE(STATUS "LLVM_AS: ${LLVM_AS}")
 MESSAGE(STATUS "LLVM_LINK: ${LLVM_LINK}")
 MESSAGE(STATUS "LLVM_LLI: ${LLVM_LLI}")
@@ -1058,12 +1040,12 @@ MESSAGE(STATUS " ")
 MESSAGE(STATUS "******* Various Flags:")
 MESSAGE(STATUS " ")
 
-MESSAGE(STATUS "CELL_TARGET_CLANG_FLAGS: ${CELL_TARGET_CLANG_FLAGS}")
-MESSAGE(STATUS "CELL_TARGET_LLC_FLAGS: ${CELL_TARGET_LLC_FLAGS}")
 MESSAGE(STATUS "CLANG_MARCH_FLAG: ${CLANG_MARCH_FLAG}")
 MESSAGE(STATUS "CLANG_TARGET_OPTION: ${CLANG_TARGET_OPTION}")
 MESSAGE(STATUS "CL_DISABLE_HALF: ${CL_DISABLE_HALF}")
 MESSAGE(STATUS "CL_DISABLE_LONG: ${CL_DISABLE_LONG}")
+MESSAGE(STATUS "HAVE_CLOCK_GETTIME: ${HAVE_CLOCK_GETTIME}")
+MESSAGE(STATUS "HAVE_GLEW: ${HAVE_GLEW}")
 MESSAGE(STATUS "HOST_AS_FLAGS: ${HOST_AS_FLAGS}")
 MESSAGE(STATUS "HOST_CLANG_FLAGS: ${HOST_CLANG_FLAGS}")
 MESSAGE(STATUS "HOST_LD_FLAGS: ${HOST_LD_FLAGS}")
@@ -1074,7 +1056,8 @@ MESSAGE(STATUS "HSAIL_ASM: ${HSAIL_ASM}")
 MESSAGE(STATUS "ICD_LD_FLAGS: ${ICD_LD_FLAGS}")
 MESSAGE(STATUS "KERNEL_CLANGXX_FLAGS: ${KERNEL_CLANGXX_FLAGS}")
 MESSAGE(STATUS "KERNEL_CL_FLAGS: ${KERNEL_CL_FLAGS}")
-MESSAGE(STATUS "LD_FLAGS_BIN: ${LD_FLAGS_BIN}")
+MESSAGE(STATUS "PTHREAD_LDFLAGS: ${PTHREAD_LDFLAGS}")
+MESSAGE(STATUS "PTHREAD_CFLAGS: ${PTHREAD_CFLAGS}")
 MESSAGE(STATUS "LIB_API_VERSION: ${LIB_API_VERSION}")
 MESSAGE(STATUS "LIB_BUILD_VERSION: ${LIB_BUILD_VERSION}")
 MESSAGE(STATUS "LLVM_VERSION: ${LLVM_VERSION}")
@@ -1088,18 +1071,6 @@ MESSAGE(STATUS "LLVM_INCLUDEDIR: ${LLVM_INCLUDEDIR}")
 MESSAGE(STATUS "LLVM_SRC_ROOT: ${LLVM_SRC_ROOT}")
 MESSAGE(STATUS "LLVM_OBJ_ROOT: ${LLVM_OBJ_ROOT}")
 MESSAGE(STATUS "LLVM_INCLUDE_DIRS: ${LLVM_INCLUDE_DIRS}")
-MESSAGE(STATUS "LLVM_SYSLIBS: ${LLVM_SYSLIBS}")
-MESSAGE(STATUS "----------- LLVM_LIBS (LLVM SHARED LIBS) --------")
-MESSAGE(STATUS "${LLVM_LIBNAMES}")
-MESSAGE(STATUS "----------- LLVM_LIBFILES (LLVM STATIC LIBS) --------")
-MESSAGE(STATUS "${LLVM_LIBFILES}")
-MESSAGE(STATUS "----------- CLANG_LIBS (LLVM SHARED LIBS) --------")
-MESSAGE(STATUS "${CLANG_LIBNAMES}")
-MESSAGE(STATUS "----------- CLANG_LIBFILES (LLVM STATIC LIBS) --------")
-MESSAGE(STATUS "${CLANG_LIBFILES}")
-MESSAGE(STATUS "----------- -------------------------------- --------")
-MESSAGE(STATUS "POCL_LLVM_LIBS: ${POCL_LLVM_LIBS}")
-MESSAGE(STATUS "----------- -------------------------------- --------")
 MESSAGE(STATUS "LLVM_ALL_TARGETS: ${LLVM_ALL_TARGETS}")
 MESSAGE(STATUS "LLVM_HOST_TARGET: ${LLVM_HOST_TARGET}")
 MESSAGE(STATUS "LLC_TRIPLE: ${LLC_TRIPLE}")
@@ -1110,6 +1081,19 @@ MESSAGE(STATUS "OCL_KERNEL_TARGET_CPU: ${OCL_KERNEL_TARGET_CPU}")
 MESSAGE(STATUS "POCL_DEVICE_ADDRESS_BITS: ${POCL_DEVICE_ADDRESS_BITS}")
 MESSAGE(STATUS "TCE_TARGET_CLANG_FLAGS: ${TCE_TARGET_CLANG_FLAGS}")
 MESSAGE(STATUS "TCE_TARGET_LLC_FLAGS: ${TCE_TARGET_LLC_FLAGS}")
-
-
-MESSAGE(STATUS " ")
+MESSAGE(STATUS "TCE_CXXFLAGS: ${TCE_CXXFLAGS}")
+MESSAGE(STATUS "TCE_INCLUDES: ${TCE_INCLUDES}")
+MESSAGE(STATUS "TCE_LIBS: ${TCE_LIBS}")
+MESSAGE(STATUS "TCE_VERSION: ${TCE_VERSION}")
+MESSAGE(STATUS "TCE_PREFIX: ${TCE_PREFIX}")
+MESSAGE(STATUS "")
+MESSAGE(STATUS "----------- -------------------------------- --------")
+MESSAGE(STATUS "llvm libs libpocl will be linked to (POCL_LLVM_LIBS):")
+MESSAGE(STATUS "${POCL_LLVM_LIBS}")
+MESSAGE(STATUS "----------- -------------------------------- --------")
+MESSAGE(STATUS "clang libs libpocl will be linked to (CLANG_LIBFILES):")
+MESSAGE(STATUS "${CLANG_LIBFILES}")
+MESSAGE(STATUS "----------- -------------------------------- --------")
+MESSAGE(STATUS "system libs libpocl will be linked to (LLVM_SYSLIBS):")
+MESSAGE(STATUS "${LLVM_SYSLIBS}")
+MESSAGE(STATUS "----------- -------------------------------- --------")
diff --git a/CREDITS b/CREDITS
index 5d2e905..de7a1e2 100644
--- a/CREDITS
+++ b/CREDITS
@@ -44,3 +44,4 @@ Chen Chou-chuan <ccchen at pllab.cs.nthu.edu.tw>
 Shao-chung Wang <scwang at pllab.cs.nthu.edu.tw>
 Pavan Yalamanchili <pavan at arrayfire.com>
 Romaric Jodin <rjodin at kalray.eu>
+Masataro Asai <guicho2.71828 at gmail.com>
diff --git a/INSTALL b/INSTALL
deleted file mode 100644
index e264cfa..0000000
--- a/INSTALL
+++ /dev/null
@@ -1,429 +0,0 @@
-Requirements
-============
-
-In order to build pocl, you need the following support libraries and
-tools:
-
-  * LLVM & Clang
-  * GNU make
-  * libtool dlopen wrapper files (e.g. libltdl3-dev in Debian)
-  * pthread (should be installed by default)
-  * hwloc v1.0 or newer (e.g. libhwloc-dev)
-  * pkg-config
-  * autotools or cmake
-
-IMPORTANT NOTE! In order to use LLVM with pocl you need to configure LLVM 
-with '--enable-shared' switch and some platforms also require that you 
-compile it with 'make REQUIRES_RTTI=1', as follows:
-
-  ./configure --enable-shared --prefix=YOUR_INSTALLATION_PREFIX_HERE 
-  make REQUIRES_RTTI=1 && make install
-
-Build using autotools
----------------------
-
-After all the requirements are installed. The installation procedure
-follows the usual autotools build+install. If you are using a development
-source tree, you need to generate the autotool build files with 
-
-  "./autogen.sh".
-
-NOTE: automake 1.11 is known to work,
-      automake 1.96 might not work
-
-Build using cmake
------------------
-Cmake version 2.8.12 or higher is required.
-
-NOTE cmake buildsystem in pocl is not feature complete (compared to autotools);
-in particular, if you want to use external testsuites, you have to build using
-autotools. For more information on current status of cmake in pocl,
-see https://github.com/pocl/pocl/wiki/CMake-status
-
-The build+install is the usual cmake way:
-  cd <directory-with-pocl-sources>
-  mkdir build
-  cd build
-  cmake [-D<option>=<value> ...] ..
-
-To see the default values, run "cmake .." without any options, it will
-produce a summary.
-
-A few useful options are:
-  -DWITH_LLVM_CONFIG=<path-to-llvm-config> allows you to choose your llvm
-     installation to build against
-  -DENABLE_ICD=<ON/OFF> enable / disable icd
-  -DPOCL_INSTALL_<item>_DIR=<path> A path to install specific items into.
-
-
-Known issues
-------------
-
-There are unsolved issues and bugs in pocl. See the bug listing
-for a complete listing at https://bugs.launchpad.net/pocl
-
-Known issues not related to pocl are listed below.
-
-* Using Clang compiled with gcc 4.7 causes indeterminism in the
-kernel compilation results. See the LLVM bug report: 
-http://llvm.org/bugs/show_bug.cgi?id=12945
-
-* autogen.sh whines about AC_MSG_ERROR(). This happens (for some reason)
-if you do not have pkg-config installed.
-
-The standard build installations as provided by FSF are as follows:
-
-Basic Installation
-==================
-
-   Briefly, the shell commands `./configure; make; make install' should
-configure, build, and install this package.  The following
-more-detailed instructions are generic; see the `README' file for
-instructions specific to this package.  Some packages provide this
-`INSTALL' file but do not implement all of the features documented
-below.  The lack of an optional feature in a given package is not
-necessarily a bug.  More recommendations for GNU packages can be found
-in *note Makefile Conventions: (standards)Makefile Conventions.
-
-   The `configure' shell script attempts to guess correct values for
-various system-dependent variables used during compilation.  It uses
-those values to create a `Makefile' in each directory of the package.
-It may also create one or more `.h' files containing system-dependent
-definitions.  Finally, it creates a shell script `config.status' that
-you can run in the future to recreate the current configuration, and a
-file `config.log' containing compiler output (useful mainly for
-debugging `configure').
-
-   It can also use an optional file (typically called `config.cache'
-and enabled with `--cache-file=config.cache' or simply `-C') that saves
-the results of its tests to speed up reconfiguring.  Caching is
-disabled by default to prevent problems with accidental use of stale
-cache files.
-
-   If you need to do unusual things to compile the package, please try
-to figure out how `configure' could check whether to do them, and mail
-diffs or instructions to the address given in the `README' so they can
-be considered for the next release.  If you are using the cache, and at
-some point `config.cache' contains results you don't want to keep, you
-may remove or edit it.
-
-   The file `configure.ac' (or `configure.in') is used to create
-`configure' by a program called `autoconf'.  You need `configure.ac' if
-you want to change it or regenerate `configure' using a newer version
-of `autoconf'.
-
-   The simplest way to compile this package is:
-
-  1. `cd' to the directory containing the package's source code and type
-     `./configure' to configure the package for your system.
-
-     Running `configure' might take a while.  While running, it prints
-     some messages telling which features it is checking for.
-
-  2. Type `make' to compile the package.
-
-  3. Optionally, type `make check' to run any self-tests that come with
-     the package, generally using the just-built uninstalled binaries.
-
-  4. Type `make install' to install the programs and any data files and
-     documentation.  When installing into a prefix owned by root, it is
-     recommended that the package be configured and built as a regular
-     user, and only the `make install' phase executed with root
-     privileges.
-
-  5. Optionally, type `make installcheck' to repeat any self-tests, but
-     this time using the binaries in their final installed location.
-     This target does not install anything.  Running this target as a
-     regular user, particularly if the prior `make install' required
-     root privileges, verifies that the installation completed
-     correctly.
-
-  6. You can remove the program binaries and object files from the
-     source code directory by typing `make clean'.  To also remove the
-     files that `configure' created (so you can compile the package for
-     a different kind of computer), type `make distclean'.  There is
-     also a `make maintainer-clean' target, but that is intended mainly
-     for the package's developers.  If you use it, you may have to get
-     all sorts of other programs in order to regenerate files that came
-     with the distribution.
-
-  7. Often, you can also type `make uninstall' to remove the installed
-     files again.  In practice, not all packages have tested that
-     uninstallation works correctly, even though it is required by the
-     GNU Coding Standards.
-
-  8. Some packages, particularly those that use Automake, provide `make
-     distcheck', which can by used by developers to test that all other
-     targets like `make install' and `make uninstall' work correctly.
-     This target is generally not run by end users.
-
-Compilers and Options
-=====================
-
-   Some systems require unusual options for compilation or linking that
-the `configure' script does not know about.  Run `./configure --help'
-for details on some of the pertinent environment variables.
-
-   You can give `configure' initial values for configuration parameters
-by setting variables in the command line or in the environment.  Here
-is an example:
-
-     ./configure CC=c99 CFLAGS=-g LIBS=-lposix
-
-   *Note Defining Variables::, for more details.
-
-Compiling For Multiple Architectures
-====================================
-
-   You can compile the package for more than one kind of computer at the
-same time, by placing the object files for each architecture in their
-own directory.  To do this, you can use GNU `make'.  `cd' to the
-directory where you want the object files and executables to go and run
-the `configure' script.  `configure' automatically checks for the
-source code in the directory that `configure' is in and in `..'.  This
-is known as a "VPATH" build.
-
-   With a non-GNU `make', it is safer to compile the package for one
-architecture at a time in the source code directory.  After you have
-installed the package for one architecture, use `make distclean' before
-reconfiguring for another architecture.
-
-   On MacOS X 10.5 and later systems, you can create libraries and
-executables that work on multiple system types--known as "fat" or
-"universal" binaries--by specifying multiple `-arch' options to the
-compiler but only a single `-arch' option to the preprocessor.  Like
-this:
-
-     ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
-                 CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
-                 CPP="gcc -E" CXXCPP="g++ -E"
-
-   This is not guaranteed to produce working output in all cases, you
-may have to build one architecture at a time and combine the results
-using the `lipo' tool if you have problems.
-
-Installation Names
-==================
-
-   By default, `make install' installs the package's commands under
-`/usr/local/bin', include files under `/usr/local/include', etc.  You
-can specify an installation prefix other than `/usr/local' by giving
-`configure' the option `--prefix=PREFIX', where PREFIX must be an
-absolute file name.
-
-   You can specify separate installation prefixes for
-architecture-specific files and architecture-independent files.  If you
-pass the option `--exec-prefix=PREFIX' to `configure', the package uses
-PREFIX as the prefix for installing programs and libraries.
-Documentation and other data files still use the regular prefix.
-
-   In addition, if you use an unusual directory layout you can give
-options like `--bindir=DIR' to specify different values for particular
-kinds of files.  Run `configure --help' for a list of the directories
-you can set and what kinds of files go in them.  In general, the
-default for these options is expressed in terms of `${prefix}', so that
-specifying just `--prefix' will affect all of the other directory
-specifications that were not explicitly provided.
-
-   The most portable way to affect installation locations is to pass the
-correct locations to `configure'; however, many packages provide one or
-both of the following shortcuts of passing variable assignments to the
-`make install' command line to change installation locations without
-having to reconfigure or recompile.
-
-   The first method involves providing an override variable for each
-affected directory.  For example, `make install
-prefix=/alternate/directory' will choose an alternate location for all
-directory configuration variables that were expressed in terms of
-`${prefix}'.  Any directories that were specified during `configure',
-but not in terms of `${prefix}', must each be overridden at install
-time for the entire installation to be relocated.  The approach of
-makefile variable overrides for each directory variable is required by
-the GNU Coding Standards, and ideally causes no recompilation.
-However, some platforms have known limitations with the semantics of
-shared libraries that end up requiring recompilation when using this
-method, particularly noticeable in packages that use GNU Libtool.
-
-   The second method involves providing the `DESTDIR' variable.  For
-example, `make install DESTDIR=/alternate/directory' will prepend
-`/alternate/directory' before all installation names.  The approach of
-`DESTDIR' overrides is not required by the GNU Coding Standards, and
-does not work on platforms that have drive letters.  On the other hand,
-it does better at avoiding recompilation issues, and works well even
-when some directory options were not specified in terms of `${prefix}'
-at `configure' time.
-
-Optional Features
-=================
-
-   If the package supports it, you can cause programs to be installed
-with an extra prefix or suffix on their names by giving `configure' the
-option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
-
-   Some packages pay attention to `--enable-FEATURE' options to
-`configure', where FEATURE indicates an optional part of the package.
-They may also pay attention to `--with-PACKAGE' options, where PACKAGE
-is something like `gnu-as' or `x' (for the X Window System).  The
-`README' should mention any `--enable-' and `--with-' options that the
-package recognizes.
-
-   For packages that use the X Window System, `configure' can usually
-find the X include and library files automatically, but if it doesn't,
-you can use the `configure' options `--x-includes=DIR' and
-`--x-libraries=DIR' to specify their locations.
-
-   Some packages offer the ability to configure how verbose the
-execution of `make' will be.  For these packages, running `./configure
---enable-silent-rules' sets the default to minimal output, which can be
-overridden with `make V=1'; while running `./configure
---disable-silent-rules' sets the default to verbose, which can be
-overridden with `make V=0'.
-
-Particular systems
-==================
-
-   On HP-UX, the default C compiler is not ANSI C compatible.  If GNU
-CC is not installed, it is recommended to use the following options in
-order to use an ANSI C compiler:
-
-     ./configure CC="cc -Ae -D_XOPEN_SOURCE=500"
-
-and if that doesn't work, install pre-built binaries of GCC for HP-UX.
-
-   On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
-parse its `<wchar.h>' header file.  The option `-nodtk' can be used as
-a workaround.  If GNU CC is not installed, it is therefore recommended
-to try
-
-     ./configure CC="cc"
-
-and if that doesn't work, try
-
-     ./configure CC="cc -nodtk"
-
-   On Solaris, don't put `/usr/ucb' early in your `PATH'.  This
-directory contains several dysfunctional programs; working variants of
-these programs are available in `/usr/bin'.  So, if you need `/usr/ucb'
-in your `PATH', put it _after_ `/usr/bin'.
-
-   On Haiku, software installed for all users goes in `/boot/common',
-not `/usr/local'.  It is recommended to use the following options:
-
-     ./configure --prefix=/boot/common
-
-Specifying the System Type
-==========================
-
-   There may be some features `configure' cannot figure out
-automatically, but needs to determine by the type of machine the package
-will run on.  Usually, assuming the package is built to be run on the
-_same_ architectures, `configure' can figure that out, but if it prints
-a message saying it cannot guess the machine type, give it the
-`--build=TYPE' option.  TYPE can either be a short name for the system
-type, such as `sun4', or a canonical name which has the form:
-
-     CPU-COMPANY-SYSTEM
-
-where SYSTEM can have one of these forms:
-
-     OS
-     KERNEL-OS
-
-   See the file `config.sub' for the possible values of each field.  If
-`config.sub' isn't included in this package, then this package doesn't
-need to know the machine type.
-
-   If you are _building_ compiler tools for cross-compiling, you should
-use the option `--target=TYPE' to select the type of system they will
-produce code for.
-
-   If you want to _use_ a cross compiler, that generates code for a
-platform different from the build platform, you should specify the
-"host" platform (i.e., that on which the generated programs will
-eventually be run) with `--host=TYPE'.
-
-Sharing Defaults
-================
-
-   If you want to set default values for `configure' scripts to share,
-you can create a site shell script called `config.site' that gives
-default values for variables like `CC', `cache_file', and `prefix'.
-`configure' looks for `PREFIX/share/config.site' if it exists, then
-`PREFIX/etc/config.site' if it exists.  Or, you can set the
-`CONFIG_SITE' environment variable to the location of the site script.
-A warning: not all `configure' scripts look for a site script.
-
-Defining Variables
-==================
-
-   Variables not defined in a site shell script can be set in the
-environment passed to `configure'.  However, some packages may run
-configure again during the build, and the customized values of these
-variables may be lost.  In order to avoid this problem, you should set
-them in the `configure' command line, using `VAR=value'.  For example:
-
-     ./configure CC=/usr/local2/bin/gcc
-
-causes the specified `gcc' to be used as the C compiler (unless it is
-overridden in the site shell script).
-
-Unfortunately, this technique does not work for `CONFIG_SHELL' due to
-an Autoconf bug.  Until the bug is fixed you can use this workaround:
-
-     CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
-
-`configure' Invocation
-======================
-
-   `configure' recognizes the following options to control how it
-operates.
-
-`--help'
-`-h'
-     Print a summary of all of the options to `configure', and exit.
-
-`--help=short'
-`--help=recursive'
-     Print a summary of the options unique to this package's
-     `configure', and exit.  The `short' variant lists options used
-     only in the top level, while the `recursive' variant lists options
-     also present in any nested packages.
-
-`--version'
-`-V'
-     Print the version of Autoconf used to generate the `configure'
-     script, and exit.
-
-`--cache-file=FILE'
-     Enable the cache: use and save the results of the tests in FILE,
-     traditionally `config.cache'.  FILE defaults to `/dev/null' to
-     disable caching.
-
-`--config-cache'
-`-C'
-     Alias for `--cache-file=config.cache'.
-
-`--quiet'
-`--silent'
-`-q'
-     Do not print messages saying which checks are being made.  To
-     suppress all normal output, redirect it to `/dev/null' (any error
-     messages will still be shown).
-
-`--srcdir=DIR'
-     Look for the package's source code in directory DIR.  Usually
-     `configure' can determine that directory automatically.
-
-`--prefix=DIR'
-     Use DIR as the installation prefix.  *note Installation Names::
-     for more details, including other options available for fine-tuning
-     the installation locations.
-
-`--no-create'
-`-n'
-     Run the configure checks, but stop before creating any output
-     files.
-
-`configure' also accepts some other, not widely useful, options.  Run
-`configure --help' for more details.
-
diff --git a/INSTALL b/INSTALL
new file mode 120000
index 0000000..6b3ea26
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1 @@
+doc/sphinx/source/install.rst
\ No newline at end of file
diff --git a/Makefile.am b/Makefile.am
index 6d753e7..36f8eb7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -59,8 +59,8 @@ endif
 .PHONY: ${PHONIES} prepare-examples clean-examples
 
 EXTRA_DIST = config/xclang tools/data/test_machine.adf tools/data/test_machine_fp16.adf \
-  doc/build-envs.txt CHANGES fix-include pocl.icd.in README README.ARM README.Cell \
-  README.Windows README.mips README.OSX README.powerpc README.packaging CREDITS LICENSE \
+  doc/build-envs.txt CHANGES fix-include pocl.icd.in README README.ARM README.mips \
+  README.Windows README.OSX README.powerpc README.packaging CREDITS LICENSE \
   tools/uncrustify_cxx.cfg windows/setup_and_build_win64.sh README.mipsel
 
 # CMake
diff --git a/Makefile.in b/Makefile.in
index 95c1fa3..f2a5599 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -319,6 +319,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -326,6 +327,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -343,8 +345,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -420,6 +420,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -497,9 +498,9 @@ PHONIES = install-paths.h $(am__append_5)
 # Tools, patches etc
 EXTRA_DIST = config/xclang tools/data/test_machine.adf \
 	tools/data/test_machine_fp16.adf doc/build-envs.txt CHANGES \
-	fix-include pocl.icd.in README README.ARM README.Cell \
-	README.Windows README.mips README.OSX README.powerpc \
-	README.packaging CREDITS LICENSE tools/uncrustify_cxx.cfg \
+	fix-include pocl.icd.in README README.ARM README.mips \
+	README.Windows README.OSX README.powerpc README.packaging \
+	CREDITS LICENSE tools/uncrustify_cxx.cfg \
 	windows/setup_and_build_win64.sh README.mipsel CMakeLists.txt \
 	cmake/FindHwloc.cmake cmake/FindPthreadsWin32.cmake \
 	cmake/LLVM.cmake cmake/Sphinx.cmake cmake/bitcode_rules.cmake \
diff --git a/README.Cell b/README.Cell
deleted file mode 100644
index 5152caf..0000000
--- a/README.Cell
+++ /dev/null
@@ -1,94 +0,0 @@
-Notes on compiling pocl in Debian Sid/Cell/Playstation 3.
-
-The "default" debian for Powerpc has a 32bit userspace runtime, 
-even if the processor itself is a 64bit PowerPC. You can run 
-multilibs on debian to be able to build 64bit binaries in the
-32bit environment, but the 'dlopen' of 64 bit libraries in 32bit
-binaries will fail. Hence, both libpocl and the opencl kernels
-must be of the same bitwidth.
-
-status
-------
-32bit binaries are assumed, and built by default. If you know you
-have a 64bit userspace, you can override by giving
-	--enable-ppc64
-option to configure.
-
-The only thing that currently will fail are the tests that use 
-double precision floating points (see pocl bug #911911).
-
-powerpc64
----------
-Compiling in powerpc64 mode compiles the pocl but dlopening
-the kernels fail. The problem is likely the combination of
-32bit pocl libraries and the libtool dlopen library and
-the kernels which are compiled in 64bit mode by Clang.
-
-Forcing the build to use the 64bit mode by passing 
-CXXFLAGS=-m64 CFLAGS=-m64 fails at configure time because
-the libltdl lib is 32-bit only in this env and does not
-get detected.
-
-powerpc32
----------
-Configure with:
-./configure 
-
-This makes pocl to use the 32bit mode for the
-kernel compilation.
-
-
-spu device driver
------------------
-
-Initial implememtation is in place. It uses libspe2 
-to communucate with the spus.
-
-A context for one (current status - to be improved upon)
-spu is created at driver initialization stage. In this image, 
-we allocate memory for the opencl global/local buffers. This
-memory chunk is managed by buffalloc in the driver. See 
-lib/CL/drivrs/cellspu/cellspu.h for a memory map.
-
-clRead/WriteBuffer can operate directly on this context, 
-as it can be memory mapped in the host, and loading an
-ELF into this context doesn't seem to overwrite unused/
-uninitialized areas.
-
-At run time, a __kernel_exec_cmd data structure is 
-filled, similar to as with the TCE driver, this control
-structure is copied into the context, and the SPU is
-started. The SPU has a similar wrapper function as the TTA
-to parse this structure.
-
-This is still unfinished. Major parts missing are:
--get a non-trivial 'hello world' running (i.e. one with
-arguments).
--multithreaded driver. spe_context_run *blocks*, so 
-we need at least 1 thread/spu. Possibly more - depends
-on how the kernel schedules the SPU contexts.
-
-
-to do
------
-* Fix the 64bit build. 
-
-It requires forcing pocl to be built in the 64 bit mode
-(-m64) and all the required libraries (at least the 
-libltdl) to be found as 64 bit versions by configure.
-
-* A device layer implementation for the SPEs. 
-
-The SPEs have local memories and no random access to 
-a shared memory (AFAIK) so implementing the global buffers 
-across SPEs is problematic. Therefore, at least the
-first version could present one device per SPE thus require 
-one command queue per SPE to utilize fully.
-
-* The load balancing of multiple work groups from a single
-kernel execution across all the SPEs. 
-
-Might require changing the global accesses in the LLVM 
-bitcode to DMA transfer calls which possibly ruins
-the performance, but could be provided as an option.
-
diff --git a/README.FreeBSD b/README.FreeBSD
new file mode 100644
index 0000000..f6c8c39
--- /dev/null
+++ b/README.FreeBSD
@@ -0,0 +1,6 @@
+It should just work, provided you are aware of some known issues:
+
+* https://github.com/pocl/pocl/issues/263
+
+In short, you should build pocl with Clang 3.6 or newer even though the
+default Clang of FreeBSD can be older.
diff --git a/TODO.piglit b/TODO.piglit
new file mode 100644
index 0000000..984292c
--- /dev/null
+++ b/TODO.piglit
@@ -0,0 +1,29 @@
+Problems found by Piglit testing framework
+==========================================
+
+by: Victor Oliveira
+
+http://people.freedesktop.org/~nh/piglit/
+
+API
+---
+
+- clCreateKernel with non-existent kernel name
+- build options support is incomplete
+- clang accepts invalid opencl version in command-line
+- unimplemented parts in clReleaseProgram
+- clGetProgramBuildInfo returns an empty string
+- clSetKernelArg various error checks
+
+Execute
+-------
+
+- increment and decrement vector types (e.g. int4)
+- implicit conversion between vector types (e.g. int4 -> float4)
+
+Unimplemented
+-------------
+
+- clGetProgramInfo
+- clCreateKernelsInProgram
+
diff --git a/aclocal.m4 b/aclocal.m4
index b4b67cd..82ba9e8 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -20,32 +20,63 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])
 
-# pkg.m4 - Macros to locate and utilise pkg-config.            -*- Autoconf -*-
-# serial 1 (pkg-config-0.24)
-# 
-# Copyright © 2004 Scott James Remnant <scott at netsplit.com>.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# PKG_PROG_PKG_CONFIG([MIN-VERSION])
-# ----------------------------------
+dnl pkg.m4 - Macros to locate and utilise pkg-config.   -*- Autoconf -*-
+dnl serial 11 (pkg-config-0.29.1)
+dnl
+dnl Copyright © 2004 Scott James Remnant <scott at netsplit.com>.
+dnl Copyright © 2012-2015 Dan Nicholson <dbn.lists at gmail.com>
+dnl
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or
+dnl (at your option) any later version.
+dnl
+dnl This program is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software
+dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+dnl 02111-1307, USA.
+dnl
+dnl As a special exception to the GNU General Public License, if you
+dnl distribute this file as part of a program that contains a
+dnl configuration script generated by Autoconf, you may include it under
+dnl the same distribution terms that you use for the rest of that
+dnl program.
+
+dnl PKG_PREREQ(MIN-VERSION)
+dnl -----------------------
+dnl Since: 0.29
+dnl
+dnl Verify that the version of the pkg-config macros is at least
+dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's
+dnl installed version of pkg-config, this checks the developer's version
+dnl of pkg.m4 when generating configure.
+dnl
+dnl To ensure that this macro is defined, also add:
+dnl m4_ifndef([PKG_PREREQ],
+dnl     [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])])
+dnl
+dnl See the "Since" comment for each macro you use to see what version
+dnl of the macros you require.
+m4_defun([PKG_PREREQ],
+[m4_define([PKG_MACROS_VERSION], [0.29.1])
+m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1,
+    [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])])
+])dnl PKG_PREREQ
+
+dnl PKG_PROG_PKG_CONFIG([MIN-VERSION])
+dnl ----------------------------------
+dnl Since: 0.16
+dnl
+dnl Search for the pkg-config tool and set the PKG_CONFIG variable to the
+dnl first one found in the path. Checks that the version of pkg-config found
+dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is
+dnl used since that's the first version where most current features of
+dnl pkg-config existed.
 AC_DEFUN([PKG_PROG_PKG_CONFIG],
 [m4_pattern_forbid([^_?PKG_[A-Z_]+$])
 m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$])
@@ -67,18 +98,19 @@ if test -n "$PKG_CONFIG"; then
 		PKG_CONFIG=""
 	fi
 fi[]dnl
-])# PKG_PROG_PKG_CONFIG
+])dnl PKG_PROG_PKG_CONFIG
 
-# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
-#
-# Check to see whether a particular set of modules exists.  Similar
-# to PKG_CHECK_MODULES(), but does not set variables or print errors.
-#
-# Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
-# only at the first occurence in configure.ac, so if the first place
-# it's called might be skipped (such as if it is within an "if", you
-# have to call PKG_CHECK_EXISTS manually
-# --------------------------------------------------------------
+dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+dnl -------------------------------------------------------------------
+dnl Since: 0.18
+dnl
+dnl Check to see whether a particular set of modules exists. Similar to
+dnl PKG_CHECK_MODULES(), but does not set variables or print errors.
+dnl
+dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+dnl only at the first occurrence in configure.ac, so if the first place
+dnl it's called might be skipped (such as if it is within an "if"), you
+dnl have to call PKG_CHECK_EXISTS manually.
 AC_DEFUN([PKG_CHECK_EXISTS],
 [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
 if test -n "$PKG_CONFIG" && \
@@ -88,8 +120,10 @@ m4_ifvaln([$3], [else
   $3])dnl
 fi])
 
-# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
-# ---------------------------------------------
+dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
+dnl ---------------------------------------------
+dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting
+dnl pkg_failed based on the result.
 m4_define([_PKG_CONFIG],
 [if test -n "$$1"; then
     pkg_cv_[]$1="$$1"
@@ -101,10 +135,11 @@ m4_define([_PKG_CONFIG],
  else
     pkg_failed=untried
 fi[]dnl
-])# _PKG_CONFIG
+])dnl _PKG_CONFIG
 
-# _PKG_SHORT_ERRORS_SUPPORTED
-# -----------------------------
+dnl _PKG_SHORT_ERRORS_SUPPORTED
+dnl ---------------------------
+dnl Internal check to see if pkg-config supports short errors.
 AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
 [AC_REQUIRE([PKG_PROG_PKG_CONFIG])
 if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
@@ -112,19 +147,17 @@ if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
 else
         _pkg_short_errors_supported=no
 fi[]dnl
-])# _PKG_SHORT_ERRORS_SUPPORTED
+])dnl _PKG_SHORT_ERRORS_SUPPORTED
 
 
-# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
-# [ACTION-IF-NOT-FOUND])
-#
-#
-# Note that if there is a possibility the first call to
-# PKG_CHECK_MODULES might not happen, you should be sure to include an
-# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
-#
-#
-# --------------------------------------------------------------
+dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+dnl   [ACTION-IF-NOT-FOUND])
+dnl --------------------------------------------------------------
+dnl Since: 0.4.0
+dnl
+dnl Note that if there is a possibility the first call to
+dnl PKG_CHECK_MODULES might not happen, you should be sure to include an
+dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
 AC_DEFUN([PKG_CHECK_MODULES],
 [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
 AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
@@ -178,16 +211,40 @@ else
         AC_MSG_RESULT([yes])
 	$3
 fi[]dnl
-])# PKG_CHECK_MODULES
+])dnl PKG_CHECK_MODULES
 
 
-# PKG_INSTALLDIR(DIRECTORY)
-# -------------------------
-# Substitutes the variable pkgconfigdir as the location where a module
-# should install pkg-config .pc files. By default the directory is
-# $libdir/pkgconfig, but the default can be changed by passing
-# DIRECTORY. The user can override through the --with-pkgconfigdir
-# parameter.
+dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+dnl   [ACTION-IF-NOT-FOUND])
+dnl ---------------------------------------------------------------------
+dnl Since: 0.29
+dnl
+dnl Checks for existence of MODULES and gathers its build flags with
+dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags
+dnl and VARIABLE-PREFIX_LIBS from --libs. Implemented as a call to
+dnl PKG_CHECK_MODULES with " --static" temporarily appended to PKG_CONFIG.
+dnl
+dnl Note that if the first call to PKG_CHECK_MODULES_STATIC might not
+dnl happen, you should be sure to include an explicit call to
+dnl PKG_PROG_PKG_CONFIG in your configure.ac.
+AC_DEFUN([PKG_CHECK_MODULES_STATIC],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+_save_PKG_CONFIG=$PKG_CONFIG
+PKG_CONFIG="$PKG_CONFIG --static"
+PKG_CHECK_MODULES($@)
+PKG_CONFIG=$_save_PKG_CONFIG[]dnl
+])dnl PKG_CHECK_MODULES_STATIC
+
+
+dnl PKG_INSTALLDIR([DIRECTORY])
+dnl -------------------------
+dnl Since: 0.27
+dnl
+dnl Substitutes the variable pkgconfigdir as the location where a module
+dnl should install pkg-config .pc files. By default the directory is
+dnl $libdir/pkgconfig, but the default can be changed by passing
+dnl DIRECTORY. The user can override through the --with-pkgconfigdir
+dnl parameter.
 AC_DEFUN([PKG_INSTALLDIR],
 [m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])])
 m4_pushdef([pkg_description],
@@ -198,16 +255,18 @@ AC_ARG_WITH([pkgconfigdir],
 AC_SUBST([pkgconfigdir], [$with_pkgconfigdir])
 m4_popdef([pkg_default])
 m4_popdef([pkg_description])
-]) dnl PKG_INSTALLDIR
+])dnl PKG_INSTALLDIR
 
 
-# PKG_NOARCH_INSTALLDIR(DIRECTORY)
-# -------------------------
-# Substitutes the variable noarch_pkgconfigdir as the location where a
-# module should install arch-independent pkg-config .pc files. By
-# default the directory is $datadir/pkgconfig, but the default can be
-# changed by passing DIRECTORY. The user can override through the
-# --with-noarch-pkgconfigdir parameter.
+dnl PKG_NOARCH_INSTALLDIR([DIRECTORY])
+dnl --------------------------------
+dnl Since: 0.27
+dnl
+dnl Substitutes the variable noarch_pkgconfigdir as the location where a
+dnl module should install arch-independent pkg-config .pc files. By
+dnl default the directory is $datadir/pkgconfig, but the default can be
+dnl changed by passing DIRECTORY. The user can override through the
+dnl --with-noarch-pkgconfigdir parameter.
 AC_DEFUN([PKG_NOARCH_INSTALLDIR],
 [m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])])
 m4_pushdef([pkg_description],
@@ -218,13 +277,15 @@ AC_ARG_WITH([noarch-pkgconfigdir],
 AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir])
 m4_popdef([pkg_default])
 m4_popdef([pkg_description])
-]) dnl PKG_NOARCH_INSTALLDIR
+])dnl PKG_NOARCH_INSTALLDIR
 
 
-# PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE,
-# [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
-# -------------------------------------------
-# Retrieves the value of the pkg-config variable for the given module.
+dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE,
+dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+dnl -------------------------------------------
+dnl Since: 0.28
+dnl
+dnl Retrieves the value of the pkg-config variable for the given module.
 AC_DEFUN([PKG_CHECK_VAR],
 [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
 AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl
@@ -233,7 +294,7 @@ _PKG_CONFIG([$1], [variable="][$3]["], [$2])
 AS_VAR_COPY([$1], [pkg_cv_][$1])
 
 AS_VAR_IF([$1], [""], [$5], [$4])dnl
-])# PKG_CHECK_VAR
+])dnl PKG_CHECK_VAR
 
 # Copyright (C) 2002-2014 Free Software Foundation, Inc.
 #
diff --git a/android/CLONE_POCL_PREBUILTS_HERE b/android/CLONE_POCL_PREBUILTS_HERE
new file mode 100644
index 0000000..17973fc
--- /dev/null
+++ b/android/CLONE_POCL_PREBUILTS_HERE
@@ -0,0 +1 @@
+clone pocl-android-prebuilts from https://github.com/krrishnarraj/pocl-android-prebuilts
diff --git a/android/build-arm.sh b/android/build-arm.sh
new file mode 100755
index 0000000..a09cc45
--- /dev/null
+++ b/android/build-arm.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+#
+# Build script for Android
+#
+#   Copyright (c) 2014 Krishnaraj R Bhat (krrishnarraj at gmail.com)
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+# Usage: build-arm.sh [release]
+# default - builds debug version for quick testing
+# release - builds release version with flto options. Much Much slower
+
+# Invocation directory (root for the prebuilt lookups below) and the calling
+# user/group (used to chown the install prefix).
+PWD=$(pwd)
+I_AM=$(id -un)
+MY_GROUP=$(id -gn)
+ANDROID_TOOLCHAIN=/tmp/android-toolchain/
+
+echo "NDK standalone toolchain setup..."
+if [ ! -e "$ANDROID_NDK/build/tools/make-standalone-toolchain.sh" ]; then
+    echo "Install Android NDK and set environment variable ANDROID_NDK to its root"
+    # Stop here whether the script is sourced (return) or executed (exit).
+    return 1 2>/dev/null || exit 1
+fi
+"$ANDROID_NDK/build/tools/make-standalone-toolchain.sh" --toolchain=arm-linux-androideabi-4.9 \
+    --arch=arm --platform=android-16 --install-dir="$ANDROID_TOOLCHAIN"
+
+INSTALL_PREFIX=/data/data/org.pocl.libs/files/
+# Create the install prefix (the target location on Android) on the build host
+# and hand it to the invoking user; later copies into it run without sudo.
+if ! test -e "$INSTALL_PREFIX"; then
+    sudo mkdir -p "$INSTALL_PREFIX" "$INSTALL_PREFIX/lib/pkgconfig/"
+    sudo chown -R "$I_AM:$MY_GROUP" "$INSTALL_PREFIX"
+    sudo chmod -R 755 "$INSTALL_PREFIX"
+fi
+
+# Prebuilt LLVM that runs on Android and targets Android (required).
+LLVM_HOST_ANDROID_TARGET_ANDROID=$PWD/pocl-android-prebuilts/arm/llvm/android
+if [ ! -e "$LLVM_HOST_ANDROID_TARGET_ANDROID/lib/libclangFrontend.a" ]; then
+    echo "Build and place llvm(android) at " "$LLVM_HOST_ANDROID_TARGET_ANDROID"
+    return 1 2>/dev/null || exit 1  # stop whether sourced or executed
+fi
+# Populate the sysroot only once; later runs find libclangFrontend.a there.
+if [ ! -e "$ANDROID_TOOLCHAIN/sysroot/usr/lib/libclangFrontend.a" ]; then
+    echo "Copying llvm libs(android) to sysroot..."
+    cp -rf "$LLVM_HOST_ANDROID_TARGET_ANDROID"/* "$ANDROID_TOOLCHAIN/sysroot/usr/"
+fi
+
+# Prebuilt LLVM that runs on x64 and targets Android (required).
+LLVM_HOST_x64_TARGET_ANDROID=$PWD/pocl-android-prebuilts/arm/llvm/cross_compiler_for_android
+if [ ! -e "$LLVM_HOST_x64_TARGET_ANDROID/bin/clang" ]; then
+    echo "Build and place llvm runson(x64) -> target(android) at " "$LLVM_HOST_x64_TARGET_ANDROID"
+    return 1 2>/dev/null || exit 1  # stop whether sourced or executed
+fi
+# Same once-only copy for the cross compiler, keyed on bin/clang.
+if [ ! -e "$ANDROID_TOOLCHAIN/sysroot/usr/bin/clang" ]; then
+    echo "copying llvm(host) to sysroot...."
+    cp -rf "$LLVM_HOST_x64_TARGET_ANDROID"/* "$ANDROID_TOOLCHAIN/sysroot/usr/"
+fi
+
+# Required prebuilts: verify each one, then copy it where the build expects it.
+PREBUILT_NCURSES=$PWD/pocl-android-prebuilts/arm/ncurses
+if [ ! -e "$PREBUILT_NCURSES/lib/libncurses.a" ]; then
+    echo "Build and place ncurses for android at " "$PREBUILT_NCURSES"
+    return 1 2>/dev/null || exit 1  # stop whether sourced or executed
+fi
+echo "copying ncurses to sysroot...."
+cp -rf "$PREBUILT_NCURSES"/* "$ANDROID_TOOLCHAIN/sysroot/usr/"
+ln -sf "$ANDROID_TOOLCHAIN/sysroot/usr/lib/libncurses.a" "$ANDROID_TOOLCHAIN/sysroot/usr/lib/libcurses.a"
+
+PREBUILT_LTDL=$PWD/pocl-android-prebuilts/arm/libtool
+if [ ! -e "$PREBUILT_LTDL/lib/libltdl.a" ]; then
+    echo "Build and place libltdl for android at " "$PREBUILT_LTDL"
+    return 1 2>/dev/null || exit 1
+fi
+echo "copying ltdl to sysroot...."
+cp -rf "$PREBUILT_LTDL"/* "$ANDROID_TOOLCHAIN/sysroot/usr/"
+
+PREBUILT_HWLOC=$PWD/pocl-android-prebuilts/arm/hwloc
+if [ ! -e "$PREBUILT_HWLOC/lib/libhwloc.a" ]; then
+    echo "Build and place libhwloc for android at " "$PREBUILT_HWLOC"
+    return 1 2>/dev/null || exit 1
+fi
+echo "copying hwloc to sysroot...."
+cp -rf "$PREBUILT_HWLOC"/* "$ANDROID_TOOLCHAIN/sysroot/usr/"
+
+PREBUILT_BINUTILS=$PWD/pocl-android-prebuilts/arm/binutils
+if [ ! -e "$PREBUILT_BINUTILS/bin/ld" ]; then
+    echo "Build and place binutils for android at " "$PREBUILT_BINUTILS"
+    return 1 2>/dev/null || exit 1
+fi
+echo "copying ld to $INSTALL_PREFIX"
+cp -rf "$PREBUILT_BINUTILS"/* "$INSTALL_PREFIX/"
+
+# Alias libpthread.so and librt.so to libc.so and include/GL to include/GLES
+# inside the sysroot, then remove the sysroot's libstdc++.* files.
+ln -sf "$ANDROID_TOOLCHAIN/sysroot/usr/lib/libc.so" "$ANDROID_TOOLCHAIN/sysroot/usr/lib/libpthread.so"
+ln -sf "$ANDROID_TOOLCHAIN/sysroot/usr/lib/libc.so" "$ANDROID_TOOLCHAIN/sysroot/usr/lib/librt.so"
+ln -sf "$ANDROID_TOOLCHAIN/sysroot/usr/include/GLES" "$ANDROID_TOOLCHAIN/sysroot/usr/include/GL"
+rm "$ANDROID_TOOLCHAIN"/sysroot/usr/lib/libstdc++.*
+
+export PATH="$ANDROID_TOOLCHAIN/bin:$ANDROID_TOOLCHAIN/sysroot/usr/bin/:$PATH"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$ANDROID_TOOLCHAIN/sysroot/usr/lib/"
+export HOST=arm-linux-androideabi
+export PREFIX="$INSTALL_PREFIX"
+export SYSROOT="$ANDROID_TOOLCHAIN/sysroot/usr/"
+export TARGET_CPU="cortex-a9"
+
+# flto option in gcc 4.8 eats all memory & eventually /tmp. Better to place tmp file in disk
+export TMPDIR="$HOME/tmp/junk/"
+[ -e "$TMPDIR" ] || mkdir -p "$TMPDIR"
+
+
+#if [ ! -e $PWD/../configure ]; then
+#    cd ..; ./autogen.sh; cd -
+#fi
+
+DEBUG_BUILD=1  # debug unless the first argument is exactly "release"
+case "${1-}" in
+    release) DEBUG_BUILD=0 ;;
+esac
+
+if [ $DEBUG_BUILD == 1 ] ; then  # debug build: CMAKE_BUILD_TYPE Debug, LTO off (-fno-lto)
+#CC="arm-linux-androideabi-gcc  -static-libstdc++ " CXX="arm-linux-androideabi-g++  -static-libstdc++ "  ac_cv_c_bigendian=no LLC_HOST_CPU=$TARGET_CPU HWLOC_CFLAGS="-I"$ANDROID_TOOLCHAIN"/sysroot/usr/include" HWLOC_LIBS="-L"$ANDROID_TOOLCHAIN"/sysroot/usr/lib -lhwloc" CFLAGS=" -Os " CPPFLAGS=" -Os " LDFLAGS=" "  SYSROOTDIR=$ANDROID_TOOLCHAIN/sysroot/ ../configure --prefix=$PREFIX --host=$HOST --disable-icd --with-sysroot=$ANDROID_TOOLCHAIN/sysroot/ --enable-debug --verbose
+
+cmake -DCMAKE_TOOLCHAIN_FILE=androideabi.cmake -DCMAKE_BUILD_TYPE:STRING=Debug -DCMAKE_AR:FILEPATH=$HOST-gcc-ar -DCMAKE_RANLIB:FILEPATH=$HOST-gcc-ranlib -DCMAKE_CXX_FLAGS:STRING="-Os -ffunction-sections -fdata-sections -fno-lto" -DCMAKE_C_FLAGS:STRING="-Os -ffunction-sections -fdata-sections -fno-lto" -DCMAKE_EXE_LINKER_FLAGS:STRING='-fno-lto -fuse-linker-plugin -Wl,--gc-sections' -DCMAKE_MODULE_LINKER_FLAGS:STRING='-fno-lto -fuse-linker-plugin -Wl,--gc-sections'  -DCMAKE_SHARED_LINKER_F [...]
+make -j4  # four parallel jobs
+
+else  # release build: CMAKE_BUILD_TYPE Release with LTO on (-flto)
+#ac_cv_c_bigendian=no LLC_HOST_CPU=$TARGET_CPU HWLOC_CFLAGS="-I"$ANDROID_TOOLCHAIN"/sysroot/usr/include" HWLOC_LIBS="-L"$ANDROID_TOOLCHAIN"/sysroot/usr/lib -lhwloc" CFLAGS=" -ffunction-sections -fdata-sections -Os -flto " CPPFLAGS=" -ffunction-sections -fdata-sections -Os -flto " LDFLAGS=" -Wl,--gc-sections -flto " SYSROOTDIR=$ANDROID_TOOLCHAIN/sysroot/ ../configure --prefix=$PREFIX --host=$HOST --disable-icd --with-sysroot=$ANDROID_TOOLCHAIN/sysroot/
+
+cmake -DCMAKE_TOOLCHAIN_FILE=androideabi.cmake -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_AR:FILEPATH=$HOST-gcc-ar -DCMAKE_RANLIB:FILEPATH=$HOST-gcc-ranlib -DCMAKE_CXX_FLAGS:STRING="-Os -ffunction-sections -fdata-sections -flto" -DCMAKE_C_FLAGS:STRING="-Os -ffunction-sections -fdata-sections -flto" -DCMAKE_EXE_LINKER_FLAGS:STRING='-flto -fuse-linker-plugin -Wl,--gc-sections' -DCMAKE_MODULE_LINKER_FLAGS:STRING='-flto -fuse-linker-plugin -Wl,--gc-sections'  -DCMAKE_SHARED_LINKER_FLAGS:STRIN [...]
+make  # no -j option here, unlike the debug branch
+
+fi
+
+make install
+
+# Copy the LICENSE* files found in the sysroot plus pocl's own LICENSE
+cp -f "$ANDROID_TOOLCHAIN"/sysroot/usr/share/LICENSE* "$INSTALL_PREFIX/share/"
+cp -f ../LICENSE "$INSTALL_PREFIX/share/LICENSE.pocl"
+
+printf '\n\nBuild completed...\nBuilt files are at %s\n\n' "$PREFIX"
+
diff --git a/autogen.sh b/autogen.sh
new file mode 100755
index 0000000..5bce9ba
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,1578 @@
+#!/bin/sh
+#                        a u t o g e n . s h
+#
+# Copyright (c) 2005-2009 United States Government as represented by
+# the U.S. Army Research Laboratory.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# 3. The name of the author may not be used to endorse or promote
+# products derived from this software without specific prior written
+# permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+###
+#
+# Script for automatically preparing the sources for compilation by
+# performing the myriad of necessary steps.  The script attempts to
+# detect proper version support, and outputs warnings about particular
+# systems that have autotool peculiarities.
+#
+# Basically, if everything is set up and installed correctly, the
+# script will validate that minimum versions of the GNU Build System
+# tools are installed, account for several common configuration
+# issues, and then simply run autoreconf for you.
+#
+# If autoreconf fails, which can happen for many valid configurations,
+# this script proceeds to run manual preparation steps effectively
+# providing a POSIX shell script (mostly complete) reimplementation of
+# autoreconf.
+#
+# The AUTORECONF, AUTOCONF, AUTOMAKE, LIBTOOLIZE, ACLOCAL, AUTOHEADER
+# environment variables and corresponding _OPTIONS variables (e.g.
+# AUTORECONF_OPTIONS) may be used to override the default automatic
+# detection behaviors.  Similarly the _VERSION variables will override
+# the minimum required version numbers.
+#
+# Examples:
+#
+#   To obtain help on usage:
+#     ./autogen.sh --help
+#
+#   To obtain verbose output:
+#     ./autogen.sh --verbose
+#
+#   To skip autoreconf and prepare manually:
+#     AUTORECONF=false ./autogen.sh
+#
+#   To verbosely try running with an older (unsupported) autoconf:
+#     AUTOCONF_VERSION=2.50 ./autogen.sh --verbose
+#
+# Author:
+#   Christopher Sean Morrison <morrison at brlcad.org>
+#
+# Patches:
+#   Sebastian Pipping <sebastian at pipping.org>
+#
+######################################################################
+
+# Minimum acceptable versions of the GNU Build System tools.  A value already
+# present in the environment wins; the default is used only when the variable
+# is unset or empty, e.g.:
+#
+#     AUTOCONF_VERSION=2.50 ./autogen.sh --verbose
+#
+# autoconf
+: "${AUTOCONF_VERSION:=2.52}"
+# automake
+: "${AUTOMAKE_VERSION:=1.6.0}"
+# libtool
+: "${LIBTOOL_VERSION:=1.4.2}"
+
+
+##################
+# ident function #
+##################
+# Print the author, patch, license and version banner of this script.
+ident ( ) {
+    # copyright years: fourth field of the first "Copyright" line in this
+    # script, or the current year when that yields nothing
+    __copyright=`grep Copyright "$AUTOGEN_SH" | head -${HEAD_N}1 | awk '{print $4}'`
+    test -n "$__copyright" || __copyright=`date +%Y`
+
+    # version: the date found in the Id keyword string below, with its
+    # separators dropped (yyyymmdd)
+    __id="$Id: autogen.sh 33925 2009-03-01 23:27:06Z brlcad $"
+    __version=`echo $__id | sed 's/.*\([0-9][0-9][0-9][0-9]\)[-\/]\([0-9][0-9]\)[-\/]\([0-9][0-9]\).*/\1\2\3/'`
+
+    cat <<EOF
+autogen.sh build preparation script by Christopher Sean Morrison
+  + config.guess download patch by Sebastian Pipping (2008-12-03)
+revised 3-clause BSD-style license, copyright (c) $__copyright
+script version $__version, ISO/IEC 9945 POSIX shell script
+EOF
+}
+
+
+##################
+# USAGE FUNCTION #
+##################
+# Print the option summary and description, followed by the ident banner.
+usage ( ) {
+    cat <<EOF
+Usage: $AUTOGEN_SH [-h|--help] [-v|--verbose] [-q|--quiet] [-d|--download] [--version]
+    --help      Help on $NAME_OF_AUTOGEN usage
+    --verbose   Verbose progress output
+    --quiet     Quiet suppressed progress output
+    --download  Download the latest config.guess from gnulib
+    --version   Only perform GNU Build System version checks
+
+Description: This script will validate that minimum versions of the
+GNU Build System tools are installed and then run autoreconf for you.
+Should autoreconf fail, manual preparation steps will be run
+potentially accounting for several common preparation issues.  The
+AUTORECONF, AUTOCONF, AUTOMAKE, LIBTOOLIZE, ACLOCAL, AUTOHEADER,
+PROJECT, & CONFIGURE environment variables and corresponding _OPTIONS
+variables (e.g. AUTORECONF_OPTIONS) may be used to override the
+default automatic detection behavior.
+
+EOF
+    ident
+    return 0
+}
+
+
+##########################
+# VERSION_ERROR FUNCTION #
+##########################
+version_error ( ) {
+    # usage: version_error MINIMUM-VERSION APPLICATION-NAME
+    # Both arguments are mandatory; a missing one aborts the script.
+    test -n "$1" ||
+	{ echo "INTERNAL ERROR: version_error was not provided a version" ; exit 1 ; }
+    test -n "$2" ||
+	{ echo "INTERNAL ERROR: version_error was not provided an application name" ; exit 1 ; }
+
+    # the advice itself goes through $ECHO and is therefore muted by --quiet
+    $ECHO
+    $ECHO "ERROR:  To prepare the ${PROJECT} build system from scratch,"
+    $ECHO "        at least version $1 of $2 must be installed."
+    $ECHO
+    $ECHO "$NAME_OF_AUTOGEN does not need to be run on the same machine that will"
+    $ECHO "run configure or make.  Either the GNU Autotools will need to be installed"
+    $ECHO "or upgraded on this system, or $NAME_OF_AUTOGEN must be run on the source"
+    $ECHO "code on another system and then transferred to here. -- Cheers!"
+    $ECHO
+}
+
+##########################
+# VERSION_CHECK FUNCTION #
+##########################
+# usage: version_check MINIMUM-VERSION CURRENT-VERSION
+#
+# Returns 0 when CURRENT-VERSION is at least MINIMUM-VERSION and 1 when it
+# is older.  Exits the script with status 1 if either argument is empty.
+# Only the first three numeric components (major, minor, patch) take part
+# in the comparison, and a component that is absent counts as 0, so "2" is
+# treated as 2.0.0.
+version_check ( ) {
+    if [ "x$1" = "x" ] ; then
+	echo "INTERNAL ERROR: version_check was not provided a minimum version"
+	exit 1
+    fi
+    _min="$1"
+    if [ "x$2" = "x" ] ; then
+	echo "INTERNAL ERROR: version check was not provided a comparison version"
+	exit 1
+    fi
+    _cur="$2"
+
+    # needed to handle versions like 1.10 and 1.4-p6: every non-digit is
+    # turned into a "." separator and each ".." pair is squeezed to "."
+    _min="`echo ${_min}. | sed 's/[^0-9]/./g' | sed 's/\.\././g'`"
+    _cur="`echo ${_cur}. | sed 's/[^0-9]/./g' | sed 's/\.\././g'`"
+
+    _min_major="`echo $_min | cut -d. -f1`"
+    _min_minor="`echo $_min | cut -d. -f2`"
+    _min_patch="`echo $_min | cut -d. -f3`"
+
+    _cur_major="`echo $_cur | cut -d. -f1`"
+    _cur_minor="`echo $_cur | cut -d. -f2`"
+    _cur_patch="`echo $_cur | cut -d. -f3`"
+
+    # An empty component defaults to 0.  Each variable is tested on its own:
+    # deriving the major default from the minor field would turn a bare "2"
+    # into 0.0.0.
+    test -n "$_min_major" || _min_major=0
+    test -n "$_min_minor" || _min_minor=0
+    test -n "$_min_patch" || _min_patch=0
+    test -n "$_cur_major" || _cur_major=0
+    test -n "$_cur_minor" || _cur_minor=0
+    test -n "$_cur_patch" || _cur_patch=0
+
+    $VERBOSE_ECHO "Checking if ${_cur_major}.${_cur_minor}.${_cur_patch} is greater than ${_min_major}.${_min_minor}.${_min_patch}"
+
+    # compare component by component, most significant first
+    if [ "$_min_major" -lt "$_cur_major" ] ; then
+	return 0
+    elif [ "$_min_major" -eq "$_cur_major" ] ; then
+	if [ "$_min_minor" -lt "$_cur_minor" ] ; then
+	    return 0
+	elif [ "$_min_minor" -eq "$_cur_minor" ] ; then
+	    if [ "$_min_patch" -lt "$_cur_patch" ] ; then
+		return 0
+	    elif [ "$_min_patch" -eq "$_cur_patch" ] ; then
+		return 0
+	    fi
+	fi
+    fi
+    return 1
+}
+
+
+######################################
+# LOCATE_CONFIGURE_TEMPLATE FUNCTION #
+######################################
+# Print the path of the first existing configure template and nothing when
+# there is none.  The directories are tried in the order ".", the absolute
+# working directory, $PATH_TO_AUTOGEN; within each directory configure.ac
+# is preferred over configure.in.  The reported path keeps the directory
+# prefix it was found under (e.g. "./configure.ac").
+locate_configure_template ( ) {
+    _pwd="`pwd`"
+    for _lct_dir in . "$_pwd" "$PATH_TO_AUTOGEN" ; do
+	for _lct_name in configure.ac configure.in ; do
+	    if test -f "$_lct_dir/$_lct_name" ; then
+		echo "$_lct_dir/$_lct_name"
+		return
+	    fi
+	done
+    done
+}
+
+
+##################
+# argument check #
+##################
+ARGS="$*"  # all arguments joined into one string; split again by the option loop
+# $0 is quoted so that a script path containing whitespace stays one word
+PATH_TO_AUTOGEN=`dirname "$0"`
+NAME_OF_AUTOGEN=`basename "$0"`
+AUTOGEN_SH="$PATH_TO_AUTOGEN/$NAME_OF_AUTOGEN"  # path used to re-read this script
+LIBTOOL_M4="${PATH_TO_AUTOGEN}/misc/libtool.m4"
+
+# Defaults for everything that may be pre-seeded from the environment: each
+# ":" line below assigns its value only when the variable is unset or empty.
+# For example, VERBOSE=yes ./autogen.sh acts like ./autogen.sh --verbose.
+
+# Switches, also set to "yes" by the command-line options parsed below.
+: "${HELP:=no}"  # -h, --help
+: "${QUIET:=no}"  # -q, --quiet
+: "${VERBOSE:=no}"  # -v, --verbose
+: "${VERSION_ONLY:=no}"  # --version
+: "${DOWNLOAD:=no}"  # -d, --download
+
+# Default options per tool; the matching *_OPTIONS environment variables
+# override them.
+
+# autoreconf
+: "${AUTORECONF_OPTIONS:=-i -f}"
+
+# autoconf
+: "${AUTOCONF_OPTIONS:=-f}"
+
+# automake
+: "${AUTOMAKE_OPTIONS:=-a -c -f}"
+
+# libtoolize
+: "${LIBTOOLIZE_OPTIONS:=--automake -c -f}"
+
+# aclocal and autoheader get no options by default
+: "${ACLOCAL_OPTIONS:=}"
+: "${AUTOHEADER_OPTIONS:=}"
+
+# Alternate option sets.  These two are assigned unconditionally, so the
+# environment cannot override them.
+ALT_AUTOMAKE_OPTIONS="-a -c"
+ALT_LIBTOOLIZE_OPTIONS="--automake --copy --force"
+
+# Location of config.guess in the gnulib repository (gitweb "blob_plain"
+# view of build-aux/config.guess at HEAD).
+: "${CONFIG_GUESS_URL:=http://git.savannah.gnu.org/gitweb/?p=gnulib.git;a=blob_plain;f=build-aux/config.guess;hb=HEAD}"
+# Parse the command line.  $ARGS is expanded unquoted, so it is split on
+# whitespace into the individual options.  Each recognized option turns its
+# switch to "yes"; anything else prints the usage text and exits with
+# status 1.
+for arg in $ARGS ; do
+    case "x$arg" in
+	x--help|x-[hH])		HELP=yes ;;
+	x--quiet|x-[qQ])	QUIET=yes ;;
+	x--verbose|x-[vV])	VERBOSE=yes ;;
+	x--download|x-[dD])	DOWNLOAD=yes ;;
+	x--version)		VERSION_ONLY=yes ;;
+	*)
+	    echo "Unknown option: $arg"
+	    echo
+	    usage
+	    exit 1
+	    ;;
+    esac
+done
+
+
+#####################
+# environment check #
+#####################
+# sanity check before recursions potentially begin
+if [ ! -f "$AUTOGEN_SH" ] ; then
+    echo "INTERNAL ERROR: $AUTOGEN_SH does not exist"
+    if [ ! "x$0" = "x$AUTOGEN_SH" ] ; then
+	echo "INTERNAL ERROR: dirname/basename inconsistency: $0 != $AUTOGEN_SH"
+    fi
+    exit 1
+fi
+
+# force locale setting to C so things like date output as expected; it is
+# exported so that the commands run from here actually inherit it
+LC_ALL=C ; export LC_ALL
+
+# Probe the external commands this script relies on.  Each one is fed a line
+# on stdin with its output discarded; a non-zero exit status is fatal.
+for __cmd in echo head tail pwd ; do
+    if echo "test" | $__cmd > /dev/null 2>&1 ; then
+	continue
+    fi
+    echo "INTERNAL ERROR: '${__cmd}' command is required"
+    exit 2
+done
+# grep and sed need an argument, so they are probed individually (exit 1)
+if echo "test" | grep "test" > /dev/null 2>&1 ; then : ; else
+    echo "INTERNAL ERROR: grep command is required"
+    exit 1
+fi
+if echo "test" | sed "s/test/test/" > /dev/null 2>&1 ; then : ; else
+    echo "INTERNAL ERROR: sed command is required"
+    exit 1
+fi
+
+
+# determine the behavior of echo
+case `echo "testing\c"; echo 1,2,3`,`echo -n testing; echo 1,2,3` in
+    *c*,-n*) ECHO_N= ECHO_C='
+' ECHO_T='	' ;;  # neither "\c" nor "-n" suppresses the trailing newline
+    *c*,*  ) ECHO_N=-n ECHO_C= ECHO_T= ;;  # "\c" is printed literally, "-n" works
+    *)       ECHO_N= ECHO_C='\c' ECHO_T= ;;  # "\c" works
+esac
+
+# determine the behavior of head
+case "x`echo 'head' | head -n 1 2>&1`" in
+    *xhead*) HEAD_N="n " ;;  # "head -n 1" works: -${HEAD_N}1 expands to "-n 1"
+    *) HEAD_N="" ;;  # otherwise -${HEAD_N}1 expands to the "-1" form
+esac
+
+# determine the behavior of tail
+case "x`echo 'tail' | tail -n 1 2>&1`" in
+    *xtail*) TAIL_N="n " ;;  # "tail -n 1" works: -${TAIL_N}1 expands to "-n 1"
+    *) TAIL_N="" ;;  # otherwise -${TAIL_N}1 expands to the "-1" form
+esac
+
+# Output helpers.  $ECHO prints regular progress and $VERBOSE_ECHO prints
+# verbose progress; both start as the no-op ":" and are switched to "echo"
+# only when enabled.  QUIET=yes takes precedence over VERBOSE=yes.
+VERBOSE_ECHO=:
+ECHO=:
+
+test "x$QUIET" = "xyes" || ECHO=echo
+if test "x$VERBOSE" = "xyes" ; then
+    if test "$ECHO" = ":" ; then
+	echo "Verbose output quelled by quiet option.  Further output disabled."
+    else
+	echo "Verbose output enabled"
+	VERBOSE_ECHO=echo
+    fi
+fi
+
+# allow a recursive run to disable further recursions: only an unset or
+# empty RUN_RECURSIVE is defaulted to yes, a pre-set value is kept
+: "${RUN_RECURSIVE:=yes}"
+
+
+################################################
+# check for help arg and bypass version checks #
+################################################
+# an argument containing "help" in any letter case also requests help
+case "$ARGS" in *[hH][eE][lL][pP]*) HELP=yes ;; esac
+# print usage (which includes the ident banner) and stop before any checks
+if test "x$HELP" = "xyes" ; then
+    usage
+    $ECHO "---"
+    $ECHO "Help was requested.  No preparation or configuration will be performed."
+    exit 0
+fi
+
+
+#######################
+# set up signal traps #
+#######################
+untrap_abnormal ( ) {
+    # restore the default action for SIGHUP (1), SIGINT (2), SIGPIPE (13)
+    # and SIGTERM (15); sig keeps the number of the last signal handled
+    trap - 1 2 13 15 ; sig=15
+}
+
+# do this cleanup whenever we exit.
+trap '
+    # start from the root
+    if test -d "$START_PATH" ; then
+	cd "$START_PATH"
+    fi
+
+    # restore/delete backup files
+    if test "x$PFC_INIT" = "x1" ; then
+	recursive_restore
+    fi
+' 0  # condition 0 = shell exit; START_PATH, PFC_INIT and recursive_restore are set outside this section
+
+# trap SIGHUP (1), SIGINT (2), SIGPIPE (13), SIGTERM (15)
+for sig in 1 2 13 15; do  # one handler per signal: the quotes are closed around $sig so its number is baked into the message
+    trap '
+	$ECHO ""
+	$ECHO "Aborting $NAME_OF_AUTOGEN: caught signal '$sig'"
+
+	# start from the root
+	if test -d "$START_PATH" ; then
+	    cd "$START_PATH"
+	fi
+
+	# clean up on abnormal exit
+	$VERBOSE_ECHO "rm -rf autom4te.cache"
+	rm -rf autom4te.cache
+
+	if test -f "acinclude.m4.$$.backup" ; then
+	    $VERBOSE_ECHO "cat acinclude.m4.$$.backup > acinclude.m4"
+	    chmod u+w acinclude.m4
+	    cat acinclude.m4.$$.backup > acinclude.m4
+
+	    $VERBOSE_ECHO "rm -f acinclude.m4.$$.backup"
+	    rm -f acinclude.m4.$$.backup
+        fi
+
+	{ (exit 1); exit 1; }
+' $sig  # the handler ends the script with status 1
+done
+
+
+#############################
+# look for a configure file #
+#############################
+# A CONFIGURE value from the environment is used as is; otherwise the
+# template is searched for with locate_configure_template.
+if test -n "$CONFIGURE" ; then
+    $ECHO "Using CONFIGURE environment variable override: $CONFIGURE"
+else
+    CONFIGURE="`locate_configure_template`"
+    test -z "$CONFIGURE" || $VERBOSE_ECHO "Found a configure template: $CONFIGURE"
+fi
+# Without a template only a --version run can continue (on /dev/null).
+if test -z "$CONFIGURE" ; then
+    if test "x$VERSION_ONLY" != "xyes" ; then
+	$ECHO
+	$ECHO "A configure.ac or configure.in file could not be located implying"
+	$ECHO "that the GNU Build System is at least not used in this directory.  In"
+	$ECHO "any case, there is nothing to do here without one of those files."
+	$ECHO
+	$ECHO "ERROR: No configure.in or configure.ac file found in `pwd`"
+	exit 1
+    fi
+    CONFIGURE=/dev/null
+fi
+
+####################
+# get project name #
+####################
+if [ "x$PROJECT" = "x" ] ; then  # no PROJECT override: derive the name from $CONFIGURE
+    PROJECT="`grep AC_INIT $CONFIGURE | grep -v '.*#.*AC_INIT' | tail -${TAIL_N}1 | sed 's/^[ 	]*AC_INIT(\([^,)]*\).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+    if [ "x$PROJECT" = "xAC_INIT" ] ; then  # sed left a bare, argument-less AC_INIT untouched
+	# projects might be using the older/deprecated arg-less AC_INIT .. look for AM_INIT_AUTOMAKE instead
+	PROJECT="`grep AM_INIT_AUTOMAKE $CONFIGURE | grep -v '.*#.*AM_INIT_AUTOMAKE' | tail -${TAIL_N}1 | sed 's/^[ 	]*AM_INIT_AUTOMAKE(\([^,)]*\).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+    fi
+    if [ "x$PROJECT" = "xAM_INIT_AUTOMAKE" ] ; then  # likewise a bare AM_INIT_AUTOMAKE: use the generic name
+	PROJECT="project"
+    fi
+    if [ "x$PROJECT" = "x" ] ; then  # nothing could be extracted at all
+	PROJECT="project"
+    fi
+else
+    $ECHO "Using PROJECT environment variable override: $PROJECT"
+fi
+$ECHO "Preparing the $PROJECT build system...please wait"
+$ECHO
+
+
+########################
+# check for autoreconf #
+########################
+# an AUTORECONF value from the environment is trusted without probing it
+HAVE_AUTORECONF=no
+if test -n "$AUTORECONF" ; then
+    HAVE_AUTORECONF=yes
+    $ECHO "Using AUTORECONF environment variable override: $AUTORECONF"
+else
+    for AUTORECONF in autoreconf ; do
+	$VERBOSE_ECHO "Checking autoreconf version: $AUTORECONF --version"
+	if $AUTORECONF --version > /dev/null 2>&1 ; then
+	    HAVE_AUTORECONF=yes
+	    break
+	fi
+    done
+fi
+
+
+##########################
+# autoconf version check #
+##########################
+# An AUTOCONF value from the environment is trusted as is; otherwise the
+# candidate is probed by running it with --version.
+_acfound=no
+if test -n "$AUTOCONF" ; then
+    _acfound=yes
+    $ECHO "Using AUTOCONF environment variable override: $AUTOCONF"
+else
+    for AUTOCONF in autoconf ; do
+	$VERBOSE_ECHO "Checking autoconf version: $AUTOCONF --version"
+	if $AUTOCONF --version > /dev/null 2>&1 ; then
+	    _acfound=yes
+	    break
+	fi
+    done
+fi
+
+# Take the version number from the first line of the --version output
+# (0.0.0 when nothing can be extracted) and compare it with the minimum in
+# AUTOCONF_VERSION.  A missing or too old autoconf ends the script through
+# version_error with exit status 1.
+_report_error=no
+if test "x$_acfound" = "xyes" ; then
+    _version="`$AUTOCONF --version | head -${HEAD_N}1 | sed 's/[^0-9]*\([0-9\.][0-9\.]*\)/\1/'`"
+    test -n "$_version" || _version="0.0.0"
+    $ECHO "Found GNU Autoconf version $_version"
+    version_check "$AUTOCONF_VERSION" "$_version" || _report_error=yes
+else
+    $ECHO "ERROR:  Unable to locate GNU Autoconf."
+    _report_error=yes
+fi
+if test "x$_report_error" = "xyes" ; then
+    version_error "$AUTOCONF_VERSION" "GNU Autoconf"
+    exit 1
+fi
+
+
+##########################
+# automake version check #
+##########################
+_amfound=no
+if [ "x$AUTOMAKE" = "x" ] ; then
+    for AUTOMAKE in automake ; do
+	$VERBOSE_ECHO "Checking automake version: $AUTOMAKE --version"
+	$AUTOMAKE --version > /dev/null 2>&1
+	if [ $? = 0 ] ; then
+	    _amfound=yes
+	    break
+	fi
+    done
+else
+    _amfound=yes  # a preset AUTOMAKE is not probed here; only its version is examined below
+    $ECHO "Using AUTOMAKE environment variable override: $AUTOMAKE"
+fi
+
+
+_report_error=no
+if [ ! "x$_amfound" = "xyes" ] ; then
+    $ECHO
+    $ECHO "ERROR: Unable to locate GNU Automake."
+    _report_error=yes
+else
+    _version="`$AUTOMAKE --version | head -${HEAD_N}1 | sed 's/[^0-9]*\([0-9\.][0-9\.]*\)/\1/'`"  # strips the text ahead of the first run of digits and dots
+    if [ "x$_version" = "x" ] ; then
+	_version="0.0.0"  # placeholder so version_check still receives a value
+    fi
+    $ECHO "Found GNU Automake version $_version"
+    version_check "$AUTOMAKE_VERSION" "$_version"  # a non-zero status marks the installed version as unacceptable
+    if [ $? -ne 0 ] ; then
+	_report_error=yes
+    fi
+fi
+if [ "x$_report_error" = "xyes" ] ; then  # missing tool and rejected version end the same way
+    version_error "$AUTOMAKE_VERSION" "GNU Automake"
+    exit 1
+fi
+
+
+########################
+# check for libtoolize #
+########################
+HAVE_LIBTOOLIZE=yes  # flipped to "no" only when the default libtoolize probe fails
+HAVE_ALT_LIBTOOLIZE=no  # becomes "yes" when one of the alternate names below is usable
+_ltfound=no
+if [ "x$LIBTOOLIZE" = "x" ] ; then
+    LIBTOOLIZE=libtoolize
+    $VERBOSE_ECHO "Checking libtoolize version: $LIBTOOLIZE --version"
+    $LIBTOOLIZE --version > /dev/null 2>&1
+    if [ ! $? = 0 ] ; then
+	HAVE_LIBTOOLIZE=no
+	$ECHO
+	if [ "x$HAVE_AUTORECONF" = "xno" ] ; then
+	    $ECHO "Warning:  libtoolize does not appear to be available."
+	else
+	    $ECHO "Warning:  libtoolize does not appear to be available.  This means that"
+	    $ECHO "the automatic build preparation via autoreconf will probably not work."
+	    $ECHO "Preparing the build by running each step individually, however, should"
+	    $ECHO "work and will be done automatically for you if autoreconf fails."
+	fi
+
+	# look for some alternates
+	for tool in glibtoolize libtoolize15 libtoolize14 libtoolize13 ; do
+	    $VERBOSE_ECHO "Checking libtoolize alternate: $tool --version"
+	    _glibtoolize="`$tool --version > /dev/null 2>&1`"  # output is discarded; only the exit status is examined next
+	    if [ $? = 0 ] ; then
+		$VERBOSE_ECHO "Found $tool --version"
+		_glti="`which $tool`"  # path of the alternate, used for the sanity checks and the symlink hint
+		if [ "x$_glti" = "x" ] ; then
+		    $VERBOSE_ECHO "Cannot find $tool with which"
+		    continue;
+		fi
+		if test ! -f "$_glti" ; then
+		    $VERBOSE_ECHO "Cannot use $tool, $_glti is not a file"
+		    continue;
+		fi
+		_gltidir="`dirname $_glti`"
+		if [ "x$_gltidir" = "x" ] ; then
+		    $VERBOSE_ECHO "Cannot find $tool path with dirname of $_glti"
+		    continue;
+		fi
+		if test ! -d "$_gltidir" ; then
+		    $VERBOSE_ECHO "Cannot use $tool, $_gltidir is not a directory"
+		    continue;
+		fi
+		HAVE_ALT_LIBTOOLIZE=yes
+		LIBTOOLIZE="$tool"  # later steps run the alternate under the LIBTOOLIZE name
+		$ECHO
+		$ECHO "Fortunately, $tool was found which means that your system may simply"
+		$ECHO "have a non-standard or incomplete GNU Autotools install.  If you have"
+		$ECHO "sufficient system access, it may be possible to quell this warning by"
+		$ECHO "running:"
+		$ECHO
+		sudo -V > /dev/null 2>&1  # only decides which form of the hint is printed; nothing is run via sudo
+		if [ $? = 0 ] ; then
+		    $ECHO "   sudo ln -s $_glti $_gltidir/libtoolize"
+		    $ECHO
+		else
+		    $ECHO "   ln -s $_glti $_gltidir/libtoolize"
+		    $ECHO
+		    $ECHO "Run that as root or with proper permissions to the $_gltidir directory"
+		    $ECHO
+		fi
+		_ltfound=yes
+		break
+	    fi
+	done
+    else
+	_ltfound=yes
+    fi
+else
+    _ltfound=yes  # a preset LIBTOOLIZE is accepted without being probed
+    $ECHO "Using LIBTOOLIZE environment variable override: $LIBTOOLIZE"
+fi
+
+
+############################
+# libtoolize version check #
+############################
+_report_error=no
+if [ ! "x$_ltfound" = "xyes" ] ; then  # neither libtoolize nor any alternate could be run
+    $ECHO
+    $ECHO "ERROR: Unable to locate GNU Libtool."
+    _report_error=yes
+else
+    _version="`$LIBTOOLIZE --version | head -${HEAD_N}1 | sed 's/[^0-9]*\([0-9\.][0-9\.]*\)/\1/'`"  # strips the text ahead of the first run of digits and dots
+    if [ "x$_version" = "x" ] ; then
+	_version="0.0.0"  # placeholder so version_check still receives a value
+    fi
+    $ECHO "Found GNU Libtool version $_version"
+    version_check "$LIBTOOL_VERSION" "$_version"  # a non-zero status marks the installed version as unacceptable
+    if [ $? -ne 0 ] ; then
+	_report_error=yes
+    fi
+fi
+if [ "x$_report_error" = "xyes" ] ; then  # missing tool and rejected version end the same way
+    version_error "$LIBTOOL_VERSION" "GNU Libtool"
+    exit 1
+fi
+
+
+#####################
+# check for aclocal #
+#####################
+if [ "x$ACLOCAL" = "x" ] ; then
+    for ACLOCAL in aclocal ; do  # single-candidate list; ACLOCAL keeps this name whether or not the probe succeeds
+	$VERBOSE_ECHO "Checking aclocal version: $ACLOCAL --version"
+	$ACLOCAL --version > /dev/null 2>&1
+	if [ $? = 0 ] ; then
+	    break  # no found/not-found flag is recorded for aclocal
+	fi
+    done
+else
+    $ECHO "Using ACLOCAL environment variable override: $ACLOCAL"
+fi
+
+
+########################
+# check for autoheader #
+########################
+if [ "x$AUTOHEADER" = "x" ] ; then
+    for AUTOHEADER in autoheader ; do  # single-candidate list; AUTOHEADER keeps this name whether or not the probe succeeds
+	$VERBOSE_ECHO "Checking autoheader version: $AUTOHEADER --version"
+	$AUTOHEADER --version > /dev/null 2>&1
+	if [ $? = 0 ] ; then
+	    break  # no found/not-found flag is recorded for autoheader
+	fi
+    done
+else
+    $ECHO "Using AUTOHEADER environment variable override: $AUTOHEADER"
+fi
+
+
+#########################
+# check if version only #
+#########################
+$VERBOSE_ECHO "Checking whether to only output version information"
+if [ "x$VERSION_ONLY" = "xyes" ] ; then
+    $ECHO
+    ident  # NOTE(review): defined outside this section; presumably prints the script identification - confirm
+    $ECHO "---"
+    $ECHO "Version requested.  No preparation or configuration will be performed."
+    exit 0  # stop before any file in the source tree is touched
+fi
+
+
+#################################
+# PROTECT_FROM_CLOBBER FUNCTION #
+#################################
+protect_from_clobber ( ) {
+    # automake's force option may overwrite existing COPYING and/or
+    # INSTALL files (which one depends on the automake version)
+    # rather than only creating them when missing, as it does for
+    # AUTHORS, NEWS, and README.  keep a backup copy of each, tagged
+    # with this shell's PID, so the originals can be put back later.
+    PFC_INIT=1
+
+    for file in COPYING INSTALL ; do
+	# nothing to protect when the file is absent
+	if [ ! -f ${file} ] ; then
+	    continue
+	fi
+	# an existing backup for this PID means this directory was already handled
+	if [ -f ${file}.$$.protect_from_automake.backup ] ; then
+	    $VERBOSE_ECHO "Already backed up ${file} in `pwd`"
+	    continue
+	fi
+	$VERBOSE_ECHO "Backing up ${file} in `pwd`"
+	$VERBOSE_ECHO "cp -p ${file} ${file}.$$.protect_from_automake.backup"
+	cp -p ${file} ${file}.$$.protect_from_automake.backup
+    done
+}
+
+
+##############################
+# RECURSIVE_PROTECT FUNCTION #
+##############################
+recursive_protect ( ) {
+
+    # for projects using recursive configure, run the build
+    # preparation steps for the subdirectories.  this function assumes
+    # START_PATH was set to pwd before recursion begins so that
+    # relative paths work.
+
+    # git 'r done, protect COPYING and INSTALL from being clobbered
+    protect_from_clobber
+
+    if test -d autom4te.cache ; then
+	$VERBOSE_ECHO "Found an autom4te.cache directory, deleting it"
+	$VERBOSE_ECHO "rm -rf autom4te.cache"
+	rm -rf autom4te.cache
+    fi
+
+    # find configure template
+    _configure="`locate_configure_template`"
+    if [ "x$_configure" = "x" ] ; then
+	return  # no template here, so there are no subdirectories to discover
+    fi
+    # $VERBOSE_ECHO "Looking for configure template found `pwd`/$_configure"
+
+    # look for subdirs
+    # $VERBOSE_ECHO "Looking for subdirs in `pwd`"
+    _det_config_subdirs="`grep AC_CONFIG_SUBDIRS $_configure | grep -v '.*#.*AC_CONFIG_SUBDIRS' | sed 's/^[ 	]*AC_CONFIG_SUBDIRS(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+    CHECK_DIRS=""
+    for dir in $_det_config_subdirs ; do
+	if test -d "`pwd`/$dir" ; then
+	    CHECK_DIRS="$CHECK_DIRS \"`pwd`/$dir\""  # stored with literal quotes; the eval below consumes them
+	fi
+    done
+
+    # process subdirs
+    if [ ! "x$CHECK_DIRS" = "x" ] ; then
+	$VERBOSE_ECHO "Recursively scanning the following directories:"
+	$VERBOSE_ECHO "  $CHECK_DIRS"
+	for dir in $CHECK_DIRS ; do  # NOTE(review): unquoted expansion still splits on whitespace, so paths containing spaces break here
+	    $VERBOSE_ECHO "Protecting files from automake in $dir"
+	    cd "$START_PATH"
+	    eval "cd $dir"  # eval strips the embedded quotes added when CHECK_DIRS was built
+
+	    # recursively git 'r done
+	    recursive_protect
+	done
+    fi
+} # end of recursive_protect
+
+
+##############################
+# RESTORE_CLOBBERED FUNCTION #
+##############################
+restore_clobbered ( ) {
+
+    # The automake (and autoreconf by extension) -f/--force-missing
+    # option may overwrite COPYING and INSTALL even if they do exist.
+    # Here we restore the files if necessary.
+
+    spacer=no  # tracks whether the single blank verbose line has been printed yet
+
+    for file in COPYING INSTALL ; do
+	if test -f ${file}.$$.protect_from_automake.backup ; then
+	    if test -f ${file} ; then
+	    # compare entire content, restore if needed
+	    if test "x`cat ${file}`" != "x`cat ${file}.$$.protect_from_automake.backup`" ; then
+		if test "x$spacer" = "xno" ; then
+		    $VERBOSE_ECHO
+		    spacer=yes
+		fi
+		# restore the backup
+		$VERBOSE_ECHO "Restoring ${file} from backup (automake -f likely clobbered it)"
+		$VERBOSE_ECHO "rm -f ${file}"
+		rm -f ${file}
+		$VERBOSE_ECHO "mv ${file}.$$.protect_from_automake.backup ${file}"
+		mv ${file}.$$.protect_from_automake.backup ${file}
+	    fi # check contents
+	    elif test -f ${file}.$$.protect_from_automake.backup ; then  # the file vanished entirely: move the backup into place
+		$VERBOSE_ECHO "mv ${file}.$$.protect_from_automake.backup ${file}"
+		mv ${file}.$$.protect_from_automake.backup ${file}
+	    fi # -f ${file}
+	
+	    # just in case
+	    $VERBOSE_ECHO "rm -f ${file}.$$.protect_from_automake.backup"
+	    rm -f ${file}.$$.protect_from_automake.backup
+	fi # -f ${file}.$$.protect_from_automake.backup
+    done
+
+    CONFIGURE="`locate_configure_template`"
+    if [ "x$CONFIGURE" = "x" ] ; then
+	return  # without a template the aux directory cannot be determined
+    fi
+
+    _aux_dir="`grep AC_CONFIG_AUX_DIR $CONFIGURE | grep -v '.*#.*AC_CONFIG_AUX_DIR' | tail -${TAIL_N}1 | sed 's/^[ 	]*AC_CONFIG_AUX_DIR(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+    if test ! -d "$_aux_dir" ; then
+	_aux_dir=.  # fall back to the current directory when no usable aux dir is declared
+    fi
+
+    for file in config.guess config.sub ltmain.sh ; do
+	if test -f "${_aux_dir}/${file}" ; then  # a fresh copy exists, so the .backup copy is no longer needed
+	    $VERBOSE_ECHO "rm -f \"${_aux_dir}/${file}.backup\""
+	    rm -f "${_aux_dir}/${file}.backup"
+	fi
+    done
+} # end of restore_clobbered
+
+
+##############################
+# RECURSIVE_RESTORE FUNCTION #
+##############################
+recursive_restore ( ) {
+
+    # restore COPYING and INSTALL from backup if they were clobbered
+    # for each directory recursively.
+
+    # git 'r undone
+    restore_clobbered
+
+    # find configure template
+    _configure="`locate_configure_template`"
+    if [ "x$_configure" = "x" ] ; then
+	return  # no template here, so there are no subdirectories to discover
+    fi
+
+    # look for subdirs
+    _det_config_subdirs="`grep AC_CONFIG_SUBDIRS $_configure | grep -v '.*#.*AC_CONFIG_SUBDIRS' | sed 's/^[ 	]*AC_CONFIG_SUBDIRS(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+    CHECK_DIRS=""
+    for dir in $_det_config_subdirs ; do
+	if test -d "`pwd`/$dir" ; then
+	    CHECK_DIRS="$CHECK_DIRS \"`pwd`/$dir\""  # stored with literal quotes; the eval below consumes them
+	fi
+    done
+
+    # process subdirs
+    if [ ! "x$CHECK_DIRS" = "x" ] ; then
+	$VERBOSE_ECHO "Recursively scanning the following directories:"
+	$VERBOSE_ECHO "  $CHECK_DIRS"
+	for dir in $CHECK_DIRS ; do  # NOTE(review): unquoted expansion still splits on whitespace, so paths containing spaces break here
+	    $VERBOSE_ECHO "Checking files for automake damage in $dir"
+	    cd "$START_PATH"
+	    eval "cd $dir"  # eval strips the embedded quotes added when CHECK_DIRS was built
+
+	    # recursively git 'r undone
+	    recursive_restore
+	done
+    fi
+} # end of recursive_restore
+
+
+#######################
+# INITIALIZE FUNCTION #
+#######################
+initialize ( ) {
+    # this routine performs a variety of directory-specific
+    # initializations.  some are sanity checks, some are preventive,
+    # and some are necessary setup detection.  the script exits with
+    # status 1 if no configure template is found in this directory.
+    #
+    # this function sets:
+    #   CONFIGURE, _aux_dir, CONFIG_SUBDIRS, SEARCH_DIRS
+
+    ##################################
+    # check for a configure template #
+    ##################################
+    CONFIGURE="`locate_configure_template`"
+    if [ "x$CONFIGURE" = "x" ] ; then
+	$ECHO
+	$ECHO "A configure.ac or configure.in file could not be located implying"
+	$ECHO "that the GNU Build System is at least not used in this directory.  In"
+	$ECHO "any case, there is nothing to do here without one of those files."
+	$ECHO
+	$ECHO "ERROR: No configure.in or configure.ac file found in `pwd`"
+	exit 1
+    fi
+
+    #####################
+    # detect an aux dir #
+    #####################
+    _aux_dir="`grep AC_CONFIG_AUX_DIR $CONFIGURE | grep -v '.*#.*AC_CONFIG_AUX_DIR' | tail -${TAIL_N}1 | sed 's/^[ 	]*AC_CONFIG_AUX_DIR(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+    if test ! -d "$_aux_dir" ; then
+	_aux_dir=.
+    else
+	$VERBOSE_ECHO "Detected auxillary directory: $_aux_dir"
+    fi
+
+    ################################
+    # detect a recursive configure #
+    ################################
+    CONFIG_SUBDIRS=""
+    _det_config_subdirs="`grep AC_CONFIG_SUBDIRS $CONFIGURE | grep -v '.*#.*AC_CONFIG_SUBDIRS' | sed 's/^[ 	]*AC_CONFIG_SUBDIRS(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+    for dir in $_det_config_subdirs ; do
+	if test -d "`pwd`/$dir" ; then
+	    $VERBOSE_ECHO "Detected recursive configure directory: `pwd`/$dir"
+	    CONFIG_SUBDIRS="$CONFIG_SUBDIRS `pwd`/$dir"
+	fi
+    done
+
+    ###########################################################
+    # make sure certain required files exist for GNU projects #
+    ###########################################################
+    _marker_found=""
+    _marker_found_message_intro='Detected non-GNU marker "'
+    _marker_found_message_mid='" in '
+    # the Makefile.am to inspect is the one sitting beside the configure template
+    _makefile_am="`dirname \"$CONFIGURE\"`/Makefile.am"
+    for marker in foreign cygnus ; do
+	_marker_found_message=${_marker_found_message_intro}${marker}${_marker_found_message_mid}
+	_marker_found="`grep 'AM_INIT_AUTOMAKE.*'${marker} $CONFIGURE`"
+	if [ ! "x$_marker_found" = "x" ] ; then
+	    $VERBOSE_ECHO "${_marker_found_message}`basename \"$CONFIGURE\"`"
+	    break
+	fi
+	if test -f "$_makefile_am" ; then
+	    _marker_found="`grep 'AUTOMAKE_OPTIONS.*'${marker} \"$_makefile_am\"`"
+	    if [ ! "x$_marker_found" = "x" ] ; then
+		$VERBOSE_ECHO "${_marker_found_message}Makefile.am"
+		break
+	    fi
+	fi
+    done
+    if [ "x${_marker_found}" = "x" ] ; then
+	_suggest_foreign=no
+	for file in AUTHORS COPYING ChangeLog INSTALL NEWS README ; do
+	    if [ ! -f $file ] ; then
+		$VERBOSE_ECHO "Touching ${file} since it does not exist"
+		_suggest_foreign=yes
+		touch $file
+	    fi
+	done
+
+	if [ "x${_suggest_foreign}" = "xyes" ] ; then
+	    $ECHO
+	    $ECHO "Warning: Several files expected of projects that conform to the GNU"
+	    $ECHO "coding standards were not found.  The files were automatically added"
+	    $ECHO "for you since you do not have a 'foreign' declaration specified."
+	    $ECHO
+	    $ECHO "Considered adding 'foreign' to AM_INIT_AUTOMAKE in `basename \"$CONFIGURE\"`"
+	    if test -f "$_makefile_am" ; then
+		$ECHO "or to AUTOMAKE_OPTIONS in your top-level Makefile.am file."
+	    fi
+	    $ECHO
+	fi
+    fi
+
+    ##################################################
+    # make sure certain generated files do not exist #
+    ##################################################
+    for file in config.guess config.sub ltmain.sh ; do
+	if test -f "${_aux_dir}/${file}" ; then
+	    $VERBOSE_ECHO "mv -f \"${_aux_dir}/${file}\" \"${_aux_dir}/${file}.backup\""
+	    mv -f "${_aux_dir}/${file}" "${_aux_dir}/${file}.backup"
+	fi
+    done
+
+    ############################
+    # search alternate m4 dirs #
+    ############################
+    SEARCH_DIRS=""
+    for dir in m4 ; do
+	if [ -d $dir ] ; then
+	    $VERBOSE_ECHO "Found extra aclocal search directory: $dir"
+	    SEARCH_DIRS="$SEARCH_DIRS -I $dir"
+	fi
+    done
+
+    ######################################
+    # remove any previous build products #
+    ######################################
+    if test -d autom4te.cache ; then
+	$VERBOSE_ECHO "Found an autom4te.cache directory, deleting it"
+	$VERBOSE_ECHO "rm -rf autom4te.cache"
+	rm -rf autom4te.cache
+    fi
+# tcl/tk (and probably others) have a customized aclocal.m4, so can't delete it
+#     if test -f aclocal.m4 ; then
+# 	$VERBOSE_ECHO "Found an aclocal.m4 file, deleting it"
+# 	$VERBOSE_ECHO "rm -f aclocal.m4"
+# 	rm -f aclocal.m4
+#     fi
+
+} # end of initialize()
+
+
+##############
+# initialize #
+##############
+
+# stash path
+START_PATH="`pwd`"  # the recursive helpers cd back here before entering each subdirectory
+
+# Before running autoreconf or manual steps, some prep detection work
+# is necessary or useful.  Only needs to occur once per directory, but
+# does need to traverse the entire subconfigure hierarchy to protect
+# files from being clobbered even by autoreconf.
+recursive_protect
+
+# start from where we started
+cd "$START_PATH"  # recursive_protect may have left the shell inside a subdirectory
+
+# get ready to process
+initialize
+
+
+#########################################
+# DOWNLOAD_GNULIB_CONFIG_GUESS FUNCTION #
+#########################################
+
+# TODO - should make sure wget/curl exist and/or work before trying to
+# use them.
+
+download_gnulib_config_guess () {
+    # abuse gitweb to download gnulib's latest config.guess via HTTP
+    # from CONFIG_GUESS_URL into ${_aux_dir}/config.guess, trying
+    # wget, curl, and fetch in turn.  sets ret to 0 only once a
+    # download was installed; warns and removes the temp file otherwise.
+    config_guess_temp="config.guess.$$.download"
+    ret=1
+    for __cmd in wget curl fetch ; do
+	$VERBOSE_ECHO "Checking for command ${__cmd}"
+	${__cmd} --version > /dev/null 2>&1 || continue
+
+	__cmd_version=`${__cmd} --version | head -n 1 | sed -e 's/^[^0-9]\+//' -e 's/ .*//'`
+	$VERBOSE_ECHO "Found ${__cmd} ${__cmd_version}"
+
+	# option that names the output file for each downloader
+	opts=""
+	case ${__cmd} in
+	    wget)
+		opts="-O"
+		;;
+	    curl)
+		opts="-o"
+		;;
+	    fetch)
+		opts="-t 5 -f"
+		;;
+	esac
+
+	$VERBOSE_ECHO "Running $__cmd \"${CONFIG_GUESS_URL}\" $opts \"${config_guess_temp}\""
+	eval "$__cmd \"${CONFIG_GUESS_URL}\" $opts \"${config_guess_temp}\"" > /dev/null 2>&1
+	# a downloader that merely exists must not count as success, so ret changes only here
+	if [ $? = 0 ] && mv -f "${config_guess_temp}" "${_aux_dir}/config.guess" ; then
+	    ret=0
+	    break
+	fi
+    done
+
+    if [ ! $ret = 0 ] ; then
+	$ECHO "Warning: config.guess download failed from: $CONFIG_GUESS_URL"
+	rm -f "${config_guess_temp}"
+    fi
+}
+
+
+##############################
+# LIBTOOLIZE_NEEDED FUNCTION #
+##############################
+libtoolize_needed () {
+    # succeeds (returns 0) when a libtool macro starts a line of the configure template
+    ret=1 # assume libtoolize is not needed
+    for feature in AC_PROG_LIBTOOL AM_PROG_LIBTOOL LT_INIT ; do
+	$VERBOSE_ECHO "Searching for $feature in $CONFIGURE"
+	found="`grep \"^$feature.*\" $CONFIGURE`"
+	test "x$found" = "x" && continue
+	ret=0 # a macro matched, libtoolize has to be run
+	break
+    done
+    return ${ret}
+}
+
+
+
+############################################
+# prepare build via autoreconf or manually #
+############################################
+reconfigure_manually=no  # becomes "yes" when autoreconf is unavailable or fails
+if [ "x$HAVE_AUTORECONF" = "xyes" ] ; then
+    $ECHO
+    $ECHO $ECHO_N "Automatically preparing build ... $ECHO_C"
+
+    $VERBOSE_ECHO "$AUTORECONF $SEARCH_DIRS $AUTORECONF_OPTIONS"
+    autoreconf_output="`$AUTORECONF $SEARCH_DIRS $AUTORECONF_OPTIONS 2>&1`"
+    ret=$?  # exit status of the autoreconf run captured above
+    $VERBOSE_ECHO "$autoreconf_output"
+
+    if [ ! $ret = 0 ] ; then
+	if [ "x$HAVE_ALT_LIBTOOLIZE" = "xyes" ] ; then
+	    if [ ! "x`echo \"$autoreconf_output\" | grep libtoolize | grep \"No such file or directory\"`" = "x" ] ; then
+		$ECHO
+		$ECHO "Warning: autoreconf failed but due to what is usually a common libtool"
+		$ECHO "misconfiguration issue.  This problem is encountered on systems that"
+		$ECHO "have installed libtoolize under a different name without providing a"
+		$ECHO "symbolic link or without setting the LIBTOOLIZE environment variable."
+		$ECHO
+		$ECHO "Restarting the preparation steps with LIBTOOLIZE set to $LIBTOOLIZE"
+
+		export LIBTOOLIZE  # the re-run then sees the alternate as an environment override
+		RUN_RECURSIVE=no
+		export RUN_RECURSIVE
+		untrap_abnormal
+
+		$VERBOSE_ECHO sh $AUTOGEN_SH "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9"
+		sh "$AUTOGEN_SH" "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9"
+		exit $?  # this process ends with the status of the re-run
+	    fi
+	fi
+
+	$ECHO "Warning: $AUTORECONF failed"
+
+	if test -f ltmain.sh ; then  # an ltmain.sh in the current directory is treated as a sign of misplacement
+	    $ECHO "libtoolize being run by autoreconf is not creating ltmain.sh in the auxillary directory like it should"
+	fi
+
+	$ECHO "Attempting to run the preparation steps individually"
+	reconfigure_manually=yes
+    else
+	if [ "x$DOWNLOAD" = "xyes" ] ; then
+	    if libtoolize_needed ; then  # config.guess is refreshed only for projects that use libtool
+		download_gnulib_config_guess
+	    fi
+	fi
+    fi
+else
+    reconfigure_manually=yes
+fi
+
+
+############################
+# LIBTOOL_FAILURE FUNCTION #
+############################
+libtool_failure ( ) {
+    # libtool is rather error-prone compared to the other autotools;
+    # this routine compensates for one common failure.  $1 is the
+    # captured output of the failed tool: if it mentions
+    # AC_PROG_LIBTOOL and the LIBTOOL_M4 file exists, that file is
+    # appended to acinclude.m4 and the script is re-run once with any
+    # arguments given after $1.  returns 1 if already retried.
+    _autoconf_output="$1"
+    if [ $# -gt 0 ] ; then shift ; fi
+
+    if [ "x$RUN_RECURSIVE" = "xno" ] ; then
+	# we already tried the libtool.m4, don't try again
+	return 1
+    fi
+
+    if test -f "$LIBTOOL_M4" ; then
+	found_libtool="`$ECHO \"$_autoconf_output\" | grep AC_PROG_LIBTOOL`"
+	if test ! "x$found_libtool" = "x" ; then
+	    if test -f acinclude.m4 ; then
+		rm -f acinclude.m4.$$.backup
+		$VERBOSE_ECHO "cat acinclude.m4 > acinclude.m4.$$.backup"
+		cat acinclude.m4 > acinclude.m4.$$.backup
+		chmod u+w acinclude.m4
+	    fi
+	    $VERBOSE_ECHO "cat \"$LIBTOOL_M4\" >> acinclude.m4"
+	    cat "$LIBTOOL_M4" >> acinclude.m4
+
+	    # don't keep doing this
+	    RUN_RECURSIVE=no
+	    export RUN_RECURSIVE
+	    untrap_abnormal
+
+	    $ECHO
+	    $ECHO "Restarting the preparation steps with libtool macros in acinclude.m4"
+	    $VERBOSE_ECHO sh $AUTOGEN_SH "$@"
+	    sh "$AUTOGEN_SH" "$@"
+	    exit $?
+	fi
+    fi
+}
+
+
+###########################
+# MANUAL_AUTOGEN FUNCTION #
+###########################
+manual_autogen ( ) {
+
+    ##################################################
+    # Manual preparation steps taken are as follows: #
+    #   aclocal [-I m4]                              #
+    #   libtoolize --automake -c -f                  #
+    #   aclocal [-I m4]                              #
+    #   autoconf -f                                  #
+    #   autoheader                                   #
+    #   automake -a -c -f                            #
+    ##################################################
+
+    ###########
+    # aclocal #
+    ###########
+    $VERBOSE_ECHO "$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS"
+    aclocal_output="`$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS 2>&1`"
+    ret=$?
+    $VERBOSE_ECHO "$aclocal_output"
+    if [ ! $ret = 0 ] ; then $ECHO "ERROR: $ACLOCAL failed" && exit 2 ; fi
+
+    ##############
+    # libtoolize #
+    ##############
+    if libtoolize_needed ; then
+	if [ "x$HAVE_LIBTOOLIZE" = "xyes" ] ; then
+	    $VERBOSE_ECHO "$LIBTOOLIZE $LIBTOOLIZE_OPTIONS"
+	    libtoolize_output="`$LIBTOOLIZE $LIBTOOLIZE_OPTIONS 2>&1`"
+	    ret=$?
+	    $VERBOSE_ECHO "$libtoolize_output"
+
+	    if [ ! $ret = 0 ] ; then $ECHO "ERROR: $LIBTOOLIZE failed" && exit 2 ; fi
+	else
+	    if [ "x$HAVE_ALT_LIBTOOLIZE" = "xyes" ] ; then  # an alternate-named libtoolize gets its own option set
+		$VERBOSE_ECHO "$LIBTOOLIZE $ALT_LIBTOOLIZE_OPTIONS"
+		libtoolize_output="`$LIBTOOLIZE $ALT_LIBTOOLIZE_OPTIONS 2>&1`"
+		ret=$?
+		$VERBOSE_ECHO "$libtoolize_output"
+
+		if [ ! $ret = 0 ] ; then $ECHO "ERROR: $LIBTOOLIZE failed" && exit 2 ; fi
+	    fi
+	fi
+
+	###########
+	# aclocal #
+	###########
+	# re-run again as instructed by libtoolize
+	$VERBOSE_ECHO "$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS"
+	aclocal_output="`$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS 2>&1`"
+	ret=$?  # NOTE(review): this status is never examined, so a failing second aclocal run goes unreported
+	$VERBOSE_ECHO "$aclocal_output"
+
+	# libtoolize might put ltmain.sh in the wrong place
+	if test -f ltmain.sh ; then
+	    if test ! -f "${_aux_dir}/ltmain.sh" ; then
+		$ECHO
+		$ECHO "Warning:  $LIBTOOLIZE is creating ltmain.sh in the wrong directory"
+		$ECHO
+		$ECHO "Fortunately, the problem can be worked around by simply copying the"
+		$ECHO "file to the appropriate location (${_aux_dir}/).  This has been done for you."
+		$ECHO
+		$VERBOSE_ECHO "cp -p ltmain.sh \"${_aux_dir}/ltmain.sh\""
+		cp -p ltmain.sh "${_aux_dir}/ltmain.sh"
+		$ECHO $ECHO_N "Continuing build preparation ... $ECHO_C"
+	    fi
+	fi # ltmain.sh
+
+	if [ "x$DOWNLOAD" = "xyes" ] ; then
+	    download_gnulib_config_guess
+	fi
+    fi # libtoolize_needed
+
+    ############
+    # autoconf #
+    ############
+    $VERBOSE_ECHO
+    $VERBOSE_ECHO "$AUTOCONF $AUTOCONF_OPTIONS"
+    autoconf_output="`$AUTOCONF $AUTOCONF_OPTIONS 2>&1`"
+    ret=$?
+    $VERBOSE_ECHO "$autoconf_output"
+
+    if [ ! $ret = 0 ] ; then
+	# retry without the -f and check for usage of macros that are too new
+	ac2_59_macros="AC_C_RESTRICT AC_INCLUDES_DEFAULT AC_LANG_ASSERT AC_LANG_WERROR AS_SET_CATFILE"
+	ac2_55_macros="AC_COMPILER_IFELSE AC_FUNC_MBRTOWC AC_HEADER_STDBOOL AC_LANG_CONFTEST AC_LANG_SOURCE AC_LANG_PROGRAM AC_LANG_CALL AC_LANG_FUNC_TRY_LINK AC_MSG_FAILURE AC_PREPROC_IFELSE"
+	ac2_54_macros="AC_C_BACKSLASH_A AC_CONFIG_LIBOBJ_DIR AC_GNU_SOURCE AC_PROG_EGREP AC_PROG_FGREP AC_REPLACE_FNMATCH AC_FUNC_FNMATCH_GNU AC_FUNC_REALLOC AC_TYPE_MBSTATE_T"
+
+	macros_to_search=""
+	ac_major="`echo ${AUTOCONF_VERSION}. | cut -d. -f1 | sed 's/[^0-9]//g'`"  # taken from the required minimum (AUTOCONF_VERSION), not the installed autoconf
+	ac_minor="`echo ${AUTOCONF_VERSION}. | cut -d. -f2 | sed 's/[^0-9]//g'`"
+
+	if [ $ac_major -lt 2 ] ; then  # NOTE(review): unquoted; an empty AUTOCONF_VERSION makes this test report an error
+	    macros_to_search="$ac2_59_macros $ac2_55_macros $ac2_54_macros"
+	else
+	    if [ $ac_minor -lt 54 ] ; then
+		macros_to_search="$ac2_59_macros $ac2_55_macros $ac2_54_macros"
+	    elif [ $ac_minor -lt 55 ] ; then
+		macros_to_search="$ac2_59_macros $ac2_55_macros"
+	    elif [ $ac_minor -lt 59 ] ; then
+		macros_to_search="$ac2_59_macros"
+	    fi
+	fi
+
+	configure_ac_macros=__none__  # sentinel meaning "no too-new macro seen yet"
+	for feature in $macros_to_search ; do
+	    $VERBOSE_ECHO "Searching for $feature in $CONFIGURE"
+	    found="`grep \"^$feature.*\" $CONFIGURE`"
+	    if [ ! "x$found" = "x" ] ; then
+		if [ "x$configure_ac_macros" = "x__none__" ] ; then
+		    configure_ac_macros="$feature"
+		else
+		    configure_ac_macros="$feature $configure_ac_macros"
+		fi
+	    fi
+	done
+	if [ ! "x$configure_ac_macros" = "x__none__" ] ; then
+	    $ECHO
+	    $ECHO "Warning:  Unsupported macros were found in $CONFIGURE"
+	    $ECHO
+	    $ECHO "The `basename \"$CONFIGURE\"` file was scanned in order to determine if any"
+	    $ECHO "unsupported macros are used that exceed the minimum version"
+	    $ECHO "settings specified within this file.  As such, the following macros"
+	    $ECHO "should be removed from configure.ac or the version numbers in this"
+	    $ECHO "file should be increased:"
+	    $ECHO
+	    $ECHO "$configure_ac_macros"
+	    $ECHO
+	    $ECHO $ECHO_N "Ignorantly continuing build preparation ... $ECHO_C"
+	fi
+
+	###################
+	# autoconf, retry #
+	###################
+	$VERBOSE_ECHO
+	$VERBOSE_ECHO "$AUTOCONF"
+	autoconf_output="`$AUTOCONF 2>&1`"  # second attempt runs with no options at all
+	ret=$?
+	$VERBOSE_ECHO "$autoconf_output"
+
+	if [ ! $ret = 0 ] ; then
+	    # test if libtool is busted
+	    libtool_failure "$autoconf_output"  # may restart the whole script and never return here
+
+	    # let the user know what went wrong
+	    cat <<EOF
+$autoconf_output
+EOF
+	    $ECHO "ERROR: $AUTOCONF failed"
+	    exit 2
+	else
+	    # autoconf sans -f and possibly sans unsupported options succeed so warn verbosely
+	    $ECHO
+	    $ECHO "Warning: autoconf seems to have succeeded by removing the following options:"
+	    $ECHO "	AUTOCONF_OPTIONS=\"$AUTOCONF_OPTIONS\""
+	    $ECHO
+	    $ECHO "Removing those options should not be necessary and indicate some other"
+	    $ECHO "problem with the build system.  The build preparation is highly suspect"
+	    $ECHO "and may result in configuration or compilation errors.  Consider"
+	    if [ "x$VERBOSE_ECHO" = "x:" ] ; then  # VERBOSE_ECHO being ":" is taken to mean verbose output is off
+		$ECHO "rerunning the build preparation with verbose output enabled."
+		$ECHO "	$AUTOGEN_SH --verbose"
+	    else
+		$ECHO "reviewing the minimum GNU Autotools version settings contained in"
+		$ECHO "this script along with the macros being used in your `basename \"$CONFIGURE\"` file."
+	    fi
+	    $ECHO
+	    $ECHO $ECHO_N "Continuing build preparation ... $ECHO_C"
+	fi # autoconf ret = 0
+    fi # autoconf ret = 0
+
+    ##############
+    # autoheader #
+    ##############
+    need_autoheader=no
+    for feature in AM_CONFIG_HEADER AC_CONFIG_HEADER ; do
+	$VERBOSE_ECHO "Searching for $feature in $CONFIGURE"
+	found="`grep \"^$feature.*\" $CONFIGURE`"  # matches only when the macro starts the line
+	if [ ! "x$found" = "x" ] ; then
+	    need_autoheader=yes
+	    break
+	fi
+    done
+    if [ "x$need_autoheader" = "xyes" ] ; then
+	$VERBOSE_ECHO "$AUTOHEADER $AUTOHEADER_OPTIONS"
+	autoheader_output="`$AUTOHEADER $AUTOHEADER_OPTIONS 2>&1`"
+	ret=$?
+	$VERBOSE_ECHO "$autoheader_output"
+	if [ ! $ret = 0 ] ; then $ECHO "ERROR: $AUTOHEADER failed" && exit 2 ; fi
+    fi # need_autoheader
+
+    ############
+    # automake #
+    ############
+    need_automake=no
+    for feature in AM_INIT_AUTOMAKE ; do
+	$VERBOSE_ECHO "Searching for $feature in $CONFIGURE"
+	found="`grep \"^$feature.*\" $CONFIGURE`"  # matches only when the macro starts the line
+	if [ ! "x$found" = "x" ] ; then
+	    need_automake=yes
+	    break
+	fi
+    done
+
+    if [ "x$need_automake" = "xyes" ] ; then
+	$VERBOSE_ECHO "$AUTOMAKE $AUTOMAKE_OPTIONS"
+	automake_output="`$AUTOMAKE $AUTOMAKE_OPTIONS 2>&1`"
+	ret=$?
+	$VERBOSE_ECHO "$automake_output"
+
+	if [ ! $ret = 0 ] ; then
+
+	    ###################
+	    # automake, retry #
+	    ###################
+	    $VERBOSE_ECHO
+	    $VERBOSE_ECHO "$AUTOMAKE $ALT_AUTOMAKE_OPTIONS"
+	    # retry without the -f
+	    automake_output="`$AUTOMAKE $ALT_AUTOMAKE_OPTIONS 2>&1`"
+	    ret=$?
+	    $VERBOSE_ECHO "$automake_output"
+
+	    if [ ! $ret = 0 ] ; then
+	 	# test if libtool is busted
+		libtool_failure "$automake_output"  # may restart the whole script and never return here
+
+		# let the user know what went wrong
+		cat <<EOF
+$automake_output
+EOF
+		$ECHO "ERROR: $AUTOMAKE failed"
+		exit 2
+	    fi # automake retry
+	fi # automake ret = 0
+    fi # need_automake
+} # end of manual_autogen
+
+
+#####################################
+# RECURSIVE_MANUAL_AUTOGEN FUNCTION #
+#####################################
+recursive_manual_autogen ( ) {
+    # run the manual preparation steps in the current directory, then
+    # descend into every directory listed in CONFIG_SUBDIRS and do the
+    # same there.  initialize is run in each subdirectory first, which
+    # recomputes CONFIG_SUBDIRS for that directory, so nested
+    # sub-configures are picked up by the recursive call.
+    manual_autogen
+
+    # no recursive configure directories, nothing more to do
+    if [ "x$CONFIG_SUBDIRS" = "x" ] ; then
+	return
+    fi
+
+    $VERBOSE_ECHO "Recursively configuring the following directories:"
+    $VERBOSE_ECHO "  $CONFIG_SUBDIRS"
+    for dir in $CONFIG_SUBDIRS ; do
+	$VERBOSE_ECHO "Processing recursive configure in $dir"
+	cd "$START_PATH"
+	cd "$dir"
+	initialize
+	recursive_manual_autogen
+    done
+}
+
+
+################################
+# run manual preparation steps #
+################################
+if [ "x$reconfigure_manually" = "xyes" ] ; then  # set when autoreconf is unavailable or has failed
+    $ECHO
+    $ECHO $ECHO_N "Preparing build ... $ECHO_C"
+
+    recursive_manual_autogen  # handles this directory and every configure subdirectory below it
+fi
+
+
+#########################
+# restore and summarize #
+#########################
+# return to the directory the script was started from
+cd "$START_PATH"
+
+# put back any COPYING and INSTALL files that were overwritten
+recursive_restore
+
+# derive the expected configure script name from the template name
+config_ac="`locate_configure_template`"
+config="`echo $config_ac | sed -e 's/\.ac$//' -e 's/\.in$//'`"
+test "x$config" = "x" && $VERBOSE_ECHO "Could not locate the configure template (from `pwd`)"
+
+# summarize
+$ECHO "done"
+$ECHO
+# a missing configure script after all of the above warrants a warning
+if [ ! "x$config" = "x" ] && [ -f "$config" ] ; then
+    $ECHO "The $PROJECT build system is now prepared.  To build here, run:"
+    $ECHO "  $config"
+    $ECHO "  make"
+else
+    $ECHO "WARNING: The $PROJECT build system should now be prepared but there"
+    $ECHO "does not seem to be a resulting configure file.  This is unexpected"
+    $ECHO "and likely the result of an error.  You should run $NAME_OF_AUTOGEN"
+    $ECHO "with the --verbose option to get more details on a potential"
+    $ECHO "misconfiguration."
+fi
+
+
+# Local Variables:
+# mode: sh
+# tab-width: 8
+# sh-basic-offset: 4
+# sh-indentation: 4
+# indent-tabs-mode: t
+# End:
+# ex: shiftwidth=4 tabstop=8
diff --git a/cmake/LLVM.cmake b/cmake/LLVM.cmake
index 2785f43..ea1a116 100644
--- a/cmake/LLVM.cmake
+++ b/cmake/LLVM.cmake
@@ -36,12 +36,8 @@ else()
   # search for any version
   find_program(LLVM_CONFIG
     NAMES "llvm-config"
+      "llvm-config-mp-3.8" "llvm-config-3.8" "llvm-config38"
       "llvm-config-mp-3.7" "llvm-config-3.7" "llvm-config37"
-      "llvm-config-mp-3.6" "llvm-config-3.6" "llvm-config36"
-      "llvm-config-mp-3.5" "llvm-config-3.5" "llvm-config35"
-      "llvm-config-mp-3.4" "llvm-config-3.4" "llvm-config34"
-      "llvm-config-mp-3.3" "llvm-config-3.3" "llvm-config33"
-      "llvm-config-mp-3.2" "llvm-config-3.2" "llvm-config32"
     DOC "llvm-config executable")
 endif()
 
@@ -119,13 +115,15 @@ run_llvm_config(LLVM_OBJ_ROOT --obj-root)
 string(REPLACE "${LLVM_PREFIX}" "${LLVM_PREFIX_CMAKE}" LLVM_OBJ_ROOT "${LLVM_OBJ_ROOT}")
 run_llvm_config(LLVM_ALL_TARGETS --targets-built)
 run_llvm_config(LLVM_HOST_TARGET --host-target)
-# TODO can be changed to --assertion-mode once we drop LLVM < 3.5 support
-run_llvm_config(LLVM_BUILD_MODE --build-mode)
-if(LLVM_BUILD_MODE MATCHES "Asserts")
-  set(LLVM_ASSERTS_BUILD 1)
-else()
-  set(LLVM_ASSERTS_BUILD 0)
+run_llvm_config(LLVM_ASSERTS_BUILD --assertion-mode)
+run_llvm_config(LLVM_SYSLIBS --system-libs)
+string(STRIP "${LLVM_SYSLIBS}" LLVM_SYSLIBS)
+
+if (MSVC)
+  string(REPLACE "-L${LLVM_LIBDIR}" "" LLVM_LDFLAGS "${LLVM_LDFLAGS}")
+  string(STRIP "${LLVM_LDFLAGS}" LLVM_LDFLAGS)
 endif()
+
 # Ubuntu's llvm reports "arm-unknown-linux-gnueabihf" triple, then if one tries
 # `clang --target=arm-unknown-linux-gnueabihf ...` it will produce armv6 code,
 # even if one's running armv7;
@@ -146,20 +144,13 @@ endif(WIN32)
 
 # required for sources..
 if(LLVM_VERSION MATCHES "3[.]([0-9]+)")
+  set(LLVM_MAJOR 3)
   string(STRIP "${CMAKE_MATCH_1}" LLVM_MINOR)
   message(STATUS "Minor llvm version: ${LLVM_MINOR}")
-  if(LLVM_MINOR STREQUAL "2")
-    set(LLVM_3_2 1)
-  elseif(LLVM_MINOR STREQUAL "3")
-    set(LLVM_3_3 1)
-  elseif(LLVM_MINOR STREQUAL "4")
-    set(LLVM_3_4 1)
-  elseif(LLVM_MINOR STREQUAL "5")
-    set(LLVM_3_5 1)
-  elseif(LLVM_MINOR STREQUAL "6")
-    set(LLVM_3_6 1)
-  elseif(LLVM_MINOR STREQUAL "7")
+  if(LLVM_MINOR STREQUAL "7")
     set(LLVM_3_7 1)
+  elseif(LLVM_MINOR STREQUAL "8")
+    set(LLVM_3_8 1)
   else()
     message(FATAL_ERROR "Unknown/unsupported minor llvm version: ${LLVM_MINOR}")
   endif()
@@ -176,15 +167,6 @@ if("${LLVM_CXXFLAGS}" MATCHES "-fno-rtti")
        See the INSTALL file for more information.")
 endif()
 
-# Ubuntu's LLVM 3.5 is broken (is really 3.4svn with
-# some patches, neither 3.4 nor 3.5 in the end..
-if((LLVM_MINOR GREATER 4) AND (CMAKE_SYSTEM_NAME MATCHES "Linux"))
-  message(STATUS "Testing for Ubuntu's broken LLVM 3.5+")
-  if(NOT EXISTS "${LLVM_INCLUDEDIR}/llvm/IR/CFG.h")
-    message(FATAL_ERROR "Your llvm installation is broken. This is known to be the case on Ubuntu and clones with llvm 3.5; official llvm 3.5 downloads should work though.")
-  endif()
-endif()
-
 # A few work-arounds for llvm-config issues
 
 # - pocl doesn't compile with '-pedantic'
@@ -198,10 +180,6 @@ if (NOT MSVC)
   set(LLVM_CXXFLAGS "${LLVM_CXXFLAGS} -fno-rtti")
 endif()
 
-if(NOT LLVM_VERSION VERSION_LESS "3.5")
-  run_llvm_config(LLVM_SYSLIBS --system-libs)
-  string(STRIP "${LLVM_SYSLIBS}" LLVM_SYSLIBS)
-endif()
 
 # Llvm-config may be installed or it might be used from build directory, in which case
 # we need to add few extra include paths to find clang includes and compiled includes
@@ -212,11 +190,10 @@ list(APPEND LLVM_INCLUDE_DIRS
   "${LLVM_OBJ_ROOT}/tools/clang/include")
 
 # Llvm-config does not include clang libs
-set(CLANG_LIBNAMES clangFrontendTool clangFrontend clangDriver clangSerialization clangCodeGen clangParse clangSema)
-if(LLVM_MINOR GREATER 4)
-  list(APPEND CLANG_LIBNAMES clangRewrite)
-endif()
-list(APPEND CLANG_LIBNAMES clangRewriteFrontend clangStaticAnalyzerFrontend clangStaticAnalyzerCheckers clangStaticAnalyzerCore clangAnalysis clangEdit clangAST clangLex clangBasic)
+set(CLANG_LIBNAMES clangFrontendTool clangFrontend clangDriver clangSerialization
+    clangCodeGen clangParse clangSema clangRewrite clangRewriteFrontend
+    clangStaticAnalyzerFrontend clangStaticAnalyzerCheckers
+    clangStaticAnalyzerCore clangAnalysis clangEdit clangAST clangLex clangBasic)
 
 foreach(LIBNAME ${CLANG_LIBNAMES})
   find_library(C_LIBFILE_${LIBNAME} NAMES "${LIBNAME}" HINTS "${LLVM_LIBDIR}")
@@ -257,7 +234,7 @@ if(CLANGXX_RES OR CLANG_RES)
 endif()
 
 find_program_or_die(LLVM_OPT "opt" "LLVM optimizer")
-find_program_or_die(LLC "llc" "LLVM static compiler") # TODO rename to LLVM_LLC
+find_program_or_die(LLVM_LLC "llc" "LLVM static compiler")
 find_program_or_die(LLVM_AS "llvm-as" "LLVM assembler")
 find_program_or_die(LLVM_LINK "llvm-link" "LLVM IR linker")
 find_program_or_die(LLVM_LLI "lli" "LLVM interpreter")
@@ -265,7 +242,7 @@ find_program_or_die(LLVM_LLI "lli" "LLVM interpreter")
 ####################################################################
 
 # try compile with any compiler (supplied as argument)
-macro(custom_try_compile_any COMPILER SUFFIX SOURCE RES_VAR)
+macro(custom_try_compile_any SILENT COMPILER SUFFIX SOURCE RES_VAR)
   string(RANDOM RNDNAME)
   set(RANDOM_FILENAME "${CMAKE_BINARY_DIR}/compile_test_${RNDNAME}.${SUFFIX}")
   file(WRITE "${RANDOM_FILENAME}" "${SOURCE}")
@@ -273,7 +250,7 @@ macro(custom_try_compile_any COMPILER SUFFIX SOURCE RES_VAR)
   math(EXPR LSIZE "${ARGC} - 4")
 
   execute_process(COMMAND "${COMPILER}" ${ARGN} "${RANDOM_FILENAME}" RESULT_VARIABLE ${RES_VAR} OUTPUT_VARIABLE OV ERROR_VARIABLE EV)
-  if(${${RES_VAR}})
+  if(${${RES_VAR}} AND (NOT ${SILENT}))
     message(STATUS " ########## The command: ")
     string(REPLACE ";" " " ARGN_STR "${ARGN}")
     message(STATUS "${COMPILER} ${ARGN_STR} ${RANDOM_FILENAME}")
@@ -299,7 +276,20 @@ macro(custom_try_compile_c_cxx COMPILER SUFFIX SOURCE1 SOURCE2 RES_VAR)
   ${SOURCE2}
 
   }")
-  custom_try_compile_any("${COMPILER}" ${SUFFIX} "${SOURCE_PROG}" ${RES_VAR} ${ARGN})
+  custom_try_compile_any(FALSE "${COMPILER}" ${SUFFIX} "${SOURCE_PROG}" ${RES_VAR} ${ARGN})
+endmacro()
+
+# convenience c/c++ source wrapper, silent variant (no diagnostics printed on compile failure)
+macro(custom_try_compile_c_cxx_silent COMPILER SUFFIX SOURCE1 SOURCE2 RES_VAR)
+  set(SOURCE_PROG "
+  ${SOURCE1}
+
+  int main(int argc, char** argv) {
+
+  ${SOURCE2}
+
+  }")
+  custom_try_compile_any(TRUE "${COMPILER}" ${SUFFIX} "${SOURCE_PROG}" ${RES_VAR} ${ARGN})
 endmacro()
 
 # clang++ try-compile macro
@@ -338,7 +328,7 @@ macro(custom_try_run_exe SOURCE1 SOURCE2 OUTPUT_VAR RES_VAR)
 endmacro()
 
 # clang try-compile-run macro, run via lli, the llvm interpreter
-macro(custom_try_run_lli SOURCE1 SOURCE2 OUTPUT_VAR RES_VAR)
+macro(custom_try_run_lli SILENT SOURCE1 SOURCE2 OUTPUT_VAR RES_VAR)
 # this uses "lli" - the interpreter, so we can run any -target
 # TODO variable for target !!
   set(OUTF "${CMAKE_BINARY_DIR}/try_run.bc")
@@ -354,7 +344,7 @@ macro(custom_try_run_lli SOURCE1 SOURCE2 OUTPUT_VAR RES_VAR)
     execute_process(COMMAND "${LLVM_LLI}" "-force-interpreter" "${OUTF}" RESULT_VARIABLE RESV OUTPUT_VARIABLE ${OUTPUT_VAR} ERROR_VARIABLE EV)
     set(${RES_VAR} ${RESV})
     file(REMOVE "${OUTF}")
-    if(${RESV})
+    if(${RESV} AND (NOT ${SILENT}))
       message(STATUS " ########## The command ${LLVM_LLI} -force-interpreter ${OUTF}")
       message(STATUS " ########## Exited with nonzero status: ${RESV}")
       if(${${OUTPUT_VAR}})
@@ -415,25 +405,12 @@ set_cache_var(CLANG_TARGET_OPTION "Clang option used to specify the target" )
 
 ####################################################################
 
-macro(CHECK_SIZEOF TYPE RES_VAR TRIPLE)
-  setup_cache_var_name(SIZEOF "${TYPE}-${TRIPLE}-${CLANG}")
-
-  if(NOT DEFINED ${CACHE_VAR_NAME})
-    custom_try_run_lli("" "return sizeof(${TYPE});" SIZEOF_STDOUT ${RES_VAR} "${CLANG_TARGET_OPTION}${TRIPLE}")
-    if(NOT ${RES_VAR})
-      message(SEND_ERROR "Could not determine sizeof(${TYPE})")
-    endif()
-  endif()
-
-  set_cache_var(${RES_VAR} "Size of ${TYPE}")
-endmacro()
-
 macro(CHECK_ALIGNOF TYPE TYPEDEF RES_VAR TRIPLE)
   setup_cache_var_name(ALIGNOF "${TYPE}-${TYPEDEF}-${TRIPLE}-${CLANG}")
 
   if(NOT DEFINED ${CACHE_VAR_NAME})
 
-    custom_try_run_lli("
+    custom_try_run_lli(TRUE "
 #ifndef offsetof
 #define offsetof(type, member) ((char *) &((type *) 0)->member - (char *) 0)
 #endif
@@ -506,10 +483,6 @@ endif()
 # llvm-config does not always report the "-DNDEBUG" flag correctly
 # (see LLVM bug 18253). If LLVM and the pocl passes are built with
 # different NDEBUG settings, problems arise
-#
-# TODO: How this test should actually recognize, if llvm
-#       is built without assertions? On OSX this always
-#       passed and thinks there is no assertions...
 
 if(NOT LLVM_CXXFLAGS MATCHES "-DNDEBUG")
 
@@ -549,8 +522,6 @@ endif()
 
 ####################################################################
 
-# DONE
-
 # TODO: We need to set both target-triple and cpu-type when
 # building, since the ABI depends on both. We can either add flags
 # to all the scripts, or set the respective flags here in
@@ -590,8 +561,8 @@ endif()
 set_cache_var(LLC_TRIPLE "LLC_TRIPLE")
 
 if(NOT DEFINED LLC_HOST_CPU AND NOT CMAKE_CROSSCOMPILING)
-  message(STATUS "Find out LLC host CPU with ${LLC}")
-  execute_process(COMMAND ${LLC} "--version" RESULT_VARIABLE RES_VAR OUTPUT_VARIABLE OUTPUT_VAR)
+  message(STATUS "Find out LLC host CPU with ${LLVM_LLC}")
+  execute_process(COMMAND ${LLVM_LLC} "--version" RESULT_VARIABLE RES_VAR OUTPUT_VARIABLE OUTPUT_VAR)
   # WTF, ^^ has return value 1
   #if(RES_VAR)
   #  message(FATAL_ERROR "Error ${RES_VAR} while determining LLC host CPU")
@@ -613,64 +584,82 @@ endif()
 set(LLC_HOST_CPU "${LLC_HOST_CPU}" CACHE STRING "The Host CPU to use with llc")
 
 ####################################################################
-#X86 has -march and -mcpu reversed, for clang
 
-if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(powerpc|armv7)")
-  set(CLANG_MARCH_FLAG "-mcpu=")
-else()
-  set(CLANG_MARCH_FLAG "-march=")
-endif()
+# This tests that we can actually link to the llvm libraries.
+# Mostly to catch issues like #295 - cannot find -ledit
 
-####################################################################
-# line 823 in configure.ac:
-# case $host_cpu in
-
-#~
-#~ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "armv6l")
-    #~ MESSAGE(STATUS "Using the ARM optimized kernel lib for the native device")
-    #~ # TODO better...
-    #~ ;;
-#~
-#~
-#~ elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64|AMD64)")
-  #~ message(STATUS "using the x86_64 optimized kernel lib for the native device")
-#~
-#~ endif()
+setup_cache_var_name(LLVM_LINK_TEST "LLVM_LINK_TEST-${LLVM_HOST_TARGET}-${CLANG}")
 
+if(NOT DEFINED ${CACHE_VAR_NAME})
 
-####################################################################
+  set(LLVM_LINK_TEST_SOURCE "
+    #include <stdio.h>
+    #include \"llvm/IR/LLVMContext.h\"
+    #include \"llvm/Support/SourceMgr.h\"
+    #include \"llvm/IR/Module.h\"
+    #include \"llvm/IRReader/IRReader.h\"
 
-# Work-around a clang bug in LLVM 3.3: On 32-bit platforms, the size
-# of Open CL C long is not 8 bytes
+    int main( int argc, char* argv[] )
+    {
+       if( argc < 2 )
+         exit(2);
 
-#  set(_CL_DISABLE_LONG ${BUG_PRESENT} CACHE INTERNAL "bug in LLVM 3.3: On 32-bit platforms, the size of Open CL C long is not 8 bytes")
+       llvm::LLVMContext &context = llvm::getGlobalContext();
+       llvm::SMDiagnostic err;
+       std::unique_ptr<llvm::Module> module = llvm::parseIRFile( argv[1], err, context );
 
-setup_cache_var_name(CL_DISABLE_LONG "CL_DISABLE_LONG-${LLVM_HOST_TARGET}-${CLANG}")
+       if( !module )
+         exit(1);
+       else
+         module->dump();
 
-if(NOT DEFINED ${CACHE_VAR_NAME})
-  set(CL_DISABLE_LONG 0)
-  # TODO -march=CPU flags !
-  custom_try_compile_any("${CLANG}" "cl" "constant int test[sizeof(long)==8?1:-1]={1};" RESV  -x cl -S ${CLANG_TARGET_OPTION}${LLC_TRIPLE} ${CLANG_MARCH_FLAG}${LLC_HOST_CPU})
-  if(RESV)
-    set(CL_DISABLE_LONG 1)
+       return 0;
+    }")
+
+  string(RANDOM RNDNAME)
+  set(LLVM_LINK_TEST_FILENAME "${CMAKE_BINARY_DIR}/llvm_link_test_${RNDNAME}.cc")
+  file(WRITE "${LLVM_LINK_TEST_FILENAME}" "${LLVM_LINK_TEST_SOURCE}")
+
+  try_compile(LLVM_LINK_TEST ${CMAKE_BINARY_DIR} "${LLVM_LINK_TEST_FILENAME}"
+              CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${LLVM_INCLUDE_DIRS}"
+              CMAKE_FLAGS "-DLINK_DIRECTORIES:STRING=${LLVM_LIBDIR}"
+              LINK_LIBRARIES "${LLVM_LDFLAGS} ${LLVM_LIBS} ${LLVM_SYSLIBS}"
+              COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS}"
+              OUTPUT_VARIABLE _TRY_COMPILE_OUTPUT)
+
+  if (LLVM_LINK_TEST)
+    message(STATUS "LLVM link test OK")
+  else()
+    message(STATUS "LLVM link test output: ${_TRY_COMPILE_OUTPUT}")
+    message(FATAL_ERROR "LLVM link test FAILED. This mostly happens when your LLVM installation does not have all dependencies installed.")
   endif()
+
 endif()
 
-set_cache_var(CL_DISABLE_LONG "Disable cl_khr_int64 because of buggy llvm")
+set_cache_var(LLVM_LINK_TEST "LLVM link test result")
 
+####################################################################
+#X86 has -march and -mcpu reversed, for clang
+
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(powerpc|armv7)")
+  set(CLANG_MARCH_FLAG "-mcpu=")
+else()
+  set(CLANG_MARCH_FLAG "-march=")
+endif()
 
 ####################################################################
 
 if(NOT DEFINED ${CL_DISABLE_HALF})
   set(CL_DISABLE_HALF 0)
-  # TODO -march=CPU flags !
-  custom_try_compile_c_cxx("${CLANG}" "c" "__fp16 callfp16(__fp16 a) { return a * (__fp16)1.8; };" "__fp16 x=callfp16((__fp16)argc);" RESV -c ${CLANG_TARGET_OPTION}${LLC_TRIPLE} ${CLANG_MARCH_FLAG}${LLC_HOST_CPU})
+  message(STATUS "Checking fp16 support")
+  custom_try_compile_c_cxx_silent("${CLANG}" "c" "__fp16 callfp16(__fp16 a) { return a * (__fp16)1.8; };" "__fp16 x=callfp16((__fp16)argc);" RESV -c ${CLANG_TARGET_OPTION}${LLC_TRIPLE} ${CLANG_MARCH_FLAG}${LLC_HOST_CPU})
   if(RESV)
     set(CL_DISABLE_HALF 1)
   endif()
 endif()
 
 set(CL_DISABLE_HALF "${CL_DISABLE_HALF}" CACHE BOOL "Disable cl_khr_fp16 because fp16 is not supported")
+message(STATUS "fp16 disabled: ${CL_DISABLE_HALF}")
 
 ####################################################################
 
@@ -691,7 +680,7 @@ if(ENABLE_HSA)
     set(HSA_RUNTIME_DIR "/opt/hsa")
   endif()
 
-  if((IS_ABSOLUTE "${WITH_HSA_RUNTIME_DIR}") AND (EXISTS "${WITH_HSA_RUNTIME_DIR}"))
+  if((IS_ABSOLUTE "${HSA_RUNTIME_DIR}") AND (EXISTS "${HSA_RUNTIME_DIR}"))
     set(HSA_INCLUDEDIR "${HSA_RUNTIME_DIR}/include")
     set(HSA_LIBDIR "${HSA_RUNTIME_DIR}/lib")
   else()
@@ -713,15 +702,19 @@ if(ENABLE_HSA)
   if(DEFINED WITH_HSAILASM_PATH)
     set(HSAILASM_SEARCH_PATH "${WITH_HSAILASM_PATH}")
   else()
-    set(HSAILASM_SEARCH_PATH "${HSA_RUNTIME_DIR}/bin")
+    set(HSAILASM_SEARCH_PATH "${HSA_RUNTIME_DIR}")
   endif()
 
-  find_program(HSAIL_ASM "HSAILasm${CMAKE_EXECUTABLE_SUFFIX}" PATHS "${HSAILASM_SEARCH_PATH}")
+  if((EXISTS "${HSAILASM_SEARCH_PATH}") AND
+     (NOT IS_DIRECTORY "${HSAILASM_SEARCH_PATH}"))
+    set(HSAIL_ASM "${HSAILASM_SEARCH_PATH}")
+  else()
+    find_program(HSAIL_ASM "HSAILasm${CMAKE_EXECUTABLE_SUFFIX}" PATHS "${HSAILASM_SEARCH_PATH}" "${HSAILASM_SEARCH_PATH}/bin")
+  endif()
   if(NOT HSAIL_ASM)
     message(FATAL_ERROR "HSAILasm executable not found (use -DWITH_HSAILASM_PATH=... to specify)")
   endif()
 
-
   message(STATUS "OK, building HSA")
 endif()
 
diff --git a/cmake/bitcode_rules.cmake b/cmake/bitcode_rules.cmake
index 1d3c64f..dd6853b 100644
--- a/cmake/bitcode_rules.cmake
+++ b/cmake/bitcode_rules.cmake
@@ -30,28 +30,29 @@ separate_arguments(KERNEL_CLANGXX_FLAGS)
 
 #/usr/bin/clang --target=x86_64-pc-linux-gnu -march=bdver1 -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off -D__OPENCL_VERSION__=120 -DPOCL_VECMATHLIB_BUILTIN -D__CBUILD__ -o get_local_id.bc -c ${CMAKE_SOURCE_DIR}/lib/kernel/get_local_id.c -include ${CMAKE_SOURCE_DIR}/include/_kernel_c.h
 #	  @CLANG@ ${CLANG_FLAGS} ${KERNEL_CL_FLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $< 
-function(compile_c_to_bc FILENAME BC_FILE_LIST)
+function(compile_c_to_bc FILENAME SUBDIR BC_FILE_LIST)
     get_filename_component(FNAME "${FILENAME}" NAME)
-    set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${FNAME}.bc")
+    set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}/${FNAME}.bc")
     set(${BC_FILE_LIST} ${${BC_FILE_LIST}} ${BC_FILE} PARENT_SCOPE)
     set(FULL_F_PATH "${CMAKE_SOURCE_DIR}/lib/kernel/${FILENAME}")
 
     add_custom_command( OUTPUT "${BC_FILE}"
         DEPENDS "${FULL_F_PATH}"
         "${CMAKE_SOURCE_DIR}/include/pocl_types.h"
-        "${CMAKE_SOURCE_DIR}/include/pocl_features.h"
         "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
         ${KERNEL_DEPEND_HEADERS}
-        COMMAND "${CLANG}" ${CLANG_FLAGS} ${KERNEL_CL_FLAGS} "-D__CBUILD__" "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}" "-include" "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
+        COMMAND "${CLANG}" ${CLANG_FLAGS} ${KERNEL_CL_FLAGS} ${DEVICE_CL_FLAGS}
+        "-D__CBUILD__" "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}"
+        "-include" "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
         COMMENT "Building C to LLVM bitcode ${BC_FILE}" 
         VERBATIM)
 endfunction()
 
-# /usr/bin/clang++ --target=x86_64-pc-linux-gnu -march=bdver1 -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off -DVML_NO_IOSTREAM -DPOCL_VECMATHLIB_BUILTIN -o trunc.bc -c ${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib-pocl/trunc.cc -include ${CMAKE_SOURCE_DIR}/include/pocl_features.h
-# 	@CLANGXX@ ${CLANG_FLAGS} ${KERNEL_CLANGXX_FLAGS} -c -o $@ $< -include ${abs_top_srcdir}/include/pocl_features.h
-function(compile_cc_to_bc FILENAME BC_FILE_LIST)
+# /usr/bin/clang++ --target=x86_64-pc-linux-gnu -march=bdver1 -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off -DVML_NO_IOSTREAM -DPOCL_VECMATHLIB_BUILTIN -o trunc.bc -c ${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib-pocl/trunc.cc
+# 	@CLANGXX@ ${CLANG_FLAGS} ${KERNEL_CLANGXX_FLAGS} -c -o $@ $<
+function(compile_cc_to_bc FILENAME SUBDIR BC_FILE_LIST)
     get_filename_component(FNAME "${FILENAME}" NAME)
-    set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${FNAME}.bc")
+    set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}/${FNAME}.bc")
     set(${BC_FILE_LIST} ${${BC_FILE_LIST}} ${BC_FILE} PARENT_SCOPE)
     set(FULL_F_PATH "${CMAKE_SOURCE_DIR}/lib/kernel/${FILENAME}")
 
@@ -59,17 +60,17 @@ function(compile_cc_to_bc FILENAME BC_FILE_LIST)
 
     add_custom_command(OUTPUT "${BC_FILE}"
         DEPENDS "${FULL_F_PATH}"
-          "${CMAKE_SOURCE_DIR}/include/pocl_features.h"
           ${KERNEL_DEPEND_HEADERS}
-        COMMAND  "${CLANGXX}" ${CLANG_FLAGS} ${KERNEL_CLANGXX_FLAGS} "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}" "-include" "${CMAKE_SOURCE_DIR}/include/pocl_features.h"
+        COMMAND  "${CLANGXX}" ${CLANG_FLAGS} ${KERNEL_CLANGXX_FLAGS}
+        ${DEVICE_CL_FLAGS} "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}"
         COMMENT "Building C++ to LLVM bitcode ${BC_FILE}" 
         VERBATIM)
 endfunction()
 
 # /usr/bin/clang --target=x86_64-pc-linux-gnu -march=bdver1 -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off -x cl -D__OPENCL_VERSION__=120 -DPOCL_VECMATHLIB_BUILTIN -fsigned-char -o atan2pi.bc -c ${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib-pocl/atan2pi.cl -include ${CMAKE_SOURCE_DIR}/include/_kernel.h
-function(compile_cl_to_bc FILENAME BC_FILE_LIST)
+function(compile_cl_to_bc FILENAME SUBDIR BC_FILE_LIST)
     get_filename_component(FNAME "${FILENAME}" NAME)
-    set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${FNAME}.bc")
+    set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}/${FNAME}.bc")
     set(${BC_FILE_LIST} ${${BC_FILE_LIST}} ${BC_FILE} PARENT_SCOPE)
     set(FULL_F_PATH "${CMAKE_SOURCE_DIR}/lib/kernel/${FILENAME}")
 
@@ -80,17 +81,19 @@ function(compile_cl_to_bc FILENAME BC_FILE_LIST)
           "${CMAKE_SOURCE_DIR}/include/_kernel.h"
           "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
           "${CMAKE_SOURCE_DIR}/include/pocl_types.h" 
-          "${CMAKE_SOURCE_DIR}/include/pocl_features.h"
           ${KERNEL_DEPEND_HEADERS}
-        COMMAND "${CLANG}" ${CLANG_FLAGS} "-x" "cl" ${KERNEL_CL_FLAGS}  "-fsigned-char"  "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}" "-include" "${CMAKE_SOURCE_DIR}/include/_kernel.h"
+        COMMAND "${CLANG}" ${CLANG_FLAGS} "-x" "cl" ${KERNEL_CL_FLAGS} ${DEVICE_CL_FLAGS}
+        "-fsigned-char" "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}"
+        "-include" "${CMAKE_SOURCE_DIR}/include/_kernel.h"
         COMMENT "Building CL to LLVM bitcode ${BC_FILE}" 
         VERBATIM)
 endfunction()
 
 
-function(compile_ll_to_bc FILENAME BC_FILE_LIST)
+
+function(compile_ll_to_bc FILENAME SUBDIR BC_FILE_LIST)
     get_filename_component(FNAME "${FILENAME}" NAME)
-    set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${FNAME}.bc")
+    set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}/${FNAME}.bc")
     set(${BC_FILE_LIST} ${${BC_FILE_LIST}} ${BC_FILE} PARENT_SCOPE)
     set(FULL_F_PATH "${CMAKE_SOURCE_DIR}/lib/kernel/${FILENAME}")
 
@@ -103,16 +106,16 @@ function(compile_ll_to_bc FILENAME BC_FILE_LIST)
 endfunction()
 
 
-macro(compile_to_bc OUTPUT_FILE_LIST)
+macro(compile_to_bc SUBDIR OUTPUT_FILE_LIST)
   foreach(FILENAME ${ARGN})
   if(FILENAME MATCHES "[.]c$")
-    compile_c_to_bc("${FILENAME}" ${OUTPUT_FILE_LIST})
+    compile_c_to_bc("${FILENAME}" "${SUBDIR}" ${OUTPUT_FILE_LIST})
   elseif(FILENAME MATCHES "[.]cc$")
-    compile_cc_to_bc("${FILENAME}" ${OUTPUT_FILE_LIST})
+    compile_cc_to_bc("${FILENAME}" "${SUBDIR}" ${OUTPUT_FILE_LIST})
   elseif(FILENAME MATCHES "[.]cl$")
-    compile_cl_to_bc("${FILENAME}" ${OUTPUT_FILE_LIST})
+    compile_cl_to_bc("${FILENAME}" "${SUBDIR}" ${OUTPUT_FILE_LIST})
   elseif(FILENAME MATCHES "[.]ll$")
-    compile_ll_to_bc("${FILENAME}" ${OUTPUT_FILE_LIST})
+    compile_ll_to_bc("${FILENAME}" "${SUBDIR}" ${OUTPUT_FILE_LIST})
   else()
     message(FATAL_ERROR "Dont know how to compile ${FILENAME} to .bc !")
   endif()
@@ -121,20 +124,21 @@ endmacro()
 
 
 
-function(make_kernel_bc OUTPUT_VAR NAME)
+function(make_kernel_bc OUTPUT_VAR NAME SUBDIR)
   set(KERNEL_BC "${CMAKE_CURRENT_BINARY_DIR}/kernel-${NAME}.bc")
   set(${OUTPUT_VAR} "${KERNEL_BC}" PARENT_SCOPE)
 
-  compile_to_bc(BC_LIST ${ARGN})
+  file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}")
+  compile_to_bc("${SUBDIR}" BC_LIST ${ARGN})
 
   # fix too long commandline with cat and xargs
-  SET(BC_LIST_FILE_TXT "")
+  set(BC_LIST_FILE_TXT "")
   foreach(FILENAME ${BC_LIST})
     # straight parsing semicolon separated list with xargs -d didn't work on windows.. no such switch available
-    SET(BC_LIST_FILE_TXT "${BC_LIST_FILE_TXT} \"${FILENAME}\"")
+    set(BC_LIST_FILE_TXT "${BC_LIST_FILE_TXT} \"${FILENAME}\"")
   endforeach()
-  SET (BC_LIST_FILE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/kernel_${NAME}_linklist.txt")
-  FILE (WRITE "${BC_LIST_FILE}" "${BC_LIST_FILE_TXT}")
+  set(BC_LIST_FILE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/kernel_${NAME}_linklist.txt")
+  file(WRITE "${BC_LIST_FILE}" "${BC_LIST_FILE_TXT}")
 
   add_custom_command( OUTPUT "${KERNEL_BC}"
 # ${KERNEL_BC}: ${OBJ}
@@ -145,17 +149,5 @@ function(make_kernel_bc OUTPUT_VAR NAME)
         COMMENT "Linking Kernel bitcode ${KERNEL_BC}" 
         VERBATIM)
 
-  add_custom_command( OUTPUT "${CMAKE_BINARY_DIR}/kernellib_hash.h"
-    COMMAND "${CMAKE_COMMAND}" -DKERNELBC='${KERNEL_BC}'
-        -DINCLUDEDIR='${CMAKE_SOURCE_DIR}/include'
-        -DOUTPUT='${CMAKE_BINARY_DIR}/kernellib_hash.h'
-        -P "${CMAKE_SOURCE_DIR}/cmake/kernellib_hash.cmake"
-    DEPENDS "${KERNEL_BC}" "${CMAKE_SOURCE_DIR}/include/_kernel.h"
-        "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
-        "${CMAKE_SOURCE_DIR}/include/pocl_types.h"
-        "${CMAKE_SOURCE_DIR}/include/pocl_features.h"
-    COMMENT "Generating SHA1 of kernel lib..."
-    VERBATIM)
-
 endfunction()
 
diff --git a/cmake/kernellib_hash.cmake b/cmake/kernellib_hash.cmake
index 7959a86..70e75b5 100644
--- a/cmake/kernellib_hash.cmake
+++ b/cmake/kernellib_hash.cmake
@@ -1,7 +1,31 @@
-file(SHA1 "${KERNELBC}" S1)
+# TODO this is duplicated in top CMakeLists.txt
+function(rename_if_different SRC DST)
+  if(EXISTS "${DST}")
+    file(MD5 "${SRC}" OLD_MD5)
+    file(MD5 "${DST}" NEW_MD5)
+    if(NOT OLD_MD5 STREQUAL NEW_MD5)
+      message(STATUS "Renaming ${SRC} to ${DST}")
+      file(RENAME "${SRC}" "${DST}")
+    endif()
+  else()
+    message(STATUS "Renaming ${SRC} to ${DST}")
+    file(RENAME "${SRC}" "${DST}")
+  endif()
+endfunction()
+
+
+string(REPLACE "****" ";" KERNEL_BC_LIST "${KERNEL_BC_LIST_ESCAPED}")
+foreach(KERNEL_BC IN LISTS KERNEL_BC_LIST)
+  if(EXISTS ${KERNEL_BC})
+    file(SHA1 "${KERNEL_BC}" S)
+    set(S1 "${S}__${S1}")
+  endif()
+endforeach()
+
 file(SHA1 "${INCLUDEDIR}/_kernel.h" S2)
 file(SHA1 "${INCLUDEDIR}/_kernel_c.h" S3)
 file(SHA1 "${INCLUDEDIR}/pocl_types.h" S4)
-file(SHA1 "${INCLUDEDIR}/pocl_features.h" S5)
 
-file(WRITE "${OUTPUT}" "#define POCL_KERNELLIB_SHA1 \"${S1}${S2}${S3}${S4}${S5}\"")
+file(WRITE "${OUTPUT}.new" "#define POCL_KERNELLIB_SHA1 \"${S1}${S2}_${S3}_${S4}\"")
+
+rename_if_different("${OUTPUT}.new" "${OUTPUT}")
diff --git a/cmake/run_test.cmake b/cmake/run_test.cmake
index 5e4bf1a..24db0db 100644
--- a/cmake/run_test.cmake
+++ b/cmake/run_test.cmake
@@ -3,57 +3,64 @@
 if( NOT test_cmd )
   message( FATAL_ERROR "Variable test_cmd not defined" )
 endif()
-# output_blessed contains the name of the "blessed" output file
-if( NOT output_blessed )
-  message( FATAL_ERROR "Variable output_blessed not defined" )
-else()
+
+# output_blessed contains the name of the file with expected output
+if(output_blessed)
   message(STATUS "Expecting output: ${output_blessed}")
 endif()
 
-message(STATUS "POCL_DEVICES: $ENV{POCL_DEVICES}")
-message(STATUS "POCL_WORK_GROUP_METHOD: $ENV{POCL_WORK_GROUP_METHOD}")
-
 string(REPLACE "####" ";" test_cmd_separated "${test_cmd}")
 
-string(RANDOM RAND_STR)
-# TODO properly handle tmpdir
-set(RANDOM_FILE "/tmp/cmake_testrun_${RAND_STR}")
-
 execute_process(
   COMMAND ${test_cmd_separated}
   RESULT_VARIABLE test_not_successful
-  OUTPUT_FILE "${RANDOM_FILE}"
+  OUTPUT_VARIABLE stdout
   ERROR_VARIABLE stderr
 )
 
-if( sort_output )
-  message(STATUS "SORTING FILE")
-  file(STRINGS "${RANDOM_FILE}" output_string_list)
-  list(SORT output_string_list)
-  # for some reason sorting doesn't work when list contains newlines,
-  # have to add them after the sort
-  file(REMOVE "${RANDOM_FILE}")
-  string(REPLACE ";" "\n" OUTPUT "${output_string_list}")
-  set(RANDOM_FILE "${RANDOM_FILE}_sorted")
-  file(WRITE "${RANDOM_FILE}" "${OUTPUT}\n")
+if( test_not_successful )
+  message( SEND_ERROR "FAIL: Test exited with nonzero code: ${test_cmd_separated}\nSTDOUT:\n${stdout}\nSTDERR:\n${stderr}" )
+else()
+  message("${stdout}")
+  message("${stderr}")
 endif()
 
-if( test_not_successful )
-  message( SEND_ERROR "Test exited with nonzero code: ${test_cmd_separated}\nSTDERR:\n${stderr}" )
+if(output_blessed)
+
+  string(RANDOM RAND_STR)
+  set(RANDOM_FILE "/tmp/cmake_testrun_${RAND_STR}")
+  file(WRITE "${RANDOM_FILE}" "${stdout}")
+
+  if( sort_output )
+    message(STATUS "SORTING FILE")
+    file(STRINGS "${RANDOM_FILE}" output_string_list)
+    list(SORT output_string_list)
+    # for some reason sorting doesn't work when list contains newlines,
+    # have to add them after the sort
+    file(REMOVE "${RANDOM_FILE}")
+    string(REPLACE ";" "\n" OUTPUT "${output_string_list}")
+    set(RANDOM_FILE "${RANDOM_FILE}_sorted")
+    file(WRITE "${RANDOM_FILE}" "${OUTPUT}\n")
+  endif()
+
+  message(STATUS "Comparing output..")
+  execute_process(
+    COMMAND ${CMAKE_COMMAND} -E compare_files "${output_blessed}" "${RANDOM_FILE}"
+    RESULT_VARIABLE test_not_successful
+    )
+
+  if( test_not_successful )
+    message(SEND_ERROR "FAIL: Test output does not match the expected output; output stored in ${RANDOM_FILE}" )
+  else()
+    file(REMOVE "${RANDOM_FILE}")
+  endif()
+
 endif()
 
-#~ if( sort_output )
-  #~ execute_process(
-    #~ COMMAND "sort"
-#~ endif()
+if ((NOT "${stdout}${stderr}" MATCHES "OK")
+    AND
+    (NOT "${stdout}${stderr}" MATCHES "FAIL"))
 
-execute_process(
-  COMMAND ${CMAKE_COMMAND} -E compare_files "${output_blessed}" "${RANDOM_FILE}"
-  RESULT_VARIABLE test_not_successful
-)
+  message(STATUS "OK")
 
-if( test_not_successful )
-  message( SEND_ERROR "Test output does not match the expected output; output stored in ${RANDOM_FILE}" )
-else()
-  file(REMOVE "${RANDOM_FILE}")
 endif()
diff --git a/config.h.in b/config.h.in
index 5ef3011..b789dc6 100644
--- a/config.h.in
+++ b/config.h.in
@@ -18,9 +18,6 @@
 /* "Build with ICD" */
 #undef BUILD_ICD
 
-/* Defined when CellSPU tools are found */
-#undef BUILD_SPU
-
 /* clang executable. */
 #undef CLANG
 
@@ -36,9 +33,6 @@
 /* Additional CL compiler flags. */
 #undef CLFLAGS
 
-/* "Use a custom buffer allocator" */
-#undef CUSTOM_BUFFER_ALLOCATOR
-
 /* "Export OpenCL symbols" */
 #undef DIRECT_LINKAGE
 
@@ -54,9 +48,15 @@
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #undef HAVE_DLFCN_H
 
+/* Define to 1 if you have the `fork' function. */
+#undef HAVE_FORK
+
 /* Defined if The OpenGL Extension Wrangler library is found */
 #undef HAVE_GLEW
 
+/* Have AMD-specific HSA headers */
+#undef HAVE_HSA_EXT_AMD_H
+
 /* Define to 1 if you have the <inttypes.h> header file. */
 #undef HAVE_INTTYPES_H
 
@@ -96,6 +96,9 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H
 
+/* Define to 1 if you have the `vfork' function. */
+#undef HAVE_VFORK
+
 /* Define to 1 if __fp16 supports arithmetic operations float. */
 #undef HAVE_WORKING_HALF
 
@@ -111,6 +114,16 @@
 /* The host CPU type. */
 #undef HOST_CPU
 
+/* OpenCL major version supported by host device */
+#undef HOST_DEVICE_CL_VERSION_MAJOR
+
+/* OpenCL minor version supported by host device */
+#undef HOST_DEVICE_CL_VERSION_MINOR
+
+/* OpenCL device extensions implemented by the host devices basic and pthreads
+   */
+#undef HOST_DEVICE_EXTENSIONS
+
 /* "basic and pthreads devices use soft-float ABI" */
 #undef HOST_FLOAT_SOFT_ABI
 
@@ -123,33 +136,30 @@
 /* Path to HSAILasm executable */
 #undef HSAIL_ASM
 
-/* Use the libkernel from lib/kernel/$KERNEL_DIR/ */
-#undef KERNEL_DIR
-
-/* LLVM compiler executable. */
-#undef LLC
+/* OpenCL major version supported by HSA device */
+#undef HSA_DEVICE_CL_VERSION_MAJOR
 
-/* "Using LLVM 3.2" */
-#undef LLVM_3_2
+/* OpenCL minor version supported by HSA device */
+#undef HSA_DEVICE_CL_VERSION_MINOR
 
-/* "Using LLVM 3.3" */
-#undef LLVM_3_3
+/* OpenCL device extensions implemented by the HSA device */
+#undef HSA_DEVICE_EXTENSIONS
 
-/* "Using LLVM 3.4" */
-#undef LLVM_3_4
-
-/* "Using LLVM 3.5" */
-#undef LLVM_3_5
-
-/* "Using LLVM 3.6" */
-#undef LLVM_3_6
+/* Use the libkernel from lib/kernel/$KERNEL_DIR/ */
+#undef KERNEL_DIR
 
-/* "Using LLVM svn */
+/* "Using LLVM 3.7" */
 #undef LLVM_3_7
 
+/* "Using LLVM 3.8" */
+#undef LLVM_3_8
+
 /* "LLVM was built with Assertions on." */
 #undef LLVM_BUILT_WITH_ASSERTS
 
+/* LLVM compiler executable. */
+#undef LLVM_LLC
+
 /* "LLVM version as a string." */
 #undef LLVM_VERSION
 
@@ -195,12 +205,12 @@
 /* Directory where pocl files are installed in android */
 #undef POCL_ANDROID_PREFIX
 
-/* "Disabled kernel cache feature" */
-#undef POCL_BUILD_KERNEL_CACHE
-
 /* Timestamp of build */
 #undef POCL_BUILD_TIMESTAMP
 
+/* OpenCL version string as reported by clGetPlatformInfo */
+#undef POCL_CL_VERSION
+
 /* "Build pocl in debug mode" */
 #undef POCL_DEBUG_BUILD
 
@@ -210,6 +220,9 @@
 /* Value based on host processor, for basic and pthreads devices */
 #undef POCL_DEVICE_ADDRESS_BITS
 
+/* "Disabled kernel cache feature" */
+#undef POCL_KERNEL_CACHE_DEFAULT
+
 /* Define to necessary symbol if this constant uses a non-standard name on
    your system. */
 #undef PTHREAD_CREATE_JOINABLE
@@ -247,6 +260,18 @@
 /* Defined to 1 if TCE libraries and tools are available */
 #undef TCE_AVAILABLE
 
+/* OpenCL major version supported by TCE device */
+#undef TCE_DEVICE_CL_VERSION_MAJOR
+
+/* OpenCL minor version supported by TCE device */
+#undef TCE_DEVICE_CL_VERSION_MINOR
+
+/* TCE device supported extension list */
+#undef TCE_DEVICE_EXTENSIONS
+
+/* TCE device supported extensions as -D compiler defines */
+#undef TCE_DEVICE_EXTENSION_DEFINES
+
 /* "Use vecmathlib if available for the target." */
 #undef USE_VECMATHLIB
 
@@ -258,6 +283,3 @@
 
 /* Disable cl_khr_fp16 on host based devices. */
 #undef _CL_DISABLE_HALF
-
-/* Disable cl_khr_int64 on host based devices. */
-#undef _CL_DISABLE_LONG
diff --git a/config.h.in.cmake b/config.h.in.cmake
index f5499d2..8367d29 100644
--- a/config.h.in.cmake
+++ b/config.h.in.cmake
@@ -5,8 +5,6 @@
 /* The normal alignment of `float16', in bytes. */
 #define ALIGNOF_FLOAT16 @ALIGNOF_FLOAT16@
 
-#cmakedefine BUILD_SPU
-
 #cmakedefine BUILD_HSA
 
 #define POCL_BUILT_WITH_CMAKE
@@ -16,7 +14,9 @@
 /* "Build with ICD" */
 #cmakedefine BUILD_ICD
 
-#define LLVM_VERSION "@LLVM_VERSION@"
+#ifndef LLVM_VERSION
+#define LLVM_VERSION "@LLVM_VERSION_FULL@"
+#endif
 
 #define CLANG "@CLANG@"
 
@@ -37,16 +37,14 @@
 #define KERNEL_CL_FLAGS  "@KERNEL_CL_FLAGS@"
 
 
-/* "Use a custom buffer allocator" */
-#cmakedefine CUSTOM_BUFFER_ALLOCATOR
-
-
 #cmakedefine DIRECT_LINKAGE
 
 
 #define FORCED_CLFLAGS  "@FORCED_CLFLAGS@"
 
+#cmakedefine HAVE_FORK
 
+#cmakedefine HAVE_VFORK
 
 #cmakedefine HAVE_CLOCK_GETTIME
 
@@ -55,7 +53,7 @@
 /* Defined if posix_memalign is available. */
 #cmakedefine HAVE_POSIX_MEMALIGN
 
-
+#cmakedefine HAVE_HSA_EXT_AMD_H
 
 
 #define HOST  "@HOST@"
@@ -64,6 +62,8 @@
 
 #define HOST_CLANG_FLAGS  "@HOST_CLANG_FLAGS@"
 
+#define HOST_DEVICE_EXTENSIONS "@HOST_DEVICE_EXTENSIONS@"
+
 #define HOST_CPU  "@HOST_CPU@"
 
 #define HOST_LD_FLAGS  "@HOST_LD_FLAGS@"
@@ -72,29 +72,22 @@
 
 #cmakedefine HOST_FLOAT_SOFT_ABI
 
+#define HSA_DEVICE_EXTENSIONS "@HSA_DEVICE_EXTENSIONS@"
 
 
-#define LLC "@LLC@"
-
-
-/* "Using LLVM 3.2" */
-#cmakedefine LLVM_3_2
+#define KERNELLIB_HOST_CPU_VARIANTS "@KERNELLIB_HOST_CPU_VARIANTS@"
 
-/* "Using LLVM 3.3" */
-#cmakedefine LLVM_3_3
+#cmakedefine KERNELLIB_HOST_DISTRO_VARIANTS
 
-/* "Using LLVM 3.4" */
-#cmakedefine LLVM_3_4
+#define LLVM_LLC "@LLVM_LLC@"
 
-/* "Using LLVM 3.5" */
-#cmakedefine LLVM_3_5
-
-/* "Using LLVM 3.6" */
-#cmakedefine LLVM_3_6
 
 /* "Using LLVM 3.7" */
 #cmakedefine LLVM_3_7
 
+/* "Using LLVM 3.8" */
+#cmakedefine LLVM_3_8
+
 
 /* Defined to greatest expected alignment for extended types, in bytes. */
 #define MAX_EXTENDED_ALIGNMENT @MAX_EXTENDED_ALIGNMENT@
@@ -109,7 +102,7 @@
 #define PACKAGE_VERSION "@PACKAGE_VERSION@"
 
 
-#define POCL_BUILD_KERNEL_CACHE @POCL_BUILD_KERNEL_CACHE@
+#define POCL_KERNEL_CACHE_DEFAULT @POCL_KERNEL_CACHE_DEFAULT@
 
 #define POCL_DEVICE_ADDRESS_BITS @POCL_DEVICE_ADDRESS_BITS@
 
@@ -137,6 +130,7 @@
 
 #cmakedefine TCE_AVAILABLE
 
+#define TCE_DEVICE_EXTENSIONS "@TCE_DEVICE_EXTENSIONS@"
 
 /* "Use vecmathlib if available for the target." */
 #cmakedefine USE_VECMATHLIB
@@ -150,3 +144,14 @@
 
 /* Disable cl_khr_fp16 because fp16 is not supported */
 #cmakedefine _CL_DISABLE_HALF
+
+#define POCL_CL_VERSION "2.0"
+
+#define HSA_DEVICE_CL_VERSION_MAJOR 2
+#define HSA_DEVICE_CL_VERSION_MINOR 0
+
+#define HOST_DEVICE_CL_VERSION_MAJOR 2
+#define HOST_DEVICE_CL_VERSION_MINOR 0
+
+#define TCE_DEVICE_CL_VERSION_MAJOR 1
+#define TCE_DEVICE_CL_VERSION_MINOR 2
diff --git a/configure b/configure
index 8cd69ae..a9ad95e 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for pocl 0.12.
+# Generated by GNU Autoconf 2.69 for pocl 0.13.
 #
 # Report bugs to <pocl-devel at lists.sourceforge.net>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='pocl'
 PACKAGE_TARNAME='pocl'
-PACKAGE_VERSION='0.12'
-PACKAGE_STRING='pocl 0.12'
+PACKAGE_VERSION='0.13'
+PACKAGE_STRING='pocl 0.13'
 PACKAGE_BUGREPORT='pocl-devel at lists.sourceforge.net'
 PACKAGE_URL=''
 
@@ -632,6 +632,7 @@ ac_includes_default="\
 # include <unistd.h>
 #endif"
 
+ac_func_list=
 ac_subst_vars='am__EXEEXT_FALSE
 am__EXEEXT_TRUE
 LTLIBOBJS
@@ -644,6 +645,7 @@ LIB_AGE_VERSION
 LIB_REVISION_VERSION
 LIB_CURRENT_VERSION
 LIB_VERSION
+HOST_DEVICE_EXTENSION_DEFINES
 HOST_SIZEOF_VOID_P
 HOST_SIZEOF_DOUBLE
 HOST_SIZEOF_HALF
@@ -660,19 +662,17 @@ BUILD_AMDGCN_TRUE
 POAT_TESTSUITES
 TEST_SUITE_HSA_FALSE
 TEST_SUITE_HSA_TRUE
+HSA_DEVICE_EXTENSION_DEFINES
 HSAILASM
 HSA_LIBS
 HSA_INCLUDES
-BUILD_SPU_FALSE
-BUILD_SPU_TRUE
-LIBSPE_LIBS
-LIBSPE_CFLAGS
 POCL_ANDROID_FALSE
 POCL_ANDROID_TRUE
 TCEMC_AVAILABLE_FALSE
 TCEMC_AVAILABLE_TRUE
 TCE_AVAILABLE_FALSE
 TCE_AVAILABLE_TRUE
+TCE_DEVICE_EXTENSION_DEFINES
 TCE_AVAILABLE
 TCEMC_AVAILABLE
 TCECC
@@ -687,6 +687,8 @@ HOST_CLANG_FLAGS
 OCL_KERNEL_ARCH
 OCL_KERNEL_TARGET_CPU
 OCL_KERNEL_TARGET
+HOST_CPU_IS_X86_64_FALSE
+HOST_CPU_IS_X86_64_TRUE
 HOST_CPU
 HOST
 TARGET_LLC_FLAGS
@@ -730,6 +732,8 @@ PTHREAD_CFLAGS
 PTHREAD_LIBS
 PTHREAD_CC
 acx_pthread_config
+TEST_SUITE_INTELSVM_FALSE
+TEST_SUITE_INTELSVM_TRUE
 TEST_SUITE_CLOVERLEAF_FALSE
 TEST_SUITE_CLOVERLEAF_TRUE
 TEST_SUITE_OPENCV_FALSE
@@ -740,6 +744,8 @@ TEST_SUITE_PIGLIT_FALSE
 TEST_SUITE_PIGLIT_TRUE
 TEST_SUITE_VEXCL_FALSE
 TEST_SUITE_VEXCL_TRUE
+TEST_SUITE_AMDSDK3_0_FALSE
+TEST_SUITE_AMDSDK3_0_TRUE
 TEST_SUITE_AMDSDK2_9_FALSE
 TEST_SUITE_AMDSDK2_9_TRUE
 TEST_SUITE_AMD_FALSE
@@ -757,11 +763,7 @@ BOOST_CPPFLAGS
 SDL_LIBS
 SDL_CFLAGS
 LLVM_LIBS
-NEW_PRINTF_WORKS_FALSE
-NEW_PRINTF_WORKS_TRUE
 LLVM_LDFLAGS
-LLVM_3_6_FALSE
-LLVM_3_6_TRUE
 LLVM_VERSION
 LLVM_CONFIG
 CXXCPP
@@ -912,7 +914,6 @@ with_gnu_ld
 with_sysroot
 enable_libtool_lock
 enable_static_llvm
-enable_region_allocator
 enable_testsuites
 with_boost
 with_boost_libdir
@@ -969,8 +970,6 @@ TARGET_CLANG_FLAGS
 HOST_CLANG_FLAGS
 HOST_LLC_FLAGS
 HOST_AS_FLAGS
-LIBSPE_CFLAGS
-LIBSPE_LIBS
 HSAILASM'
 
 
@@ -1512,7 +1511,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures pocl 0.12 to adapt to many kinds of systems.
+\`configure' configures pocl 0.13 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1583,7 +1582,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of pocl 0.12:";;
+     short | recursive ) echo "Configuration of pocl 0.13:";;
    esac
   cat <<\_ACEOF
 
@@ -1603,14 +1602,9 @@ Optional Features:
                           optimize for fast installation [default=yes]
   --disable-libtool-lock  avoid locking (might break parallel builds)
   --enable-static-llvm    Link LLVM statically. Default is dynamic.
-  --enable-region-allocator
-                          Use a custom OpenCL optimized region-based memory
-                          allocator for the CPU devices instead of allocating
-                          buffers directly with malloc (experimental with
-                          known issues!).
   --enable-testsuites=suite1,suite2,...
                           choose enabled external project testsuites
-                          (all,opencl-book-samples,ViennaCL,Rodinia,Parboil,amd,amdsdk2_9,VexCL,Piglit,Halide,OpenCV,CloverLeaf,hsa
+                          (all,opencl-book-samples,ViennaCL,Rodinia,Parboil,amd,amdsdk2_9,amdsdk3_0,VexCL,Piglit,Halide,OpenCV,CloverLeaf,hsa,IntelSVM
   --enable-icd            Build pocl with the ICD extensions (default yes)
   --enable-direct-linkage Allow OpenCL programs to be linked directly against
                           the pocl library instead of using a ICD Loader
@@ -1704,9 +1698,6 @@ Some influential environment variables:
 
   HOST_AS_FLAGS
 
-  LIBSPE_CFLAGS
-              C compiler flags for LIBSPE, overriding pkg-config
-  LIBSPE_LIBS linker flags for LIBSPE, overriding pkg-config
   HSAILASM    Path to HSAILasm executable
 
 Use these variables to override the choices made by `configure' or to help
@@ -1775,7 +1766,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-pocl configure 0.12
+pocl configure 0.13
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2448,7 +2439,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by pocl $as_me 0.12, which was
+It was created by pocl $as_me 0.13, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2728,6 +2719,8 @@ $as_echo "$as_me: creating cache $cache_file" >&6;}
   >$cache_file
 fi
 
+as_fn_append ac_func_list " vfork"
+as_fn_append ac_func_list " fork"
 # Check that the precious variables saved in the cache have kept the same
 # value.
 ac_cache_corrupted=false
@@ -3428,7 +3421,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='pocl'
- VERSION='0.12'
+ VERSION='0.13'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -16740,7 +16733,7 @@ LD_FLAGS_BIN=""
 # LLVM configuration
 #
 
-for ac_prog in llvm-config llvm-config-mp-3.3 llvm-config-3.3 llvm-config33 llvm-config-mp-3.4 llvm-config-3.4 llvm-config34 llvm-config-mp-3.2 llvm-config-3.2 llvm-config32
+for ac_prog in llvm-config llvm-config-mp-3.7 llvm-config-3.7 llvm-config37 llvm-config-mp-3.6 llvm-config-3.6 llvm-config36
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
@@ -16794,9 +16787,6 @@ LLVM_VERSION=`$LLVM_CONFIG --version`
 LLVM_BINDIR=`$LLVM_CONFIG --bindir`
 LLVM_LIBDIR=`$LLVM_CONFIG --libdir`
 LLVM_LDFLAGS=`$LLVM_CONFIG --ldflags`
-# Whether we can use our own printf implementation
-NEW_PRINTF_WORKS=true
-LLVM_3_6=false
 
 LLVM_VERSION=$LLVM_VERSION
 
@@ -16807,69 +16797,28 @@ _ACEOF
 
 
 case "$LLVM_VERSION" in
-     3.2*)
-
-$as_echo "#define LLVM_3_2 /**/" >>confdefs.h
-
-     NEW_PRINTF_WORKS=false
-   ;;
-     3.3*)
-
-$as_echo "#define LLVM_3_3 /**/" >>confdefs.h
-
-     NEW_PRINTF_WORKS=false
-   ;;
-     3.4*)
-
-$as_echo "#define LLVM_3_4 /**/" >>confdefs.h
-
-   ;;
-     3.5*)
-
-$as_echo "#define LLVM_3_5 /**/" >>confdefs.h
-
-     LLVM_LDFLAGS="$LLVM_LDFLAGS `$LLVM_CONFIG --system-libs`"
-   ;;
-     3.6*)
+     3.7*)
 
-$as_echo "#define LLVM_3_6 /**/" >>confdefs.h
+$as_echo "#define LLVM_3_7 /**/" >>confdefs.h
 
-     LLVM_3_6=true
      LLVM_LDFLAGS="$LLVM_LDFLAGS `$LLVM_CONFIG --system-libs`"
    ;;
-     3.7*)
+     3.8*)
 
-$as_echo "#define LLVM_3_7 /**/" >>confdefs.h
+$as_echo "#define LLVM_3_8 /**/" >>confdefs.h
 
      LLVM_LDFLAGS="$LLVM_LDFLAGS `$LLVM_CONFIG --system-libs`"
    ;;
      *)
    as_fn_error $? "
-Unsupported LLVM version. Please use LLVM version 3.2, 3.3, 3.4, 3.5, 3.6, 3.7.
+Unsupported LLVM version. Please use LLVM version 3.7 or 3.8.
    " "$LINENO" 5
    LLVM_VERSION=
    ;;
 esac
 
- if test "x$LLVM_3_6" = "xtrue"; then
-  LLVM_3_6_TRUE=
-  LLVM_3_6_FALSE='#'
-else
-  LLVM_3_6_TRUE='#'
-  LLVM_3_6_FALSE=
-fi
-
-
 LLVM_LDFLAGS=$LLVM_LDFLAGS
 
- if $NEW_PRINTF_WORKS; then
-  NEW_PRINTF_WORKS_TRUE=
-  NEW_PRINTF_WORKS_FALSE='#'
-else
-  NEW_PRINTF_WORKS_TRUE='#'
-  NEW_PRINTF_WORKS_FALSE=
-fi
-
 
 # When building with API linking, clang is always linked statically, so user might want to link llvm static to libpocl too
 # or risk causing version mismatches. Also useful when other platform libraries use LLVM too, see issue #46.
@@ -16909,7 +16858,7 @@ fi
 
 
 
-if `$LLVM_CONFIG --build-mode | grep -q Asserts`;
+if `$LLVM_CONFIG --assertion-mode | grep -q ON`;
 then
 
 $as_echo "#define LLVM_BUILT_WITH_ASSERTS /**/" >>confdefs.h
@@ -16927,16 +16876,6 @@ You should rebuild LLVM with 'make REQUIRES_RTTI=1'.
 See the INSTALL file for more information." >&2;}
 fi
 
-# Check whether --enable-region-allocator was given.
-if test "${enable_region_allocator+set}" = set; then :
-  enableval=$enable_region_allocator;
-
-$as_echo "#define CUSTOM_BUFFER_ALLOCATOR /**/" >>confdefs.h
-
-
-fi
-
-
 ####################################################################
 # Manage optional testsuites
 
@@ -16955,11 +16894,13 @@ enable_testsuite_rodinia=no
 enable_testsuite_parboil=no
 enable_testsuite_amd=no
 enable_testsuite_amdsdk2_9=no
+enable_testsuite_amdsdk3_0=no
 enable_testsuite_vexcl=no
 enable_testsuite_piglit=no
 enable_testsuite_halide=no
 enable_testsuite_opencv=no
 enable_testsuite_hsa=no
+enable_testsuite_intel_svm=no
 if test x"$enable_testsuites" = xcheck ; then
   if test -d "$srcdir/examples/opencl-book-samples/checkout" ; then
     enable_testsuite_opencl_book_samples=yes
@@ -16981,6 +16922,9 @@ if test x"$enable_testsuites" = xcheck ; then
   if test -f "$srcdir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64.tgz" ; then
      enable_testsuite_amdsdk2_9=yes
   fi
+  if test -f "$srcdir/examples/AMDSDK3.0/AMD-APP-SDKInstaller-v3.0.130.135-GA-linux64.tar.bz2" ; then
+     enable_testsuite_amdsdk3_0=yes
+  fi
   if test -f "$srcdir/examples/VexCL/vexcl/README.md" ; then
     enable_testsuite_vexcl=yes
   fi
@@ -16996,10 +16940,13 @@ if test x"$enable_testsuites" = xcheck ; then
   if test -f "$srcdir/examples/CloverLeaf/CloverLeaf_OpenCL/Makefile" ; then
     enable_testsuite_cloverleaf=yes
   fi
+  if test -f "$srcdir/examples/IntelSVM/intel_ocl_svm_basic_win.zip"; then
+    enable_testsuite_intel_svm=yes
+  fi
 fi
 case ,"$enable_testsuites", in #(
   *,all,*|*,yes,*) :
-    enable_testsuites="opencl-book-samples,ViennaCL,Rodinia,Parboil,amd,amdsdk2_9,VexCL,piglit,Halide,OpenCV,CloverLeaf" ;; #(
+    enable_testsuites="opencl-book-samples,ViennaCL,Rodinia,Parboil,amd,amdsdk2_9,amdsdk3_0,VexCL,piglit,Halide,OpenCV,CloverLeaf" ;; #(
   *,no,*) :
     enable_testsuites=""
  ;; #(
@@ -17043,6 +16990,7 @@ case ,"$enable_testsuites", in #(
 
     enable_testsuite_amd=yes
     enable_testsuite_amdsdk2_9=yes
+    enable_testsuite_amdsdk3_0=yes
    ;; #(
   *) :
      ;;
@@ -17064,6 +17012,14 @@ case ,"$enable_testsuites", in #(
      ;;
 esac
 case ,"$enable_testsuites", in #(
+  *,intelsvm,*|*,IntelSVM,*) :
+
+    enable_testsuite_intel_svm=yes
+   ;; #(
+  *) :
+     ;;
+esac
+case ,"$enable_testsuites", in #(
   *,halide,*|*,Halide,*) :
 
     enable_testsuite_halide=yes
@@ -17257,6 +17213,14 @@ $as_echo "yes" >&6; }
 fi
 fi
 
+if test "$enable_testsuite_intel_svm" = "yes" ; then
+  if ! test -f "$srcdir/examples/IntelSVM/intel_ocl_svm_basic_win.zip"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Disabling Intel SVM tests, could not find intel_ocl_svm_basic_win.zip" >&5
+$as_echo "$as_me: WARNING: Disabling Intel SVM tests, could not find intel_ocl_svm_basic_win.zip" >&2;}
+    enable_testsuite_intel_svm=no
+  fi
+fi
+
 if test "$enable_testsuite_amdsdk2_9" = "yes" ; then
    # Check for software and sources for AMD APP SDK
   if ! test -f "$srcdir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64.tgz"; then
@@ -17343,6 +17307,93 @@ $as_echo "yes" >&6; }
 fi
 fi
 
+if test "$enable_testsuite_amdsdk3_0" = "yes" ; then
+   # Check for software and sources for AMD APP SDK
+  if ! test -f "$srcdir/examples/AMDSDK3.0/AMD-APP-SDKInstaller-v3.0.130.135-GA-linux64.tar.bz2"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: disabling AMD APP SDK testsuite as the 3.0 installer package not in '$srcdir/examples/AMDSDK3.0'." >&5
+$as_echo "$as_me: WARNING: disabling AMD APP SDK testsuite as the 3.0 installer package not in '$srcdir/examples/AMDSDK3.0'." >&2;}
+    enable_testsuite_amdsdk3_0=no
+  fi
+
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SDL" >&5
+$as_echo_n "checking for SDL... " >&6; }
+
+if test -n "$SDL_CFLAGS"; then
+    pkg_cv_SDL_CFLAGS="$SDL_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"sdl >= 1.2\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "sdl >= 1.2") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_SDL_CFLAGS=`$PKG_CONFIG --cflags "sdl >= 1.2" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+if test -n "$SDL_LIBS"; then
+    pkg_cv_SDL_LIBS="$SDL_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"sdl >= 1.2\""; } >&5
+  ($PKG_CONFIG --exists --print-errors "sdl >= 1.2") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_SDL_LIBS=`$PKG_CONFIG --libs "sdl >= 1.2" 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+   	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi
+        if test $_pkg_short_errors_supported = yes; then
+	        SDL_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "sdl >= 1.2" 2>&1`
+        else
+	        SDL_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "sdl >= 1.2" 2>&1`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$SDL_PKG_ERRORS" >&5
+
+	 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: pkg-config could not find libSDL, AMD APP SDK 3.0 testsuite is not used " >&5
+$as_echo "$as_me: WARNING: pkg-config could not find libSDL, AMD APP SDK 3.0 testsuite is not used " >&2;}
+      enable_testsuite_amdsdk3_0=no
+
+elif test $pkg_failed = untried; then
+     	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+	 { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: pkg-config could not find libSDL, AMD APP SDK 3.0 testsuite is not used " >&5
+$as_echo "$as_me: WARNING: pkg-config could not find libSDL, AMD APP SDK 3.0 testsuite is not used " >&2;}
+      enable_testsuite_amdsdk3_0=no
+
+else
+	SDL_CFLAGS=$pkg_cv_SDL_CFLAGS
+	SDL_LIBS=$pkg_cv_SDL_LIBS
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+	:
+fi
+fi
+
+
 if test "$enable_testsuite_vexcl" = "yes" ; then
   # Check for software and sources for VexCL
   if ! test -f "$srcdir/examples/VexCL/vexcl/README.md"; then
@@ -17959,6 +18010,14 @@ else
   TEST_SUITE_AMDSDK2_9_FALSE=
 fi
 
+ if test "$enable_testsuite_amdsdk3_0" = "yes"; then
+  TEST_SUITE_AMDSDK3_0_TRUE=
+  TEST_SUITE_AMDSDK3_0_FALSE='#'
+else
+  TEST_SUITE_AMDSDK3_0_TRUE='#'
+  TEST_SUITE_AMDSDK3_0_FALSE=
+fi
+
  if test "$enable_testsuite_vexcl" = "yes"; then
   TEST_SUITE_VEXCL_TRUE=
   TEST_SUITE_VEXCL_FALSE='#'
@@ -17999,6 +18058,14 @@ else
   TEST_SUITE_CLOVERLEAF_FALSE=
 fi
 
+ if test "$enable_testsuite_intel_svm" = "yes"; then
+  TEST_SUITE_INTELSVM_TRUE=
+  TEST_SUITE_INTELSVM_FALSE='#'
+else
+  TEST_SUITE_INTELSVM_TRUE='#'
+  TEST_SUITE_INTELSVM_FALSE=
+fi
+
 
 # Some information for the user
 { $as_echo "$as_me:${as_lineno-$LINENO}: internal tests are enabled" >&5
@@ -18052,6 +18119,14 @@ else
   { $as_echo "$as_me:${as_lineno-$LINENO}: tests from AMD APP SDK 2.9 are disabled" >&5
 $as_echo "$as_me: tests from AMD APP SDK 2.9 are disabled" >&6;}
 fi
+if test "$enable_testsuite_amdsdk3_0" = "yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: tests from AMD APP SDK 3.0 are enabled" >&5
+$as_echo "$as_me: tests from AMD APP SDK 3.0 are enabled" >&6;}
+  POAT_TESTSUITES="$POAT_TESTSUITES amdsdk3_0"
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: tests from AMD APP SDK 3.0 are disabled" >&5
+$as_echo "$as_me: tests from AMD APP SDK 3.0 are disabled" >&6;}
+fi
 if test "$enable_testsuite_vexcl" = "yes"; then
   { $as_echo "$as_me:${as_lineno-$LINENO}: tests from VexCL are enabled" >&5
 $as_echo "$as_me: tests from VexCL are enabled" >&6;}
@@ -18092,7 +18167,14 @@ else
   { $as_echo "$as_me:${as_lineno-$LINENO}: tests from CloverLeaf are disabled" >&5
 $as_echo "$as_me: tests from CloverLeaf are disabled" >&6;}
 fi
-
+if test "$enable_testsuite_intel_svm" = "yes" ; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: tests from Intel SVM samples are enabled" >&5
+$as_echo "$as_me: tests from Intel SVM samples are enabled" >&6;}
+  POAT_TESTSUITES="$POAT_TESTSUITES IntelSVM"
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: tests from Intel SVM samples are disabled" >&5
+$as_echo "$as_me: tests from Intel SVM samples are disabled" >&6;}
+fi
 
 ####################################################################
 # Pthread Library
@@ -18949,7 +19031,7 @@ if test "x$debug" = "xno"
 then
 	CXXFLAGS="$CXXFLAGS"
 else
-	CFLAGS="$CXXFLAGS -O0 -g"
+	CFLAGS="$CFLAGS -O0 -g"
 	CXXFLAGS="$CXXFLAGS -O0 -g"
 
 $as_echo "#define POCL_DEBUG_BUILD /**/" >>confdefs.h
@@ -19005,6 +19087,27 @@ fi
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
+
+
+
+  for ac_func in $ac_func_list
+do :
+  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+done
+
+
+
+
+
+
+
 # Enable debug message output when the env POCL_DEBUG=1 is set.
 
 $as_echo "#define POCL_DEBUG_MESSAGES 1" >>confdefs.h
@@ -19031,11 +19134,11 @@ fi
 if test "$cache_kernel" = "yes"
 then
 
-$as_echo "#define POCL_BUILD_KERNEL_CACHE 1" >>confdefs.h
+$as_echo "#define POCL_KERNEL_CACHE_DEFAULT 1" >>confdefs.h
 
 else
 
-$as_echo "#define POCL_BUILD_KERNEL_CACHE 0" >>confdefs.h
+$as_echo "#define POCL_KERNEL_CACHE_DEFAULT 0" >>confdefs.h
 
 fi
 
@@ -19395,7 +19498,7 @@ fi
 # Define the OpenCL version when compiling OpenCL C code. This also
 # serves as indicator for the OpenCL C language, similar to
 # __cplusplus for C++.
-CLFLAGS="$CLFLAGS -D__OPENCL_VERSION__=120"
+CLFLAGS="$CLFLAGS -Xclang -cl-std=CL2.0 -D__OPENCL_C_VERSION__=200"
 
  if test ! -z $CLANGXX; then
   CLANGXX_AVAILABLE_TRUE=
@@ -19739,7 +19842,7 @@ export LLC
 
 
 cat >>confdefs.h <<_ACEOF
-#define LLC "$LLC"
+#define LLVM_LLC "$LLC"
 _ACEOF
 
 
@@ -19948,22 +20051,6 @@ then
   HOST_LLC_FLAGS="$HOST_LLC_FLAGS -mcpu=$llc_host_cpu"
 fi
 
-# Work-around a clang bug in LLVM 3.3: On 32-bit platforms, the size
-# of Open CL C long is not 8 bytes
-if echo "constant int test[sizeof(long)==8?1:-1]={1};" | $CLANG $HOST_CLANG_FLAGS -S -x cl -
-then
-    : # OpenCL C long is supported
-else
-    # AC_DEFINE needed for host code, HOST_CLANG_FLGAS for kernel code.
-
-cat >>confdefs.h <<_ACEOF
-#define _CL_DISABLE_LONG /**/
-_ACEOF
-
-    HOST_CLANG_FLAGS="$HOST_CLANG_FLAGS -D_CL_DISABLE_LONG"
-fi
-rm -f ./-.s
-
 
 cat >>confdefs.h <<_ACEOF
 #define KERNEL_DIR "$kernel_dir"
@@ -20006,6 +20093,14 @@ cat >>confdefs.h <<_ACEOF
 #define HOST_CPU "$host_cpu"
 _ACEOF
 
+ if test "$host_cpu" = "x86_64"; then
+  HOST_CPU_IS_X86_64_TRUE=
+  HOST_CPU_IS_X86_64_FALSE='#'
+else
+  HOST_CPU_IS_X86_64_TRUE='#'
+  HOST_CPU_IS_X86_64_FALSE=
+fi
+
 OCL_KERNEL_TARGET=$llc_triple
 
 
@@ -20186,8 +20281,8 @@ ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
 old_CXXFLAGS=$CXXFLAGS
 CXXFLAGS=`$TCE_CONFIG --libs --cxxflags --includes`
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for TCE" >&5
-$as_echo_n "checking for TCE... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking compiling against a TCE header" >&5
+$as_echo_n "checking compiling against a TCE header... " >&6; }
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
  #include <Application.hh>
@@ -20210,16 +20305,29 @@ $as_echo "no" >&6; }
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking running a TCE binary" >&5
+$as_echo_n "checking running a TCE binary... " >&6; }
+if ttasim --help > /dev/null;
+then
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+TCE_EXEC=ok
+else
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
 CXXFLAGS=$old_CXXFLAGS
 
 fi
 
 if test -z "$TCECC" || \
    test -z "$TCE_CONFIG" || \
-   test -z "$TCE_LIBRARY"; then
+   test -z "$TCE_LIBRARY" || \
+   test -z "$TCE_EXEC"; then
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: TCE installation not found. The ttasim device will not be built." >&5
-$as_echo "$as_me: TCE installation not found. The ttasim device will not be built." >&6;}
+{ $as_echo "$as_me:${as_lineno-$LINENO}: TCE installation not found or doesn't work. The ttasim device will not be built." >&5
+$as_echo "$as_me: TCE installation not found or doesn't work. The ttasim device will not be built." >&6;}
 
 else
 
@@ -20255,6 +20363,28 @@ $as_echo "#define TCE_AVAILABLE 1" >>confdefs.h
  TCE_AVAILABLE=1
 fi
 
+# List of extensions supported by the TCE device
+TCE_DEVICE_EXTENSIONS="cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16"
+TCE_DEVICE_EXTENSION_DEFINES=`echo $TCE_DEVICE_EXTENSIONS | sed s/cl_khr/-Dcl_khr/g`
+
+cat >>confdefs.h <<_ACEOF
+#define TCE_DEVICE_EXTENSIONS "$TCE_DEVICE_EXTENSIONS"
+_ACEOF
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define TCE_DEVICE_EXTENSION_DEFINES "$TCE_DEVICE_EXTENSION_DEFINES"
+_ACEOF
+
+
+$as_echo "#define TCE_DEVICE_CL_VERSION_MAJOR 2" >>confdefs.h
+
+
+$as_echo "#define TCE_DEVICE_CL_VERSION_MINOR 0" >>confdefs.h
+
+
+
 OCL_TARGETS="$OCL_TARGETS tce"
 fi
 
@@ -20320,113 +20450,6 @@ else
 fi
 
 
-# Check if CellSPU support is found
-# The libspe version requirement is not strict. This is the only one tested.
-# SPU backend was removed in LLVM 3.3 (and we don't accept LLVM 3.1 anymore in pocl)
-# so compile the spu backend only when LLVM 3.2 is found
-if test "$LLVM_VERSION" == 3.2 -o "$LLVM_VERSION" == 3.2svn
-then
-
-pkg_failed=no
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for LIBSPE" >&5
-$as_echo_n "checking for LIBSPE... " >&6; }
-
-if test -n "$LIBSPE_CFLAGS"; then
-    pkg_cv_LIBSPE_CFLAGS="$LIBSPE_CFLAGS"
- elif test -n "$PKG_CONFIG"; then
-    if test -n "$PKG_CONFIG" && \
-    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libspe2 >= 2.2.80\""; } >&5
-  ($PKG_CONFIG --exists --print-errors "libspe2 >= 2.2.80") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then
-  pkg_cv_LIBSPE_CFLAGS=`$PKG_CONFIG --cflags "libspe2 >= 2.2.80" 2>/dev/null`
-		      test "x$?" != "x0" && pkg_failed=yes
-else
-  pkg_failed=yes
-fi
- else
-    pkg_failed=untried
-fi
-if test -n "$LIBSPE_LIBS"; then
-    pkg_cv_LIBSPE_LIBS="$LIBSPE_LIBS"
- elif test -n "$PKG_CONFIG"; then
-    if test -n "$PKG_CONFIG" && \
-    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libspe2 >= 2.2.80\""; } >&5
-  ($PKG_CONFIG --exists --print-errors "libspe2 >= 2.2.80") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then
-  pkg_cv_LIBSPE_LIBS=`$PKG_CONFIG --libs "libspe2 >= 2.2.80" 2>/dev/null`
-		      test "x$?" != "x0" && pkg_failed=yes
-else
-  pkg_failed=yes
-fi
- else
-    pkg_failed=untried
-fi
-
-
-
-if test $pkg_failed = yes; then
-   	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
-        _pkg_short_errors_supported=yes
-else
-        _pkg_short_errors_supported=no
-fi
-        if test $_pkg_short_errors_supported = yes; then
-	        LIBSPE_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libspe2 >= 2.2.80" 2>&1`
-        else
-	        LIBSPE_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libspe2 >= 2.2.80" 2>&1`
-        fi
-	# Put the nasty error message in config.log where it belongs
-	echo "$LIBSPE_PKG_ERRORS" >&5
-
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: libspe2 not found (by pkg-config). Spu driver not built." >&5
-$as_echo "$as_me: libspe2 not found (by pkg-config). Spu driver not built." >&6;}
-    LIBSPE_AVAILABLE=no
-    BUILD_SPU=0
-
-elif test $pkg_failed = untried; then
-     	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: libspe2 not found (by pkg-config). Spu driver not built." >&5
-$as_echo "$as_me: libspe2 not found (by pkg-config). Spu driver not built." >&6;}
-    LIBSPE_AVAILABLE=no
-    BUILD_SPU=0
-
-else
-	LIBSPE_CFLAGS=$pkg_cv_LIBSPE_CFLAGS
-	LIBSPE_LIBS=$pkg_cv_LIBSPE_LIBS
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: libspe2 found." >&5
-$as_echo "$as_me: libspe2 found." >&6;}
-    LIBSPE_AVAILABLE=yes
-    OCL_DRIVERS="$OCL_DRIVERS spu"
-    OCL_TARGETS="$OCL_TARGETS cellspu"
-
-$as_echo "#define BUILD_SPU /**/" >>confdefs.h
-
-    BUILD_SPU=1
-
-fi
-fi
- if echo $OCL_DRIVERS | grep spu; then
-  BUILD_SPU_TRUE=
-  BUILD_SPU_FALSE='#'
-else
-  BUILD_SPU_TRUE='#'
-  BUILD_SPU_FALSE=
-fi
-
-
 # Enable HSA by default in case the dependencies are met.
 # Check whether --enable-hsa-amdgcn was given.
 if test "${enable_hsa_amdgcn+set}" = set; then :
@@ -20607,6 +20630,33 @@ enable_hsa="no"
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AMD-specific HSA runtime headers" >&5
+$as_echo_n "checking for AMD-specific HSA runtime headers... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+ #include <hsa_ext_amd.h>
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define HAVE_HSA_EXT_AMD_H /**/" >>confdefs.h
+
+
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
 CPPFLAGS=$tempCPPFLAGS
 
 
@@ -20667,8 +20717,12 @@ $as_echo "$as_me: Could not find HSAILasm executable, disabling HSA support" >&6
   enable_hsa="no"
 fi
 
+HSA_DEVICE_EXTENSIONS=""
+HSA_DEVICE_EXTENSION_DEFINES=""
 if test "$enable_hsa" = "yes";
 then
+  HSA_DEVICE_EXTENSIONS="cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64 cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"
+  HSA_DEVICE_EXTENSION_DEFINES=$(echo "$HSA_DEVICE_EXTENSIONS" | sed 's/cl_khr/-Dcl_khr/g')
   OCL_DRIVERS="$OCL_DRIVERS hsa"
   OCL_TARGETS="$OCL_TARGETS hsail64"
 
@@ -20676,11 +20730,22 @@ $as_echo "#define BUILD_HSA 1" >>confdefs.h
 
   enable_testsuite_hsa=yes
   POAT_TESTSUITES="$POAT_TESTSUITES hsa"
-
   # -lHSAILUtil is missing from llvm-config --libfiles
   LLVM_LIBS="$LLVM_LIBS -lLLVMHSAILUtil"
+
+$as_echo "#define HSA_DEVICE_CL_VERSION_MAJOR 2" >>confdefs.h
+
+
+$as_echo "#define HSA_DEVICE_CL_VERSION_MINOR 0" >>confdefs.h
+
 fi
 
+cat >>confdefs.h <<_ACEOF
+#define HSA_DEVICE_EXTENSIONS "$HSA_DEVICE_EXTENSIONS"
+_ACEOF
+
+
+
 # TODO: move above
 
 # Enable the HSA test suite in case HSA support enabled.
@@ -21229,6 +21294,35 @@ _ACEOF
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
+HOST_DEVICE_EXTENSIONS="cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_spir"
+if test "$HAVE_WORKING_HALF" == "1"; then
+  HOST_DEVICE_EXTENSIONS="$HOST_DEVICE_EXTENSIONS cl_khr_fp16"
+fi
+if test -z "$CL_DISABLE_LONG"; then
+  HOST_DEVICE_EXTENSIONS="$HOST_DEVICE_EXTENSIONS cl_khr_int64 cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"
+fi
+HOST_DEVICE_EXTENSION_DEFINES=`echo $HOST_DEVICE_EXTENSIONS | sed s/cl_khr/-Dcl_khr/g`
+
+cat >>confdefs.h <<_ACEOF
+#define HOST_DEVICE_EXTENSIONS "$HOST_DEVICE_EXTENSIONS"
+_ACEOF
+
+
+#NB: OCL version is a property of the clPlatform, not the device. But the "host devices"
+#(basic and pthread) are probably always the ones catching the new OCL spec versions first.
+
+$as_echo "#define HOST_DEVICE_CL_VERSION_MAJOR 2" >>confdefs.h
+
+
+$as_echo "#define HOST_DEVICE_CL_VERSION_MINOR 0" >>confdefs.h
+
+
+cat >>confdefs.h <<_ACEOF
+#define POCL_CL_VERSION "2.0"
+_ACEOF
+
+
+
 # Seems to be unused
 #AC_MSG_CHECKING([whether __fp16 can be converted to float])
 #AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[__fp16 x; float y;]],
@@ -21495,9 +21589,11 @@ $as_echo "#define AC_APPLE_UNIVERSAL_BUILD 1" >>confdefs.h
 # 4:0:3 == 0.10 (currently backwards compatible with 0.7, thus age = 3).
 # 5:0:4 == 0.11 (currently backwards compatible with 0.7, thus age = 4).
 # 6:0:5 == 0.12 (currently backwards compatible with 0.7, thus age = 5).
-LIB_CURRENT_VERSION=6
+# 7:0:6 == 0.13 (currently backwards compatible with 0.7, thus age = 6).
+
+LIB_CURRENT_VERSION=7
 LIB_REVISION_VERSION=0
-LIB_AGE_VERSION=5
+LIB_AGE_VERSION=6
 LIB_FIRST_VERSION=$(($LIB_CURRENT_VERSION - $LIB_AGE_VERSION))
 BUILD_TIMESTAMP=""
 LIB_VERSION="$LIB_CURRENT_VERSION:$LIB_REVISION_VERSION:$LIB_AGE_VERSION"
@@ -21527,7 +21623,7 @@ KERNEL_COMPILER_LIB_VERSION="$LIB_CURRENT_VERSION:0:0"
 
 
 
-ac_config_files="$ac_config_files Makefile pocl.pc tests/atlocal ocl-vendors/pocl-tests.icd include/Makefile include/CL/Makefile include/OpenCL/Makefile lib/Makefile lib/CL/Makefile lib/CL/devices/Makefile lib/CL/devices/pthread/Makefile lib/CL/devices/basic/Makefile lib/CL/devices/tce/Makefile lib/CL/devices/tce/ttasim/Makefile lib/CL/devices/topology/Makefile lib/CL/devices/cellspu/Makefile lib/CL/devices/hsa/Makefile lib/llvmopencl/Makefile lib/kernel/Makefile lib/kernel/cellspu/Makef [...]
+ac_config_files="$ac_config_files Makefile pocl.pc tests/atlocal ocl-vendors/pocl-tests.icd include/Makefile include/CL/Makefile include/OpenCL/Makefile lib/Makefile lib/CL/Makefile lib/CL/devices/Makefile lib/CL/devices/pthread/Makefile lib/CL/devices/basic/Makefile lib/CL/devices/tce/Makefile lib/CL/devices/tce/ttasim/Makefile lib/CL/devices/topology/Makefile lib/CL/devices/hsa/Makefile lib/llvmopencl/Makefile lib/kernel/Makefile lib/kernel/host/Makefile lib/kernel/tce/Makefile lib/ker [...]
 
 
 cat >confcache <<\_ACEOF
@@ -21683,14 +21779,6 @@ if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then
   as_fn_error $? "conditional \"am__fastdepCXX\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${LLVM_3_6_TRUE}" && test -z "${LLVM_3_6_FALSE}"; then
-  as_fn_error $? "conditional \"LLVM_3_6\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
-if test -z "${NEW_PRINTF_WORKS_TRUE}" && test -z "${NEW_PRINTF_WORKS_FALSE}"; then
-  as_fn_error $? "conditional \"NEW_PRINTF_WORKS\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${TEST_SUITE_SAMPLES_TRUE}" && test -z "${TEST_SUITE_SAMPLES_FALSE}"; then
   as_fn_error $? "conditional \"TEST_SUITE_SAMPLES\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -21715,6 +21803,10 @@ if test -z "${TEST_SUITE_AMDSDK2_9_TRUE}" && test -z "${TEST_SUITE_AMDSDK2_9_FAL
   as_fn_error $? "conditional \"TEST_SUITE_AMDSDK2_9\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${TEST_SUITE_AMDSDK3_0_TRUE}" && test -z "${TEST_SUITE_AMDSDK3_0_FALSE}"; then
+  as_fn_error $? "conditional \"TEST_SUITE_AMDSDK3_0\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${TEST_SUITE_VEXCL_TRUE}" && test -z "${TEST_SUITE_VEXCL_FALSE}"; then
   as_fn_error $? "conditional \"TEST_SUITE_VEXCL\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -21735,6 +21827,10 @@ if test -z "${TEST_SUITE_CLOVERLEAF_TRUE}" && test -z "${TEST_SUITE_CLOVERLEAF_F
   as_fn_error $? "conditional \"TEST_SUITE_CLOVERLEAF\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${TEST_SUITE_INTELSVM_TRUE}" && test -z "${TEST_SUITE_INTELSVM_FALSE}"; then
+  as_fn_error $? "conditional \"TEST_SUITE_INTELSVM\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${BUILD_ICD_TRUE}" && test -z "${BUILD_ICD_FALSE}"; then
   as_fn_error $? "conditional \"BUILD_ICD\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -21763,6 +21859,10 @@ if test -z "${CLANGXX_AVAILABLE_TRUE}" && test -z "${CLANGXX_AVAILABLE_FALSE}";
   as_fn_error $? "conditional \"CLANGXX_AVAILABLE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${HOST_CPU_IS_X86_64_TRUE}" && test -z "${HOST_CPU_IS_X86_64_FALSE}"; then
+  as_fn_error $? "conditional \"HOST_CPU_IS_X86_64\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${USE_VECMATHLIB_TRUE}" && test -z "${USE_VECMATHLIB_FALSE}"; then
   as_fn_error $? "conditional \"USE_VECMATHLIB\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -21779,10 +21879,6 @@ if test -z "${POCL_ANDROID_TRUE}" && test -z "${POCL_ANDROID_FALSE}"; then
   as_fn_error $? "conditional \"POCL_ANDROID\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${BUILD_SPU_TRUE}" && test -z "${BUILD_SPU_FALSE}"; then
-  as_fn_error $? "conditional \"BUILD_SPU\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${TEST_SUITE_HSA_TRUE}" && test -z "${TEST_SUITE_HSA_FALSE}"; then
   as_fn_error $? "conditional \"TEST_SUITE_HSA\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -22193,7 +22289,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by pocl $as_me 0.12, which was
+This file was extended by pocl $as_me 0.13, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22259,7 +22355,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-pocl config.status 0.12
+pocl config.status 0.13
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
@@ -22789,11 +22885,9 @@ do
     "lib/CL/devices/tce/Makefile") CONFIG_FILES="$CONFIG_FILES lib/CL/devices/tce/Makefile" ;;
     "lib/CL/devices/tce/ttasim/Makefile") CONFIG_FILES="$CONFIG_FILES lib/CL/devices/tce/ttasim/Makefile" ;;
     "lib/CL/devices/topology/Makefile") CONFIG_FILES="$CONFIG_FILES lib/CL/devices/topology/Makefile" ;;
-    "lib/CL/devices/cellspu/Makefile") CONFIG_FILES="$CONFIG_FILES lib/CL/devices/cellspu/Makefile" ;;
     "lib/CL/devices/hsa/Makefile") CONFIG_FILES="$CONFIG_FILES lib/CL/devices/hsa/Makefile" ;;
     "lib/llvmopencl/Makefile") CONFIG_FILES="$CONFIG_FILES lib/llvmopencl/Makefile" ;;
     "lib/kernel/Makefile") CONFIG_FILES="$CONFIG_FILES lib/kernel/Makefile" ;;
-    "lib/kernel/cellspu/Makefile") CONFIG_FILES="$CONFIG_FILES lib/kernel/cellspu/Makefile" ;;
     "lib/kernel/host/Makefile") CONFIG_FILES="$CONFIG_FILES lib/kernel/host/Makefile" ;;
     "lib/kernel/tce/Makefile") CONFIG_FILES="$CONFIG_FILES lib/kernel/tce/Makefile" ;;
     "lib/kernel/hsail64/Makefile") CONFIG_FILES="$CONFIG_FILES lib/kernel/hsail64/Makefile" ;;
@@ -22811,8 +22905,10 @@ do
     "examples/Rodinia/Makefile") CONFIG_FILES="$CONFIG_FILES examples/Rodinia/Makefile" ;;
     "examples/Parboil/Makefile") CONFIG_FILES="$CONFIG_FILES examples/Parboil/Makefile" ;;
     "examples/ViennaCL/Makefile") CONFIG_FILES="$CONFIG_FILES examples/ViennaCL/Makefile" ;;
+    "examples/IntelSVM/Makefile") CONFIG_FILES="$CONFIG_FILES examples/IntelSVM/Makefile" ;;
     "examples/AMD/Makefile") CONFIG_FILES="$CONFIG_FILES examples/AMD/Makefile" ;;
     "examples/AMDSDK2.9/Makefile") CONFIG_FILES="$CONFIG_FILES examples/AMDSDK2.9/Makefile" ;;
+    "examples/AMDSDK3.0/Makefile") CONFIG_FILES="$CONFIG_FILES examples/AMDSDK3.0/Makefile" ;;
     "examples/EinsteinToolkit/Makefile") CONFIG_FILES="$CONFIG_FILES examples/EinsteinToolkit/Makefile" ;;
     "examples/VexCL/Makefile") CONFIG_FILES="$CONFIG_FILES examples/VexCL/Makefile" ;;
     "examples/piglit/Makefile") CONFIG_FILES="$CONFIG_FILES examples/piglit/Makefile" ;;
@@ -22829,8 +22925,6 @@ do
     "tests/tce/ttasim/Makefile") CONFIG_FILES="$CONFIG_FILES tests/tce/ttasim/Makefile" ;;
     "tests/tce/tcemc/Makefile") CONFIG_FILES="$CONFIG_FILES tests/tce/tcemc/Makefile" ;;
     "tests/tce/fp16/Makefile") CONFIG_FILES="$CONFIG_FILES tests/tce/fp16/Makefile" ;;
-    "tests/cell/Makefile") CONFIG_FILES="$CONFIG_FILES tests/cell/Makefile" ;;
-    "tests/cell/hello/Makefile") CONFIG_FILES="$CONFIG_FILES tests/cell/hello/Makefile" ;;
 
   *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
   esac
diff --git a/configure.ac b/configure.ac
index 06f1133..13ffe7d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,7 +23,7 @@
 # THE SOFTWARE.
 
 AC_PREREQ([2.64])
-AC_INIT([pocl], [0.12], [pocl-devel at lists.sourceforge.net])
+AC_INIT([pocl], [0.13], [pocl-devel at lists.sourceforge.net])
 AC_CONFIG_SRCDIR([config.h.in])
 AC_CONFIG_HEADER([config.h])
 AC_CONFIG_AUX_DIR([config])
@@ -79,58 +79,36 @@ LD_FLAGS_BIN=""
 # LLVM configuration
 #
 AC_ARG_VAR([LLVM_CONFIG], [Program used to retrieve LLVM options and binaries])
-AC_PATH_PROGS([LLVM_CONFIG], [llvm-config llvm-config-mp-3.3 llvm-config-3.3 llvm-config33 llvm-config-mp-3.4 llvm-config-3.4 llvm-config34 llvm-config-mp-3.2 llvm-config-3.2 llvm-config32 ])
+AC_PATH_PROGS([LLVM_CONFIG], [llvm-config llvm-config-mp-3.7 llvm-config-3.7 llvm-config37 llvm-config-mp-3.6 llvm-config-3.6 llvm-config36])
 test -z "$LLVM_CONFIG" && AC_MSG_FAILURE([no llvm-config found in \$PATH])
 
 LLVM_VERSION=`$LLVM_CONFIG --version`
 LLVM_BINDIR=`$LLVM_CONFIG --bindir`
 LLVM_LIBDIR=`$LLVM_CONFIG --libdir`
 LLVM_LDFLAGS=`$LLVM_CONFIG --ldflags`
-# Whether we can use our own printf implementation
-NEW_PRINTF_WORKS=true
-LLVM_3_6=false
 
 AC_SUBST([LLVM_VERSION], [$LLVM_VERSION])
 AC_DEFINE_UNQUOTED([LLVM_VERSION], ["$LLVM_VERSION"], "LLVM version as a string.")
 
 case "$LLVM_VERSION" in
-     3.2*)
-     AC_DEFINE([LLVM_3_2], [], "Using LLVM 3.2")
-     NEW_PRINTF_WORKS=false
-   ;;
-     3.3*)
-     AC_DEFINE([LLVM_3_3], [], "Using LLVM 3.3")
-     NEW_PRINTF_WORKS=false
-   ;;
-     3.4*)
-     AC_DEFINE([LLVM_3_4], [], "Using LLVM 3.4")
-   ;;
-     3.5*)
-     AC_DEFINE([LLVM_3_5], [], "Using LLVM 3.5")
-     LLVM_LDFLAGS="$LLVM_LDFLAGS `$LLVM_CONFIG --system-libs`"
-   ;;
-     3.6*)
-     AC_DEFINE([LLVM_3_6], [], "Using LLVM 3.6")
-     LLVM_3_6=true
+     3.7*)
+     AC_DEFINE([LLVM_3_7], [], "Using LLVM 3.7")
      LLVM_LDFLAGS="$LLVM_LDFLAGS `$LLVM_CONFIG --system-libs`"
    ;;
-     3.7*)
-     AC_DEFINE([LLVM_3_7], [], "Using LLVM svn, upcoming 3.7")
+     3.8*)
+     AC_DEFINE([LLVM_3_8], [], "Using LLVM 3.8")
      LLVM_LDFLAGS="$LLVM_LDFLAGS `$LLVM_CONFIG --system-libs`"
    ;;
      *)
    AC_MSG_ERROR(
    [
-Unsupported LLVM version. Please use LLVM version 3.2, 3.3, 3.4, 3.5, 3.6, 3.7.
+Unsupported LLVM version. Please use LLVM version 3.7 or 3.8.
    ])
    LLVM_VERSION=
    ;;
 esac
 
-AM_CONDITIONAL([LLVM_3_6], [test "x$LLVM_3_6" = "xtrue"])
-
 AC_SUBST([LLVM_LDFLAGS], [$LLVM_LDFLAGS], [llvm-config returned ldflags])
-AM_CONDITIONAL([NEW_PRINTF_WORKS], $NEW_PRINTF_WORKS)
 
 # When building with API linking, clang is always linked statically, so user might want to link llvm static to libpocl too
 # or risk causing version mismatches. Also useful when other platform libraries use LLVM too, see issue #46.
@@ -167,7 +145,7 @@ fi
 
 AC_SUBST([LLVM_LIBS])
 
-if `$LLVM_CONFIG --build-mode | grep -q Asserts`;
+if `$LLVM_CONFIG --assertion-mode | grep -q ON`;
 then
 AC_DEFINE([LLVM_BUILT_WITH_ASSERTS], [], "LLVM was built with Assertions on.")
 fi
@@ -180,21 +158,13 @@ You should rebuild LLVM with 'make REQUIRES_RTTI=1'.
 See the INSTALL file for more information.])
 fi
 
-AC_ARG_ENABLE([region-allocator],
-[AS_HELP_STRING([--enable-region-allocator], 
- [Use a custom OpenCL optimized region-based memory allocator for the CPU devices instead of allocating buffers directly with malloc (experimental with known issues!).])],
-[
-AC_DEFINE([CUSTOM_BUFFER_ALLOCATOR], [], "Use a custom buffer allocator")
-],
-[])
-
 ####################################################################
 # Manage optional testsuites
 
 # Option to choose external testsuites
 AC_ARG_ENABLE([testsuites],
 	      [AS_HELP_STRING([--enable-testsuites=suite1,suite2,...],
-          [choose enabled external project testsuites (all,opencl-book-samples,ViennaCL,Rodinia,Parboil,amd,amdsdk2_9,VexCL,Piglit,Halide,OpenCV,CloverLeaf,hsa])],
+          [choose enabled external project testsuites (all,opencl-book-samples,ViennaCL,Rodinia,Parboil,amd,amdsdk2_9,amdsdk3_0,VexCL,Piglit,Halide,OpenCV,CloverLeaf,hsa,IntelSVM])],
 	      [],[enable_testsuites=check])
 
 enable_testsuite_opencl_book_samples=no
@@ -203,11 +173,13 @@ enable_testsuite_rodinia=no
 enable_testsuite_parboil=no
 enable_testsuite_amd=no
 enable_testsuite_amdsdk2_9=no
+enable_testsuite_amdsdk3_0=no
 enable_testsuite_vexcl=no
 enable_testsuite_piglit=no
 enable_testsuite_halide=no
 enable_testsuite_opencv=no
 enable_testsuite_hsa=no
+enable_testsuite_intel_svm=no
 if test x"$enable_testsuites" = xcheck ; then
   if test -d "$srcdir/examples/opencl-book-samples/checkout" ; then
     enable_testsuite_opencl_book_samples=yes
@@ -229,6 +201,9 @@ if test x"$enable_testsuites" = xcheck ; then
   if test -f "$srcdir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64.tgz" ; then
      enable_testsuite_amdsdk2_9=yes
   fi
+  if test -f "$srcdir/examples/AMDSDK3.0/AMD-APP-SDKInstaller-v3.0.130.135-GA-linux64.tar.bz2" ; then
+     enable_testsuite_amdsdk3_0=yes
+  fi
   if test -f "$srcdir/examples/VexCL/vexcl/README.md" ; then
     enable_testsuite_vexcl=yes
   fi
@@ -244,9 +219,12 @@ if test x"$enable_testsuites" = xcheck ; then
   if test -f "$srcdir/examples/CloverLeaf/CloverLeaf_OpenCL/Makefile" ; then
     enable_testsuite_cloverleaf=yes
   fi  
+  if test -f "$srcdir/examples/IntelSVM/intel_ocl_svm_basic_win.zip"; then
+    enable_testsuite_intel_svm=yes
+  fi
 fi
 AS_CASE([,"$enable_testsuites",],
-  [*,all,*|*,yes,*], [enable_testsuites="opencl-book-samples,ViennaCL,Rodinia,Parboil,amd,amdsdk2_9,VexCL,piglit,Halide,OpenCV,CloverLeaf"],
+  [*,all,*|*,yes,*], [enable_testsuites="opencl-book-samples,ViennaCL,Rodinia,Parboil,amd,amdsdk2_9,amdsdk3_0,VexCL,piglit,Halide,OpenCV,CloverLeaf"],
   [*,no,*], [enable_testsuites=""]
 )
 AS_CASE([,"$enable_testsuites",],
@@ -269,6 +247,7 @@ AS_CASE([,"$enable_testsuites",],
   [*,AMD,*|*,amd,*], [
     enable_testsuite_amd=yes
     enable_testsuite_amdsdk2_9=yes
+    enable_testsuite_amdsdk3_0=yes
   ])
 AS_CASE([,"$enable_testsuites",],
   [*,VexCL,*|*,vexcl,*], [
@@ -279,6 +258,10 @@ AS_CASE([,"$enable_testsuites",],
     enable_testsuite_piglit=yes
   ])
 AS_CASE([,"$enable_testsuites",],
+  [*,intelsvm,*|*,IntelSVM,*], [
+    enable_testsuite_intel_svm=yes
+  ])
+AS_CASE([,"$enable_testsuites",],
   [*,halide,*|*,Halide,*], [
     enable_testsuite_halide=yes
   ])
@@ -334,6 +317,13 @@ if test "$enable_testsuite_amd" = "yes" ; then
     ])
 fi
 
+if test "$enable_testsuite_intel_svm" = "yes" ; then
+  if ! test -f "$srcdir/examples/IntelSVM/intel_ocl_svm_basic_win.zip"; then
+    AC_MSG_WARN([Disabling Intel SVM tests, could not find intel_ocl_svm_basic_win.zip])
+    enable_testsuite_intel_svm=no
+  fi
+fi
+
 if test "$enable_testsuite_amdsdk2_9" = "yes" ; then
    # Check for software and sources for AMD APP SDK
   if ! test -f "$srcdir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64.tgz"; then
@@ -346,6 +336,19 @@ if test "$enable_testsuite_amdsdk2_9" = "yes" ; then
     ])
 fi
 
+if test "$enable_testsuite_amdsdk3_0" = "yes" ; then
+   # Check for software and sources for AMD APP SDK
+  if ! test -f "$srcdir/examples/AMDSDK3.0/AMD-APP-SDKInstaller-v3.0.130.135-GA-linux64.tar.bz2"; then
+    AC_MSG_WARN([disabling AMD APP SDK testsuite as the 3.0 installer package not in '$srcdir/examples/AMDSDK3.0'.])
+    enable_testsuite_amdsdk3_0=no
+  fi
+  PKG_CHECK_MODULES([SDL], [sdl >= 1.2], [:],
+    [ AC_MSG_WARN([pkg-config could not find libSDL, AMD APP SDK 3.0 testsuite is not used ])
+      enable_testsuite_amdsdk3_0=no
+    ])
+fi
+
+
 if test "$enable_testsuite_vexcl" = "yes" ; then
   # Check for software and sources for VexCL
   if ! test -f "$srcdir/examples/VexCL/vexcl/README.md"; then
@@ -389,11 +392,13 @@ AM_CONDITIONAL([TEST_SUITE_RODINIA], [test "$enable_testsuite_rodinia" = "yes"])
 AM_CONDITIONAL([TEST_SUITE_PARBOIL], [test "$enable_testsuite_parboil" = "yes"])
 AM_CONDITIONAL([TEST_SUITE_AMD], [test "$enable_testsuite_amd" = "yes"])
 AM_CONDITIONAL([TEST_SUITE_AMDSDK2_9], [test "$enable_testsuite_amdsdk2_9" = "yes"])
+AM_CONDITIONAL([TEST_SUITE_AMDSDK3_0], [test "$enable_testsuite_amdsdk3_0" = "yes"])
 AM_CONDITIONAL([TEST_SUITE_VEXCL], [test "$enable_testsuite_vexcl" = "yes"])
 AM_CONDITIONAL([TEST_SUITE_PIGLIT], [test "$enable_testsuite_piglit" = "yes"])
 AM_CONDITIONAL([TEST_SUITE_HALIDE], [test "$enable_testsuite_halide" = "yes"])
 AM_CONDITIONAL([TEST_SUITE_OPENCV], [test "$enable_testsuite_opencv" = "yes"])
 AM_CONDITIONAL([TEST_SUITE_CLOVERLEAF], [test "$enable_testsuite_cloverleaf" = "yes"])
+AM_CONDITIONAL([TEST_SUITE_INTELSVM], [test "$enable_testsuite_intel_svm" = "yes"])
 
 # Some information for the user
 AC_MSG_NOTICE([internal tests are enabled])
@@ -434,6 +439,12 @@ if test "$enable_testsuite_amdsdk2_9" = "yes"; then
 else
   AC_MSG_NOTICE([tests from AMD APP SDK 2.9 are disabled])
 fi
+if test "$enable_testsuite_amdsdk3_0" = "yes"; then
+  AC_MSG_NOTICE([tests from AMD APP SDK 3.0 are enabled])
+  POAT_TESTSUITES="$POAT_TESTSUITES amdsdk3_0"
+else
+  AC_MSG_NOTICE([tests from AMD APP SDK 3.0 are disabled])
+fi
 if test "$enable_testsuite_vexcl" = "yes"; then
   AC_MSG_NOTICE([tests from VexCL are enabled])
   POAT_TESTSUITES="$POAT_TESTSUITES vexcl"
@@ -464,7 +475,12 @@ if test "$enable_testsuite_cloverleaf" = "yes"; then
 else
   AC_MSG_NOTICE([tests from CloverLeaf are disabled])
 fi
-
+if test "$enable_testsuite_intel_svm" = "yes" ; then
+  AC_MSG_NOTICE([tests from Intel SVM samples are enabled])
+  POAT_TESTSUITES="$POAT_TESTSUITES IntelSVM"
+else
+  AC_MSG_NOTICE([tests from Intel SVM samples are disabled])
+fi
 
 ####################################################################
 # Pthread Library
@@ -647,7 +663,7 @@ if test "x$debug" = "xno"
 then
 	CXXFLAGS="$CXXFLAGS"
 else
-	CFLAGS="$CXXFLAGS -O0 -g"
+	CFLAGS="$CFLAGS -O0 -g"
 	CXXFLAGS="$CXXFLAGS -O0 -g"
 	AC_DEFINE([POCL_DEBUG_BUILD], [], "Build pocl in debug mode")
 fi
@@ -666,6 +682,8 @@ AC_COMPILE_IFELSE(
    AM_CONDITIONAL([HAVE_CLOCK_GETTIME], false)]
 )
 
+AC_CHECK_FUNCS_ONCE([vfork fork])
+
 # Enable debug message output when the env POCL_DEBUG=1 is set.
 AC_DEFINE([POCL_DEBUG_MESSAGES], [1], [Printout debug messages in case POCL_DEBUG env is set])
 
@@ -685,9 +703,9 @@ AC_ARG_ENABLE([kernel-cache],
               [cache_kernel="no"])
 if test "$cache_kernel" = "yes"
 then
-  AC_DEFINE([POCL_BUILD_KERNEL_CACHE], [1], "Enabled kernel cache feature")
+  AC_DEFINE([POCL_KERNEL_CACHE_DEFAULT], [1], "Enabled kernel cache feature")
 else
-  AC_DEFINE([POCL_BUILD_KERNEL_CACHE], [0], "Disabled kernel cache feature")
+  AC_DEFINE([POCL_KERNEL_CACHE_DEFAULT], [0], "Disabled kernel cache feature")
 fi
 
 
@@ -830,7 +848,7 @@ AC_SUBST(LLVM_CXX_FLAGS)
 # Define the OpenCL version when compiling OpenCL C code. This also
 # serves as indicator for the OpenCL C language, similar to
 # __cplusplus for C++.
-CLFLAGS="$CLFLAGS -D__OPENCL_VERSION__=120"
+CLFLAGS="$CLFLAGS -Xclang -cl-std=CL2.0 -D__OPENCL_C_VERSION__=200"
 
 AM_CONDITIONAL([CLANGXX_AVAILABLE], test ! -z $CLANGXX)
 
@@ -872,7 +890,7 @@ LLVM_PROG([LLVM_LINK], [llvm-link], [LLVM IR linker])
 export CLANG
 export LLC
 
-AC_DEFINE_UNQUOTED([LLC], ["$LLC"], [LLVM compiler executable.])
+AC_DEFINE_UNQUOTED([LLVM_LLC], ["$LLC"], [LLVM compiler executable.])
 
 AC_MSG_CHECKING([Linker option to build a shared library])
 if echo $host | grep -q darwin; then
@@ -1063,18 +1081,6 @@ then
   HOST_LLC_FLAGS="$HOST_LLC_FLAGS -mcpu=$llc_host_cpu"
 fi
 
-# Work-around a clang bug in LLVM 3.3: On 32-bit platforms, the size
-# of Open CL C long is not 8 bytes 
-if [echo "constant int test[sizeof(long)==8?1:-1]={1};" | $CLANG $HOST_CLANG_FLAGS -S -x cl - ]
-then
-    : # OpenCL C long is supported
-else
-    # AC_DEFINE needed for host code, HOST_CLANG_FLGAS for kernel code.
-    AC_DEFINE_UNQUOTED([_CL_DISABLE_LONG],[],[Disable cl_khr_int64 on host based devices.])
-    HOST_CLANG_FLAGS="$HOST_CLANG_FLAGS -D_CL_DISABLE_LONG"
-fi
-rm -f ./-.s
-	
 AC_DEFINE_UNQUOTED([KERNEL_DIR], "$kernel_dir", [Use the libkernel from lib/kernel/$KERNEL_DIR/])
 
 AC_SUBST([TARGET], [$target])
@@ -1094,6 +1100,7 @@ AC_SUBST([HOST], [$host])
 AC_DEFINE_UNQUOTED([HOST], "$host", [The host triple.])
 AC_SUBST([HOST_CPU], [$host_cpu])
 AC_DEFINE_UNQUOTED([HOST_CPU], ["$host_cpu"], [The host CPU type.])
+AM_CONDITIONAL([HOST_CPU_IS_X86_64], [test "$host_cpu" = "x86_64"])
 AC_SUBST([OCL_KERNEL_TARGET],[$llc_triple])
 AC_DEFINE_UNQUOTED([OCL_KERNEL_TARGET], ["$llc_triple"], [The kernel target triplet.])
 AC_SUBST([OCL_KERNEL_TARGET_CPU],[$llc_host_cpu])
@@ -1155,7 +1162,7 @@ AC_LANG([C++])
 old_CXXFLAGS=$CXXFLAGS
 CXXFLAGS=`$TCE_CONFIG --libs --cxxflags --includes`
 
-AC_MSG_CHECKING([for TCE])
+AC_MSG_CHECKING([compiling against a TCE header])
 AC_COMPILE_IFELSE(
   [AC_LANG_PROGRAM(
     [[ #include <Application.hh> ]],
@@ -1165,15 +1172,25 @@ AC_COMPILE_IFELSE(
   [TCE_LIBRARY=ok]],
   [AC_MSG_RESULT(no)])
 
+AC_MSG_CHECKING([running a TCE binary])
+if ttasim --help > /dev/null;
+then
+AC_MSG_RESULT(yes)
+TCE_EXEC=ok
+else
+AC_MSG_RESULT(no)
+fi
+
 CXXFLAGS=$old_CXXFLAGS
 
 fi
 
 if test -z "$TCECC" || \
    test -z "$TCE_CONFIG" || \
-   test -z "$TCE_LIBRARY"; then
+   test -z "$TCE_LIBRARY" || \
+   test -z "$TCE_EXEC"; then
 
-AC_MSG_NOTICE([TCE installation not found. The ttasim device will not be built.])
+AC_MSG_NOTICE([TCE installation not found or doesn't work. The ttasim device will not be built.])
 
 else
 
@@ -1198,6 +1215,16 @@ then
  TCE_AVAILABLE=1
 fi
 
+# List of extensions supported by the TCE device
+TCE_DEVICE_EXTENSIONS="cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16"
+TCE_DEVICE_EXTENSION_DEFINES=`echo $TCE_DEVICE_EXTENSIONS | sed s/cl_khr/-Dcl_khr/g`
+AC_DEFINE_UNQUOTED([TCE_DEVICE_EXTENSIONS], ["$TCE_DEVICE_EXTENSIONS"], [TCE device supported extension list])
+AC_SUBST([TCE_DEVICE_EXTENSION_DEFINES])
+AC_DEFINE_UNQUOTED([TCE_DEVICE_EXTENSION_DEFINES], ["$TCE_DEVICE_EXTENSION_DEFINES"], [TCE device supported extension list])
+AC_DEFINE([TCE_DEVICE_CL_VERSION_MAJOR], [2], [OpenCL major version supported by TCE device])
+AC_DEFINE([TCE_DEVICE_CL_VERSION_MINOR], [0], [OpenCL minor version supported by TCE device])
+
+
 OCL_TARGETS="$OCL_TARGETS tce"
 fi
 
@@ -1225,29 +1252,6 @@ AC_DEFINE([POCL_ANDROID_PREFIX],
 fi
 AM_CONDITIONAL([POCL_ANDROID], [test "$ANDROID_PRESENT" = ok])
 
-# Check if CellSPU support is found
-# The libspe version requirement is not strict. This is the only one tested.
-# SPU backend was removed in LLVM 3.3 (and we don't accept LLVM 3.1 anymore in pocl)
-# so compile the spu backend only when LLVM 3.2 is found
-if test "$LLVM_VERSION" == 3.2 -o "$LLVM_VERSION" == 3.2svn
-then
-PKG_CHECK_MODULES([LIBSPE], [libspe2 >= 2.2.80],
-  [
-    AC_MSG_NOTICE([libspe2 found.])
-    LIBSPE_AVAILABLE=yes
-    OCL_DRIVERS="$OCL_DRIVERS spu"
-    OCL_TARGETS="$OCL_TARGETS cellspu"
-    AC_DEFINE([BUILD_SPU], [], [Defined when CellSPU tools are found])
-    BUILD_SPU=1
-  ],
-  [
-    AC_MSG_NOTICE([libspe2 not found (by pkg-config). Spu driver not built.])
-    LIBSPE_AVAILABLE=no
-    BUILD_SPU=0
-  ])
-fi
-AM_CONDITIONAL([BUILD_SPU],[echo $OCL_DRIVERS | grep spu])
-
 # Enable HSA by default in case the dependencies are met.
 AC_ARG_ENABLE([hsa-amdgcn],
               [AS_HELP_STRING([--disable-hsa-amdgcn],
@@ -1330,6 +1334,16 @@ AC_COMPILE_IFELSE(
 enable_hsa="no"
 ])
 
+AC_MSG_CHECKING([for AMD-specific HSA runtime headers])
+AC_COMPILE_IFELSE(
+  [AC_LANG_PROGRAM(
+    [[ #include <hsa_ext_amd.h> ]]
+  )],
+[AC_MSG_RESULT(yes)
+AC_DEFINE([HAVE_HSA_EXT_AMD_H], [], [Have AMD-specific HSA headers])
+],
+[AC_MSG_RESULT(no)])
+
 CPPFLAGS=$tempCPPFLAGS
 
 AC_ARG_VAR([HSAILASM], [Path to HSAILasm executable])
@@ -1341,17 +1355,24 @@ else
   enable_hsa="no"
 fi
 
+HSA_DEVICE_EXTENSIONS=""
+HSA_DEVICE_EXTENSION_DEFINES=""
 if test "$enable_hsa" = "yes";
 then
+  HSA_DEVICE_EXTENSIONS="cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64 cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"
+  HSA_DEVICE_EXTENSION_DEFINES=$(echo "$HSA_DEVICE_EXTENSIONS" | sed 's/cl_khr/-Dcl_khr/g')
   OCL_DRIVERS="$OCL_DRIVERS hsa"
   OCL_TARGETS="$OCL_TARGETS hsail64"
   AC_DEFINE([BUILD_HSA], [1], [Defined to 1 in case the HSA driver should be built.])
   enable_testsuite_hsa=yes
   POAT_TESTSUITES="$POAT_TESTSUITES hsa"
-
   # -lHSAILUtil is missing from llvm-config --libfiles
   LLVM_LIBS="$LLVM_LIBS -lLLVMHSAILUtil"
+  AC_DEFINE([HSA_DEVICE_CL_VERSION_MAJOR], [2], [OpenCL major version supported by HSA device])
+  AC_DEFINE([HSA_DEVICE_CL_VERSION_MINOR], [0], [OpenCL minor version supported by HSA device])
 fi
+AC_DEFINE_UNQUOTED([HSA_DEVICE_EXTENSIONS], ["$HSA_DEVICE_EXTENSIONS"], [HSA device supported extension list])
+AC_SUBST([HSA_DEVICE_EXTENSION_DEFINES])
 
 # TODO: move above
 
@@ -1476,6 +1497,25 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[__fp16 callfp16(__fp16 a) { return a * (__f
                   ]
                  )
 
+HOST_DEVICE_EXTENSIONS="cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_spir"
+if test "$HAVE_WORKING_HALF" == "1"; then
+  HOST_DEVICE_EXTENSIONS="$HOST_DEVICE_EXTENSIONS cl_khr_fp16"
+fi
+if test -z "$CL_DISABLE_LONG"; then
+  HOST_DEVICE_EXTENSIONS="$HOST_DEVICE_EXTENSIONS cl_khr_int64 cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"
+fi
+HOST_DEVICE_EXTENSION_DEFINES=`echo $HOST_DEVICE_EXTENSIONS | sed s/cl_khr/-Dcl_khr/g`
+AC_DEFINE_UNQUOTED([HOST_DEVICE_EXTENSIONS], ["$HOST_DEVICE_EXTENSIONS"],
+                   [OpenCL device extensions implemented by the host devices basic and pthreads])
+AC_SUBST([HOST_DEVICE_EXTENSION_DEFINES])
+#NB: OCL version is a property of the clPlatform, not the device. But the "host devices"
+#(basic and pthread) are probably always the ones catching the new OCL spec versions first.
+AC_DEFINE([HOST_DEVICE_CL_VERSION_MAJOR], [2], [OpenCL major version supported by host device])
+AC_DEFINE([HOST_DEVICE_CL_VERSION_MINOR], [0], [OpenCL minor version supported by host device])
+AC_DEFINE_UNQUOTED([POCL_CL_VERSION], ["2.0"],
+                   [OpenCL version string as reported by clGetPlatformInfo])
+
+
 # Seems to be unused
 #AC_MSG_CHECKING([whether __fp16 can be converted to float])
 #AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[__fp16 x; float y;]],
@@ -1518,9 +1558,11 @@ AC_C_BIGENDIAN(
 # 4:0:3 == 0.10 (currently backwards compatible with 0.7, thus age = 3).
 # 5:0:4 == 0.11 (currently backwards compatible with 0.7, thus age = 4).
 # 6:0:5 == 0.12 (currently backwards compatible with 0.7, thus age = 5).
-LIB_CURRENT_VERSION=6
+# 7:0:6 == 0.13 (currently backwards compatible with 0.7, thus age = 6).
+
+LIB_CURRENT_VERSION=7
 LIB_REVISION_VERSION=0
-LIB_AGE_VERSION=5
+LIB_AGE_VERSION=6
 LIB_FIRST_VERSION=$(($LIB_CURRENT_VERSION - $LIB_AGE_VERSION))
 BUILD_TIMESTAMP=""
 AC_SUBST([LIB_VERSION], ["$LIB_CURRENT_VERSION:$LIB_REVISION_VERSION:$LIB_AGE_VERSION"])
@@ -1554,11 +1596,9 @@ AC_CONFIG_FILES([Makefile
                  lib/CL/devices/tce/Makefile
                  lib/CL/devices/tce/ttasim/Makefile
                  lib/CL/devices/topology/Makefile
-                 lib/CL/devices/cellspu/Makefile
                  lib/CL/devices/hsa/Makefile
                  lib/llvmopencl/Makefile
                  lib/kernel/Makefile
-                 lib/kernel/cellspu/Makefile
                  lib/kernel/host/Makefile
                  lib/kernel/tce/Makefile
                  lib/kernel/hsail64/Makefile
@@ -1576,8 +1616,10 @@ AC_CONFIG_FILES([Makefile
                  examples/Rodinia/Makefile
                  examples/Parboil/Makefile
                  examples/ViennaCL/Makefile
+                 examples/IntelSVM/Makefile
                  examples/AMD/Makefile
                  examples/AMDSDK2.9/Makefile
+                 examples/AMDSDK3.0/Makefile
                  examples/EinsteinToolkit/Makefile
                  examples/VexCL/Makefile
                  examples/piglit/Makefile
@@ -1594,8 +1636,6 @@ AC_CONFIG_FILES([Makefile
                  tests/tce/ttasim/Makefile
                  tests/tce/tcemc/Makefile
                  tests/tce/fp16/Makefile
-                 tests/cell/Makefile
-                 tests/cell/hello/Makefile
                  ])
 
 AC_OUTPUT
diff --git a/doc/luxmark.txt b/doc/luxmark.txt
new file mode 100644
index 0000000..712692c
--- /dev/null
+++ b/doc/luxmark.txt
@@ -0,0 +1,13 @@
+All of the Luxmark v2.0 scenes work with pocl. 
+
+Tested with an LLVM 3.3 (trunk at 2013-03-14).
+
+There's a small issue in the Luxmark itself:
+
+http://www.luxrender.net/forum/viewtopic.php?f=34&t=7769&p=93888#p93888
+
+To circumvent this, use the 'basic' device driver which
+reports only CL_DEVICE_TYPE_CPU. It doesn't multithread so
+the best performance is not reached, but should be OK for
+verification.
+
diff --git a/doc/sphinx/source/conf.py b/doc/sphinx/source/conf.py
index ab52649..93c1931 100644
--- a/doc/sphinx/source/conf.py
+++ b/doc/sphinx/source/conf.py
@@ -45,9 +45,9 @@ copyright = u'2010-2015, pocl developers'
 # built documents.
 #
 # The short X.Y version.
-version = '0.12'
+version = '0.13'
 # The full version, including alpha/beta/rc tags.
-release = '0.12'
+release = '0.13'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/doc/sphinx/source/env_variables.rst b/doc/sphinx/source/env_variables.rst
index b1d0495..80e6d2e 100644
--- a/doc/sphinx/source/env_variables.rst
+++ b/doc/sphinx/source/env_variables.rst
@@ -1,23 +1,23 @@
-Tuning pocl behavior
---------------------
+Tuning pocl behavior with ENV variables
+---------------------------------------
 
 The behavior of pocl can be controlled with multiple environment variables
 listed below. The variables are helpful both when using and when developing
 pocl.
 
-* POCL_BUILDING
+- **POCL_BUILDING**
 
  If  set, the pocl helper scripts, kernel library and headers are 
  searched first from the pocl build directory.
 
-* POCL_BBVECTORIZE
+- **POCL_BBVECTORIZE**
 
  If set to 1, makes the pocl kernel compiler execute the LLVM BBVectorizer in
  addition to the SLP vectorizer and the inner loop vectorizer. BBVectorizer
  has known stability issues, therefore it's disabled by default, but it can
  provide performance improvements. See: https://github.com/pocl/pocl/issues/251
 
-* POCL_CACHE_DIR
+- **POCL_CACHE_DIR**
 
  If this is set to an existing directory, pocl uses it as the cache
  directory for all compilation results. This allows reusing compilation
@@ -25,35 +25,36 @@ pocl.
  default cache directory will be used, which is ``$XDG_CACHE_DIR/pocl/kcache``
  (if set) or ``$HOME/.cache/pocl/kcache/`` on Unix-like systems.
 
-* POCL_DEBUG
+- **POCL_DEBUG**
 
  Enables debug messages to stderr. This will be mostly messages from error
- condition checks in OpenCL API calls. Useful to e.g. distinguish between various
- reasons a call can return CL_INVALID_VALUE. If clock_gettime is available,
- messages will include a timestamp.
+ condition checks in OpenCL API calls and Event/API timing information.
+ Useful to e.g. distinguish between various reasons a call could return
+ CL_INVALID_VALUE. If clock_gettime is available, messages
+ will include a timestamp.
 
-* POCL_DEVICES and POCL_x_PARAMETERS
+- **POCL_DEVICES** and **POCL_x_PARAMETERS**
 
  POCL_DEVICES is a space separated list of the device instances to be enabled.
  This environment variable is used for the following devices:
 
- *         basic        A minimalistic example device driver for executing
+ *         **basic**    A minimalistic example device driver for executing
                         kernels on the host CPU. No multithreading.
 
- *         pthread      Native kernel execution on the host CPU with
+ *         **pthread**  Native kernel execution on the host CPU with
                         threaded execution of work groups using pthreads.
 
- *         ttasim       Device that simulates a TTA device using the
+ *         **ttasim**   Device that simulates a TTA device using the
                         TCE's ttasim library. Enabled only if TCE libraries
                         installed.
 
- *         hsa          Uses HSA Runtime API to control HSA-compliant
+ *         **hsa**      Uses HSA Runtime API to control HSA-compliant
                         kernel agents that support HSAIL finalization.
 
  If POCL_DEVICES is not set, one pthread device will be used.
  To specify parameters for drivers, the POCL_<drivername><instance>_PARAMETERS
  environment variable can be specified (where drivername is in uppercase).
- Example:
+ Example::
 
   export POCL_DEVICES="pthread ttasim ttasim"
   export POCL_TTASIM0_PARAMETERS="/path/to/my/machine0.adf"
@@ -65,54 +66,52 @@ pocl.
  POCL_TTASIM0_PARAMETERS will be passed to the first ttasim driver instantiated
  and POCL_TTASIM1_PARAMETERS to the second one.
 
-* POCL_IMPLICIT_FINISH
+- **POCL_IMPLICIT_FINISH**
 
  Add an implicit call to clFinish afer every clEnqueue* call. Useful mostly for
  pocl internal development, and is enabled only if pocl is configured with
- '--enable-debug'.
+ ``--enable-debug``.
 
-* POCL_KERNEL_CACHE
+- **POCL_KERNEL_CACHE**
 
  If this is set to 0 at runtime, kernel compilation files will be deleted at
  clReleaseProgram(). Note that it's currently not possible for pocl to avoid
  interacting with LLVM via on-disk files, so pocl requires some disk space at
- least temporarily (at runtime). Also, the locking mechanism on cache files
- does not work with LLVM < 3.5, so you might get errors if you try to run
- multiple processes running the same CL code via pocl.
+ least temporarily (at runtime).
 
-* POCL_KERNEL_COMPILER_OPT_SWITCH
+- **POCL_KERNEL_COMPILER_OPT_SWITCH**
 
  Override the default "-O3" that is passed to the LLVM opt as a final
  optimization switch.
 
-* POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES
+- **POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES**
 
  If this is set to 1, the kernel compiler cache/temporary directory that
  contains all the intermediate compiler files are left as it is. This
  will be handy for debugging
 
-* POCL_MAX_PTHREAD_COUNT
+- **POCL_MAX_PTHREAD_COUNT**
 
  The maximum number of threads created for work group execution in the
  pthread device driver. The default is to determine this from the number of
  hardware threads available in the CPU.
 
-* POCL_MAX_WORK_GROUP_SIZE
+- **POCL_MAX_WORK_GROUP_SIZE**
 
  Forces the maximum WG size returned by the device or kernel work group queries
  to be at most this number.
 
-* POCL_VECTORIZER_REMARKS
+- **POCL_VECTORIZER_REMARKS**
 
  When set to 1, prints out remarks produced by the loop vectorizer of LLVM
  during kernel compilation.
 
-* POCL_VERBOSE
+- **POCL_VERBOSE**
 
  If set to 1, output the LLVM commands as they are executed to compile
  and run kernels.
 
-* POCL_WORK_GROUP_METHOD
+- **POCL_WORK_GROUP_METHOD**
 
  The kernel compiler method to produce the work group functions from
  multiple work items. Legal values:
diff --git a/doc/sphinx/source/faq.rst b/doc/sphinx/source/faq.rst
index afb3b65..71ff76e 100644
--- a/doc/sphinx/source/faq.rst
+++ b/doc/sphinx/source/faq.rst
@@ -7,9 +7,12 @@ are listed here.
 Using pocl
 ----------
 
+.. _supported-compilers:
+
 Supported compilers and compiler combinations
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Note that pocl usually uses two different compilers (though may be built
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Pocl usually uses two different compilers (though may be built
 using only one). One is used to compile C and C++ files - this is usually
 the "system compiler". It's specified by CC and CXX vars to configure
 script, or CMAKE_C{,XX}_COMPILER variables to cmake, but usually just
diff --git a/doc/sphinx/source/features.rst b/doc/sphinx/source/features.rst
index eec3ae2..381d56f 100644
--- a/doc/sphinx/source/features.rst
+++ b/doc/sphinx/source/features.rst
@@ -7,6 +7,17 @@ listed here as encountered.
 Frontend/Clang
 --------------
 
+* OpenCL 1.x
+
+  * OpenGL interoperability
+  * Image support is incomplete
+
+* OpenCL 2.0
+
+  * generic address space (recognized by LLVM 3.8+ but incomplete)
+  * pipes (WIP)
+  * device-side enqueue
+
 * cl_khr_f16: half precision float literals
 
   Compiling "3434.0h" fails with:
@@ -14,3 +25,10 @@ Frontend/Clang
 
   Tested with Clang 3.4 on 2014-07-10.
 
+
+Unimplemented host side functions
+---------------------------------
+
+The list of unimplemented host-side API functions can be seen as the NULLs in the ICD dispatch struct in
+https://github.com/pocl/pocl/blob/master/lib/CL/clGetPlatformIDs.c
+
diff --git a/doc/sphinx/source/hsa.rst b/doc/sphinx/source/hsa.rst
index d2c44e6..9eabb07 100644
--- a/doc/sphinx/source/hsa.rst
+++ b/doc/sphinx/source/hsa.rst
@@ -21,48 +21,68 @@ Installing prerequisite software
   lists some common issues (like /dev/kfd permissions) and run sample/vector_copy
   to verify you have a working runtime.
 
-2) Build HSAIL-Tools
-~~~~~~~~~~~~~~~~~~~~~
-
-   `git clone https://github.com/HSAFoundation/HSAIL-Tools`
-
-   In particular **HSAILasm** executable will be required by pocl.
-
-3) Build & install the LLVM with HSAIL support
+2) Build & install the LLVM with HSAIL support
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-  `git clone https://github.com/HSAFoundation/HLC-HSAIL-Development-LLVM/`
+  Fetch the HSAIL branch of LLVM 3.7:
 
-  Use the branch hsail-stable-3.7; before build, patch it with
+  `git clone https://github.com/HSAFoundation/HLC-HSAIL-Development-LLVM/ -b hsail-stable-3.7`
 
-  `pocl/tools/patches/llvm-3.7-hsail-branch.patch`
+  Patch it a bit with:
 
-  Build it with a Clang 3.7 (branch release_37)
+  `patch -p1 -i PATHTO/pocl/tools/patches/llvm-3.7-hsail-branch.patch`
 
-  `cd tools; svn co http://llvm.org/svn/llvm-project/cfe/branches/release_37 clang`
+  Fetch the upstream Clang's 3.7 branch:
 
-  patched with
+  `cd tools; svn co http://llvm.org/svn/llvm-project/cfe/branches/release_37 clang`
 
-  `pocl/tools/patches/clang-3.7-hsail-branch.patch`
+  Patch it also:
 
-  to get the HSAIL Clang support.
+  `cd clang; patch -p0 -i PATHTO/pocl/tools/patches/clang-3.7-hsail-branch.patch`
 
   An LLVM cmake configuration command like this worked for me:
 
-  `mkdir build; cd build; cmake .. -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=HSAIL
-  -DBUILD_SHARED_LIBS=off -DCMAKE_INSTALL_PREFIX=INSTALL_DIR -DLLVM_ENABLE_RTTI=on
-  -DLLVM_BUILD_LLVM_DYLIB=on -DLLVM_ENABLE_EH=ON`
+  `cd ../../ ; mkdir build; cd build; cmake .. -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=HSAIL \
+  -DBUILD_SHARED_LIBS=off -DCMAKE_INSTALL_PREFIX=INSTALL_DIR -DLLVM_ENABLE_RTTI=on \
+  -DLLVM_BUILD_LLVM_DYLIB=on -DLLVM_ENABLE_EH=ON -DHSAIL_USE_LIBHSAIL=OFF`
+
+  HSAIL_USE_LIBHSAIL=OFF is only for safety. If you accidentally build clang with libHSAIL,
+  it will cause mysterious link errors later when building pocl.
 
-  Note that these are **required** :
+  Change INSTALL_DIR to your target prefix of choice. Note that these are **required** :
 
-  `-DLLVM_ENABLE_RTTI=ON -DLLVM_ENABLE_EH=ON
-  -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=HSAIL`
+  `-DLLVM_ENABLE_RTTI=ON -DLLVM_ENABLE_EH=ON -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=HSAIL`
 
   Also, if you don't want to build all the default targets, you'll need AMDGPU.
 
+  Then build and install the Clang/LLVM:
+
+  `make -j4 && make install`
+
+
+3) Build HSAIL-Tools
+~~~~~~~~~~~~~~~~~~~~~
+
+   `git clone https://github.com/HSAFoundation/HSAIL-Tools`
+
+   Build it (check CMAKE_INSTALL_PREFIX):
+
+   `mkdir -p build/lnx64
+    cd build/lnx64
+    cmake ../.. -DCMAKE_INSTALL_PREFIX=$HOME/bin
+    make -j`
+
+   You might need to add
+
+   `-DCMAKE_CXX_FLAGS=-I$HOME/llvm-3.7-hsa/include` or similar to the cmake command line
+   if it doesn't find your LLVM headers.
+
+   In particular **HSAILasm** executable will be required by pocl.
+
 
 4) Build pocl.
 ~~~~~~~~~~~~~~~
+
   Using autotools:
 
     `./configure --with-hsa-runtime-dir=\</opt/hsa\>
@@ -87,6 +107,7 @@ Installing prerequisite software
 
 HSA Support notes
 ------------------
+
 Note that the support is still experimental and very much unfinished. You're
 welcome to try it out and report any issues, though.
 
diff --git a/doc/sphinx/source/index.rst b/doc/sphinx/source/index.rst
index eddb50d..fc3f42f 100644
--- a/doc/sphinx/source/index.rst
+++ b/doc/sphinx/source/index.rst
@@ -11,10 +11,10 @@ Contents:
 .. toctree::
    :maxdepth: 2
 
+   install
    using
    env_variables
    faq
-   env_variables
    development
    releasing
    design
diff --git a/doc/sphinx/source/install.rst b/doc/sphinx/source/install.rst
new file mode 100644
index 0000000..f8c81e8
--- /dev/null
+++ b/doc/sphinx/source/install.rst
@@ -0,0 +1,139 @@
+============
+Installation
+============
+
+Requirements
+------------
+
+In order to build pocl, you need the following support libraries and
+tools:
+
+  * LLVM & Clang; for more details see :ref:`supported-compilers`
+  * GNU make
+  * libtool dlopen wrapper files (e.g. libltdl3-dev in Debian)
+  * pthread (should be installed by default)
+  * hwloc v1.0 or newer (e.g. libhwloc-dev)
+  * pkg-config
+  * autotools or cmake
+
+Clang / LLVM Notes
+------------------
+
+**IMPORTANT NOTE!** Some platforms (TCE and possibly HSA) require that
+you compile & build LLVM with ``make REQUIRES_RTTI=1``, as follows:
+
+  ``./configure --<llvm-configure-options>`` or ``cmake -D<llvm-options>``
+
+  ``make REQUIRES_RTTI=1 && make install``
+
+**Supported versions**
+
+  Note that pocl aims to support **the latest LLVM version** at the time
+  of pocl release, **plus the previous** LLVM version. All older LLVM
+  versions are unsupported.
+
+
+Configure & Build using autotools
+---------------------------------
+
+After all the requirements are installed, the installation procedure
+follows the usual autotools configure, make, make install. If you are
+using a development source tree, you need to generate the autotool
+build files with
+
+  ``./autogen.sh``
+
+Autotools: important options & features
+-----------------------------------------
+
+- ``LLVM_CONFIG`` **IMPORTANT** Path to a llvm-config binary.
+  This determines the LLVM installation used by pocl.
+  If not specified, pocl will try to find and link against
+  llvm-config in PATH env var (usually means your system LLVM).
+- ``--enable-static-llvm`` enable this to link LLVM statically into pocl.
+  Note that you need LLVM built with static libs. This option might result
+  in much longer build times and much larger pocl library, but the
+  resulting libpocl will not require an LLVM installation to run.
+- ``--enable-icd`` and ``--enable-direct-linkage`` By default pocl's
+  buildsystem will try to find an ICD and build pocl as a dynamic library
+  named "libpocl". These options are useful if you want to avoid ICD and
+  build pocl directly as libOpenCL library. See also :ref:`linking-with-icd`
+
+Configure & Build using CMake
+-----------------------------
+
+CMake version 2.8.12 or higher is required.
+
+The build+install is the usual CMake way::
+
+  cd <directory-with-pocl-sources>
+  mkdir build
+  cd build
+  cmake [-D<option>=<value> ...] ..
+
+To see the default detected values, run ``cmake ..`` without any options,
+it will produce a summary.
+
+
+CMake: important options & features
+-------------------------------------
+
+For multiple-item options, use ";" as separator (you'll have to escape it for bash).
+
+- ``-DWITH_LLVM_CONFIG=<path-to-llvm-config>``
+  **IMPORTANT** Path to a llvm-config binary.
+  This determines the LLVM installation used by pocl.
+  If not specified, pocl will try to find and link against
+  llvm-config in PATH env var (usually means your system LLVM).
+- ``-DSTATIC_LLVM`` enable this to link LLVM statically into pocl.
+  Note that you need LLVM built with static libs. This option might result
+  in much longer build/link times and much larger pocl library, but the
+  resulting libpocl will not require an LLVM installation to run.
+- ``-DENABLE_ICD`` and ``-DDIRECT_LINKAGE`` By default pocl's
+  buildsystem will try to find an ICD and build pocl as a dynamic library
+  named "libpocl". These options are useful if you want to avoid ICD and
+  build pocl directly as libOpenCL library. See also :ref:`linking-with-icd`
+- ``-DPOCL_INSTALL_<something>_DIR`` The equivalent of ``--bindir``,
+  ``--sbindir`` etc fine-tuning of paths for autotools. See the beginning
+  of toplevel CMakeLists.txt for all the variables.
+- ``-DKERNELLIB_HOST_CPU_VARIANTS`` You can control which CPUs the
+  kernel library will be built for. Defaults to "native" which will be
+  converted to the build machine's CPU at buildtime. Available CPUs are
+  listed by ``llc -mcpu=help``; you can specify multiple CPUs, and pocl will
+  look for a kernel library for the runtime-detected CPU.
+
+  For x86(64) there is another possibility, ``distro``, which builds a few
+  preselected sse/avx variants covering 99.99% of x86 processors, and pocl
+  will use the most appropriate one at runtime, based on detected CPU features.
+  With ``distro``, the minimum requirement on CPU is SSE2.
+
+- ``-DENABLE_TESTSUITES`` Which external (source outside pocl) testsuites to enable.
+  For the list of testsuites, see examples/CMakeLists.txt or the ``examples``
+  directory. Set to ``all`` and pocl will try to autodetect & enable everything
+  it can.
+
+  Note that you may build testsuites outside pocl's build tree, and test
+  multiple pocl builds with a single testsuite directory. To use this,
+  run cmake with ``-DTESTSUITE_BASEDIR=<tests-builddir>`` and ``-DTESTSUITE_SOURCE_BASEDIR=<tests-sourcedir>``.
+  The directory structure mirrors that of ``pocl/examples``. So to build e.g. AMD SDK 2.9
+  with ``-DTESTSUITE_BASEDIR=/home/pocltest-build -DTESTSUITE_SOURCE_BASEDIR=/home/pocltest-src``,
+  place the ``AMD-APP-SDK-v2.9-RC-lnx64.tgz`` file into ``/home/pocltest-src/AMDSDK2.9`` directory.
+
+
+Known build-time issues
+-----------------------
+
+There are unsolved issues and bugs in pocl. For a complete listing,
+see the issue tracker at https://github.com/pocl/pocl/issues
+
+Known issues not related to pocl are listed below.
+
+- automake 1.11 is known to work,
+  automake 1.9.6 might not work
+
+- Using Clang compiled with gcc 4.7 causes indeterminism in the
+  kernel compilation results. See the LLVM bug report:
+  http://llvm.org/bugs/show_bug.cgi?id=12945
+
+- autogen.sh whines about AC_MSG_ERROR(). This happens (for some reason)
+  if you do not have pkg-config installed.
diff --git a/doc/sphinx/source/using.rst b/doc/sphinx/source/using.rst
index 3680d6c..0182fe5 100644
--- a/doc/sphinx/source/using.rst
+++ b/doc/sphinx/source/using.rst
@@ -10,6 +10,8 @@ Android applications can use pocl using jni. App has to dlopen
 “/data/data/org.pocl.libs/files/lib/libpocl.so” and dlsym OpenCL function
 symbols from it.
 
+.. _linking-with-icd:
+
 Linking your program with pocl through an icd loader
 ----------------------------------------------------
 
diff --git a/examples/AMD/CMakeLists.txt b/examples/AMD/CMakeLists.txt
new file mode 100644
index 0000000..b82039b
--- /dev/null
+++ b/examples/AMD/CMakeLists.txt
@@ -0,0 +1,115 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+
+if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
+
+  set(TS_NAME AMD)
+  set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+  set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+  set(AMD_APP_SDK_TGZ "${TS_SRCDIR}/AMD-APP-SDK-v2.8-RC-lnx64.tgz")
+
+  if(EXISTS "${AMD_APP_SDK_TGZ}")
+
+    message(STATUS "Enabling testsuite ${TS_NAME}")
+    list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+    set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+    ExternalProject_Add(
+      ${TS_NAME}
+      PREFIX "${TS_BASEDIR}"
+      DOWNLOAD_COMMAND "/bin/true"
+      PATCH_COMMAND tar -xzf "${AMD_APP_SDK_TGZ}" && pwd && patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/AMDSDK.patch
+      CONFIGURE_COMMAND "/bin/true"
+      BUILD_IN_SOURCE 1
+      BUILD_COMMAND "/bin/true"
+      INSTALL_COMMAND "/bin/true"
+    )
+
+    set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+    add_dependencies(prepare_examples ${TS_NAME})
+
+    set(AMD_SAMPLES
+    AESEncryptDecrypt
+    AtomicCounters
+    BinarySearch
+    BinomialOption
+    BinomialOptionMultiGPU
+    BitonicSort
+    BlackScholes
+    BlackScholesDP
+    BoxFilter
+    BufferBandwidth
+    DCT
+    DeviceFission
+    DeviceFission11Ext
+    DwtHaar1D
+    FastWalshTransform
+    FloydWarshall
+    HelloWorld
+    Histogram
+    ImageBandwidth
+    ImageOverlap
+    KernelLaunch
+    LUDecomposition
+    MatrixMulImage
+    MatrixMultiplication
+    MatrixTranspose
+    MemoryModel
+    MonteCarloAsian
+    MonteCarloAsianMultiGPU
+    PrefixSum
+    QuasiRandomSequence
+    RadixSort
+    RecursiveGaussian
+    Reduction
+    ScanLargeArrays
+    SimpleConvolution
+    SimpleImage
+    SimpleMultiDevice
+    SobelFilter
+    Template
+    TemplateC
+    TransferOverlap
+    URNG)
+
+    # disabled tests:  Mandelbrot NBody
+    #   SimpleGL BoxFilterGL FluidSimulation2D GaussianNoiseGL
+    #   MonteCarloAsianDP AdvancedMultiGPU BasicDebug
+
+    foreach(SAMPLE IN LISTS AMD_SAMPLES)
+      add_test(NAME "AMD_28_${SAMPLE}"
+        COMMAND "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.8-RC-lnx64/samples/opencl/bin/x86_64/${SAMPLE}"
+        WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.8-RC-lnx64/samples/opencl/bin/x86_64")
+      set_tests_properties("AMD_28_${SAMPLE}" PROPERTIES LABELS "amdsdk_28")
+    endforeach()
+
+  else()
+    message(WARNING "Disabling testsuite ${TS_NAME} - tarball not found")
+  endif()
+
+else()
+  message(warning "AMD APP SDK testsuite is only enabled for x86_64 Linux systems currently")
+endif()
diff --git a/examples/AMD/Makefile.in b/examples/AMD/Makefile.in
index 4c32cc4..dd1c3f8 100644
--- a/examples/AMD/Makefile.in
+++ b/examples/AMD/Makefile.in
@@ -193,6 +193,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -200,6 +201,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -217,8 +219,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -294,6 +294,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/AMDSDK2.9/CMakeLists.txt b/examples/AMDSDK2.9/CMakeLists.txt
new file mode 100644
index 0000000..87f9d53
--- /dev/null
+++ b/examples/AMDSDK2.9/CMakeLists.txt
@@ -0,0 +1,123 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+
+if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
+
+  set(TS_NAME AMDSDK2.9)
+  set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+  set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+  set(AMD_APP_SDK_TGZ "${TS_SRCDIR}/AMD-APP-SDK-v2.9-RC-lnx64.tgz")
+
+  if(EXISTS "${AMD_APP_SDK_TGZ}")
+
+    message(STATUS "Enabling testsuite ${TS_NAME}")
+    list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+    set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+    ExternalProject_Add(
+      ${TS_NAME}
+      PREFIX "${TS_BASEDIR}"
+      DOWNLOAD_COMMAND "/bin/true"
+      PATCH_COMMAND tar -xzf "${AMD_APP_SDK_TGZ}" && pwd && patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/AMDSDK2_9.patch
+      CONFIGURE_COMMAND "/bin/true"
+      BUILD_IN_SOURCE 1
+      BUILD_COMMAND "/bin/true"
+      INSTALL_COMMAND "/bin/true"
+    )
+
+    set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+    add_dependencies(prepare_examples ${TS_NAME})
+
+    set(AMD_SAMPLES
+    BinarySearch
+    BinomialOption
+    BinomialOptionMultiGPU
+    BitonicSort
+    BlackScholes
+    BlackScholesDP
+    DCT
+    DwtHaar1D
+    DynamicOpenCLDetection
+    FastWalshTransform
+    FloydWarshall
+    GlobalMemoryBandwidth
+    HelloWorld
+    Histogram
+    HistogramAtomics
+    ImageOverlap
+    LUDecomposition
+    MatrixMulImage
+    MatrixMultiplication
+    MatrixTranspose
+    MemoryModel
+    MonteCarloAsianMultiGPU
+    PrefixSum
+    QuasiRandomSequence
+    RadixSort
+    RecursiveGaussian
+    Reduction
+    ScanLargeArrays
+    SimpleConvolution
+    SimpleImage
+    SimpleMultiDevice
+    SobelFilter
+    StringSearch
+    Template
+    TransferOverlap
+    URNG
+    URNGNoiseGL)
+
+    # disabled c++ tests: DwtHaar1DCPPKernel EigenValue FFT IntroStaticCPPKernel MatrixMultiplicationCPPKernel MersenneTwister SoAversusAoS TransferOverlapCPP
+    # disabled but fixable: CplusplusWrapper DeviceFission FluidSimulation2D GaussianNoise HDRToneMapping ImageBandwidth KernelLaunch MatrixMulDouble MonteCarloAsian MonteCarloAsianDP SobelFilterImage UnsharpMask
+    # disabled tests: NBody AtomicCounters BasicDebug DeviceFission11Ext
+    # disabled graphics tests: Mandelbrot  KmeansAutoclustering GaussianNoiseGL SimpleGL
+    # very slow: LDSBandwidth ConstantBandwidth MemoryOptimizations
+
+    foreach(SAMPLE IN LISTS AMD_SAMPLES)
+      add_test(NAME "AMD_29_${SAMPLE}"
+        COMMAND "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/bin/x86_64/${SAMPLE}"
+        WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/bin/x86_64")
+      set_tests_properties("AMD_29_${SAMPLE}" PROPERTIES LABELS "amdsdk_29")
+    endforeach()
+
+  # TODO
+  set_tests_properties(
+    AMD_29_BitonicSort AMD_29_BinarySearch AMD_29_BinomialOption AMD_29_DCT
+    AMD_29_BlackScholes AMD_29_FastWalshTransform AMD_29_FloydWarshall
+    AMD_29_HelloWorld AMD_29_Histogram AMD_29_MatrixMultiplication
+    AMD_29_MatrixTranspose AMD_29_PrefixSum AMD_29_QuasiRandomSequence
+    AMD_29_ScanLargeArrays AMD_29_SimpleConvolution AMD_29_URNG
+    PROPERTIES
+      LABELS "hsa")
+
+
+  else()
+    message(WARNING "Disabling testsuite ${TS_NAME} - tarball not found")
+  endif()
+
+else()
+  message(warning "AMD APP SDK testsuite is only enabled for x86_64 Linux systems currently")
+endif()
diff --git a/examples/AMDSDK2.9/Makefile.in b/examples/AMDSDK2.9/Makefile.in
index 9a914d8..4beb7e1 100644
--- a/examples/AMDSDK2.9/Makefile.in
+++ b/examples/AMDSDK2.9/Makefile.in
@@ -193,6 +193,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -200,6 +201,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -217,8 +219,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -294,6 +294,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/AMDSDK3.0/CMakeLists.txt b/examples/AMDSDK3.0/CMakeLists.txt
new file mode 100644
index 0000000..1bb0bd2
--- /dev/null
+++ b/examples/AMDSDK3.0/CMakeLists.txt
@@ -0,0 +1,152 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+
+if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
+
+  set(TS_NAME AMDSDK3.0)  # testsuite name; also used as the ExternalProject target name below
+  set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")  # passed as PREFIX to ExternalProject_Add
+  set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")  # directory searched for the SDK installer
+  set(AMD_APP_SDK_TGZ "${TS_SRCDIR}/AMD-APP-SDK-v3.0.130.135-GA-linux64.sh")  # a .sh installer, despite the _TGZ name
+
+  if(EXISTS "${AMD_APP_SDK_TGZ}")
+
+    message(STATUS "Enabling testsuite ${TS_NAME}")
+    list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+    set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+    ExternalProject_Add(
+      ${TS_NAME}
+      PREFIX "${TS_BASEDIR}"
+      DOWNLOAD_COMMAND "/bin/true"  # no-op: the installer must already exist locally (see the EXISTS check)
+      PATCH_COMMAND /bin/sh "${AMD_APP_SDK_TGZ}" --noexec --keep --target AMD-APP-SDK-3.0 &&  # run the installer script, then patch
+             patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/amdsdk3_0.patch
+      CONFIGURE_COMMAND "/bin/true"  # no-op
+      BUILD_IN_SOURCE 1
+      BUILD_COMMAND "/bin/true"  # no-op: the tests below run binaries from the unpacked bin/x86_64 directory
+      INSTALL_COMMAND "/bin/true"  # no-op
+    )
+
+    set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+    add_dependencies(prepare_examples ${TS_NAME})
+
+    set(AMD_SAMPLES
+    AdvancedConvolution
+    BasicDebug
+    BinomialOption
+    BinomialOptionMultiGPU
+    BitonicSort
+    BlackScholes
+    BlackScholesDP
+    BoxFilter
+    BufferImageInterop
+    BuiltInScan
+    CalcPie
+    ConcurrentKernel
+    DCT
+    DeviceFission
+    DeviceFission11Ext
+    DwtHaar1D
+    FastWalshTransform
+    FloydWarshall
+    GaussianNoise
+    HDRToneMapping
+    HelloWorld
+    Histogram
+    HistogramAtomics
+    ImageBinarization
+    ImageOverlap
+    KernelLaunch
+    LUDecomposition
+    MatrixMulImage
+    MatrixMultiplication
+    MatrixTranspose
+    MemoryModel
+    MonteCarloAsian
+    MonteCarloAsianMultiGPU
+    NBody
+    PrefixSum
+    QuasiRandomSequence
+    RadixSort
+    RecursiveGaussian
+    Reduction
+    ScanLargeArrays
+    SimpleConvolution
+    SimpleImage
+    SimpleMultiDevice
+    SobelFilter
+    StringSearch
+    Template
+    UnsharpMask
+    URNG)
+
+    # freeze (requires async queue): FineGrainSVM
+    # disabled c++ tests: DwtHaar1DCPPKernel EigenValue
+    #     IntroStaticCPPKernel FFT MatrixMultiplicationCPPKernel
+    #     MersenneTwister SoAversusAoS TransferOverlapCPP
+    # disabled but fixable: BuiltInScan CalcPie CplusplusWrapper
+    #     DeviceFission FluidSimulation2D GaussianNoise HDRToneMapping
+    #     HeatPDE ImageBandwidth KernelLaunch MatrixMulDouble
+    #     MonteCarloAsianDP SobelFilterImage UnsharpMask DynamicOpenCLDetection
+    #     FineGrainSVMCAS RecursiveGaussian_ProgramScope SobelFilterImage
+    #     SVMAtomicsBinaryTreeInsert SVMBinaryTreeSearch
+    # disabled tests: AsyncDataTransfer AtomicCounters BasicDebug
+    #     DeviceFission11Ext RangeMinimumQuery SimpleDepthImage
+    #     SimpleGenericAddressSpace
+    # disabled graphics tests: Mandelbrot  KmeansAutoclustering
+    #     GaussianNoiseGL SimpleGL BoxFilterGL URNGNoiseGL
+    # very slow: BufferBandwidth LDSBandwidth ConstantBandwidth
+    #     MemoryOptimizations TransferOverlap
+    # disabled - requires dev queue: BinarySearchDeviceSideEnqueue
+    #     DeviceEnqueueBFS ExtractPrimes GlobalMemoryBandwidth
+    #     RegionGrowingSegmentation SimpleSPIR
+    # disabled - requires pipe: PipeProducerConsumerKernels SimplePipe
+
+
+    foreach(SAMPLE IN LISTS AMD_SAMPLES)  # register one test, AMD_30_<sample>, per list entry
+      add_test(NAME "AMD_30_${SAMPLE}"
+        COMMAND "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-3.0/samples/opencl/bin/x86_64/${SAMPLE}"
+        WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-3.0/samples/opencl/bin/x86_64")  # run from the directory holding the binary
+      set_tests_properties("AMD_30_${SAMPLE}" PROPERTIES LABELS "amdsdk_30")  # label used to select this testsuite
+    endforeach()
+
+    # set_tests_properties() overwrites LABELS, so "amdsdk_30" is repeated next to "hsa".
+    # Named by the original author but not in AMD_SAMPLES: AMD_30_HeatPDE AMD_30_FineGrainSVMCAS
+    set_tests_properties(
+        AMD_30_BinomialOption AMD_30_CalcPie AMD_30_DCT AMD_30_BlackScholes
+        AMD_30_FastWalshTransform AMD_30_FloydWarshall AMD_30_HelloWorld
+        AMD_30_Histogram AMD_30_MatrixMultiplication AMD_30_MatrixTranspose
+        AMD_30_PrefixSum AMD_30_QuasiRandomSequence AMD_30_ScanLargeArrays
+        AMD_30_SimpleConvolution AMD_30_URNG
+      PROPERTIES
+        LABELS "amdsdk_30;hsa")
+
+  else()
+    message(WARNING "Disabling testsuite ${TS_NAME} - tarball not found")
+  endif()
+
+else()
+  message(WARNING "AMD APP SDK testsuite is only enabled for x86_64 Linux systems currently")
+endif()
diff --git a/examples/AMDSDK3.0/Makefile.am b/examples/AMDSDK3.0/Makefile.am
new file mode 100644
index 0000000..0a01381
--- /dev/null
+++ b/examples/AMDSDK3.0/Makefile.am
@@ -0,0 +1,417 @@
+# Process this file with automake to produce Makefile.in (in this,
+# and all subdirectories).
+# Makefile.am for pocl/examples/AMD.
+#
+# Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
+# Copyright (c) 2012 Vincent Danjean <Vincent.Danjean at ens-lyon.org>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+EXTRA_DIST = $(srcdir)/*.patch
+
+if TEST_SUITE_AMDSDK3_0
+testsuite_pocl_dir=${abs_top_srcdir}/examples/AMDSDK3.0
+testsuite_src_dir=${testsuite_pocl_dir}/AMD-APP-SDK-3.0
+ocl_sample_dir=${testsuite_pocl_dir}/AMD-APP-SDK-3.0/samples/opencl
+tar_bz=AMD-APP-SDKInstaller-v3.0.130.135-GA-linux64.tar.bz2
+tar_sh=AMD-APP-SDK-v3.0.130.135-GA-linux64.sh
+build_type=RelWithDebInfo
+
+.PHONY: build prepare-examples
+
+prepare-examples: $(testsuite_src_dir) build
+
+$(tar_sh): $(tar_bz)
+	test -f $(tar_sh) || (cd $(testsuite_pocl_dir) && tar xjf $(tar_bz) )
+
+$(testsuite_src_dir): $(tar_sh)
+	test -d $(testsuite_src_dir) || (cd $(testsuite_pocl_dir) && ./$(tar_sh) --noexec --keep --target AMD-APP-SDK-3.0 && \
+	( patch -sNp1 < $(testsuite_pocl_dir)/amdsdk3_0.patch || true ) && \
+	find $(testsuite_src_dir)/samples -name "CMakeLists.txt" -exec sed -i \
+		-e 's/mark_as_advanced( OPENCL_LIBRARIES )/set(OPENCL_LIBRARIES "-lOpenCL")/g' \
+		-e 's/mark_as_advanced(OPENCL_LIBRARIES)/set(OPENCL_LIBRARIES "-lOpenCL")/g' "{}" \;)
+
+# Some of the tests do not build with OpenGL 3.0. Just skip them by using make -k.
+build: $(testsuite_src_dir)
+	cd $(testsuite_src_dir)/samples && \
+	LDFLAGS="-pthread ${LDFLAGS}" cmake -DCMAKE_BUILD_TYPE=$(build_type) -DBUILD_OPENCL=ON -DBUILD_OPENCV=OFF \
+	-DBUILD_BOLT=OFF . && cmake --build . -- -k -j`getconf _NPROCESSORS_ONLN`
+
+test_AsyncDataTransfer:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/AsyncDataTransfer/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./AsyncDataTransfer -e
+
+test_AtomicCounters:
+	cd $(ocl_sample_dir)/cl/1.x/AtomicCounters/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./AtomicCounters -e
+
+test_BasicDebug:
+	cd $(ocl_sample_dir)/cl/1.x/BasicDebug/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BasicDebug -e
+
+test_BinomialOption:
+	cd $(ocl_sample_dir)/cl/1.x/BinomialOption/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BinomialOption -e
+
+test_BinomialOptionMultiGPU:
+	cd $(ocl_sample_dir)/cl/1.x/BinomialOptionMultiGPU/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BinomialOptionMultiGPU -e
+
+test_BitonicSort:
+	cd $(ocl_sample_dir)/cl/1.x/BitonicSort/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BitonicSort -e
+
+test_BlackScholes:
+	cd $(ocl_sample_dir)/cl/1.x/BlackScholes/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BlackScholes -e
+
+test_BlackScholesDP:
+	cd $(ocl_sample_dir)/cl/1.x/BlackScholesDP/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BlackScholesDP -e
+
+test_BoxFilter:
+	cd $(ocl_sample_dir)/cl/1.x/BoxFilter/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BoxFilter -e
+
+test_BoxFilterGL:
+	cd $(ocl_sample_dir)/cl/1.x/BoxFilterGL/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BoxFilterGL -e
+
+test_BufferBandwidth:
+	cd $(ocl_sample_dir)/benchmark/BufferBandwidth/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BufferBandwidth -e
+
+test_ConcurrentKernel:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/ConcurrentKernel/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ConcurrentKernel -e
+
+test_ConstantBandwidth:
+	cd $(ocl_sample_dir)/benchmark/ConstantBandwidth/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ConstantBandwidth -e
+
+test_CplusplusWrapper:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/CplusplusWrapper/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./CplusplusWrapper -e
+
+test_DCT:
+	cd $(ocl_sample_dir)/cl/1.x/DCT/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DCT -e
+
+test_DeviceFission:
+	cd $(ocl_sample_dir)/cl/1.x/DeviceFission/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DeviceFission -e
+
+test_DeviceFission11Ext:
+	cd $(ocl_sample_dir)/cl/1.x/DeviceFission11Ext/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DeviceFission11Ext -e
+
+test_DwtHaar1D:
+	cd $(ocl_sample_dir)/cl/1.x/DwtHaar1D/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DwtHaar1D -e
+
+test_DwtHaar1DCPPKernel:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/DwtHaar1DCPPKernel/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DwtHaar1DCPPKernel -e
+
+test_Eigenvalue:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/Eigenvalue/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Eigenvalue -e
+
+test_FastWalshTransform:
+	cd $(ocl_sample_dir)/cl/1.x/FastWalshTransform/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FastWalshTransform -e
+
+test_FFT:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/FFT/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FFT -e
+
+test_FloydWarshall:
+	cd $(ocl_sample_dir)/cl/1.x/FloydWarshall/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FloydWarshall -e
+
+test_FluidSimulation2D:
+	cd $(ocl_sample_dir)/cl/1.x/FluidSimulation/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FluidSimulation2D -e
+
+test_GaussianNoise:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/GaussianNoise/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./GaussianNoise -e --platformId 0
+
+test_GaussianNoiseGL:
+	cd $(ocl_sample_dir)/cl/1.x/GaussianNoiseGL/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./GaussianNoiseGL -e
+
+test_GlobalMemoryBandwidth:
+	cd $(ocl_sample_dir)/benchmark/GlobalMemoryBandwidth/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./GlobalMemoryBandwidth -e
+
+test_HDRToneMapping:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/HDRToneMapping/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./HDRToneMapping -e --platformId 0
+
+test_HelloWorld:
+	cd $(ocl_sample_dir)/cl/1.x/HelloWorld/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./HelloWorld -e
+
+test_Histogram:
+	cd $(ocl_sample_dir)/cl/1.x/Histogram/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Histogram -e
+
+test_HistogramAtomics:
+	cd $(ocl_sample_dir)/cl/1.x/HistogramAtomics/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./HistogramAtomics -e
+
+test_ImageBandwidth:
+	cd $(ocl_sample_dir)/cl/1.x/ImageBandwidth/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ImageBandwidth -e
+
+test_ImageOverlap:
+	cd $(ocl_sample_dir)/cl/1.x/ImageOverlap/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ImageOverlap -e
+
+test_IntroStaticCPPKernel:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/IntroStaticCPPKernel/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./IntroStaticCPPKernel -e
+
+test_KernelLaunch:
+	cd $(ocl_sample_dir)/cl/1.x/KernelLaunch/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./KernelLaunch -e
+
+test_KmeansAutoclustering:
+	cd $(ocl_sample_dir)/cl/1.x/KmeansAutoclustering/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./KmeansAutoclustering -e
+
+test_LDSBandwidth:
+	cd $(ocl_sample_dir)/benchmark/LDSBandwidth/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./LDSBandwidth -e
+
+test_LUDecomposition:
+	cd $(ocl_sample_dir)/cl/1.x/LUDecomposition/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./LUDecomposition -e
+
+test_Mandelbrot:
+	cd $(ocl_sample_dir)/cl/1.x/Mandelbrot/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Mandelbrot -e
+
+test_MatrixMulDouble:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/MatrixMulDouble/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MatrixMulDouble -e --platformId 0
+
+test_MatrixMulImage:
+	cd $(ocl_sample_dir)/cl/1.x/MatrixMulImage/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MatrixMulImage -e
+
+test_MatrixMultiplication:
+	cd $(ocl_sample_dir)/cl/1.x/MatrixMultiplication/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MatrixMultiplication -e
+
+test_MatrixTranspose:
+	cd $(ocl_sample_dir)/cl/1.x/MatrixTranspose/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MatrixTranspose -e
+
+test_MemoryModel:
+	cd $(ocl_sample_dir)/cl/1.x/MemoryModel/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MemoryModel -e
+
+test_MemoryOptimizations:
+	cd $(ocl_sample_dir)/benchmark/MemoryOptimizations/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MemoryOptimizations -e
+
+test_MersenneTwister:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/MersenneTwister/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MersenneTwister -e
+
+test_MonteCarloAsian:
+	cd $(ocl_sample_dir)/cl/1.x/MonteCarloAsian/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MonteCarloAsian -e
+
+test_MonteCarloAsianDP:
+	cd $(ocl_sample_dir)/cl/1.x/MonteCarloAsianDP/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MonteCarloAsianDP -e
+
+test_MonteCarloAsianMultiGPU:
+	cd $(ocl_sample_dir)/cl/1.x/MonteCarloAsianMultiGPU/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MonteCarloAsianMultiGPU -e
+
+test_NBody:
+	cd $(ocl_sample_dir)/cl/1.x/NBody/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./NBody -e
+
+test_PrefixSum:
+	cd $(ocl_sample_dir)/cl/1.x/PrefixSum/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./PrefixSum -e
+
+test_QuasiRandomSequence:
+	cd $(ocl_sample_dir)/cl/1.x/QuasiRandomSequence/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./QuasiRandomSequence -e
+
+test_RadixSort:
+	cd $(ocl_sample_dir)/cl/1.x/RadixSort/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RadixSort -e
+
+test_RecursiveGaussian:
+	cd $(ocl_sample_dir)/cl/1.x/RecursiveGaussian/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RecursiveGaussian -e
+
+test_Reduction:
+	cd $(ocl_sample_dir)/cl/1.x/Reduction/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Reduction -e
+
+test_ScanLargeArrays:
+	cd $(ocl_sample_dir)/cl/1.x/ScanLargeArrays/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ScanLargeArrays -e
+
+test_SimpleConvolution:
+	cd $(ocl_sample_dir)/cl/1.x/SimpleConvolution/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleConvolution -e
+
+test_SimpleGL:
+	cd $(ocl_sample_dir)/cl/1.x/SimpleGL/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleGL -e
+
+test_SimpleImage:
+	cd $(ocl_sample_dir)/cl/1.x/SimpleImage/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleImage -e
+
+test_SimpleSPIR:
+	cd $(ocl_sample_dir)/cl/1.x/SimpleSPIR/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleSPIR -e
+
+test_SoAversusAoS:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/SoAversusAoS/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SoAversusAoS -e
+
+test_SobelFilter:
+	cd $(ocl_sample_dir)/cl/1.x/SobelFilter/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SobelFilter -e
+
+test_SobelFilterImage:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/SobelFilterImage/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SobelFilterImage -e
+
+test_StringSearch:
+	cd $(ocl_sample_dir)/cl/1.x/StringSearch/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./StringSearch -e
+
+test_Template:
+	cd $(ocl_sample_dir)/cl/1.x/Template/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Template -e
+
+test_TransferOverlap:
+	cd $(ocl_sample_dir)/cl/1.x/TransferOverlap/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./TransferOverlap -e
+
+test_TransferOverlapCPP:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/TransferOverlapCPP/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./TransferOverlapCPP -e
+
+test_UnsharpMask:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/UnsharpMask/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./UnsharpMask -e
+
+test_URNG:
+	cd $(ocl_sample_dir)/cl/1.x/URNG/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./URNG -e
+
+test_URNGNoiseGL:
+	cd $(ocl_sample_dir)/cpp_cl/1.x/URNGNoiseGL/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./URNGNoiseGL -e
+
+clean-examples:
+	rm -fr $(testsuite_src_dir) Install-AMD-APP.sh default-install_lnx_64.pl ReadMe.txt
+
+# 2.0 samples
+
+test_BinarySearchDeviceSideEnqueue:
+	cd $(ocl_sample_dir)/cl/2.0/BinarySearchDeviceSideEnqueue/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BinarySearchDeviceSideEnqueue -e
+
+test_BufferImageInterop:
+	cd $(ocl_sample_dir)/cl/2.0/BufferImageInterop/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BufferImageInterop -e
+
+test_BuiltInScan:
+	cd $(ocl_sample_dir)/cl/2.0/BuiltInScan/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BuiltInScan -e
+
+test_CalcPie:
+	cd $(ocl_sample_dir)/cl/2.0/CalcPie/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./CalcPie -e
+
+test_DeviceEnqueueBFS:
+	cd $(ocl_sample_dir)/cl/2.0/DeviceEnqueueBFS/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DeviceEnqueueBFS -e
+
+test_ExtractPrimes:
+	cd $(ocl_sample_dir)/cl/2.0/ExtractPrimes/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ExtractPrimes -e
+
+test_FineGrainSVM:
+	cd $(ocl_sample_dir)/cl/2.0/FineGrainSVM/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FineGrainSVM -e
+
+test_FineGrainSVMCAS:
+	cd $(ocl_sample_dir)/cl/2.0/FineGrainSVMCAS/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FineGrainSVMCAS -e
+
+test_HeatPDE:
+	cd $(ocl_sample_dir)/cl/2.0/HeatPDE/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./HeatPDE -e
+
+test_ImageBinarization:
+	cd $(ocl_sample_dir)/cl/2.0/ImageBinarization/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ImageBinarization -e
+
+test_PipeProducerConsumerKernels:
+	cd $(ocl_sample_dir)/cl/2.0/PipeProducerConsumerKernels/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./PipeProducerConsumerKernels -e
+
+test_RangeMinimumQuery:
+	cd $(ocl_sample_dir)/cl/2.0/RangeMinimumQuery/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RangeMinimumQuery -e
+
+test_RecursiveGaussian_ProgramScope:
+	cd $(ocl_sample_dir)/cl/2.0/RecursiveGaussian_ProgramScope/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RecursiveGaussian_ProgramScope -e
+
+test_RegionGrowingSegmentation:
+	cd $(ocl_sample_dir)/cl/2.0/RegionGrowingSegmentation/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RegionGrowingSegmentation -e
+
+test_SimpleDepthImage:
+	cd $(ocl_sample_dir)/cl/2.0/SimpleDepthImage/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleDepthImage -e
+
+test_SimpleGenericAddressSpace:
+	cd $(ocl_sample_dir)/cl/2.0/SimpleGenericAddressSpace/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleGenericAddressSpace -e
+
+test_SimplePipe:
+	cd $(ocl_sample_dir)/cl/2.0/SimplePipe/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimplePipe -e
+
+test_SVMAtomicsBinaryTreeInsert:
+	cd $(ocl_sample_dir)/cl/2.0/SVMAtomicsBinaryTreeInsert/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SVMAtomicsBinaryTreeInsert -e
+
+test_SVMBinaryTreeSearch:
+	cd $(ocl_sample_dir)/cl/2.0/SVMBinaryTreeSearch/bin/x86_64/$(build_type)/ && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SVMBinaryTreeSearch -e
+
+endif
diff --git a/examples/AMDSDK3.0/Makefile.in b/examples/AMDSDK3.0/Makefile.in
new file mode 100644
index 0000000..e732cf8
--- /dev/null
+++ b/examples/AMDSDK3.0/Makefile.in
@@ -0,0 +1,939 @@
+# Makefile.in generated by automake 1.15 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# Process this file with automake to produce Makefile.in (in this,
+# and all subdirectories).
+# Makefile.am for pocl/examples/AMD.
+#
+# Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
+# Copyright (c) 2012 Vincent Danjean <Vincent.Danjean at ens-lyon.org>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = examples/AMDSDK3.0
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
+	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+am__DIST_COMMON = $(srcdir)/Makefile.in
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
+BOOST_LDFLAGS = @BOOST_LDFLAGS@
+BUILD_TIMESTAMP = @BUILD_TIMESTAMP@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CLANG = @CLANG@
+CLANGXX = @CLANGXX@
+CLANGXX_FLAGS = @CLANGXX_FLAGS@
+CLFLAGS = @CLFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FGREP = @FGREP@
+FORCED_CLFLAGS = @FORCED_CLFLAGS@
+GLEW_CFLAGS = @GLEW_CFLAGS@
+GLEW_LIBS = @GLEW_LIBS@
+GREP = @GREP@
+HOST = @HOST@
+HOST_AS_FLAGS = @HOST_AS_FLAGS@
+HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
+HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
+HOST_LD_FLAGS = @HOST_LD_FLAGS@
+HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
+HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
+HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
+HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
+HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
+HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
+HSA_INCLUDES = @HSA_INCLUDES@
+HSA_LIBS = @HSA_LIBS@
+HWLOC_CFLAGS = @HWLOC_CFLAGS@
+HWLOC_LIBS = @HWLOC_LIBS@
+ICD_LD_FLAGS = @ICD_LD_FLAGS@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+KERNEL_COMPILER_LIB_VERSION = @KERNEL_COMPILER_LIB_VERSION@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LD_FLAGS_BIN = @LD_FLAGS_BIN@
+LIBOBJS = @LIBOBJS@
+LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIB_AGE_VERSION = @LIB_AGE_VERSION@
+LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
+LIB_FIRST_VERSION = @LIB_FIRST_VERSION@
+LIB_REVISION_VERSION = @LIB_REVISION_VERSION@
+LIB_VERSION = @LIB_VERSION@
+LIPO = @LIPO@
+LLC = @LLC@
+LLVM_AS = @LLVM_AS@
+LLVM_CONFIG = @LLVM_CONFIG@
+LLVM_CXX_FLAGS = @LLVM_CXX_FLAGS@
+LLVM_LDFLAGS = @LLVM_LDFLAGS@
+LLVM_LIBS = @LLVM_LIBS@
+LLVM_LINK = @LLVM_LINK@
+LLVM_OPT = @LLVM_OPT@
+LLVM_VERSION = @LLVM_VERSION@
+LN_S = @LN_S@
+LTDL_LIBS = @LTDL_LIBS@
+LTLIBOBJS = @LTLIBOBJS@
+LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCL_ICD_CFLAGS = @OCL_ICD_CFLAGS@
+OCL_ICD_LIBS = @OCL_ICD_LIBS@
+OCL_KERNEL_ARCH = @OCL_KERNEL_ARCH@
+OCL_KERNEL_TARGET = @OCL_KERNEL_TARGET@
+OCL_KERNEL_TARGET_CPU = @OCL_KERNEL_TARGET_CPU@
+OCL_TARGETS = @OCL_TARGETS@
+OPENCL_CFLAGS = @OPENCL_CFLAGS@
+OPENCL_CMAKE = @OPENCL_CMAKE@
+OPENCL_EXTLIBS = @OPENCL_EXTLIBS@
+OPENCL_LIBS = @OPENCL_LIBS@
+OPT = @OPT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PKG_CONFIG = @PKG_CONFIG@
+PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
+PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
+POAT_TESTSUITES = @POAT_TESTSUITES@
+POCL_DEVICE_ADDRESS_BITS = @POCL_DEVICE_ADDRESS_BITS@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SDL_CFLAGS = @SDL_CFLAGS@
+SDL_LIBS = @SDL_LIBS@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+TARGET = @TARGET@
+TARGET_CLANG_FLAGS = @TARGET_CLANG_FLAGS@
+TARGET_CPU = @TARGET_CPU@
+TARGET_LLC_FLAGS = @TARGET_LLC_FLAGS@
+TARGET_SIZEOF_DOUBLE = @TARGET_SIZEOF_DOUBLE@
+TARGET_SIZEOF_HALF = @TARGET_SIZEOF_HALF@
+TARGET_SIZEOF_LONG = @TARGET_SIZEOF_LONG@
+TARGET_SIZEOF_VOID_P = @TARGET_SIZEOF_VOID_P@
+TCECC = @TCECC@
+TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
+TCE_AVAILABLE = @TCE_AVAILABLE@
+TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+EXTRA_DIST = $(srcdir)/*.patch
+ at TEST_SUITE_AMDSDK3_0_TRUE@testsuite_pocl_dir = ${abs_top_srcdir}/examples/AMDSDK3.0
+ at TEST_SUITE_AMDSDK3_0_TRUE@testsuite_src_dir = ${testsuite_pocl_dir}/AMD-APP-SDK-3.0
+ at TEST_SUITE_AMDSDK3_0_TRUE@ocl_sample_dir = ${testsuite_pocl_dir}/AMD-APP-SDK-3.0/samples/opencl
+ at TEST_SUITE_AMDSDK3_0_TRUE@tar_bz = AMD-APP-SDKInstaller-v3.0.130.135-GA-linux64.tar.bz2
+ at TEST_SUITE_AMDSDK3_0_TRUE@tar_sh = AMD-APP-SDK-v3.0.130.135-GA-linux64.sh
+ at TEST_SUITE_AMDSDK3_0_TRUE@build_type = RelWithDebInfo
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign examples/AMDSDK3.0/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --foreign examples/AMDSDK3.0/Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure:  $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+tags TAGS:
+
+ctags CTAGS:
+
+cscope cscopelist:
+
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+	cscopelist-am ctags-am distclean distclean-generic \
+	distclean-libtool distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	maintainer-clean maintainer-clean-generic mostlyclean \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags-am uninstall uninstall-am
+
+.PRECIOUS: Makefile
+
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@.PHONY: build prepare-examples
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@prepare-examples: $(testsuite_src_dir) build
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@$(tar_sh): $(tar_bz)
+ at TEST_SUITE_AMDSDK3_0_TRUE@	test -f $(tar_sh) || (cd $(testsuite_pocl_dir) && tar xjf $(tar_bz) )
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@$(testsuite_src_dir): $(tar_sh)
+ at TEST_SUITE_AMDSDK3_0_TRUE@	test -d $(testsuite_src_dir) || (cd $(testsuite_pocl_dir) && ./$(tar_sh) --noexec --keep --target AMD-APP-SDK-3.0 && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	( patch -sNp1 < $(testsuite_pocl_dir)/amdsdk3_0.patch || true ) && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	find $(testsuite_src_dir)/samples -name "CMakeLists.txt" -exec sed -i \
+ at TEST_SUITE_AMDSDK3_0_TRUE@		-e 's/mark_as_advanced( OPENCL_LIBRARIES )/set(OPENCL_LIBRARIES "-lOpenCL")/g' \
+ at TEST_SUITE_AMDSDK3_0_TRUE@		-e 's/mark_as_advanced(OPENCL_LIBRARIES)/set(OPENCL_LIBRARIES "-lOpenCL")/g' "{}" \;)
+
+# Some of the tests do not build with OpenGL 3.0. Just skip them by using make -k.
+ at TEST_SUITE_AMDSDK3_0_TRUE@build: $(testsuite_src_dir)
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(testsuite_src_dir)/samples && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	LDFLAGS="-pthread ${LDFLAGS}" cmake -DCMAKE_BUILD_TYPE=$(build_type) -DBUILD_OPENCL=ON -DBUILD_OPENCV=OFF \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	-DBUILD_BOLT=OFF . && cmake --build . -- -k -j`getconf _NPROCESSORS_ONLN`
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_AsyncDataTransfer:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/AsyncDataTransfer/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./AsyncDataTransfer -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_AtomicCounters:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/AtomicCounters/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./AtomicCounters -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BasicDebug:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/BasicDebug/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BasicDebug -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BinomialOption:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/BinomialOption/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BinomialOption -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BinomialOptionMultiGPU:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/BinomialOptionMultiGPU/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BinomialOptionMultiGPU -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BitonicSort:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/BitonicSort/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BitonicSort -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BlackScholes:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/BlackScholes/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BlackScholes -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BlackScholesDP:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/BlackScholesDP/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BlackScholesDP -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BoxFilter:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/BoxFilter/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BoxFilter -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BoxFilterGL:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/BoxFilterGL/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BoxFilterGL -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BufferBandwidth:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/benchmark/BufferBandwidth/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BufferBandwidth -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_ConcurrentKernel:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/ConcurrentKernel/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ConcurrentKernel -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_ConstantBandwidth:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/benchmark/ConstantBandwidth/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ConstantBandwidth -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_CplusplusWrapper:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/CplusplusWrapper/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./CplusplusWrapper -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_DCT:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/DCT/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DCT -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_DeviceFission:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/DeviceFission/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DeviceFission -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_DeviceFission11Ext:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/DeviceFission11Ext/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DeviceFission11Ext -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_DwtHaar1D:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/DwtHaar1D/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DwtHaar1D -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_DwtHaar1DCPPKernel:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/DwtHaar1DCPPKernel/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DwtHaar1DCPPKernel -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_Eigenvalue:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/Eigenvalue/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Eigenvalue -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_FastWalshTransform:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/FastWalshTransform/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FastWalshTransform -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_FFT:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/FFT/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FFT -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_FloydWarshall:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/FloydWarshall/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FloydWarshall -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_FluidSimulation2D:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/FluidSimulation2D/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FluidSimulation2D -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_GaussianNoise:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/GaussianNoise/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./GaussianNoise -e --platformId 0
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_GaussianNoiseGL:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/GaussianNoiseGL/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./GaussianNoiseGL -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_GlobalMemoryBandwidth:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/benchmark/GlobalMemoryBandwidth/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./GlobalMemoryBandwidth -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_HDRToneMapping:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/HDRToneMapping/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./HDRToneMapping -e --platformId 0
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_HelloWorld:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/HelloWorld/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./HelloWorld -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_Histogram:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/Histogram/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Histogram -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_HistogramAtomics:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/HistogramAtomics/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./HistogramAtomics -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_ImageBandwidth:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/ImageBandwidth/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ImageBandwidth -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_ImageOverlap:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/ImageOverlap/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ImageOverlap -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_IntroStaticCPPKernel:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/IntroStaticCPPKernel/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./IntroStaticCPPKernel -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_KernelLaunch:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/KernelLaunch/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./KernelLaunch -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_KmeansAutoclustering:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/KmeansAutoclustering/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./KmeansAutoclustering -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_LDSBandwidth:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/benchmark/LDSBandwidth/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./LDSBandwidth -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_LUDecomposition:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/LUDecomposition/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./LUDecomposition -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_Mandelbrot:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/Mandelbrot/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Mandelbrot -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MatrixMulDouble:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/MatrixMulDouble/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MatrixMulDouble -e --platformId 0
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MatrixMulImage:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/MatrixMulImage/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MatrixMulImage -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MatrixMultiplication:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/MatrixMultiplication/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MatrixMultiplication -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MatrixTranspose:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/MatrixTranspose/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MatrixTranspose -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MemoryModel:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/MemoryModel/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MemoryModel -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MemoryOptimizations:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/benchmark/MemoryOptimizations/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MemoryOptimizations -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MersenneTwister:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/MersenneTwister/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MersenneTwister -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MonteCarloAsian:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/MonteCarloAsian/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MonteCarloAsian -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MonteCarloAsianDP:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/MonteCarloAsianDP/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MonteCarloAsianDP -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_MonteCarloAsianMultiGPU:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/MonteCarloAsianMultiGPU/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./MonteCarloAsianMultiGPU -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_NBody:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/NBody/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./NBody -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_PrefixSum:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/PrefixSum/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./PrefixSum -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_QuasiRandomSequence:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/QuasiRandomSequence/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./QuasiRandomSequence -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_RadixSort:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/RadixSort/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RadixSort -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_RecursiveGaussian:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/RecursiveGaussian/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RecursiveGaussian -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_Reduction:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/Reduction/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Reduction -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_ScanLargeArrays:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/ScanLargeArrays/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ScanLargeArrays -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SimpleConvolution:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/SimpleConvolution/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleConvolution -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SimpleGL:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/SimpleGL/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleGL -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SimpleImage:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/SimpleImage/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleImage -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SimpleSPIR:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/SimpleSPIR/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleSPIR -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SoAversusAoS:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/SoAversusAoS/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SoAversusAoS -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SobelFilter:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/SobelFilter/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SobelFilter -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SobelFilterImage:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/SobelFilterImage/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SobelFilterImage -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_StringSearch:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/StringSearch/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./StringSearch -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_Template:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/Template/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./Template -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_TransferOverlap:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/TransferOverlap/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./TransferOverlap -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_TransferOverlapCPP:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/TransferOverlapCPP/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./TransferOverlapCPP -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_UnsharpMask:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/UnsharpMask/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./UnsharpMask -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_URNG:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/1.x/URNG/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./URNG -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_URNGNoiseGL:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cpp_cl/1.x/URNGNoiseGL/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./URNGNoiseGL -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@clean-examples:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	rm -fr $(testsuite_src_dir) Install-AMD-APP.sh default-install_lnx_64.pl ReadMe.txt
+
+# 2.0 samples
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BinarySearchDeviceSideEnqueue:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/BinarySearchDeviceSideEnqueue/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BinarySearchDeviceSideEnqueue -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BufferImageInterop:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/BufferImageInterop/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BufferImageInterop -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_BuiltInScan:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/BuiltInScan/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./BuiltInScan -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_CalcPie:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/CalcPie/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./CalcPie -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_DeviceEnqueueBFS:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/DeviceEnqueueBFS/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./DeviceEnqueueBFS -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_ExtractPrimes:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/ExtractPrimes/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ExtractPrimes -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_FineGrainSVM:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/FineGrainSVM/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FineGrainSVM -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_FineGrainSVMCAS:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/FineGrainSVMCAS/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./FineGrainSVMCAS -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_HeatPDE:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/HeatPDE/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./HeatPDE -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_ImageBinarization:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/ImageBinarization/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./ImageBinarization -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_PipeProducerConsumerKernels:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/PipeProducerConsumerKernels/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./PipeProducerConsumerKernels -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_RangeMinimumQuery:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/RangeMinimumQuery/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RangeMinimumQuery -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_RecursiveGaussian_ProgramScope:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/RecursiveGaussian_ProgramScope/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RecursiveGaussian_ProgramScope -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_RegionGrowingSegmentation:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/RegionGrowingSegmentation/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./RegionGrowingSegmentation -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SimpleDepthImage:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/SimpleDepthImage/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleDepthImage -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SimpleGenericAddressSpace:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/SimpleGenericAddressSpace/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimpleGenericAddressSpace -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SimplePipe:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/SimplePipe/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SimplePipe -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SVMAtomicsBinaryTreeInsert:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/SVMAtomicsBinaryTreeInsert/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SVMAtomicsBinaryTreeInsert -e
+
+ at TEST_SUITE_AMDSDK3_0_TRUE@test_SVMBinaryTreeSearch:
+ at TEST_SUITE_AMDSDK3_0_TRUE@	cd $(ocl_sample_dir)/cl/2.0/SVMBinaryTreeSearch/bin/x86_64/$(build_type)/ && \
+ at TEST_SUITE_AMDSDK3_0_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./SVMBinaryTreeSearch -e
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/examples/AMDSDK3.0/amdsdk3_0.patch b/examples/AMDSDK3.0/amdsdk3_0.patch
new file mode 100644
index 0000000..0be15f4
--- /dev/null
+++ b/examples/AMDSDK3.0/amdsdk3_0.patch
@@ -0,0 +1,50 @@
+--- a/AMD-APP-SDK-3.0/samples/opencl/cl/2.0/FineGrainSVMCAS/FineGrainSVMCAS_Kernels.cl	2015-12-02 18:29:15.569686180 +0100
++++ b/AMD-APP-SDK-3.0/samples/opencl/cl/2.0/FineGrainSVMCAS/FineGrainSVMCAS_Kernels.cl	2015-12-02 18:29:36.613019709 +0100
+@@ -26,7 +26,7 @@
+ 	if (i != get_global_size(0)) {
+ 		do {			
+ 				list[i] = head;
+-		   } while (!atomic_compare_exchange_strong((atomic_int *)&list[0], &head, i));
++		   } while (!atomic_compare_exchange_strong((global atomic_int *)&list[0], &head, i));
+ 		}
+ }
+ 
+@@ -40,6 +40,6 @@
+ 		do {
+ 			if (head == 0) return;
+ 			next = list[list[0]];
+-		} while (!atomic_compare_exchange_strong ((atomic_int *)&list[0], &head, next ));
++		} while (!atomic_compare_exchange_strong ((global atomic_int *)&list[0], &head, next ));
+ 	}
+ }
+--- a/AMD-APP-SDK-3.0/samples/opencl/cl/2.0/CalcPie/CalcPie_Kernels.cl	2015-12-02 18:26:51.179684865 +0100
++++ b/AMD-APP-SDK-3.0/samples/opencl/cl/2.0/CalcPie/CalcPie_Kernels.cl	2015-12-02 18:27:01.906351623 +0100
+@@ -24,7 +24,7 @@
+ 	 
+   float r = sqrt((rX[i]*rX[i])+(rY[i]*rY[i]));
+   if (r <= 1) 
+-	atomic_fetch_add_explicit ((atomic_int *)inside, 1, memory_order_seq_cst, memory_scope_device);
++	atomic_fetch_add_explicit ((global atomic_int *)inside, 1, memory_order_seq_cst, memory_scope_device);
+ }
+ 
+ 
+--- a/AMD-APP-SDK-3.0/include/SDKUtil/CLUtil.hpp	2015-12-02 19:19:06.119713395 +0100
++++ b/AMD-APP-SDK-3.0/include/SDKUtil/CLUtil.hpp	2015-12-02 19:22:34.209715291 +0100
+@@ -749,7 +749,7 @@
+ 				case CL_DEVICE_TYPE_GPU :
+ 				{
+ 					//first find platform having GPU
+-					platformFound = getDefaultPlatform(numPlatforms, platforms, platform, dType);
++					platformFound = getDefaultPlatform(numPlatforms, platforms, platform, CL_DEVICE_TYPE_GPU);
+ 					if(platformFound)
+ 					{
+ 						break;
+@@ -763,7 +763,7 @@
+ 					//then find platform having CPU
+ 					if(!platformFound)
+ 					{
+-						platformFound = getDefaultPlatform(numPlatforms, platforms, platform, dType);
++						platformFound = getDefaultPlatform(numPlatforms, platforms, platform, CL_DEVICE_TYPE_CPU);
+ 					}
+ 				} /*end of CPU case*/
+ 
diff --git a/examples/ASL/CMakeLists.txt b/examples/ASL/CMakeLists.txt
new file mode 100644
index 0000000..9fa49f0
--- /dev/null
+++ b/examples/ASL/CMakeLists.txt
@@ -0,0 +1,188 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "ASL")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+set(ASL_TGZ "${TS_SRCDIR}/v0.1.6.tar.gz")
+
+# find required libs - boost & vtk
+find_package(Boost 1.55)
+find_package(VTK)
+set(VTK_VER "${VTK_MAJOR_VERSION}.${VTK_MINOR_VERSION}")
+
+if ((NOT "${CMAKE_VERSION}" VERSION_LESS "3.0.2")
+    AND VTK_FOUND AND (NOT VTK_VER VERSION_LESS "6.1")
+    AND Boost_FOUND AND (NOT Boost_VERSION VERSION_LESS "1.55"))
+
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    # GIT_REPOSITORY "https://github.com/AvtechScientific/ASL"
+    URL "https://github.com/AvtechScientific/ASL/archive/v0.1.6.tar.gz"
+    CMAKE_ARGS
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo
+      -DWITH_API_DOC:BOOL=OFF
+      -DWITH_EXAMPLES:BOOL=ON
+      -DWITH_MATIO:BOOL=OFF
+      -DWITH_TESTS:BOOL=ON
+      "-DCMAKE_CXX_FLAGS_RELWITHDEBINFO=-O2 -g -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+      "-DCMAKE_C_FLAGS_RELWITHDEBINFO=-O2 -g -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+    INSTALL_COMMAND /bin/true
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+  add_test(NAME ASL_testABDFormat
+           COMMAND "${TS_BUILDDIR}/test/testABD/testABDFormat")
+  add_test(NAME ASL_testVectorOfElements
+           COMMAND "${TS_BUILDDIR}/test/testACL/testVectorOfElements")
+  add_test(NAME ASL_testMatrixOfElements
+           COMMAND "${TS_BUILDDIR}/test/testACL/testMatrixOfElements")
+  add_test(NAME ASL_testKernel
+           COMMAND "${TS_BUILDDIR}/test/testACL/testKernel")
+  add_test(NAME ASL_testOperators
+           COMMAND "${TS_BUILDDIR}/test/testACL/testOperators")
+  add_test(NAME ASL_testKernelMerger
+           COMMAND "${TS_BUILDDIR}/test/testACL/testKernelMerger")
+  add_test(NAME ASL_testPrivateVar
+           COMMAND "${TS_BUILDDIR}/test/testACL/testPrivateVar")
+  add_test(NAME ASL_testASLData
+           COMMAND "${TS_BUILDDIR}/test/testMath/testASLData")
+  add_test(NAME ASL_testDistanceFunction
+           COMMAND "${TS_BUILDDIR}/test/testMath/testDistanceFunction")
+  add_test(NAME ASL_testReductionFunction
+           COMMAND "${TS_BUILDDIR}/test/testMath/testReductionFunction")
+
+
+  add_test(NAME ASL_example_bus_wind
+           COMMAND "${TS_BUILDDIR}/examples/flow/bus_wind/asl-bus_wind")
+  add_test(NAME ASL_example_compressor
+           COMMAND "${TS_BUILDDIR}/examples/flow/compressor/asl-compressor")
+  add_test(NAME ASL_example_flow
+           COMMAND "${TS_BUILDDIR}/examples/flow/flow/asl-flow")
+  add_test(NAME ASL_example_flow2
+           COMMAND "${TS_BUILDDIR}/examples/flow/flow2/asl-flow2")
+  add_test(NAME ASL_example_flow3
+           COMMAND "${TS_BUILDDIR}/examples/flow/flow3/asl-flow3")
+  add_test(NAME ASL_example_flowKDPGrowth
+           COMMAND "${TS_BUILDDIR}/examples/flow/flowKDPGrowth/asl-flowKDPGrowth")
+  add_test(NAME ASL_example_flowRotatingCylinders
+           COMMAND "${TS_BUILDDIR}/examples/flow/flowRotatingCylinders/asl-flowRotatingCylinders")
+  add_test(NAME ASL_example_locomotive
+           COMMAND "${TS_BUILDDIR}/examples/flow/locomotive/asl-locomotive")
+  add_test(NAME ASL_example_locomotive_laminar
+           COMMAND "${TS_BUILDDIR}/examples/flow/locomotive_laminar/asl-locomotive_laminar")
+  add_test(NAME ASL_example_locomotive_stability
+           COMMAND "${TS_BUILDDIR}/examples/flow/locomotive_stability/asl-locomotive_stability")
+  add_test(NAME ASL_example_multicomponent_flow
+           COMMAND "${TS_BUILDDIR}/examples/flow/multicomponent_flow/asl-multicomponent_flow")
+  add_test(NAME ASL_example_multiphase_flow
+           COMMAND "${TS_BUILDDIR}/examples/flow/multiphase_flow/asl-multiphase_flow")
+  add_test(NAME ASL_example_pitot_tube_ice
+           COMMAND "${TS_BUILDDIR}/examples/flow/pitot_tube_ice/asl-pitot_tube_ice")
+  add_test(NAME ASL_example_acousticWaves
+           COMMAND "${TS_BUILDDIR}/examples/elastic/acousticWaves/asl-acousticWaves")
+  add_test(NAME ASL_example_cubeGravity
+           COMMAND "${TS_BUILDDIR}/examples/elastic/cubeGravity/asl-cubeGravity")
+  add_test(NAME ASL_example_cubeIncompressibleGravity
+           COMMAND "${TS_BUILDDIR}/examples/elastic/cubeIncompressibleGravity/asl-cubeIncompressibleGravity")
+  add_test(NAME ASL_example_cubePoroelasticGravity
+           COMMAND "${TS_BUILDDIR}/examples/elastic/cubePoroelasticGravity/asl-cubePoroelasticGravity")
+  add_test(NAME ASL_example_poroelastic
+           COMMAND "${TS_BUILDDIR}/examples/elastic/poroelastic/asl-poroelastic")
+  add_test(NAME ASL_example_levelSetBasic
+           COMMAND "${TS_BUILDDIR}/examples/levelSet/levelSetBasic/asl-levelSetBasic")
+  add_test(NAME ASL_example_levelSetFacetedGrowth
+           COMMAND "${TS_BUILDDIR}/examples/levelSet/levelSetFacetedGrowth/asl-levelSetFacetedGrowth")
+  add_test(NAME ASL_example_levelSetNormalGrowth
+           COMMAND "${TS_BUILDDIR}/examples/levelSet/levelSetNormalGrowth/asl-levelSetNormalGrowth")
+  add_test(NAME ASL_example_jumpingBox
+           COMMAND "${TS_BUILDDIR}/examples/jumpingObjects/jumpingBox/asl-jumpingBox")
+  add_test(NAME ASL_example_surfaceFlux
+           COMMAND "${TS_BUILDDIR}/examples/heatTransfer/surfaceFlux/asl-surfaceFlux")
+  add_test(NAME ASL_example_testSMDiff
+           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMDiff/asl-testSMDiff")
+  add_test(NAME ASL_example_testSMDiff3C
+           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMDiff3C/asl-testSMDiff3C")
+  add_test(NAME ASL_example_testSMPhi
+           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMPhi/asl-testSMPhi")
+  add_test(NAME ASL_example_testSMPhiBV
+           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMPhiBV/asl-testSMPhiBV")
+
+
+  set_tests_properties(
+    ASL_testABDFormat
+    ASL_testVectorOfElements
+    ASL_testMatrixOfElements
+    ASL_testKernel
+    ASL_testOperators
+    ASL_testKernelMerger
+    ASL_testPrivateVar
+    ASL_testASLData
+    ASL_testDistanceFunction
+    ASL_testReductionFunction
+    ASL_example_bus_wind
+    ASL_example_compressor
+    ASL_example_flow
+    ASL_example_flow2
+    ASL_example_flow3
+    ASL_example_flowKDPGrowth
+    ASL_example_flowRotatingCylinders
+    ASL_example_locomotive
+    ASL_example_locomotive_laminar
+    ASL_example_locomotive_stability
+    ASL_example_multicomponent_flow
+    ASL_example_multiphase_flow
+    ASL_example_pitot_tube_ice
+    ASL_example_acousticWaves
+    ASL_example_cubeGravity
+    ASL_example_cubeIncompressibleGravity
+    ASL_example_cubePoroelasticGravity
+    ASL_example_poroelastic
+    ASL_example_levelSetBasic
+    ASL_example_levelSetFacetedGrowth
+    ASL_example_levelSetNormalGrowth
+    ASL_example_jumpingBox
+    ASL_example_surfaceFlux
+    ASL_example_testSMDiff
+    ASL_example_testSMDiff3C
+    ASL_example_testSMPhi
+    ASL_example_testSMPhiBV
+
+    PROPERTIES
+      LABELS "ASL")
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required files not found" )
+
+endif()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c13e994..0e9dbc9 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -23,36 +23,72 @@
 #
 #=============================================================================
 
-if(NOT DEFINED ENABLED_TESTSUITES)
-  message(STATUS "Disabling all testsuites")
-else()
+add_subdirectory("example1")
+add_subdirectory("example1-spir32")
+add_subdirectory("example1-spir64")
+add_subdirectory("example2")
+add_subdirectory("example2a")
+add_subdirectory("standalone")
+add_subdirectory("scalarwave")
+add_subdirectory("trig")
+add_subdirectory("EinsteinToolkit")
+
+# TODO:   opencl-book-samples  PyOpenCL
+set(ALL_TESTSUITES
+    AMD AMDSDK2.9 AMDSDK3.0
+    ASL arrayfire clBLAS clFFT
+    CloverLeaf Halide IntelSVM
+    opencl-book-samples OpenCV
+    Parboil piglit PyOpenCL
+    Rodinia VexCL ViennaCL)
+
+if("${ENABLE_TESTSUITES}" STREQUAL "all")
+  set(ENABLE_TESTSUITES ${ALL_TESTSUITES})
+endif()
+
+message(STATUS "Trying to enable testsuites: ${ENABLE_TESTSUITES}")
+
+include(ExternalProject)
+
+set(ACTUALLY_ENABLED_TESTSUITES "")
+
+# invoke this to build all examples
+add_custom_target(prepare_examples)
 
-  include(ExternalProject)
+if(ENABLE_TESTSUITES)
 
-  if("${ENABLED_TESTSUITES}" STREQUAL "all")
-    message(STATUS "Enabling all testsuites")
-    set(ENABLED_TESTSUITES ${ALL_TESTSUITES})
+  if(NOT DEFINED TESTSUITE_BASEDIR)
+    # TODO maybe current src dir ?
+    set(TESTSUITE_BASEDIR "${CMAKE_CURRENT_BINARY_DIR}")
+  endif()
+  if(NOT IS_DIRECTORY "${TESTSUITE_BASEDIR}")
+    message(FATAL_ERROR "TESTSUITE_BASEDIR (${TESTSUITE_BASEDIR}) is not a directory.")
+  endif()
+
+  if(NOT DEFINED TESTSUITE_SOURCE_BASEDIR)
+    set(TESTSUITE_SOURCE_BASEDIR "${CMAKE_CURRENT_SOURCE_DIR}")
+  endif()
+  if(NOT IS_DIRECTORY "${TESTSUITE_SOURCE_BASEDIR}")
+    message(FATAL_ERROR "TESTSUITE_SOURCE_BASEDIR (${TESTSUITE_SOURCE_BASEDIR}) is not a directory.")
   endif()
 
-  foreach(TESTSUITE IN LISTS ENABLED_TESTSUITES)
+  message(STATUS "Testsuite base dirs: ")
+  message(STATUS "    binary: ${TESTSUITE_BASEDIR}")
+  message(STATUS "    source: ${TESTSUITE_SOURCE_BASEDIR}")
+
+  foreach(TESTSUITE IN LISTS ENABLE_TESTSUITES)
     if(ALL_TESTSUITES MATCHES ${TESTSUITE})
-      message(STATUS "Enabling testsuite ${TESTSUITE}")
-      add_subdirectory("${TESTSUITE}")
+      if(IS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/${TESTSUITE}")
+        add_subdirectory("${TESTSUITE}")
+      else()
+        message(FATAL_ERROR "Cannot find source dir for testsuite: ${TESTSUITE}")
+      endif()
     else()
       message(WARNING "Unknown testsuite ${TESTSUITE} requested")
     endif()
   endforeach()
-
 endif()
-set(ENABLED_TESTSUITES ${ENABLED_TESTSUITES} PARENT_SCOPE)
 
-
-add_subdirectory("example1")
-add_subdirectory("example1-spir32")
-add_subdirectory("example1-spir64")
-add_subdirectory("example2")
-add_subdirectory("example2a")
-add_subdirectory("standalone")
-add_subdirectory("scalarwave")
-add_subdirectory("trig")
+set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+set(ALL_TESTSUITES ${ALL_TESTSUITES} PARENT_SCOPE)
 
diff --git a/examples/CloverLeaf/CMakeLists.txt b/examples/CloverLeaf/CMakeLists.txt
new file mode 100644
index 0000000..0116be9
--- /dev/null
+++ b/examples/CloverLeaf/CMakeLists.txt
@@ -0,0 +1,82 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME CloverLeaf)
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+
+message(STATUS "Enabling testsuite ${TS_NAME}")
+list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+if(UNIX)
+  pkg_check_modules(OPENMPI ompi)
+  if (OPENMPI_FOUND)
+    set(EXTRA_LIB_HINTS "HINTS" "${OPENMPI_LIBRARY_DIRS}")
+  endif()
+endif()
+
+#You need to have "openmpi" and "gfortran" packages installed to build it.
+find_program(MPI_F90 mpif90)
+find_program(MPI_CC  mpicc)
+find_program(MPI_CXX  mpiCC)
+find_library(MPI_LIB mpi_cxx ${EXTRA_LIB_HINTS})
+
+if (MPI_F90 AND MPI_CC AND MPI_CXX AND MPI_LIB)
+
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    #DOWNLOAD_COMMAND "/bin/true"
+    GIT_REPOSITORY "https://github.com/UK-MAC/CloverLeaf_OpenCL.git"
+    GIT_TAG "Bristol"
+    CONFIGURE_COMMAND "/bin/true"
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND
+      make COMPILER=GNU
+      "C_OPTIONS=-g -I${CMAKE_SOURCE_DIR}/include -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+      "MPI_COMPILER=${MPI_F90}"
+      "C_MPI_COMPILER=${MPI_CC}"
+      "CXX_MPI_COMPILER=${MPI_CXX}"
+      "MPICXX_LIB=${MPI_LIB}"
+    INSTALL_COMMAND "/bin/true"
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+  add_test(NAME CloverLeaf
+           COMMAND "${TS_BUILDDIR}/clover_leaf")
+
+  set_tests_properties(CloverLeaf
+    PROPERTIES
+      LABELS "CloverLeaf")
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required files not found" )
+
+endif()
diff --git a/examples/CloverLeaf/Makefile.in b/examples/CloverLeaf/Makefile.in
index 61d6061..dd559eb 100644
--- a/examples/CloverLeaf/Makefile.in
+++ b/examples/CloverLeaf/Makefile.in
@@ -192,6 +192,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -199,6 +200,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -216,8 +218,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -293,6 +293,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/example2/CMakeLists.txt b/examples/EinsteinToolkit/CMakeLists.txt
similarity index 73%
copy from examples/example2/CMakeLists.txt
copy to examples/EinsteinToolkit/CMakeLists.txt
index 19c55d7..7fa734b 100644
--- a/examples/example2/CMakeLists.txt
+++ b/examples/EinsteinToolkit/CMakeLists.txt
@@ -1,7 +1,7 @@
 #=============================================================================
 #   CMake build system files
 #
-#   Copyright (c) 2014 pocl developers
+#   Copyright (c) 2015 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -26,25 +26,21 @@
 #AM_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -DSRCDIR='"$(abs_srcdir)"'
 add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 
-# example1_CFLAGS = @OPENCL_CFLAGS@
+# EinsteinToolkit_CFLAGS = @OPENCL_CFLAGS@
 #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${OPENCL_CFLAGS}")
-add_compile_options(${OPENCL_CFLAGS})
-
+add_compile_options(${OPENCL_CFLAGS} -std=c99)
 
 if (MSVC)
-  set_source_files_properties( example2.c PROPERTIES LANGUAGE CXX )
+  set_source_files_properties( EinsteinToolkit.c PROPERTIES LANGUAGE CXX )
 endif(MSVC)
-add_executable("example2" example2.c example2.cl)
+add_executable("EinsteinToolkit" EinsteinToolkit.c ML_BSSN_CL_RHS1.cl ML_BSSN_CL_RHS2.cl)
 
-# example1_LDADD = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
-target_link_libraries("example2" ${POCLU_LINK_OPTIONS})
+# EinsteinToolkit_LDADD = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
+target_link_libraries("EinsteinToolkit" ${POCLU_LINK_OPTIONS})
 
-add_test("spec_tests/example2_matrix_transpose" "example2")
+add_test(NAME "EinsteinToolkit" COMMAND "EinsteinToolkit")
 
-set_tests_properties( "spec_tests/example2_matrix_transpose"
+set_tests_properties( "EinsteinToolkit"
   PROPERTIES
-    COST 3.0
-    PASS_REGULAR_EXPRESSION "OK\n"
-    PROCESSORS 1
-    LABELS "OpenCL_Spec"
+    LABELS "internal;Einstein"
     DEPENDS "pocl_version_check")
diff --git a/examples/EinsteinToolkit/Makefile.in b/examples/EinsteinToolkit/Makefile.in
index 4433cc1..7dc5d75 100644
--- a/examples/EinsteinToolkit/Makefile.in
+++ b/examples/EinsteinToolkit/Makefile.in
@@ -247,6 +247,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -254,6 +255,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -271,8 +273,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -348,6 +348,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/Halide/CMakeLists.txt b/examples/Halide/CMakeLists.txt
new file mode 100644
index 0000000..65f7865
--- /dev/null
+++ b/examples/Halide/CMakeLists.txt
@@ -0,0 +1,1076 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME Halide)
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+
+message(STATUS "Enabling testsuite ${TS_NAME}")
+list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+# Halide disabled for now, need to find out which tests actually use OpenCL
+if(0)
+
+ExternalProject_Add(
+  ${TS_NAME}
+  PREFIX "${TS_BASEDIR}"
+  #DOWNLOAD_COMMAND "/bin/true"
+  GIT_REPOSITORY "https://github.com/Halide/Halide.git"
+  #PATCH_COMMAND /bin/sh "${AMD_APP_SDK_TGZ}" --noexec --keep --target AMD-APP-SDK-3.0 &&
+  #     patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/amdsdk3_0.patch
+  CMAKE_ARGS
+    -DCMAKE_BUILD_TYPE=RelWithDebInfo
+    -DTARGET_ARM=OFF
+    -DTARGET_AARCH64=OFF
+    -DTARGET_MIPS=OFF
+    -DTARGET_METAL=OFF
+    -DTARGET_OPENCL=ON
+    -DTARGET_OPENGL=OFF
+    -DTARGET_OPENGLCOMPUTE=OFF
+    -DTARGET_PTX=OFF
+    -DTARGET_X86=ON
+  CMAKE_CACHE_ARGS
+    "-DLLVM_BIN:STRING=${LLVM_BINDIR}"
+    "-DLLVM_LIB:STRING=${LLVM_LIBDIR}"
+    "-DLLVM_INCLUDE:STRING=${LLVM_INCLUDEDIR}"
+    "-DLLVM_VERSION:STRING=${LLVM_MAJOR}${LLVM_MINOR}"
+  INSTALL_COMMAND "/bin/true"
+)
+
+set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+add_dependencies(prepare_examples ${TS_NAME})
+
+# TODO Probably not tests
+# acquire_release.generator argvcall.generator cleanup_on_error.generator
+# embed_image.generator example.generator extended_buffer_t.generator
+# error_codes.generator gpu_object_lifetime.generator gpu_only.generator
+# mandelbrot.generator matlab.generator metadata_tester.generator
+# nested_externs.generator paramtest.generator pyramid.generator
+# runtime.generator tiled_blur.generator tiled_blur_blur.generator
+# user_context.generator user_context_insanity.generator
+
+add_test(NAME halide_bilateral_grid
+         COMMAND "${TS_BUILDDIR}/bin/bilateral_grid")
+add_test(NAME halide_bitcode2cpp
+         COMMAND "${TS_BUILDDIR}/bin/bitcode2cpp")
+add_test(NAME halide_blur_test
+         COMMAND "${TS_BUILDDIR}/bin/blur_test")
+add_test(NAME halide_build_halide_h
+         COMMAND "${TS_BUILDDIR}/bin/build_halide_h")
+add_test(NAME halide_camera_pipe
+         COMMAND "${TS_BUILDDIR}/bin/camera_pipe")
+add_test(NAME halide_correctness_argmax
+         COMMAND "${TS_BUILDDIR}/bin/correctness_argmax")
+add_test(NAME halide_correctness_assertion_failure_in_parallel_for
+         COMMAND "${TS_BUILDDIR}/bin/correctness_assertion_failure_in_parallel_for")
+add_test(NAME halide_correctness_autotune_bug
+         COMMAND "${TS_BUILDDIR}/bin/correctness_autotune_bug")
+add_test(NAME halide_correctness_autotune_bug_2
+         COMMAND "${TS_BUILDDIR}/bin/correctness_autotune_bug_2")
+add_test(NAME halide_correctness_autotune_bug_3
+         COMMAND "${TS_BUILDDIR}/bin/correctness_autotune_bug_3")
+add_test(NAME halide_correctness_autotune_bug_4
+         COMMAND "${TS_BUILDDIR}/bin/correctness_autotune_bug_4")
+add_test(NAME halide_correctness_autotune_bug_5
+         COMMAND "${TS_BUILDDIR}/bin/correctness_autotune_bug_5")
+add_test(NAME halide_correctness_bad_likely
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bad_likely")
+add_test(NAME halide_correctness_bit_counting
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bit_counting")
+add_test(NAME halide_correctness_bitwise_ops
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bitwise_ops")
+add_test(NAME halide_correctness_bool_compute_root_vectorize
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bool_compute_root_vectorize")
+add_test(NAME halide_correctness_bound
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bound")
+add_test(NAME halide_correctness_boundary_conditions
+         COMMAND "${TS_BUILDDIR}/bin/correctness_boundary_conditions")
+add_test(NAME halide_correctness_bounds
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bounds")
+add_test(NAME halide_correctness_bounds_inference
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bounds_inference")
+add_test(NAME halide_correctness_bounds_inference_chunk
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bounds_inference_chunk")
+add_test(NAME halide_correctness_bounds_inference_complex
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bounds_inference_complex")
+add_test(NAME halide_correctness_bounds_of_abs
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bounds_of_abs")
+add_test(NAME halide_correctness_bounds_of_cast
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bounds_of_cast")
+add_test(NAME halide_correctness_bounds_of_func
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bounds_of_func")
+add_test(NAME halide_correctness_bounds_of_monotonic_math
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bounds_of_monotonic_math")
+add_test(NAME halide_correctness_bounds_query
+         COMMAND "${TS_BUILDDIR}/bin/correctness_bounds_query")
+add_test(NAME halide_correctness_buffer_t
+         COMMAND "${TS_BUILDDIR}/bin/correctness_buffer_t")
+add_test(NAME halide_correctness_cascaded_filters
+         COMMAND "${TS_BUILDDIR}/bin/correctness_cascaded_filters")
+add_test(NAME halide_correctness_cast
+         COMMAND "${TS_BUILDDIR}/bin/correctness_cast")
+add_test(NAME halide_correctness_cast_handle
+         COMMAND "${TS_BUILDDIR}/bin/correctness_cast_handle")
+add_test(NAME halide_correctness_c_function
+         COMMAND "${TS_BUILDDIR}/bin/correctness_c_function")
+add_test(NAME halide_correctness_chunk
+         COMMAND "${TS_BUILDDIR}/bin/correctness_chunk")
+add_test(NAME halide_correctness_chunk_sharing
+         COMMAND "${TS_BUILDDIR}/bin/correctness_chunk_sharing")
+add_test(NAME halide_correctness_circular_reference_leak
+         COMMAND "${TS_BUILDDIR}/bin/correctness_circular_reference_leak")
+add_test(NAME halide_correctness_code_explosion
+         COMMAND "${TS_BUILDDIR}/bin/correctness_code_explosion")
+add_test(NAME halide_correctness_compare_vars
+         COMMAND "${TS_BUILDDIR}/bin/correctness_compare_vars")
+add_test(NAME halide_correctness_compile_to
+         COMMAND "${TS_BUILDDIR}/bin/correctness_compile_to")
+add_test(NAME halide_correctness_compile_to_bitcode  # correctness_* group: CTest name is "halide_" + executable name
+         COMMAND "${TS_BUILDDIR}/bin/correctness_compile_to_bitcode")  # run with no arguments; TS_BUILDDIR is set outside this chunk
+add_test(NAME halide_correctness_compile_to_lowered_stmt
+         COMMAND "${TS_BUILDDIR}/bin/correctness_compile_to_lowered_stmt")
+add_test(NAME halide_correctness_compute_at_split_rvar
+         COMMAND "${TS_BUILDDIR}/bin/correctness_compute_at_split_rvar")
+add_test(NAME halide_correctness_computed_index
+         COMMAND "${TS_BUILDDIR}/bin/correctness_computed_index")
+add_test(NAME halide_correctness_compute_outermost
+         COMMAND "${TS_BUILDDIR}/bin/correctness_compute_outermost")
+add_test(NAME halide_correctness_constant_expr
+         COMMAND "${TS_BUILDDIR}/bin/correctness_constant_expr")
+add_test(NAME halide_correctness_constant_type
+         COMMAND "${TS_BUILDDIR}/bin/correctness_constant_type")
+add_test(NAME halide_correctness_constraints
+         COMMAND "${TS_BUILDDIR}/bin/correctness_constraints")
+add_test(NAME halide_correctness_convolution
+         COMMAND "${TS_BUILDDIR}/bin/correctness_convolution")
+add_test(NAME halide_correctness_convolution_multiple_kernels
+         COMMAND "${TS_BUILDDIR}/bin/correctness_convolution_multiple_kernels")
+add_test(NAME halide_correctness_cross_compilation
+         COMMAND "${TS_BUILDDIR}/bin/correctness_cross_compilation")
+add_test(NAME halide_correctness_custom_allocator
+         COMMAND "${TS_BUILDDIR}/bin/correctness_custom_allocator")
+add_test(NAME halide_correctness_custom_error_reporter
+         COMMAND "${TS_BUILDDIR}/bin/correctness_custom_error_reporter")
+add_test(NAME halide_correctness_custom_lowering_pass
+         COMMAND "${TS_BUILDDIR}/bin/correctness_custom_lowering_pass")
+add_test(NAME halide_correctness_debug_to_file
+         COMMAND "${TS_BUILDDIR}/bin/correctness_debug_to_file")
+add_test(NAME halide_correctness_deinterleave4
+         COMMAND "${TS_BUILDDIR}/bin/correctness_deinterleave4")
+add_test(NAME halide_correctness_div_mod
+         COMMAND "${TS_BUILDDIR}/bin/correctness_div_mod")
+add_test(NAME halide_correctness_dynamic_reduction_bounds
+         COMMAND "${TS_BUILDDIR}/bin/correctness_dynamic_reduction_bounds")
+add_test(NAME halide_correctness_erf
+         COMMAND "${TS_BUILDDIR}/bin/correctness_erf")
+add_test(NAME halide_correctness_exception
+         COMMAND "${TS_BUILDDIR}/bin/correctness_exception")
+add_test(NAME halide_correctness_explicit_inline_reductions
+         COMMAND "${TS_BUILDDIR}/bin/correctness_explicit_inline_reductions")
+add_test(NAME halide_correctness_extern_bounds_inference
+         COMMAND "${TS_BUILDDIR}/bin/correctness_extern_bounds_inference")
+add_test(NAME halide_correctness_extern_consumer
+         COMMAND "${TS_BUILDDIR}/bin/correctness_extern_consumer")
+add_test(NAME halide_correctness_extern_error
+         COMMAND "${TS_BUILDDIR}/bin/correctness_extern_error")
+add_test(NAME halide_correctness_extern_output_expansion
+         COMMAND "${TS_BUILDDIR}/bin/correctness_extern_output_expansion")
+add_test(NAME halide_correctness_extern_producer
+         COMMAND "${TS_BUILDDIR}/bin/correctness_extern_producer")
+add_test(NAME halide_correctness_extern_sort
+         COMMAND "${TS_BUILDDIR}/bin/correctness_extern_sort")
+add_test(NAME halide_correctness_extern_stage
+         COMMAND "${TS_BUILDDIR}/bin/correctness_extern_stage")
+add_test(NAME halide_correctness_fibonacci
+         COMMAND "${TS_BUILDDIR}/bin/correctness_fibonacci")
+add_test(NAME halide_correctness_float16_t_comparison
+         COMMAND "${TS_BUILDDIR}/bin/correctness_float16_t_comparison")
+add_test(NAME halide_correctness_float16_t_constants
+         COMMAND "${TS_BUILDDIR}/bin/correctness_float16_t_constants")
+add_test(NAME halide_correctness_float16_t_image_type
+         COMMAND "${TS_BUILDDIR}/bin/correctness_float16_t_image_type")
+add_test(NAME halide_correctness_float16_t_implicit_upcast
+         COMMAND "${TS_BUILDDIR}/bin/correctness_float16_t_implicit_upcast")
+add_test(NAME halide_correctness_float16_t_realize_constant
+         COMMAND "${TS_BUILDDIR}/bin/correctness_float16_t_realize_constant")
+add_test(NAME halide_correctness_func_lifetime
+         COMMAND "${TS_BUILDDIR}/bin/correctness_func_lifetime")
+add_test(NAME halide_correctness_func_lifetime_2
+         COMMAND "${TS_BUILDDIR}/bin/correctness_func_lifetime_2")
+add_test(NAME halide_correctness_fuse
+         COMMAND "${TS_BUILDDIR}/bin/correctness_fuse")
+add_test(NAME halide_correctness_fused_where_inner_extent_is_zero
+         COMMAND "${TS_BUILDDIR}/bin/correctness_fused_where_inner_extent_is_zero")
+add_test(NAME halide_correctness_fuzz_simplify
+         COMMAND "${TS_BUILDDIR}/bin/correctness_fuzz_simplify")
+add_test(NAME halide_correctness_gameoflife
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gameoflife")
+add_test(NAME halide_correctness_gpu_data_flows  # correctness_gpu_* group: same "halide_" + executable naming as the other tests
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_data_flows")  # NOTE(review): device/target selection is not visible here - confirm where it is configured
+add_test(NAME halide_correctness_gpu_dynamic_shared
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_dynamic_shared")
+add_test(NAME halide_correctness_gpu_free_sync
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_free_sync")
+add_test(NAME halide_correctness_gpu_large_alloc
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_large_alloc")
+add_test(NAME halide_correctness_gpu_mixed_dimensionality
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_mixed_dimensionality")
+add_test(NAME halide_correctness_gpu_mixed_shared_mem_types
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_mixed_shared_mem_types")
+add_test(NAME halide_correctness_gpu_multi_device
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_multi_device")
+add_test(NAME halide_correctness_gpu_multi_kernel
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_multi_kernel")
+add_test(NAME halide_correctness_gpu_non_contiguous_copy
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_non_contiguous_copy")
+add_test(NAME halide_correctness_gpu_object_lifetime
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_object_lifetime")
+add_test(NAME halide_correctness_gpu_specialize
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_specialize")
+add_test(NAME halide_correctness_gpu_sum_scan
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_sum_scan")
+add_test(NAME halide_correctness_gpu_thread_barrier
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_thread_barrier")
+add_test(NAME halide_correctness_gpu_transpose
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_transpose")
+add_test(NAME halide_correctness_gpu_vectorize_div_mod
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_vectorize_div_mod")
+add_test(NAME halide_correctness_gpu_vectorized_shared_memory
+         COMMAND "${TS_BUILDDIR}/bin/correctness_gpu_vectorized_shared_memory")
+add_test(NAME halide_correctness_handle  # remaining correctness_* group: one CTest entry per executable in ${TS_BUILDDIR}/bin
+         COMMAND "${TS_BUILDDIR}/bin/correctness_handle")  # no arguments are passed to any of these executables
+add_test(NAME halide_correctness_heap_cleanup
+         COMMAND "${TS_BUILDDIR}/bin/correctness_heap_cleanup")
+add_test(NAME halide_correctness_hello_gpu
+         COMMAND "${TS_BUILDDIR}/bin/correctness_hello_gpu")
+add_test(NAME halide_correctness_histogram
+         COMMAND "${TS_BUILDDIR}/bin/correctness_histogram")
+add_test(NAME halide_correctness_histogram_equalize
+         COMMAND "${TS_BUILDDIR}/bin/correctness_histogram_equalize")
+add_test(NAME halide_correctness_image_of_lists
+         COMMAND "${TS_BUILDDIR}/bin/correctness_image_of_lists")
+add_test(NAME halide_correctness_implicit_args
+         COMMAND "${TS_BUILDDIR}/bin/correctness_implicit_args")
+add_test(NAME halide_correctness_infer_arguments
+         COMMAND "${TS_BUILDDIR}/bin/correctness_infer_arguments")
+add_test(NAME halide_correctness_inline_reduction
+         COMMAND "${TS_BUILDDIR}/bin/correctness_inline_reduction")
+add_test(NAME halide_correctness_in_place
+         COMMAND "${TS_BUILDDIR}/bin/correctness_in_place")
+add_test(NAME halide_correctness_input_image_bounds_check
+         COMMAND "${TS_BUILDDIR}/bin/correctness_input_image_bounds_check")
+add_test(NAME halide_correctness_input_larger_than_two_gigs
+         COMMAND "${TS_BUILDDIR}/bin/correctness_input_larger_than_two_gigs")
+add_test(NAME halide_correctness_integer_powers
+         COMMAND "${TS_BUILDDIR}/bin/correctness_integer_powers")
+add_test(NAME halide_correctness_interleave
+         COMMAND "${TS_BUILDDIR}/bin/correctness_interleave")
+add_test(NAME halide_correctness_introspection
+         COMMAND "${TS_BUILDDIR}/bin/correctness_introspection")
+add_test(NAME halide_correctness_inverse
+         COMMAND "${TS_BUILDDIR}/bin/correctness_inverse")
+add_test(NAME halide_correctness_isnan
+         COMMAND "${TS_BUILDDIR}/bin/correctness_isnan")
+add_test(NAME halide_correctness_iterate_over_circle
+         COMMAND "${TS_BUILDDIR}/bin/correctness_iterate_over_circle")
+add_test(NAME halide_correctness_lambda
+         COMMAND "${TS_BUILDDIR}/bin/correctness_lambda")
+add_test(NAME halide_correctness_lazy_convolution
+         COMMAND "${TS_BUILDDIR}/bin/correctness_lazy_convolution")
+add_test(NAME halide_correctness_legal_race_condition
+         COMMAND "${TS_BUILDDIR}/bin/correctness_legal_race_condition")
+add_test(NAME halide_correctness_lerp
+         COMMAND "${TS_BUILDDIR}/bin/correctness_lerp")
+add_test(NAME halide_correctness_likely
+         COMMAND "${TS_BUILDDIR}/bin/correctness_likely")
+add_test(NAME halide_correctness_logical
+         COMMAND "${TS_BUILDDIR}/bin/correctness_logical")
+add_test(NAME halide_correctness_loop_invariant_extern_calls
+         COMMAND "${TS_BUILDDIR}/bin/correctness_loop_invariant_extern_calls")
+add_test(NAME halide_correctness_make_struct
+         COMMAND "${TS_BUILDDIR}/bin/correctness_make_struct")
+add_test(NAME halide_correctness_many_dimensions
+         COMMAND "${TS_BUILDDIR}/bin/correctness_many_dimensions")
+add_test(NAME halide_correctness_many_small_extern_stages
+         COMMAND "${TS_BUILDDIR}/bin/correctness_many_small_extern_stages")
+add_test(NAME halide_correctness_many_updates
+         COMMAND "${TS_BUILDDIR}/bin/correctness_many_updates")
+add_test(NAME halide_correctness_math
+         COMMAND "${TS_BUILDDIR}/bin/correctness_math")
+add_test(NAME halide_correctness_memoize
+         COMMAND "${TS_BUILDDIR}/bin/correctness_memoize")
+add_test(NAME halide_correctness_min_extent
+         COMMAND "${TS_BUILDDIR}/bin/correctness_min_extent")
+add_test(NAME halide_correctness_mod
+         COMMAND "${TS_BUILDDIR}/bin/correctness_mod")
+add_test(NAME halide_correctness_multi_output_pipeline_with_bad_sizes
+         COMMAND "${TS_BUILDDIR}/bin/correctness_multi_output_pipeline_with_bad_sizes")
+add_test(NAME halide_correctness_multipass_constraints
+         COMMAND "${TS_BUILDDIR}/bin/correctness_multipass_constraints")
+add_test(NAME halide_correctness_multi_pass_reduction
+         COMMAND "${TS_BUILDDIR}/bin/correctness_multi_pass_reduction")
+add_test(NAME halide_correctness_multiple_outputs
+         COMMAND "${TS_BUILDDIR}/bin/correctness_multiple_outputs")
+add_test(NAME halide_correctness_multi_way_select
+         COMMAND "${TS_BUILDDIR}/bin/correctness_multi_way_select")
+add_test(NAME halide_correctness_named_updates
+         COMMAND "${TS_BUILDDIR}/bin/correctness_named_updates")
+add_test(NAME halide_correctness_newtons_method
+         COMMAND "${TS_BUILDDIR}/bin/correctness_newtons_method")
+add_test(NAME halide_correctness_obscure_image_references
+         COMMAND "${TS_BUILDDIR}/bin/correctness_obscure_image_references")
+add_test(NAME halide_correctness_oddly_sized_output
+         COMMAND "${TS_BUILDDIR}/bin/correctness_oddly_sized_output")
+add_test(NAME halide_correctness_out_of_memory
+         COMMAND "${TS_BUILDDIR}/bin/correctness_out_of_memory")
+add_test(NAME halide_correctness_output_larger_than_two_gigs
+         COMMAND "${TS_BUILDDIR}/bin/correctness_output_larger_than_two_gigs")
+add_test(NAME halide_correctness_parallel
+         COMMAND "${TS_BUILDDIR}/bin/correctness_parallel")
+add_test(NAME halide_correctness_parallel_alloc
+         COMMAND "${TS_BUILDDIR}/bin/correctness_parallel_alloc")
+add_test(NAME halide_correctness_parallel_gpu_nested
+         COMMAND "${TS_BUILDDIR}/bin/correctness_parallel_gpu_nested")
+add_test(NAME halide_correctness_parallel_nested
+         COMMAND "${TS_BUILDDIR}/bin/correctness_parallel_nested")
+add_test(NAME halide_correctness_parallel_reductions
+         COMMAND "${TS_BUILDDIR}/bin/correctness_parallel_reductions")
+add_test(NAME halide_correctness_parallel_rvar
+         COMMAND "${TS_BUILDDIR}/bin/correctness_parallel_rvar")
+add_test(NAME halide_correctness_param
+         COMMAND "${TS_BUILDDIR}/bin/correctness_param")
+add_test(NAME halide_correctness_parameter_constraints
+         COMMAND "${TS_BUILDDIR}/bin/correctness_parameter_constraints")
+add_test(NAME halide_correctness_partial_application
+         COMMAND "${TS_BUILDDIR}/bin/correctness_partial_application")
+add_test(NAME halide_correctness_partition_loops_bug
+         COMMAND "${TS_BUILDDIR}/bin/correctness_partition_loops_bug")
+add_test(NAME halide_correctness_pipeline_set_jit_externs_func
+         COMMAND "${TS_BUILDDIR}/bin/correctness_pipeline_set_jit_externs_func")
+add_test(NAME halide_correctness_print
+         COMMAND "${TS_BUILDDIR}/bin/correctness_print")
+add_test(NAME halide_correctness_process_some_tiles
+         COMMAND "${TS_BUILDDIR}/bin/correctness_process_some_tiles")
+add_test(NAME halide_correctness_random
+         COMMAND "${TS_BUILDDIR}/bin/correctness_random")
+add_test(NAME halide_correctness_realize_larger_than_two_gigs
+         COMMAND "${TS_BUILDDIR}/bin/correctness_realize_larger_than_two_gigs")
+add_test(NAME halide_correctness_realize_over_shifted_domain
+         COMMAND "${TS_BUILDDIR}/bin/correctness_realize_over_shifted_domain")
+add_test(NAME halide_correctness_reduction_chain
+         COMMAND "${TS_BUILDDIR}/bin/correctness_reduction_chain")
+add_test(NAME halide_correctness_reduction_schedule
+         COMMAND "${TS_BUILDDIR}/bin/correctness_reduction_schedule")
+add_test(NAME halide_correctness_reduction_subregion
+         COMMAND "${TS_BUILDDIR}/bin/correctness_reduction_subregion")
+add_test(NAME halide_correctness_reorder_rvars
+         COMMAND "${TS_BUILDDIR}/bin/correctness_reorder_rvars")
+add_test(NAME halide_correctness_reorder_storage
+         COMMAND "${TS_BUILDDIR}/bin/correctness_reorder_storage")
+add_test(NAME halide_correctness_reschedule
+         COMMAND "${TS_BUILDDIR}/bin/correctness_reschedule")
+add_test(NAME halide_correctness_reuse_stack_alloc
+         COMMAND "${TS_BUILDDIR}/bin/correctness_reuse_stack_alloc")
+add_test(NAME halide_correctness_round
+         COMMAND "${TS_BUILDDIR}/bin/correctness_round")
+add_test(NAME halide_correctness_runtime_float16_t_upcast
+         COMMAND "${TS_BUILDDIR}/bin/correctness_runtime_float16_t_upcast")
+add_test(NAME halide_correctness_scatter
+         COMMAND "${TS_BUILDDIR}/bin/correctness_scatter")
+add_test(NAME halide_correctness_shared_self_references
+         COMMAND "${TS_BUILDDIR}/bin/correctness_shared_self_references")
+add_test(NAME halide_correctness_shifted_image
+         COMMAND "${TS_BUILDDIR}/bin/correctness_shifted_image")
+add_test(NAME halide_correctness_side_effects
+         COMMAND "${TS_BUILDDIR}/bin/correctness_side_effects")
+add_test(NAME halide_correctness_simd_op_check
+         COMMAND "${TS_BUILDDIR}/bin/correctness_simd_op_check")
+add_test(NAME halide_correctness_simplified_away_embedded_image
+         COMMAND "${TS_BUILDDIR}/bin/correctness_simplified_away_embedded_image")
+add_test(NAME halide_correctness_skip_stages
+         COMMAND "${TS_BUILDDIR}/bin/correctness_skip_stages")
+add_test(NAME halide_correctness_skip_stages_external_array_functions
+         COMMAND "${TS_BUILDDIR}/bin/correctness_skip_stages_external_array_functions")
+add_test(NAME halide_correctness_sliding_backwards
+         COMMAND "${TS_BUILDDIR}/bin/correctness_sliding_backwards")
+add_test(NAME halide_correctness_sliding_reduction
+         COMMAND "${TS_BUILDDIR}/bin/correctness_sliding_reduction")
+add_test(NAME halide_correctness_sliding_window
+         COMMAND "${TS_BUILDDIR}/bin/correctness_sliding_window")
+add_test(NAME halide_correctness_sort_exprs
+         COMMAND "${TS_BUILDDIR}/bin/correctness_sort_exprs")
+add_test(NAME halide_correctness_specialize
+         COMMAND "${TS_BUILDDIR}/bin/correctness_specialize")
+add_test(NAME halide_correctness_specialize_to_gpu
+         COMMAND "${TS_BUILDDIR}/bin/correctness_specialize_to_gpu")
+add_test(NAME halide_correctness_split_fuse_rvar
+         COMMAND "${TS_BUILDDIR}/bin/correctness_split_fuse_rvar")
+add_test(NAME halide_correctness_split_reuse_inner_name_bug
+         COMMAND "${TS_BUILDDIR}/bin/correctness_split_reuse_inner_name_bug")
+add_test(NAME halide_correctness_split_store_compute
+         COMMAND "${TS_BUILDDIR}/bin/correctness_split_store_compute")
+add_test(NAME halide_correctness_stack_allocations
+         COMMAND "${TS_BUILDDIR}/bin/correctness_stack_allocations")
+add_test(NAME halide_correctness_stmt_to_html
+         COMMAND "${TS_BUILDDIR}/bin/correctness_stmt_to_html")
+add_test(NAME halide_correctness_storage_folding
+         COMMAND "${TS_BUILDDIR}/bin/correctness_storage_folding")
+add_test(NAME halide_correctness_stream_compaction
+         COMMAND "${TS_BUILDDIR}/bin/correctness_stream_compaction")
+add_test(NAME halide_correctness_strided_load
+         COMMAND "${TS_BUILDDIR}/bin/correctness_strided_load")
+add_test(NAME halide_correctness_target
+         COMMAND "${TS_BUILDDIR}/bin/correctness_target")
+add_test(NAME halide_correctness_tracing
+         COMMAND "${TS_BUILDDIR}/bin/correctness_tracing")
+add_test(NAME halide_correctness_tracing_bounds
+         COMMAND "${TS_BUILDDIR}/bin/correctness_tracing_bounds")
+add_test(NAME halide_correctness_tracing_stack
+         COMMAND "${TS_BUILDDIR}/bin/correctness_tracing_stack")
+add_test(NAME halide_correctness_transitive_bounds
+         COMMAND "${TS_BUILDDIR}/bin/correctness_transitive_bounds")
+add_test(NAME halide_correctness_tuple_reduction
+         COMMAND "${TS_BUILDDIR}/bin/correctness_tuple_reduction")
+add_test(NAME halide_correctness_two_vector_args
+         COMMAND "${TS_BUILDDIR}/bin/correctness_two_vector_args")
+add_test(NAME halide_correctness_undef
+         COMMAND "${TS_BUILDDIR}/bin/correctness_undef")
+add_test(NAME halide_correctness_uninitialized_read
+         COMMAND "${TS_BUILDDIR}/bin/correctness_uninitialized_read")
+add_test(NAME halide_correctness_unique_func_image
+         COMMAND "${TS_BUILDDIR}/bin/correctness_unique_func_image")
+add_test(NAME halide_correctness_unrolled_reduction
+         COMMAND "${TS_BUILDDIR}/bin/correctness_unrolled_reduction")
+add_test(NAME halide_correctness_update_chunk
+         COMMAND "${TS_BUILDDIR}/bin/correctness_update_chunk")
+add_test(NAME halide_correctness_vector_bounds_inference
+         COMMAND "${TS_BUILDDIR}/bin/correctness_vector_bounds_inference")
+add_test(NAME halide_correctness_vector_cast
+         COMMAND "${TS_BUILDDIR}/bin/correctness_vector_cast")
+add_test(NAME halide_correctness_vector_extern
+         COMMAND "${TS_BUILDDIR}/bin/correctness_vector_extern")
+add_test(NAME halide_correctness_vectorized_initialization
+         COMMAND "${TS_BUILDDIR}/bin/correctness_vectorized_initialization")
+add_test(NAME halide_correctness_vectorized_reduction_bug
+         COMMAND "${TS_BUILDDIR}/bin/correctness_vectorized_reduction_bug")
+add_test(NAME halide_correctness_vectorize_mixed_widths
+         COMMAND "${TS_BUILDDIR}/bin/correctness_vectorize_mixed_widths")
+add_test(NAME halide_correctness_vector_math
+         COMMAND "${TS_BUILDDIR}/bin/correctness_vector_math")
+add_test(NAME halide_error_ambiguous_inline_reductions  # error_* group: registered exactly like the correctness tests (name = "halide_" + executable)
+         COMMAND "${TS_BUILDDIR}/bin/error_ambiguous_inline_reductions")  # NOTE(review): names suggest these may be expected to fail - confirm whether WILL_FAIL or a pass regex is set elsewhere
+add_test(NAME halide_error_bad_bound
+         COMMAND "${TS_BUILDDIR}/bin/error_bad_bound")
+add_test(NAME halide_error_bad_compute_at
+         COMMAND "${TS_BUILDDIR}/bin/error_bad_compute_at")
+add_test(NAME halide_error_bad_const_cast
+         COMMAND "${TS_BUILDDIR}/bin/error_bad_const_cast")
+add_test(NAME halide_error_bad_rvar_order
+         COMMAND "${TS_BUILDDIR}/bin/error_bad_rvar_order")
+add_test(NAME halide_error_bad_schedule
+         COMMAND "${TS_BUILDDIR}/bin/error_bad_schedule")
+add_test(NAME halide_error_bad_store_at
+         COMMAND "${TS_BUILDDIR}/bin/error_bad_store_at")
+add_test(NAME halide_error_buffer_larger_than_two_gigs
+         COMMAND "${TS_BUILDDIR}/bin/error_buffer_larger_than_two_gigs")
+add_test(NAME halide_error_constrain_wrong_output_buffer
+         COMMAND "${TS_BUILDDIR}/bin/error_constrain_wrong_output_buffer")
+add_test(NAME halide_error_define_after_realize
+         COMMAND "${TS_BUILDDIR}/bin/error_define_after_realize")
+add_test(NAME halide_error_define_after_use
+         COMMAND "${TS_BUILDDIR}/bin/error_define_after_use")
+add_test(NAME halide_error_expanding_reduction
+         COMMAND "${TS_BUILDDIR}/bin/error_expanding_reduction")
+add_test(NAME halide_error_five_d_gpu_buffer
+         COMMAND "${TS_BUILDDIR}/bin/error_five_d_gpu_buffer")
+add_test(NAME halide_error_float16_t_implicit_downcast
+         COMMAND "${TS_BUILDDIR}/bin/error_float16_t_implicit_downcast")
+add_test(NAME halide_error_float16_t_overflow
+         COMMAND "${TS_BUILDDIR}/bin/error_float16_t_overflow")
+add_test(NAME halide_error_float16_t_overflow_int_conv
+         COMMAND "${TS_BUILDDIR}/bin/error_float16_t_overflow_int_conv")
+add_test(NAME halide_error_float_arg
+         COMMAND "${TS_BUILDDIR}/bin/error_float_arg")
+add_test(NAME halide_error_impossible_constraints
+         COMMAND "${TS_BUILDDIR}/bin/error_impossible_constraints")
+add_test(NAME halide_error_lerp_float_weight_out_of_range
+         COMMAND "${TS_BUILDDIR}/bin/error_lerp_float_weight_out_of_range")
+add_test(NAME halide_error_lerp_mismatch
+         COMMAND "${TS_BUILDDIR}/bin/error_lerp_mismatch")
+add_test(NAME halide_error_lerp_signed_weight
+         COMMAND "${TS_BUILDDIR}/bin/error_lerp_signed_weight")
+add_test(NAME halide_error_memoize_different_compute_store
+         COMMAND "${TS_BUILDDIR}/bin/error_memoize_different_compute_store")
+add_test(NAME halide_error_missing_args
+         COMMAND "${TS_BUILDDIR}/bin/error_missing_args")
+add_test(NAME halide_error_modulo_constant_zero
+         COMMAND "${TS_BUILDDIR}/bin/error_modulo_constant_zero")
+add_test(NAME halide_error_nonexistent_update_stage
+         COMMAND "${TS_BUILDDIR}/bin/error_nonexistent_update_stage")
+add_test(NAME halide_error_old_implicit_args
+         COMMAND "${TS_BUILDDIR}/bin/error_old_implicit_args")
+add_test(NAME halide_error_pointer_arithmetic
+         COMMAND "${TS_BUILDDIR}/bin/error_pointer_arithmetic")
+add_test(NAME halide_error_race_condition
+         COMMAND "${TS_BUILDDIR}/bin/error_race_condition")
+add_test(NAME halide_error_realize_constantly_larger_than_two_gigs
+         COMMAND "${TS_BUILDDIR}/bin/error_realize_constantly_larger_than_two_gigs")
+add_test(NAME halide_error_reduction_bounds
+         COMMAND "${TS_BUILDDIR}/bin/error_reduction_bounds")
+add_test(NAME halide_error_reduction_type_mismatch
+         COMMAND "${TS_BUILDDIR}/bin/error_reduction_type_mismatch")
+add_test(NAME halide_error_reused_args
+         COMMAND "${TS_BUILDDIR}/bin/error_reused_args")
+add_test(NAME halide_error_reuse_var_in_schedule
+         COMMAND "${TS_BUILDDIR}/bin/error_reuse_var_in_schedule")
+add_test(NAME halide_error_thread_id_outside_block_id
+         COMMAND "${TS_BUILDDIR}/bin/error_thread_id_outside_block_id")
+add_test(NAME halide_error_too_many_args
+         COMMAND "${TS_BUILDDIR}/bin/error_too_many_args")
+add_test(NAME halide_error_unbounded_input
+         COMMAND "${TS_BUILDDIR}/bin/error_unbounded_input")
+add_test(NAME halide_error_unbounded_output
+         COMMAND "${TS_BUILDDIR}/bin/error_unbounded_output")
+add_test(NAME halide_error_undefined_rdom_dimension
+         COMMAND "${TS_BUILDDIR}/bin/error_undefined_rdom_dimension")
+add_test(NAME halide_error_vectorize_dynamic
+         COMMAND "${TS_BUILDDIR}/bin/error_vectorize_dynamic")
+add_test(NAME halide_error_vectorize_too_little
+         COMMAND "${TS_BUILDDIR}/bin/error_vectorize_too_little")
+add_test(NAME halide_error_vectorize_too_much
+         COMMAND "${TS_BUILDDIR}/bin/error_vectorize_too_much")
+add_test(NAME halide_error_wrong_type
+         COMMAND "${TS_BUILDDIR}/bin/error_wrong_type")
+add_test(NAME halide_filter  # registers bin/filter as a CTest test; run with no arguments
+         COMMAND "${TS_BUILDDIR}/bin/filter")
+add_test(NAME halide_generator_aot_acquire_release  # generator_aot_* / generator_jit_* group starts here
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_acquire_release")
+add_test(NAME halide_generator_aot_argvcall
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_argvcall")
+add_test(NAME halide_generator_aot_cleanup_on_error
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_cleanup_on_error")
+add_test(NAME halide_generator_aot_embed_image
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_embed_image")
+add_test(NAME halide_generator_aot_error_codes
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_error_codes")
+add_test(NAME halide_generator_aot_example
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_example")
+add_test(NAME halide_generator_aot_extended_buffer_t
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_extended_buffer_t")
+add_test(NAME halide_generator_aot_gpu_object_lifetime
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_gpu_object_lifetime")
+add_test(NAME halide_generator_aot_gpu_only
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_gpu_only")
+add_test(NAME halide_generator_aot_mandelbrot
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_mandelbrot")
+add_test(NAME halide_generator_aot_matlab
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_matlab")
+add_test(NAME halide_generator_aot_metadata_tester
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_metadata_tester")
+add_test(NAME halide_generator_aot_nested_externs
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_nested_externs")
+add_test(NAME halide_generator_aot_pyramid
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_pyramid")
+add_test(NAME halide_generator_aot_tiled_blur
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_tiled_blur")
+add_test(NAME halide_generator_aot_tiled_blur_interleaved
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_tiled_blur_interleaved")
+add_test(NAME halide_generator_aot_user_context
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_user_context")
+add_test(NAME halide_generator_aot_user_context_insanity
+         COMMAND "${TS_BUILDDIR}/bin/generator_aot_user_context_insanity")
+add_test(NAME halide_generator_jit_example
+         COMMAND "${TS_BUILDDIR}/bin/generator_jit_example")
+add_test(NAME halide_generator_jit_paramtest
+         COMMAND "${TS_BUILDDIR}/bin/generator_jit_paramtest")
+add_test(NAME halide_halide_blur  # the "halide_" prefix is applied even when the executable name already starts with it
+         COMMAND "${TS_BUILDDIR}/bin/halide_blur")
+add_test(NAME halide_HalideTraceViz  # NOTE(review): name looks like a tool rather than a test - confirm it exits 0 when run without arguments
+         COMMAND "${TS_BUILDDIR}/bin/HalideTraceViz")
+add_test(NAME halide_lesson_01_basics  # lesson_NN_* group: one CTest entry per lesson executable, run with no arguments
+         COMMAND "${TS_BUILDDIR}/bin/lesson_01_basics")
+add_test(NAME halide_lesson_02_input_image
+         COMMAND "${TS_BUILDDIR}/bin/lesson_02_input_image")
+add_test(NAME halide_lesson_03_debugging_1
+         COMMAND "${TS_BUILDDIR}/bin/lesson_03_debugging_1")
+add_test(NAME halide_lesson_04_debugging_2
+         COMMAND "${TS_BUILDDIR}/bin/lesson_04_debugging_2")
+add_test(NAME halide_lesson_05_scheduling_1
+         COMMAND "${TS_BUILDDIR}/bin/lesson_05_scheduling_1")
+add_test(NAME halide_lesson_06_realizing_over_shifted_domains
+         COMMAND "${TS_BUILDDIR}/bin/lesson_06_realizing_over_shifted_domains")
+add_test(NAME halide_lesson_07_multi_stage_pipelines
+         COMMAND "${TS_BUILDDIR}/bin/lesson_07_multi_stage_pipelines")
+add_test(NAME halide_lesson_08_scheduling_2
+         COMMAND "${TS_BUILDDIR}/bin/lesson_08_scheduling_2")
+add_test(NAME halide_lesson_09_update_definitions
+         COMMAND "${TS_BUILDDIR}/bin/lesson_09_update_definitions")
+add_test(NAME halide_lesson_10_aot_compilation_generate
+         COMMAND "${TS_BUILDDIR}/bin/lesson_10_aot_compilation_generate")
+add_test(NAME halide_lesson_10_aot_compilation_run  # NOTE(review): presumably relies on the _generate step; no ordering is expressed in this add_test - verify
+         COMMAND "${TS_BUILDDIR}/bin/lesson_10_aot_compilation_run")
+add_test(NAME halide_lesson_11_cross_compilation
+         COMMAND "${TS_BUILDDIR}/bin/lesson_11_cross_compilation")
+add_test(NAME halide_lesson_12_using_the_gpu
+         COMMAND "${TS_BUILDDIR}/bin/lesson_12_using_the_gpu")
+add_test(NAME halide_lesson_13_tuples
+         COMMAND "${TS_BUILDDIR}/bin/lesson_13_tuples")
+add_test(NAME halide_lesson_14_types
+         COMMAND "${TS_BUILDDIR}/bin/lesson_14_types")
+add_test(NAME halide_ll_process  # standalone executables registered the same way: name = "halide_" + executable, no arguments
+         COMMAND "${TS_BUILDDIR}/bin/ll_process")
+add_test(NAME halide_local_laplacian_gen
+         COMMAND "${TS_BUILDDIR}/bin/local_laplacian_gen")
+add_test(NAME halide_performance_block_transpose  # performance_* group starts here
+         COMMAND "${TS_BUILDDIR}/bin/performance_block_transpose")  # NOTE(review): no TIMEOUT is given in these add_test calls - check whether one is set elsewhere
+add_test(NAME halide_performance_boundary_conditions
+         COMMAND "${TS_BUILDDIR}/bin/performance_boundary_conditions")
+add_test(NAME halide_performance_clamped_vector_load
+         COMMAND "${TS_BUILDDIR}/bin/performance_clamped_vector_load")
+add_test(NAME halide_performance_const_division
+         COMMAND "${TS_BUILDDIR}/bin/performance_const_division")
+add_test(NAME halide_performance_fast_inverse
+         COMMAND "${TS_BUILDDIR}/bin/performance_fast_inverse")
+add_test(NAME halide_performance_fast_pow
+         COMMAND "${TS_BUILDDIR}/bin/performance_fast_pow")
+add_test(NAME halide_performance_inner_loop_parallel
+         COMMAND "${TS_BUILDDIR}/bin/performance_inner_loop_parallel")
+add_test(NAME halide_performance_jit_stress
+         COMMAND "${TS_BUILDDIR}/bin/performance_jit_stress")
+add_test(NAME halide_performance_matrix_multiplication
+         COMMAND "${TS_BUILDDIR}/bin/performance_matrix_multiplication")
+add_test(NAME halide_performance_memcpy
+         COMMAND "${TS_BUILDDIR}/bin/performance_memcpy")
+add_test(NAME halide_performance_packed_planar_fusion
+         COMMAND "${TS_BUILDDIR}/bin/performance_packed_planar_fusion")
+add_test(NAME halide_performance_parallel_performance
+         COMMAND "${TS_BUILDDIR}/bin/performance_parallel_performance")
+add_test(NAME halide_performance_profiler
+         COMMAND "${TS_BUILDDIR}/bin/performance_profiler")
+add_test(NAME halide_performance_rgb_interleaved
+         COMMAND "${TS_BUILDDIR}/bin/performance_rgb_interleaved")
+add_test(NAME halide_performance_sort
+         COMMAND "${TS_BUILDDIR}/bin/performance_sort")
+add_test(NAME halide_performance_vectorize
+         COMMAND "${TS_BUILDDIR}/bin/performance_vectorize")
+add_test(NAME halide_pipeline  # NOTE(review): generic executable names (pipeline, process) - confirm they run meaningfully without arguments
+         COMMAND "${TS_BUILDDIR}/bin/pipeline")
+add_test(NAME halide_process
+         COMMAND "${TS_BUILDDIR}/bin/process")
+add_test(NAME halide_renderscript_aot_copy  # renderscript_* group: name = "halide_" + executable, run with no arguments
+         COMMAND "${TS_BUILDDIR}/bin/renderscript_aot_copy")
+add_test(NAME halide_renderscript_aot_copy_error
+         COMMAND "${TS_BUILDDIR}/bin/renderscript_aot_copy_error")
+add_test(NAME halide_renderscript_jit_copy
+         COMMAND "${TS_BUILDDIR}/bin/renderscript_jit_copy")
+add_test(NAME halide_run_c_backend_and_native
+         COMMAND "${TS_BUILDDIR}/bin/run_c_backend_and_native")
+add_test(NAME halide_test_internal
+         COMMAND "${TS_BUILDDIR}/bin/test_internal")
+add_test(NAME halide_warning_double_vectorize  # warning_* group: last add_test entries before the set_tests_properties() call that follows
+         COMMAND "${TS_BUILDDIR}/bin/warning_double_vectorize")
+add_test(NAME halide_warning_float16_t_underflow
+         COMMAND "${TS_BUILDDIR}/bin/warning_float16_t_underflow")
+add_test(NAME halide_warning_hidden_pure_definition
+         COMMAND "${TS_BUILDDIR}/bin/warning_hidden_pure_definition")
+add_test(NAME halide_warning_parallel_size_one
+         COMMAND "${TS_BUILDDIR}/bin/warning_parallel_size_one")
+add_test(NAME halide_warning_vectorize_size_one
+         COMMAND "${TS_BUILDDIR}/bin/warning_vectorize_size_one")
+
+
+
+set_tests_properties(
+  halide_bilateral_grid
+  halide_bitcode2cpp
+  halide_blur_test
+  halide_build_halide_h
+  halide_camera_pipe
+  halide_correctness_argmax
+  halide_correctness_assertion_failure_in_parallel_for
+  halide_correctness_autotune_bug
+  halide_correctness_autotune_bug_2
+  halide_correctness_autotune_bug_3
+  halide_correctness_autotune_bug_4
+  halide_correctness_autotune_bug_5
+  halide_correctness_bad_likely
+  halide_correctness_bit_counting
+  halide_correctness_bitwise_ops
+  halide_correctness_bool_compute_root_vectorize
+  halide_correctness_bound
+  halide_correctness_boundary_conditions
+  halide_correctness_bounds
+  halide_correctness_bounds_inference
+  halide_correctness_bounds_inference_chunk
+  halide_correctness_bounds_inference_complex
+  halide_correctness_bounds_of_abs
+  halide_correctness_bounds_of_cast
+  halide_correctness_bounds_of_func
+  halide_correctness_bounds_of_monotonic_math
+  halide_correctness_bounds_query
+  halide_correctness_buffer_t
+  halide_correctness_cascaded_filters
+  halide_correctness_cast
+  halide_correctness_cast_handle
+  halide_correctness_c_function
+  halide_correctness_chunk
+  halide_correctness_chunk_sharing
+  halide_correctness_circular_reference_leak
+  halide_correctness_code_explosion
+  halide_correctness_compare_vars
+  halide_correctness_compile_to
+  halide_correctness_compile_to_bitcode
+  halide_correctness_compile_to_lowered_stmt
+  halide_correctness_compute_at_split_rvar
+  halide_correctness_computed_index
+  halide_correctness_compute_outermost
+  halide_correctness_constant_expr
+  halide_correctness_constant_type
+  halide_correctness_constraints
+  halide_correctness_convolution
+  halide_correctness_convolution_multiple_kernels
+  halide_correctness_cross_compilation
+  halide_correctness_custom_allocator
+  halide_correctness_custom_error_reporter
+  halide_correctness_custom_lowering_pass
+  halide_correctness_debug_to_file
+  halide_correctness_deinterleave4
+  halide_correctness_div_mod
+  halide_correctness_dynamic_reduction_bounds
+  halide_correctness_erf
+  halide_correctness_exception
+  halide_correctness_explicit_inline_reductions
+  halide_correctness_extern_bounds_inference
+  halide_correctness_extern_consumer
+  halide_correctness_extern_error
+  halide_correctness_extern_output_expansion
+  halide_correctness_extern_producer
+  halide_correctness_extern_sort
+  halide_correctness_extern_stage
+  halide_correctness_fibonacci
+  halide_correctness_float16_t_comparison
+  halide_correctness_float16_t_constants
+  halide_correctness_float16_t_image_type
+  halide_correctness_float16_t_implicit_upcast
+  halide_correctness_float16_t_realize_constant
+  halide_correctness_func_lifetime
+  halide_correctness_func_lifetime_2
+  halide_correctness_fuse
+  halide_correctness_fused_where_inner_extent_is_zero
+  halide_correctness_fuzz_simplify
+  halide_correctness_gameoflife
+  halide_correctness_gpu_data_flows
+  halide_correctness_gpu_dynamic_shared
+  halide_correctness_gpu_free_sync
+  halide_correctness_gpu_large_alloc
+  halide_correctness_gpu_mixed_dimensionality
+  halide_correctness_gpu_mixed_shared_mem_types
+  halide_correctness_gpu_multi_device
+  halide_correctness_gpu_multi_kernel
+  halide_correctness_gpu_non_contiguous_copy
+  halide_correctness_gpu_object_lifetime
+  halide_correctness_gpu_specialize
+  halide_correctness_gpu_sum_scan
+  halide_correctness_gpu_thread_barrier
+  halide_correctness_gpu_transpose
+  halide_correctness_gpu_vectorize_div_mod
+  halide_correctness_gpu_vectorized_shared_memory
+  halide_correctness_handle
+  halide_correctness_heap_cleanup
+  halide_correctness_hello_gpu
+  halide_correctness_histogram
+  halide_correctness_histogram_equalize
+  halide_correctness_image_of_lists
+  halide_correctness_implicit_args
+  halide_correctness_infer_arguments
+  halide_correctness_inline_reduction
+  halide_correctness_in_place
+  halide_correctness_input_image_bounds_check
+  halide_correctness_input_larger_than_two_gigs
+  halide_correctness_integer_powers
+  halide_correctness_interleave
+  halide_correctness_introspection
+  halide_correctness_inverse
+  halide_correctness_isnan
+  halide_correctness_iterate_over_circle
+  halide_correctness_lambda
+  halide_correctness_lazy_convolution
+  halide_correctness_legal_race_condition
+  halide_correctness_lerp
+  halide_correctness_likely
+  halide_correctness_logical
+  halide_correctness_loop_invariant_extern_calls
+  halide_correctness_make_struct
+  halide_correctness_many_dimensions
+  halide_correctness_many_small_extern_stages
+  halide_correctness_many_updates
+  halide_correctness_math
+  halide_correctness_memoize
+  halide_correctness_min_extent
+  halide_correctness_mod
+  halide_correctness_multi_output_pipeline_with_bad_sizes
+  halide_correctness_multipass_constraints
+  halide_correctness_multi_pass_reduction
+  halide_correctness_multiple_outputs
+  halide_correctness_multi_way_select
+  halide_correctness_named_updates
+  halide_correctness_newtons_method
+  halide_correctness_obscure_image_references
+  halide_correctness_oddly_sized_output
+  halide_correctness_out_of_memory
+  halide_correctness_output_larger_than_two_gigs
+  halide_correctness_parallel
+  halide_correctness_parallel_alloc
+  halide_correctness_parallel_gpu_nested
+  halide_correctness_parallel_nested
+  halide_correctness_parallel_reductions
+  halide_correctness_parallel_rvar
+  halide_correctness_param
+  halide_correctness_parameter_constraints
+  halide_correctness_partial_application
+  halide_correctness_partition_loops_bug
+  halide_correctness_pipeline_set_jit_externs_func
+  halide_correctness_print
+  halide_correctness_process_some_tiles
+  halide_correctness_random
+  halide_correctness_realize_larger_than_two_gigs
+  halide_correctness_realize_over_shifted_domain
+  halide_correctness_reduction_chain
+  halide_correctness_reduction_schedule
+  halide_correctness_reduction_subregion
+  halide_correctness_reorder_rvars
+  halide_correctness_reorder_storage
+  halide_correctness_reschedule
+  halide_correctness_reuse_stack_alloc
+  halide_correctness_round
+  halide_correctness_runtime_float16_t_upcast
+  halide_correctness_scatter
+  halide_correctness_shared_self_references
+  halide_correctness_shifted_image
+  halide_correctness_side_effects
+  halide_correctness_simd_op_check
+  halide_correctness_simplified_away_embedded_image
+  halide_correctness_skip_stages
+  halide_correctness_skip_stages_external_array_functions
+  halide_correctness_sliding_backwards
+  halide_correctness_sliding_reduction
+  halide_correctness_sliding_window
+  halide_correctness_sort_exprs
+  halide_correctness_specialize
+  halide_correctness_specialize_to_gpu
+  halide_correctness_split_fuse_rvar
+  halide_correctness_split_reuse_inner_name_bug
+  halide_correctness_split_store_compute
+  halide_correctness_stack_allocations
+  halide_correctness_stmt_to_html
+  halide_correctness_storage_folding
+  halide_correctness_stream_compaction
+  halide_correctness_strided_load
+  halide_correctness_target
+  halide_correctness_tracing
+  halide_correctness_tracing_bounds
+  halide_correctness_tracing_stack
+  halide_correctness_transitive_bounds
+  halide_correctness_tuple_reduction
+  halide_correctness_two_vector_args
+  halide_correctness_undef
+  halide_correctness_uninitialized_read
+  halide_correctness_unique_func_image
+  halide_correctness_unrolled_reduction
+  halide_correctness_update_chunk
+  halide_correctness_vector_bounds_inference
+  halide_correctness_vector_cast
+  halide_correctness_vector_extern
+  halide_correctness_vectorized_initialization
+  halide_correctness_vectorized_reduction_bug
+  halide_correctness_vectorize_mixed_widths
+  halide_correctness_vector_math
+  halide_error_ambiguous_inline_reductions
+  halide_error_bad_bound
+  halide_error_bad_compute_at
+  halide_error_bad_const_cast
+  halide_error_bad_rvar_order
+  halide_error_bad_schedule
+  halide_error_bad_store_at
+  halide_error_buffer_larger_than_two_gigs
+  halide_error_constrain_wrong_output_buffer
+  halide_error_define_after_realize
+  halide_error_define_after_use
+  halide_error_expanding_reduction
+  halide_error_five_d_gpu_buffer
+  halide_error_float16_t_implicit_downcast
+  halide_error_float16_t_overflow
+  halide_error_float16_t_overflow_int_conv
+  halide_error_float_arg
+  halide_error_impossible_constraints
+  halide_error_lerp_float_weight_out_of_range
+  halide_error_lerp_mismatch
+  halide_error_lerp_signed_weight
+  halide_error_memoize_different_compute_store
+  halide_error_missing_args
+  halide_error_modulo_constant_zero
+  halide_error_nonexistent_update_stage
+  halide_error_old_implicit_args
+  halide_error_pointer_arithmetic
+  halide_error_race_condition
+  halide_error_realize_constantly_larger_than_two_gigs
+  halide_error_reduction_bounds
+  halide_error_reduction_type_mismatch
+  halide_error_reused_args
+  halide_error_reuse_var_in_schedule
+  halide_error_thread_id_outside_block_id
+  halide_error_too_many_args
+  halide_error_unbounded_input
+  halide_error_unbounded_output
+  halide_error_undefined_rdom_dimension
+  halide_error_vectorize_dynamic
+  halide_error_vectorize_too_little
+  halide_error_vectorize_too_much
+  halide_error_wrong_type
+  halide_filter
+  halide_generator_aot_acquire_release
+  halide_generator_aot_argvcall
+  halide_generator_aot_cleanup_on_error
+  halide_generator_aot_embed_image
+  halide_generator_aot_error_codes
+  halide_generator_aot_example
+  halide_generator_aot_extended_buffer_t
+  halide_generator_aot_gpu_object_lifetime
+  halide_generator_aot_gpu_only
+  halide_generator_aot_mandelbrot
+  halide_generator_aot_matlab
+  halide_generator_aot_metadata_tester
+  halide_generator_aot_nested_externs
+  halide_generator_aot_pyramid
+  halide_generator_aot_tiled_blur
+  halide_generator_aot_tiled_blur_interleaved
+  halide_generator_aot_user_context
+  halide_generator_aot_user_context_insanity
+  halide_generator_jit_example
+  halide_generator_jit_paramtest
+  halide_halide_blur
+  halide_HalideTraceViz
+  halide_lesson_01_basics
+  halide_lesson_02_input_image
+  halide_lesson_03_debugging_1
+  halide_lesson_04_debugging_2
+  halide_lesson_05_scheduling_1
+  halide_lesson_06_realizing_over_shifted_domains
+  halide_lesson_07_multi_stage_pipelines
+  halide_lesson_08_scheduling_2
+  halide_lesson_09_update_definitions
+  halide_lesson_10_aot_compilation_generate
+  halide_lesson_10_aot_compilation_run
+  halide_lesson_11_cross_compilation
+  halide_lesson_12_using_the_gpu
+  halide_lesson_13_tuples
+  halide_lesson_14_types
+  halide_ll_process
+  halide_local_laplacian_gen
+  halide_performance_block_transpose
+  halide_performance_boundary_conditions
+  halide_performance_clamped_vector_load
+  halide_performance_const_division
+  halide_performance_fast_inverse
+  halide_performance_fast_pow
+  halide_performance_inner_loop_parallel
+  halide_performance_jit_stress
+  halide_performance_matrix_multiplication
+  halide_performance_memcpy
+  halide_performance_packed_planar_fusion
+  halide_performance_parallel_performance
+  halide_performance_profiler
+  halide_performance_rgb_interleaved
+  halide_performance_sort
+  halide_performance_vectorize
+  halide_pipeline
+  halide_process
+  halide_renderscript_aot_copy
+  halide_renderscript_aot_copy_error
+  halide_renderscript_jit_copy
+  halide_run_c_backend_and_native
+  halide_test_internal
+  halide_warning_double_vectorize
+  halide_warning_float16_t_underflow
+  halide_warning_hidden_pure_definition
+  halide_warning_parallel_size_one
+  halide_warning_vectorize_size_one
+  PROPERTIES
+    LABELS "Halide"
+    ENVIRONMENT "HL_JIT_TARGET=opencl" "HL_TARGET=opencl")
+
+# disabled OpenGL tests
+
+# halide_opengl_copy_pixels
+# halide_opengl_copy_to_device
+# halide_opengl_copy_to_host
+# halide_opengl_float_texture
+# halide_opengl_internal
+# halide_opengl_lut
+# halide_opengl_produce
+# halide_opengl_select
+# halide_opengl_set_pixels
+# halide_opengl_shifted_domains
+# halide_opengl_special_funcs
+# halide_opengl_test
+# halide_opengl_varying
+# halide_glsl_halide_blur
+# halide_glsl_halide_ycc
+# add_test(NAME halide_opengl_copy_pixels
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_copy_pixels")
+# add_test(NAME halide_opengl_copy_to_device
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_copy_to_device")
+# add_test(NAME halide_opengl_copy_to_host
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_copy_to_host")
+# add_test(NAME halide_opengl_float_texture
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_float_texture")
+# add_test(NAME halide_opengl_internal
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_internal")
+# add_test(NAME halide_opengl_lut
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_lut")
+# add_test(NAME halide_opengl_produce
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_produce")
+# add_test(NAME halide_opengl_select
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_select")
+# add_test(NAME halide_opengl_set_pixels
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_set_pixels")
+# add_test(NAME halide_opengl_shifted_domains
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_shifted_domains")
+# add_test(NAME halide_opengl_special_funcs
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_special_funcs")
+# add_test(NAME halide_opengl_test
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_test")
+# add_test(NAME halide_opengl_varying
+         # COMMAND "${TS_BUILDDIR}/bin/opengl_varying")
+# add_test(NAME halide_glsl_halide_blur
+         # COMMAND "${TS_BUILDDIR}/bin/glsl_halide_blur")
+# add_test(NAME halide_glsl_halide_ycc
+         # COMMAND "${TS_BUILDDIR}/bin/glsl_halide_ycc")
+
+
+
+endif()
diff --git a/examples/Halide/Makefile.in b/examples/Halide/Makefile.in
index 36fc874..5751e34 100644
--- a/examples/Halide/Makefile.in
+++ b/examples/Halide/Makefile.in
@@ -192,6 +192,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -199,6 +200,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -216,8 +218,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -293,6 +293,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/IntelSVM/CMakeLists.txt b/examples/IntelSVM/CMakeLists.txt
new file mode 100644
index 0000000..bd35443
--- /dev/null
+++ b/examples/IntelSVM/CMakeLists.txt
@@ -0,0 +1,68 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME IntelSVM)
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+set(INTEL_ZIP "${TS_SRCDIR}/intel_ocl_svm_basic_win.zip")
+
+if (EXISTS "${INTEL_ZIP}")
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    DOWNLOAD_COMMAND /bin/true
+    PATCH_COMMAND pwd && unzip "${INTEL_ZIP}" &&
+      cp "${CMAKE_CURRENT_SOURCE_DIR}/intelsvm_CMakeLists.txt" ./CMakeLists.txt &&
+      patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/intelsvm.patch
+    CMAKE_ARGS
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo
+      "-DCMAKE_CXX_FLAGS_RELWITHDEBINFO=-O2 -g -DCL_USE_DEPRECATED_OPENCL_1_2_APIS"
+      -DOPENCL_LIBRARIES:STRING=OpenCL
+    INSTALL_COMMAND "/bin/true"
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+  add_test(NAME intel_svm_coarse
+    COMMAND "${TS_BUILDDIR}/coarse" -p 0 -t default
+    WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/SVMBasicCoarseGrained")
+  add_test(NAME intel_svm_fine
+    COMMAND "${TS_BUILDDIR}/fine" -p 0 -t default
+    WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/SVMBasicFineGrained")
+
+  set_tests_properties( intel_svm_coarse intel_svm_fine
+    PROPERTIES  LABELS "IntelSVM;hsa")
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required files not found" )
+
+endif()
diff --git a/examples/IntelSVM/Makefile.am b/examples/IntelSVM/Makefile.am
new file mode 100644
index 0000000..ba12709
--- /dev/null
+++ b/examples/IntelSVM/Makefile.am
@@ -0,0 +1,70 @@
+# examples / Intel OpenCL 2.0 Shared Virtual Memory Code Sample
+#
+#   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+if TEST_SUITE_INTELSVM
+testsuite_pocl_dir=${abs_top_srcdir}/examples/IntelSVM
+testsuite_src_dir=${testsuite_pocl_dir}/source
+
+.PHONY: build prepare-examples
+
+prepare-examples: $(testsuite_src_dir) build
+
+$(testsuite_src_dir):
+	test -d $(testsuite_src_dir) || (mkdir $(testsuite_src_dir) && \
+	cd $(testsuite_src_dir) && unzip ../intel_ocl_svm_basic_win.zip)
+
+build: $(testsuite_src_dir) common coarse fine
+
+common: common/basic.o common/cmdparser.o  common/oclobject.o
+
+common/basic.o:
+	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o common/basic.o -c common/basic.cpp
+
+common/cmdparser.o:
+	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o common/cmdparser.o -c common/cmdparser.cpp
+
+common/oclobject.o:
+	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o common/oclobject.o -c common/oclobject.cpp
+
+coarse: common
+	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o SVMBasicCoarseGrained/svmbasic.o -c SVMBasicCoarseGrained/svmbasic.cpp
+	cd $(testsuite_src_dir) && $(CXX) $(LDFLAGS) -o SVMBasicCoarseGrained/svmbasic SVMBasicCoarseGrained/svmbasic.o \
+	common/basic.o  common/cmdparser.o  common/oclobject.o -lOpenCL
+
+fine: common
+	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o SVMBasicFineGrained/svmbasic.o -c SVMBasicFineGrained/svmbasic.cpp
+	cd $(testsuite_src_dir) && $(CXX) $(LDFLAGS) -o SVMBasicFineGrained/svmbasic SVMBasicFineGrained/svmbasic.o \
+	common/basic.o  common/cmdparser.o  common/oclobject.o -lOpenCL
+
+
+test_CoarseGrained:
+	cd $(testsuite_src_dir)/SVMBasicCoarseGrained && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./svmbasic -p 0 -t all
+
+test_FineGrained:
+	cd $(testsuite_src_dir)/SVMBasicFineGrained && \
+	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./svmbasic -p 0 -t all
+
+clean-examples:
+	rm -fr $(testsuite_src_dir)
+
+endif
diff --git a/examples/piglit/Makefile.in b/examples/IntelSVM/Makefile.in
similarity index 83%
copy from examples/piglit/Makefile.in
copy to examples/IntelSVM/Makefile.in
index 5364c6e..be79c4a 100644
--- a/examples/piglit/Makefile.in
+++ b/examples/IntelSVM/Makefile.in
@@ -14,22 +14,20 @@
 
 @SET_MAKE@
 
-# Process this file with automake to produce Makefile.in (in this,
-# and all subdirectories).
-# Makefile.am for pocl/examples/VexCL
-# 
-# Copyright (c) 2013 Ville Korhonen / Tampere University of Technology
-# 
+# examples / Intel OpenCL 2.0 Shared Virtual Memory Code Sample
+#
+#   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+#
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-# 
+#
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
-# 
+#
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -112,7 +110,7 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
-subdir = examples/piglit
+subdir = examples/IntelSVM
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
@@ -192,6 +190,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -199,6 +198,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -216,8 +216,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -293,6 +291,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -352,7 +351,8 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
-EXTRA_DIST = produce_results.sh
+@TEST_SUITE_INTELSVM_TRUE@testsuite_pocl_dir = ${abs_top_srcdir}/examples/IntelSVM
+@TEST_SUITE_INTELSVM_TRUE@testsuite_src_dir = ${testsuite_pocl_dir}/source
 all: all-am
 
 .SUFFIXES:
@@ -365,9 +365,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign examples/piglit/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign examples/IntelSVM/Makefile'; \
 	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign examples/piglit/Makefile
+	  $(AUTOMAKE) --foreign examples/IntelSVM/Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
 	  *config.status*) \
@@ -545,24 +545,47 @@ uninstall-am:
 .PRECIOUS: Makefile
 
 
-@TEST_SUITE_PIGLIT_TRUE@.PHONY: build
-
-@TEST_SUITE_PIGLIT_TRUE@prepare-examples:
-@TEST_SUITE_PIGLIT_TRUE@	cd piglit ; \
-@TEST_SUITE_PIGLIT_TRUE@	cmake \
-@TEST_SUITE_PIGLIT_TRUE@		-DPIGLIT_BUILD_GL_TESTS=OFF \
-@TEST_SUITE_PIGLIT_TRUE@		-DPIGLIT_BUILD_GLES1_TESTS=OFF \
-@TEST_SUITE_PIGLIT_TRUE@		-DPIGLIT_BUILD_GLES2_TESTS=OFF \
-@TEST_SUITE_PIGLIT_TRUE@		-DPIGLIT_BUILD_GLES3_TESTS=OFF \
-@TEST_SUITE_PIGLIT_TRUE@		-DPIGLIT_BUILD_CL_TESTS=ON \
-@TEST_SUITE_PIGLIT_TRUE@		-DPIGLIT_USE_WAFFLE=OFF \
-@TEST_SUITE_PIGLIT_TRUE@		-DPIGLIT_BUILD_GLX_TESTS=OFF \
-@TEST_SUITE_PIGLIT_TRUE@		-DOPENCL_INCLUDE_PATH="../../../include/" \
-@TEST_SUITE_PIGLIT_TRUE@		-DOPENCL_opencl_LIBRARY:STRING="-l OpenCL" .; \
-@TEST_SUITE_PIGLIT_TRUE@	make
-
-@TEST_SUITE_PIGLIT_TRUE@clean-examples:
-@TEST_SUITE_PIGLIT_TRUE@	cd piglit; make clean
+@TEST_SUITE_INTELSVM_TRUE@.PHONY: build prepare-examples
+
+@TEST_SUITE_INTELSVM_TRUE@prepare-examples: $(testsuite_src_dir) build
+
+@TEST_SUITE_INTELSVM_TRUE@$(testsuite_src_dir):
+@TEST_SUITE_INTELSVM_TRUE@	test -d $(testsuite_src_dir) || (mkdir $(testsuite_src_dir) && \
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir) && unzip ../intel_ocl_svm_basic_win.zip)
+
+@TEST_SUITE_INTELSVM_TRUE@build: $(testsuite_src_dir) common coarse fine
+
+@TEST_SUITE_INTELSVM_TRUE@common: common/basic.o common/cmdparser.o  common/oclobject.o
+
+@TEST_SUITE_INTELSVM_TRUE@common/basic.o:
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o common/basic.o -c common/basic.cpp
+
+@TEST_SUITE_INTELSVM_TRUE@common/cmdparser.o:
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o common/cmdparser.o -c common/cmdparser.cpp
+
+@TEST_SUITE_INTELSVM_TRUE@common/oclobject.o:
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o common/oclobject.o -c common/oclobject.cpp
+
+@TEST_SUITE_INTELSVM_TRUE@coarse: common
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o SVMBasicCoarseGrained/svmbasic.o -c SVMBasicCoarseGrained/svmbasic.cpp
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir) && $(CXX) $(LDFLAGS) -o SVMBasicCoarseGrained/svmbasic SVMBasicCoarseGrained/svmbasic.o \
+@TEST_SUITE_INTELSVM_TRUE@	common/basic.o  common/cmdparser.o  common/oclobject.o -lOpenCL
+
+@TEST_SUITE_INTELSVM_TRUE@fine: common
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir) && $(CXX) $(CFLAGS) -std=c++11 -Icommon -o SVMBasicFineGrained/svmbasic.o -c SVMBasicFineGrained/svmbasic.cpp
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir) && $(CXX) $(LDFLAGS) -o SVMBasicFineGrained/svmbasic SVMBasicFineGrained/svmbasic.o \
+@TEST_SUITE_INTELSVM_TRUE@	common/basic.o  common/cmdparser.o  common/oclobject.o -lOpenCL
+
+@TEST_SUITE_INTELSVM_TRUE@test_CoarseGrained:
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir)/SVMBasicCoarseGrained && \
+@TEST_SUITE_INTELSVM_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./svmbasic -p 0 -t all
+
+@TEST_SUITE_INTELSVM_TRUE@test_FineGrained:
+@TEST_SUITE_INTELSVM_TRUE@	cd $(testsuite_src_dir)/SVMBasicFineGrained && \
+@TEST_SUITE_INTELSVM_TRUE@	OCL_ICD_VENDORS=$(abs_top_builddir)/ocl-vendors ./svmbasic -p 0 -t all
+
+@TEST_SUITE_INTELSVM_TRUE@clean-examples:
+@TEST_SUITE_INTELSVM_TRUE@	rm -fr $(testsuite_src_dir)
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/examples/IntelSVM/README b/examples/IntelSVM/README
new file mode 100644
index 0000000..a383140
--- /dev/null
+++ b/examples/IntelSVM/README
@@ -0,0 +1,18 @@
+1) Download intel_ocl_svm_basic_win.zip from:
+     https://software.intel.com/en-us/articles/opencl-20-shared-virtual-memory-code-sample
+   to this folder
+
+3) reconfigure pocl ( ./configure ) with --enable-testsuites=check
+   or --enable-testsuites=IntelSVM
+
+5) call 'make prepare-examples' once to get the examples built
+   Note: you need OpenCL 2.0 headers installed in your system for this
+   step to succeed.
+   
+   You can install them from pocl via
+   
+     ./configure --enable-install-opencl-headers
+     make -C include/CL install
+
+6) run tests: make check
+
diff --git a/examples/standalone/CMakeLists.txt b/examples/IntelSVM/intelsvm_CMakeLists.txt
similarity index 66%
copy from examples/standalone/CMakeLists.txt
copy to examples/IntelSVM/intelsvm_CMakeLists.txt
index 42d30d8..e478cc7 100644
--- a/examples/standalone/CMakeLists.txt
+++ b/examples/IntelSVM/intelsvm_CMakeLists.txt
@@ -1,7 +1,7 @@
 #=============================================================================
 #   CMake build system files
 #
-#   Copyright (c) 2014 pocl developers
+#   Copyright (c) 2015 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -23,18 +23,22 @@
 #
 #=============================================================================
 
-#EXTRA_DIST = standalone.cl
+cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
 
-#noinst_DATA = $(EXTRA_DIST:.cl=.bc)
+project(intelsvm)
 
-#.cl.bc:
-# ../../scripts/pocl-standalone -h $(@:.bc=.h) -o $@ $<
+add_compile_options("-std=c++11")
 
-#clean-local:
-# rm -f $(EXTRA_DIST:.cl=.bc) $(EXTRA_DIST:.cl=.h)
+include_directories("common")
 
-add_test("pocl-standalone" "/bin/sh" "${CMAKE_BINARY_DIR}/scripts/pocl-standalone"
-          -h "${CMAKE_BINARY_DIR}/standalone.h"
-          -o "${CMAKE_BINARY_DIR}/standalone.bc"
-          "${CMAKE_CURRENT_SOURCE_DIR}/standalone.cl")
+add_executable(fine SVMBasicFineGrained/svmbasic.cpp
+    common/basic.cpp  common/basic.hpp
+    common/cmdparser.cpp  common/cmdparser.hpp
+    common/oclobject.cpp  common/oclobject.hpp)
+target_link_libraries(fine "${OPENCL_LIBRARIES}")
 
+add_executable(coarse SVMBasicCoarseGrained/svmbasic.cpp
+    common/basic.cpp  common/basic.hpp
+    common/cmdparser.cpp  common/cmdparser.hpp
+    common/oclobject.cpp  common/oclobject.hpp)
+target_link_libraries(coarse "${OPENCL_LIBRARIES}")
diff --git a/examples/Makefile.am b/examples/Makefile.am
index feb3457..271f968 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -31,7 +31,7 @@
 
 SUBDIRS = example1 example1-spir32 example1-spir64 example2 example2a trig \
 	scalarwave standalone opencl-book-samples VexCL ViennaCL Rodinia Parboil \
-	AMD AMDSDK2.9 EinsteinToolkit piglit Halide OpenCV CloverLeaf
+	AMD AMDSDK2.9 AMDSDK3.0 EinsteinToolkit piglit Halide OpenCV CloverLeaf IntelSVM
 
 BASIC_EXAMPLES = example1 example1-spir32 example1-spir64 example2 example2a trig \
 	scalarwave standalone opencl-book-samples EinsteinToolkit
@@ -60,6 +60,9 @@ endif
 if TEST_SUITE_AMDSDK2_9
 EXAMPLES_TO_PREPARE += "AMDSDK2.9 "
 endif
+if TEST_SUITE_AMDSDK3_0
+EXAMPLES_TO_PREPARE += "AMDSDK3.0 "
+endif
 if TEST_SUITE_PIGLIT
 EXAMPLES_TO_PREPARE += "piglit "
 endif
@@ -69,7 +72,9 @@ endif
 if TEST_SUITE_CLOVERLEAF
 EXAMPLES_TO_PREPARE += "CloverLeaf "
 endif
-
+if TEST_SUITE_INTELSVM
+EXAMPLES_TO_PREPARE += "IntelSVM "
+endif
 if TEST_SUITE_OPENCV
 EXAMPLES_TO_PREPARE += "OpenCV "
 endif
diff --git a/examples/Makefile.in b/examples/Makefile.in
index ff82c22..4e8c849 100644
--- a/examples/Makefile.in
+++ b/examples/Makefile.in
@@ -125,10 +125,12 @@ target_triplet = @target@
 @TEST_SUITE_PARBOIL_TRUE@am__append_4 = "Parboil "
 @TEST_SUITE_AMD_TRUE@am__append_5 = "AMD "
 @TEST_SUITE_AMDSDK2_9_TRUE@am__append_6 = "AMDSDK2.9 "
-@TEST_SUITE_PIGLIT_TRUE@am__append_7 = "piglit "
-@TEST_SUITE_HALIDE_TRUE@am__append_8 = "Halide "
-@TEST_SUITE_CLOVERLEAF_TRUE@am__append_9 = "CloverLeaf "
-@TEST_SUITE_OPENCV_TRUE@am__append_10 = "OpenCV "
+@TEST_SUITE_AMDSDK3_0_TRUE@am__append_7 = "AMDSDK3.0 "
+@TEST_SUITE_PIGLIT_TRUE@am__append_8 = "piglit "
+@TEST_SUITE_HALIDE_TRUE@am__append_9 = "Halide "
+@TEST_SUITE_CLOVERLEAF_TRUE@am__append_10 = "CloverLeaf "
+@TEST_SUITE_INTELSVM_TRUE@am__append_11 = "IntelSVM "
+@TEST_SUITE_OPENCV_TRUE@am__append_12 = "OpenCV "
 subdir = examples
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
@@ -269,6 +271,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -276,6 +279,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -293,8 +297,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -370,6 +372,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -431,7 +434,7 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 SUBDIRS = example1 example1-spir32 example1-spir64 example2 example2a trig \
 	scalarwave standalone opencl-book-samples VexCL ViennaCL Rodinia Parboil \
-	AMD AMDSDK2.9 EinsteinToolkit piglit Halide OpenCV CloverLeaf
+	AMD AMDSDK2.9 AMDSDK3.0 EinsteinToolkit piglit Halide OpenCV CloverLeaf IntelSVM
 
 BASIC_EXAMPLES = example1 example1-spir32 example1-spir64 example2 example2a trig \
 	scalarwave standalone opencl-book-samples EinsteinToolkit
@@ -443,7 +446,7 @@ EXTRA_DIST = CMakeLists.txt PyOpenCL/README
 EXAMPLES_TO_PREPARE = $(am__append_1) $(am__append_2) $(am__append_3) \
 	$(am__append_4) $(am__append_5) $(am__append_6) \
 	$(am__append_7) $(am__append_8) $(am__append_9) \
-	$(am__append_10)
+	$(am__append_10) $(am__append_11) $(am__append_12)
 all: all-recursive
 
 .SUFFIXES:
diff --git a/examples/OpenCV/CMakeLists.txt b/examples/OpenCV/CMakeLists.txt
new file mode 100644
index 0000000..d8c4b10
--- /dev/null
+++ b/examples/OpenCV/CMakeLists.txt
@@ -0,0 +1,332 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "OpenCV")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+
+set(OPENCV "opencv-3.0.0-beta")
+set(OPENCV_ZIP "${TS_SRCDIR}/${OPENCV}.zip")
+
+if(EXISTS "${OPENCV_ZIP}")
+
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    # Download URL is "https://github.com/Itseez/opencv/archive/${OPENCV_ZIP}"
+    # but for some reason it doesn't work from cmake.
+    DOWNLOAD_COMMAND pwd && unzip "${OPENCV_ZIP}"
+      && patch -p1 -i "${CMAKE_CURRENT_SOURCE_DIR}/opencv.patch"
+      && rmdir OpenCV && mv ${OPENCV} OpenCV
+
+    # TODO this should be optimized to only build
+    # the stuff required for testing OpenCL
+    CMAKE_ARGS -DCMAKE_BUILD_TYPE=RelWithDebInfo
+      -DWITH_CUDA=OFF
+      -DWITH_OPENCL=ON
+      -DWITH_FFMPEG=OFF
+      -DBUILD_TESTS=ON
+      -DBUILD_PERF_TESTS=ON
+      -DBUILD_EXAMPLES=ON
+      -DBUILD_DOCS=OFF
+
+    INSTALL_COMMAND /bin/true
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+  add_test(NAME test_UMat
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=UMat.*")
+
+  add_test(NAME test_Core_UMat
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=Core_UMat.*")
+
+  add_test(NAME test_Image2D
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=Image2D.*")
+
+  add_test(NAME test_UMat_UMatBasicTests
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=UMat/UMatBasicTests.*")
+
+  add_test(NAME test_UMat_UMatTestReshape
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=UMat/UMatTestReshape.*")
+
+  add_test(NAME test_UMat_UMatTestRoi
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=UMat/UMatTestRoi.*")
+
+  add_test(NAME test_UMat_UMatTestSizeOperations
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=UMat/UMatTestSizeOperations.*")
+
+  add_test(NAME test_UMat_UMatTestUMatOperations
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=UMat/UMatTestUMatOperations.*")
+
+  add_test(NAME test_OCL_MeanStdDev_
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_MeanStdDev_.*")
+
+  add_test(NAME test_OCL_Channels_Merge
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Channels/Merge.*")
+
+  add_test(NAME test_OCL_Channels_Split
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Channels/Split.*")
+
+  add_test(NAME test_OCL_Channels_MixChannels
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Channels/MixChannels.*")
+
+  add_test(NAME test_OCL_Channels_InsertChannels
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Channels/InsertChannels.*")
+
+  add_test(NAME test_OCL_Channels_ExtractChannels
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Channels/ExtractChannels.*")
+
+  add_test(NAME test_OCL_Arithm_Lut
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Lut.*")
+
+  add_test(NAME test_OCL_Arithm_Add
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Add.*")
+
+  add_test(NAME test_OCL_Arithm_Subtract  # filter must spell the suite "Subtract"; a non-matching filter runs 0 tests
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Subtract.*")
+
+  add_test(NAME test_OCL_Arithm_Mul
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Mul.*")
+
+  add_test(NAME test_OCL_Arithm_Div
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Div.*")
+
+  add_test(NAME test_OCL_Arithm_AddWeighted
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/AddWeighted.*")
+
+  add_test(NAME test_OCL_Arithm_Min
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Min.*")
+
+  add_test(NAME test_OCL_Arithm_Max
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Max.*")
+
+  add_test(NAME test_OCL_Arithm_Absdiff
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Absdiff.*")
+
+  add_test(NAME test_OCL_Arithm_CartToPolar
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/CartToPolar.*")
+
+  add_test(NAME test_OCL_Arithm_PolarToCart
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/PolarToCart.*")
+
+  add_test(NAME test_OCL_Arithm_Transpose
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Transpose.*")
+
+  add_test(NAME test_OCL_Arithm_Bitwise_and  # gtest filters are case-sensitive; same casing as Bitwise_or/xor/not
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Bitwise_and.*")
+
+  add_test(NAME test_OCL_Arithm_Bitwise_or
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Bitwise_or.*")
+
+  add_test(NAME test_OCL_Arithm_Bitwise_xor
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Bitwise_xor.*")
+
+  add_test(NAME test_OCL_Arithm_Bitwise_not
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Bitwise_not.*")
+
+  add_test(NAME test_OCL_Arithm_Compare
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Compare.*")
+
+  add_test(NAME test_OCL_Arithm_Pow
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Pow.*")
+
+  add_test(NAME test_OCL_Arithm_SetIdentity
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/SetIdentity.*")
+
+  add_test(NAME test_OCL_Arithm_Repeat
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Repeat.*")
+
+  add_test(NAME test_OCL_Arithm_CountNonZero
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/CountNonZero.*")
+
+  add_test(NAME test_OCL_Arithm_Sum
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Sum.*")
+
+  add_test(NAME test_OCL_Arithm_MeanStdDev
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/MeanStdDev.*")
+
+  add_test(NAME test_OCL_Arithm_Log
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Log.*")
+
+  add_test(NAME test_OCL_Arithm_Exp
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Exp.*")
+
+  add_test(NAME test_OCL_Arithm_Phase
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Phase.*")
+
+  add_test(NAME test_OCL_Arithm_Magnitude
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Magnitude.*")
+
+  add_test(NAME test_OCL_Arithm_Flip
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Flip.*")
+
+  add_test(NAME test_OCL_Arithm_MinMaxIdx
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/MinMaxIdx.*")
+
+  add_test(NAME test_OCL_Arithm_MinMaxIdx_Mask
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/MinMaxIdx_Mask.*")
+
+  add_test(NAME test_OCL_Arithm_Norm
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Norm.*")
+
+  add_test(NAME test_OCL_Arithm_UMatDot
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/UMatDot.*")
+
+  add_test(NAME test_OCL_Arithm_Sqrt
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Sqrt.*")
+
+  add_test(NAME test_OCL_Arithm_Normalize
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Normalize.*")
+
+  add_test(NAME test_OCL_Arithm_InRange
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/InRange.*")
+
+  add_test(NAME test_OCL_Arithm_ConvertScaleAbs
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/ConvertScaleAbs.*")
+
+  add_test(NAME test_OCL_Arithm_ScaleAdd
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/ScaleAdd.*")
+
+  add_test(NAME test_OCL_Arithm_PatchNaNs
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/PatchNaNs.*")
+
+  add_test(NAME test_OCL_Arithm_Psnr
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/Psnr.*")
+
+  add_test(NAME test_OCL_Arithm_ReduceSum
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/ReduceSum.*")
+
+  add_test(NAME test_OCL_Arithm_ReduceMax
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/ReduceMax.*")
+
+  add_test(NAME test_OCL_Arithm_ReduceMin
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/ReduceMin.*")
+
+  add_test(NAME test_OCL_Arithm_ReduceAvg
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Arithm/ReduceAvg.*")
+
+  add_test(NAME test_OCL_Core_Gemm
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Core/Gemm.*")
+
+  add_test(NAME test_OCL_Core_Dft
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_Core/Dft.*")
+
+  add_test(NAME test_OCL_OCL_ImgProc_MultiSpectrums  # CTest name kept; NOTE(review): OpenCV suite is "MulSpectrums" - confirm
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_OCL_ImgProc/MulSpectrums.*")
+
+  add_test(NAME test_OCL_MatrixOperation_ConvertTo
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_MatrixOperation/ConvertTo.*")
+
+  add_test(NAME test_OCL_MatrixOperation_CopyTo
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_MatrixOperation/CopyTo.*")
+
+  add_test(NAME test_OCL_MatrixOperation_SetTo
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_MatrixOperation/SetTo.*")
+
+  add_test(NAME test_OCL_MatrixOperation_UMatExpr
+     COMMAND "${TS_BUILDDIR}/bin/opencv_test_core" "--gtest_filter=OCL_MatrixOperation/UMatExpr.*")
+
+  set_tests_properties(
+    test_UMat
+    test_Core_UMat
+    test_Image2D
+    test_UMat_UMatBasicTests
+    test_UMat_UMatTestReshape
+    test_UMat_UMatTestRoi
+    test_UMat_UMatTestSizeOperations
+    test_UMat_UMatTestUMatOperations
+    test_OCL_MeanStdDev_
+    test_OCL_Channels_Merge
+    test_OCL_Channels_Split
+    test_OCL_Channels_MixChannels
+    test_OCL_Channels_InsertChannels
+    test_OCL_Channels_ExtractChannels
+    test_OCL_Arithm_Lut
+    test_OCL_Arithm_Add
+    test_OCL_Arithm_Subtract
+    test_OCL_Arithm_Mul
+    test_OCL_Arithm_Div
+    test_OCL_Arithm_AddWeighted
+    test_OCL_Arithm_Min
+    test_OCL_Arithm_Max
+    test_OCL_Arithm_Absdiff
+    test_OCL_Arithm_CartToPolar
+    test_OCL_Arithm_PolarToCart
+    test_OCL_Arithm_Transpose
+    test_OCL_Arithm_Bitwise_and
+    test_OCL_Arithm_Bitwise_or
+    test_OCL_Arithm_Bitwise_xor
+    test_OCL_Arithm_Bitwise_not
+    test_OCL_Arithm_Compare
+    test_OCL_Arithm_Pow
+    test_OCL_Arithm_SetIdentity
+    test_OCL_Arithm_Repeat
+    test_OCL_Arithm_CountNonZero
+    test_OCL_Arithm_Sum
+    test_OCL_Arithm_MeanStdDev
+    test_OCL_Arithm_Log
+    test_OCL_Arithm_Exp
+    test_OCL_Arithm_Phase
+    test_OCL_Arithm_Magnitude
+    test_OCL_Arithm_Flip
+    test_OCL_Arithm_MinMaxIdx
+    test_OCL_Arithm_MinMaxIdx_Mask
+    test_OCL_Arithm_Norm
+    test_OCL_Arithm_UMatDot
+    test_OCL_Arithm_Sqrt
+    test_OCL_Arithm_Normalize
+    test_OCL_Arithm_InRange
+    test_OCL_Arithm_ConvertScaleAbs
+    test_OCL_Arithm_ScaleAdd
+    test_OCL_Arithm_PatchNaNs
+    test_OCL_Arithm_Psnr
+    test_OCL_Arithm_ReduceSum
+    test_OCL_Arithm_ReduceMax
+    test_OCL_Arithm_ReduceMin
+    test_OCL_Arithm_ReduceAvg
+    test_OCL_Core_Gemm
+    test_OCL_Core_Dft
+    test_OCL_OCL_ImgProc_MultiSpectrums
+    test_OCL_MatrixOperation_ConvertTo
+    test_OCL_MatrixOperation_CopyTo
+    test_OCL_MatrixOperation_SetTo
+    test_OCL_MatrixOperation_UMatExpr
+
+    PROPERTIES
+      LABELS "OpenCV")
+
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required files not found" )
+
+endif()
diff --git a/examples/OpenCV/Makefile.in b/examples/OpenCV/Makefile.in
index 2b8bc70..a60923d 100644
--- a/examples/OpenCV/Makefile.in
+++ b/examples/OpenCV/Makefile.in
@@ -192,6 +192,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -199,6 +200,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -216,8 +218,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -293,6 +293,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/OpenCV/opencv.patch b/examples/OpenCV/opencv.patch
new file mode 100644
index 0000000..78aea87
--- /dev/null
+++ b/examples/OpenCV/opencv.patch
@@ -0,0 +1,13 @@
+diff --git a/opencv-3.0.0-beta/cmake/OpenCVDetectOpenCL.cmake b/opencv-3.0.0-beta/cmake/OpenCVDetectOpenCL.cmake
+
+--- a/opencv-3.0.0-beta/cmake/OpenCVDetectOpenCL.cmake
++++ b/opencv-3.0.0-beta/cmake/OpenCVDetectOpenCL.cmake
+@@ -7,7 +7,7 @@
+ else(APPLE)
+   set(OPENCL_FOUND YES)
+   set(HAVE_OPENCL_STATIC OFF)
+-  set(OPENCL_INCLUDE_DIR "${OpenCV_SOURCE_DIR}/3rdparty/include/opencl/1.2")
++  set(OPENCL_INCLUDE_DIR "${OpenCV_SOURCE_DIR}/../../../include/")
+ endif(APPLE)
+ 
+ if(OPENCL_FOUND)
diff --git a/examples/Parboil/CMakeLists.txt b/examples/Parboil/CMakeLists.txt
new file mode 100644
index 0000000..6fca393
--- /dev/null
+++ b/examples/Parboil/CMakeLists.txt
@@ -0,0 +1,149 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "Parboil")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+
+set(PB_DRIVER "${TS_SRCDIR}/pb2.5driver.tgz")
+set(PB_BENCH "${TS_SRCDIR}/pb2.5benchmarks.tgz")
+set(PB_DATASET "${TS_SRCDIR}/pb2.5datasets_standard.tgz")
+set(PB_MAKE "${CMAKE_BINARY_DIR}/Parboil.makefile")
+
+if (EXISTS "${PB_DRIVER}" AND
+    EXISTS "${PB_BENCH}" AND
+    EXISTS "${PB_DATASET}")
+
+  # Parboil is not python 3 compatible
+  find_package(PythonInterp 2.7)
+  if (NOT PYTHONINTERP_FOUND)
+    message(FATAL_ERROR "Parboil testsuite requires python 2.7, can't find it")
+  endif()
+
+  file(WRITE "${PB_MAKE}" "OPENCL_PATH=${CMAKE_SOURCE_DIR}\n")
+  #file(APPEND "${PB_MAKE}" "OPENCL_LIB_PATH=/usr/lib\n")
+
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    DOWNLOAD_COMMAND pwd &&
+      tar xvzf "${PB_DRIVER}" && cd parboil &&
+      tar xvzf "${PB_BENCH}" &&
+      tar xvzf "${PB_DATASET}" && cd .. &&
+      rmdir Parboil && mv parboil Parboil &&
+      cp "${PB_MAKE}" ./Parboil/common/Makefile.conf
+
+    CONFIGURE_COMMAND /bin/true
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND pwd &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile spmv opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile stencil opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile tpacf opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile cutcp opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile mri-gridding opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile sad opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile bfs opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile histo opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile sgemm opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile lbm opencl_base &&
+      "${PYTHON_EXECUTABLE}" ./parboil compile mri-q opencl
+
+    INSTALL_COMMAND /bin/true
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+  add_test(NAME parboil_spmv
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           spmv opencl_base small
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_stencil
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           stencil opencl_base small
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_tpacf
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           tpacf opencl_base small
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_cutcp
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           cutcp opencl_base small
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_mri_gridding
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           mri-gridding opencl_base small
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_sad
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           sad opencl_base default
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_bfs
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           bfs opencl_base NY
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_histo
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           histo opencl_base default
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_sgemm
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           sgemm opencl_base medium
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_mri_q
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           mri-q opencl small
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+  add_test(NAME parboil_lbm
+           COMMAND "${PYTHON_EXECUTABLE}" ./parboil run
+           lbm opencl_base short
+           WORKING_DIRECTORY "${TS_BUILDDIR}")
+
+  set_tests_properties(
+    parboil_spmv
+    parboil_stencil
+    parboil_tpacf
+    parboil_cutcp
+    parboil_mri_gridding
+    parboil_sad
+    parboil_bfs
+    parboil_histo
+    parboil_sgemm
+    parboil_mri_q
+    parboil_lbm
+
+    PROPERTIES
+      LABELS "Parboil")
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required files not found" )
+
+endif()
diff --git a/examples/Parboil/Makefile.in b/examples/Parboil/Makefile.in
index 1d45f4a..55a34b3 100644
--- a/examples/Parboil/Makefile.in
+++ b/examples/Parboil/Makefile.in
@@ -192,6 +192,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -199,6 +200,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -216,8 +218,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -293,6 +293,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/PyOpenCL/README b/examples/PyOpenCL/README
index 5a6dae9..baca996 100644
--- a/examples/PyOpenCL/README
+++ b/examples/PyOpenCL/README
@@ -1,7 +1,7 @@
 Installation:
 
 $ git clone --recursive git://github.com/inducer/pyopencl
-$ apt-get install python-pip python-virtualenv
+$ apt-get install python-pip python-virtualenv python-numpy python-mako
 
   (or equivalent, for example:
   curl -O https://raw.github.com/pypa/pip/master/contrib/get-pip.py
diff --git a/examples/Rodinia/CMakeLists.txt b/examples/Rodinia/CMakeLists.txt
new file mode 100644
index 0000000..ddff6fa
--- /dev/null
+++ b/examples/Rodinia/CMakeLists.txt
@@ -0,0 +1,115 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "Rodinia")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+set(RODINIA "rodinia_3.1")
+set(RODINIA_TGZ "${TS_SRCDIR}/${RODINIA}.tar.bz2")
+
+if (EXISTS "${RODINIA_TGZ}")
+
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    DOWNLOAD_COMMAND test -d ${RODINIA} || (pwd && echo "extracting rodinia tgz"  # skip if already unpacked
+      && tar xjf "${RODINIA_TGZ}"
+      && patch -p1 -r- -N -i "${CMAKE_CURRENT_SOURCE_DIR}/Rodinia.patch"  # -i must directly precede the patch file
+      && rmdir Rodinia && mv ${RODINIA} Rodinia)  # replace the Rodinia dir with the unpacked tree
+    CONFIGURE_COMMAND /bin/true
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND pwd && make OPENCL
+      "CFLAGS=-Wno-unused-result -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+      "OPENCL_INC=${CMAKE_SOURCE_DIR}/include"
+    INSTALL_COMMAND /bin/true
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+  add_test(NAME rodinia_backprop                          COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/backprop")
+  add_test(NAME rodinia_bfs                               COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/bfs")
+  add_test(NAME rodinia_euler3d                           COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/cfd")
+  add_test(NAME rodinia_gaussian                          COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/gaussian")
+  add_test(NAME rodinia_heartwall                         COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/heartwall")
+  add_test(NAME rodinia_hotspot                           COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/hotspot")
+  add_test(NAME rodinia_kmeans                            COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/kmeans")
+  add_test(NAME rodinia_lavaMD                            COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/lavaMD")
+  add_test(NAME rodinia_leukocyte                         COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/leukocyte/OpenCL")
+  add_test(NAME rodinia_lud                               COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/lud/ocl")
+  add_test(NAME rodinia_nn                                COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/nn")
+  add_test(NAME rodinia_nw                                COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/nw")
+  add_test(NAME rodinia_particlefilter                    COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/particlefilter")
+  add_test(NAME rodinia_pathfinder                        COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/pathfinder")
+  add_test(NAME rodinia_srad                              COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/srad")
+  add_test(NAME rodinia_streamcluster                     COMMAND ./run
+           WORKING_DIRECTORY "${TS_BUILDDIR}/opencl/streamcluster")
+
+  set_tests_properties(
+    rodinia_backprop
+    rodinia_bfs
+    rodinia_euler3d
+    rodinia_gaussian
+    rodinia_heartwall
+    rodinia_hotspot
+    rodinia_kmeans
+    rodinia_lavaMD
+    rodinia_leukocyte
+    rodinia_lud
+    rodinia_nn
+    rodinia_nw
+    rodinia_particlefilter
+    rodinia_pathfinder
+    rodinia_srad
+    rodinia_streamcluster
+
+    PROPERTIES
+      LABELS "Rodinia")
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required files not found" )
+
+endif()
diff --git a/examples/Rodinia/Makefile.in b/examples/Rodinia/Makefile.in
index 3475dfd..8fd54a2 100644
--- a/examples/Rodinia/Makefile.in
+++ b/examples/Rodinia/Makefile.in
@@ -193,6 +193,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -200,6 +201,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -217,8 +219,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -294,6 +294,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/Rodinia/Rodinia.patch b/examples/Rodinia/Rodinia.patch
index ab15841..daee038 100644
--- a/examples/Rodinia/Rodinia.patch
+++ b/examples/Rodinia/Rodinia.patch
@@ -1,16 +1,13 @@
-diff -uNr a/rodinia_2.0.1/opencl/pathfinder/OpenCL.cpp b/rodinia_2.0.1/opencl/pathfinder/OpenCL.cpp
---- a/rodinia_2.0.1/opencl/pathfinder/OpenCL.cpp	2011-09-20 16:45:58.000000000 +0300
-+++ b/rodinia_2.0.1/opencl/pathfinder/OpenCL.cpp	2014-02-14 13:00:35.137395239 +0200
-@@ -120,10 +120,10 @@
- 	source_size = ftell(theFile);
- 	rewind(theFile);
- 	// Read in the file.
--	source_str = (char*) malloc(sizeof(char) * source_size);
-+	source_str = (char*) malloc(sizeof(char) * source_size + 1);
- 	fread(source_str, 1, source_size, theFile);
- 	fclose(theFile);
--
-+	source_str[source_size] = 0;
- 	// Create a program from the kernel source.
- 	program = clCreateProgramWithSource(context,
- 	                                    1,
+--- a/rodinia_3.1/opencl/leukocyte/OpenCL/Makefile	2015-12-19 19:13:31.078863235 +0100
++++ b/rodinia_3.1/opencl/leukocyte/OpenCL/Makefile	2015-12-19 19:15:37.812197722 +0100
+@@ -22,8 +22,8 @@
+ leukocyte: detect_main.o avilib.o find_ellipse.o find_ellipse_opencl.o track_ellipse.o track_ellipse_opencl.o misc_math.o OpenCL_helper_library.o $(MATRIX_DIR)/meschach.a
+ 	$(CC) $(CC_FLAGS) -lm avilib.o find_ellipse.o find_ellipse_opencl.o track_ellipse.o track_ellipse_opencl.o misc_math.o OpenCL_helper_library.o detect_main.o -o leukocyte $(MATRIX_DIR)/meschach.a -L$(OPENCL_LIB) -lm -lOpenCL
+ 
+-%.o: %.[ch]
+-	$(CC) $(OUTPUT) $(CC_FLAGS) $< -c
++%.o: %.c
++	$(CC) $(OUTPUT) $(CC_FLAGS) -o $@ -c $<
+ 
+ detect_main.o: detect_main.c find_ellipse.h track_ellipse.h avilib.h
+ 
diff --git a/examples/VexCL/CMakeLists.txt b/examples/VexCL/CMakeLists.txt
new file mode 100644
index 0000000..377b8ac
--- /dev/null
+++ b/examples/VexCL/CMakeLists.txt
@@ -0,0 +1,113 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "VexCL")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+
+message(STATUS "Enabling testsuite ${TS_NAME}")
+list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+ExternalProject_Add(
+  ${TS_NAME}
+  PREFIX "${TS_BASEDIR}"
+  GIT_REPOSITORY "https://github.com/ddemidov/vexcl.git"
+  CMAKE_ARGS
+    -DVEXCL_CACHE_KERNELS=OFF
+    -DCMAKE_BUILD_TYPE=RelWithDebInfo
+    -DVEXCL_BACKEND=OpenCL
+  INSTALL_COMMAND /bin/true
+)
+
+set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+add_dependencies(prepare_examples ${TS_NAME})
+
+add_test(NAME vexcl_boost_version
+         COMMAND "${TS_BUILDDIR}/tests/boost_version")
+add_test(NAME vexcl_types
+         COMMAND "${TS_BUILDDIR}/tests/types")
+add_test(NAME vexcl_deduce
+         COMMAND "${TS_BUILDDIR}/tests/deduce")
+add_test(NAME vexcl_context
+         COMMAND "${TS_BUILDDIR}/tests/context")
+add_test(NAME vexcl_vector_create
+         COMMAND "${TS_BUILDDIR}/tests/vector_create")
+add_test(NAME vexcl_vector_copy
+         COMMAND "${TS_BUILDDIR}/tests/vector_copy")
+add_test(NAME vexcl_vector_arithmetics
+         COMMAND "${TS_BUILDDIR}/tests/vector_arithmetics")
+add_test(NAME vexcl_vector_view
+         COMMAND "${TS_BUILDDIR}/tests/vector_view")
+add_test(NAME vexcl_tensordot
+         COMMAND "${TS_BUILDDIR}/tests/tensordot")
+add_test(NAME vexcl_vector_pointer
+         COMMAND "${TS_BUILDDIR}/tests/vector_pointer")
+add_test(NAME vexcl_tagged_terminal
+         COMMAND "${TS_BUILDDIR}/tests/tagged_terminal")
+add_test(NAME vexcl_temporary
+         COMMAND "${TS_BUILDDIR}/tests/temporary")
+add_test(NAME vexcl_cast
+         COMMAND "${TS_BUILDDIR}/tests/cast")
+add_test(NAME vexcl_multivector_create
+         COMMAND "${TS_BUILDDIR}/tests/multivector_create")
+add_test(NAME vexcl_multivector_arithmetics
+         COMMAND "${TS_BUILDDIR}/tests/multivector_arithmetics")
+add_test(NAME vexcl_multi_array
+         COMMAND "${TS_BUILDDIR}/tests/multi_array")
+add_test(NAME vexcl_spmv
+         COMMAND "${TS_BUILDDIR}/tests/spmv")
+add_test(NAME vexcl_stencil
+         COMMAND "${TS_BUILDDIR}/tests/stencil")
+add_test(NAME vexcl_generator
+         COMMAND "${TS_BUILDDIR}/tests/generator")
+add_test(NAME vexcl_mba
+         COMMAND "${TS_BUILDDIR}/tests/mba")
+add_test(NAME vexcl_random
+         COMMAND "${TS_BUILDDIR}/tests/random")
+add_test(NAME vexcl_sort
+         COMMAND "${TS_BUILDDIR}/tests/sort")
+add_test(NAME vexcl_scan
+         COMMAND "${TS_BUILDDIR}/tests/scan")
+add_test(NAME vexcl_scan_by_key
+         COMMAND "${TS_BUILDDIR}/tests/scan_by_key")
+add_test(NAME vexcl_reduce_by_key
+         COMMAND "${TS_BUILDDIR}/tests/reduce_by_key")
+add_test(NAME vexcl_logical
+         COMMAND "${TS_BUILDDIR}/tests/logical")
+add_test(NAME vexcl_threads
+         COMMAND "${TS_BUILDDIR}/tests/threads")
+
+set_tests_properties(vexcl_boost_version vexcl_types vexcl_deduce
+  vexcl_context vexcl_vector_create vexcl_vector_copy
+  vexcl_vector_arithmetics vexcl_vector_view vexcl_tensordot
+  vexcl_vector_pointer vexcl_tagged_terminal vexcl_temporary
+  vexcl_cast vexcl_multivector_create vexcl_multivector_arithmetics
+  vexcl_multi_array vexcl_spmv vexcl_stencil vexcl_generator
+  vexcl_mba vexcl_random vexcl_sort vexcl_scan vexcl_scan_by_key
+  vexcl_reduce_by_key vexcl_logical vexcl_threads
+  PROPERTIES
+    LABELS "VexCL")
diff --git a/examples/VexCL/Makefile.in b/examples/VexCL/Makefile.in
index 96339b2..18cc3f5 100644
--- a/examples/VexCL/Makefile.in
+++ b/examples/VexCL/Makefile.in
@@ -192,6 +192,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -199,6 +200,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -216,8 +218,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -293,6 +293,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/ViennaCL/CMakeLists.txt b/examples/ViennaCL/CMakeLists.txt
new file mode 100644
index 0000000..6439f8c
--- /dev/null
+++ b/examples/ViennaCL/CMakeLists.txt
@@ -0,0 +1,298 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "ViennaCL")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+set(VIENNA "ViennaCL-1.7.0")
+set(ViennaCL_TGZ "${TS_SRCDIR}/${VIENNA}.tar.gz")
+
+if(EXISTS "${ViennaCL_TGZ}")
+
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    SOURCE_DIR "${TS_BASEDIR}/src/${VIENNA}"
+    DOWNLOAD_COMMAND pwd &&
+      test -d "${TS_BASEDIR}/src/${VIENNA}" || tar xzf "${ViennaCL_TGZ}"
+    PATCH_COMMAND pwd && patch -N -r- -p1 -i
+      ${CMAKE_CURRENT_SOURCE_DIR}/vienna_170.patch || /bin/true
+    UPDATE_COMMAND /bin/true
+    CMAKE_ARGS
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo
+      -DBUILD_TESTING:BOOL=ON
+      -DBUILD_EXAMPLES:BOOL=ON
+      -DENABLE_OPENCL:BOOL=ON
+      -DENABLE_CUDA:BOOL=OFF
+      -DENABLE_UBLAS:BOOL=OFF
+      "-DCMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-g -O0 -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+    INSTALL_COMMAND /bin/true
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+  add_test(NAME viennacl_examples_amg
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/amg"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+#  add_test(NAME viennacl_examples_bandwidth_reduction
+#          COMMAND "${TS_BUILDDIR}/examples/tutorial/bandwidth-reduction"
+#          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_custom_kernels
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/custom-kernels"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_blas1
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/blas1"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_custom_context
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/custom-context"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_bisect
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/bisect"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_fft
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/fft"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_iterative_custom
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/iterative-custom"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_matrix_free
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/matrix-free"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_nmf
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/nmf"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_viennacl_info
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/viennacl-info"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_wrap_host_buffer
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/wrap-host-buffer"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_scheduler
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/scheduler"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_examples_libviennacl_tutorial
+          COMMAND "${TS_BUILDDIR}/examples/tutorial/libviennacl-tutorial"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+
+  add_test(NAME viennacl_benchmarks_opencl_bench_opencl
+          COMMAND "${TS_BUILDDIR}/examples/benchmarks/opencl-bench-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_benchmarks_dense_blas_bench_opencl
+          COMMAND "${TS_BUILDDIR}/examples/benchmarks/dense_blas-bench-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+
+  add_test(NAME viennacl_tests_bisect_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/bisect-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_fft_1d_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/fft_1d-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_external_linkage_opencl
+          COMMAND "${TS_BUILDDIR}/tests/external_linkage-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_blas3_solve_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/blas3_solve-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_fft_2d_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/fft_2d-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_libviennacl_blas1_test
+          COMMAND "${TS_BUILDDIR}/tests/libviennacl_blas1-test"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_iterators_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/iterators-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_global_variables_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/global_variables-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_libviennacl_blas2_test
+          COMMAND "${TS_BUILDDIR}/tests/libviennacl_blas2-test"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_libviennacl_blas3_test
+          COMMAND "${TS_BUILDDIR}/tests/libviennacl_blas3-test"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_col_double_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_col_double-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_col_float_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_col_float-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_col_int_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_col_int-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_product_double_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_product_double-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_convert_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_convert-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_product_float_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_product_float-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_row_double_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_row_double-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_row_float_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_row_float-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_row_int_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_row_int-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_vector_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_vector-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_nmf_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/nmf-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_scalar_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/scalar-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_matrix_vector_int_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/matrix_vector_int-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_qr_method_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/qr_method-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_qr_method_func_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/qr_method_func-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_scan_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/scan-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_self_assign_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/self_assign-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_sparse_prod_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/sparse_prod-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_sparse_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/sparse-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_structured_matrices_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/structured-matrices-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_spmdm_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/spmdm-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_svd_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/svd-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_tql_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/tql-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_vector_convert_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/vector_convert-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_vector_float_double_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/vector_float_double-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_vector_multi_inner_prod_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/vector_multi_inner_prod-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_vector_int_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/vector_int-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+  add_test(NAME viennacl_tests_vector_uint_test_opencl
+          COMMAND "${TS_BUILDDIR}/tests/vector_uint-test-opencl"
+          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+
+#  viennacl_examples_bandwidth_reduction
+  set_tests_properties(
+  viennacl_examples_amg
+  viennacl_examples_custom_kernels
+  viennacl_examples_blas1
+  viennacl_examples_custom_context
+  viennacl_examples_bisect
+  viennacl_examples_fft
+  viennacl_examples_iterative_custom
+  viennacl_examples_matrix_free
+  viennacl_examples_nmf
+  viennacl_examples_viennacl_info
+  viennacl_examples_wrap_host_buffer
+  viennacl_examples_scheduler
+  viennacl_examples_libviennacl_tutorial
+    PROPERTIES
+      LABELS "ViennaCL ViennaCL_examples")
+
+  set_tests_properties(
+  viennacl_benchmarks_opencl_bench_opencl
+  viennacl_benchmarks_dense_blas_bench_opencl
+    PROPERTIES
+      LABELS "ViennaCL ViennaCL_benchmarks")
+
+  set_tests_properties(
+  viennacl_tests_bisect_test_opencl
+  viennacl_tests_fft_1d_test_opencl
+  viennacl_tests_external_linkage_opencl
+  viennacl_tests_blas3_solve_test_opencl
+  viennacl_tests_fft_2d_test_opencl
+  viennacl_tests_libviennacl_blas1_test
+  viennacl_tests_iterators_test_opencl
+  viennacl_tests_global_variables_test_opencl
+  viennacl_tests_libviennacl_blas2_test
+  viennacl_tests_libviennacl_blas3_test
+  viennacl_tests_matrix_col_double_test_opencl
+  viennacl_tests_matrix_col_float_test_opencl
+  viennacl_tests_matrix_col_int_test_opencl
+  viennacl_tests_matrix_product_double_test_opencl
+  viennacl_tests_matrix_convert_test_opencl
+  viennacl_tests_matrix_product_float_test_opencl
+  viennacl_tests_matrix_row_double_test_opencl
+  viennacl_tests_matrix_row_float_test_opencl
+  viennacl_tests_matrix_row_int_test_opencl
+  viennacl_tests_matrix_vector_test_opencl
+  viennacl_tests_nmf_test_opencl
+  viennacl_tests_scalar_test_opencl
+  viennacl_tests_matrix_vector_int_test_opencl
+  viennacl_tests_qr_method_test_opencl
+  viennacl_tests_qr_method_func_test_opencl
+  viennacl_tests_scan_test_opencl
+  viennacl_tests_self_assign_test_opencl
+  viennacl_tests_sparse_prod_test_opencl
+  viennacl_tests_sparse_test_opencl
+  viennacl_tests_structured_matrices_test_opencl
+  viennacl_tests_spmdm_test_opencl
+  viennacl_tests_svd_test_opencl
+  viennacl_tests_tql_test_opencl
+  viennacl_tests_vector_convert_test_opencl
+  viennacl_tests_vector_float_double_test_opencl
+  viennacl_tests_vector_multi_inner_prod_test_opencl
+  viennacl_tests_vector_int_test_opencl
+  viennacl_tests_vector_uint_test_opencl
+    PROPERTIES
+      LABELS "ViennaCL ViennaCL_tests")
+
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required files not found" )
+
+endif()
diff --git a/examples/ViennaCL/Makefile.in b/examples/ViennaCL/Makefile.in
index be94001..4f7d85e 100644
--- a/examples/ViennaCL/Makefile.in
+++ b/examples/ViennaCL/Makefile.in
@@ -193,6 +193,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -200,6 +201,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -217,8 +219,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -294,6 +294,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/ViennaCL/vienna_170.patch b/examples/ViennaCL/vienna_170.patch
new file mode 100644
index 0000000..b804101
--- /dev/null
+++ b/examples/ViennaCL/vienna_170.patch
@@ -0,0 +1,11 @@
+--- a/viennacl/device_specific/builtin_database/common.hpp.orig	2016-01-20 16:41:12.219439157 +0100
++++ b/viennacl/device_specific/builtin_database/common.hpp	2016-01-20 16:57:01.459447796 +0100
+@@ -94,7 +94,7 @@
+            viennacl::device_specific::at(
+            viennacl::device_specific::at(
+              viennacl::device_specific::at(map.d, p0).d,
+-           p1).d,
++           (p1 & (~1)) ).d,
+            p2).d,
+          p3).d,
+          p4);
diff --git a/examples/arrayfire/CMakeLists.txt b/examples/arrayfire/CMakeLists.txt
new file mode 100644
index 0000000..d801ab2
--- /dev/null
+++ b/examples/arrayfire/CMakeLists.txt
@@ -0,0 +1,697 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "arrayfire")  # testsuite name; reused as ExternalProject target and GitHub repo name
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")  # passed as the ExternalProject PREFIX
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")  # test/example binaries are run from here
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")  # not referenced again in this file
+
+find_package(LAPACK)
+# Require Boost >= 1.48 here; a numeric Boost_VERSION (e.g. 104700) is never VERSION_LESS "1.48".
+find_package(Boost 1.48)
+if(LAPACK_FOUND AND
+   Boost_FOUND AND (NOT Boost_VERSION VERSION_LESS "1.48"))
+
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+  ExternalProject_Add(
+    ${TS_NAME}  # external project target; also named in set_target_properties/add_dependencies below
+    PREFIX "${TS_BASEDIR}"
+    GIT_REPOSITORY "https://github.com/arrayfire/${TS_NAME}.git"
+    #PATCH_COMMAND  pwd && sed -i "s/CL_DEVICE_TYPE_GPU/CL_DEVICE_TYPE_CPU/g" *.cpp
+    #UPDATE_COMMAND /bin/true
+    CMAKE_ARGS  # cache options handed to the external project's own CMake configure step
+      -DBUILD_GRAPHICS=OFF
+      -DBUILD_CUDA=OFF
+      -DBUILD_CPU=OFF
+      -DBUILD_OPENCL=ON  # only the OpenCL backend is switched on; graphics, CUDA and CPU are off
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo
+      "-DCMAKE_C_FLAGS_RELWITHDEBINFO=-O2 -g -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+      "-DCMAKE_CXX_FLAGS_RELWITHDEBINFO=-O2 -g -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+    INSTALL_COMMAND /bin/true  # install step is a no-op; the tests below run binaries from TS_BUILDDIR
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+
+
+  add_test(NAME arrayfire_tests_approx1_opencl
+           COMMAND "${TS_BUILDDIR}/test/approx1_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_approx2_opencl
+           COMMAND "${TS_BUILDDIR}/test/approx2_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_array_opencl
+           COMMAND "${TS_BUILDDIR}/test/array_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_assign_opencl
+           COMMAND "${TS_BUILDDIR}/test/assign_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_backend_opencl
+           COMMAND "${TS_BUILDDIR}/test/backend_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_basic_c_opencl
+           COMMAND "${TS_BUILDDIR}/test/basic_c_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_basic_opencl
+           COMMAND "${TS_BUILDDIR}/test/basic_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_bilateral_opencl
+           COMMAND "${TS_BUILDDIR}/test/bilateral_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_binary_opencl
+           COMMAND "${TS_BUILDDIR}/test/binary_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_blas_opencl
+           COMMAND "${TS_BUILDDIR}/test/blas_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_cholesky_dense_opencl
+           COMMAND "${TS_BUILDDIR}/test/cholesky_dense_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_complex_opencl
+           COMMAND "${TS_BUILDDIR}/test/complex_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_constant_opencl
+           COMMAND "${TS_BUILDDIR}/test/constant_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_convolve_opencl
+           COMMAND "${TS_BUILDDIR}/test/convolve_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_corrcoef_opencl
+           COMMAND "${TS_BUILDDIR}/test/corrcoef_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_covariance_opencl
+           COMMAND "${TS_BUILDDIR}/test/covariance_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_diagonal_opencl
+           COMMAND "${TS_BUILDDIR}/test/diagonal_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_diff1_opencl
+           COMMAND "${TS_BUILDDIR}/test/diff1_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_diff2_opencl
+           COMMAND "${TS_BUILDDIR}/test/diff2_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_dog_opencl
+           COMMAND "${TS_BUILDDIR}/test/dog_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_dot_opencl
+           COMMAND "${TS_BUILDDIR}/test/dot_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_fast_opencl
+           COMMAND "${TS_BUILDDIR}/test/fast_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_fft_large_opencl
+           COMMAND "${TS_BUILDDIR}/test/fft_large_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_fft_opencl
+           COMMAND "${TS_BUILDDIR}/test/fft_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_fft_real_opencl
+           COMMAND "${TS_BUILDDIR}/test/fft_real_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_fftconvolve_opencl
+           COMMAND "${TS_BUILDDIR}/test/fftconvolve_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_flat_opencl
+           COMMAND "${TS_BUILDDIR}/test/flat_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_flip_opencl
+           COMMAND "${TS_BUILDDIR}/test/flip_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_gaussiankernel_opencl
+           COMMAND "${TS_BUILDDIR}/test/gaussiankernel_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_gen_assign_opencl
+           COMMAND "${TS_BUILDDIR}/test/gen_assign_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_gen_index_opencl
+           COMMAND "${TS_BUILDDIR}/test/gen_index_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_getting_started_opencl
+           COMMAND "${TS_BUILDDIR}/test/getting_started_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_gfor_opencl
+           COMMAND "${TS_BUILDDIR}/test/gfor_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_gloh_nonfree_opencl
+           COMMAND "${TS_BUILDDIR}/test/gloh_nonfree_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_gradient_opencl
+           COMMAND "${TS_BUILDDIR}/test/gradient_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_hamming_opencl
+           COMMAND "${TS_BUILDDIR}/test/hamming_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_harris_opencl
+           COMMAND "${TS_BUILDDIR}/test/harris_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_histogram_opencl
+           COMMAND "${TS_BUILDDIR}/test/histogram_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_homography_opencl
+           COMMAND "${TS_BUILDDIR}/test/homography_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_hsv_rgb_opencl
+           COMMAND "${TS_BUILDDIR}/test/hsv_rgb_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_iir_opencl
+           COMMAND "${TS_BUILDDIR}/test/iir_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_imageio_opencl
+           COMMAND "${TS_BUILDDIR}/test/imageio_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_index_opencl
+           COMMAND "${TS_BUILDDIR}/test/index_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_info_opencl
+           COMMAND "${TS_BUILDDIR}/test/info_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_inverse_dense_opencl
+           COMMAND "${TS_BUILDDIR}/test/inverse_dense_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_iota_opencl
+           COMMAND "${TS_BUILDDIR}/test/iota_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_ireduce_opencl
+           COMMAND "${TS_BUILDDIR}/test/ireduce_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_jit_opencl
+           COMMAND "${TS_BUILDDIR}/test/jit_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_join_opencl
+           COMMAND "${TS_BUILDDIR}/test/join_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_lu_dense_opencl
+           COMMAND "${TS_BUILDDIR}/test/lu_dense_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_manual_memory_test_opencl
+           COMMAND "${TS_BUILDDIR}/test/manual_memory_test_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_match_template_opencl
+           COMMAND "${TS_BUILDDIR}/test/match_template_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_math_opencl
+           COMMAND "${TS_BUILDDIR}/test/math_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_matrix_manipulation_opencl
+           COMMAND "${TS_BUILDDIR}/test/matrix_manipulation_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_mean_opencl
+           COMMAND "${TS_BUILDDIR}/test/mean_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_meanshift_opencl
+           COMMAND "${TS_BUILDDIR}/test/meanshift_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_medfilt_opencl
+           COMMAND "${TS_BUILDDIR}/test/medfilt_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_median_opencl
+           COMMAND "${TS_BUILDDIR}/test/median_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_memory_lock_opencl
+           COMMAND "${TS_BUILDDIR}/test/memory_lock_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_memory_opencl
+           COMMAND "${TS_BUILDDIR}/test/memory_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_missing_opencl
+           COMMAND "${TS_BUILDDIR}/test/missing_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_moddims_opencl
+           COMMAND "${TS_BUILDDIR}/test/moddims_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_morph_opencl
+           COMMAND "${TS_BUILDDIR}/test/morph_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_nearest_neighbour_opencl
+           COMMAND "${TS_BUILDDIR}/test/nearest_neighbour_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_orb_opencl
+           COMMAND "${TS_BUILDDIR}/test/orb_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_qr_dense_opencl
+           COMMAND "${TS_BUILDDIR}/test/qr_dense_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_random_opencl
+           COMMAND "${TS_BUILDDIR}/test/random_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_rank_dense_opencl
+           COMMAND "${TS_BUILDDIR}/test/rank_dense_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_range_opencl
+           COMMAND "${TS_BUILDDIR}/test/range_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_regions_opencl
+           COMMAND "${TS_BUILDDIR}/test/regions_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_reduce_opencl
+           COMMAND "${TS_BUILDDIR}/test/reduce_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_replace_opencl
+           COMMAND "${TS_BUILDDIR}/test/replace_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_reorder_opencl
+           COMMAND "${TS_BUILDDIR}/test/reorder_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_resize_opencl
+           COMMAND "${TS_BUILDDIR}/test/resize_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_sat_opencl
+           COMMAND "${TS_BUILDDIR}/test/sat_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_scan_opencl
+           COMMAND "${TS_BUILDDIR}/test/scan_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_rotate_linear_opencl
+           COMMAND "${TS_BUILDDIR}/test/rotate_linear_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_rotate_opencl
+           COMMAND "${TS_BUILDDIR}/test/rotate_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_select_opencl
+           COMMAND "${TS_BUILDDIR}/test/select_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_set_opencl
+           COMMAND "${TS_BUILDDIR}/test/set_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_sift_nonfree_opencl
+           COMMAND "${TS_BUILDDIR}/test/sift_nonfree_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_sobel_opencl
+           COMMAND "${TS_BUILDDIR}/test/sobel_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_solve_dense_opencl
+           COMMAND "${TS_BUILDDIR}/test/solve_dense_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_shift_opencl
+           COMMAND "${TS_BUILDDIR}/test/shift_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_sort_by_key_opencl
+           COMMAND "${TS_BUILDDIR}/test/sort_by_key_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_sort_index_opencl
+           COMMAND "${TS_BUILDDIR}/test/sort_index_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_sort_opencl
+           COMMAND "${TS_BUILDDIR}/test/sort_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_stdev_opencl
+           COMMAND "${TS_BUILDDIR}/test/stdev_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_susan_opencl
+           COMMAND "${TS_BUILDDIR}/test/susan_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_svd_dense_opencl
+           COMMAND "${TS_BUILDDIR}/test/svd_dense_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_tile_opencl
+           COMMAND "${TS_BUILDDIR}/test/tile_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_translate_opencl
+           COMMAND "${TS_BUILDDIR}/test/translate_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_transpose_inplace_opencl
+           COMMAND "${TS_BUILDDIR}/test/transpose_inplace_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_transpose_opencl
+           COMMAND "${TS_BUILDDIR}/test/transpose_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_triangle_opencl
+           COMMAND "${TS_BUILDDIR}/test/triangle_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_var_opencl
+           COMMAND "${TS_BUILDDIR}/test/var_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_where_opencl
+           COMMAND "${TS_BUILDDIR}/test/where_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_unwrap_opencl
+           COMMAND "${TS_BUILDDIR}/test/unwrap_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_write_opencl
+           COMMAND "${TS_BUILDDIR}/test/write_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_ycbcr_rgb_opencl
+           COMMAND "${TS_BUILDDIR}/test/ycbcr_rgb_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  add_test(NAME arrayfire_tests_wrap_opencl
+           COMMAND "${TS_BUILDDIR}/test/wrap_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+
+
+  add_test(NAME arrayfire_examples_adaptive_thresholding_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/adaptive_thresholding_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_binary_thresholding_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/binary_thresholding_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_brain_segmentation_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/brain_segmentation_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_edge_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/edge_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_filters_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/filters_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_image_demo_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/image_demo_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_image_editing_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/image_editing_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_morphing_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/morphing_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_optical_flow_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/optical_flow_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_pyramids_opencl
+           COMMAND "${TS_BUILDDIR}/examples/image_processing/pyramids_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+  add_test(NAME arrayfire_examples_bagging_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/bagging_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_deep_belief_net_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/deep_belief_net_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_kmeans_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/kmeans_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_knn_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/knn_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_logistic_regression_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/logistic_regression_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_naive_bayes_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/naive_bayes_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_neural_network_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/neural_network_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_perceptron_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/perceptron_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_rbm_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/rbm_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_softmax_regression_opencl
+           COMMAND "${TS_BUILDDIR}/examples/machine_learning/softmax_regression_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+  add_test(NAME arrayfire_examples_basic_opencl
+           COMMAND "${TS_BUILDDIR}/examples/unified/basic_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/unified")
+  add_test(NAME arrayfire_examples_black_scholes_options_opencl
+           COMMAND "${TS_BUILDDIR}/examples/financial/black_scholes_options_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/financial")
+  add_test(NAME arrayfire_examples_heston_model_opencl
+           COMMAND "${TS_BUILDDIR}/examples/financial/heston_model_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/financial")
+  add_test(NAME arrayfire_examples_monte_carlo_options_opencl
+           COMMAND "${TS_BUILDDIR}/examples/financial/monte_carlo_options_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/financial")
+  add_test(NAME arrayfire_examples_blas_opencl
+           COMMAND "${TS_BUILDDIR}/examples/benchmarks/blas_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/benchmarks")
+  add_test(NAME arrayfire_examples_fft_opencl
+           COMMAND "${TS_BUILDDIR}/examples/benchmarks/fft_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/benchmarks")
+  add_test(NAME arrayfire_examples_pi_opencl
+           COMMAND "${TS_BUILDDIR}/examples/benchmarks/pi_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/benchmarks")
+  add_test(NAME arrayfire_examples_cholesky_opencl
+           COMMAND "${TS_BUILDDIR}/examples/lin_algebra/cholesky_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/lin_algebra")
+  add_test(NAME arrayfire_examples_lu_opencl
+           COMMAND "${TS_BUILDDIR}/examples/lin_algebra/lu_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/lin_algebra")
+  add_test(NAME arrayfire_examples_qr_opencl
+           COMMAND "${TS_BUILDDIR}/examples/lin_algebra/qr_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/lin_algebra")
+  add_test(NAME arrayfire_examples_svd_opencl
+           COMMAND "${TS_BUILDDIR}/examples/lin_algebra/svd_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/lin_algebra")
+  add_test(NAME arrayfire_examples_convolve_opencl
+           COMMAND "${TS_BUILDDIR}/examples/getting_started/convolve_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/getting_started")
+  add_test(NAME arrayfire_examples_integer_opencl
+           COMMAND "${TS_BUILDDIR}/examples/getting_started/integer_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/getting_started")
+  add_test(NAME arrayfire_examples_rainfall_opencl
+           COMMAND "${TS_BUILDDIR}/examples/getting_started/rainfall_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/getting_started")
+  add_test(NAME arrayfire_examples_vectorize_opencl
+           COMMAND "${TS_BUILDDIR}/examples/getting_started/vectorize_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/getting_started")
+  add_test(NAME arrayfire_examples_conway_opencl
+           COMMAND "${TS_BUILDDIR}/examples/graphics/conway_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+  add_test(NAME arrayfire_examples_conway_pretty_opencl
+           COMMAND "${TS_BUILDDIR}/examples/graphics/conway_pretty_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+  add_test(NAME arrayfire_examples_fractal_opencl
+           COMMAND "${TS_BUILDDIR}/examples/graphics/fractal_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+  add_test(NAME arrayfire_examples_histogram_opencl
+           COMMAND "${TS_BUILDDIR}/examples/graphics/histogram_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+  add_test(NAME arrayfire_examples_plot2d_opencl
+           COMMAND "${TS_BUILDDIR}/examples/graphics/plot2d_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+  add_test(NAME arrayfire_examples_plot3_opencl
+           COMMAND "${TS_BUILDDIR}/examples/graphics/plot3_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+  add_test(NAME arrayfire_examples_surface_opencl
+           COMMAND "${TS_BUILDDIR}/examples/graphics/surface_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+  add_test(NAME arrayfire_examples_fast_opencl
+           COMMAND "${TS_BUILDDIR}/examples/computer_vision/fast_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
+  add_test(NAME arrayfire_examples_harris_opencl
+           COMMAND "${TS_BUILDDIR}/examples/computer_vision/harris_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
+  add_test(NAME arrayfire_examples_matching_opencl
+           COMMAND "${TS_BUILDDIR}/examples/computer_vision/matching_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
+  add_test(NAME arrayfire_examples_susan_opencl
+           COMMAND "${TS_BUILDDIR}/examples/computer_vision/susan_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
+  add_test(NAME arrayfire_examples_helloworld_opencl
+           COMMAND "${TS_BUILDDIR}/examples/helloworld/helloworld_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/helloworld")
+  add_test(NAME arrayfire_examples_swe_opencl
+           COMMAND "${TS_BUILDDIR}/examples/pde/swe_opencl"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/pde")
+
+  set_tests_properties(
+    arrayfire_tests_approx1_opencl
+    arrayfire_tests_approx2_opencl
+    arrayfire_tests_array_opencl
+    arrayfire_tests_assign_opencl
+    arrayfire_tests_backend_opencl
+    arrayfire_tests_basic_c_opencl
+    arrayfire_tests_basic_opencl
+    arrayfire_tests_bilateral_opencl
+    arrayfire_tests_binary_opencl
+    arrayfire_tests_blas_opencl
+    arrayfire_tests_cholesky_dense_opencl
+    arrayfire_tests_complex_opencl
+    arrayfire_tests_constant_opencl
+    arrayfire_tests_convolve_opencl
+    arrayfire_tests_corrcoef_opencl
+    arrayfire_tests_covariance_opencl
+    arrayfire_tests_diagonal_opencl
+    arrayfire_tests_diff1_opencl
+    arrayfire_tests_diff2_opencl
+    arrayfire_tests_dog_opencl
+    arrayfire_tests_dot_opencl
+    arrayfire_tests_fast_opencl
+    arrayfire_tests_fft_large_opencl
+    arrayfire_tests_fft_opencl
+    arrayfire_tests_fft_real_opencl
+    arrayfire_tests_fftconvolve_opencl
+    arrayfire_tests_flat_opencl
+    arrayfire_tests_flip_opencl
+    arrayfire_tests_gaussiankernel_opencl
+    arrayfire_tests_gen_assign_opencl
+    arrayfire_tests_gen_index_opencl
+    arrayfire_tests_getting_started_opencl
+    arrayfire_tests_gfor_opencl
+    arrayfire_tests_gloh_nonfree_opencl
+    arrayfire_tests_gradient_opencl
+    arrayfire_tests_hamming_opencl
+    arrayfire_tests_harris_opencl
+    arrayfire_tests_histogram_opencl
+    arrayfire_tests_homography_opencl
+    arrayfire_tests_hsv_rgb_opencl
+    arrayfire_tests_iir_opencl
+    arrayfire_tests_imageio_opencl
+    arrayfire_tests_index_opencl
+    arrayfire_tests_info_opencl
+    arrayfire_tests_inverse_dense_opencl
+    arrayfire_tests_iota_opencl
+    arrayfire_tests_ireduce_opencl
+    arrayfire_tests_jit_opencl
+    arrayfire_tests_join_opencl
+    arrayfire_tests_lu_dense_opencl
+    arrayfire_tests_manual_memory_test_opencl
+    arrayfire_tests_match_template_opencl
+    arrayfire_tests_math_opencl
+    arrayfire_tests_matrix_manipulation_opencl
+    arrayfire_tests_mean_opencl
+    arrayfire_tests_meanshift_opencl
+    arrayfire_tests_medfilt_opencl
+    arrayfire_tests_median_opencl
+    arrayfire_tests_memory_lock_opencl
+    arrayfire_tests_memory_opencl
+    arrayfire_tests_missing_opencl
+    arrayfire_tests_moddims_opencl
+    arrayfire_tests_morph_opencl
+    arrayfire_tests_nearest_neighbour_opencl
+#    arrayfire_tests_ocl_ext_context_opencl
+    arrayfire_tests_orb_opencl
+    arrayfire_tests_qr_dense_opencl
+    arrayfire_tests_random_opencl
+    arrayfire_tests_rank_dense_opencl
+    arrayfire_tests_range_opencl
+    arrayfire_tests_regions_opencl
+    arrayfire_tests_reduce_opencl
+    arrayfire_tests_replace_opencl
+    arrayfire_tests_reorder_opencl
+    arrayfire_tests_resize_opencl
+    arrayfire_tests_sat_opencl
+    arrayfire_tests_scan_opencl
+    arrayfire_tests_rotate_linear_opencl
+    arrayfire_tests_rotate_opencl
+    arrayfire_tests_select_opencl
+    arrayfire_tests_set_opencl
+    arrayfire_tests_sift_nonfree_opencl
+    arrayfire_tests_sobel_opencl
+    arrayfire_tests_solve_dense_opencl
+    arrayfire_tests_shift_opencl
+    arrayfire_tests_sort_by_key_opencl
+    arrayfire_tests_sort_index_opencl
+    arrayfire_tests_sort_opencl
+    arrayfire_tests_stdev_opencl
+    arrayfire_tests_susan_opencl
+    arrayfire_tests_svd_dense_opencl
+#    arrayfire_tests_transform_coordinates_opencl
+#    arrayfire_tests_transform_opencl
+    arrayfire_tests_tile_opencl
+    arrayfire_tests_translate_opencl
+    arrayfire_tests_transpose_inplace_opencl
+    arrayfire_tests_transpose_opencl
+    arrayfire_tests_triangle_opencl
+    arrayfire_tests_var_opencl
+    arrayfire_tests_where_opencl
+    arrayfire_tests_unwrap_opencl
+    arrayfire_tests_write_opencl
+    arrayfire_tests_ycbcr_rgb_opencl
+    arrayfire_tests_wrap_opencl
+    PROPERTIES
+      LABELS "${TS_NAME};${TS_NAME}_tests")  # ';' yields two labels, not one label containing a space
+
+  set_tests_properties(
+    arrayfire_examples_adaptive_thresholding_opencl
+    arrayfire_examples_binary_thresholding_opencl
+    arrayfire_examples_brain_segmentation_opencl
+    arrayfire_examples_edge_opencl
+    arrayfire_examples_filters_opencl
+    arrayfire_examples_image_demo_opencl
+    arrayfire_examples_image_editing_opencl
+    arrayfire_examples_morphing_opencl
+    arrayfire_examples_optical_flow_opencl
+    arrayfire_examples_pyramids_opencl
+    arrayfire_examples_bagging_opencl
+    arrayfire_examples_deep_belief_net_opencl
+    arrayfire_examples_kmeans_opencl
+    arrayfire_examples_knn_opencl
+    arrayfire_examples_logistic_regression_opencl
+    arrayfire_examples_naive_bayes_opencl
+    arrayfire_examples_neural_network_opencl
+    arrayfire_examples_perceptron_opencl
+    arrayfire_examples_rbm_opencl
+    arrayfire_examples_softmax_regression_opencl
+    arrayfire_examples_basic_opencl
+    arrayfire_examples_black_scholes_options_opencl
+    arrayfire_examples_heston_model_opencl
+    arrayfire_examples_monte_carlo_options_opencl
+    arrayfire_examples_blas_opencl
+    arrayfire_examples_fft_opencl
+    arrayfire_examples_pi_opencl
+    arrayfire_examples_cholesky_opencl
+    arrayfire_examples_lu_opencl
+    arrayfire_examples_qr_opencl
+    arrayfire_examples_svd_opencl
+    arrayfire_examples_convolve_opencl
+    arrayfire_examples_integer_opencl
+    arrayfire_examples_rainfall_opencl
+    arrayfire_examples_vectorize_opencl
+    arrayfire_examples_conway_opencl
+    arrayfire_examples_conway_pretty_opencl
+    arrayfire_examples_fractal_opencl
+#    arrayfire_examples_gravity_sim_opencl
+    arrayfire_examples_histogram_opencl
+    arrayfire_examples_plot2d_opencl
+    arrayfire_examples_plot3_opencl
+    arrayfire_examples_surface_opencl
+    arrayfire_examples_fast_opencl
+    arrayfire_examples_harris_opencl
+    arrayfire_examples_matching_opencl
+    arrayfire_examples_susan_opencl
+    arrayfire_examples_helloworld_opencl
+    arrayfire_examples_swe_opencl
+    PROPERTIES
+      LABELS "${TS_NAME};${TS_NAME}_examples")  # ';' yields two labels, not one label containing a space
+
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required files (Boost/LAPACK) not found" )
+
+endif()
+
+# Disabled tests / examples, kept for reference; their names are also commented out in the lists above.
+
+  #~ add_test(NAME arrayfire_tests_ocl_ext_context_opencl
+           #~ COMMAND "${TS_BUILDDIR}/test/ocl_ext_context_opencl"
+           #~ WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+
+  #~ add_test(NAME arrayfire_tests_transform_opencl
+           #~ COMMAND "${TS_BUILDDIR}/test/transform_opencl"
+           #~ WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+  #~ add_test(NAME arrayfire_tests_transform_coordinates_opencl
+           #~ COMMAND "${TS_BUILDDIR}/test/transform_coordinates_opencl"
+           #~ WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+
+
+  #~ add_test(NAME arrayfire_examples_gravity_sim_opencl
+           #~ COMMAND "${TS_BUILDDIR}/examples/graphics/gravity_sim_opencl"
+           #~ WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
diff --git a/examples/clBLAS/CMakeLists.txt b/examples/clBLAS/CMakeLists.txt
new file mode 100644
index 0000000..eda3c8d
--- /dev/null
+++ b/examples/clBLAS/CMakeLists.txt
@@ -0,0 +1,266 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "clBLAS")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+
+find_package(Boost 1.44)
+
+if (Boost_FOUND)
+
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    GIT_REPOSITORY "https://github.com/clMathLibraries/${TS_NAME}.git"
+    PATCH_COMMAND cd src && pwd && find samples/ -type f -name *.c | xargs sed -i "s/CL_DEVICE_TYPE_GPU/CL_DEVICE_TYPE_CPU/g"
+    UPDATE_COMMAND /bin/true
+    CONFIGURE_COMMAND ${CMAKE_COMMAND}
+      -DBUILD_RUNTIME=ON
+      -DBUILD_TEST=OFF
+      -DBUILD_PERFORMANCE=ON
+      -DBUILD_SAMPLE=ON
+      -DBUILD_CLIENT=ON
+      -DBUILD_KTEST=ON
+      -DBUILD_SHARED_LIBS=ON
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo
+      "-DCMAKE_C_FLAGS_RELWITHDEBINFO=-O2 -g -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+      "${TS_BASEDIR}/src/${TS_NAME}/src"
+    INSTALL_COMMAND /bin/true
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+  # All of these fail with the following Clang assertion:
+  # /home/LLVM_380_rwdi_A_rtti/include/clang/Frontend/CompilerInstance.h:414: clang::SourceManager& clang::CompilerInstance::getSourceManager() const: Assertion `SourceMgr && "Compiler instance has no source manager!"' failed.
+
+  add_test(NAME clBLAS_samples_example_chbmv
+           COMMAND "${TS_BUILDDIR}/samples/example_chbmv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_chemm
+           COMMAND "${TS_BUILDDIR}/samples/example_chemm"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_cher
+           COMMAND "${TS_BUILDDIR}/samples/example_cher"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_cher2k
+           COMMAND "${TS_BUILDDIR}/samples/example_cher2k"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_cherk
+           COMMAND "${TS_BUILDDIR}/samples/example_cherk"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_chpmv
+           COMMAND "${TS_BUILDDIR}/samples/example_chpmv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_chpr
+           COMMAND "${TS_BUILDDIR}/samples/example_chpr"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_csscal
+           COMMAND "${TS_BUILDDIR}/samples/example_csscal"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_dtrmv
+           COMMAND "${TS_BUILDDIR}/samples/example_dtrmv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_isamax
+           COMMAND "${TS_BUILDDIR}/samples/example_isamax"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sasum
+           COMMAND "${TS_BUILDDIR}/samples/example_sasum"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_saxpy
+           COMMAND "${TS_BUILDDIR}/samples/example_saxpy"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_scopy
+           COMMAND "${TS_BUILDDIR}/samples/example_scopy"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sdot
+           COMMAND "${TS_BUILDDIR}/samples/example_sdot"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sgbmv
+           COMMAND "${TS_BUILDDIR}/samples/example_sgbmv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sgemm
+           COMMAND "${TS_BUILDDIR}/samples/example_sgemm"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sgemv
+           COMMAND "${TS_BUILDDIR}/samples/example_sgemv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sger
+           COMMAND "${TS_BUILDDIR}/samples/example_sger"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_snrm2
+           COMMAND "${TS_BUILDDIR}/samples/example_snrm2"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_srot
+           COMMAND "${TS_BUILDDIR}/samples/example_srot"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_srotg
+           COMMAND "${TS_BUILDDIR}/samples/example_srotg"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_srotm
+           COMMAND "${TS_BUILDDIR}/samples/example_srotm"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_srotmg
+           COMMAND "${TS_BUILDDIR}/samples/example_srotmg"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_ssbmv
+           COMMAND "${TS_BUILDDIR}/samples/example_ssbmv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sscal
+           COMMAND "${TS_BUILDDIR}/samples/example_sscal"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sspmv
+           COMMAND "${TS_BUILDDIR}/samples/example_sspmv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sspr
+           COMMAND "${TS_BUILDDIR}/samples/example_sspr"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sspr2
+           COMMAND "${TS_BUILDDIR}/samples/example_sspr2"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_sswap
+           COMMAND "${TS_BUILDDIR}/samples/example_sswap"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_ssymm
+           COMMAND "${TS_BUILDDIR}/samples/example_ssymm"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_ssymv
+           COMMAND "${TS_BUILDDIR}/samples/example_ssymv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_ssyr
+           COMMAND "${TS_BUILDDIR}/samples/example_ssyr"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_ssyr2
+           COMMAND "${TS_BUILDDIR}/samples/example_ssyr2"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_ssyr2k
+           COMMAND "${TS_BUILDDIR}/samples/example_ssyr2k"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_ssyrk
+           COMMAND "${TS_BUILDDIR}/samples/example_ssyrk"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_stbmv
+           COMMAND "${TS_BUILDDIR}/samples/example_stbmv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_stbsv
+           COMMAND "${TS_BUILDDIR}/samples/example_stbsv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_stpmv
+           COMMAND "${TS_BUILDDIR}/samples/example_stpmv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_stpsv
+           COMMAND "${TS_BUILDDIR}/samples/example_stpsv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_strmm
+           COMMAND "${TS_BUILDDIR}/samples/example_strmm"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_strmv
+           COMMAND "${TS_BUILDDIR}/samples/example_strmv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_strsm
+           COMMAND "${TS_BUILDDIR}/samples/example_strsm"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_strsv
+           COMMAND "${TS_BUILDDIR}/samples/example_strsv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_zhemv
+           COMMAND "${TS_BUILDDIR}/samples/example_zhemv"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_zher2
+           COMMAND "${TS_BUILDDIR}/samples/example_zher2"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+  add_test(NAME clBLAS_samples_example_zhpr2
+           COMMAND "${TS_BUILDDIR}/samples/example_zhpr2"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/samples")
+
+  # TODO: the following are not yet run as tests:
+  #./staging/testPerfWrapper
+  #./staging/client
+
+
+  set_tests_properties(
+
+    clBLAS_samples_example_chbmv
+    clBLAS_samples_example_chemm
+    clBLAS_samples_example_cher
+    clBLAS_samples_example_cher2k
+    clBLAS_samples_example_cherk
+    clBLAS_samples_example_chpmv
+    clBLAS_samples_example_chpr
+    clBLAS_samples_example_csscal
+    clBLAS_samples_example_dtrmv
+    clBLAS_samples_example_isamax
+    clBLAS_samples_example_sasum
+    clBLAS_samples_example_saxpy
+    clBLAS_samples_example_scopy
+    clBLAS_samples_example_sdot
+    clBLAS_samples_example_sgbmv
+    clBLAS_samples_example_sgemm
+    clBLAS_samples_example_sgemv
+    clBLAS_samples_example_sger
+    clBLAS_samples_example_snrm2
+    clBLAS_samples_example_srot
+    clBLAS_samples_example_srotg
+    clBLAS_samples_example_srotm
+    clBLAS_samples_example_srotmg
+    clBLAS_samples_example_ssbmv
+    clBLAS_samples_example_sscal
+    clBLAS_samples_example_sspmv
+    clBLAS_samples_example_sspr
+    clBLAS_samples_example_sspr2
+    clBLAS_samples_example_sswap
+    clBLAS_samples_example_ssymm
+    clBLAS_samples_example_ssymv
+    clBLAS_samples_example_ssyr
+    clBLAS_samples_example_ssyr2
+    clBLAS_samples_example_ssyr2k
+    clBLAS_samples_example_ssyrk
+    clBLAS_samples_example_stbmv
+    clBLAS_samples_example_stbsv
+    clBLAS_samples_example_stpmv
+    clBLAS_samples_example_stpsv
+    clBLAS_samples_example_strmm
+    clBLAS_samples_example_strmv
+    clBLAS_samples_example_strsm
+    clBLAS_samples_example_strsv
+    clBLAS_samples_example_zhemv
+    clBLAS_samples_example_zher2
+    clBLAS_samples_example_zhpr2
+
+    PROPERTIES
+      LABELS "${TS_NAME}"
+      ENVIRONMENT "LD_LIBRARY_PATH=${TS_BUILDDIR}/library")
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required Boost version not found" )
+
+endif()
diff --git a/examples/clFFT/CMakeLists.txt b/examples/clFFT/CMakeLists.txt
new file mode 100644
index 0000000..5dd6b74
--- /dev/null
+++ b/examples/clFFT/CMakeLists.txt
@@ -0,0 +1,87 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "clFFT")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+
+find_package(Boost 1.44)
+
+if (Boost_FOUND)
+
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    GIT_REPOSITORY "https://github.com/clMathLibraries/${TS_NAME}.git"
+    #PATCH_COMMAND  pwd && sed -i "s/CL_DEVICE_TYPE_GPU/CL_DEVICE_TYPE_CPU/g" *.cpp
+    UPDATE_COMMAND /bin/true
+    CONFIGURE_COMMAND ${CMAKE_COMMAND}
+      -DBUILD_RUNTIME=ON
+      -DBUILD_CLIENT=ON
+      -DBUILD_TEST=OFF
+      -DBUILD_SHARED_LIBS=ON
+      -DBUILD_EXAMPLES=ON
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo
+      "-DCMAKE_C_FLAGS_RELWITHDEBINFO=-O2 -g -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+      "-DCMAKE_CXX_FLAGS_RELWITHDEBINFO=-O2 -g -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
+      "${TS_BASEDIR}/src/${TS_NAME}/src"
+    INSTALL_COMMAND /bin/true
+  )
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
+
+  add_test(NAME clFFT_samples_fft1d
+           COMMAND "${TS_BUILDDIR}/examples/examples/fft1d"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/examples")
+  add_test(NAME clFFT_samples_fft3d
+           COMMAND "${TS_BUILDDIR}/examples/examples/fft3d"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/examples")
+  add_test(NAME clFFT_samples_fft2d
+           COMMAND "${TS_BUILDDIR}/examples/examples/fft2d"
+           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/examples")
+
+  # TODO: the following is not yet run as a test:
+  #./staging/clFFT-client-2.5.0
+
+
+  set_tests_properties(
+    clFFT_samples_fft1d
+    clFFT_samples_fft2d
+    clFFT_samples_fft3d
+    PROPERTIES
+      LABELS "${TS_NAME}")
+
+else()
+
+  message(STATUS "Disabling testsuite ${TS_NAME}, required Boost version not found" )
+
+endif()
diff --git a/examples/example1-spir32/CMakeLists.txt b/examples/example1-spir32/CMakeLists.txt
index a191826..60684d7 100644
--- a/examples/example1-spir32/CMakeLists.txt
+++ b/examples/example1-spir32/CMakeLists.txt
@@ -52,7 +52,7 @@ set_tests_properties( "spec_tests/example1_dot_product_spir32"
 [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] [.] [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] = 36[.]000000
 OK"
     PROCESSORS 1
-    LABELS "OpenCL_Spec"
+    LABELS "internal;spir"
     DEPENDS "pocl_version_check")
 
 endif()
diff --git a/examples/example1-spir32/Makefile.in b/examples/example1-spir32/Makefile.in
index b24b203..bc77c3c 100644
--- a/examples/example1-spir32/Makefile.in
+++ b/examples/example1-spir32/Makefile.in
@@ -247,6 +247,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -254,6 +255,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -271,8 +273,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -348,6 +348,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/example1-spir32/example1.c b/examples/example1-spir32/example1.c
index 5dcd372..cb9425d 100644
--- a/examples/example1-spir32/example1.c
+++ b/examples/example1-spir32/example1.c
@@ -48,7 +48,6 @@ main (void)
   int source_size;
   cl_float4 *srcA, *srcB;
   cl_float *dst;
-  int ierr;
   int i;
 
   source_file = fopen("example1.spir", "r");
@@ -84,8 +83,11 @@ main (void)
       srcB[i].s[3] = (cl_float)i;
     }
 
-  ierr = exec_dot_product_kernel (source, source_size, N, srcA, srcB, dst);
-  if (ierr) printf ("ERROR\n");
+  if (exec_dot_product_kernel (source, source_size, N, srcA, srcB, dst))
+    {
+      printf ("Error running the tests\n");
+      return -1;
+    }
 
   for (i = 0; i < 4; ++i)
     {
diff --git a/examples/example1-spir32/generate_spir32.sh b/examples/example1-spir32/generate_spir32.sh
new file mode 100755
index 0000000..1c7dd20
--- /dev/null
+++ b/examples/example1-spir32/generate_spir32.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Regenerates example1.spir from example1.cl. Prerequisites:
+# 1) Install the official SPIR generator version of Clang/LLVM:
+#    https://github.com/KhronosGroup/SPIR
+#
+# 2) Download opencl_spir.h from
+#    https://raw.github.com/KhronosGroup/SPIR-Tools/master/headers/opencl_spir.h
+#    and add "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" at the beginning
+#    of it so that it compiles.
+clang -cc1 -emit-llvm-bc -triple spir-unknown-unknown -include opencl_spir.h -o example1.spir example1.cl
diff --git a/examples/example1-spir64/CMakeLists.txt b/examples/example1-spir64/CMakeLists.txt
index becd6f0..27ddc5c 100644
--- a/examples/example1-spir64/CMakeLists.txt
+++ b/examples/example1-spir64/CMakeLists.txt
@@ -51,7 +51,7 @@ set_tests_properties( "spec_tests/example1_dot_product_spir64"
 [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] [.] [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] = 36[.]000000
 OK"
     PROCESSORS 1
-    LABELS "OpenCL_Spec"
+    LABELS "internal;spir"
     DEPENDS "pocl_version_check")
 
 endif()
diff --git a/examples/example1-spir64/Makefile.in b/examples/example1-spir64/Makefile.in
index 0cf501e..020bb6f 100644
--- a/examples/example1-spir64/Makefile.in
+++ b/examples/example1-spir64/Makefile.in
@@ -246,6 +246,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -253,6 +254,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -270,8 +272,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -347,6 +347,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/example1-spir64/example1.c b/examples/example1-spir64/example1.c
index 5dcd372..cb9425d 100644
--- a/examples/example1-spir64/example1.c
+++ b/examples/example1-spir64/example1.c
@@ -48,7 +48,6 @@ main (void)
   int source_size;
   cl_float4 *srcA, *srcB;
   cl_float *dst;
-  int ierr;
   int i;
 
   source_file = fopen("example1.spir", "r");
@@ -84,8 +83,11 @@ main (void)
       srcB[i].s[3] = (cl_float)i;
     }
 
-  ierr = exec_dot_product_kernel (source, source_size, N, srcA, srcB, dst);
-  if (ierr) printf ("ERROR\n");
+  if (exec_dot_product_kernel (source, source_size, N, srcA, srcB, dst))
+    {
+      printf ("Error running the tests\n");
+      return -1;
+    }
 
   for (i = 0; i < 4; ++i)
     {
diff --git a/examples/example1-spir64/generate_spir.sh b/examples/example1-spir64/generate_spir.sh
new file mode 100755
index 0000000..28da381
--- /dev/null
+++ b/examples/example1-spir64/generate_spir.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Regenerates example1.spir from example1.cl. Prerequisites:
+# 1) Install the official SPIR generator version of Clang/LLVM:
+#    https://github.com/KhronosGroup/SPIR
+#
+# 2) Download opencl_spir.h from
+#    https://raw.github.com/KhronosGroup/SPIR-Tools/master/headers/opencl_spir.h
+#    and add "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" at the beginning
+#    of it so that it compiles.
+clang -cc1 -emit-llvm-bc -triple spir64-unknown-unknown -include opencl_spir.h -o example1.spir example1.cl
diff --git a/examples/example1/CMakeLists.txt b/examples/example1/CMakeLists.txt
index 1de70ef..6898126 100644
--- a/examples/example1/CMakeLists.txt
+++ b/examples/example1/CMakeLists.txt
@@ -38,9 +38,9 @@ add_executable("example1" example1.c example1_exec.c example1.cl)
 # example1_LDADD = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
 target_link_libraries("example1" ${POCLU_LINK_OPTIONS})
 
-add_test("spec_tests/example1_dot_product" "example1")
+add_test("examples/example1_dot_product" "example1")
 
-set_tests_properties( "spec_tests/example1_dot_product"
+set_tests_properties( "examples/example1_dot_product"
   PROPERTIES
     COST 40.0
     PASS_REGULAR_EXPRESSION "[(]0[.]000000, 0[.]000000, 0[.]000000, 0[.]000000[)] [.] [(]0[.]000000, 0[.]000000, 0[.]000000, 0[.]000000[)] = 0[.]000000
@@ -49,5 +49,5 @@ set_tests_properties( "spec_tests/example1_dot_product"
 [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] [.] [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] = 36[.]000000
 OK"
     PROCESSORS 1
-    LABELS "OpenCL_Spec"
+    LABELS "internal;hsa;tce"
     DEPENDS "pocl_version_check")
diff --git a/examples/example1/Makefile.in b/examples/example1/Makefile.in
index 5ab44b4..2ea4910 100644
--- a/examples/example1/Makefile.in
+++ b/examples/example1/Makefile.in
@@ -246,6 +246,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -253,6 +254,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -270,8 +272,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -347,6 +347,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/example2/CMakeLists.txt b/examples/example2/CMakeLists.txt
index 19c55d7..dbc65aa 100644
--- a/examples/example2/CMakeLists.txt
+++ b/examples/example2/CMakeLists.txt
@@ -39,12 +39,12 @@ add_executable("example2" example2.c example2.cl)
 # example1_LDADD = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
 target_link_libraries("example2" ${POCLU_LINK_OPTIONS})
 
-add_test("spec_tests/example2_matrix_transpose" "example2")
+add_test("examples/example2_matrix_transpose" "example2")
 
-set_tests_properties( "spec_tests/example2_matrix_transpose"
+set_tests_properties( "examples/example2_matrix_transpose"
   PROPERTIES
     COST 3.0
     PASS_REGULAR_EXPRESSION "OK\n"
     PROCESSORS 1
-    LABELS "OpenCL_Spec"
+    LABELS "internal;hsa;tce"
     DEPENDS "pocl_version_check")
diff --git a/examples/example2/Makefile.in b/examples/example2/Makefile.in
index 50c33c9..f5bc690 100644
--- a/examples/example2/Makefile.in
+++ b/examples/example2/Makefile.in
@@ -245,6 +245,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -252,6 +253,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -269,8 +271,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -346,6 +346,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/example2a/CMakeLists.txt b/examples/example2a/CMakeLists.txt
index dd3e8b2..3b0b7b4 100644
--- a/examples/example2a/CMakeLists.txt
+++ b/examples/example2a/CMakeLists.txt
@@ -39,12 +39,12 @@ add_executable("example2a" example2a.c example2a.cl)
 # example1_LDADD = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
 target_link_libraries("example2a" ${POCLU_LINK_OPTIONS})
 
-add_test("spec_tests/example2_matrix_transpose_alocals" "example2a")
+add_test("examples/example2_matrix_transpose_alocals" "example2a")
 
-set_tests_properties( "spec_tests/example2_matrix_transpose_alocals"
+set_tests_properties( "examples/example2_matrix_transpose_alocals"
   PROPERTIES
     COST 3.0
     PASS_REGULAR_EXPRESSION "OK\n"
     PROCESSORS 1
-    LABELS "OpenCL_Spec"
+    LABELS "internal;hsa;tce"
     DEPENDS "pocl_version_check")
diff --git a/examples/example2a/Makefile.in b/examples/example2a/Makefile.in
index b84e0c1..340a6bb 100644
--- a/examples/example2a/Makefile.in
+++ b/examples/example2a/Makefile.in
@@ -245,6 +245,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -252,6 +253,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -269,8 +271,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -346,6 +346,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/opencl-book-samples/CMakeLists.txt b/examples/opencl-book-samples/CMakeLists.txt
new file mode 100644
index 0000000..b4b7583
--- /dev/null
+++ b/examples/opencl-book-samples/CMakeLists.txt
@@ -0,0 +1,105 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "opencl-book-samples")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TS_BASEDIR}/src/${TS_NAME}")
+
+message(STATUS "Enabling testsuite ${TS_NAME}")
+set(ENABLED_TESTSUITES "${ENABLED_TESTSUITES};${TS_NAME}" PARENT_SCOPE)
+
+ExternalProject_Add(
+  ${TS_NAME}
+  PREFIX "${TS_BASEDIR}"
+  # The patch step below edits the checked-out samples in place with sed and removes src/Chapter_12/Sinewave/CMakeLists.txt.
+  SVN_REPOSITORY "http://opencl-book-samples.googlecode.com/svn/trunk"
+  PATCH_COMMAND pwd && echo Patching && sed -i "s/bool doCPU = false/bool doCPU = true/g" src/Chapter_16/Dijkstra/oclDijkstra.cpp &&
+  sed -i "s/size_t localWorkSize = maxWorkGroupSize/size_t localWorkSize = 2/g" src/Chapter_16/Dijkstra/oclDijkstraKernel.cpp &&
+  sed -i "s/device.j..type == CL_DEVICE_TYPE_GPU/device[j].type \\& CL_DEVICE_TYPE_GPU/g" src/Chapter_22/spmv.c &&
+  sed -i "s/context.CL_DEVICE_TYPE_GPU/context\(CL_DEVICE_TYPE_CPU/g" src/Chapter_12/VectorAdd/vecadd.cpp &&
+  rm -f src/Chapter_12/Sinewave/CMakeLists.txt
+
+  # update restores src/Chapter_12/Sinewave/CMakeLists.txt, so disable it (an empty command disables the step without relying on /bin/true)
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_BUILD_TYPE=RelWithDebInfo
+  "-DCMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-g -O3 -DCL_USE_DEPRECATED_OPENCL_1_1_APIS -DCL_USE_DEPRECATED_OPENCL_1_2_APIS"
+  "-DCMAKE_EXE_LINKER_FLAGS=-lGL -lglut -pthread"
+  # No install step: the tests registered below run the binaries straight from ${TS_BUILDDIR}.
+  INSTALL_COMMAND ""
+)
+
+set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+add_dependencies(prepare_examples ${TS_NAME})
+
+
+add_test(NAME "opencl_book_samples_HelloWorld"
+         COMMAND "${TS_BUILDDIR}/src/Chapter_2/HelloWorld/HelloWorld"
+         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_2/HelloWorld")
+add_test(NAME "opencl_book_samples_OpenCLInfo"
+         COMMAND "${TS_BUILDDIR}/src/Chapter_3/OpenCLInfo/OpenCLInfo"
+         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_3/OpenCLInfo")
+add_test(NAME "opencl_book_samples_OpenCLConvolutionChap3"
+         COMMAND "${TS_BUILDDIR}/src/Chapter_3/OpenCLConvolution/OpenCLConvolutionChap3"
+         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_3/OpenCLConvolution")
+add_test(NAME "opencl_book_samples_HelloBinaryWorld"
+         COMMAND "${TS_BUILDDIR}/src/Chapter_6/HelloBinaryWorld/HelloBinaryWorld"
+         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_6/HelloBinaryWorld")
+add_test(NAME "opencl_book_samples_SimpleBufferSubBuffer"
+         COMMAND "${TS_BUILDDIR}/src/Chapter_7/SimpleBufferSubBuffer/SimpleBufferSubBuffer"
+         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_7/SimpleBufferSubBuffer")
+# disabled: ImageFilter2D requires input images
+#add_test(NAME "opencl_book_samples_ImageFilter2D"
+#         COMMAND "${TS_BUILDDIR}/src/Chapter_8/ImageFilter2D/ImageFilter2D"
+#         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_8/ImageFilter2D")
+add_test(NAME "opencl_book_samples_vecadd"
+         COMMAND "${TS_BUILDDIR}/src/Chapter_12/VectorAdd/vecadd"
+         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_12/VectorAdd")
+#add_test(NAME "opencl_book_samples_histogram"
+#         COMMAND "${TS_BUILDDIR}/src/Chapter_14/histogram/histogram"
+#         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_14/histogram")
+add_test(NAME "opencl_book_samples_Dijkstra"
+         COMMAND "${TS_BUILDDIR}/src/Chapter_16/Dijkstra/Dijkstra" --cpu
+         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_16/Dijkstra")
+# disabled: the spmv sample doesn't work
+#add_test(NAME "opencl_book_samples_spmv"
+#         COMMAND "${TS_BUILDDIR}/src/Chapter_22/spmv"
+#         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_22")
+
+
+
+set_tests_properties(
+  opencl_book_samples_HelloWorld
+  opencl_book_samples_OpenCLInfo
+  opencl_book_samples_OpenCLConvolutionChap3
+  opencl_book_samples_HelloBinaryWorld
+  opencl_book_samples_SimpleBufferSubBuffer
+#  opencl_book_samples_ImageFilter2D
+  opencl_book_samples_vecadd
+#  opencl_book_samples_histogram
+  opencl_book_samples_Dijkstra
+#  opencl_book_samples_spmv
+  PROPERTIES
+    LABELS "opencl-book-samples")
diff --git a/examples/opencl-book-samples/Makefile.in b/examples/opencl-book-samples/Makefile.in
index b1eafb8..af4d8c4 100644
--- a/examples/opencl-book-samples/Makefile.in
+++ b/examples/opencl-book-samples/Makefile.in
@@ -192,6 +192,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -199,6 +200,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -216,8 +218,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -293,6 +293,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/piglit/CMakeLists.txt b/examples/piglit/CMakeLists.txt
new file mode 100644
index 0000000..9f73f77
--- /dev/null
+++ b/examples/piglit/CMakeLists.txt
@@ -0,0 +1,220 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2015 pocl developers
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "piglit")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+
+message(STATUS "Enabling testsuite ${TS_NAME}")
+list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+#-- Found PythonInterp: /usr/bin/python2.7 (found suitable version "2.7.11", minimum required is "2.7")
+#-- Found PythonNumpy: success (found suitable version "1.10.2", minimum required is "1.6.2")
+#-- Found PythonMako: success (found suitable version "1.0.3", minimum required is "0.8.0")
+#-- Found PythonSix: success (found suitable version "1.10.0", minimum required is "1.4.0")
+
+ExternalProject_Add(
+  ${TS_NAME}
+  PREFIX "${TS_BASEDIR}"
+  GIT_REPOSITORY "git://anongit.freedesktop.org/piglit"  # NOTE(review): no GIT_TAG given, so the checkout is not pinned to a revision
+  CMAKE_ARGS
+    -DCMAKE_BUILD_TYPE=RelWithDebInfo
+    "-DCMAKE_CXX_FLAGS_RELWITHDEBINFO=-O2 -g -DCL_USE_DEPRECATED_OPENCL_1_2_APIS"
+    -DPIGLIT_BUILD_CL_TESTS=ON  # only the OpenCL tests are built; every GL/GLES/GLX/DMA-BUF option below is switched off
+    -DPIGLIT_BUILD_DMA_BUF_TESTS:BOOL=OFF
+    -DPIGLIT_BUILD_GL_TESTS=OFF
+    -DPIGLIT_BUILD_GLES1_TESTS=OFF
+    -DPIGLIT_BUILD_GLES2_TESTS=OFF
+    -DPIGLIT_BUILD_GLES3_TESTS=OFF
+    -DPIGLIT_USE_WAFFLE=OFF
+    -DPIGLIT_BUILD_GLX_TESTS=OFF
+    "-DOPENCL_INCLUDE_PATH=${CMAKE_SOURCE_DIR}/include/"
+    "-DOPENCL_opencl_LIBRARY:LIST=OpenCL"
+  INSTALL_COMMAND ""  # empty command disables the install step portably (no /bin/true); tests run from ${TS_BUILDDIR}/bin
+)
+
+set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+add_dependencies(prepare_examples ${TS_NAME})
+
+add_test(NAME piglit_cl_api_build_program
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-build-program")
+add_test(NAME piglit_cl_api_compile_program
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-compile-program")
+add_test(NAME piglit_cl_api_create_buffer
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-buffer")
+add_test(NAME piglit_cl_api_create_command_queue
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-command-queue")
+add_test(NAME piglit_cl_api_create_context
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-context")
+add_test(NAME piglit_cl_api_create_context_from_type
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-context-from-type")
+add_test(NAME piglit_cl_api_create_image
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-image")
+add_test(NAME piglit_cl_api_create_kernel
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-kernel")
+add_test(NAME piglit_cl_api_create_kernels_in_program
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-kernels-in-program")
+add_test(NAME piglit_cl_api_create_program_with_binary
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-program-with-binary")
+add_test(NAME piglit_cl_api_create_program_with_source
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-program-with-source")
+add_test(NAME piglit_cl_api_create_sampler
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-create-sampler")
+add_test(NAME piglit_cl_api_enqueue_copy_buffer
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-enqueue-copy-buffer")
+add_test(NAME piglit_cl_api_enqueue_copy_buffer_rect
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-enqueue-copy-buffer-rect")
+add_test(NAME piglit_cl_api_enqueue_fill_buffer
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-enqueue-fill-buffer")
+add_test(NAME piglit_cl_api_enqueue_fill_image
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-enqueue-fill-image")
+add_test(NAME piglit_cl_api_enqueue_map_buffer
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-enqueue-map-buffer")
+add_test(NAME piglit_cl_api_enqueue_migrate_mem_objects
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-enqueue-migrate-mem-objects")
+add_test(NAME piglit_cl_api_enqueue_read_write_buffer
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-enqueue-read_write-buffer")
+add_test(NAME piglit_cl_api_get_command_queue_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-command-queue-info")
+add_test(NAME piglit_cl_api_get_context_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-context-info")
+add_test(NAME piglit_cl_api_get_device_ids
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-device-ids")
+add_test(NAME piglit_cl_api_get_device_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-device-info")
+add_test(NAME piglit_cl_api_get_event_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-event-info")
+add_test(NAME piglit_cl_api_get_image_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-image-info")
+add_test(NAME piglit_cl_api_get_kernel_arg_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-kernel-arg-info")
+add_test(NAME piglit_cl_api_get_kernel_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-kernel-info")
+add_test(NAME piglit_cl_api_get_kernel_work_group_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-kernel-work-group-info")
+add_test(NAME piglit_cl_api_get_mem_object_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-mem-object-info")
+add_test(NAME piglit_cl_api_get_platform_ids
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-platform-ids")
+add_test(NAME piglit_cl_api_get_platform_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-platform-info")
+add_test(NAME piglit_cl_api_get_program_build_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-program-build-info")
+add_test(NAME piglit_cl_api_get_program_info
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-get-program-info")
+add_test(NAME piglit_cl_api_link_program
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-link-program")
+add_test(NAME piglit_cl_api_retain_release_command_queue
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-command-queue")
+add_test(NAME piglit_cl_api_retain_release_context
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-context")
+add_test(NAME piglit_cl_api_retain_release_event
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-event")
+add_test(NAME piglit_cl_api_retain_release_kernel
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-kernel")
+add_test(NAME piglit_cl_api_retain_release_mem_object
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-mem-object")
+add_test(NAME piglit_cl_api_retain_release_program
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-program")
+add_test(NAME piglit_cl_api_set_kernel_arg
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-set-kernel-arg")
+add_test(NAME piglit_cl_api_unload_compiler
+         COMMAND "${TS_BUILDDIR}/bin/cl-api-unload-compiler")
+add_test(NAME piglit_cl_custom_buffer_flags
+         COMMAND "${TS_BUILDDIR}/bin/cl-custom-buffer-flags")
+add_test(NAME piglit_cl_custom_flush_after_enqueue_kernel
+         COMMAND "${TS_BUILDDIR}/bin/cl-custom-flush-after-enqueue-kernel")
+add_test(NAME piglit_cl_custom_r600_create_release_buffer_bug
+         COMMAND "${TS_BUILDDIR}/bin/cl-custom-r600-create-release-buffer-bug")
+add_test(NAME piglit_cl_custom_run_simple_kernel
+         COMMAND "${TS_BUILDDIR}/bin/cl-custom-run-simple-kernel")
+add_test(NAME piglit_cl_custom_use_sub_buffer_in_kernel
+         COMMAND "${TS_BUILDDIR}/bin/cl-custom-use-sub-buffer-in-kernel")
+add_test(NAME piglit_cl_program_bitcoin_phatk
+         COMMAND "${TS_BUILDDIR}/bin/cl-program-bitcoin-phatk")
+add_test(NAME piglit_cl_program_max_work_item_sizes
+         COMMAND "${TS_BUILDDIR}/bin/cl-program-max-work-item-sizes")
+add_test(NAME piglit_cl_program_tester
+         COMMAND "${TS_BUILDDIR}/bin/cl-program-tester")
+
+
+
+
+set_tests_properties(
+  piglit_cl_api_build_program
+  piglit_cl_api_compile_program
+  piglit_cl_api_create_buffer
+  piglit_cl_api_create_command_queue
+  piglit_cl_api_create_context
+  piglit_cl_api_create_context_from_type
+  piglit_cl_api_create_image
+  piglit_cl_api_create_kernel
+  piglit_cl_api_create_kernels_in_program
+  piglit_cl_api_create_program_with_binary
+  piglit_cl_api_create_program_with_source
+  piglit_cl_api_create_sampler
+  piglit_cl_api_enqueue_copy_buffer
+  piglit_cl_api_enqueue_copy_buffer_rect
+  piglit_cl_api_enqueue_fill_buffer
+  piglit_cl_api_enqueue_fill_image
+  piglit_cl_api_enqueue_map_buffer
+  piglit_cl_api_enqueue_migrate_mem_objects
+  piglit_cl_api_enqueue_read_write_buffer
+  piglit_cl_api_get_command_queue_info
+  piglit_cl_api_get_context_info
+  piglit_cl_api_get_device_ids
+  piglit_cl_api_get_device_info
+  piglit_cl_api_get_event_info
+  piglit_cl_api_get_image_info
+  piglit_cl_api_get_kernel_arg_info
+  piglit_cl_api_get_kernel_info
+  piglit_cl_api_get_kernel_work_group_info
+  piglit_cl_api_get_mem_object_info
+  piglit_cl_api_get_platform_ids
+  piglit_cl_api_get_platform_info
+  piglit_cl_api_get_program_build_info
+  piglit_cl_api_get_program_info
+  piglit_cl_api_link_program
+  piglit_cl_api_retain_release_command_queue
+  piglit_cl_api_retain_release_context
+  piglit_cl_api_retain_release_event
+  piglit_cl_api_retain_release_kernel
+  piglit_cl_api_retain_release_mem_object
+  piglit_cl_api_retain_release_program
+  piglit_cl_api_set_kernel_arg
+  piglit_cl_api_unload_compiler
+  piglit_cl_custom_buffer_flags
+  piglit_cl_custom_flush_after_enqueue_kernel
+  piglit_cl_custom_r600_create_release_buffer_bug
+  piglit_cl_custom_run_simple_kernel
+  piglit_cl_custom_use_sub_buffer_in_kernel
+  piglit_cl_program_bitcoin_phatk
+  piglit_cl_program_max_work_item_sizes
+  piglit_cl_program_tester
+
+  PROPERTIES
+    LABELS "piglit")
diff --git a/examples/piglit/Makefile.in b/examples/piglit/Makefile.in
index 5364c6e..af8ef74 100644
--- a/examples/piglit/Makefile.in
+++ b/examples/piglit/Makefile.in
@@ -192,6 +192,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -199,6 +200,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -216,8 +218,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -293,6 +293,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/pocl-android-sample/.cproject b/examples/pocl-android-sample/.cproject
new file mode 100644
index 0000000..3d21f95
--- /dev/null
+++ b/examples/pocl-android-sample/.cproject
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+	<storageModule moduleId="org.eclipse.cdt.core.settings">
+		<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.992571829">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.992571829" moduleId="org.eclipse.cdt.core.settings" name="Debug">
+				<externalSettings/>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.MakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.debug.992571829" name="Debug" parent="cdt.managedbuild.config.gnu.exe.debug">
+					<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.992571829." name="/" resourcePath="">
+						<toolChain id="com.android.toolchain.gcc.1441634911" name="Android GCC" superClass="com.android.toolchain.gcc">
+							<targetPlatform binaryParser="org.eclipse.cdt.core.ELF" id="com.android.targetPlatform.1141118736" isAbstract="false" superClass="com.android.targetPlatform"/>
+							<builder id="com.android.builder.1583783833" managedBuildOn="false" name="Android Builder.Debug" superClass="com.android.builder"/>
+							<tool id="com.android.gcc.compiler.1926787798" name="Android GCC Compiler" superClass="com.android.gcc.compiler"/>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+		<cconfiguration id="cdt.managedbuild.config.gnu.exe.release.1227773580">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.release.1227773580" moduleId="org.eclipse.cdt.core.settings" name="Release">
+				<externalSettings/>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.MakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="cdt.managedbuild.config.gnu.exe.release.1227773580" name="Release" parent="cdt.managedbuild.config.gnu.exe.release">
+					<folderInfo id="cdt.managedbuild.config.gnu.exe.release.1227773580." name="/" resourcePath="">
+						<toolChain id="com.android.toolchain.gcc.1224622279" name="Android GCC" superClass="com.android.toolchain.gcc">
+							<targetPlatform binaryParser="org.eclipse.cdt.core.ELF" id="com.android.targetPlatform.195213851" isAbstract="false" superClass="com.android.targetPlatform"/>
+							<builder id="com.android.builder.113622563" managedBuildOn="false" name="Android Builder.Release" superClass="com.android.builder"/>
+							<tool id="com.android.gcc.compiler.136863320" name="Android GCC Compiler" superClass="com.android.gcc.compiler"/>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+	</storageModule>
+	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+		<project id="pocl-android-sample.cdt.managedbuild.target.gnu.exe.96854699" name="Executable" projectType="cdt.managedbuild.target.gnu.exe"/>
+	</storageModule>
+	<storageModule moduleId="scannerConfiguration">
+		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.992571829;cdt.managedbuild.config.gnu.exe.debug.992571829.;cdt.managedbuild.tool.gnu.c.compiler.exe.debug.1536310473;cdt.managedbuild.tool.gnu.c.compiler.input.819161304">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.debug.992571829;cdt.managedbuild.config.gnu.exe.debug.992571829.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1913731540;cdt.managedbuild.tool.gnu.cpp.compiler.input.282903054">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1227773580;cdt.managedbuild.config.gnu.exe.release.1227773580.;cdt.managedbuild.tool.gnu.c.compiler.exe.release.1214078834;cdt.managedbuild.tool.gnu.c.compiler.input.85325774">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="cdt.managedbuild.config.gnu.exe.release.1227773580;cdt.managedbuild.config.gnu.exe.release.1227773580.;cdt.managedbuild.tool.gnu.cpp.compiler.exe.release.1092673189;cdt.managedbuild.tool.gnu.cpp.compiler.input.871413014">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/pocl-android-sample"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/pocl-android-sample"/>
+		</configuration>
+	</storageModule>
+</cproject>
diff --git a/examples/pocl-android-sample/.project b/examples/pocl-android-sample/.project
new file mode 100644
index 0000000..c7c9cb0
--- /dev/null
+++ b/examples/pocl-android-sample/.project
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>pocl-android-sample</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>com.android.ide.eclipse.adt.ResourceManagerBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>com.android.ide.eclipse.adt.PreCompilerBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>com.android.ide.eclipse.adt.ApkBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>com.android.ide.eclipse.adt.AndroidNature</nature>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+	</natures>
+</projectDescription>
diff --git a/examples/pocl-android-sample/AndroidManifest.xml b/examples/pocl-android-sample/AndroidManifest.xml
new file mode 100644
index 0000000..a9a712c
--- /dev/null
+++ b/examples/pocl-android-sample/AndroidManifest.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="org.pocl.sample1"
+    android:versionCode="3"
+    android:versionName="3" >
+    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
+
+    <uses-sdk
+        android:minSdkVersion="17"
+        android:targetSdkVersion="21" />
+
+    <application
+        android:allowBackup="true"
+        android:icon="@drawable/ic_launcher"
+        android:label="@string/app_name"
+        android:theme="@style/AppTheme" >
+        <activity
+            android:name="org.pocl.sample1.MainActivity"
+            android:label="@string/app_name" >
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+</manifest>
diff --git a/examples/pocl-android-sample/jni/Android.mk b/examples/pocl-android-sample/jni/Android.mk
new file mode 100644
index 0000000..13de7d4
--- /dev/null
+++ b/examples/pocl-android-sample/jni/Android.mk
@@ -0,0 +1,12 @@
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE := poclVecAdd
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/libopencl-stub/include/
+LOCAL_SRC_FILES := vectorAdd.cpp
+LOCAL_CFLAGS   = -fPIC -O2
+LOCAL_STATIC_LIBRARIES := OpenCL
+LOCAL_LDLIBS := -ldl -llog
+include $(BUILD_SHARED_LIBRARY)
+
+include $(LOCAL_PATH)/libopencl-stub/Android.mk
diff --git a/examples/pocl-android-sample/jni/Application.mk b/examples/pocl-android-sample/jni/Application.mk
new file mode 100644
index 0000000..4fc6ba5
--- /dev/null
+++ b/examples/pocl-android-sample/jni/Application.mk
@@ -0,0 +1,2 @@
+APP_STL := gnustl_static
+APP_ABI := armeabi-v7a
diff --git a/examples/pocl-android-sample/jni/CLONE_LIBOPENCL_STUB_HERE b/examples/pocl-android-sample/jni/CLONE_LIBOPENCL_STUB_HERE
new file mode 100644
index 0000000..e7060f8
--- /dev/null
+++ b/examples/pocl-android-sample/jni/CLONE_LIBOPENCL_STUB_HERE
@@ -0,0 +1 @@
+clone libopencl-stub from https://github.com/krrishnarraj/libopencl-stub
diff --git a/examples/pocl-android-sample/jni/vectorAdd.cpp b/examples/pocl-android-sample/jni/vectorAdd.cpp
new file mode 100644
index 0000000..d33c69f
--- /dev/null
+++ b/examples/pocl-android-sample/jni/vectorAdd.cpp
@@ -0,0 +1,231 @@
+#include <CL/cl.h>
+#include <string.h>
+#include <stdlib.h>
+#include <android/log.h>
+#include "vectorAdd.h"
+
+#define LOCAL_SIZE  64
+
+/* Log an OpenCL failure (message, location, status code) to logcat. */
+#define LOG_CL_ERROR(ret, msg)                                              \
+    __android_log_print(ANDROID_LOG_ERROR, "opencl vector add",             \
+                "ERROR: %s at line %d in %s returned with %d\n",            \
+                    msg, __LINE__, __FILE__, ret)
+
+/* Log `ret` and return it from the enclosing function unless it is
+ * CL_SUCCESS.  The do/while(0) makes it one statement, safe after if/else. */
+#define CHECK_AND_RETURN(ret, msg)                                          \
+    do {                                                                    \
+        if((ret) != CL_SUCCESS) {                                           \
+            LOG_CL_ERROR(ret, msg);                                         \
+            return ret;                                                     \
+        }                                                                   \
+    } while(0)
+
+/* Same check, but jump to `label` so acquired resources can be released. */
+#define CHECK_AND_GOTO(ret, msg, label)                                     \
+    do {                                                                    \
+        if((ret) != CL_SUCCESS) {                                           \
+            LOG_CL_ERROR(ret, msg);                                         \
+            goto label;                                                     \
+        }                                                                   \
+    } while(0)
+
+/* OpenCL C source of the kernel; work-items with id >= N do nothing. */
+static const char *vector_add_str="											\
+		__kernel void vec_add(int N, __global float *A,						\
+								__global float *B, __global float *C)		\
+		{																	\
+			int id = get_global_id(0);										\
+																			\
+			if(id < N) {													\
+				C[id] = A[id] + B[id];										\
+			}																\
+		}																	\
+		";
+
+/* OpenCL objects shared by the JNI entry points; NULL while not created. */
+static cl_context clContext = NULL;
+static cl_command_queue clCommandQueue = NULL;
+static cl_program clProgram = NULL;
+static cl_kernel clKernel = NULL;
+
+/* Release every OpenCL object currently held and reset the handles, so a
+ * second call (or a re-initialisation) can never release an object twice. */
+static void releaseCLObjects(void)
+{
+    if(clKernel)        clReleaseKernel(clKernel);
+    if(clProgram)       clReleaseProgram(clProgram);
+    if(clCommandQueue)  clReleaseCommandQueue(clCommandQueue);
+    if(clContext)       clReleaseContext(clContext);
+
+    clKernel = NULL;
+    clProgram = NULL;
+    clCommandQueue = NULL;
+    clContext = NULL;
+}
+
+/* Create the context, queue, program and kernel on the default device of the
+ * first platform.  Returns 0 on success, otherwise the failing OpenCL status
+ * (also logged).  Objects created before a failure stay in the globals and
+ * are freed by destroyCL() or by the next initCL(). */
+jint Java_org_pocl_sample1_MainActivity_initCL(JNIEnv *je, jobject jo)
+{
+    cl_platform_id clPlatform;
+    cl_device_id clDevice;
+    cl_int status;
+
+    /* Drop objects left over from an earlier call instead of leaking them. */
+    releaseCLObjects();
+
+    status = clGetPlatformIDs(1, &clPlatform, NULL);
+    CHECK_AND_RETURN(status, "getting platform id failed");
+
+    status = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &clDevice, NULL);
+    CHECK_AND_RETURN(status, "getting device id failed");
+
+    cl_context_properties cps[] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform,
+                                      0 };
+
+    clContext = clCreateContext(cps, 1, &clDevice, NULL, NULL, &status);
+    CHECK_AND_RETURN(status, "creating context failed");
+
+    clCommandQueue = clCreateCommandQueue(clContext, clDevice, 0, &status);
+    CHECK_AND_RETURN(status, "creating command queue failed");
+
+    size_t strSize = strlen(vector_add_str);
+    clProgram = clCreateProgramWithSource(clContext, 1, &vector_add_str, &strSize, &status);
+    CHECK_AND_RETURN(status, "creating program failed");
+
+    status = clBuildProgram(clProgram, 1, &clDevice, NULL, NULL, NULL);
+    CHECK_AND_RETURN(status, "build program failed");
+
+    clKernel = clCreateKernel(clProgram, "vec_add", &status);
+    CHECK_AND_RETURN(status, "creating kernel failed");
+
+    return 0;
+}
+
+
+/* Release all OpenCL objects; safe to call repeatedly.  Always returns 0. */
+jint Java_org_pocl_sample1_MainActivity_destroyCL(JNIEnv *je, jobject jo)
+{
+    releaseCLObjects();
+    return 0;
+}
+
+
+/* Compute C[i] = A[i] + B[i] for i < N on the OpenCL device.
+ * Returns 0 on success or the (logged) OpenCL status code of the step that
+ * failed.  Buffers and pinned Java arrays are released on every path. */
+jint Java_org_pocl_sample1_MainActivity_vectorAddCL(JNIEnv *je , jobject jo,
+                        jint N, jfloatArray _A, jfloatArray _B, jfloatArray _C)
+{
+    /* All locals are declared before the first goto: C++ does not allow
+     * jumping over an initialised declaration to reach the cleanup label. */
+    cl_int  status = CL_INVALID_VALUE;
+    cl_mem  A_obj = NULL, B_obj = NULL, C_obj = NULL;
+    jfloat *A = NULL, *B = NULL, *C = NULL;
+    void   *mapped = NULL;
+    size_t  byteSize, localSize, globalSize;
+
+    /* Refuse sizes the Java arrays cannot back, so the device never reads
+     * or writes past their end.  Nothing is acquired yet: plain return. */
+    if(N <= 0 || !_A || !_B || !_C || je->GetArrayLength(_A) < N ||
+       je->GetArrayLength(_B) < N || je->GetArrayLength(_C) < N)
+        CHECK_AND_RETURN(status, "invalid vector size or array");
+
+    byteSize = (size_t) N * sizeof(float);
+
+    // Get pointers to array from jni wrapped floatArray.  Stop at the first
+    // failure: no further JNI lookups are made once one has failed.
+    status = CL_OUT_OF_HOST_MEMORY;
+    if((A = je->GetFloatArrayElements(_A, 0)) == NULL ||
+       (B = je->GetFloatArrayElements(_B, 0)) == NULL ||
+       (C = je->GetFloatArrayElements(_C, 0)) == NULL)
+        CHECK_AND_GOTO(status, "getting java array elements failed", cleanup);
+
+    A_obj = clCreateBuffer(clContext, (CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR),
+                                                    byteSize, A, &status);
+    CHECK_AND_GOTO(status, "create buffer A failed", cleanup);
+
+    B_obj = clCreateBuffer(clContext, (CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR),
+                                                    byteSize, B, &status);
+    CHECK_AND_GOTO(status, "create buffer B failed", cleanup);
+
+    C_obj = clCreateBuffer(clContext, (CL_MEM_WRITE_ONLY|CL_MEM_USE_HOST_PTR),
+                                                    byteSize, C, &status);
+    CHECK_AND_GOTO(status, "create buffer C failed", cleanup);
+
+    /* Check each argument on its own: OR-ing negative status codes together
+     * would report a value that matches none of the real errors. */
+    status = clSetKernelArg(clKernel, 0, sizeof(cl_int), (void *)&N);
+    CHECK_AND_GOTO(status, "clSetKernelArg failed", cleanup);
+    status = clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&A_obj);
+    CHECK_AND_GOTO(status, "clSetKernelArg failed", cleanup);
+    status = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B_obj);
+    CHECK_AND_GOTO(status, "clSetKernelArg failed", cleanup);
+    status = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&C_obj);
+    CHECK_AND_GOTO(status, "clSetKernelArg failed", cleanup);
+
+    /* Round the global size up to a whole number of work-groups. */
+    localSize = LOCAL_SIZE;
+    globalSize = ((N + localSize - 1) / localSize) * localSize;
+
+    status = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL,
+                            &globalSize, &localSize, 0, NULL, NULL);
+    CHECK_AND_GOTO(status, "clEnqueueNDRange failed", cleanup);
+
+    /* A blocking map is what guarantees that the host memory behind a
+     * CL_MEM_USE_HOST_PTR buffer holds the kernel's output. */
+    mapped = clEnqueueMapBuffer(clCommandQueue, C_obj, CL_TRUE, CL_MAP_READ,
+                                0, byteSize, 0, NULL, NULL, &status);
+    CHECK_AND_GOTO(status, "mapping result buffer failed", cleanup);
+
+    status = clEnqueueUnmapMemObject(clCommandQueue, C_obj, mapped,
+                                     0, NULL, NULL);
+    CHECK_AND_GOTO(status, "unmapping result buffer failed", cleanup);
+
+    status = clFinish(clCommandQueue);
+    CHECK_AND_GOTO(status, "clFinish failed", cleanup);
+
+cleanup:
+    /* After a failure, wait for anything still queued before the host
+     * pointers it may use are handed back to the JVM. */
+    if(status != CL_SUCCESS && clCommandQueue)
+        clFinish(clCommandQueue);
+
+    if(C_obj) clReleaseMemObject(C_obj);
+    if(B_obj) clReleaseMemObject(B_obj);
+    if(A_obj) clReleaseMemObject(A_obj);
+
+    /* A and B were only read, so JNI_ABORT frees them without a copy-back;
+     * C is committed to the Java array only when the run succeeded. */
+    if(A) je->ReleaseFloatArrayElements(_A, A, JNI_ABORT);
+    if(B) je->ReleaseFloatArrayElements(_B, B, JNI_ABORT);
+    if(C) je->ReleaseFloatArrayElements(_C, C, status == CL_SUCCESS ? 0 : JNI_ABORT);
+
+    return status;
+}
+
+/* Set environment variable `key` to `value` for this process, overwriting
+ * any existing value.  Null arguments are ignored. */
+void Java_org_pocl_sample1_MainActivity_setenv(JNIEnv *jniEnv,
+                        jobject _jObj, jstring key, jstring value)
+{
+    const char *keyChars, *valueChars;
+
+    if(!key || !value)
+        return;
+
+    keyChars = jniEnv->GetStringUTFChars(key, 0);
+    valueChars = keyChars ? jniEnv->GetStringUTFChars(value, 0) : NULL;
+
+    if(keyChars && valueChars)
+        setenv(keyChars, valueChars, 1);
+
+    /* setenv() copies both strings, so the JNI buffers can be handed back
+     * to the JVM right away. */
+    if(valueChars) jniEnv->ReleaseStringUTFChars(value, valueChars);
+    if(keyChars)   jniEnv->ReleaseStringUTFChars(key, keyChars);
+}
diff --git a/examples/pocl-android-sample/jni/vectorAdd.h b/examples/pocl-android-sample/jni/vectorAdd.h
new file mode 100644
index 0000000..fb10c29
--- /dev/null
+++ b/examples/pocl-android-sample/jni/vectorAdd.h
@@ -0,0 +1,49 @@
+/* This file can be generated using
+ * javah -jni -classpath src/ -o jni/vectorAdd.h org.pocl.sample1.MainActivity
+ */
+
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_pocl_sample1_MainActivity */
+
+#ifndef _Included_org_pocl_sample1_MainActivity
+#define _Included_org_pocl_sample1_MainActivity
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     org_pocl_sample1_MainActivity
+ * Method:    initCL
+ * Signature: ()I
+ */
+JNIEXPORT jint JNICALL Java_org_pocl_sample1_MainActivity_initCL
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_pocl_sample1_MainActivity
+ * Method:    vectorAddCL
+ * Signature: (I[F[F[F)I
+ */
+JNIEXPORT jint JNICALL Java_org_pocl_sample1_MainActivity_vectorAddCL
+  (JNIEnv *, jobject, jint, jfloatArray, jfloatArray, jfloatArray);
+
+/*
+ * Class:     org_pocl_sample1_MainActivity
+ * Method:    destroyCL
+ * Signature: ()I
+ */
+JNIEXPORT jint JNICALL Java_org_pocl_sample1_MainActivity_destroyCL
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_pocl_sample1_MainActivity
+ * Method:    setenv
+ * Signature: (Ljava/lang/String;Ljava/lang/String;)V
+ */
+JNIEXPORT void JNICALL Java_org_pocl_sample1_MainActivity_setenv
+  (JNIEnv *, jobject, jstring, jstring);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/examples/pocl-android-sample/project.properties b/examples/pocl-android-sample/project.properties
new file mode 100644
index 0000000..a3ee5ab
--- /dev/null
+++ b/examples/pocl-android-sample/project.properties
@@ -0,0 +1,14 @@
+# This file is automatically generated by Android Tools.
+# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
+#
+# This file must be checked in Version Control Systems.
+#
+# To customize properties used by the Ant build system edit
+# "ant.properties", and override values to adapt the script to your
+# project structure.
+#
+# To enable ProGuard to shrink and obfuscate your code, uncomment this (available properties: sdk.dir, user.home):
+#proguard.config=${sdk.dir}/tools/proguard/proguard-android.txt:proguard-project.txt
+
+# Project target.
+target=android-17
diff --git a/examples/pocl-android-sample/res/drawable-hdpi/ic_launcher.png b/examples/pocl-android-sample/res/drawable-hdpi/ic_launcher.png
new file mode 100644
index 0000000..a57e8ca
Binary files /dev/null and b/examples/pocl-android-sample/res/drawable-hdpi/ic_launcher.png differ
diff --git a/examples/pocl-android-sample/res/drawable-mdpi/ic_launcher.png b/examples/pocl-android-sample/res/drawable-mdpi/ic_launcher.png
new file mode 100644
index 0000000..74ea327
Binary files /dev/null and b/examples/pocl-android-sample/res/drawable-mdpi/ic_launcher.png differ
diff --git a/examples/pocl-android-sample/res/drawable-xhdpi/ic_launcher.png b/examples/pocl-android-sample/res/drawable-xhdpi/ic_launcher.png
new file mode 100644
index 0000000..df67752
Binary files /dev/null and b/examples/pocl-android-sample/res/drawable-xhdpi/ic_launcher.png differ
diff --git a/examples/pocl-android-sample/res/drawable-xxhdpi/ic_launcher.png b/examples/pocl-android-sample/res/drawable-xxhdpi/ic_launcher.png
new file mode 100644
index 0000000..adb0a88
Binary files /dev/null and b/examples/pocl-android-sample/res/drawable-xxhdpi/ic_launcher.png differ
diff --git a/examples/pocl-android-sample/res/values-v11/styles.xml b/examples/pocl-android-sample/res/values-v11/styles.xml
new file mode 100644
index 0000000..3c02242
--- /dev/null
+++ b/examples/pocl-android-sample/res/values-v11/styles.xml
@@ -0,0 +1,11 @@
+<resources>
+
+    <!--
+        Base application theme for API 11+. This theme completely replaces
+        AppBaseTheme from res/values/styles.xml on API 11+ devices.
+    -->
+    <style name="AppBaseTheme" parent="android:Theme.Holo.Light">
+        <!-- API 11 theme customizations can go here. -->
+    </style>
+
+</resources>
diff --git a/examples/pocl-android-sample/res/values-v14/styles.xml b/examples/pocl-android-sample/res/values-v14/styles.xml
new file mode 100644
index 0000000..a91fd03
--- /dev/null
+++ b/examples/pocl-android-sample/res/values-v14/styles.xml
@@ -0,0 +1,12 @@
+<resources>
+
+    <!--
+        Base application theme for API 14+. This theme completely replaces
+        AppBaseTheme from BOTH res/values/styles.xml and
+        res/values-v11/styles.xml on API 14+ devices.
+    -->
+    <style name="AppBaseTheme" parent="android:Theme.Holo.Light.DarkActionBar">
+        <!-- API 14 theme customizations can go here. -->
+    </style>
+
+</resources>
diff --git a/examples/pocl-android-sample/res/values-w820dp/dimens.xml b/examples/pocl-android-sample/res/values-w820dp/dimens.xml
new file mode 100644
index 0000000..f3e7020
--- /dev/null
+++ b/examples/pocl-android-sample/res/values-w820dp/dimens.xml
@@ -0,0 +1,10 @@
+<resources>
+
+    <!--
+         Example customization of dimensions originally defined in res/values/dimens.xml
+         (such as screen margins) for screens with more than 820dp of available width. This
+         would include 7" and 10" devices in landscape (~960dp and ~1280dp respectively).
+    -->
+    <dimen name="activity_horizontal_margin">64dp</dimen>
+
+</resources>
diff --git a/examples/pocl-android-sample/res/values/dimens.xml b/examples/pocl-android-sample/res/values/dimens.xml
new file mode 100644
index 0000000..55c1e59
--- /dev/null
+++ b/examples/pocl-android-sample/res/values/dimens.xml
@@ -0,0 +1,7 @@
+<resources>
+
+    <!-- Default screen margins, per the Android Design guidelines. -->
+    <dimen name="activity_horizontal_margin">16dp</dimen>
+    <dimen name="activity_vertical_margin">16dp</dimen>
+
+</resources>
diff --git a/examples/pocl-android-sample/res/values/strings.xml b/examples/pocl-android-sample/res/values/strings.xml
new file mode 100644
index 0000000..8939ed8
--- /dev/null
+++ b/examples/pocl-android-sample/res/values/strings.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+
+    <string name="app_name">pocl vector addition</string>
+
+</resources>
diff --git a/examples/pocl-android-sample/res/values/styles.xml b/examples/pocl-android-sample/res/values/styles.xml
new file mode 100644
index 0000000..6ce89c7
--- /dev/null
+++ b/examples/pocl-android-sample/res/values/styles.xml
@@ -0,0 +1,20 @@
+<resources>
+
+    <!--
+        Base application theme, dependent on API level. This theme is replaced
+        by AppBaseTheme from res/values-vXX/styles.xml on newer devices.
+    -->
+    <style name="AppBaseTheme" parent="android:Theme.Light">
+        <!--
+            Theme customizations available in newer API levels can go in
+            res/values-vXX/styles.xml, while customizations related to
+            backward-compatibility can go here.
+        -->
+    </style>
+
+    <!-- Application theme. -->
+    <style name="AppTheme" parent="AppBaseTheme">
+        <!-- All customizations that are NOT specific to a particular API-level can go here. -->
+    </style>
+
+</resources>
diff --git a/examples/pocl-android-sample/src/org/pocl/sample1/MainActivity.java b/examples/pocl-android-sample/src/org/pocl/sample1/MainActivity.java
new file mode 100644
index 0000000..d5d137e
--- /dev/null
+++ b/examples/pocl-android-sample/src/org/pocl/sample1/MainActivity.java
@@ -0,0 +1,110 @@
+package org.pocl.sample1;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.widget.TextView;
+
+/**
+ * Demo activity: adds two small float vectors through the native OpenCL code
+ * in jni/vectorAdd.cpp and appends the progress and result to a TextView.
+ */
+public class MainActivity extends Activity
+{
+
+    // JNI entry points; their implementations live in jni/vectorAdd.cpp
+    public native int initCL();
+    public native int vectorAddCL(int N, float[] A, float[] B, float[] C);
+    public native int destroyCL();
+    public native void setenv(String key, String value);
+
+    TextView text;
+
+    static {
+        System.loadLibrary("poclVecAdd");
+    }
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState)
+    {
+        super.onCreate(savedInstanceState);
+
+        // Make opencl-stub load pocl; done before the worker thread starts
+        setenv("LIBOPENCL_SO_PATH", "/data/data/org.pocl.libs/files/lib/libpocl.so");
+
+        text = new TextView(this);
+        text.setText("\nOpenCL vector addition example using pocl\n");
+
+        setContentView(text);
+
+        // The OpenCL work runs on its own thread so the UI does not hang
+        new Thread(new Runnable() {
+            @Override
+            public void run() {
+                doVectorAdd();
+            }
+        }).start();
+    }
+
+    /**
+     * Runs the demo end to end: initialise OpenCL, add the vectors, compare
+     * the result with a sum computed in Java, then release OpenCL again.
+     * The native status codes are not checked here; failures show in logcat.
+     */
+    void doVectorAdd()
+    {
+        printLog("\ncalling opencl init functions... ");
+        initCL();
+
+        // Two tiny input vectors and an output vector of the same length
+        final float[] A = {1, 2, 3, 4, 5, 6, 7};
+        final float[] B = {8, 9, 0, 6, 7, 8, 9};
+        final float[] C = new float[A.length];
+
+        printVector("\n A: ", A);
+        printVector("\n B: ", B);
+
+        printLog("\n\ncalling opencl vector-addition kernel... ");
+        vectorAddCL(C.length, A, B, C);
+
+        printVector("\n C: ", C);
+
+        if(isElementwiseSum(A, B, C))
+            printLog("\n\nresult: passed\n");
+        else
+            printLog("\n\nresult: failed\n");
+
+        printLog("\ndestroy opencl resources... ");
+        destroyCL();
+    }
+
+    /** Prints {@code label} followed by each element of {@code values}. */
+    private void printVector(String label, float[] values)
+    {
+        printLog(label);
+        for(float value : values)
+            printLog(Float.toString(value) + "    ");
+    }
+
+    /** Returns true when every {@code sum[i]} equals {@code a[i] + b[i]}. */
+    private static boolean isElementwiseSum(float[] a, float[] b, float[] sum)
+    {
+        for(int i=0; i<sum.length; i++)
+        {
+            if(sum[i] != (a[i] + b[i]))
+                return false;
+        }
+        return true;
+    }
+
+    /** Appends {@code str} to the on-screen text view. */
+    void printLog(final String str)
+    {
+        // Views may only be touched from the UI thread, so post the update
+        runOnUiThread(new Runnable() {
+             @Override
+             public void run() {
+                 text.append(str);
+             }
+        });
+    }
+}
diff --git a/examples/scalarwave/CMakeLists.txt b/examples/scalarwave/CMakeLists.txt
index cf16203..171d29e 100644
--- a/examples/scalarwave/CMakeLists.txt
+++ b/examples/scalarwave/CMakeLists.txt
@@ -71,12 +71,12 @@ add_executable("scalarwave" scalarwave.c scalarwave.cl)
 #scalarwave_LDADD = -lm @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
 target_link_libraries("scalarwave" ${POCLU_LINK_OPTIONS})
 
-add_test("full_applications/scalarwave" "scalarwave")
+add_test("examples/scalarwave" "scalarwave")
 
-set_tests_properties( "full_applications/scalarwave"
+set_tests_properties( "examples/scalarwave"
   PROPERTIES
     COST 3.0
     PASS_REGULAR_EXPRESSION "Done.\n"
     PROCESSORS 1
-    LABELS "FullApplications"
+    LABELS "internal"
     DEPENDS "pocl_version_check")
diff --git a/examples/scalarwave/Makefile.in b/examples/scalarwave/Makefile.in
index 0ae8676..f968a5c 100644
--- a/examples/scalarwave/Makefile.in
+++ b/examples/scalarwave/Makefile.in
@@ -245,6 +245,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -252,6 +253,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -269,8 +271,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -346,6 +346,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/standalone/CMakeLists.txt b/examples/standalone/CMakeLists.txt
index 42d30d8..861b949 100644
--- a/examples/standalone/CMakeLists.txt
+++ b/examples/standalone/CMakeLists.txt
@@ -38,3 +38,9 @@ add_test("pocl-standalone" "/bin/sh" "${CMAKE_BINARY_DIR}/scripts/pocl-standalon
           -o "${CMAKE_BINARY_DIR}/standalone.bc"
           "${CMAKE_CURRENT_SOURCE_DIR}/standalone.cl")
 
+set_tests_properties( "pocl-standalone"
+  PROPERTIES
+    COST 3.0
+    PROCESSORS 1
+    LABELS "internal"
+    DEPENDS "pocl_version_check")
diff --git a/examples/standalone/Makefile.in b/examples/standalone/Makefile.in
index 7236b07..c982b1d 100644
--- a/examples/standalone/Makefile.in
+++ b/examples/standalone/Makefile.in
@@ -194,6 +194,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -201,6 +202,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -218,8 +220,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -295,6 +295,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/trig/Makefile.in b/examples/trig/Makefile.in
index 7c7508c..abc65df 100644
--- a/examples/trig/Makefile.in
+++ b/examples/trig/Makefile.in
@@ -245,6 +245,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -252,6 +253,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -269,8 +271,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -346,6 +346,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/trig/trig.c b/examples/trig/trig.c
index 943b964..5e4ca0c 100644
--- a/examples/trig/trig.c
+++ b/examples/trig/trig.c
@@ -80,7 +80,11 @@ main (void)
       }
     }
 
-  assert(exec_trig_kernel (source, N, srcA, dst) != -1);
+  if (exec_trig_kernel (source, N, srcA, dst) < 0)
+    {
+      printf("Failed to run the kernel.\n");
+      return -1;
+    }
 
   for (i = 0; i < N; ++i)
     {
diff --git a/fix-include/CL/cl_platform.h b/fix-include/CL/cl_platform.h
index b9e556e..2e39d40 100644
--- a/fix-include/CL/cl_platform.h
+++ b/fix-include/CL/cl_platform.h
@@ -21,6 +21,10 @@
    THE SOFTWARE.
 */
 
+#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+
 #ifdef _MSC_VER
 #  include <CL/cl_platform.h>
 #else
diff --git a/include/CL/Makefile.in b/include/CL/Makefile.in
index 4a89437..b0a75ac 100644
--- a/include/CL/Makefile.in
+++ b/include/CL/Makefile.in
@@ -244,6 +244,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -251,6 +252,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -268,8 +270,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -345,6 +345,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/include/CL/cl.h b/include/CL/cl.h
index 203c659..b303330 100644
--- a/include/CL/cl.h
+++ b/include/CL/cl.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ * Copyright (c) 2008 - 2013 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -55,16 +55,19 @@ typedef cl_bitfield         cl_device_fp_config;
 typedef cl_uint             cl_device_mem_cache_type;
 typedef cl_uint             cl_device_local_mem_type;
 typedef cl_bitfield         cl_device_exec_capabilities;
+typedef cl_bitfield         cl_device_svm_capabilities;
 typedef cl_bitfield         cl_command_queue_properties;
 typedef intptr_t            cl_device_partition_property;
 typedef cl_bitfield         cl_device_affinity_domain;
 
 typedef intptr_t            cl_context_properties;
 typedef cl_uint             cl_context_info;
+typedef cl_bitfield         cl_queue_properties;
 typedef cl_uint             cl_command_queue_info;
 typedef cl_uint             cl_channel_order;
 typedef cl_uint             cl_channel_type;
 typedef cl_bitfield         cl_mem_flags;
+typedef cl_bitfield         cl_svm_mem_flags;
 typedef cl_uint             cl_mem_object_type;
 typedef cl_uint             cl_mem_info;
 typedef cl_bitfield         cl_mem_migration_flags;
@@ -74,6 +77,8 @@ typedef cl_uint             cl_addressing_mode;
 typedef cl_uint             cl_filter_mode;
 typedef cl_uint             cl_sampler_info;
 typedef cl_bitfield         cl_map_flags;
+typedef intptr_t            cl_pipe_properties;
+typedef cl_uint             cl_pipe_info;
 typedef cl_uint             cl_program_info;
 typedef cl_uint             cl_program_build_info;
 typedef cl_uint             cl_program_binary_type;
@@ -87,7 +92,8 @@ typedef cl_uint             cl_kernel_work_group_info;
 typedef cl_uint             cl_event_info;
 typedef cl_uint             cl_command_type;
 typedef cl_uint             cl_profiling_info;
-
+typedef cl_bitfield         cl_sampler_properties;
+typedef cl_uint             cl_kernel_exec_info;
 
 typedef struct _cl_image_format {
     cl_channel_order        image_channel_order;
@@ -104,7 +110,13 @@ typedef struct _cl_image_desc {
     size_t                  image_slice_pitch;
     cl_uint                 num_mip_levels;
     cl_uint                 num_samples;
-    cl_mem                  buffer;
+#ifdef __GNUC__
+    __extension__   /* Prevents warnings about anonymous union in -pedantic builds */
+#endif
+    union {
+      cl_mem                  buffer;
+      cl_mem                  mem_object;
+    };
 } cl_image_desc;
 
 typedef struct _cl_buffer_region {
@@ -176,11 +188,14 @@ typedef struct _cl_buffer_region {
 #define CL_INVALID_COMPILER_OPTIONS                 -66
 #define CL_INVALID_LINKER_OPTIONS                   -67
 #define CL_INVALID_DEVICE_PARTITION_COUNT           -68
+#define CL_INVALID_PIPE_SIZE                        -69
+#define CL_INVALID_DEVICE_QUEUE                     -70
 
 /* OpenCL Version */
 #define CL_VERSION_1_0                              1
 #define CL_VERSION_1_1                              1
 #define CL_VERSION_1_2                              1
+#define CL_VERSION_2_0                              1
 
 /* cl_bool */
 #define CL_FALSE                                    0
@@ -204,82 +219,98 @@ typedef struct _cl_buffer_region {
 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
 
 /* cl_device_info */
-#define CL_DEVICE_TYPE                              0x1000
-#define CL_DEVICE_VENDOR_ID                         0x1001
-#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
-#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
-#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
-#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
-#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
-#define CL_DEVICE_ADDRESS_BITS                      0x100D
-#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
-#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
-#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
-#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
-#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
-#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
-#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
-#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
-#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
-#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
-#define CL_DEVICE_MAX_SAMPLERS                      0x1018
-#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
-#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
-#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
-#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
-#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
-#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
-#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
-#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
-#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
-#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
-#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
-#define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
-#define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
-#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
-#define CL_DEVICE_AVAILABLE                         0x1027
-#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
-#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
-#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
-#define CL_DEVICE_NAME                              0x102B
-#define CL_DEVICE_VENDOR                            0x102C
-#define CL_DRIVER_VERSION                           0x102D
-#define CL_DEVICE_PROFILE                           0x102E
-#define CL_DEVICE_VERSION                           0x102F
-#define CL_DEVICE_EXTENSIONS                        0x1030
-#define CL_DEVICE_PLATFORM                          0x1031
-#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
+#define CL_DEVICE_TYPE                                  0x1000
+#define CL_DEVICE_VENDOR_ID                             0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                     0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS              0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE                   0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES                   0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR           0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT          0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT            0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG           0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT          0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE         0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY                   0x100C
+#define CL_DEVICE_ADDRESS_BITS                          0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS                   0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS                  0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                    0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                     0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                    0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                     0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                    0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                     0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                         0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                    0x1017
+#define CL_DEVICE_MAX_SAMPLERS                          0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN                   0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE              0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                      0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE                 0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE             0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE                 0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                       0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE              0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                     0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                        0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                        0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT              0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION            0x1025
+#define CL_DEVICE_ENDIAN_LITTLE                         0x1026
+#define CL_DEVICE_AVAILABLE                             0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                    0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES                0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                      0x102A    /* deprecated */
+#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES              0x102A
+#define CL_DEVICE_NAME                                  0x102B
+#define CL_DEVICE_VENDOR                                0x102C
+#define CL_DRIVER_VERSION                               0x102D
+#define CL_DEVICE_PROFILE                               0x102E
+#define CL_DEVICE_VERSION                               0x102F
+#define CL_DEVICE_EXTENSIONS                            0x1030
+#define CL_DEVICE_PLATFORM                              0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG                      0x1032
 /* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
-#define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
-#define CL_DEVICE_OPENCL_C_VERSION                  0x103D
-#define CL_DEVICE_LINKER_AVAILABLE                  0x103E
-#define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
-#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
-#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
-#define CL_DEVICE_PARENT_DEVICE                     0x1042
-#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
-#define CL_DEVICE_PARTITION_PROPERTIES              0x1044
-#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
-#define CL_DEVICE_PARTITION_TYPE                    0x1046
-#define CL_DEVICE_REFERENCE_COUNT                   0x1047
-#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
-#define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
-#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
-#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF           0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY                   0x1035   /* deprecated */
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR              0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT             0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT               0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG              0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT             0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE            0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF              0x103C
+#define CL_DEVICE_OPENCL_C_VERSION                      0x103D
+#define CL_DEVICE_LINKER_AVAILABLE                      0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS                      0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE                 0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE                  0x1041
+#define CL_DEVICE_PARENT_DEVICE                         0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES             0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES                  0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN             0x1045
+#define CL_DEVICE_PARTITION_TYPE                        0x1046
+#define CL_DEVICE_REFERENCE_COUNT                       0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC           0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE                    0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                 0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT          0x104B
+#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS             0x104C
+#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE              0x104D
+#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES            0x104E
+#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE        0x104F
+#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE              0x1050
+#define CL_DEVICE_MAX_ON_DEVICE_QUEUES                  0x1051
+#define CL_DEVICE_MAX_ON_DEVICE_EVENTS                  0x1052
+#define CL_DEVICE_SVM_CAPABILITIES                      0x1053
+#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE  0x1054
+#define CL_DEVICE_MAX_PIPE_ARGS                         0x1055
+#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS          0x1056
+#define CL_DEVICE_PIPE_MAX_PACKET_SIZE                  0x1057
+#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT   0x1058
+#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT     0x1059
+#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT      0x105A
 
 /* cl_device_fp_config - bitfield */
 #define CL_FP_DENORM                                (1 << 0)
@@ -307,6 +338,8 @@ typedef struct _cl_buffer_region {
 /* cl_command_queue_properties - bitfield */
 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
 #define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+#define CL_QUEUE_ON_DEVICE                          (1 << 2)
+#define CL_QUEUE_ON_DEVICE_DEFAULT                  (1 << 3)
 
 /* cl_context_info  */
 #define CL_CONTEXT_REFERENCE_COUNT                  0x1080
@@ -325,30 +358,40 @@ typedef struct _cl_buffer_region {
 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
     
 /* cl_device_affinity_domain */
-#define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
-#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
-#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
-#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
-#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
-#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA               (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE           (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE           (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE           (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE           (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+
+/* cl_device_svm_capabilities */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS                       (1 << 3)
 
 /* cl_command_queue_info */
 #define CL_QUEUE_CONTEXT                            0x1090
 #define CL_QUEUE_DEVICE                             0x1091
 #define CL_QUEUE_REFERENCE_COUNT                    0x1092
 #define CL_QUEUE_PROPERTIES                         0x1093
+#define CL_QUEUE_SIZE                               0x1094
 
-/* cl_mem_flags - bitfield */
+/* cl_mem_flags and cl_svm_mem_flags - bitfield */
 #define CL_MEM_READ_WRITE                           (1 << 0)
 #define CL_MEM_WRITE_ONLY                           (1 << 1)
 #define CL_MEM_READ_ONLY                            (1 << 2)
 #define CL_MEM_USE_HOST_PTR                         (1 << 3)
 #define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)
-// reserved                                         (1 << 6)    
+/* reserved                                         (1 << 6)    */
 #define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
 #define CL_MEM_HOST_READ_ONLY                       (1 << 8)
 #define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER                (1 << 10)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_SVM_ATOMICS                          (1 << 11)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_KERNEL_READ_AND_WRITE                (1 << 12)
 
 /* cl_mem_migration_flags - bitfield */
 #define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
@@ -370,6 +413,11 @@ typedef struct _cl_buffer_region {
 #define CL_RGBx                                     0x10BC
 #define CL_DEPTH                                    0x10BD
 #define CL_DEPTH_STENCIL                            0x10BE
+#define CL_sRGB                                     0x10BF
+#define CL_sRGBx                                    0x10C0
+#define CL_sRGBA                                    0x10C1
+#define CL_sBGRA                                    0x10C2
+#define CL_ABGR                                     0x10C3
 
 /* cl_channel_type */
 #define CL_SNORM_INT8                               0x10D0
@@ -397,6 +445,7 @@ typedef struct _cl_buffer_region {
 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
 #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
 #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+#define CL_MEM_OBJECT_PIPE                          0x10F7
 
 /* cl_mem_info */
 #define CL_MEM_TYPE                                 0x1100
@@ -408,6 +457,7 @@ typedef struct _cl_buffer_region {
 #define CL_MEM_CONTEXT                              0x1106
 #define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
 #define CL_MEM_OFFSET                               0x1108
+#define CL_MEM_USES_SVM_POINTER                     0x1109
 
 /* cl_image_info */
 #define CL_IMAGE_FORMAT                             0x1110
@@ -422,6 +472,10 @@ typedef struct _cl_buffer_region {
 #define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
 #define CL_IMAGE_NUM_SAMPLES                        0x111A
 
+/* cl_pipe_info */
+#define CL_PIPE_PACKET_SIZE                         0x1120
+#define CL_PIPE_MAX_PACKETS                         0x1121
+
 /* cl_addressing_mode */
 #define CL_ADDRESS_NONE                             0x1130
 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
@@ -439,6 +493,9 @@ typedef struct _cl_buffer_region {
 #define CL_SAMPLER_NORMALIZED_COORDS                0x1152
 #define CL_SAMPLER_ADDRESSING_MODE                  0x1153
 #define CL_SAMPLER_FILTER_MODE                      0x1154
+#define CL_SAMPLER_MIP_FILTER_MODE                  0x1155
+#define CL_SAMPLER_LOD_MIN                          0x1156
+#define CL_SAMPLER_LOD_MAX                          0x1157
 
 /* cl_map_flags - bitfield */
 #define CL_MAP_READ                                 (1 << 0)
@@ -461,6 +518,7 @@ typedef struct _cl_buffer_region {
 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
 #define CL_PROGRAM_BUILD_LOG                        0x1183
 #define CL_PROGRAM_BINARY_TYPE                      0x1184
+#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185
     
 /* cl_program_binary_type */
 #define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
@@ -506,6 +564,7 @@ typedef struct _cl_buffer_region {
 #define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
 #define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
 #define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
+#define CL_KERNEL_ARG_TYPE_PIPE                     (1 << 3)
 
 /* cl_kernel_work_group_info */
 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
@@ -515,6 +574,10 @@ typedef struct _cl_buffer_region {
 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
 #define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
 
+/* cl_kernel_exec_info */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS                0x11B6
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM   0x11B7
+
 /* cl_event_info  */
 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
 #define CL_EVENT_COMMAND_TYPE                       0x11D1
@@ -548,6 +611,11 @@ typedef struct _cl_buffer_region {
 #define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
 #define CL_COMMAND_FILL_BUFFER                      0x1207
 #define CL_COMMAND_FILL_IMAGE                       0x1208
+#define CL_COMMAND_SVM_FREE                         0x1209
+#define CL_COMMAND_SVM_MEMCPY                       0x120A
+#define CL_COMMAND_SVM_MEMFILL                      0x120B
+#define CL_COMMAND_SVM_MAP                          0x120C
+#define CL_COMMAND_SVM_UNMAP                        0x120D
 
 /* command execution status */
 #define CL_COMPLETE                                 0x0
@@ -563,6 +631,7 @@ typedef struct _cl_buffer_region {
 #define CL_PROFILING_COMMAND_SUBMIT                 0x1281
 #define CL_PROFILING_COMMAND_START                  0x1282
 #define CL_PROFILING_COMMAND_END                    0x1283
+#define CL_PROFILING_COMMAND_COMPLETE               0x1284
 
 /********************************************************************************************************/
 
@@ -638,10 +707,10 @@ clGetContextInfo(cl_context         /* context */,
 
 /* Command Queue APIs */
 extern CL_API_ENTRY cl_command_queue CL_API_CALL
-clCreateCommandQueue(cl_context                     /* context */, 
-                     cl_device_id                   /* device */, 
-                     cl_command_queue_properties    /* properties */,
-                     cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateCommandQueueWithProperties(cl_context               /* context */,
+                                   cl_device_id             /* device */,
+                                   const cl_queue_properties *    /* properties */,
+                                   cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
@@ -679,6 +748,14 @@ clCreateImage(cl_context              /* context */,
               void *                  /* host_ptr */,
               cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
                         
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreatePipe(cl_context                 /* context */,
+             cl_mem_flags               /* flags */,
+             cl_uint                    /* pipe_packet_size */,
+             cl_uint                    /* pipe_max_packets */,
+             const cl_pipe_properties * /* properties */,
+             cl_int *                   /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
 
@@ -708,17 +785,34 @@ clGetImageInfo(cl_mem           /* image */,
                size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clSetMemObjectDestructorCallback(  cl_mem /* memobj */, 
-                                    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
-                                    void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;  
+clGetPipeInfo(cl_mem           /* pipe */,
+              cl_pipe_info     /* param_name */,
+              size_t           /* param_value_size */,
+              void *           /* param_value */,
+              size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback(cl_mem /* memobj */,
+                                 void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+                                 void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;
+
+/* SVM Allocation APIs */
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAlloc(cl_context       /* context */,
+           cl_svm_mem_flags /* flags */,
+           size_t           /* size */,
+           cl_uint          /* alignment */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFree(cl_context        /* context */,
+          void *            /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;
 
 /* Sampler APIs */
 extern CL_API_ENTRY cl_sampler CL_API_CALL
-clCreateSampler(cl_context          /* context */,
-                cl_bool             /* normalized_coords */, 
-                cl_addressing_mode  /* addressing_mode */, 
-                cl_filter_mode      /* filter_mode */,
-                cl_int *            /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateSamplerWithProperties(cl_context                     /* context */,
+                              const cl_sampler_properties *  /* sampler_properties */,
+                              cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
@@ -837,6 +931,17 @@ clSetKernelArg(cl_kernel    /* kernel */,
                const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointer(cl_kernel    /* kernel */,
+                         cl_uint      /* arg_index */,
+                         const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfo(cl_kernel            /* kernel */,
+                    cl_kernel_exec_info  /* param_name */,
+                    size_t               /* param_value_size */,
+                    const void *         /* param_value */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
 clGetKernelInfo(cl_kernel       /* kernel */,
                 cl_kernel_info  /* param_name */,
                 size_t          /* param_value_size */,
@@ -917,7 +1022,7 @@ clEnqueueReadBuffer(cl_command_queue    /* command_queue */,
                     cl_uint             /* num_events_in_wait_list */,
                     const cl_event *    /* event_wait_list */,
                     cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueReadBufferRect(cl_command_queue    /* command_queue */,
                         cl_mem              /* buffer */,
@@ -933,7 +1038,7 @@ clEnqueueReadBufferRect(cl_command_queue    /* command_queue */,
                         cl_uint             /* num_events_in_wait_list */,
                         const cl_event *    /* event_wait_list */,
                         cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueWriteBuffer(cl_command_queue   /* command_queue */, 
                      cl_mem             /* buffer */, 
@@ -944,7 +1049,7 @@ clEnqueueWriteBuffer(cl_command_queue   /* command_queue */,
                      cl_uint            /* num_events_in_wait_list */, 
                      const cl_event *   /* event_wait_list */, 
                      cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_0;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueWriteBufferRect(cl_command_queue    /* command_queue */,
                          cl_mem              /* buffer */,
@@ -1122,13 +1227,6 @@ clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
                        cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueTask(cl_command_queue  /* command_queue */,
-              cl_kernel         /* kernel */,
-              cl_uint           /* num_events_in_wait_list */,
-              const cl_event *  /* event_wait_list */,
-              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
 					  void (CL_CALLBACK * /*user_func*/)(void *), 
                       void *            /* args */,
@@ -1141,17 +1239,67 @@ clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
                       cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+clEnqueueMarkerWithWaitList(cl_command_queue  /* command_queue */,
                             cl_uint           /* num_events_in_wait_list */,
                             const cl_event *  /* event_wait_list */,
                             cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+clEnqueueBarrierWithWaitList(cl_command_queue  /* command_queue */,
                              cl_uint           /* num_events_in_wait_list */,
                              const cl_event *  /* event_wait_list */,
                              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
 
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFree(cl_command_queue  /* command_queue */,
+                 cl_uint           /* num_svm_pointers */,
+                 void *[]          /* svm_pointers[] */,
+                 void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+                                                        cl_uint          /* num_svm_pointers */,
+                                                        void *[]         /* svm_pointers[] */,
+                                                        void *           /* user_data */),
+                 void *            /* user_data */,
+                 cl_uint           /* num_events_in_wait_list */,
+                 const cl_event *  /* event_wait_list */,
+                 cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpy(cl_command_queue  /* command_queue */,
+                   cl_bool           /* blocking_copy */,
+                   void *            /* dst_ptr */,
+                   const void *      /* src_ptr */,
+                   size_t            /* size */,
+                   cl_uint           /* num_events_in_wait_list */,
+                   const cl_event *  /* event_wait_list */,
+                   cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFill(cl_command_queue  /* command_queue */,
+                    void *            /* svm_ptr */,
+                    const void *      /* pattern */,
+                    size_t            /* pattern_size */,
+                    size_t            /* size */,
+                    cl_uint           /* num_events_in_wait_list */,
+                    const cl_event *  /* event_wait_list */,
+                    cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMap(cl_command_queue  /* command_queue */,
+                cl_bool           /* blocking_map */,
+                cl_map_flags      /* flags */,
+                void *            /* svm_ptr */,
+                size_t            /* size */,
+                cl_uint           /* num_events_in_wait_list */,
+                const cl_event *  /* event_wait_list */,
+                cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmap(cl_command_queue  /* command_queue */,
+                  void *            /* svm_ptr */,
+                  cl_uint           /* num_events_in_wait_list */,
+                  const cl_event *  /* event_wait_list */,
+                  cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
 
 /* Extension function access
  *
@@ -1165,7 +1313,7 @@ clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
                                          const char *   /* func_name */) CL_API_SUFFIX__VERSION_1_2;
     
 
-// Deprecated OpenCL 1.1 APIs
+/* Deprecated OpenCL 1.1 APIs */
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
 clCreateImage2D(cl_context              /* context */,
                 cl_mem_flags            /* flags */,
@@ -1206,6 +1354,27 @@ clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
 clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 
+/* Deprecated OpenCL 1.2 APIs (deprecated as of OpenCL 2.0) */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context                     /* context */,
+                     cl_device_id                   /* device */,
+                     cl_command_queue_properties    /* properties */,
+                     cl_int *                       /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL
+clCreateSampler(cl_context          /* context */,
+                cl_bool             /* normalized_coords */,
+                cl_addressing_mode  /* addressing_mode */,
+                cl_filter_mode      /* filter_mode */,
+                cl_int *            /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue  /* command_queue */,
+              cl_kernel         /* kernel */,
+              cl_uint           /* num_events_in_wait_list */,
+              const cl_event *  /* event_wait_list */,
+              cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 632cb21..6199478 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -34,10 +34,10 @@ extern "C" {
 #endif
 
 #ifdef __APPLE__
-	#include <OpenCL/cl.h>
+        #include <OpenCL/cl.h>
     #include <AvailabilityMacros.h>
 #else
-	#include <CL/cl.h>
+        #include <CL/cl.h>
 #endif
 
 /* cl_khr_fp16 extension - no extension #define since it has no functions  */
@@ -61,7 +61,7 @@ extern "C" {
  * before using.
  */
 #define cl_APPLE_SetMemObjectDestructor 1
-cl_int	CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
+cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */,
                                         void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
                                         void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
 
@@ -134,15 +134,15 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
  * cl_khr_initalize_memory extension *
  *************************************/
     
-#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x200E
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030
     
     
 /**************************************
  * cl_khr_terminate_context extension *
  **************************************/
     
-#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x200F
-#define CL_CONTEXT_TERMINATE_KHR                    0x2010
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x2031
+#define CL_CONTEXT_TERMINATE_KHR                    0x2032
 
 #define cl_khr_terminate_context 1
 extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
@@ -157,6 +157,10 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
  * Standard Portable Intermediate Representation (SPIR) instance
  */
 
+#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
+
+
 /******************************************
 * cl_nv_device_attribute_query extension *
 ******************************************/
@@ -169,12 +173,17 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
 #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
 #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
 
-
 /*********************************
 * cl_amd_device_attribute_query *
 *********************************/
 #define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
 
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM                      0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
+
 #ifdef CL_VERSION_1_1
    /***********************************
     * cl_ext_device_fission extension *
@@ -239,10 +248,100 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
     #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
     #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
 
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0
+#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
+
+typedef cl_uint                                   cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id             device,
+                         size_t                   image_width,
+                         size_t                   image_height,
+                         const cl_image_format   *image_format,
+                         cl_image_pitch_info_qcom param_name,
+                         size_t                   param_value_size,
+                         void                    *param_value,
+                         size_t                  *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Legal values will be defined in layered extensions. */
+    cl_uint  allocation_type;
+
+    /* Host cache policy for this external memory allocation. */
+    cl_uint  host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+    cl_mem_ext_host_ptr  ext_host_ptr;
 
+    /* ION file descriptor */
+    int                  ion_filedesc;
+
+    /* Host pointer to the ION allocated memory */
+    void*                ion_hostptr;
+
+} cl_mem_ion_host_ptr;
 
 #endif /* CL_VERSION_1_1 */
 
+
+#ifdef CL_VERSION_2_0
+/*********************************
+* cl_khr_sub_groups extension
+*********************************/
+#define cl_khr_sub_groups 1
+
+typedef cl_uint  cl_kernel_sub_group_info;
+
+/* cl_khr_sub_group_info */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR	0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR		0x2034
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
+						   cl_device_id /*in_device*/,
+						   cl_kernel_sub_group_info /* param_name */,
+						   size_t /*input_value_size*/,
+						   const void * /*input_value*/,
+						   size_t /*param_value_size*/,
+						   void* /*param_value*/,
+						   size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0;
+
+typedef CL_API_ENTRY cl_int /* pointer type; parameters mirror clGetKernelSubGroupInfoKHR */
+     ( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
+						      cl_device_id /*in_device*/,
+						      cl_kernel_sub_group_info /* param_name */,
+						      size_t /*input_value_size*/,
+						      const void * /*input_value*/,
+						      size_t /*param_value_size*/,
+						      void* /*param_value*/,
+						      size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
index af2036c..0408093 100644
--- a/include/CL/cl_gl.h
+++ b/include/CL/cl_gl.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ * Copyright (c) 2008 - 2013 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -104,7 +104,7 @@ clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
                           cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
 
 
-// Deprecated OpenCL 1.1 APIs
+/* Deprecated OpenCL 1.1 APIs */
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
 clCreateFromGLTexture2D(cl_context      /* context */,
                         cl_mem_flags    /* flags */,
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
index 77d5353..a46e0a2 100644
--- a/include/CL/cl_gl_ext.h
+++ b/include/CL/cl_gl_ext.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
index 191f057..864907f 100644
--- a/include/CL/cl_platform.h
+++ b/include/CL/cl_platform.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -36,7 +36,7 @@ extern "C" {
 #endif
 
 #if defined(_WIN32)
-    #define CL_API_ENTRY    __declspec(dllexport)
+    #define CL_API_ENTRY
     #define CL_API_CALL     __stdcall
     #define CL_CALLBACK     __stdcall
 #else
@@ -45,6 +45,14 @@ extern "C" {
     #define CL_CALLBACK
 #endif
 
+/*
+ * Deprecation flags refer to the last version of the header in which the
+ * feature was not deprecated.
+ *
+ * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
+ * deprecation but is deprecated in versions later than 1.1.
+ */
+
 #ifdef __APPLE__
     #define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))
     #define CL_API_SUFFIX__VERSION_1_0                  AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
@@ -67,6 +75,12 @@ extern "C" {
         #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
         #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
     #endif
+
+    #define CL_API_SUFFIX__VERSION_2_0                  CL_API_SUFFIX__VERSION_1_2
+    #define CL_EXT_SUFFIX__VERSION_2_0                  CL_EXT_SUFFIX__VERSION_1_2
+    #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED       CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED       CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+
 #else
     #define CL_EXTENSION_WEAK_LINK  
     #define CL_API_SUFFIX__VERSION_1_0
@@ -75,6 +89,8 @@ extern "C" {
     #define CL_EXT_SUFFIX__VERSION_1_1
     #define CL_API_SUFFIX__VERSION_1_2
     #define CL_EXT_SUFFIX__VERSION_1_2
+    #define CL_API_SUFFIX__VERSION_2_0
+    #define CL_EXT_SUFFIX__VERSION_2_0
     
     #ifdef __GNUC__
         #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
@@ -84,7 +100,7 @@ extern "C" {
             #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
             #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
         #endif
-    
+
         #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
             #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED    
             #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
@@ -92,15 +108,23 @@ extern "C" {
             #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
             #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
         #endif
+
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated))
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+         #endif
     #elif _WIN32
         #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
-            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED    
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
             #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
         #else
             #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED 
             #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)     
         #endif
-    
+
         #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
             #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
             #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
@@ -108,12 +132,23 @@ extern "C" {
             #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED 
             #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)     
         #endif
+
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated)
+        #endif
     #else
         #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
         #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
     
         #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
         #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+
+        #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
     #endif
 #endif
 
@@ -213,16 +248,16 @@ typedef double                  cl_double;
 /* scalar types  */
 typedef int8_t          cl_char;
 typedef uint8_t         cl_uchar;
-typedef int16_t         cl_short    __attribute__((__aligned__(2)));
-typedef uint16_t        cl_ushort   __attribute__((__aligned__(2)));
-typedef int32_t         cl_int      __attribute__((__aligned__(4)));
-typedef uint32_t        cl_uint     __attribute__((__aligned__(4)));
-typedef int64_t         cl_long     __attribute__((__aligned__(8)));
-typedef uint64_t        cl_ulong    __attribute__((__aligned__(8)));
+typedef int16_t         cl_short    __attribute__((aligned(2)));
+typedef uint16_t        cl_ushort   __attribute__((aligned(2)));
+typedef int32_t         cl_int      __attribute__((aligned(4)));
+typedef uint32_t        cl_uint     __attribute__((aligned(4)));
+typedef int64_t         cl_long     __attribute__((aligned(8)));
+typedef uint64_t        cl_ulong    __attribute__((aligned(8)));
 
-typedef uint16_t        cl_half     __attribute__((__aligned__(2)));
-typedef float           cl_float    __attribute__((__aligned__(4)));
-typedef double          cl_double   __attribute__((__aligned__(8)));
+typedef uint16_t        cl_half     __attribute__((aligned(2)));
+typedef float           cl_float    __attribute__((aligned(4)));
+typedef double          cl_double   __attribute__((aligned(8)));
 
 /* Macro names and corresponding values defined by OpenCL */
 #define CL_CHAR_BIT         8
@@ -451,9 +486,27 @@ typedef unsigned int cl_GLenum;
     #define __CL_DOUBLE4__  1
 #endif
 
+/* Define capabilities for anonymous struct members. */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && (_MSC_VER >= 1500)
+   /* Microsoft Developer Studio 2008 supports anonymous structs, but
+    * complains by default. */
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__
+   /* Disable warning C4201: nonstandard extension used : nameless
+    * struct/union */
+#pragma warning( push )
+#pragma warning( disable : 4201 )
+#else
+#define  __CL_HAS_ANON_STRUCT__ 0
+#define  __CL_ANON_STRUCT__
+#endif
+
 /* Define alignment keys */
 #if defined( __GNUC__ )
-    #define CL_ALIGNED(_x)          __attribute__ ((__aligned__(_x)))
+    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
 #elif defined( _WIN32) && (_MSC_VER)
     /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
     /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
@@ -466,7 +519,7 @@ typedef unsigned int cl_GLenum;
 #endif
 
 /* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#if __CL_HAS_ANON_STRUCT__
     /* .xyzw and .s0123...{f|F} are supported */
     #define CL_HAS_NAMED_VECTOR_FIELDS 1
     /* .hi and .lo are supported */
@@ -479,10 +532,10 @@ typedef unsigned int cl_GLenum;
 typedef union
 {
     cl_char  CL_ALIGNED(2) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y; };
-   __extension__ struct{ cl_char  s0, s1; };
-   __extension__ struct{ cl_char  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; };
 #endif
 #if defined( __CL_CHAR2__) 
     __cl_char2     v2;
@@ -492,10 +545,10 @@ typedef union
 typedef union
 {
     cl_char  CL_ALIGNED(4) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y, z, w; };
-   __extension__ struct{ cl_char  s0, s1, s2, s3; };
-   __extension__ struct{ cl_char2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
 #endif
 #if defined( __CL_CHAR2__) 
     __cl_char2     v2[2];
@@ -511,10 +564,10 @@ typedef  cl_char4  cl_char3;
 typedef union
 {
     cl_char   CL_ALIGNED(8) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y, z, w; };
-   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_char4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
 #endif
 #if defined( __CL_CHAR2__) 
     __cl_char2     v2[4];
@@ -530,10 +583,10 @@ typedef union
 typedef union
 {
     cl_char  CL_ALIGNED(16) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_char8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
 #endif
 #if defined( __CL_CHAR2__) 
     __cl_char2     v2[8];
@@ -554,10 +607,10 @@ typedef union
 typedef union
 {
     cl_uchar  CL_ALIGNED(2) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y; };
-   __extension__ struct{ cl_uchar  s0, s1; };
-   __extension__ struct{ cl_uchar  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };
 #endif
 #if defined( __cl_uchar2__) 
     __cl_uchar2     v2;
@@ -567,10 +620,10 @@ typedef union
 typedef union
 {
     cl_uchar  CL_ALIGNED(4) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y, z, w; };
-   __extension__ struct{ cl_uchar  s0, s1, s2, s3; };
-   __extension__ struct{ cl_uchar2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
 #endif
 #if defined( __CL_UCHAR2__) 
     __cl_uchar2     v2[2];
@@ -586,10 +639,10 @@ typedef  cl_uchar4  cl_uchar3;
 typedef union
 {
     cl_uchar   CL_ALIGNED(8) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y, z, w; };
-   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_uchar4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
 #endif
 #if defined( __CL_UCHAR2__) 
     __cl_uchar2     v2[4];
@@ -605,10 +658,10 @@ typedef union
 typedef union
 {
     cl_uchar  CL_ALIGNED(16) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_uchar8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
 #endif
 #if defined( __CL_UCHAR2__) 
     __cl_uchar2     v2[8];
@@ -629,10 +682,10 @@ typedef union
 typedef union
 {
     cl_short  CL_ALIGNED(4) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y; };
-   __extension__ struct{ cl_short  s0, s1; };
-   __extension__ struct{ cl_short  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };
 #endif
 #if defined( __CL_SHORT2__) 
     __cl_short2     v2;
@@ -642,10 +695,10 @@ typedef union
 typedef union
 {
     cl_short  CL_ALIGNED(8) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y, z, w; };
-   __extension__ struct{ cl_short  s0, s1, s2, s3; };
-   __extension__ struct{ cl_short2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
 #endif
 #if defined( __CL_SHORT2__) 
     __cl_short2     v2[2];
@@ -661,10 +714,10 @@ typedef  cl_short4  cl_short3;
 typedef union
 {
     cl_short   CL_ALIGNED(16) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y, z, w; };
-   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_short4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
 #endif
 #if defined( __CL_SHORT2__) 
     __cl_short2     v2[4];
@@ -680,10 +733,10 @@ typedef union
 typedef union
 {
     cl_short  CL_ALIGNED(32) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_short8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
 #endif
 #if defined( __CL_SHORT2__) 
     __cl_short2     v2[8];
@@ -704,10 +757,10 @@ typedef union
 typedef union
 {
     cl_ushort  CL_ALIGNED(4) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y; };
-   __extension__ struct{ cl_ushort  s0, s1; };
-   __extension__ struct{ cl_ushort  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };
 #endif
 #if defined( __CL_USHORT2__) 
     __cl_ushort2     v2;
@@ -717,10 +770,10 @@ typedef union
 typedef union
 {
     cl_ushort  CL_ALIGNED(8) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y, z, w; };
-   __extension__ struct{ cl_ushort  s0, s1, s2, s3; };
-   __extension__ struct{ cl_ushort2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
 #endif
 #if defined( __CL_USHORT2__) 
     __cl_ushort2     v2[2];
@@ -736,10 +789,10 @@ typedef  cl_ushort4  cl_ushort3;
 typedef union
 {
     cl_ushort   CL_ALIGNED(16) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y, z, w; };
-   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_ushort4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
 #endif
 #if defined( __CL_USHORT2__) 
     __cl_ushort2     v2[4];
@@ -755,10 +808,10 @@ typedef union
 typedef union
 {
     cl_ushort  CL_ALIGNED(32) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_ushort8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
 #endif
 #if defined( __CL_USHORT2__) 
     __cl_ushort2     v2[8];
@@ -778,10 +831,10 @@ typedef union
 typedef union
 {
     cl_int  CL_ALIGNED(8) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y; };
-   __extension__ struct{ cl_int  s0, s1; };
-   __extension__ struct{ cl_int  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };
 #endif
 #if defined( __CL_INT2__) 
     __cl_int2     v2;
@@ -791,10 +844,10 @@ typedef union
 typedef union
 {
     cl_int  CL_ALIGNED(16) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y, z, w; };
-   __extension__ struct{ cl_int  s0, s1, s2, s3; };
-   __extension__ struct{ cl_int2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
 #endif
 #if defined( __CL_INT2__) 
     __cl_int2     v2[2];
@@ -810,10 +863,10 @@ typedef  cl_int4  cl_int3;
 typedef union
 {
     cl_int   CL_ALIGNED(32) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y, z, w; };
-   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_int4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
 #endif
 #if defined( __CL_INT2__) 
     __cl_int2     v2[4];
@@ -829,10 +882,10 @@ typedef union
 typedef union
 {
     cl_int  CL_ALIGNED(64) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_int8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
 #endif
 #if defined( __CL_INT2__) 
     __cl_int2     v2[8];
@@ -853,10 +906,10 @@ typedef union
 typedef union
 {
     cl_uint  CL_ALIGNED(8) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y; };
-   __extension__ struct{ cl_uint  s0, s1; };
-   __extension__ struct{ cl_uint  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; };
 #endif
 #if defined( __CL_UINT2__) 
     __cl_uint2     v2;
@@ -866,10 +919,10 @@ typedef union
 typedef union
 {
     cl_uint  CL_ALIGNED(16) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y, z, w; };
-   __extension__ struct{ cl_uint  s0, s1, s2, s3; };
-   __extension__ struct{ cl_uint2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
 #endif
 #if defined( __CL_UINT2__) 
     __cl_uint2     v2[2];
@@ -885,10 +938,10 @@ typedef  cl_uint4  cl_uint3;
 typedef union
 {
     cl_uint   CL_ALIGNED(32) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y, z, w; };
-   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_uint4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
 #endif
 #if defined( __CL_UINT2__) 
     __cl_uint2     v2[4];
@@ -904,10 +957,10 @@ typedef union
 typedef union
 {
     cl_uint  CL_ALIGNED(64) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_uint8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
 #endif
 #if defined( __CL_UINT2__) 
     __cl_uint2     v2[8];
@@ -927,10 +980,10 @@ typedef union
 typedef union
 {
     cl_long  CL_ALIGNED(16) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y; };
-   __extension__ struct{ cl_long  s0, s1; };
-   __extension__ struct{ cl_long  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; };
 #endif
 #if defined( __CL_LONG2__) 
     __cl_long2     v2;
@@ -940,10 +993,10 @@ typedef union
 typedef union
 {
     cl_long  CL_ALIGNED(32) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y, z, w; };
-   __extension__ struct{ cl_long  s0, s1, s2, s3; };
-   __extension__ struct{ cl_long2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
 #endif
 #if defined( __CL_LONG2__) 
     __cl_long2     v2[2];
@@ -959,10 +1012,10 @@ typedef  cl_long4  cl_long3;
 typedef union
 {
     cl_long   CL_ALIGNED(64) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y, z, w; };
-   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_long4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
 #endif
 #if defined( __CL_LONG2__) 
     __cl_long2     v2[4];
@@ -978,10 +1031,10 @@ typedef union
 typedef union
 {
     cl_long  CL_ALIGNED(128) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_long8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
 #endif
 #if defined( __CL_LONG2__) 
     __cl_long2     v2[8];
@@ -1002,10 +1055,10 @@ typedef union
 typedef union
 {
     cl_ulong  CL_ALIGNED(16) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y; };
-   __extension__ struct{ cl_ulong  s0, s1; };
-   __extension__ struct{ cl_ulong  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; };
 #endif
 #if defined( __CL_ULONG2__) 
     __cl_ulong2     v2;
@@ -1015,10 +1068,10 @@ typedef union
 typedef union
 {
     cl_ulong  CL_ALIGNED(32) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y, z, w; };
-   __extension__ struct{ cl_ulong  s0, s1, s2, s3; };
-   __extension__ struct{ cl_ulong2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
 #endif
 #if defined( __CL_ULONG2__) 
     __cl_ulong2     v2[2];
@@ -1034,10 +1087,10 @@ typedef  cl_ulong4  cl_ulong3;
 typedef union
 {
     cl_ulong   CL_ALIGNED(64) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y, z, w; };
-   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_ulong4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
 #endif
 #if defined( __CL_ULONG2__) 
     __cl_ulong2     v2[4];
@@ -1053,10 +1106,10 @@ typedef union
 typedef union
 {
     cl_ulong  CL_ALIGNED(128) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_ulong8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
 #endif
 #if defined( __CL_ULONG2__) 
     __cl_ulong2     v2[8];
@@ -1078,10 +1131,10 @@ typedef union
 typedef union
 {
     cl_float  CL_ALIGNED(8) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float  x, y; };
-   __extension__ struct{ cl_float  s0, s1; };
-   __extension__ struct{ cl_float  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; };
 #endif
 #if defined( __CL_FLOAT2__) 
     __cl_float2     v2;
@@ -1091,10 +1144,10 @@ typedef union
 typedef union
 {
     cl_float  CL_ALIGNED(16) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float   x, y, z, w; };
-   __extension__ struct{ cl_float   s0, s1, s2, s3; };
-   __extension__ struct{ cl_float2  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; };
 #endif
 #if defined( __CL_FLOAT2__) 
     __cl_float2     v2[2];
@@ -1110,10 +1163,10 @@ typedef  cl_float4  cl_float3;
 typedef union
 {
     cl_float   CL_ALIGNED(32) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float   x, y, z, w; };
-   __extension__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_float4  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; };
 #endif
 #if defined( __CL_FLOAT2__) 
     __cl_float2     v2[4];
@@ -1129,10 +1182,10 @@ typedef union
 typedef union
 {
     cl_float  CL_ALIGNED(64) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_float8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
 #endif
 #if defined( __CL_FLOAT2__) 
     __cl_float2     v2[8];
@@ -1153,10 +1206,10 @@ typedef union
 typedef union
 {
     cl_double  CL_ALIGNED(16) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y; };
-   __extension__ struct{ cl_double s0, s1; };
-   __extension__ struct{ cl_double lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
 #endif
 #if defined( __CL_DOUBLE2__) 
     __cl_double2     v2;
@@ -1166,10 +1219,10 @@ typedef union
 typedef union
 {
     cl_double  CL_ALIGNED(32) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y, z, w; };
-   __extension__ struct{ cl_double  s0, s1, s2, s3; };
-   __extension__ struct{ cl_double2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
 #endif
 #if defined( __CL_DOUBLE2__) 
     __cl_double2     v2[2];
@@ -1185,10 +1238,10 @@ typedef  cl_double4  cl_double3;
 typedef union
 {
     cl_double   CL_ALIGNED(64) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y, z, w; };
-   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_double4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
 #endif
 #if defined( __CL_DOUBLE2__) 
     __cl_double2     v2[4];
@@ -1204,10 +1257,10 @@ typedef union
 typedef union
 {
     cl_double  CL_ALIGNED(128) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_double8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
 #endif
 #if defined( __CL_DOUBLE2__) 
     __cl_double2     v2[8];
@@ -1251,4 +1304,10 @@ typedef union
 }
 #endif
 
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+#if defined( _WIN32) && (_MSC_VER >= 1500)
+#pragma warning( pop )
+#endif
+
 #endif  /* __CL_PLATFORM_H  */
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 43ff228..0d2ab68 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -25,7 +25,7 @@
 
 add_subdirectory("CL")
 
-set(PRIVATE_HEADERS  _kernel.h _kernel_c.h _kernel_constants.h pocl_types.h pocl_features.h pocl_device.h pocl.h)
+set(PRIVATE_HEADERS  _kernel.h _kernel_c.h _kernel_constants.h pocl_types.h pocl_device.h pocl.h)
 
 install(FILES ${PRIVATE_HEADERS}
         DESTINATION ${POCL_INSTALL_PRIVATE_HEADER_DIR})
diff --git a/include/Makefile.am b/include/Makefile.am
index e7d18f0..cef45f9 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -24,7 +24,7 @@
 
 # Directory containing "private" headers
 pkgdataincludedir = $(pkgdatadir)/include
-pkgdatainclude_HEADERS = _kernel.h _kernel_c.h pocl_types.h pocl_features.h pocl_device.h pocl.h pocl_tests.h _kernel_constants.h
+pkgdatainclude_HEADERS = _kernel.h _kernel_c.h pocl_types.h pocl_device.h pocl.h pocl_tests.h _kernel_constants.h
 
 # Public - and default - includes dir
 include_HEADERS = poclu.h
diff --git a/include/Makefile.in b/include/Makefile.in
index 98f1971..a0e0ec6 100644
--- a/include/Makefile.in
+++ b/include/Makefile.in
@@ -284,6 +284,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -291,6 +292,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -308,8 +310,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -385,6 +385,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -447,7 +448,7 @@ top_srcdir = @top_srcdir@
 
 # Directory containing "private" headers
 pkgdataincludedir = $(pkgdatadir)/include
-pkgdatainclude_HEADERS = _kernel.h _kernel_c.h pocl_types.h pocl_features.h pocl_device.h pocl.h pocl_tests.h _kernel_constants.h
+pkgdatainclude_HEADERS = _kernel.h _kernel_c.h pocl_types.h pocl_device.h pocl.h pocl_tests.h _kernel_constants.h
 
 # Public - and default - includes dir
 include_HEADERS = poclu.h
diff --git a/include/OpenCL/Makefile.in b/include/OpenCL/Makefile.in
index b24ee15..c5f4198 100644
--- a/include/OpenCL/Makefile.in
+++ b/include/OpenCL/Makefile.in
@@ -243,6 +243,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -250,6 +251,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -267,8 +269,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -344,6 +344,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/include/_kernel.h b/include/_kernel.h
index 54ea23d..f90bc50 100644
--- a/include/_kernel.h
+++ b/include/_kernel.h
@@ -26,12 +26,32 @@
 */
 
 /* Language feature detection */
-#if (__clang_major__ == 3) && (__clang_minor__ >= 3)
-#  define _CL_HAS_EVENT_T
-#  define _CL_HAS_IMAGE_ACCESS
-#endif
 #include "_kernel_c.h"
 
+/* If the -cl-std build option is not specified, the highest OpenCL C 1.x
+ * language version supported by each device is used as the version of
+ * OpenCL C when compiling the program for each device.
+ */
+#ifndef __OPENCL_C_VERSION__
+#define __OPENCL_C_VERSION__ 120
+#endif
+
+#if (__OPENCL_C_VERSION__ > 99)
+#define CL_VERSION_1_0 100
+#endif
+
+#if (__OPENCL_C_VERSION__ > 109)
+#define CL_VERSION_1_1 110
+#endif
+
+#if (__OPENCL_C_VERSION__ > 119)
+#define CL_VERSION_1_2 120
+#endif
+
+#if (__OPENCL_C_VERSION__ > 199)
+#define CL_VERSION_2_0 200
+#endif
+
 /* Enable double precision. This should really only be done when
    building the run-time library; when building application code, we
    should instead check a macro to see whether the application has
@@ -66,6 +86,19 @@
 #  define __IF_FP64(x)
 #endif
 
+#ifdef cl_khr_int64_base_atomics
+#define __IF_BA64(x) x
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#else
+#define __IF_BA64(x)
+#endif
+
+#ifdef cl_khr_int64_extended_atomics
+#define __IF_EA64(x) x
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+#else
+#define __IF_EA64(x)
+#endif
 
 /* A static assert statement to catch inconsistencies at build time */
 #if __has_extension(__c_static_assert__)
@@ -76,8 +109,6 @@
 
 typedef uint cl_mem_fence_flags;
 
-

-
 /* Ensure the data types have the right sizes */
 _CL_STATIC_ASSERT(char  , sizeof(char  ) == 1);
 _CL_STATIC_ASSERT(char2 , sizeof(char2 ) == 2 *sizeof(char));
@@ -402,7 +433,7 @@ _CL_DECLARE_CONVERT_TYPE_SRC_DST_SIZE(_rte)
 _CL_DECLARE_CONVERT_TYPE_SRC_DST_SIZE(_rtp)
 _CL_DECLARE_CONVERT_TYPE_SRC_DST_SIZE(_rtn)
 
-

+
 /* Work-Item Functions */
 
 uint _CL_OVERLOADABLE get_work_dim(void);
@@ -421,7 +452,7 @@ barrier (cl_mem_fence_flags flags);
 void _CL_OVERLOADABLE barrier (cl_mem_fence_flags flags);
 #endif
 
-

+
 /* Math Constants */
 
 /* half */
@@ -1338,7 +1369,7 @@ _CL_DECLARE_FUNC_V_V(_cl_native_tan)
 #define ULONG_MAX 0xffffffffffffffffUL
 #endif
 
-

+
 /* Integer Functions */
 #define _CL_DECLARE_FUNC_G_G(NAME)              \
   char     _CL_OVERLOADABLE NAME(char    );     \
@@ -1846,7 +1877,7 @@ _CL_DECLARE_FUNC_G_G(popcount)
 _CL_DECLARE_FUNC_J_JJJ(mad24)
 _CL_DECLARE_FUNC_J_JJ(mul24)
 
-

+
 /* Common Functions */
 
 _CL_DECLARE_FUNC_V_VVV(clamp)
@@ -1865,7 +1896,7 @@ _CL_DECLARE_FUNC_V_VVV(smoothstep)
 _CL_DECLARE_FUNC_V_SSV(smoothstep)
 _CL_DECLARE_FUNC_V_V(sign)
 
-

+
 /* Geometric Functions */
 
 __IF_FP16(    
@@ -1911,7 +1942,7 @@ _CL_DECLARE_FUNC_G_GGUG(select)
 _CL_DECLARE_FUNC_V_VVJ(select)
 _CL_DECLARE_FUNC_V_VVU(select)
 
-

+
 /* Vector Functions */
 
 #define _CL_DECLARE_VLOAD(TYPE, MOD)                                    \
@@ -2085,7 +2116,7 @@ _CL_DECLARE_VSTORE_HALF(__private , _rtn)
 
 #endif
 
-

+
 /* Atomic operations */
 
 #define _CL_DECLARE_ATOMICS(MOD, TYPE)                                  \
@@ -2100,19 +2131,34 @@ _CL_DECLARE_VSTORE_HALF(__private , _rtn)
   _CL_OVERLOADABLE TYPE atomic_and    (volatile MOD TYPE *p, TYPE val); \
   _CL_OVERLOADABLE TYPE atomic_or     (volatile MOD TYPE *p, TYPE val); \
   _CL_OVERLOADABLE TYPE atomic_xor    (volatile MOD TYPE *p, TYPE val);
+
 _CL_DECLARE_ATOMICS(__global, int  )
 _CL_DECLARE_ATOMICS(__global, uint )
 _CL_DECLARE_ATOMICS(__local , int  )
 _CL_DECLARE_ATOMICS(__local , uint )
-
-_CL_DECLARE_ATOMICS(__global, long )
-_CL_DECLARE_ATOMICS(__global, ulong)
-_CL_DECLARE_ATOMICS(__local , long )
-_CL_DECLARE_ATOMICS(__local , ulong)
-
 _CL_OVERLOADABLE float atomic_xchg(volatile __global float *p, float val);
 _CL_OVERLOADABLE float atomic_xchg(volatile __local  float *p, float val);
 
+#define _CL_DECLARE_ATOMICS64(MOD, TYPE)                                \
+  __IF_BA64(                                                            \
+  _CL_OVERLOADABLE TYPE atomic_add    (volatile MOD TYPE *p, TYPE val); \
+  _CL_OVERLOADABLE TYPE atomic_sub    (volatile MOD TYPE *p, TYPE val); \
+  _CL_OVERLOADABLE TYPE atomic_xchg   (volatile MOD TYPE *p, TYPE val); \
+  _CL_OVERLOADABLE TYPE atomic_inc    (volatile MOD TYPE *p);           \
+  _CL_OVERLOADABLE TYPE atomic_dec    (volatile MOD TYPE *p);           \
+  _CL_OVERLOADABLE TYPE atomic_cmpxchg(volatile MOD TYPE *p, TYPE cmp, TYPE val);) \
+  __IF_EA64(                                                            \
+  _CL_OVERLOADABLE TYPE atomic_min    (volatile MOD TYPE *p, TYPE val); \
+  _CL_OVERLOADABLE TYPE atomic_max    (volatile MOD TYPE *p, TYPE val); \
+  _CL_OVERLOADABLE TYPE atomic_and    (volatile MOD TYPE *p, TYPE val); \
+  _CL_OVERLOADABLE TYPE atomic_or     (volatile MOD TYPE *p, TYPE val); \
+  _CL_OVERLOADABLE TYPE atomic_xor    (volatile MOD TYPE *p, TYPE val);)
+
+_CL_DECLARE_ATOMICS64(__global, long )
+_CL_DECLARE_ATOMICS64(__global, ulong)
+_CL_DECLARE_ATOMICS64(__local , long )
+_CL_DECLARE_ATOMICS64(__local , ulong)
+
 #define atom_add     atomic_add
 #define atom_sub     atomic_sub
 #define atom_xchg    atomic_xchg
@@ -2125,7 +2171,99 @@ _CL_OVERLOADABLE float atomic_xchg(volatile __local  float *p, float val);
 #define atom_or      atomic_or
 #define atom_xor     atomic_xor
 
-

+
+/* OpenCL 2.0 Atomics */
+
+#if (__clang_major__ == 3) && (__clang_minor__ >= 7)
+
+#if (__OPENCL_C_VERSION__ > 199) && (__OPENCL_VERSION__ > 199)
+
+#define ATOMIC_VAR_INIT(value) (value)
+
+typedef enum memory_order {
+  memory_order_relaxed,
+  memory_order_acquire,
+  memory_order_release,
+  memory_order_acq_rel,
+  memory_order_seq_cst,
+} memory_order;
+
+typedef enum memory_scope {
+  memory_scope_work_item,
+  memory_scope_work_group,
+  memory_scope_device,
+  memory_scope_all_svm_devices,
+  memory_scope_sub_group,
+} memory_scope;
+
+
+void atomic_work_item_fence(cl_mem_fence_flags flags,
+                            memory_order order,
+                            memory_scope scope);
+
+#define _CL_DECL_ATOMICS_EXPL1(RET, NAME, MOD, A)           \
+  RET _CL_OVERLOADABLE NAME(volatile MOD A *object);        \
+  RET _CL_OVERLOADABLE NAME ## _explicit(volatile MOD       \
+            A *object, memory_order order);                 \
+  RET _CL_OVERLOADABLE NAME ## _explicit(volatile MOD       \
+            A *object, memory_order order, memory_scope scope);
+
+#define _CL_DECL_ATOMICS_EXPL2(RET, NAME, MOD, A, C) \
+  RET _CL_OVERLOADABLE NAME(volatile MOD A *object, C value);           \
+  RET _CL_OVERLOADABLE NAME ## _explicit (volatile MOD A *object,       \
+                                          C value, memory_order order); \
+  RET _CL_OVERLOADABLE NAME ## _explicit (volatile MOD A *object,       \
+                                          C value, memory_order order,  \
+                                          memory_scope scope);
+
+#define _CL_DECL_ATOMICS_EXPL_CMPXCH(TYPE, MOD, A, C)                   \
+  _CL_OVERLOADABLE bool atomic_compare_exchange_##TYPE(                 \
+                      volatile MOD A *object, private C *expected, C desired);  \
+  _CL_OVERLOADABLE bool atomic_compare_exchange_##TYPE##_explicit(      \
+                      volatile MOD A *object, private C *expected, C desired,   \
+                      memory_order success, memory_order failure);      \
+  _CL_OVERLOADABLE bool atomic_compare_exchange_##TYPE##_explicit(      \
+                      volatile MOD A *object, private C *expected, C desired,   \
+                      memory_order success, memory_order failure,       \
+                      memory_scope scope);
+
+#define _CL_DECLARE_ATOMICS_DECL(MOD, A, C)                                 \
+  _CL_DECL_ATOMICS_EXPL2(void, atomic_store, MOD, A, C)                     \
+  _CL_OVERLOADABLE void atomic_init (volatile MOD A *object, C value);      \
+  _CL_DECL_ATOMICS_EXPL1(C, atomic_load, MOD, A)                            \
+  _CL_DECL_ATOMICS_EXPL2(C, atomic_exchange, MOD, A, C)                     \
+  _CL_DECL_ATOMICS_EXPL_CMPXCH(strong, MOD, A, C)                           \
+  _CL_DECL_ATOMICS_EXPL_CMPXCH(weak, MOD, A, C)                             \
+  _CL_DECL_ATOMICS_EXPL2(C, atomic_fetch_add, MOD, A, C)                    \
+  _CL_DECL_ATOMICS_EXPL2(C, atomic_fetch_sub, MOD, A, C)                    \
+  _CL_DECL_ATOMICS_EXPL2(C, atomic_fetch_or, MOD, A, C)                     \
+  _CL_DECL_ATOMICS_EXPL2(C, atomic_fetch_xor, MOD, A, C)                    \
+  _CL_DECL_ATOMICS_EXPL2(C, atomic_fetch_and, MOD, A, C)                    \
+  _CL_DECL_ATOMICS_EXPL2(C, atomic_fetch_min, MOD, A, C)                    \
+  _CL_DECL_ATOMICS_EXPL2(C, atomic_fetch_max, MOD, A, C)
+
+#define _CL_DECLARE_ATOMICS2(MOD)                                           \
+  _CL_DECL_ATOMICS_EXPL1(bool, atomic_flag_test_and_set, MOD, atomic_flag)  \
+  _CL_DECL_ATOMICS_EXPL1(void, atomic_flag_clear, MOD, atomic_flag)         \
+  _CL_DECLARE_ATOMICS_DECL(MOD, atomic_int, int)                            \
+  _CL_DECLARE_ATOMICS_DECL(MOD, atomic_uint, uint)                          \
+  _CL_DECLARE_ATOMICS_DECL(MOD, atomic_float, float)                        \
+  __IF_EA64(_CL_DECLARE_ATOMICS_DECL(MOD, atomic_long, long))               \
+  __IF_EA64(_CL_DECLARE_ATOMICS_DECL(MOD, atomic_ulong, ulong))             \
+  __IF_FP64(_CL_DECLARE_ATOMICS_DECL(MOD, atomic_double, double))           \
+  _CL_DECLARE_ATOMICS_DECL(MOD, atomic_intptr_t, intptr_t)                  \
+  _CL_DECLARE_ATOMICS_DECL(MOD, atomic_uintptr_t, uintptr_t)                \
+  _CL_DECLARE_ATOMICS_DECL(MOD, atomic_ptrdiff_t, ptrdiff_t)                \
+  _CL_DECLARE_ATOMICS_DECL(MOD, atomic_size_t, size_t)
+
+_CL_DECLARE_ATOMICS2(global)
+
+_CL_DECLARE_ATOMICS2(local)
+
+#endif
+
+#endif
+
 /* Miscellaneous Vector Functions */
 
 
@@ -2162,18 +2300,6 @@ _CL_DECLARE_SHUFFLE_MN(ulong , ulong ))
 __IF_FP64(
 _CL_DECLARE_SHUFFLE_MN(double, ulong ))
 
-

-#if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 4)
-
-// If Clang is too old, we just wrap the libc printf
-// Note: These older versions of Clang do not put string literals into
-// the "constant" address space, so we have to use "const" here.
-// Note: We cannot use __attribute__((format(printf, 1, 2))), since
-// this is confused about the difference between C long and OpenCL C
-// long.
-int printf(const char* restrict fmt, ...);
-
-#else
 
 // We provide our own printf
 // Note: We declare our printf as taking a constant format string, but
@@ -2185,16 +2311,10 @@ int printf(const char* restrict fmt, ...);
 int _cl_printf(constant char* restrict format, ...);
 #define printf _cl_printf
 
-#endif
 
-

 /* Async Copies from Global to Local Memory, Local to
    Global Memory, and Prefetch */
 
-#ifndef _CL_HAS_EVENT_T
-typedef uint event_t;
-#endif
-
 #define _CL_DECLARE_ASYNC_COPY_FUNCS_SINGLE(GENTYPE)            \
   _CL_OVERLOADABLE                                              \
   event_t async_work_group_copy (__local GENTYPE *dst,          \
diff --git a/include/_kernel_c.h b/include/_kernel_c.h
index be6732b..36c7527 100644
--- a/include/_kernel_c.h
+++ b/include/_kernel_c.h
@@ -155,11 +155,10 @@ typedef ulong ulong16 __attribute__((__ext_vector_type__(16)));
 
 /* Starting from Clang 3.3 the image and sampler are detected
    as opaque types by the frontend. In order to define
-   the default builtins we use C functions which require 
-   the typedefs to the actual underlying types. Clang 3.2
-   the typedefs throughout as the types are not detected
-   by the frontend. */
-#if !defined(_CL_HAS_IMAGE_ACCESS)
+   the default builtins we use C functions which require
+   the typedefs to the actual underlying types.
+*/
+#if defined(__CBUILD__)
 typedef int sampler_t;
 
 /* Since some built-ins have different return types
@@ -179,8 +178,6 @@ typedef struct _pocl_image1d_array_t { dev_image_t base; }* image1d_array_t;
 #endif
 
 
-//#ifdef _CL_HAS_IMAGE_ACCESS
-
 float4 _CL_OVERLOADABLE read_imagef (image2d_t image, sampler_t sampler,
                                      int2 coord);
 
diff --git a/include/pocl.h b/include/pocl.h
index 6eec4a0..99bb156 100644
--- a/include/pocl.h
+++ b/include/pocl.h
@@ -49,6 +49,7 @@
 #define POCL_ADDRESS_SPACE_GLOBAL 1
 #define POCL_ADDRESS_SPACE_LOCAL 2
 #define POCL_ADDRESS_SPACE_CONSTANT 3
+#define POCL_ADDRESS_SPACE_GENERIC 4
 
 #define POCL_FILENAME_LENGTH 1024
 
@@ -134,7 +135,6 @@ typedef struct
   mem_mapping_t *mapping;
 } _cl_command_map;
 
-
 /* clEnqueue(Write/Read)Image */
 typedef struct
 {
@@ -170,9 +170,48 @@ typedef struct
 
 typedef struct
 {
+  void* ptr;
+  size_t size, offset;
+  void* pattern;
+  size_t pattern_size;
+} _cl_command_fill;
+
+typedef struct
+{
   void *data;
 } _cl_command_marker;
 
+typedef struct
+{
+  void* data;
+  void* queue;
+  unsigned  num_svm_pointers;
+  void  **svm_pointers;
+  void (CL_CALLBACK  *pfn_free_func) ( cl_command_queue queue,
+                                       unsigned num_svm_pointers,
+                                       void *svm_pointers[],
+                                       void  *user_data);
+} _cl_command_svm_free;
+
+typedef struct
+{
+  void* svm_ptr;
+  size_t size;
+  cl_map_flags flags;
+} _cl_command_svm_map;
+
+typedef struct
+{
+  void* svm_ptr;
+} _cl_command_svm_unmap;
+
+typedef struct
+{
+  const void* src;
+  void* dst;
+  size_t size;
+} _cl_command_svm_cpy;
+
 typedef union
 {
   _cl_command_run run;
@@ -185,6 +224,12 @@ typedef union
   _cl_command_rw_image rw_image;
   _cl_command_marker marker;
   _cl_command_unmap unmap;
+  _cl_command_fill memfill;
+
+  _cl_command_svm_free svm_free;
+  _cl_command_svm_map svm_map;
+  _cl_command_svm_unmap svm_unmap;
+  _cl_command_svm_cpy svm_memcpy;
 } _cl_command_t;
 
 // one item in the command queue
@@ -201,19 +246,8 @@ typedef struct _cl_command_node_struct
 
 /* Additional LLVM version macros to simplify ifdefs */
 
-#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4)
-#define LLVM_OLDER_THAN_3_5 1
-#endif
-
-#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || \
-    defined(LLVM_3_5)
-
-# define LLVM_OLDER_THAN_3_6 1
-# define LLVM_OLDER_THAN_3_7 1
-#elif (defined LLVM_3_6)
-
-# define LLVM_OLDER_THAN_3_7 1
-
+#if (defined LLVM_3_7)
+# define LLVM_OLDER_THAN_3_8 1
 #endif
 
 #endif /* POCL_H */
diff --git a/include/pocl_cache.h b/include/pocl_cache.h
index 23fc694..1bc207c 100644
--- a/include/pocl_cache.h
+++ b/include/pocl_cache.h
@@ -44,7 +44,7 @@ void pocl_cache_init_topdir();
 int
 pocl_cache_create_program_cachedir(cl_program program, unsigned device_i,
                                    const char* preprocessed_source, size_t source_len,
-                                   char *program_bc_path, void **cache_lock);
+                                   char *program_bc_path);
 
 void pocl_cache_cleanup_cachedir(cl_program program);
 
@@ -52,6 +52,10 @@ void* pocl_cache_acquire_writer_lock_i(cl_program program, unsigned device_i);
 
 void* pocl_cache_acquire_writer_lock(cl_program program, cl_device_id device);
 
+void* pocl_cache_acquire_reader_lock_i(cl_program program, unsigned device_i);
+
+void* pocl_cache_acquire_reader_lock(cl_program program, cl_device_id device);
+
 void pocl_cache_release_lock(void* lock);
 
 int pocl_cl_device_to_index(cl_program   program,
diff --git a/include/pocl_features.h b/include/pocl_features.h
deleted file mode 100644
index 7486366..0000000
--- a/include/pocl_features.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Supported datataypes
-
-// All supported Clang versions support __fp16 to some extent,
-// however, the support for 'half' of OpenCL C properly added
-// only in 3.3, and even that does not handle half vectors well
-// for targets without native support. 
-
-#if (__clang_major__ == 3) && (__clang_minor__ > 2) && !defined(_CL_DISABLE_HALF)
-#  define cl_khr_fp16
-#else
-#  undef cl_khr_fp16
-#endif
-
-// Is long supported in OpenCL C?
-// This is checked at configure-time
-#ifndef _CL_DISABLE_LONG
-#  define cl_khr_int64
-#else
-#  undef cl_khr_int64
-#endif
-
-// Is double supported?
-#if defined cl_khr_int64 && __SIZEOF_DOUBLE__ == 8
-#  if !defined(cl_khr_fp64)
-#    define cl_khr_fp64
-#  endif
-#else
-#  undef cl_khr_fp64
-#endif
-
-// Architecture-specific overrides
-#ifdef __TCE__
-#  define __EMBEDDED_PROFILE__ 1
-// TODO: Are these necessary?
-#  undef cl_khr_int64
-#  undef cl_khr_fp64
-#endif
diff --git a/include/pocl_types.h b/include/pocl_types.h
index fd9a82a..2ba8e62 100644
--- a/include/pocl_types.h
+++ b/include/pocl_types.h
@@ -1,7 +1,5 @@
 // Scalar type definitions
 
-#include "pocl_features.h"
-
 #if defined cl_khr_fp64 && !defined cl_khr_int64
 #  error "cl_khr_fp64 requires cl_khr_int64"
 #endif
diff --git a/include/vccompat.hpp b/include/vccompat.hpp
index f931fd4..9f772c1 100644
--- a/include/vccompat.hpp
+++ b/include/vccompat.hpp
@@ -32,6 +32,9 @@
 #define __restrict__ __restrict
 #define restrict __restrict
 
+#include <intrin.h>
+#define __builtin_popcount __popcnt
+
 // ERROR is used as label for goto in some OCL API functions
 #undef ERROR
 
diff --git a/lib/CL/CMakeLists.txt b/lib/CL/CMakeLists.txt
index 7631a32..be32339 100644
--- a/lib/CL/CMakeLists.txt
+++ b/lib/CL/CMakeLists.txt
@@ -34,12 +34,14 @@ set(POCL_LIB_SOURCES  "clCreateContextFromType.c"
                    "clRetainContext.c"
                    "clGetContextInfo.c"
                    "clCreateCommandQueue.c"
+                   "clCreateCommandQueueWithProperties.c"
                    "clReleaseCommandQueue.c"
                    "clRetainCommandQueue.c"
                    "clGetCommandQueueInfo.c"
                    "clCreateBuffer.c"
                    "clCreateSubBuffer.c"
                    "clEnqueueFillImage.c"
+                   "clEnqueueFillBuffer.c"
                    "clEnqueueReadBuffer.c"
                    "clEnqueueReadBufferRect.c"
                    "clEnqueueMapBuffer.c"  "clEnqueueMapBuffer.h"
@@ -115,13 +117,21 @@ set(POCL_LIB_SOURCES  "clCreateContextFromType.c"
                    "pocl_cl.h" "pocl_util.h" "pocl_util.c"
                    "pocl_queue_util.h" "pocl_queue_util.c"
                    "pocl_image_util.c" "pocl_image_util.h"
-                   "pocl_icd.h" "pocl_llvm.h" "pocl_cache.c"
+                   "pocl_img_buf_cpy.h" "pocl_img_buf_cpy.c"
+                   "pocl_icd.h" "pocl_llvm.h"
                    "pocl_runtime_config.c" "pocl_runtime_config.h"
                    "pocl_mem_management.c"  "pocl_mem_management.h"
                    "pocl_llvm_api.cc" "pocl_hash.c"
-                   "pocl_debug.h" "pocl_debug.c")
+                   "pocl_debug.h" "pocl_debug.c" "pocl_timing.c"
+                   "clSVMAlloc.c" "clSVMFree.c" "clEnqueueSVMFree.c"
+                   "clEnqueueSVMMap.c" "clEnqueueSVMUnmap.c"
+                   "clEnqueueSVMMemcpy.c" "clEnqueueSVMMemFill.c"
+                   "clSetKernelArgSVMPointer.c" "clSetKernelExecInfo.c")
 
-set(LIBPOCL_OBJS "$<TARGET_OBJECTS:llvmpasses>;$<TARGET_OBJECTS:libpocl_unlinked_objs>;${POCL_DEVICES_OBJS}")
+set(LIBPOCL_OBJS "$<TARGET_OBJECTS:llvmpasses>"
+                 "$<TARGET_OBJECTS:libpocl_unlinked_objs>"
+                 "$<TARGET_OBJECTS:pocl_cache>"
+                 ${POCL_DEVICES_OBJS})
 
 add_compile_options(${OCL_ICD_CFLAGS})
 
@@ -137,8 +147,11 @@ endif(MSVC)
 # this is so that we don't compile twice when building both libpocl and libOpenCL
 add_library("libpocl_unlinked_objs" OBJECT ${POCL_LIB_SOURCES})
 
-# pocl_cache.c depends on a SHA1 hash of the built kernel-<machine>.bc
-add_dependencies("libpocl_unlinked_objs" "kernel_host")
+add_library("pocl_cache" OBJECT "pocl_cache.c")
+
+# pocl_cache.c depends on a SHA1 hash of all built kernel-<target>.bc
+add_dependencies("pocl_cache" "kernellib_hash")
+
 #################################################################
 
 if (MSVC)
diff --git a/lib/CL/Makefile.am b/lib/CL/Makefile.am
index 3d91f8d..0f3b6df 100644
--- a/lib/CL/Makefile.am
+++ b/lib/CL/Makefile.am
@@ -43,12 +43,14 @@ libpocl_la_SOURCES = clCreateContextFromType.c	\
                    clRetainContext.c		\
                    clGetContextInfo.c		\
                    clCreateCommandQueue.c	\
+                   clCreateCommandQueueWithProperties.c	\
                    clReleaseCommandQueue.c	\
                    clRetainCommandQueue.c	\
                    clGetCommandQueueInfo.c	\
                    clCreateBuffer.c		\
                    clCreateSubBuffer.c		\
                    clEnqueueFillImage.c	\
+                   clEnqueueFillBuffer.c	\
                    clEnqueueReadBuffer.c	\
                    clEnqueueReadBufferRect.c	\
                    clEnqueueMapBuffer.c	\
@@ -122,12 +124,22 @@ libpocl_la_SOURCES = clCreateContextFromType.c	\
                    clCreateSubDevices.c \
                    clReleaseDevice.c \
                    clRetainDevice.c \
+                   clSVMAlloc.c \
+                   clSVMFree.c \
+                   clEnqueueSVMFree.c \
+                   clEnqueueSVMMap.c \
+                   clEnqueueSVMUnmap.c \
+                   clEnqueueSVMMemcpy.c \
+                   clEnqueueSVMMemFill.c \
+                   clSetKernelArgSVMPointer.c \
+                   clSetKernelExecInfo.c \
                    pocl_cl.h \
                    pocl_util.c pocl_util.h \
                    pocl_queue_util.c pocl_queue_util.h \
                    pocl_image_util.c pocl_image_util.h \
-                   pocl_icd.h \
-                   pocl_intfn.h \
+                   pocl_img_buf_cpy.h pocl_img_buf_cpy.c \
+                   pocl_timing.c pocl_timing.h \
+                   pocl_intfn.h pocl_icd.h \
                    pocl_llvm.h \
                    pocl_runtime_config.c pocl_runtime_config.h \
                    pocl_mem_management.c pocl_mem_management.h \
@@ -160,9 +172,6 @@ if HAVE_CLOCK_GETTIME
 libpocl_la_LIBADD += -lrt
 endif
 
-if BUILD_SPU
-libpocl_la_LIBADD += -lspe2
-endif
 
 #Kludge: compile pocl_llvm_api.cc into a library of its own.
 #The source file is necessarely a C++ file, and having a C++ file
@@ -186,9 +195,6 @@ libpocl_la_LIBADD += -lclangStaticAnalyzerFrontend -lclangStaticAnalyzerCore
 libpocl_la_LIBADD += -lclangAnalysis -lclangCodeGen -lclangAST
 libpocl_la_LIBADD += -lstdc++ -lm @LLVM_LIBS@
 
-if LLVM_3_6
-libpocl_la_LIBADD += -lclangToolingCore 
-endif
 
 libpocl_la_LIBADD += -lclangASTMatchers -lclangBasic
 
@@ -208,5 +214,5 @@ BUILT_SOURCES = kernellib_hash.h
 FORCE:
 
 kernellib_hash.h: FORCE
-	echo '#define POCL_KERNELLIB_SHA1 "'`sha1sum ../../include/_kernel.h ../../include/_kernel_c.h ../../include/pocl_types.h ../../include/pocl_features.h ../kernel/*.cl ../kernel/*.c ../kernel/vecmathlib-pocl/*.cl ../kernel/vecmathlib-pocl/*.cc | sha1sum -`'"' > kernellib_hash.new
+	echo '#define POCL_KERNELLIB_SHA1 "'`sha1sum ../../include/_kernel.h ../../include/_kernel_c.h ../../include/pocl_types.h ../kernel/*.cl ../kernel/*.c ../kernel/vecmathlib-pocl/*.cl ../kernel/vecmathlib-pocl/*.cc | sha1sum -`'"' > kernellib_hash.new
 	cmp kernellib_hash.new kernellib_hash.h || mv kernellib_hash.new kernellib_hash.h
diff --git a/lib/CL/Makefile.in b/lib/CL/Makefile.in
index 9572e6c..0bb5afa 100644
--- a/lib/CL/Makefile.in
+++ b/lib/CL/Makefile.in
@@ -131,8 +131,6 @@ target_triplet = @target@
 # have it in a separate lib.
 # Mac doesnt have lrt, or clock_gettime
@HAVE_CLOCK_GETTIME_TRUE@am__append_4 = -lrt
-@BUILD_SPU_TRUE@am__append_5 = -lspe2
-@LLVM_3_6_TRUE@am__append_6 = -lclangToolingCore 
 subdir = lib/CL
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
@@ -178,15 +176,15 @@ am__installdirs = "$(DESTDIR)$(libdir)"
 LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES)
 am__DEPENDENCIES_1 =
 am__DEPENDENCIES_2 = devices/libpocl-devices.la $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) libpoclllvm.la \
-	${top_builddir}/lib/llvmopencl/libllvmpasses.la \
-	$(am__DEPENDENCIES_1)
+	$(am__DEPENDENCIES_1) libpoclllvm.la \
+	${top_builddir}/lib/llvmopencl/libllvmpasses.la
@BUILD_ICD_FALSE@libOpenCL_la_DEPENDENCIES = $(am__DEPENDENCIES_2)
 am__libOpenCL_la_SOURCES_DIST = clCreateContextFromType.c \
 	clReleaseContext.c clRetainContext.c clGetContextInfo.c \
-	clCreateCommandQueue.c clReleaseCommandQueue.c \
-	clRetainCommandQueue.c clGetCommandQueueInfo.c \
-	clCreateBuffer.c clCreateSubBuffer.c clEnqueueFillImage.c \
+	clCreateCommandQueue.c clCreateCommandQueueWithProperties.c \
+	clReleaseCommandQueue.c clRetainCommandQueue.c \
+	clGetCommandQueueInfo.c clCreateBuffer.c clCreateSubBuffer.c \
+	clEnqueueFillImage.c clEnqueueFillBuffer.c \
 	clEnqueueReadBuffer.c clEnqueueReadBufferRect.c \
 	clEnqueueMapBuffer.c clEnqueueMapBuffer.h \
 	clEnqueueUnmapMemObject.c clEnqueueMarkerWithWaitList.c \
@@ -216,23 +214,30 @@ am__libOpenCL_la_SOURCES_DIST = clCreateContextFromType.c \
 	clCreateFromGLTexture3D.c clUnloadCompiler.c \
 	clGetSupportedImageFormats.c clGetExtensionFunctionAddress.c \
 	clIcdGetPlatformIDsKHR.c clCreateSubDevices.c \
-	clReleaseDevice.c clRetainDevice.c pocl_cl.h pocl_util.c \
-	pocl_util.h pocl_queue_util.c pocl_queue_util.h \
-	pocl_image_util.c pocl_image_util.h pocl_icd.h pocl_intfn.h \
-	pocl_llvm.h pocl_runtime_config.c pocl_runtime_config.h \
-	pocl_mem_management.c pocl_mem_management.h pocl_hash.c \
-	pocl_hash.h pocl_cache.c pocl_debug.c pocl_debug.h
+	clReleaseDevice.c clRetainDevice.c clSVMAlloc.c clSVMFree.c \
+	clEnqueueSVMFree.c clEnqueueSVMMap.c clEnqueueSVMUnmap.c \
+	clEnqueueSVMMemcpy.c clEnqueueSVMMemFill.c \
+	clSetKernelArgSVMPointer.c clSetKernelExecInfo.c pocl_cl.h \
+	pocl_util.c pocl_util.h pocl_queue_util.c pocl_queue_util.h \
+	pocl_image_util.c pocl_image_util.h pocl_img_buf_cpy.h \
+	pocl_img_buf_cpy.c pocl_timing.c pocl_timing.h pocl_intfn.h \
+	pocl_icd.h pocl_llvm.h pocl_runtime_config.c \
+	pocl_runtime_config.h pocl_mem_management.c \
+	pocl_mem_management.h pocl_hash.c pocl_hash.h pocl_cache.c \
+	pocl_debug.c pocl_debug.h
 am__objects_1 = libOpenCL_la-clCreateContextFromType.lo \
 	libOpenCL_la-clReleaseContext.lo \
 	libOpenCL_la-clRetainContext.lo \
 	libOpenCL_la-clGetContextInfo.lo \
 	libOpenCL_la-clCreateCommandQueue.lo \
+	libOpenCL_la-clCreateCommandQueueWithProperties.lo \
 	libOpenCL_la-clReleaseCommandQueue.lo \
 	libOpenCL_la-clRetainCommandQueue.lo \
 	libOpenCL_la-clGetCommandQueueInfo.lo \
 	libOpenCL_la-clCreateBuffer.lo \
 	libOpenCL_la-clCreateSubBuffer.lo \
 	libOpenCL_la-clEnqueueFillImage.lo \
+	libOpenCL_la-clEnqueueFillBuffer.lo \
 	libOpenCL_la-clEnqueueReadBuffer.lo \
 	libOpenCL_la-clEnqueueReadBufferRect.lo \
 	libOpenCL_la-clEnqueueMapBuffer.lo \
@@ -297,8 +302,17 @@ am__objects_1 = libOpenCL_la-clCreateContextFromType.lo \
 	libOpenCL_la-clIcdGetPlatformIDsKHR.lo \
 	libOpenCL_la-clCreateSubDevices.lo \
 	libOpenCL_la-clReleaseDevice.lo libOpenCL_la-clRetainDevice.lo \
-	libOpenCL_la-pocl_util.lo libOpenCL_la-pocl_queue_util.lo \
+	libOpenCL_la-clSVMAlloc.lo libOpenCL_la-clSVMFree.lo \
+	libOpenCL_la-clEnqueueSVMFree.lo \
+	libOpenCL_la-clEnqueueSVMMap.lo \
+	libOpenCL_la-clEnqueueSVMUnmap.lo \
+	libOpenCL_la-clEnqueueSVMMemcpy.lo \
+	libOpenCL_la-clEnqueueSVMMemFill.lo \
+	libOpenCL_la-clSetKernelArgSVMPointer.lo \
+	libOpenCL_la-clSetKernelExecInfo.lo libOpenCL_la-pocl_util.lo \
+	libOpenCL_la-pocl_queue_util.lo \
 	libOpenCL_la-pocl_image_util.lo \
+	libOpenCL_la-pocl_img_buf_cpy.lo libOpenCL_la-pocl_timing.lo \
 	libOpenCL_la-pocl_runtime_config.lo \
 	libOpenCL_la-pocl_mem_management.lo libOpenCL_la-pocl_hash.lo \
 	libOpenCL_la-pocl_cache.lo libOpenCL_la-pocl_debug.lo
@@ -313,19 +327,19 @@ libOpenCL_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(libOpenCL_la_LDFLAGS) $(LDFLAGS) -o $@
 @BUILD_ICD_FALSE at am_libOpenCL_la_rpath = -rpath $(libdir)
 libpocl_la_DEPENDENCIES = devices/libpocl-devices.la \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_1) libpoclllvm.la \
-	${top_builddir}/lib/llvmopencl/libllvmpasses.la \
-	$(am__DEPENDENCIES_1)
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) libpoclllvm.la \
+	${top_builddir}/lib/llvmopencl/libllvmpasses.la
 am_libpocl_la_OBJECTS = libpocl_la-clCreateContextFromType.lo \
 	libpocl_la-clReleaseContext.lo libpocl_la-clRetainContext.lo \
 	libpocl_la-clGetContextInfo.lo \
 	libpocl_la-clCreateCommandQueue.lo \
+	libpocl_la-clCreateCommandQueueWithProperties.lo \
 	libpocl_la-clReleaseCommandQueue.lo \
 	libpocl_la-clRetainCommandQueue.lo \
 	libpocl_la-clGetCommandQueueInfo.lo \
 	libpocl_la-clCreateBuffer.lo libpocl_la-clCreateSubBuffer.lo \
 	libpocl_la-clEnqueueFillImage.lo \
+	libpocl_la-clEnqueueFillBuffer.lo \
 	libpocl_la-clEnqueueReadBuffer.lo \
 	libpocl_la-clEnqueueReadBufferRect.lo \
 	libpocl_la-clEnqueueMapBuffer.lo \
@@ -380,8 +394,15 @@ am_libpocl_la_OBJECTS = libpocl_la-clCreateContextFromType.lo \
 	libpocl_la-clGetExtensionFunctionAddress.lo \
 	libpocl_la-clIcdGetPlatformIDsKHR.lo \
 	libpocl_la-clCreateSubDevices.lo libpocl_la-clReleaseDevice.lo \
-	libpocl_la-clRetainDevice.lo libpocl_la-pocl_util.lo \
+	libpocl_la-clRetainDevice.lo libpocl_la-clSVMAlloc.lo \
+	libpocl_la-clSVMFree.lo libpocl_la-clEnqueueSVMFree.lo \
+	libpocl_la-clEnqueueSVMMap.lo libpocl_la-clEnqueueSVMUnmap.lo \
+	libpocl_la-clEnqueueSVMMemcpy.lo \
+	libpocl_la-clEnqueueSVMMemFill.lo \
+	libpocl_la-clSetKernelArgSVMPointer.lo \
+	libpocl_la-clSetKernelExecInfo.lo libpocl_la-pocl_util.lo \
 	libpocl_la-pocl_queue_util.lo libpocl_la-pocl_image_util.lo \
+	libpocl_la-pocl_img_buf_cpy.lo libpocl_la-pocl_timing.lo \
 	libpocl_la-pocl_runtime_config.lo \
 	libpocl_la-pocl_mem_management.lo libpocl_la-pocl_hash.lo \
 	libpocl_la-pocl_cache.lo libpocl_la-pocl_debug.lo
@@ -564,6 +585,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -571,6 +593,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -588,8 +611,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -665,6 +686,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -731,12 +753,14 @@ libpocl_la_SOURCES = clCreateContextFromType.c	\
                    clRetainContext.c		\
                    clGetContextInfo.c		\
                    clCreateCommandQueue.c	\
+                   clCreateCommandQueueWithProperties.c	\
                    clReleaseCommandQueue.c	\
                    clRetainCommandQueue.c	\
                    clGetCommandQueueInfo.c	\
                    clCreateBuffer.c		\
                    clCreateSubBuffer.c		\
                    clEnqueueFillImage.c	\
+                   clEnqueueFillBuffer.c	\
                    clEnqueueReadBuffer.c	\
                    clEnqueueReadBufferRect.c	\
                    clEnqueueMapBuffer.c	\
@@ -810,12 +834,22 @@ libpocl_la_SOURCES = clCreateContextFromType.c	\
                    clCreateSubDevices.c \
                    clReleaseDevice.c \
                    clRetainDevice.c \
+                   clSVMAlloc.c \
+                   clSVMFree.c \
+                   clEnqueueSVMFree.c \
+                   clEnqueueSVMMap.c \
+                   clEnqueueSVMUnmap.c \
+                   clEnqueueSVMMemcpy.c \
+                   clEnqueueSVMMemFill.c \
+                   clSetKernelArgSVMPointer.c \
+                   clSetKernelExecInfo.c \
                    pocl_cl.h \
                    pocl_util.c pocl_util.h \
                    pocl_queue_util.c pocl_queue_util.h \
                    pocl_image_util.c pocl_image_util.h \
-                   pocl_icd.h \
-                   pocl_intfn.h \
+                   pocl_img_buf_cpy.h pocl_img_buf_cpy.c \
+                   pocl_timing.c pocl_timing.h \
+                   pocl_intfn.h pocl_icd.h \
                    pocl_llvm.h \
                    pocl_runtime_config.c pocl_runtime_config.h \
                    pocl_mem_management.c pocl_mem_management.h \
@@ -827,14 +861,14 @@ libpocl_la_LDFLAGS = -lltdl @PTHREAD_CFLAGS@ -version-info \
 	${LIB_VERSION} $(am__append_2) $(am__append_3) @LLVM_LDFLAGS@ \
 	-L${top_builddir}/lib/CL/ @ICD_LD_FLAGS@
 libpocl_la_LIBADD = devices/libpocl-devices.la ${LTDL_LIBS} \
-	$(am__append_4) $(am__append_5) libpoclllvm.la \
+	$(am__append_4) libpoclllvm.la \
 	${top_builddir}/lib/llvmopencl/libllvmpasses.la \
 	-lclangFrontend -lclangDriver -lclangParse -lclangSema \
 	-lclangEdit -lclangLex -lclangSerialization -lclangAST \
 	-lclangBasic -lclangFrontendTool -lclangRewriteFrontend \
 	-lclangStaticAnalyzerFrontend -lclangStaticAnalyzerCore \
 	-lclangAnalysis -lclangCodeGen -lclangAST -lstdc++ -lm \
-	@LLVM_LIBS@ $(am__append_6) -lclangASTMatchers -lclangBasic
+	@LLVM_LIBS@ -lclangASTMatchers -lclangBasic
 
 #Kludge: compile pocl_llvm_api.cc into a library of its own.
 #The source file is necessarely a C++ file, and having a C++ file
@@ -952,6 +986,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clBuildProgram.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clCreateBuffer.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clCreateCommandQueue.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clCreateCommandQueueWithProperties.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clCreateContext.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clCreateContextFromType.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clCreateFromGLTexture2D.Plo at am__quote@
@@ -973,6 +1008,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueCopyBufferToImage.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueCopyImage.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueCopyImageToBuffer.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueFillBuffer.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueFillImage.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueMapBuffer.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueMapImage.Plo at am__quote@
@@ -983,6 +1019,11 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueReadBuffer.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueReadBufferRect.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueReadImage.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueSVMFree.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueSVMMap.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueSVMMemFill.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueSVMMemcpy.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueSVMUnmap.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueTask.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueUnmapMemObject.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clEnqueueWaitForEvents.Plo at am__quote@
@@ -1026,8 +1067,12 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clRetainMemObject.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clRetainProgram.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clRetainSampler.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clSVMAlloc.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clSVMFree.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clSetEventCallback.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clSetKernelArg.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clSetKernelArgSVMPointer.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clSetKernelExecInfo.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clSetMemObjectDestructorCallback.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clSetUserEventStatus.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-clUnloadCompiler.Plo at am__quote@
@@ -1036,13 +1081,16 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-pocl_debug.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-pocl_hash.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-pocl_image_util.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-pocl_img_buf_cpy.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-pocl_mem_management.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-pocl_queue_util.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-pocl_runtime_config.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-pocl_timing.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libOpenCL_la-pocl_util.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clBuildProgram.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clCreateBuffer.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clCreateCommandQueue.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clCreateCommandQueueWithProperties.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clCreateContext.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clCreateContextFromType.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clCreateFromGLTexture2D.Plo at am__quote@
@@ -1064,6 +1112,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueCopyBufferToImage.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueCopyImage.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueCopyImageToBuffer.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueFillBuffer.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueFillImage.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueMapBuffer.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueMapImage.Plo at am__quote@
@@ -1074,6 +1123,11 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueReadBuffer.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueReadBufferRect.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueReadImage.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueSVMFree.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueSVMMap.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueSVMMemFill.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueSVMMemcpy.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueSVMUnmap.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueTask.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueUnmapMemObject.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clEnqueueWaitForEvents.Plo at am__quote@
@@ -1117,8 +1171,12 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clRetainMemObject.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clRetainProgram.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clRetainSampler.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clSVMAlloc.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clSVMFree.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clSetEventCallback.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clSetKernelArg.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clSetKernelArgSVMPointer.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clSetKernelExecInfo.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clSetMemObjectDestructorCallback.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clSetUserEventStatus.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-clUnloadCompiler.Plo at am__quote@
@@ -1127,9 +1185,11 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-pocl_debug.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-pocl_hash.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-pocl_image_util.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-pocl_img_buf_cpy.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-pocl_mem_management.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-pocl_queue_util.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-pocl_runtime_config.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-pocl_timing.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpocl_la-pocl_util.Plo at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libpoclllvm_la-pocl_llvm_api.Plo at am__quote@
 
@@ -1189,6 +1249,13 @@ libOpenCL_la-clCreateCommandQueue.lo: clCreateCommandQueue.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clCreateCommandQueue.lo `test -f 'clCreateCommandQueue.c' || echo '$(srcdir)/'`clCreateCommandQueue.c
 
+libOpenCL_la-clCreateCommandQueueWithProperties.lo: clCreateCommandQueueWithProperties.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clCreateCommandQueueWithProperties.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clCreateCommandQueueWithProperties.Tpo -c -o libOpenCL_la-clCreateCommandQueueWithProperties.lo `test -f 'clCreateCommandQueueWithProperties.c' || echo '$(srcdir)/'`clCreateCommandQueu [...]
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clCreateCommandQueueWithProperties.Tpo $(DEPDIR)/libOpenCL_la-clCreateCommandQueueWithProperties.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clCreateCommandQueueWithProperties.c' object='libOpenCL_la-clCreateCommandQueueWithProperties.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clCreateCommandQueueWithProperties.lo `test -f 'clCreateCommandQueueWithProperties.c' || echo '$(srcdir)/'`clCreateCommandQueueWithProperties.c
+
 libOpenCL_la-clReleaseCommandQueue.lo: clReleaseCommandQueue.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clReleaseCommandQueue.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clReleaseCommandQueue.Tpo -c -o libOpenCL_la-clReleaseCommandQueue.lo `test -f 'clReleaseCommandQueue.c' || echo '$(srcdir)/'`clReleaseCommandQueue.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clReleaseCommandQueue.Tpo $(DEPDIR)/libOpenCL_la-clReleaseCommandQueue.Plo
@@ -1231,6 +1298,13 @@ libOpenCL_la-clEnqueueFillImage.lo: clEnqueueFillImage.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clEnqueueFillImage.lo `test -f 'clEnqueueFillImage.c' || echo '$(srcdir)/'`clEnqueueFillImage.c
 
+libOpenCL_la-clEnqueueFillBuffer.lo: clEnqueueFillBuffer.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clEnqueueFillBuffer.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clEnqueueFillBuffer.Tpo -c -o libOpenCL_la-clEnqueueFillBuffer.lo `test -f 'clEnqueueFillBuffer.c' || echo '$(srcdir)/'`clEnqueueFillBuffer.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clEnqueueFillBuffer.Tpo $(DEPDIR)/libOpenCL_la-clEnqueueFillBuffer.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueFillBuffer.c' object='libOpenCL_la-clEnqueueFillBuffer.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clEnqueueFillBuffer.lo `test -f 'clEnqueueFillBuffer.c' || echo '$(srcdir)/'`clEnqueueFillBuffer.c
+
 libOpenCL_la-clEnqueueReadBuffer.lo: clEnqueueReadBuffer.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clEnqueueReadBuffer.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clEnqueueReadBuffer.Tpo -c -o libOpenCL_la-clEnqueueReadBuffer.lo `test -f 'clEnqueueReadBuffer.c' || echo '$(srcdir)/'`clEnqueueReadBuffer.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clEnqueueReadBuffer.Tpo $(DEPDIR)/libOpenCL_la-clEnqueueReadBuffer.Plo
@@ -1735,6 +1809,69 @@ libOpenCL_la-clRetainDevice.lo: clRetainDevice.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clRetainDevice.lo `test -f 'clRetainDevice.c' || echo '$(srcdir)/'`clRetainDevice.c
 
+libOpenCL_la-clSVMAlloc.lo: clSVMAlloc.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clSVMAlloc.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clSVMAlloc.Tpo -c -o libOpenCL_la-clSVMAlloc.lo `test -f 'clSVMAlloc.c' || echo '$(srcdir)/'`clSVMAlloc.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clSVMAlloc.Tpo $(DEPDIR)/libOpenCL_la-clSVMAlloc.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clSVMAlloc.c' object='libOpenCL_la-clSVMAlloc.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clSVMAlloc.lo `test -f 'clSVMAlloc.c' || echo '$(srcdir)/'`clSVMAlloc.c
+
+libOpenCL_la-clSVMFree.lo: clSVMFree.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clSVMFree.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clSVMFree.Tpo -c -o libOpenCL_la-clSVMFree.lo `test -f 'clSVMFree.c' || echo '$(srcdir)/'`clSVMFree.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clSVMFree.Tpo $(DEPDIR)/libOpenCL_la-clSVMFree.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clSVMFree.c' object='libOpenCL_la-clSVMFree.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clSVMFree.lo `test -f 'clSVMFree.c' || echo '$(srcdir)/'`clSVMFree.c
+
+libOpenCL_la-clEnqueueSVMFree.lo: clEnqueueSVMFree.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clEnqueueSVMFree.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clEnqueueSVMFree.Tpo -c -o libOpenCL_la-clEnqueueSVMFree.lo `test -f 'clEnqueueSVMFree.c' || echo '$(srcdir)/'`clEnqueueSVMFree.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clEnqueueSVMFree.Tpo $(DEPDIR)/libOpenCL_la-clEnqueueSVMFree.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMFree.c' object='libOpenCL_la-clEnqueueSVMFree.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clEnqueueSVMFree.lo `test -f 'clEnqueueSVMFree.c' || echo '$(srcdir)/'`clEnqueueSVMFree.c
+
+libOpenCL_la-clEnqueueSVMMap.lo: clEnqueueSVMMap.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clEnqueueSVMMap.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clEnqueueSVMMap.Tpo -c -o libOpenCL_la-clEnqueueSVMMap.lo `test -f 'clEnqueueSVMMap.c' || echo '$(srcdir)/'`clEnqueueSVMMap.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clEnqueueSVMMap.Tpo $(DEPDIR)/libOpenCL_la-clEnqueueSVMMap.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMMap.c' object='libOpenCL_la-clEnqueueSVMMap.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clEnqueueSVMMap.lo `test -f 'clEnqueueSVMMap.c' || echo '$(srcdir)/'`clEnqueueSVMMap.c
+
+libOpenCL_la-clEnqueueSVMUnmap.lo: clEnqueueSVMUnmap.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clEnqueueSVMUnmap.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clEnqueueSVMUnmap.Tpo -c -o libOpenCL_la-clEnqueueSVMUnmap.lo `test -f 'clEnqueueSVMUnmap.c' || echo '$(srcdir)/'`clEnqueueSVMUnmap.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clEnqueueSVMUnmap.Tpo $(DEPDIR)/libOpenCL_la-clEnqueueSVMUnmap.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMUnmap.c' object='libOpenCL_la-clEnqueueSVMUnmap.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clEnqueueSVMUnmap.lo `test -f 'clEnqueueSVMUnmap.c' || echo '$(srcdir)/'`clEnqueueSVMUnmap.c
+
+libOpenCL_la-clEnqueueSVMMemcpy.lo: clEnqueueSVMMemcpy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clEnqueueSVMMemcpy.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clEnqueueSVMMemcpy.Tpo -c -o libOpenCL_la-clEnqueueSVMMemcpy.lo `test -f 'clEnqueueSVMMemcpy.c' || echo '$(srcdir)/'`clEnqueueSVMMemcpy.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clEnqueueSVMMemcpy.Tpo $(DEPDIR)/libOpenCL_la-clEnqueueSVMMemcpy.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMMemcpy.c' object='libOpenCL_la-clEnqueueSVMMemcpy.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clEnqueueSVMMemcpy.lo `test -f 'clEnqueueSVMMemcpy.c' || echo '$(srcdir)/'`clEnqueueSVMMemcpy.c
+
+libOpenCL_la-clEnqueueSVMMemFill.lo: clEnqueueSVMMemFill.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clEnqueueSVMMemFill.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clEnqueueSVMMemFill.Tpo -c -o libOpenCL_la-clEnqueueSVMMemFill.lo `test -f 'clEnqueueSVMMemFill.c' || echo '$(srcdir)/'`clEnqueueSVMMemFill.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clEnqueueSVMMemFill.Tpo $(DEPDIR)/libOpenCL_la-clEnqueueSVMMemFill.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMMemFill.c' object='libOpenCL_la-clEnqueueSVMMemFill.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clEnqueueSVMMemFill.lo `test -f 'clEnqueueSVMMemFill.c' || echo '$(srcdir)/'`clEnqueueSVMMemFill.c
+
+libOpenCL_la-clSetKernelArgSVMPointer.lo: clSetKernelArgSVMPointer.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clSetKernelArgSVMPointer.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clSetKernelArgSVMPointer.Tpo -c -o libOpenCL_la-clSetKernelArgSVMPointer.lo `test -f 'clSetKernelArgSVMPointer.c' || echo '$(srcdir)/'`clSetKernelArgSVMPointer.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clSetKernelArgSVMPointer.Tpo $(DEPDIR)/libOpenCL_la-clSetKernelArgSVMPointer.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clSetKernelArgSVMPointer.c' object='libOpenCL_la-clSetKernelArgSVMPointer.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clSetKernelArgSVMPointer.lo `test -f 'clSetKernelArgSVMPointer.c' || echo '$(srcdir)/'`clSetKernelArgSVMPointer.c
+
+libOpenCL_la-clSetKernelExecInfo.lo: clSetKernelExecInfo.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-clSetKernelExecInfo.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-clSetKernelExecInfo.Tpo -c -o libOpenCL_la-clSetKernelExecInfo.lo `test -f 'clSetKernelExecInfo.c' || echo '$(srcdir)/'`clSetKernelExecInfo.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-clSetKernelExecInfo.Tpo $(DEPDIR)/libOpenCL_la-clSetKernelExecInfo.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clSetKernelExecInfo.c' object='libOpenCL_la-clSetKernelExecInfo.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-clSetKernelExecInfo.lo `test -f 'clSetKernelExecInfo.c' || echo '$(srcdir)/'`clSetKernelExecInfo.c
+
 libOpenCL_la-pocl_util.lo: pocl_util.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-pocl_util.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-pocl_util.Tpo -c -o libOpenCL_la-pocl_util.lo `test -f 'pocl_util.c' || echo '$(srcdir)/'`pocl_util.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-pocl_util.Tpo $(DEPDIR)/libOpenCL_la-pocl_util.Plo
@@ -1756,6 +1893,20 @@ libOpenCL_la-pocl_image_util.lo: pocl_image_util.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-pocl_image_util.lo `test -f 'pocl_image_util.c' || echo '$(srcdir)/'`pocl_image_util.c
 
+libOpenCL_la-pocl_img_buf_cpy.lo: pocl_img_buf_cpy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-pocl_img_buf_cpy.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-pocl_img_buf_cpy.Tpo -c -o libOpenCL_la-pocl_img_buf_cpy.lo `test -f 'pocl_img_buf_cpy.c' || echo '$(srcdir)/'`pocl_img_buf_cpy.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-pocl_img_buf_cpy.Tpo $(DEPDIR)/libOpenCL_la-pocl_img_buf_cpy.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pocl_img_buf_cpy.c' object='libOpenCL_la-pocl_img_buf_cpy.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-pocl_img_buf_cpy.lo `test -f 'pocl_img_buf_cpy.c' || echo '$(srcdir)/'`pocl_img_buf_cpy.c
+
+libOpenCL_la-pocl_timing.lo: pocl_timing.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-pocl_timing.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-pocl_timing.Tpo -c -o libOpenCL_la-pocl_timing.lo `test -f 'pocl_timing.c' || echo '$(srcdir)/'`pocl_timing.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-pocl_timing.Tpo $(DEPDIR)/libOpenCL_la-pocl_timing.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pocl_timing.c' object='libOpenCL_la-pocl_timing.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libOpenCL_la-pocl_timing.lo `test -f 'pocl_timing.c' || echo '$(srcdir)/'`pocl_timing.c
+
 libOpenCL_la-pocl_runtime_config.lo: pocl_runtime_config.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libOpenCL_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libOpenCL_la-pocl_runtime_config.lo -MD -MP -MF $(DEPDIR)/libOpenCL_la-pocl_runtime_config.Tpo -c -o libOpenCL_la-pocl_runtime_config.lo `test -f 'pocl_runtime_config.c' || echo '$(srcdir)/'`pocl_runtime_config.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libOpenCL_la-pocl_runtime_config.Tpo $(DEPDIR)/libOpenCL_la-pocl_runtime_config.Plo
@@ -1826,6 +1977,13 @@ libpocl_la-clCreateCommandQueue.lo: clCreateCommandQueue.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clCreateCommandQueue.lo `test -f 'clCreateCommandQueue.c' || echo '$(srcdir)/'`clCreateCommandQueue.c
 
+libpocl_la-clCreateCommandQueueWithProperties.lo: clCreateCommandQueueWithProperties.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clCreateCommandQueueWithProperties.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clCreateCommandQueueWithProperties.Tpo -c -o libpocl_la-clCreateCommandQueueWithProperties.lo `test -f 'clCreateCommandQueueWithProperties.c' || echo '$(srcdir)/'`clCreateCommandQueueWithProperties.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clCreateCommandQueueWithProperties.Tpo $(DEPDIR)/libpocl_la-clCreateCommandQueueWithProperties.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clCreateCommandQueueWithProperties.c' object='libpocl_la-clCreateCommandQueueWithProperties.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clCreateCommandQueueWithProperties.lo `test -f 'clCreateCommandQueueWithProperties.c' || echo '$(srcdir)/'`clCreateCommandQueueWithProperties.c
+
 libpocl_la-clReleaseCommandQueue.lo: clReleaseCommandQueue.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clReleaseCommandQueue.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clReleaseCommandQueue.Tpo -c -o libpocl_la-clReleaseCommandQueue.lo `test -f 'clReleaseCommandQueue.c' || echo '$(srcdir)/'`clReleaseCommandQueue.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clReleaseCommandQueue.Tpo $(DEPDIR)/libpocl_la-clReleaseCommandQueue.Plo
@@ -1868,6 +2026,13 @@ libpocl_la-clEnqueueFillImage.lo: clEnqueueFillImage.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clEnqueueFillImage.lo `test -f 'clEnqueueFillImage.c' || echo '$(srcdir)/'`clEnqueueFillImage.c
 
+libpocl_la-clEnqueueFillBuffer.lo: clEnqueueFillBuffer.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clEnqueueFillBuffer.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clEnqueueFillBuffer.Tpo -c -o libpocl_la-clEnqueueFillBuffer.lo `test -f 'clEnqueueFillBuffer.c' || echo '$(srcdir)/'`clEnqueueFillBuffer.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clEnqueueFillBuffer.Tpo $(DEPDIR)/libpocl_la-clEnqueueFillBuffer.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueFillBuffer.c' object='libpocl_la-clEnqueueFillBuffer.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clEnqueueFillBuffer.lo `test -f 'clEnqueueFillBuffer.c' || echo '$(srcdir)/'`clEnqueueFillBuffer.c
+
 libpocl_la-clEnqueueReadBuffer.lo: clEnqueueReadBuffer.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clEnqueueReadBuffer.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clEnqueueReadBuffer.Tpo -c -o libpocl_la-clEnqueueReadBuffer.lo `test -f 'clEnqueueReadBuffer.c' || echo '$(srcdir)/'`clEnqueueReadBuffer.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clEnqueueReadBuffer.Tpo $(DEPDIR)/libpocl_la-clEnqueueReadBuffer.Plo
@@ -2372,6 +2537,69 @@ libpocl_la-clRetainDevice.lo: clRetainDevice.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clRetainDevice.lo `test -f 'clRetainDevice.c' || echo '$(srcdir)/'`clRetainDevice.c
 
+libpocl_la-clSVMAlloc.lo: clSVMAlloc.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clSVMAlloc.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clSVMAlloc.Tpo -c -o libpocl_la-clSVMAlloc.lo `test -f 'clSVMAlloc.c' || echo '$(srcdir)/'`clSVMAlloc.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clSVMAlloc.Tpo $(DEPDIR)/libpocl_la-clSVMAlloc.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clSVMAlloc.c' object='libpocl_la-clSVMAlloc.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clSVMAlloc.lo `test -f 'clSVMAlloc.c' || echo '$(srcdir)/'`clSVMAlloc.c
+
+libpocl_la-clSVMFree.lo: clSVMFree.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clSVMFree.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clSVMFree.Tpo -c -o libpocl_la-clSVMFree.lo `test -f 'clSVMFree.c' || echo '$(srcdir)/'`clSVMFree.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clSVMFree.Tpo $(DEPDIR)/libpocl_la-clSVMFree.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clSVMFree.c' object='libpocl_la-clSVMFree.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clSVMFree.lo `test -f 'clSVMFree.c' || echo '$(srcdir)/'`clSVMFree.c
+
+libpocl_la-clEnqueueSVMFree.lo: clEnqueueSVMFree.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clEnqueueSVMFree.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clEnqueueSVMFree.Tpo -c -o libpocl_la-clEnqueueSVMFree.lo `test -f 'clEnqueueSVMFree.c' || echo '$(srcdir)/'`clEnqueueSVMFree.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clEnqueueSVMFree.Tpo $(DEPDIR)/libpocl_la-clEnqueueSVMFree.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMFree.c' object='libpocl_la-clEnqueueSVMFree.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clEnqueueSVMFree.lo `test -f 'clEnqueueSVMFree.c' || echo '$(srcdir)/'`clEnqueueSVMFree.c
+
+libpocl_la-clEnqueueSVMMap.lo: clEnqueueSVMMap.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clEnqueueSVMMap.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clEnqueueSVMMap.Tpo -c -o libpocl_la-clEnqueueSVMMap.lo `test -f 'clEnqueueSVMMap.c' || echo '$(srcdir)/'`clEnqueueSVMMap.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clEnqueueSVMMap.Tpo $(DEPDIR)/libpocl_la-clEnqueueSVMMap.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMMap.c' object='libpocl_la-clEnqueueSVMMap.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clEnqueueSVMMap.lo `test -f 'clEnqueueSVMMap.c' || echo '$(srcdir)/'`clEnqueueSVMMap.c
+
+libpocl_la-clEnqueueSVMUnmap.lo: clEnqueueSVMUnmap.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clEnqueueSVMUnmap.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clEnqueueSVMUnmap.Tpo -c -o libpocl_la-clEnqueueSVMUnmap.lo `test -f 'clEnqueueSVMUnmap.c' || echo '$(srcdir)/'`clEnqueueSVMUnmap.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clEnqueueSVMUnmap.Tpo $(DEPDIR)/libpocl_la-clEnqueueSVMUnmap.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMUnmap.c' object='libpocl_la-clEnqueueSVMUnmap.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clEnqueueSVMUnmap.lo `test -f 'clEnqueueSVMUnmap.c' || echo '$(srcdir)/'`clEnqueueSVMUnmap.c
+
+libpocl_la-clEnqueueSVMMemcpy.lo: clEnqueueSVMMemcpy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clEnqueueSVMMemcpy.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clEnqueueSVMMemcpy.Tpo -c -o libpocl_la-clEnqueueSVMMemcpy.lo `test -f 'clEnqueueSVMMemcpy.c' || echo '$(srcdir)/'`clEnqueueSVMMemcpy.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clEnqueueSVMMemcpy.Tpo $(DEPDIR)/libpocl_la-clEnqueueSVMMemcpy.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMMemcpy.c' object='libpocl_la-clEnqueueSVMMemcpy.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clEnqueueSVMMemcpy.lo `test -f 'clEnqueueSVMMemcpy.c' || echo '$(srcdir)/'`clEnqueueSVMMemcpy.c
+
+libpocl_la-clEnqueueSVMMemFill.lo: clEnqueueSVMMemFill.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clEnqueueSVMMemFill.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clEnqueueSVMMemFill.Tpo -c -o libpocl_la-clEnqueueSVMMemFill.lo `test -f 'clEnqueueSVMMemFill.c' || echo '$(srcdir)/'`clEnqueueSVMMemFill.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clEnqueueSVMMemFill.Tpo $(DEPDIR)/libpocl_la-clEnqueueSVMMemFill.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clEnqueueSVMMemFill.c' object='libpocl_la-clEnqueueSVMMemFill.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clEnqueueSVMMemFill.lo `test -f 'clEnqueueSVMMemFill.c' || echo '$(srcdir)/'`clEnqueueSVMMemFill.c
+
+libpocl_la-clSetKernelArgSVMPointer.lo: clSetKernelArgSVMPointer.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clSetKernelArgSVMPointer.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clSetKernelArgSVMPointer.Tpo -c -o libpocl_la-clSetKernelArgSVMPointer.lo `test -f 'clSetKernelArgSVMPointer.c' || echo '$(srcdir)/'`clSetKernelArgSVMPointer.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clSetKernelArgSVMPointer.Tpo $(DEPDIR)/libpocl_la-clSetKernelArgSVMPointer.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clSetKernelArgSVMPointer.c' object='libpocl_la-clSetKernelArgSVMPointer.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clSetKernelArgSVMPointer.lo `test -f 'clSetKernelArgSVMPointer.c' || echo '$(srcdir)/'`clSetKernelArgSVMPointer.c
+
+libpocl_la-clSetKernelExecInfo.lo: clSetKernelExecInfo.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-clSetKernelExecInfo.lo -MD -MP -MF $(DEPDIR)/libpocl_la-clSetKernelExecInfo.Tpo -c -o libpocl_la-clSetKernelExecInfo.lo `test -f 'clSetKernelExecInfo.c' || echo '$(srcdir)/'`clSetKernelExecInfo.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-clSetKernelExecInfo.Tpo $(DEPDIR)/libpocl_la-clSetKernelExecInfo.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='clSetKernelExecInfo.c' object='libpocl_la-clSetKernelExecInfo.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-clSetKernelExecInfo.lo `test -f 'clSetKernelExecInfo.c' || echo '$(srcdir)/'`clSetKernelExecInfo.c
+
 libpocl_la-pocl_util.lo: pocl_util.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-pocl_util.lo -MD -MP -MF $(DEPDIR)/libpocl_la-pocl_util.Tpo -c -o libpocl_la-pocl_util.lo `test -f 'pocl_util.c' || echo '$(srcdir)/'`pocl_util.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-pocl_util.Tpo $(DEPDIR)/libpocl_la-pocl_util.Plo
@@ -2393,6 +2621,20 @@ libpocl_la-pocl_image_util.lo: pocl_image_util.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-pocl_image_util.lo `test -f 'pocl_image_util.c' || echo '$(srcdir)/'`pocl_image_util.c
 
+libpocl_la-pocl_img_buf_cpy.lo: pocl_img_buf_cpy.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-pocl_img_buf_cpy.lo -MD -MP -MF $(DEPDIR)/libpocl_la-pocl_img_buf_cpy.Tpo -c -o libpocl_la-pocl_img_buf_cpy.lo `test -f 'pocl_img_buf_cpy.c' || echo '$(srcdir)/'`pocl_img_buf_cpy.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-pocl_img_buf_cpy.Tpo $(DEPDIR)/libpocl_la-pocl_img_buf_cpy.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pocl_img_buf_cpy.c' object='libpocl_la-pocl_img_buf_cpy.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-pocl_img_buf_cpy.lo `test -f 'pocl_img_buf_cpy.c' || echo '$(srcdir)/'`pocl_img_buf_cpy.c
+
+libpocl_la-pocl_timing.lo: pocl_timing.c
+ at am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-pocl_timing.lo -MD -MP -MF $(DEPDIR)/libpocl_la-pocl_timing.Tpo -c -o libpocl_la-pocl_timing.lo `test -f 'pocl_timing.c' || echo '$(srcdir)/'`pocl_timing.c
+ at am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-pocl_timing.Tpo $(DEPDIR)/libpocl_la-pocl_timing.Plo
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pocl_timing.c' object='libpocl_la-pocl_timing.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCC_FALSE@	$(AM_V_CC at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_la-pocl_timing.lo `test -f 'pocl_timing.c' || echo '$(srcdir)/'`pocl_timing.c
+
 libpocl_la-pocl_runtime_config.lo: pocl_runtime_config.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_la-pocl_runtime_config.lo -MD -MP -MF $(DEPDIR)/libpocl_la-pocl_runtime_config.Tpo -c -o libpocl_la-pocl_runtime_config.lo `test -f 'pocl_runtime_config.c' || echo '$(srcdir)/'`pocl_runtime_config.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_la-pocl_runtime_config.Tpo $(DEPDIR)/libpocl_la-pocl_runtime_config.Plo
@@ -2754,7 +2996,7 @@ uninstall-am: uninstall-libLTLIBRARIES
 FORCE:
 
 kernellib_hash.h: FORCE
-	echo '#define POCL_KERNELLIB_SHA1 "'`sha1sum ../../include/_kernel.h ../../include/_kernel_c.h ../../include/pocl_types.h ../../include/pocl_features.h ../kernel/*.cl ../kernel/*.c ../kernel/vecmathlib-pocl/*.cl ../kernel/vecmathlib-pocl/*.cc | sha1sum -`'"' > kernellib_hash.new
+	echo '#define POCL_KERNELLIB_SHA1 "'`sha1sum ../../include/_kernel.h ../../include/_kernel_c.h ../../include/pocl_types.h ../kernel/*.cl ../kernel/*.c ../kernel/vecmathlib-pocl/*.cl ../kernel/vecmathlib-pocl/*.cc | sha1sum -`'"' > kernellib_hash.new
 	cmp kernellib_hash.new kernellib_hash.h || mv kernellib_hash.new kernellib_hash.h
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
diff --git a/lib/CL/clBuildProgram.c b/lib/CL/clBuildProgram.c
index 1c79974..de44434 100644
--- a/lib/CL/clBuildProgram.c
+++ b/lib/CL/clBuildProgram.c
@@ -52,6 +52,7 @@ static const char cl_parameters[] =
   "-cl-fast-relaxed-math "
   "-cl-std=CL1.2 "
   "-cl-std=CL1.1 "
+  "-cl-std=CL2.0 "
   "-cl-kernel-arg-info "
   "-w "
   "-g "
@@ -113,7 +114,7 @@ CL_API_SUFFIX__VERSION_1_0
   char *modded_options = NULL;
   char *token = NULL;
   char *saveptr = NULL;
-  void* cache_lock = NULL;
+  void* write_cache_lock = NULL;
 
   POCL_RETURN_ERROR_COND((program == NULL), CL_INVALID_PROGRAM);
 
@@ -209,7 +210,7 @@ CL_API_SUFFIX__VERSION_1_0
     }
 
   POCL_MSG_PRINT_INFO("building program with options %s\n",
-                      options != NULL ? options : "");
+                      user_options != NULL ? user_options : "");
 
   /* Build the fully linked non-parallel bitcode for all
          devices. */
@@ -228,20 +229,19 @@ CL_API_SUFFIX__VERSION_1_0
       /* clCreateProgramWithSource */
       if (program->source)
         {
-          error = pocl_llvm_build_program(program, device_i, user_options,
-                                          &cache_lock, program_bc_path);
-          if (error != 0)
-            {
-              errcode = CL_BUILD_PROGRAM_FAILURE;
-              goto ERROR_CLEAN_BINARIES;
-            }
-          assert(cache_lock);
+          error = pocl_llvm_build_program(program, device_i,
+                                          user_options, program_bc_path);
+          POCL_GOTO_ERROR_ON((error != 0), CL_BUILD_PROGRAM_FAILURE,
+                             "pocl_llvm_build_program() failed\n");
         }
       /* clCreateProgramWithBinaries */
       else if (program->binaries[device_i]) {
-            pocl_cache_create_program_cachedir(program, device_i, NULL, 0,
-                                               program_bc_path, &cache_lock);
-            assert(cache_lock);
+            error = pocl_cache_create_program_cachedir(program, device_i,
+                                               NULL, 0, program_bc_path);
+            POCL_GOTO_ERROR_ON((error != 0), CL_BUILD_PROGRAM_FAILURE,
+                               "Could not create program cachedir");
+            write_cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
+            assert(write_cache_lock);
             errcode = pocl_write_file(program_bc_path, (char*)program->binaries[device_i],
                           (uint64_t)program->binary_sizes[device_i], 0, 0);
             POCL_GOTO_ERROR_ON(errcode, CL_BUILD_PROGRAM_FAILURE,
@@ -256,6 +256,9 @@ CL_API_SUFFIX__VERSION_1_0
       /* Read binaries from program.bc to memory */
       if (program->binaries[device_i] == NULL)
         {
+          if (!write_cache_lock)
+            write_cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
+          assert(write_cache_lock);
           errcode = pocl_read_file(program_bc_path, &binary, &fsize);
           POCL_GOTO_ERROR_ON(errcode, CL_BUILD_ERROR, "Failed to read binaries from program.bc to memory: %s\n", program_bc_path);
 
@@ -264,21 +267,30 @@ CL_API_SUFFIX__VERSION_1_0
         }
 
       if (program->llvm_irs[device_i] == NULL)
+        {
+          if (!write_cache_lock)
+            write_cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
+          assert(write_cache_lock);
           pocl_update_program_llvm_irs(program, device_i, device);
+        }
 
+      /* Maintain a 'last_accessed' file in every program's
+       * cache directory. This will be useful for a cache-pruning
+       * script that flushes old directories in LRU order. */
       pocl_cache_update_program_last_access(program, device_i);
 
-      pocl_cache_release_lock(cache_lock);
-      cache_lock = NULL;
+      if (write_cache_lock)
+        {
+          pocl_cache_release_lock(write_cache_lock);
+          write_cache_lock = NULL;
+        }
+
     }
 
+
   POCL_GOTO_ERROR_ON((actually_built < num_devices), CL_BUILD_PROGRAM_FAILURE,
                      "Some of the devices on the argument-supplied list are"
-                     "not available for the program, or do not exist\n")
-
-  /* Maintain a 'last_accessed' file in every program's
-   * cache directory. Will be useful for cache pruning script
-   * that flushes old directories based on LRU */
+                     "not available for the program, or do not exist\n");
 
   program->build_status = CL_BUILD_SUCCESS;
   POCL_UNLOCK_OBJ(program);
@@ -288,19 +300,21 @@ CL_API_SUFFIX__VERSION_1_0
   /* Set pointers to NULL during cleanup so that clProgramRelease won't
    * cause a double free. */
 
-ERROR_CLEAN_BINARIES:
-  for(i = 0; i < device_i; i++)
+ERROR:
+  for(i = 0; i < num_devices; i++)
   {
     POCL_MEM_FREE(program->binaries[i]);
+    pocl_cache_release_lock(program->read_locks[i]);
+    program->read_locks[i] = NULL;
   }
   POCL_MEM_FREE(program->binaries);
   POCL_MEM_FREE(program->binary_sizes);
+  POCL_MEM_FREE(unique_devlist);
+  pocl_cache_release_lock(write_cache_lock);
 ERROR_CLEAN_OPTIONS:
   POCL_MEM_FREE(modded_options);
-ERROR:
-  POCL_MEM_FREE(unique_devlist);
   program->build_status = CL_BUILD_ERROR;
-  pocl_cache_release_lock(cache_lock);
+
   POCL_UNLOCK_OBJ(program);
   return errcode;
 }
diff --git a/lib/CL/clCreateBuffer.c b/lib/CL/clCreateBuffer.c
index 6c94124..8459015 100644
--- a/lib/CL/clCreateBuffer.c
+++ b/lib/CL/clCreateBuffer.c
@@ -111,7 +111,9 @@ POname(clCreateBuffer)(cl_context context,
   mem->type = CL_MEM_OBJECT_BUFFER;
   mem->flags = flags;
   mem->is_image = CL_FALSE;
-  
+  mem->packet_size = 0;
+  mem->max_packets = 0;
+
   /* Store the per device buffer pointers always to a known
      location in the buffer (dev_id), even though the context
      might not contain all the devices. */
@@ -155,8 +157,7 @@ ERROR_CLEAN_MEM_AND_DEVICE:
   for (j = 0; j < i; ++j)
     {
       device = context->devices[j];
-      device->ops->free(device->data, flags, 
-                        mem->device_ptrs[device->dev_id].mem_ptr);
+      device->ops->free(device, mem);
     }
 ERROR:
   POCL_MEM_FREE(mem);
diff --git a/lib/CL/clCreateCommandQueue.c b/lib/CL/clCreateCommandQueue.c
index ae3011f..5ba2f36 100644
--- a/lib/CL/clCreateCommandQueue.c
+++ b/lib/CL/clCreateCommandQueue.c
@@ -43,6 +43,9 @@ POname(clCreateCommandQueue)(cl_context context,
   POCL_GOTO_ERROR_ON((properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE),
       CL_INVALID_QUEUE_PROPERTIES, "Pocl doesn't have out-of-order queues yet\n");
 
+  if (pocl_debug_messages)
+    properties |= CL_QUEUE_PROFILING_ENABLE;
+
   for (i=0; i<context->num_devices; i++)
     {
       if (context->devices[i] == POCL_REAL_DEV(device))
@@ -58,7 +61,7 @@ POname(clCreateCommandQueue)(cl_context context,
     errcode = CL_OUT_OF_HOST_MEMORY;
     goto ERROR;
   }
-  
+
   POCL_INIT_OBJECT(command_queue);
 
   command_queue->context = context;
@@ -66,6 +69,9 @@ POname(clCreateCommandQueue)(cl_context context,
   command_queue->properties = properties;
   command_queue->root = NULL;
 
+  POCL_RETAIN_OBJECT(context);
+  POCL_RETAIN_OBJECT(device);
+
   if (errcode_ret != NULL)
     *errcode_ret = CL_SUCCESS;
 
diff --git a/lib/CL/clCreateCommandQueueWithProperties.c b/lib/CL/clCreateCommandQueueWithProperties.c
new file mode 100644
index 0000000..efb564d
--- /dev/null
+++ b/lib/CL/clCreateCommandQueueWithProperties.c
@@ -0,0 +1,107 @@
+/* OpenCL runtime library: clCreateCommandQueueWithProperties()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_util.h"
+#include "pocl_queue_util.h"
+
+/* Creates a command queue from a zero-terminated (name, value) property list.
+   Host-side queues are forwarded to clCreateCommandQueue(); device-side ones
+   are unimplemented.  On error returns NULL and sets *errcode_ret if non-NULL. */
+CL_API_ENTRY cl_command_queue CL_API_CALL
+POname(clCreateCommandQueueWithProperties)(cl_context context,
+                                           cl_device_id device,
+                                           const cl_queue_properties *properties,
+                                           cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_0
+{
+  unsigned i = 0;
+  int errcode;
+  cl_bool found = CL_FALSE;
+  cl_command_queue_properties queue_props = 0;
+  int queue_props_set = 0;
+  cl_uint queue_size = 0;
+  const cl_command_queue_properties valid_prop_flags =
+      (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+       | CL_QUEUE_PROFILING_ENABLE
+       | CL_QUEUE_ON_DEVICE
+       | CL_QUEUE_ON_DEVICE_DEFAULT);
+
+  POCL_GOTO_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
+
+  for (i=0; i<context->num_devices; i++)
+    {
+      if (context->devices[i] == POCL_REAL_DEV(device))
+        found = CL_TRUE;
+    }
+
+  POCL_GOTO_ERROR_ON((found == CL_FALSE), CL_INVALID_DEVICE,
+                     "Could not find device in the context\n");
+
+  /* Walk the (name, value) pairs up to the terminating 0 entry. */
+  i = 0;
+  if (properties)
+    while(properties[i])
+      switch(properties[i])
+        {
+        case CL_QUEUE_PROPERTIES:
+          queue_props = (cl_command_queue_properties)properties[i+1];
+          queue_props_set = 1;
+          i+=2;
+          break;
+        case CL_QUEUE_SIZE:
+          queue_size = (cl_uint)properties[i+1];
+          i+=2;
+          break;
+        default:
+          POCL_GOTO_ERROR_ON(1, CL_INVALID_VALUE, "Invalid values in properties\n");
+        }
+
+  if (queue_props_set)
+    {
+      /* Reject unknown bits: bitwise complement (~), not logical NOT (!). */
+      POCL_GOTO_ERROR_ON((queue_props & (~valid_prop_flags)), CL_INVALID_VALUE,
+                         "CL_QUEUE_PROPERTIES contain invalid entries\n");
+
+      if (queue_props & CL_QUEUE_ON_DEVICE)
+        {
+          if (queue_size == 0)
+            queue_size = device->dev_queue_pref_size;
+
+          POCL_GOTO_ERROR_COND((queue_size > device->dev_queue_max_size),
+                               CL_INVALID_QUEUE_PROPERTIES);
+          // create a device side queue
+          POCL_ABORT_UNIMPLEMENTED("Device side queue");
+        }
+      else
+        POCL_GOTO_ERROR_ON((queue_size > 0), CL_INVALID_VALUE,
+                           "To specify queue size, you must use CL_QUEUE_ON_DEVICE in flags\n");
+    }
+
+  /* Host side queue: pass any host flags (e.g. profiling) straight through. */
+  return POname(clCreateCommandQueue)(context, device, queue_props, errcode_ret);
+
+ERROR:
+  if(errcode_ret)
+    *errcode_ret = errcode;
+  return NULL;
+}
+POsym(clCreateCommandQueueWithProperties)
diff --git a/lib/CL/clCreateContext.c b/lib/CL/clCreateContext.c
index 4a1f38b..ed934d0 100644
--- a/lib/CL/clCreateContext.c
+++ b/lib/CL/clCreateContext.c
@@ -186,6 +186,8 @@ POname(clCreateContext)(const cl_context_properties * properties,
       goto ERROR_CLEAN_CONTEXT_AND_DEVICES;
     }
 
+  pocl_setup_context(context);
+
   pocl_init_mem_manager ();
   
   if (errcode_ret)
diff --git a/lib/CL/clCreateContextFromType.c b/lib/CL/clCreateContextFromType.c
index 8e9494c..d1467de 100644
--- a/lib/CL/clCreateContextFromType.c
+++ b/lib/CL/clCreateContextFromType.c
@@ -23,6 +23,7 @@
 
 #include "devices/devices.h"
 #include "pocl_cl.h"
+#include "pocl_util.h"
 #include "pocl_mem_management.h"
 #include <stdlib.h>
 #include <string.h>
@@ -77,7 +78,7 @@ POname(clCreateContextFromType)(const cl_context_properties *properties,
       /* Return a dummy context so icd call to clReleaseContext() still
          works. This fixes AMD SDK OpenCL samples to work (as of 2012-12-05). */
       POCL_MSG_WARN("Couldn't find any device of type %lu; returning "
-                          "a dummy context with 0 devices\n", (unsigned long)device_type);
+                    "a dummy context with 0 devices\n", (unsigned long)device_type);
       return context;
     }
 
@@ -102,6 +103,8 @@ POname(clCreateContextFromType)(const cl_context_properties *properties,
       POname(clRetainDevice)(device_ptr);
     } 
 
+  pocl_setup_context(context);
+
   pocl_init_mem_manager ();
 
   if (errcode_ret != NULL)
diff --git a/lib/CL/clCreateKernel.c b/lib/CL/clCreateKernel.c
index 98ddd0b..9cd1fdc 100644
--- a/lib/CL/clCreateKernel.c
+++ b/lib/CL/clCreateKernel.c
@@ -45,7 +45,6 @@ POname(clCreateKernel)(cl_program program,
   int errcode;
   int error;
   unsigned device_i;
-  void* cache_lock = NULL;
 
   POCL_GOTO_ERROR_COND((kernel_name == NULL), CL_INVALID_VALUE);
 
@@ -59,7 +58,7 @@ POname(clCreateKernel)(cl_program program,
       " (even for programs created with binaries)\n");
 
   POCL_GOTO_ERROR_ON((program->build_status != CL_BUILD_SUCCESS),
-    CL_INVALID_PROGRAM_EXECUTABLE, "Last BuildProgram() was not successful\n")
+    CL_INVALID_PROGRAM_EXECUTABLE, "Last BuildProgram() was not successful\n");
 
   POCL_GOTO_ERROR_ON((program->llvm_irs == NULL),
     CL_INVALID_PROGRAM_EXECUTABLE, "No built binaries in program "
@@ -85,14 +84,9 @@ POname(clCreateKernel)(cl_program program,
       if (!pocl_cache_device_cachedir_exists(program, device_i))
           continue;
 
-      cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
-      assert(cache_lock);
-
       error = pocl_llvm_get_kernel_metadata(program,
                       kernel, device_i, kernel_name, &errcode);
 
-      pocl_cache_release_lock(cache_lock);
-
       if (error)
         {
           POCL_MSG_ERR("Failed to get kernel metadata "
diff --git a/lib/CL/clCreateKernelsInProgram.c b/lib/CL/clCreateKernelsInProgram.c
index f4c41d9..aa03671 100644
--- a/lib/CL/clCreateKernelsInProgram.c
+++ b/lib/CL/clCreateKernelsInProgram.c
@@ -28,7 +28,7 @@ POname(clCreateKernelsInProgram)(cl_program      program ,
       " (even for programs created with binaries)\n");
 
   POCL_RETURN_ERROR_ON((program->build_status != CL_BUILD_SUCCESS),
-    CL_INVALID_PROGRAM_EXECUTABLE, "Last BuildProgram() was not successful\n")
+    CL_INVALID_PROGRAM_EXECUTABLE, "Last BuildProgram() was not successful\n");
 
   POCL_RETURN_ERROR_ON((program->llvm_irs == NULL),
     CL_INVALID_PROGRAM_EXECUTABLE, "No built binaries in program "
@@ -52,7 +52,7 @@ POname(clCreateKernelsInProgram)(cl_program      program ,
       for (idx = 0; idx < num_kern_found; idx++)
         {
           cl_int error_ret;
-          kernels[idx] = clCreateKernel (program, knames[idx], &error_ret);
+          kernels[idx] = POname(clCreateKernel) (program, knames[idx], &error_ret);
 
           /* Check for errors, clean up & bail.
            * If we happened to pass a invalid kernel name after all
@@ -64,7 +64,7 @@ POname(clCreateKernelsInProgram)(cl_program      program ,
             {
               for (; idx>0; idx--)
                 {
-                  clReleaseKernel (kernels[idx-1]);
+                  POname(clReleaseKernel) (kernels[idx-1]);
                 }
               POCL_MEM_FREE(knames);
               /* If error_ret is INVALID_KERNEL_DEFINITION, returning it here
diff --git a/lib/CL/clCreateProgramWithBinary.c b/lib/CL/clCreateProgramWithBinary.c
index 6f61d98..c36eb73 100644
--- a/lib/CL/clCreateProgramWithBinary.c
+++ b/lib/CL/clCreateProgramWithBinary.c
@@ -96,6 +96,7 @@ POname(clCreateProgramWithBinary)(cl_context                     context,
   program->binaries = NULL;
   program->compiler_options = NULL;
   program->llvm_irs = NULL;
+  program->read_locks = NULL;
 
   if ((program->binary_sizes =
        (size_t*) calloc (num_devices, sizeof(size_t))) == NULL ||
@@ -105,6 +106,8 @@ POname(clCreateProgramWithBinary)(cl_context                     context,
        calloc (num_devices, sizeof(char*))) == NULL ||
       ((program->llvm_irs =
         (void**) calloc (num_devices, sizeof(void*))) == NULL) ||
+      ((program->read_locks =
+        (void**) calloc (num_devices, sizeof(void*))) == NULL) ||
       ((program->build_hash = (SHA1_digest_t*)
         calloc (num_devices, sizeof(SHA1_digest_t))) == NULL))
     {
diff --git a/lib/CL/clCreateProgramWithSource.c b/lib/CL/clCreateProgramWithSource.c
index 5ea062a..ba802d0 100644
--- a/lib/CL/clCreateProgramWithSource.c
+++ b/lib/CL/clCreateProgramWithSource.c
@@ -100,6 +100,7 @@ POname(clCreateProgramWithSource)(cl_context context,
   program->devices = context->devices;
   program->kernels = NULL;
   program->build_status = CL_BUILD_NONE;
+  program->read_locks = NULL;
 
   if ((program->binary_sizes =
        (size_t*) calloc (program->num_devices, sizeof(size_t))) == NULL ||
@@ -109,6 +110,8 @@ POname(clCreateProgramWithSource)(cl_context context,
        calloc (program->num_devices, sizeof(char*))) == NULL ||
       ((program->llvm_irs =
         (void**) calloc (program->num_devices, sizeof(void*))) == NULL) ||
+      ((program->read_locks =
+        (void**) calloc (program->num_devices, sizeof(void*))) == NULL) ||
       ((program->build_hash = (SHA1_digest_t*)
         calloc (program->num_devices, sizeof(SHA1_digest_t))) == NULL))
     {
diff --git a/lib/CL/clEnqueueCopyBuffer.c b/lib/CL/clEnqueueCopyBuffer.c
index e89365e..b34b918 100644
--- a/lib/CL/clEnqueueCopyBuffer.c
+++ b/lib/CL/clEnqueueCopyBuffer.c
@@ -72,7 +72,7 @@ CL_API_SUFFIX__VERSION_1_0
   if (pocl_buffers_overlap(src_buffer, dst_buffer, src_offset,
         dst_offset, size) != CL_SUCCESS) return CL_MEM_COPY_OVERLAP;
 
-  POCL_CHECK_DEV_IN_CMDQ
+  POCL_CHECK_DEV_IN_CMDQ;
 
   errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_COPY_BUFFER, 
                                  event, num_events_in_wait_list, 
diff --git a/lib/CL/clEnqueueCopyBufferRect.c b/lib/CL/clEnqueueCopyBufferRect.c
index a3737f2..5633c42 100644
--- a/lib/CL/clEnqueueCopyBufferRect.c
+++ b/lib/CL/clEnqueueCopyBufferRect.c
@@ -24,6 +24,7 @@
 #include "pocl_cl.h"
 #include <assert.h>
 #include "pocl_util.h"
+#include "pocl_img_buf_cpy.h"
 
 CL_API_ENTRY cl_int CL_API_CALL
 POname(clEnqueueCopyBufferRect)(cl_command_queue command_queue,
@@ -40,97 +41,13 @@ POname(clEnqueueCopyBufferRect)(cl_command_queue command_queue,
                                 const cl_event *event_wait_list,
                                 cl_event *event) CL_API_SUFFIX__VERSION_1_1
 {
-  cl_device_id device;
-  unsigned i;
-
-  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
-
-  POCL_RETURN_ERROR_COND((src_buffer == NULL), CL_INVALID_MEM_OBJECT);
-
-  POCL_RETURN_ERROR_COND((dst_buffer == NULL), CL_INVALID_MEM_OBJECT);
-
-  POCL_RETURN_ERROR_ON((src_buffer->type != CL_MEM_OBJECT_BUFFER),
-      CL_INVALID_MEM_OBJECT, "src_buffer is not a CL_MEM_OBJECT_BUFFER\n");
-  POCL_RETURN_ERROR_ON((dst_buffer->type != CL_MEM_OBJECT_BUFFER),
-      CL_INVALID_MEM_OBJECT, "dst_buffer is not a CL_MEM_OBJECT_BUFFER\n");
-
-  POCL_RETURN_ERROR_ON(((command_queue->context != src_buffer->context) ||
-      (command_queue->context != dst_buffer->context)), CL_INVALID_CONTEXT,
-      "src_buffer, dst_buffer and command_queue are not from the same context\n");
-
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((src_origin == NULL), CL_INVALID_VALUE);
-
-  POCL_RETURN_ERROR_COND((dst_origin == NULL), CL_INVALID_VALUE);
-
-  POCL_RETURN_ERROR_COND((region == NULL), CL_INVALID_VALUE);
-
-  size_t region_bytes = region[0] * region[1] * region[2];
-  POCL_RETURN_ERROR_ON((region_bytes <= 0), CL_INVALID_VALUE, "All items in region must be >0\n");
-
-  if (pocl_buffer_boundcheck_3d(src_buffer->size, src_origin, region, &src_row_pitch,
-      &src_slice_pitch, "src_") != CL_SUCCESS) return CL_INVALID_VALUE;
-
-  if (pocl_buffer_boundcheck_3d(dst_buffer->size, dst_origin, region, &dst_row_pitch,
-      &dst_slice_pitch, "dst_") != CL_SUCCESS) return CL_INVALID_VALUE;
-
-  if (src_buffer == dst_buffer) {
-    POCL_RETURN_ERROR_ON((src_slice_pitch != dst_slice_pitch),
-      CL_INVALID_VALUE, "src_buffer and dst_buffer are the same buffer object,"
-      " but the given dst & src slice pitch differ\n")
-
-    POCL_RETURN_ERROR_ON((src_row_pitch != dst_row_pitch),
-      CL_INVALID_VALUE, "src_buffer and dst_buffer are the same buffer object,"
-      " but the given dst & src row pitch differ\n")
-
-    POCL_RETURN_ERROR_ON((check_copy_overlap(src_origin, dst_origin, region,
-      src_row_pitch, src_slice_pitch)), CL_MEM_COPY_OVERLAP, "src_buffer and "
-      "dst_buffer are the same buffer object, and source and destination "
-      "regions overlap");
-
-  }
-
-  POCL_CHECK_DEV_IN_CMDQ
-
-  /* execute directly */
-  /* TODO: enqueue the read_rect if this is a non-blocking read (see
-     clEnqueueReadBuffer) */
-  if (command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
-    {
-      POCL_ABORT_UNIMPLEMENTED("clEnqueueCopyBufferRect: Out-of-order queue");
-      /* wait for the event in event_wait_list to finish */
-    }
-  else
-    {
-      /* in-order queue - all previously enqueued commands must 
-       * finish before this read */
-      // ensure our buffer is not freed yet
-      POname(clRetainMemObject) (src_buffer);
-      POname(clRetainMemObject) (dst_buffer);
-      POname(clFinish)(command_queue);
-    }
-  POCL_UPDATE_EVENT_SUBMITTED(event);
-  POCL_UPDATE_EVENT_RUNNING(event);
-
-  /* TODO: offset computation doesn't work in case the ptr is not 
-     a direct pointer */
-  device->ops->copy_rect(device->data,
-                       src_buffer->device_ptrs[device->dev_id].mem_ptr,
-                       dst_buffer->device_ptrs[device->dev_id].mem_ptr,
-                       src_origin, dst_origin, region,
-                       src_row_pitch, src_slice_pitch,
-                       dst_row_pitch, dst_slice_pitch);
-
-  POCL_UPDATE_EVENT_COMPLETE(event);
-
-  POname(clReleaseMemObject) (src_buffer);
-  POname(clReleaseMemObject) (dst_buffer);
-
-  return CL_SUCCESS;
+  return pocl_rect_copy(command_queue,
+    src_buffer, CL_FALSE,
+    dst_buffer, CL_FALSE,
+    src_origin, dst_origin, region,
+    src_row_pitch, src_slice_pitch,
+    dst_row_pitch, dst_slice_pitch,
+    num_events_in_wait_list, event_wait_list,
+    event);
 }
 POsym(clEnqueueCopyBufferRect)
diff --git a/lib/CL/clEnqueueCopyImage.c b/lib/CL/clEnqueueCopyImage.c
index fc9b8bb..797e58e 100644
--- a/lib/CL/clEnqueueCopyImage.c
+++ b/lib/CL/clEnqueueCopyImage.c
@@ -1,5 +1,5 @@
 #include "pocl_util.h"
-#include "pocl_image_util.h"
+#include "pocl_img_buf_cpy.h"
 
 CL_API_ENTRY cl_int CL_API_CALL
 POname(clEnqueueCopyImage)(cl_command_queue      command_queue ,
@@ -12,80 +12,13 @@ POname(clEnqueueCopyImage)(cl_command_queue      command_queue ,
                    const cl_event *      event_wait_list ,
                    cl_event *            event ) CL_API_SUFFIX__VERSION_1_0
 {
-  int errcode;
-
-  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
-
-  POCL_RETURN_ERROR_ON((!command_queue->device->image_support), CL_INVALID_OPERATION,
-    "Device %s does not support images\n", command_queue->device->long_name);
-
-  POCL_RETURN_ERROR_COND((src_image == NULL), CL_INVALID_MEM_OBJECT);
-  POCL_RETURN_ERROR_COND((dst_image == NULL), CL_INVALID_MEM_OBJECT);
-  POCL_RETURN_ERROR_COND((src_origin == NULL), CL_INVALID_VALUE);
-  POCL_RETURN_ERROR_COND((dst_origin == NULL), CL_INVALID_VALUE);
-  POCL_RETURN_ERROR_COND((region == NULL), CL_INVALID_VALUE);
-
-
-  POCL_RETURN_ERROR_ON(((command_queue->context != src_image->context) ||
-      (command_queue->context != dst_image->context)), CL_INVALID_CONTEXT,
-      "src_image, dst_image and command_queue are not from the same context\n");
-
-  POCL_RETURN_ERROR_ON((!src_image->is_image), CL_INVALID_MEM_OBJECT,
-                                                "src_image is not an image\n");
-  POCL_RETURN_ERROR_ON((!dst_image->is_image), CL_INVALID_MEM_OBJECT,
-                                                "dst_image is not an image\n");
-
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-
-  POCL_RETURN_ERROR_ON((src_image->image_channel_order != dst_image->image_channel_order),
-    CL_IMAGE_FORMAT_MISMATCH, "src_image and dst_image have different image channel order\n");
-
-  POCL_RETURN_ERROR_ON((src_image->image_channel_data_type != dst_image->image_channel_data_type),
-    CL_IMAGE_FORMAT_MISMATCH, "src_image and dst_image have different image channel data type\n");
-
-  POCL_RETURN_ERROR_ON((src_image->type == CL_MEM_OBJECT_IMAGE2D && src_origin[2] != 0),
-    CL_INVALID_VALUE, "src_origin[2] must be 0 for 2D src_image\n");
-
-  POCL_RETURN_ERROR_ON((dst_image->type == CL_MEM_OBJECT_IMAGE2D && dst_origin[2] != 0),
-    CL_INVALID_VALUE, "dst_origin[2] must be 0 for 2D dst_image\n");
-
-  POCL_RETURN_ERROR_ON(((dst_image->type == CL_MEM_OBJECT_IMAGE2D ||
-     src_image->type == CL_MEM_OBJECT_IMAGE2D) &&  region[2] != 1),
-    CL_INVALID_VALUE, "for any 2D image copy, region[2] must be 1\n");
-
-  errcode = pocl_check_device_supports_image(src_image, command_queue);
-  if (errcode != CL_SUCCESS)
-    return errcode;
-  errcode = pocl_check_device_supports_image(dst_image, command_queue);
-  if (errcode != CL_SUCCESS)
-    return errcode;
-
-  /* Adjust image pointers */
-  size_t mod_region[3] = {region[0] * src_image->image_elem_size * src_image->image_channels,
-                          region[1], region[2]};
-  size_t mod_src_origin[3] = {src_origin[0] * src_image->image_elem_size * src_image->image_channels,
-                              src_origin[1], src_origin[2]};
-  size_t mod_dst_origin[3] = {dst_origin[0] * dst_image->image_elem_size * dst_image->image_channels,
-                              dst_origin[1], dst_origin[2]};
-
-  /* TODO: use copy buffer when possible (same width/height) */
-  return POname(clEnqueueCopyBufferRect)(command_queue,
-                                         src_image,
-                                         dst_image,
-                                         mod_src_origin,
-                                         mod_dst_origin,
-                                         mod_region,
-                                         src_image->image_row_pitch,
-                                         src_image->image_slice_pitch,
-                                         dst_image->image_row_pitch,
-                                         dst_image->image_slice_pitch,
-                                         num_events_in_wait_list,
-                                         event_wait_list,
-                                         event);
+  return pocl_rect_copy(command_queue,
+    src_image, CL_TRUE,
+    dst_image, CL_TRUE,
+    src_origin, dst_origin, region,
+    0, 0,
+    0, 0,
+    num_events_in_wait_list, event_wait_list,
+    event);
 }
 POsym(clEnqueueCopyImage)
diff --git a/lib/CL/clEnqueueFillBuffer.c b/lib/CL/clEnqueueFillBuffer.c
new file mode 100644
index 0000000..4e0edad
--- /dev/null
+++ b/lib/CL/clEnqueueFillBuffer.c
@@ -0,0 +1,108 @@
+/* OpenCL runtime library: clEnqueueFillBuffer()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_util.h"
+#include <string.h>
+
+/* Enqueues a fill of [offset, offset+size) in 'buffer' with repeated copies
+   of 'pattern'; the pattern bytes are copied before this call returns. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+POname(clEnqueueFillBuffer)(cl_command_queue  command_queue,
+                           cl_mem            buffer,
+                           const void *      pattern,
+                           size_t            pattern_size,
+                           size_t            offset,
+                           size_t            size,
+                           cl_uint           num_events_in_wait_list,
+                           const cl_event*   event_wait_list,
+                           cl_event*         event)
+CL_API_SUFFIX__VERSION_1_2
+{
+  int errcode = CL_SUCCESS;
+  _cl_command_node *cmd = NULL;
+  void *pattern_copy = NULL;
+
+  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
+  POCL_RETURN_ERROR_COND((buffer == NULL), CL_INVALID_MEM_OBJECT);
+  POCL_RETURN_ERROR_ON((buffer->type != CL_MEM_OBJECT_BUFFER), CL_INVALID_MEM_OBJECT,
+                       "buffer is not a CL_MEM_OBJECT_BUFFER\n");
+  POCL_RETURN_ERROR_ON((buffer->flags & CL_MEM_READ_ONLY), CL_INVALID_MEM_OBJECT,
+                       "buffer is CL_MEM_READ_ONLY\n");
+  POCL_RETURN_ERROR_ON((command_queue->context != buffer->context), CL_INVALID_CONTEXT,
+                       "buffer and command_queue are not from the same context\n");
+  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+
+  errcode = pocl_buffer_boundcheck(buffer, offset, size);
+  if (errcode != CL_SUCCESS)
+    return errcode;
+
+  /* CL_INVALID_VALUE if pattern is NULL or if pattern_size is 0
+   * or if pattern_size is not one of {1, 2, 4, 8, 16, 32, 64, 128}. */
+  POCL_RETURN_ERROR_COND((pattern == NULL), CL_INVALID_VALUE);
+  POCL_RETURN_ERROR_COND((pattern_size == 0), CL_INVALID_VALUE);
+  POCL_RETURN_ERROR_COND((pattern_size > 128), CL_INVALID_VALUE);
+
+  /* A nonzero power of two has exactly one bit set, so x & (x - 1) is 0. */
+  POCL_RETURN_ERROR_ON((pattern_size & (pattern_size - 1)), CL_INVALID_VALUE,
+                       "pattern_size(%zu) must be a power-of-two value\n", pattern_size);
+
+  /* CL_INVALID_VALUE if offset and size are not a multiple of pattern_size.  */
+  POCL_RETURN_ERROR_ON((offset % pattern_size), CL_INVALID_VALUE,
+                       "offset(%zu) must be a multiple of pattern_size(%zu)\n",
+                       offset, pattern_size);
+  POCL_RETURN_ERROR_ON((size % pattern_size), CL_INVALID_VALUE,
+                       "size(%zu) must be a multiple of pattern_size(%zu)\n",
+                       size, pattern_size);
+
+  /* TODO: CL_MISALIGNED_SUB_BUFFER_OFFSET if buffer is a sub-buffer whose
+   * offset is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN of the device. */
+
+  /* Copy the pattern first so an allocation failure needs no command cleanup. */
+  pattern_copy = pocl_aligned_malloc(pattern_size, pattern_size);
+  POCL_RETURN_ERROR_COND((pattern_copy == NULL), CL_OUT_OF_HOST_MEMORY);
+  memcpy(pattern_copy, pattern, pattern_size);
+
+  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_FILL_BUFFER,
+                                 event, num_events_in_wait_list,
+                                 event_wait_list);
+  if (errcode != CL_SUCCESS)
+    {
+      pocl_aligned_free(pattern_copy);
+      return errcode;
+    }
+
+  cmd->command.memfill.ptr =
+      buffer->device_ptrs[command_queue->device->dev_id].mem_ptr;
+  cmd->command.memfill.size = size;
+  cmd->command.memfill.offset = offset;
+  cmd->command.memfill.pattern = pattern_copy;
+  cmd->command.memfill.pattern_size = pattern_size;
+
+  POname(clRetainMemObject) (buffer);
+  pocl_command_enqueue(command_queue, cmd);
+  return CL_SUCCESS;
+}
+POsym(clEnqueueFillBuffer)
diff --git a/lib/CL/clEnqueueMapBuffer.c b/lib/CL/clEnqueueMapBuffer.c
index 68e4187..1f9bcd7 100644
--- a/lib/CL/clEnqueueMapBuffer.c
+++ b/lib/CL/clEnqueueMapBuffer.c
@@ -70,15 +70,15 @@ POname(clEnqueueMapBuffer)(cl_command_queue command_queue,
 
   POCL_GOTO_ERROR_ON((buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS) &&
     map_flags & CL_MAP_READ), CL_INVALID_OPERATION, "buffer has been created with "
-    "CL_MEM_HOST_WRITE_ONLY or CL_MEM_HOST_NO_ACCESS and CL_MAP_READ is set in map_flags\n")
+    "CL_MEM_HOST_WRITE_ONLY or CL_MEM_HOST_NO_ACCESS and CL_MAP_READ is set in map_flags\n");
 
   POCL_GOTO_ERROR_ON((buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS) &&
       map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)), CL_INVALID_OPERATION,
       "buffer has been created with CL_MEM_HOST_READ_ONL or CL_MEM_HOST_NO_ACCESS "
-      "and CL_MAP_WRITE or CL_MAP_WRITE_INVALIDATE_REGION is set in map_flags\n")
+      "and CL_MAP_WRITE or CL_MAP_WRITE_INVALIDATE_REGION is set in map_flags\n");
+
+  POCL_CHECK_DEV_IN_CMDQ;
 
-  POCL_CHECK_DEV_IN_CMDQ
- 
   /* Ensure the parent buffer is not freed prematurely. */
   POname(clRetainMemObject) (buffer);
   must_release = 1;
diff --git a/lib/CL/clEnqueueMapImage.c b/lib/CL/clEnqueueMapImage.c
index 5915ccf..1d2c44f 100644
--- a/lib/CL/clEnqueueMapImage.c
+++ b/lib/CL/clEnqueueMapImage.c
@@ -76,7 +76,7 @@ CL_API_SUFFIX__VERSION_1_0
   if (errcode != CL_SUCCESS)
     goto ERROR;
 
-  POCL_GOTO_ERROR_COND((image_row_pitch == NULL), CL_INVALID_VALUE)
+  POCL_GOTO_ERROR_COND((image_row_pitch == NULL), CL_INVALID_VALUE);
 
   errcode = pocl_check_image_origin_region(image, origin, region);
   if (errcode != CL_SUCCESS)
diff --git a/lib/CL/clEnqueueNDRangeKernel.c b/lib/CL/clEnqueueNDRangeKernel.c
index 873074b..bd93d5b 100644
--- a/lib/CL/clEnqueueNDRangeKernel.c
+++ b/lib/CL/clEnqueueNDRangeKernel.c
@@ -62,7 +62,6 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
   cl_device_id realdev = NULL;
   struct pocl_context pc;
   _cl_command_node *command_node;
-  void* cache_lock;
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
   
@@ -102,7 +101,7 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
   for (i = 0; i < kernel->num_args; i++)
     {
       POCL_RETURN_ERROR_ON((!kernel->arg_info[i].is_set), CL_INVALID_KERNEL_ARGS,
-        "The %i-th kernel argument is not set!\n", i)
+        "The %i-th kernel argument is not set!\n", i);
     }
 
   if (local_work_size != NULL) 
@@ -110,7 +109,9 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
       local_x = local_work_size[0];
       local_y = work_dim > 1 ? local_work_size[1] : 1;
       local_z = work_dim > 2 ? local_work_size[2] : 1;
-    } 
+      if (local_x > global_x || local_y > global_y || local_z > global_z)
+        goto DETERMINE_LOCAL_SIZE;
+    }
   else 
     {
       /* Embarrassingly parallel kernel with a free work-group
@@ -121,8 +122,8 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
          trying to respect the preferred WG size multiple (for better 
          SIMD instruction utilization).          
       */
-
       size_t preferred_wg_multiple;
+DETERMINE_LOCAL_SIZE:
       POname(clGetKernelWorkGroupInfo)
         (kernel, command_queue->device, 
          CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 
@@ -231,10 +232,6 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
   POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
     CL_INVALID_EVENT_WAIT_LIST);
 
-  cache_lock = pocl_cache_acquire_writer_lock(kernel->program,
-                                              realdev);
-  assert(cache_lock);
-
   char cachedir[POCL_FILENAME_LENGTH];
   pocl_cache_make_kernel_cachedir_path(cachedir, kernel->program,
                                   realdev, kernel,
@@ -340,7 +337,6 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
   error = CL_SUCCESS;
 
 ERROR:
-  pocl_cache_release_lock(cache_lock);
   return error;
 
 }
diff --git a/lib/CL/clEnqueueReadBufferRect.c b/lib/CL/clEnqueueReadBufferRect.c
index 0e3f15b..31f6637 100644
--- a/lib/CL/clEnqueueReadBufferRect.c
+++ b/lib/CL/clEnqueueReadBufferRect.c
@@ -80,7 +80,7 @@ POname(clEnqueueReadBufferRect)(cl_command_queue command_queue,
       &host_slice_pitch, "") != CL_SUCCESS) return CL_INVALID_VALUE;
 
 
-  POCL_CHECK_DEV_IN_CMDQ
+  POCL_CHECK_DEV_IN_CMDQ;
 
   /* execute directly */
   /* TODO: enqueue the read_rect if this is a non-blocking read (see
diff --git a/lib/CL/clEnqueueSVMFree.c b/lib/CL/clEnqueueSVMFree.c
new file mode 100644
index 0000000..de44b50
--- /dev/null
+++ b/lib/CL/clEnqueueSVMFree.c
@@ -0,0 +1,86 @@
+/* OpenCL runtime library: clEnqueueSVMFree()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_cl.h"
+#include "pocl_util.h"
+
+/* clEnqueueSVMFree: checks the arguments, creates a CL_COMMAND_SVM_FREE
+   command node and hands it to pocl_command_enqueue(). */
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clEnqueueSVMFree) (cl_command_queue command_queue,
+                  cl_uint  num_svm_pointers,
+                  void  *svm_pointers[],
+                  void (CL_CALLBACK  *pfn_free_func) ( cl_command_queue queue,
+                                                       cl_uint num_svm_pointers,
+                                                       void *svm_pointers[],
+                                                       void  *user_data),
+                  void *user_data,
+                  cl_uint num_events_in_wait_list,
+                  const cl_event *event_wait_list,
+                  cl_event *event) CL_API_SUFFIX__VERSION_2_0
+{
+  unsigned i;
+
+  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
+
+  POCL_RETURN_ERROR_ON((command_queue->context->svm_allocdev == NULL),
+      CL_INVALID_CONTEXT, "None of the devices in this context is SVM-capable\n");
+
+  POCL_RETURN_ERROR_COND((num_svm_pointers == 0), CL_INVALID_VALUE);
+
+  POCL_RETURN_ERROR_COND((svm_pointers == NULL), CL_INVALID_VALUE);
+  for (i=0; i<num_svm_pointers; i++)
+    POCL_RETURN_ERROR_COND((svm_pointers[i] == NULL), CL_INVALID_VALUE);
+  /* The wait list pointer and its element count must agree. */
+  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+
+  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+
+  for(i=0; i<num_events_in_wait_list; i++)
+    POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
+
+  _cl_command_node *cmd = NULL;
+  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_FREE,
+                                     event, num_events_in_wait_list,
+                                     event_wait_list);
+  if (errcode != CL_SUCCESS)
+    {
+      POCL_MEM_FREE(cmd);
+      return errcode;
+    }
+  /* NOTE(review): the caller's svm_pointers array is stored by reference, not
+     copied -- confirm it is guaranteed to outlive the queued command. */
+  cmd->command.svm_free.num_svm_pointers = num_svm_pointers;
+  cmd->command.svm_free.svm_pointers = svm_pointers;
+  cmd->command.svm_free.queue = command_queue;
+  cmd->command.svm_free.data = user_data;
+  cmd->command.svm_free.pfn_free_func = pfn_free_func;
+
+  pocl_command_enqueue(command_queue, cmd);
+
+  return CL_SUCCESS;
+}
+POsym(clEnqueueSVMFree);
+
diff --git a/lib/CL/clEnqueueSVMMap.c b/lib/CL/clEnqueueSVMMap.c
new file mode 100644
index 0000000..657c1bd
--- /dev/null
+++ b/lib/CL/clEnqueueSVMMap.c
@@ -0,0 +1,91 @@
+/* OpenCL runtime library: clEnqueueSVMMap()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_cl.h"
+#include "pocl_util.h"
+
+/* clEnqueueSVMMap: enqueues a CL_COMMAND_SVM_MAP command for svm_ptr.
+   Arguments are validated before the no-op shortcut, so invalid input is
+   rejected even when the device treats mapping as a no-op. */
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clEnqueueSVMMap) (cl_command_queue command_queue,
+                 cl_bool blocking_map,
+                 cl_map_flags map_flags,
+                 void *svm_ptr,
+                 size_t size,
+                 cl_uint num_events_in_wait_list,
+                 const cl_event *event_wait_list,
+                 cl_event *event) CL_API_SUFFIX__VERSION_2_0
+{
+  unsigned i;
+  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
+
+  POCL_RETURN_ERROR_ON((command_queue->context->svm_allocdev == NULL),
+      CL_INVALID_CONTEXT, "None of the devices in this context is SVM-capable\n");
+
+  POCL_RETURN_ERROR_COND((svm_ptr == NULL), CL_INVALID_VALUE);
+  POCL_RETURN_ERROR_COND((size == 0), CL_INVALID_VALUE);
+
+  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+  for(i=0; i<num_events_in_wait_list; i++)
+    POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
+
+  /* Nothing is enqueued when mapping is a no-op on this device and the
+     caller neither waits on events nor requests one; a blocking map still
+     calls clFinish on the queue. */
+  if (DEVICE_MMAP_IS_NOP(command_queue->device)
+      && (num_events_in_wait_list == 0)
+      && (event == NULL))
+    {
+      if (blocking_map == CL_TRUE)
+        return POname(clFinish)(command_queue);
+      else
+        return CL_SUCCESS;
+    }
+
+  _cl_command_node *cmd = NULL;
+  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_MAP,
+                                     event, num_events_in_wait_list,
+                                     event_wait_list);
+  if (errcode != CL_SUCCESS)
+    {
+      POCL_MEM_FREE(cmd);
+      return errcode;
+    }
+
+  cmd->command.svm_map.svm_ptr = svm_ptr;
+  cmd->command.svm_map.size = size;
+  cmd->command.svm_map.flags = map_flags;
+
+  pocl_command_enqueue(command_queue, cmd);
+  /* A blocking map returns the result of clFinish on the queue. */
+  if (blocking_map == CL_TRUE)
+    return POname(clFinish)(command_queue);
+  else
+    return CL_SUCCESS;
+}
+POsym(clEnqueueSVMMap);
+
diff --git a/lib/CL/clEnqueueSVMMemFill.c b/lib/CL/clEnqueueSVMMemFill.c
new file mode 100644
index 0000000..17df977
--- /dev/null
+++ b/lib/CL/clEnqueueSVMMemFill.c
@@ -0,0 +1,93 @@
+/* OpenCL runtime library: clEnqueueSVMMemFill()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_cl.h"
+#include "pocl_util.h"
+
+/* clEnqueueSVMMemFill: enqueues a CL_COMMAND_SVM_MEMFILL command that fills
+   size bytes at svm_ptr with a private copy of pattern. */
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clEnqueueSVMMemFill) (cl_command_queue command_queue,
+                     void *svm_ptr,
+                     const void *pattern,
+                     size_t pattern_size,
+                     size_t size,
+                     cl_uint num_events_in_wait_list,
+                     const cl_event *event_wait_list,
+                     cl_event *event) CL_API_SUFFIX__VERSION_2_0
+{
+  unsigned i;
+  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
+
+  POCL_RETURN_ERROR_ON((command_queue->context->svm_allocdev == NULL),
+      CL_INVALID_CONTEXT, "None of the devices in this context is SVM-capable\n");
+
+  POCL_RETURN_ERROR_COND((svm_ptr == NULL), CL_INVALID_VALUE);
+  /* pattern is read by the memcpy below, so a NULL pattern is rejected here. */
+  POCL_RETURN_ERROR_COND((pattern == NULL), CL_INVALID_VALUE);
+  POCL_RETURN_ERROR_COND((pattern_size == 0), CL_INVALID_VALUE);
+  POCL_RETURN_ERROR_COND((pattern_size > 128), CL_INVALID_VALUE);
+  POCL_RETURN_ERROR_ON((__builtin_popcount(pattern_size) > 1), CL_INVALID_VALUE,
+                       "pattern_size (%zu) must be a power-of-2 value\n", pattern_size);
+  POCL_RETURN_ERROR_COND((size == 0), CL_INVALID_VALUE);
+
+  POCL_RETURN_ERROR_ON(((intptr_t)svm_ptr % pattern_size > 0), CL_INVALID_VALUE,
+                       "svm_ptr must be aligned to pattern_size\n");
+  POCL_RETURN_ERROR_ON((size % pattern_size > 0), CL_INVALID_VALUE,
+                       "size must be a multiple of pattern_size\n");
+
+  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+  for(i=0; i<num_events_in_wait_list; i++)
+    POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
+
+  /* Copy the pattern before creating the command node, so that a failed
+     allocation leaves nothing to undo. */
+  void *p = pocl_aligned_malloc(pattern_size, pattern_size);
+  POCL_RETURN_ERROR_COND((p == NULL), CL_OUT_OF_HOST_MEMORY);
+  memcpy(p, pattern, pattern_size);
+
+  _cl_command_node *cmd = NULL;
+  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_MEMFILL,
+                                     event, num_events_in_wait_list,
+                                     event_wait_list);
+  if (errcode != CL_SUCCESS)
+    {
+      POCL_MEM_FREE(p);
+      POCL_MEM_FREE(cmd);
+      return errcode;
+    }
+
+  cmd->command.memfill.ptr = svm_ptr;
+  cmd->command.memfill.offset = 0;
+  cmd->command.memfill.size = size;
+  cmd->command.memfill.pattern = p;
+  cmd->command.memfill.pattern_size = pattern_size;
+  pocl_command_enqueue(command_queue, cmd);
+
+  return CL_SUCCESS;
+}
+POsym(clEnqueueSVMMemFill);
+
diff --git a/lib/CL/clEnqueueSVMMemcpy.c b/lib/CL/clEnqueueSVMMemcpy.c
new file mode 100644
index 0000000..73e680c
--- /dev/null
+++ b/lib/CL/clEnqueueSVMMemcpy.c
@@ -0,0 +1,83 @@
+/* OpenCL runtime library: clEnqueueSVMMemcpy()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_cl.h"
+#include "pocl_util.h"
+
+/* clEnqueueSVMMemcpy: enqueues a CL_COMMAND_SVM_MEMCPY command copying size
+   bytes from src_ptr to dst_ptr; a blocking copy then calls clFinish. */
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clEnqueueSVMMemcpy) (cl_command_queue command_queue,
+                    cl_bool blocking_copy,
+                    void *dst_ptr,
+                    const void *src_ptr,
+                    size_t size,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event *event_wait_list,
+                    cl_event *event) CL_API_SUFFIX__VERSION_2_0
+{
+  unsigned i;
+  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
+
+  POCL_RETURN_ERROR_ON((command_queue->context->svm_allocdev == NULL),
+      CL_INVALID_CONTEXT, "None of the devices in this context is SVM-capable\n");
+
+  POCL_RETURN_ERROR_COND((src_ptr == NULL), CL_INVALID_VALUE);
+
+  POCL_RETURN_ERROR_COND((dst_ptr == NULL), CL_INVALID_VALUE);
+
+  POCL_RETURN_ERROR_COND((size == 0), CL_INVALID_VALUE);
+
+  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+
+  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+
+  for(i=0; i<num_events_in_wait_list; i++)
+    POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
+
+  _cl_command_node *cmd = NULL;
+  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_MEMCPY,
+                                     event, num_events_in_wait_list,
+                                     event_wait_list);
+
+  if (errcode != CL_SUCCESS)
+    {
+      POCL_MEM_FREE(cmd);
+      return errcode;
+    }
+
+  cmd->command.svm_memcpy.src = src_ptr;
+  cmd->command.svm_memcpy.size = size;
+  cmd->command.svm_memcpy.dst = dst_ptr;
+
+  pocl_command_enqueue(command_queue, cmd);
+
+  /* A blocking copy is served by finishing the queue instead of aborting. */
+  if (blocking_copy)
+    return POname(clFinish)(command_queue);
+  return CL_SUCCESS;
+}
+POsym(clEnqueueSVMMemcpy);
+
diff --git a/lib/CL/clEnqueueSVMUnmap.c b/lib/CL/clEnqueueSVMUnmap.c
new file mode 100644
index 0000000..043ca83
--- /dev/null
+++ b/lib/CL/clEnqueueSVMUnmap.c
@@ -0,0 +1,78 @@
+/* OpenCL runtime library: clEnqueueSVMUnmap()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_cl.h"
+#include "pocl_util.h"
+
+/* clEnqueueSVMUnmap: enqueues a CL_COMMAND_SVM_UNMAP command for svm_ptr.
+   Arguments are validated before the no-op shortcut, so invalid input is
+   rejected even when the device treats mapping as a no-op. */
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clEnqueueSVMUnmap) (cl_command_queue command_queue,
+                   void *svm_ptr,
+                   cl_uint num_events_in_wait_list,
+                   const cl_event *event_wait_list,
+                   cl_event *event) CL_API_SUFFIX__VERSION_2_0
+{
+  unsigned i;
+  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
+
+  POCL_RETURN_ERROR_ON((command_queue->context->svm_allocdev == NULL),
+      CL_INVALID_CONTEXT, "None of the devices in this context is SVM-capable\n");
+
+  POCL_RETURN_ERROR_COND((svm_ptr == NULL), CL_INVALID_VALUE);
+
+  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+
+  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
+                         CL_INVALID_EVENT_WAIT_LIST);
+
+  for(i=0; i<num_events_in_wait_list; i++)
+    POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
+
+  /* Nothing is enqueued when unmapping is a no-op on this device and the
+     caller neither waits on events nor requests one. */
+  if (DEVICE_MMAP_IS_NOP(command_queue->device)
+      && (num_events_in_wait_list == 0)
+      && (event == NULL))
+    return CL_SUCCESS;
+
+  _cl_command_node *cmd = NULL;
+  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_UNMAP,
+                                     event, num_events_in_wait_list,
+                                     event_wait_list);
+  if (errcode != CL_SUCCESS)
+    {
+      POCL_MEM_FREE(cmd);
+      return errcode;
+    }
+
+  cmd->command.svm_unmap.svm_ptr = svm_ptr;
+
+  pocl_command_enqueue(command_queue, cmd);
+
+  return CL_SUCCESS;
+}
+POsym(clEnqueueSVMUnmap);
+
diff --git a/lib/CL/clEnqueueUnmapMemObject.c b/lib/CL/clEnqueueUnmapMemObject.c
index ef312b8..6cb545c 100644
--- a/lib/CL/clEnqueueUnmapMemObject.c
+++ b/lib/CL/clEnqueueUnmapMemObject.c
@@ -64,7 +64,7 @@ POname(clEnqueueUnmapMemObject)(cl_command_queue command_queue,
       "Could not find mapping of this memobj\n");
 
   /* find the index of the device's ptr in the buffer */
-  POCL_CHECK_DEV_IN_CMDQ
+  POCL_CHECK_DEV_IN_CMDQ;
 
   errcode = pocl_create_command (&cmd, command_queue,
                                  CL_COMMAND_UNMAP_MEM_OBJECT,
diff --git a/lib/CL/clEnqueueWriteBuffer.c b/lib/CL/clEnqueueWriteBuffer.c
index 631e4e1..8c5ca19 100644
--- a/lib/CL/clEnqueueWriteBuffer.c
+++ b/lib/CL/clEnqueueWriteBuffer.c
@@ -49,7 +49,7 @@ POname(clEnqueueWriteBuffer)(cl_command_queue command_queue,
   POCL_RETURN_ERROR_ON((command_queue->context != buffer->context),
     CL_INVALID_CONTEXT, "buffer and command_queue are not from the same context\n");
 
-  POCL_RETURN_ERROR_COND((ptr == NULL), CL_INVALID_VALUE)
+  POCL_RETURN_ERROR_COND((ptr == NULL), CL_INVALID_VALUE);
   if (pocl_buffer_boundcheck(buffer, offset, cb) != CL_SUCCESS)
     return CL_INVALID_VALUE;
 
diff --git a/lib/CL/clFinish.c b/lib/CL/clFinish.c
index 92e6161..22bdb8f 100644
--- a/lib/CL/clFinish.c
+++ b/lib/CL/clFinish.c
@@ -23,6 +23,7 @@
 
 #include "pocl_cl.h"
 #include "pocl_util.h"
+#include "pocl_debug.h"
 #include "pocl_image_util.h"
 #include "utlist.h"
 #include "clEnqueueMapBuffer.h"
@@ -40,6 +41,8 @@ POname(clFinish)(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0
   cl_bool command_ready;
   cl_event *event;
 
+  POCL_MEASURE_START(clFinish);
+
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
   if (command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
@@ -92,6 +95,7 @@ POname(clFinish)(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0
 
   exec_commands(ready_list);
 
+  POCL_MEASURE_FINISH(clFinish);
   return CL_SUCCESS;
 }
 POsym(clFinish)
@@ -121,6 +125,7 @@ static void exec_commands (_cl_command_node *node_list)
              node->command.read.offset,
              node->command.read.cb);
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Read Buffer           ");
           POname(clReleaseMemObject) (node->command.read.buffer);
           break;
         case CL_COMMAND_WRITE_BUFFER:
@@ -132,6 +137,7 @@ static void exec_commands (_cl_command_node *node_list)
              node->command.write.offset,
              node->command.write.cb);
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Write Buffer          ");
           POname(clReleaseMemObject) (node->command.write.buffer);
           break;
         case CL_COMMAND_COPY_BUFFER:
@@ -144,6 +150,7 @@ static void exec_commands (_cl_command_node *node_list)
              node->command.copy.dst_offset,
              node->command.copy.cb);
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Copy Buffer           ");
           POname(clReleaseMemObject) (node->command.copy.src_buffer);
           POname(clReleaseMemObject) (node->command.copy.dst_buffer);
           break;
@@ -153,6 +160,7 @@ static void exec_commands (_cl_command_node *node_list)
           pocl_map_mem_cmd (node->device, node->command.map.buffer,
                             node->command.map.mapping);
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Map Image/Buffer      ");
           break;
         case CL_COMMAND_WRITE_IMAGE:
           POCL_UPDATE_EVENT_RUNNING(event);
@@ -165,6 +173,7 @@ static void exec_commands (_cl_command_node *node_list)
              node->command.rw_image.rowpitch,
              node->command.rw_image.slicepitch);
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Write Image           ");
           break;
         case CL_COMMAND_READ_IMAGE:
           POCL_UPDATE_EVENT_RUNNING(event);
@@ -177,6 +186,7 @@ static void exec_commands (_cl_command_node *node_list)
              node->command.rw_image.rowpitch,
              node->command.rw_image.slicepitch);
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Read Image            ");
           break;
         case CL_COMMAND_UNMAP_MEM_OBJECT:
           POCL_UPDATE_EVENT_RUNNING(event);
@@ -204,12 +214,14 @@ static void exec_commands (_cl_command_node *node_list)
                     node->command.unmap.mapping);
           (node->command.unmap.memobj)->map_count--;
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Unmap Mem obj         ");
           break;
         case CL_COMMAND_NDRANGE_KERNEL:
           assert (*event == node->event);
           POCL_UPDATE_EVENT_RUNNING(event);
           node->device->ops->run(node->command.run.data, node);
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Enqueue NDRange       ");
           for (i = 0; i < node->command.run.arg_buffer_count; ++i)
             {
               cl_mem buf = node->command.run.arg_buffers[i];
@@ -234,6 +246,7 @@ static void exec_commands (_cl_command_node *node_list)
           POCL_UPDATE_EVENT_RUNNING(event);
           node->device->ops->run_native(node->command.native.data, node);
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Enqueue Native        ");
           for (i = 0; i < node->command.native.num_mem_objects; ++i)
             {
               cl_mem buf = node->command.native.mem_list[i];
@@ -256,11 +269,81 @@ static void exec_commands (_cl_command_node *node_list)
              node->command.fill_image.pixel_size);
           POCL_MEM_FREE(node->command.fill_image.fill_pixel);
           POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Fill Image            ");
+          break;
+        case CL_COMMAND_FILL_BUFFER:
+          POCL_UPDATE_EVENT_RUNNING(event);
+          node->device->ops->memfill
+            (node->command.memfill.ptr,
+             node->command.memfill.size,
+             node->command.memfill.offset,
+             node->command.memfill.pattern,
+             node->command.memfill.pattern_size);
+          POCL_MEM_FREE(node->command.memfill.pattern);
+          POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "Fill Buffer           ");
           break;
         case CL_COMMAND_MARKER:
           POCL_UPDATE_EVENT_RUNNING(event);
           POCL_UPDATE_EVENT_COMPLETE(event);
           break;
+        case CL_COMMAND_SVM_FREE:
+          POCL_UPDATE_EVENT_RUNNING(event);
+          if (node->command.svm_free.pfn_free_func)
+            node->command.svm_free.pfn_free_func(
+                node->command.svm_free.queue,
+                node->command.svm_free.num_svm_pointers,
+                node->command.svm_free.svm_pointers,
+                node->command.svm_free.data);
+          else
+            for (i=0; i < node->command.svm_free.num_svm_pointers; i++)
+              node->device->ops->free_ptr(node->device,
+                  node->command.svm_free.svm_pointers[i]);
+          POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "SVM Free              ");
+          break;
+        case CL_COMMAND_SVM_MAP:
+          POCL_UPDATE_EVENT_RUNNING(event);
+          if (DEVICE_MMAP_IS_NOP(node->device))
+            ; // no-op
+          else
+            node->device->ops->map_mem
+              (node->device->data, node->command.svm_map.svm_ptr,
+               0, node->command.svm_map.size, NULL);
+          POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "SVM Map              ");
+          break;
+        case CL_COMMAND_SVM_UNMAP:
+          POCL_UPDATE_EVENT_RUNNING(event);
+          if (DEVICE_MMAP_IS_NOP(node->device))
+            ; // no-op
+          else
+            node->device->ops->unmap_mem
+              (node->device->data, NULL, node->command.svm_unmap.svm_ptr, 0);
+          /* Complete the event before leaving the case, as the other arms do. */
+          POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "SVM Unmap             ");
+          break;
+        case CL_COMMAND_SVM_MEMCPY:
+          POCL_UPDATE_EVENT_RUNNING(event);
+          node->device->ops->copy(NULL,
+             node->command.svm_memcpy.src, 0,
+             node->command.svm_memcpy.dst, 0,
+             node->command.svm_memcpy.size);
+          POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "SVM Memcpy            ");
+          break;
+        case CL_COMMAND_SVM_MEMFILL:
+          POCL_UPDATE_EVENT_RUNNING(event);
+          node->device->ops->memfill(
+             node->command.memfill.ptr, node->command.memfill.size, 0,
+             node->command.memfill.pattern, node->command.memfill.pattern_size);
+          /* Release the pattern copy, as the CL_COMMAND_FILL_BUFFER arm does. */
+          POCL_MEM_FREE(node->command.memfill.pattern);
+          POCL_UPDATE_EVENT_COMPLETE(event);
+          POCL_DEBUG_EVENT_TIME(event, "SVM MemFill           ");
+          break;
+
         default:
           POCL_ABORT_UNIMPLEMENTED("clFinish: Unknown command");
           break;
diff --git a/lib/CL/clGetDeviceInfo.c b/lib/CL/clGetDeviceInfo.c
index b6d1c2c..98d7a14 100644
--- a/lib/CL/clGetDeviceInfo.c
+++ b/lib/CL/clGetDeviceInfo.c
@@ -180,8 +180,6 @@ POname(clGetDeviceInfo)(cl_device_id   device,
     POCL_RETURN_GETINFO(cl_bool, device->compiler_available);
   case CL_DEVICE_EXECUTION_CAPABILITIES            :
     POCL_RETURN_GETINFO(cl_device_exec_capabilities, device->execution_capabilities);
-  case CL_DEVICE_QUEUE_PROPERTIES                  :
-    POCL_RETURN_GETINFO(cl_command_queue_properties, device->queue_properties);
    
   case CL_DEVICE_NAME:
     POCL_RETURN_GETINFO_STR(device->long_name);
@@ -228,7 +226,7 @@ POname(clGetDeviceInfo)(cl_device_id   device,
   case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          : 
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->native_vector_width_half);
   case CL_DEVICE_OPENCL_C_VERSION                  :
-    POCL_RETURN_GETINFO_STR("OpenCL C 1.2");
+    POCL_RETURN_GETINFO_STR("OpenCL C 2.0");
   case CL_DEVICE_BUILT_IN_KERNELS                  :
     POCL_RETURN_GETINFO_STR("");
 
@@ -255,6 +253,43 @@ POname(clGetDeviceInfo)(cl_device_id   device,
   case CL_DEVICE_REFERENCE_COUNT:
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, 
                                             (cl_uint)device->pocl_refcount)
+
+
+
+
+  case CL_DEVICE_SVM_CAPABILITIES:
+    POCL_RETURN_GETINFO(cl_device_svm_capabilities, device->svm_caps);
+  case CL_DEVICE_MAX_ON_DEVICE_EVENTS:
+    POCL_RETURN_GETINFO(cl_uint, device->max_events);
+  case CL_DEVICE_MAX_ON_DEVICE_QUEUES:
+    POCL_RETURN_GETINFO(cl_uint, device->max_queues);
+  case CL_DEVICE_MAX_PIPE_ARGS:
+    POCL_RETURN_GETINFO(cl_uint, device->max_pipe_args);
+  case CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS:
+    POCL_RETURN_GETINFO(cl_uint, device->max_pipe_active_res);
+  case CL_DEVICE_PIPE_MAX_PACKET_SIZE:
+    POCL_RETURN_GETINFO(cl_uint, device->max_pipe_packet_size);
+  case CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE:
+    POCL_RETURN_GETINFO(cl_uint, device->dev_queue_pref_size);
+  case CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE:
+    POCL_RETURN_GETINFO(cl_uint, device->dev_queue_max_size);
+  case CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT:
+    POCL_RETURN_GETINFO(cl_uint, 0);
+  case CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT:
+    POCL_RETURN_GETINFO(cl_uint, 0);
+  case CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT:
+    POCL_RETURN_GETINFO(cl_uint, 0);
+  case CL_DEVICE_SPIR_VERSIONS:
+    POCL_RETURN_GETINFO_STR("1.2");
+  case CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES:
+    POCL_RETURN_GETINFO(cl_command_queue_properties, device->on_dev_queue_props);
+  case CL_DEVICE_QUEUE_ON_HOST_PROPERTIES:
+    POCL_RETURN_GETINFO(cl_command_queue_properties, device->on_host_queue_props);
+
+  case CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE:
+    POCL_RETURN_GETINFO(size_t, device->global_var_pref_size);
+  case CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE:
+    POCL_RETURN_GETINFO(size_t, device->global_var_max_size);
   }
   return CL_INVALID_VALUE;
 }
diff --git a/lib/CL/clGetPlatformIDs.c b/lib/CL/clGetPlatformIDs.c
index 309baf3..10eadaa 100644
--- a/lib/CL/clGetPlatformIDs.c
+++ b/lib/CL/clGetPlatformIDs.c
@@ -29,8 +29,198 @@
 #pragma GCC visibility push(hidden)
 #endif
 
+/* The ICD dispatch table of pocl (struct _cl_icd_dispatch), one entry per
+* OpenCL API function. Instantiated here and referenced by _platforms[0].
+*
+* TODO: the NULL entries are functions that lack implementation
+* (or even stubs) in pocl
+*/
 #ifdef BUILD_ICD
-struct _cl_icd_dispatch pocl_dispatch = POCL_ICD_DISPATCH;
+struct _cl_icd_dispatch pocl_dispatch = {
+  &POclGetPlatformIDs,
+  &POclGetPlatformInfo,
+  &POclGetDeviceIDs,
+  &POclGetDeviceInfo,
+  &POclCreateContext,
+  &POclCreateContextFromType,
+  &POclRetainContext,
+  &POclReleaseContext,
+  &POclGetContextInfo,
+  &POclCreateCommandQueue,
+  &POclRetainCommandQueue, /* 10 */
+  &POclReleaseCommandQueue,
+  &POclGetCommandQueueInfo,
+  NULL /*clSetCommandQueueProperty*/,
+  &POclCreateBuffer,
+  &POclCreateImage2D,
+  &POclCreateImage3D,
+  &POclRetainMemObject,
+  &POclReleaseMemObject,
+  &POclGetSupportedImageFormats,
+  &POclGetMemObjectInfo, /* 20 */
+  &POclGetImageInfo,
+  &POclCreateSampler,
+  &POclRetainSampler,
+  &POclReleaseSampler,
+  &POclGetSamplerInfo,
+  &POclCreateProgramWithSource,
+  &POclCreateProgramWithBinary,
+  &POclRetainProgram,
+  &POclReleaseProgram,
+  &POclBuildProgram, /* 30 */
+  &POclUnloadCompiler,
+  &POclGetProgramInfo,
+  &POclGetProgramBuildInfo,
+  &POclCreateKernel,
+  &POclCreateKernelsInProgram,
+  &POclRetainKernel,
+  &POclReleaseKernel,
+  &POclSetKernelArg,
+  &POclGetKernelInfo,
+  &POclGetKernelWorkGroupInfo, /* 40 */
+  &POclWaitForEvents,
+  &POclGetEventInfo,
+  &POclRetainEvent,
+  &POclReleaseEvent,
+  &POclGetEventProfilingInfo,
+  &POclFlush,
+  &POclFinish,
+  &POclEnqueueReadBuffer,
+  &POclEnqueueWriteBuffer,
+  &POclEnqueueCopyBuffer, /* 50 */
+  &POclEnqueueReadImage,
+  &POclEnqueueWriteImage,
+  &POclEnqueueCopyImage,
+  &POclEnqueueCopyImageToBuffer,
+  &POclEnqueueCopyBufferToImage,
+  &POclEnqueueMapBuffer,
+  &POclEnqueueMapImage,
+  &POclEnqueueUnmapMemObject,
+  &POclEnqueueNDRangeKernel,
+  &POclEnqueueTask, /* 60 */
+  &POclEnqueueNativeKernel,
+  &POclEnqueueMarker,
+  &POclEnqueueWaitForEvents,
+  &POclEnqueueBarrier,
+  &POclGetExtensionFunctionAddress,
+  NULL, /* &POclCreateFromGLBuffer,      */
+  &POclCreateFromGLTexture2D,
+  &POclCreateFromGLTexture3D,
+  NULL, /* &POclCreateFromGLRenderbuffer, */
+  NULL, /* &POclGetGLObjectInfo,  70       */
+  NULL, /* &POclGetGLTextureInfo,        */
+  NULL, /* &POclEnqueueAcquireGLObjects, */
+  NULL, /* &POclEnqueueReleaseGLObjects, */
+  NULL, /* &POclGetGLContextInfoKHR,     */
+  NULL, /* &clUnknown75 */
+  NULL, /* &clUnknown76 */
+  NULL, /* &clUnknown77 */
+  NULL, /* &clUnknown78 */
+  NULL, /* &clUnknown79 */
+  NULL, /* &clUnknown80 */
+  &POclSetEventCallback,
+  &POclCreateSubBuffer,
+  &POclSetMemObjectDestructorCallback,
+  &POclCreateUserEvent,
+  &POclSetUserEventStatus,
+  &POclEnqueueReadBufferRect,
+  &POclEnqueueWriteBufferRect,
+  &POclEnqueueCopyBufferRect,
+  NULL, /* &POclCreateSubDevicesEXT,     */
+  &POclRetainDevice, /* &POclRetainDeviceEXT,         */
+  &POclReleaseDevice, /* &POclReleaseDeviceEXT,        */
+  NULL, /* &clUnknown92 */
+  &POclCreateSubDevices,
+  &POclRetainDevice,
+  &POclReleaseDevice,
+  &POclCreateImage,
+  NULL, /* &POclCreateProgramWithBuiltInKernels, */
+  NULL, /* &POclCompileProgram,          */
+  NULL, /* &POclLinkProgram,             */
+  NULL, /* &POclUnloadPlatformCompiler,  */
+  &POclGetKernelArgInfo,
+  &POclEnqueueFillBuffer,
+  &POclEnqueueFillImage,
+  NULL, /* &POclEnqueueMigrateMemObjects, */
+  &POclEnqueueMarkerWithWaitList,
+  NULL, /* &POclEnqueueBarrierWithWaitList, */
+  NULL, /* &POclGetExtensionFunctionAddressForPlatform, */
+  NULL, /* &POclCreateFromGLTexture,     */
+  NULL, /* &clUnknown109 */
+  NULL, /* &clUnknown110 */
+  NULL, /* &clUnknown111 */
+  NULL, /* &clUnknown112 */
+  NULL, /* &clUnknown113 */
+  NULL, /* &clUnknown114 */
+  NULL, /* &clUnknown115 */
+  NULL, /* &clUnknown116 */
+  NULL, /* &clUnknown117 */
+  NULL, /* &clUnknown118 */
+  NULL, /* &clUnknown119 */
+  NULL, /* &clUnknown120 */
+  NULL, /* &clUnknown121 */
+  NULL, /* &clUnknown122 */
+#if (OCL_ICD_IDENTIFIED_FUNCTIONS > 110)
+  &POclCreateCommandQueueWithProperties,
+  NULL, /* &POclCreatePipe,*/
+  NULL, /* &POclGetPipeInfo,*/
+  &POclSVMAlloc,
+  &POclSVMFree,
+  &POclEnqueueSVMFree,
+  &POclEnqueueSVMMemcpy,
+  &POclEnqueueSVMMemFill,
+  &POclEnqueueSVMMap,
+  &POclEnqueueSVMUnmap,
+  NULL, /* clCreateSamplerWithProperties */
+  &POclSetKernelArgSVMPointer,
+  &POclSetKernelExecInfo,
+  NULL, /* &clUnknown136 */
+  NULL, /* &clUnknown137 */
+  NULL, /* &clUnknown138 */
+  NULL, /* &clUnknown139 */
+  NULL, /* &clUnknown140 */
+  NULL, /* &clUnknown141 */
+  NULL, /* &clUnknown142 */
+  NULL, /* &clUnknown143 */
+  NULL, /* &clUnknown144 */
+  NULL, /* &clUnknown145 */
+  NULL, /* &clUnknown146 */
+  NULL, /* &clUnknown147 */
+  NULL, /* &clUnknown148 */
+  NULL, /* &clUnknown149 */
+  NULL, /* &clUnknown150 */
+  NULL, /* &clUnknown151 */
+  NULL, /* &clUnknown152 */
+  NULL, /* &clUnknown153 */
+  NULL, /* &clUnknown154 */
+  NULL, /* &clUnknown155 */
+  NULL, /* &clUnknown156 */
+  NULL, /* &clUnknown157 */
+  NULL, /* &clUnknown158 */
+  NULL, /* &clUnknown159 */
+  NULL, /* &clUnknown160 */
+  NULL, /* &clUnknown161 */
+  NULL, /* &clUnknown162 */
+  NULL, /* &clUnknown163 */
+  NULL, /* &clUnknown164 */
+  NULL, /* &clUnknown165 */
+#endif
+#if (OCL_ICD_IDENTIFIED_FUNCTIONS > 127)
+  NULL, /* &clUnknown166 */
+  NULL, /* &clUnknown167 */
+  NULL, /* &clUnknown168 */
+  NULL, /* &clUnknown169 */
+  NULL, /* &clUnknown170 */
+  NULL, /* &clUnknown171 */
+  NULL, /* &clUnknown172 */
+  NULL, /* &clUnknown173 */
+  NULL, /* &clUnknown174 */
+  NULL, /* &clUnknown175 */
+  NULL, /* &clUnknown176 */
+  NULL, /* &clUnknown177 */
+#endif
+};
+
 struct _cl_platform_id _platforms[1]  = {{&pocl_dispatch}};
 #else
 struct _cl_platform_id _platforms[1]  = {};
diff --git a/lib/CL/clGetPlatformInfo.c b/lib/CL/clGetPlatformInfo.c
index 36bf1d6..287a0a3 100644
--- a/lib/CL/clGetPlatformInfo.c
+++ b/lib/CL/clGetPlatformInfo.c
@@ -50,7 +50,8 @@ POname(clGetPlatformInfo)(cl_platform_id   platform,
       POCL_RETURN_GETINFO_STR("FULL_PROFILE");
 
     case CL_PLATFORM_VERSION:
-      POCL_RETURN_GETINFO_STR("OpenCL 1.2 pocl " PACKAGE_VERSION);
+      POCL_RETURN_GETINFO_STR("OpenCL " POCL_CL_VERSION\
+                        " pocl " PACKAGE_VERSION ", LLVM " LLVM_VERSION);
 
     case CL_PLATFORM_NAME:
       POCL_RETURN_GETINFO_STR("Portable Computing Language");
diff --git a/lib/CL/clGetProgramBuildInfo.c b/lib/CL/clGetProgramBuildInfo.c
index e4ef947..44fb4a2 100644
--- a/lib/CL/clGetProgramBuildInfo.c
+++ b/lib/CL/clGetProgramBuildInfo.c
@@ -40,8 +40,8 @@ POname(clGetProgramBuildInfo)(cl_program            program,
   POCL_RETURN_ERROR_COND((program == NULL), CL_INVALID_PROGRAM);
 
   int device_i = pocl_cl_device_to_index(program, device);
-  POCL_RETURN_ERROR_ON((device_i < 0), CL_INVALID_DEVICE, "Program was not "
-    "built for this device\n")
+  POCL_RETURN_ERROR_ON((device_i < 0), CL_INVALID_DEVICE, "Program does not have "
+    "this device in it's device list\n");
 
   switch (param_name) {
   case CL_PROGRAM_BUILD_STATUS:
diff --git a/lib/CL/clGetProgramInfo.c b/lib/CL/clGetProgramInfo.c
index b1ac8ee..5ab4e06 100644
--- a/lib/CL/clGetProgramInfo.c
+++ b/lib/CL/clGetProgramInfo.c
@@ -86,10 +86,10 @@ POname(clGetProgramInfo)(cl_program program,
     {
       size_t const value_size = sizeof(cl_device_id) * program->num_devices;
       if (param_value)
-      {
-        if (param_value_size < value_size) return CL_INVALID_VALUE;
-        memcpy(param_value, (void*)program->devices, value_size);
-      }
+        {
+          if (param_value_size < value_size) return CL_INVALID_VALUE;
+          memcpy(param_value, (void*)program->devices, value_size);
+        }
       if (param_value_size_ret)
         *param_value_size_ret = value_size;
       return CL_SUCCESS;
@@ -101,26 +101,70 @@ POname(clGetProgramInfo)(cl_program program,
     }
   case CL_PROGRAM_KERNEL_NAMES:
     {
+      /* Note: In the specification (2.0) of the other XXXInfo
+         functions, param_value_size_ret is described as follows:
+
+         > param_value_size_ret returns the actual size in bytes of data
+         > being *queried* by param_value.
+
+         while in GetProgramInfo and APIs defined later in the
+         documentation, it is:
+
+         > param_value_size_ret returns the actual size in bytes of data
+         > *copied* to param_value.
+
+         it reads as if the spec allows the implementation to stop copying
+         the string at an arbitrary point where the limit
+         (param_value_size) is reached, but that's not the case. When it
+         happens, it should instead raise an error CL_INVALID_VALUE.
+
+         Also note the specification of the param_value_size_ret to param_name
+         CL_PROGRAM_SOURCE.  It says "The actual number of characters that
+         represents[sic] the program source code including the null terminator
+         is returned in param_value_size_ret." By an analogy, it is sane to
+         return the size of entire concatenated string, not the size of
+         bytes copied (partially).
+
+         Also note the specification of GetPlatformInfo +
+         CL_PLATFORM_EXTENSIONS.  it refers to "param_value_size_ret" as
+         the actual size in bytes of data being *queried*, and its
+         description of param_value_size is the same.
+
+         --- guicho271828
+      */
+
       const char *kernel_names[32];
       unsigned num_kernels = 0;
       size_t size = 0;
       num_kernels = pocl_llvm_get_kernel_names(program, kernel_names, 32);
+
+      /* optimized for clarity */
       for (i = 0; i < num_kernels; ++i)
         {
-          if (size + strlen (kernel_names[i]) + 1 >= param_value_size)
-            break;
-          size += strlen (kernel_names[i]) + 1;
-          
-          if (i == 0)
-            memcpy (param_value, kernel_names[i], strlen(kernel_names[i])+1);
-          else
-            strcat((char*)param_value, kernel_names[i]);
+          size += strlen (kernel_names[i]) ;
           if (i != num_kernels - 1)
-            strcat ((char*)param_value, ";");
+            size += 1;          /* a semicolon */
         }
-
+      size += 1;                /* a NULL */
       if (param_value_size_ret)
-        *param_value_size_ret = size;      
+        *param_value_size_ret = size;
+      if (param_value)
+        {
+          /* only when param_value is non-NULL */
+          if (size > param_value_size)
+            return CL_INVALID_VALUE;
+          /* should not break from the switch clause
+             because of POCL_ABORT_UNIMPLEMENTED */
+          for (i = 0; i < num_kernels; ++i)
+            {
+              if (i == 0)
+                strcpy (param_value, kernel_names[i]); /* copy including NULL */
+              else
+                strcat ((char*)param_value, kernel_names[i]);
+              if (i != num_kernels - 1)
+                strcat ((char*)param_value, ";");
+            }
+        }
       return CL_SUCCESS;
     }
   default:
diff --git a/lib/CL/clReleaseCommandQueue.c b/lib/CL/clReleaseCommandQueue.c
index c79465c..50fb0df 100644
--- a/lib/CL/clReleaseCommandQueue.c
+++ b/lib/CL/clReleaseCommandQueue.c
@@ -31,13 +31,18 @@ POname(clReleaseCommandQueue)(cl_command_queue command_queue) CL_API_SUFFIX__VER
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
   int new_refcount;
+  cl_context context = command_queue->context;
+  cl_device_id device = command_queue->device;
+
   POname(clFlush)(command_queue);
   POCL_RELEASE_OBJECT(command_queue, new_refcount);
   if (new_refcount == 0)
     {
       pocl_queue_list_delete(command_queue);
       POCL_MEM_FREE(command_queue);
-      /* TODO: should clReleaseContext()? */
+
+      POname(clReleaseContext)(context);
+      POname(clReleaseDevice)(device);
     }
   return CL_SUCCESS;
 }
diff --git a/lib/CL/clReleaseContext.c b/lib/CL/clReleaseContext.c
index 77d43d1..99d70d3 100644
--- a/lib/CL/clReleaseContext.c
+++ b/lib/CL/clReleaseContext.c
@@ -27,7 +27,12 @@ CL_API_ENTRY cl_int CL_API_CALL
 POname(clReleaseContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
 {
   int new_refcount;
-  POCL_RETURN_ERROR_COND((!context->valid), CL_INVALID_CONTEXT);
+  // an invalid context is freed right away, without touching its refcount
+  if (!context->valid)
+    {
+      POCL_MEM_FREE(context);
+      return CL_SUCCESS;
+    }
 
   POCL_RELEASE_OBJECT(context, new_refcount);
   if (new_refcount == 0)
diff --git a/lib/CL/clReleaseMemObject.c b/lib/CL/clReleaseMemObject.c
index d2b25a3..4b6d0d1 100644
--- a/lib/CL/clReleaseMemObject.c
+++ b/lib/CL/clReleaseMemObject.c
@@ -29,6 +29,8 @@ POname(clReleaseMemObject)(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0
 {
   int new_refcount;
   cl_device_id device_id;
+  cl_context context;
+  cl_mem parent = NULL;
   unsigned i;
   mem_mapping_t *mapping, *temp;
 
@@ -44,31 +46,32 @@ POname(clReleaseMemObject)(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0
      with memobj are deleted.
   */
 
-  if (new_refcount == 0) 
+  if (new_refcount == 0)
     {
-      if (memobj->parent == NULL) 
+      if (memobj->parent == NULL)
         {
           for (i = 0; i < memobj->context->num_devices; ++i)
             {
               device_id = memobj->context->devices[i];
-              device_id->ops->free(device_id->data, memobj->flags, memobj->device_ptrs[device_id->dev_id].mem_ptr);
+              device_id->ops->free(device_id, memobj);
               memobj->device_ptrs[device_id->dev_id].mem_ptr = NULL;
             }
-        } else 
-        {
-          /* a sub buffer object does not free the memory from
-             the device */          
-          POCL_RELEASE_OBJECT(memobj->parent, new_refcount);
         }
-      POCL_RELEASE_OBJECT(memobj->context, new_refcount);
       DL_FOREACH_SAFE(memobj->mappings, mapping, temp)
         {
           POCL_MEM_FREE(mapping);
         }
       memobj->mappings = NULL;
-      
+
+      context = memobj->context;
+      parent = memobj->parent;
+
       POCL_MEM_FREE(memobj->device_ptrs);
       POCL_MEM_FREE(memobj);
+
+      if (parent)
+        POname(clReleaseMemObject)(parent);
+      POname(clReleaseContext)(context);
     }
   return CL_SUCCESS;
 }
diff --git a/lib/CL/clReleaseProgram.c b/lib/CL/clReleaseProgram.c
index c003ef7..06bf1ca 100644
--- a/lib/CL/clReleaseProgram.c
+++ b/lib/CL/clReleaseProgram.c
@@ -48,20 +48,20 @@ POname(clReleaseProgram)(cl_program program) CL_API_SUFFIX__VERSION_1_0
 
   if (new_refcount == 0)
     {
+      cl_context context = program->context;
 
       /* Mark all kernels as having no program.
          FIXME: this should not be needed if the kernels
          retain the parent program (and release when the kernel
          is released). */
       for (k=program->kernels; k!=NULL; k=k->next)
-        {          
+        {
           k->program = NULL;
         }
 
       if(program->devices != program->context->devices)
         POCL_MEM_FREE(program->devices);
 
-      POCL_RELEASE_OBJECT (program->context, new_refcount);
       POCL_MEM_FREE(program->source);
 
       POCL_MEM_FREE(program->binary_sizes);
@@ -80,6 +80,8 @@ POname(clReleaseProgram)(cl_program program) CL_API_SUFFIX__VERSION_1_0
       POCL_MEM_FREE(program->build_hash);
       POCL_MEM_FREE(program->llvm_irs);
       POCL_MEM_FREE(program);
+
+      POname(clReleaseContext)(context);
     }
 
   return CL_SUCCESS;
diff --git a/lib/CL/clSVMAlloc.c b/lib/CL/clSVMAlloc.c
new file mode 100644
index 0000000..5cd3f32
--- /dev/null
+++ b/lib/CL/clSVMAlloc.c
@@ -0,0 +1,119 @@
+/* OpenCL runtime library: clSVMAlloc()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_util.h"
+#include "devices.h"
+
+/* Allocates a shared virtual memory (SVM) buffer of SIZE bytes through the
+ * context's SVM allocation device (context->svm_allocdev).
+ *
+ * Returns the allocated pointer, or NULL when an argument check fails or
+ * the device's alloc_mem_obj callback reports an error.  An ALIGNMENT of 0
+ * selects the allocation device's min_data_type_align_size.
+ */
+CL_API_ENTRY void* CL_API_CALL
+POname(clSVMAlloc)(cl_context context,
+                   cl_svm_mem_flags flags,
+                   size_t size,
+                   unsigned int alignment) CL_API_SUFFIX__VERSION_2_0
+{
+  unsigned i, p;
+
+  POCL_RETURN_ERROR_COND((context == NULL), NULL);
+
+  POCL_RETURN_ERROR_ON((!context->svm_allocdev), NULL,
+                       "None of the devices in this context is SVM-capable\n");
+
+  POCL_RETURN_ERROR_COND((size == 0), NULL);
+
+  POCL_RETURN_ERROR_ON((size > context->min_max_mem_alloc_size), NULL,
+                       "size(%zu) > CL_DEVICE_MAX_MEM_ALLOC_SIZE value "
+                       "for some device in context\n", size);
+
+  /* flags does not contain CL_MEM_SVM_FINE_GRAIN_BUFFER
+   * but does contain CL_MEM_SVM_ATOMICS. */
+  POCL_RETURN_ERROR_COND((flags & CL_MEM_SVM_ATOMICS) &&
+                         ((flags & CL_MEM_SVM_FINE_GRAIN_BUFFER) == 0), NULL);
+
+  /* at most one of the three access-mode flags may be set */
+  p = __builtin_popcount(flags & (CL_MEM_READ_WRITE
+                                           | CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY));
+  POCL_RETURN_ERROR_ON((p > 1), NULL, "flags may contain only one of "
+                   "CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY\n");
+
+  const cl_svm_mem_flags valid_flags = (CL_MEM_SVM_ATOMICS | CL_MEM_SVM_FINE_GRAIN_BUFFER
+                                  | CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY
+                                  | CL_MEM_READ_ONLY);
+  /* reject any bit outside valid_flags (bitwise complement, not logical !) */
+  POCL_RETURN_ERROR_ON((flags & (~valid_flags)), NULL, "flags argument "
+                                "contains invalid bits (nonexistent flags)\n");
+
+  /* CL_MEM_SVM_FINE_GRAIN_BUFFER or CL_MEM_SVM_ATOMICS is specified in flags
+   * and these are not supported by at least one device in context. */
+  if (flags & CL_MEM_SVM_FINE_GRAIN_BUFFER)
+    for (i=0; i < context->num_devices; i++)
+      POCL_RETURN_ERROR_ON((DEVICE_SVM_FINEGR(context->devices[i]) == 0), NULL,
+                           "One of the devices in the context doesn't support "
+                           "fine-grained buffers, and it's in flags\n");
+
+  if (flags & CL_MEM_SVM_ATOMICS)
+    for (i=0; i < context->num_devices; i++)
+      POCL_RETURN_ERROR_ON((DEVICE_SVM_ATOM(context->devices[i]) == 0), NULL,
+                           "One of the devices in the context doesn't support "
+                           "SVM atomics buffers, and it's in flags\n");
+
+#define dev context->svm_allocdev
+
+  if (alignment == 0)
+    alignment = dev->min_data_type_align_size;
+
+  /* alignment is not a power of two or the OpenCL implementation cannot support
+   * the specified alignment for at least one device in context. */
+  p = __builtin_popcount(alignment);
+  POCL_RETURN_ERROR_ON((p > 1), NULL, "aligment argument must be a power of 2\n");
+
+  for (i=0; i < context->num_devices; i++)
+    POCL_RETURN_ERROR_ON((context->devices[i]->min_data_type_align_size < alignment),
+                         NULL, "All devices must support the requested memory "
+                         "aligment (%u) \n", alignment);
+
+  /* create a fake (temporary) cl_mem */
+  cl_mem mem = alloca(sizeof(struct _cl_mem));
+  mem->flags = CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE;
+  mem->mem_host_ptr = NULL;
+  mem->size = size;
+  pocl_mem_identifier device_ptrs[pocl_num_devices];
+  device_ptrs[dev->global_mem_id].mem_ptr = NULL;
+  mem->device_ptrs = device_ptrs;
+
+  cl_int errcode = dev->ops->alloc_mem_obj(dev, mem);
+  /* There was a failure to allocate resources */
+  POCL_RETURN_ERROR_ON((errcode != CL_SUCCESS), NULL,
+                       "Failed to allocate the memory: %i\n", errcode);
+
+  return device_ptrs[dev->global_mem_id].mem_ptr;
+
+}
+#undef dev
+POsym(clSVMAlloc);
+
diff --git a/lib/kernel/hsail64/log1p.cl b/lib/CL/clSVMFree.c
similarity index 66%
copy from lib/kernel/hsail64/log1p.cl
copy to lib/CL/clSVMFree.c
index 54e86ea..6db1e8a 100644
--- a/lib/kernel/hsail64/log1p.cl
+++ b/lib/CL/clSVMFree.c
@@ -1,4 +1,4 @@
-/* OpenCL built-in library: log1p()
+/* OpenCL runtime library: clSVMFree()
 
    Copyright (c) 2015 Michal Babej / Tampere University of Technology
 
@@ -21,6 +21,36 @@
    THE SOFTWARE.
 */
 
-#include "../templates.h"
+#include "pocl_util.h"
+#include "pocl_debug.h"
+
+/* Frees SVM_POINTER through the free_ptr callback of the context's SVM
+ * allocation device (context->svm_allocdev).
+ *
+ * Does nothing (apart from a warning) for a NULL context or a context
+ * without an SVM allocation device, and nothing at all for a NULL pointer.
+ */
+CL_API_ENTRY void CL_API_CALL
+POname(clSVMFree)(cl_context context,
+                  void *svm_pointer) CL_API_SUFFIX__VERSION_2_0
+{
+  if (context == NULL)
+  {
+    POCL_MSG_WARN("Bad cl_context\n");
+    return;
+  }
+
+  if (context->svm_allocdev==NULL)
+  {
+    POCL_MSG_WARN("None of the devices in this context is SVM-capable\n");
+    return;
+  }
+
+  if (svm_pointer == NULL)
+    return;
+
+  context->svm_allocdev->ops->free_ptr(context->svm_allocdev, svm_pointer);
+
+}
+POsym(clSVMFree);
 
-DEFINE_EXPR_V_V(log1p, (log((vtype)(1.0) + a)))
diff --git a/lib/CL/clSetKernelArgSVMPointer.c b/lib/CL/clSetKernelArgSVMPointer.c
new file mode 100644
index 0000000..07945d5
--- /dev/null
+++ b/lib/CL/clSetKernelArgSVMPointer.c
@@ -0,0 +1,69 @@
+/* OpenCL runtime library: clSetKernelArgSVMPointer()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "config.h"
+#include "pocl_cl.h"
+#include "pocl_util.h"
+#include "devices.h"
+
+/* Sets kernel argument ARG_INDEX to the SVM pointer ARG_VALUE.
+ *
+ * The pointer is wrapped in a freshly allocated CL_MEM_USE_HOST_PTR buffer
+ * object, which is then passed to clSetKernelArg() as a cl_mem argument.
+ * Returns the status of clSetKernelArg(), or an error code if the kernel
+ * is NULL, its context has no SVM allocation device, or the wrapper object
+ * cannot be allocated.
+ */
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clSetKernelArgSVMPointer)(cl_kernel kernel,
+                                 cl_uint arg_index,
+                                 const void *arg_value) CL_API_SUFFIX__VERSION_2_0
+{
+  POCL_RETURN_ERROR_COND((kernel == NULL), CL_INVALID_KERNEL);
+
+  POCL_RETURN_ERROR_ON((!kernel->context->svm_allocdev), CL_INVALID_CONTEXT,
+                       "None of the devices in this context is SVM-capable\n");
+
+  /* zero-initialize so fields not set below (e.g. size) are not garbage */
+  cl_mem mem = calloc(1, sizeof(struct _cl_mem));
+  POCL_RETURN_ERROR_COND((mem == NULL), CL_OUT_OF_HOST_MEMORY);
+  POCL_INIT_OBJECT(mem);
+  mem->mem_host_ptr = (void*)arg_value;
+  mem->parent = NULL;
+  mem->map_count = 0;
+  mem->mappings = NULL;
+  mem->type = CL_MEM_OBJECT_BUFFER;
+  mem->flags = CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE;
+  mem->device_ptrs = NULL;
+  mem->is_image = CL_FALSE;
+  mem->packet_size = 0;
+  mem->max_packets = 0;
+  //mem->size = size;  TODO
+  mem->context = kernel->context;
+
+  POCL_MSG_PRINT_INFO("Setting kernel ARG %i to SVM %p using cl_mem: %p\n", arg_index, arg_value, mem);
+
+  return POname(clSetKernelArg)(kernel, arg_index, sizeof(cl_mem), &mem);
+
+}
+POsym(clSetKernelArgSVMPointer)
diff --git a/lib/CL/clReleaseContext.c b/lib/CL/clSetKernelExecInfo.c
similarity index 51%
copy from lib/CL/clReleaseContext.c
copy to lib/CL/clSetKernelExecInfo.c
index 77d43d1..35e81b0 100644
--- a/lib/CL/clReleaseContext.c
+++ b/lib/CL/clSetKernelExecInfo.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clReleaseContext()
+/* OpenCL runtime library: clSetKernelExecInfo()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Universidad Rey Juan Carlos
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,28 +24,44 @@
 #include "pocl_cl.h"
 
+/* Passes additional execution information for KERNEL (OpenCL 2.0).
+ *
+ * Currently the parameters are only validated and logged; nothing is stored.
+ * Returns CL_INVALID_KERNEL for a NULL kernel, CL_INVALID_CONTEXT when the
+ * kernel's context has no SVM allocation device, CL_INVALID_VALUE for a
+ * missing or too small CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM value, and
+ * CL_SUCCESS otherwise (including for unrecognized param_name values).
+ */
 CL_API_ENTRY cl_int CL_API_CALL
-POname(clReleaseContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
+POname(clSetKernelExecInfo)(cl_kernel  kernel ,
+                            cl_kernel_exec_info  param_name ,
+                            size_t  param_value_size ,
+                            const void  *param_value) CL_API_SUFFIX__VERSION_2_0
 {
-  int new_refcount;
-  POCL_RETURN_ERROR_COND((!context->valid), CL_INVALID_CONTEXT);
+  POCL_RETURN_ERROR_COND((kernel == NULL), CL_INVALID_KERNEL);
+
+  POCL_RETURN_ERROR_ON((!kernel->context->svm_allocdev), CL_INVALID_CONTEXT,
+                       "None of the devices in this context is SVM-capable\n");
 
-  POCL_RELEASE_OBJECT(context, new_refcount);
-  if (new_refcount == 0)
+  /* TODO not sure what to actually do with indirect pointers..*/
+  switch (param_name)
     {
-      /* The context holds references to all its devices,
-         memory objects, command-queues etc. Release the
-         references and let the objects to get freed. */
-      /* TODO: call the corresponding clRelease* functions
-         for all the referred objects. */
-      unsigned i;
-      for (i = 0; i < context->num_devices; ++i) 
+      case CL_KERNEL_EXEC_INFO_SVM_PTRS:
         {
-          POname(clReleaseDevice) (context->devices[i]);
-        }   
-      POCL_MEM_FREE(context->devices);
-      POCL_MEM_FREE(context->properties);
-      POCL_MEM_FREE(context);
+        POCL_MSG_PRINT_INFO("clSetKernelExecInfo called with CL_KERNEL_EXEC_INFO_SVM_PTRS\n");
+        break;
+        }
+      case CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM:
+        {
+        /* param_value is dereferenced as a cl_bool below: validate it first */
+        POCL_RETURN_ERROR_COND((param_value == NULL), CL_INVALID_VALUE);
+        POCL_RETURN_ERROR_COND((param_value_size < sizeof(cl_bool)), CL_INVALID_VALUE);
+        cl_bool j = *(cl_bool*)param_value;
+        POCL_MSG_PRINT_INFO("clSetKernelExecInfo called with CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM: %i\n", j);
+        break;
+        }
     }
+
   return CL_SUCCESS;
+
 }
-POsym(clReleaseContext)
+POsym(clSetKernelExecInfo)
diff --git a/lib/CL/devices/CMakeLists.txt b/lib/CL/devices/CMakeLists.txt
index 5c017e7..7caffdc 100644
--- a/lib/CL/devices/CMakeLists.txt
+++ b/lib/CL/devices/CMakeLists.txt
@@ -37,19 +37,12 @@ if(ENABLE_TCE)
   include_directories(AFTER "tce")
   add_subdirectory("tce")
   list(APPEND POCL_DEVICES_LINK_LIST ${TCE_LIBS})
-  list(APPEND POCL_DEVICES_OBJS "$<TARGET_OBJECTS:pocl-devices-tce-ttasim>")
-endif()
-
-if(ENABLE_SPU)
-  add_subdirectory("cellspu")
-  list(APPEND POCL_DEVICES_LINK_LIST "spe2")
 endif()
 
 if(ENABLE_HSA)
   include_directories(AFTER "${HSA_INCLUDES}")
   add_subdirectory("hsa")
   list(APPEND POCL_DEVICES_LINK_LIST ${HSALIB})
-  list(APPEND POCL_DEVICES_OBJS "$<TARGET_OBJECTS:pocl-devices-hsa>")
 endif()
 
 set(POCL_DEVICES_SOURCES 
diff --git a/lib/CL/devices/Makefile.am b/lib/CL/devices/Makefile.am
index e7f9fee..3c1038d 100644
--- a/lib/CL/devices/Makefile.am
+++ b/lib/CL/devices/Makefile.am
@@ -45,11 +45,6 @@ SUBDIRS += tce
 libpocl_devices_la_LIBADD += tce/libpocl-devices-tce.la
 endif
 
-if BUILD_SPU
-SUBDIRS += cellspu
-libpocl_devices_la_LIBADD += cellspu/libpocl-devices-cellspu.la
-endif
-
 if BUILD_HSA
 SUBDIRS += hsa
 libpocl_devices_la_LIBADD += hsa/libpocl-devices-hsa.la
diff --git a/lib/CL/devices/Makefile.in b/lib/CL/devices/Makefile.in
index 8257161..6b9f18b 100644
--- a/lib/CL/devices/Makefile.in
+++ b/lib/CL/devices/Makefile.in
@@ -116,10 +116,8 @@ host_triplet = @host@
 target_triplet = @target@
 @TCE_AVAILABLE_TRUE@am__append_1 = tce
 @TCE_AVAILABLE_TRUE@am__append_2 = tce/libpocl-devices-tce.la
-@BUILD_SPU_TRUE@am__append_3 = cellspu
-@BUILD_SPU_TRUE@am__append_4 = cellspu/libpocl-devices-cellspu.la
-@BUILD_HSA_TRUE@am__append_5 = hsa
-@BUILD_HSA_TRUE@am__append_6 = hsa/libpocl-devices-hsa.la
+@BUILD_HSA_TRUE@am__append_3 = hsa
+@BUILD_HSA_TRUE@am__append_4 = hsa/libpocl-devices-hsa.la
 subdir = lib/CL/devices
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
@@ -138,7 +136,7 @@ LTLIBRARIES = $(noinst_LTLIBRARIES)
 libpocl_devices_la_DEPENDENCIES = pthread/libpocl-devices-pthread.la \
 	basic/libpocl-devices-basic.la \
 	topology/libpocl-devices-topology.la $(am__append_2) \
-	$(am__append_4) $(am__append_6)
+	$(am__append_4)
 am_libpocl_devices_la_OBJECTS = libpocl_devices_la-devices.lo \
 	libpocl_devices_la-bufalloc.lo libpocl_devices_la-common.lo \
 	libpocl_devices_la-cpuinfo.lo
@@ -223,7 +221,7 @@ am__define_uniq_tagged_files = \
   done | $(am__uniquify_input)`
 ETAGS = etags
 CTAGS = ctags
-DIST_SUBDIRS = topology pthread basic tce cellspu hsa
+DIST_SUBDIRS = topology pthread basic tce hsa
 am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/config/depcomp
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 am__relativize = \
@@ -295,6 +293,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -302,6 +301,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -319,8 +319,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -396,6 +394,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -455,8 +454,7 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
-SUBDIRS = topology pthread basic $(am__append_1) $(am__append_3) \
-	$(am__append_5)
+SUBDIRS = topology pthread basic $(am__append_1) $(am__append_3)
 noinst_LTLIBRARIES = libpocl-devices.la
 libpocl_devices_la_SOURCES = devices.h devices.c bufalloc.c dev_image.h \
 	prototypes.inc common.h common.c bufalloc.h cpuinfo.c cpuinfo.h
@@ -464,7 +462,7 @@ libpocl_devices_la_SOURCES = devices.h devices.c bufalloc.c dev_image.h \
 libpocl_devices_la_LIBADD = pthread/libpocl-devices-pthread.la \
 	basic/libpocl-devices-basic.la \
 	topology/libpocl-devices-topology.la $(am__append_2) \
-	$(am__append_4) $(am__append_6)
+	$(am__append_4)
 libpocl_devices_la_CPPFLAGS = \
   -I$(top_srcdir)/fix-include \
   -I$(top_srcdir)/include \
diff --git a/lib/CL/devices/basic/Makefile.in b/lib/CL/devices/basic/Makefile.in
index 2365960..a366e79 100644
--- a/lib/CL/devices/basic/Makefile.in
+++ b/lib/CL/devices/basic/Makefile.in
@@ -247,6 +247,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -254,6 +255,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -271,8 +273,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -348,6 +348,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/lib/CL/devices/basic/basic.c b/lib/CL/devices/basic/basic.c
index a1a500e..44a5e3a 100644
--- a/lib/CL/devices/basic/basic.c
+++ b/lib/CL/devices/basic/basic.c
@@ -22,6 +22,7 @@
    THE SOFTWARE.
 */
 
+#include "config.h"
 #include "basic.h"
 #include "cpuinfo.h"
 #include "topology/pocl_topology.h"
@@ -33,15 +34,9 @@
 #include <string.h>
 #include <stdlib.h>
 
-#ifndef _MSC_VER
-#  include <sys/time.h>
-#  include <sys/resource.h>
-#  include <unistd.h>
-#else
-#  include "vccompat.hpp"
-#endif
-
 #include "pocl_cache.h"
+#include "pocl_timing.h"
+#include "pocl_llvm.h"
 
 #define max(a,b) (((a) > (b)) ? (a) : (b))
 
@@ -201,6 +196,7 @@ pocl_basic_init_device_ops(struct pocl_device_ops *ops)
   ops->init = pocl_basic_init;
   ops->alloc_mem_obj = pocl_basic_alloc_mem_obj;
   ops->free = pocl_basic_free;
+  ops->free_ptr = pocl_basic_free_ptr;
   ops->read = pocl_basic_read;
   ops->read_rect = pocl_basic_read_rect;
   ops->write = pocl_basic_write;
@@ -208,7 +204,9 @@ pocl_basic_init_device_ops(struct pocl_device_ops *ops)
   ops->copy = pocl_basic_copy;
   ops->copy_rect = pocl_basic_copy_rect;
   ops->fill_rect = pocl_basic_fill_rect;
+  ops->memfill = pocl_basic_memfill;
   ops->map_mem = pocl_basic_map_mem;
+  ops->unmap_mem = pocl_basic_unmap_mem;
   ops->compile_submitted_kernels = pocl_basic_compile_submitted_kernels;
   ops->run = pocl_basic_run;
   ops->run_native = pocl_basic_run_native;
@@ -224,6 +222,7 @@ pocl_basic_init_device_infos(struct _cl_device_id* dev)
   dev->max_compute_units = 0;
   dev->max_work_item_dimensions = 3;
 
+  SETUP_DEVICE_CL_VERSION(HOST_DEVICE_CL_VERSION_MAJOR, HOST_DEVICE_CL_VERSION_MINOR)
   /*
     The hard restriction will be the context data which is
     stored in stack that can be as small as 8K in Linux.
@@ -281,13 +280,14 @@ pocl_basic_init_device_infos(struct _cl_device_id* dev)
   dev->local_mem_size = 0;
   dev->error_correction_support = CL_FALSE;
   dev->host_unified_memory = CL_TRUE;
-  dev->profiling_timer_resolution = 0;
+
+  dev->profiling_timer_resolution = pocl_timer_resolution;
+
   dev->endian_little = !(WORDS_BIGENDIAN);
   dev->available = CL_TRUE;
   dev->compiler_available = CL_TRUE;
   dev->spmd = CL_FALSE;
   dev->execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL;
-  dev->queue_properties = CL_QUEUE_PROFILING_ENABLE;
   dev->platform = 0;
 
   dev->parent_device = NULL;
@@ -311,25 +311,32 @@ pocl_basic_init_device_infos(struct _cl_device_id* dev)
      ensure that there is no more than a single space between
      identifiers. */
 
-#ifndef _CL_DISABLE_LONG
-#define DOUBLE_EXT "cl_khr_fp64 "
-#else
-#define DOUBLE_EXT 
-#endif
-
-#ifndef _CL_DISABLE_HALF
-#define HALF_EXT "cl_khr_fp16 "
-#else
-#define HALF_EXT
-#endif
-
-  dev->extensions = DOUBLE_EXT HALF_EXT "cl_khr_byte_addressable_store "
-      "cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics "
-      "cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics "
-      "cl_khr_int64_base_atomics cl_khr_int64_extended_atomics";
+  dev->should_allocate_svm = 0;
+  /* OpenCL 2.0 properties */
+  dev->svm_caps = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER
+                  | CL_DEVICE_SVM_FINE_GRAIN_BUFFER
+                  | CL_DEVICE_SVM_ATOMICS;
+  /* TODO these are minimums, figure out whats a reasonable value */
+  dev->max_events = 1024;
+  dev->max_queues = 1;
+  dev->max_pipe_args = 16;
+  dev->max_pipe_active_res = 1;
+  dev->max_pipe_packet_size = 1024;
+  dev->dev_queue_pref_size = 16 * 1024;
+  dev->dev_queue_max_size = 256 * 1024;
+  dev->on_dev_queue_props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+                               | CL_QUEUE_PROFILING_ENABLE;
+  dev->on_host_queue_props = CL_QUEUE_PROFILING_ENABLE;
+
+
+  dev->extensions = HOST_DEVICE_EXTENSIONS;
 
   dev->llvm_target_triplet = OCL_KERNEL_TARGET;
+#ifdef POCL_BUILT_WITH_CMAKE
+  dev->llvm_cpu = get_cpu_name();
+#else
   dev->llvm_cpu = OCL_KERNEL_TARGET_CPU;
+#endif
   dev->has_64bit_long = 1;
   dev->autolocals_to_args = 1;
 }
@@ -346,71 +353,7 @@ pocl_basic_probe(struct pocl_device_ops *ops)
   return env_count;
 }
 
-#define MIN_MAX_MEM_ALLOC_SIZE (128*1024*1024)
 
-/* set maximum allocation sizes for buffers and images */
-void
-pocl_basic_set_buffer_image_limits(cl_device_id device)
-{
-  /* Maximum allocation size: we don't have hardware limits, so we
-   * can potentially allocate the whole memory for a single buffer, unless
-   * of course there are limits set at the operating system level. Of course
-   * we still have to respect the OpenCL-commanded minimum */
-  size_t alloc_limit = SIZE_MAX;
-
-#ifndef _MSC_VER
-  // TODO getrlimit equivalent under Windows
-  struct rlimit limits;
-  int ret = getrlimit(RLIMIT_DATA, &limits);
-  if (ret == 0)
-    alloc_limit = limits.rlim_cur;
-#endif
-  if (alloc_limit > device->global_mem_size)
-    alloc_limit = device->global_mem_size;
-  else if (alloc_limit < MIN_MAX_MEM_ALLOC_SIZE)
-    alloc_limit = MIN_MAX_MEM_ALLOC_SIZE;
-  // TODO in theory now if alloc_limit was > rlim_cur and < rlim_max
-  // we should try and setrlimit to alloc_limit, or allocations might fail
-
-  device->local_mem_size = device->max_constant_buffer_size =
-    device->max_mem_alloc_size = alloc_limit;
-
-  /* We don't have hardware limitations on the buffer-backed image sizes,
-   * so we set the maximum size in terms of the maximum amount of pixels
-   * that fix in max_mem_alloc_size. A single pixel can take up to 4 32-bit channels,
-   * i.e. 16 bytes.
-   */
-  size_t max_pixels = device->max_mem_alloc_size/16;
-  if (max_pixels > device->image_max_buffer_size)
-    device->image_max_buffer_size = max_pixels;
-
-  /* Similarly, we can take the 2D image size limit to be the largest power of 2
-   * whose square fits in image_max_buffer_size; since the 2D image size limit
-   * starts at a power of 2, it's a simple matter of doubling.
-   * This is actually completely arbitrary, another equally valid option
-   * would be to have each maximum dimension match the image_max_buffer_size.
-   */
-  max_pixels = device->image2d_max_width;
-  // keep doubing until we go over
-  while (max_pixels <= device->image_max_buffer_size/max_pixels)
-    max_pixels *= 2;
-  // halve before assignment
-  max_pixels /= 2;
-  if (max_pixels > device->image2d_max_width)
-    device->image2d_max_width = device->image2d_max_height = max_pixels;
-
-  /* Same thing for 3D images, of course with cubes. Again, totally arbitrary. */
-  max_pixels = device->image3d_max_width;
-  // keep doubing until we go over
-  while (max_pixels*max_pixels <= device->image_max_buffer_size/max_pixels)
-    max_pixels *= 2;
-  // halve before assignment
-  max_pixels /= 2;
-  if (max_pixels > device->image3d_max_width)
-  device->image3d_max_width = device->image3d_max_height =
-    device->image3d_max_depth = max_pixels;
-
-}
 
 void
 pocl_basic_init (cl_device_id device, const char* parameters)
@@ -440,7 +383,7 @@ pocl_basic_init (cl_device_id device, const char* parameters)
   device->global_mem_size = 1;
   pocl_topology_detect_device_info(device);
   pocl_cpuinfo_detect_device_info(device);
-  pocl_basic_set_buffer_image_limits(device);
+  pocl_set_buffer_image_limits(device);
 
   /* in case hwloc doesn't provide a PCI ID, let's generate
      a vendor id that hopefully is unique across vendors. */
@@ -467,35 +410,6 @@ pocl_basic_init (cl_device_id device, const char* parameters)
   #endif
 }
 
-static void *
-pocl_basic_malloc (void *device_data, cl_mem_flags flags,
-		    size_t size, void *host_ptr)
-{
-  void *b;
-
-  if (flags & CL_MEM_COPY_HOST_PTR)
-    {
-      b = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, size);
-      if (b != NULL)
-        {
-          memcpy(b, host_ptr, size);
-          return b;
-        }
-      
-      return NULL;
-    }
-  
-  if (flags & CL_MEM_USE_HOST_PTR && host_ptr != NULL)
-    {
-      return host_ptr;
-    }
-  b = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, size);
-  if (b != NULL)
-    return b;
-  
-  return NULL;
-}
-
 cl_int
 pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem_obj)
 {
@@ -513,7 +427,7 @@ pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem_obj)
         }
       else
         {
-          b = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, mem_obj->size);
+          b = pocl_memalign_alloc_global_mem(device, MAX_EXTENDED_ALIGNMENT, mem_obj->size);
           if (b == NULL)
             return CL_MEM_OBJECT_ALLOCATION_FAILURE;
         }
@@ -537,12 +451,24 @@ pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem_obj)
 }
 
 void
-pocl_basic_free (void *data, cl_mem_flags flags, void *ptr)
+pocl_basic_free (cl_device_id device, cl_mem memobj)
 {
+  cl_mem_flags flags = memobj->flags;
+
   if (flags & CL_MEM_USE_HOST_PTR)
     return;
-  
-  POCL_MEM_FREE(ptr);
+
+  void* ptr = memobj->device_ptrs[device->dev_id].mem_ptr;
+  size_t size = memobj->size;
+
+  pocl_free_global_mem(device, ptr, size);
+}
+
+void pocl_basic_free_ptr (cl_device_id device, void* mem_ptr)
+{
+  /* TODO we should somehow figure out the size argument
+   * and call pocl_free_global_mem */
+  POCL_MEM_FREE(mem_ptr);
 }
 
 void
@@ -596,7 +522,7 @@ pocl_basic_run
       if (kernel->arg_info[i].is_local)
         {
           arguments[i] = malloc (sizeof (void *));
-          *(void **)(arguments[i]) = pocl_basic_malloc(data, 0, al->size, NULL);
+          *(void **)(arguments[i]) = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, al->size);
         }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
         {
@@ -617,7 +543,7 @@ pocl_basic_run
           dev_image_t di;
           fill_dev_image_t (&di, al, cmd->device);
 
-          void* devptr = pocl_basic_malloc (data, 0, sizeof(dev_image_t), NULL);
+          void* devptr = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT,  sizeof(dev_image_t));
           arguments[i] = malloc (sizeof (void *));
           *(void **)(arguments[i]) = devptr; 
           pocl_basic_write (data, &di, devptr, 0, sizeof(dev_image_t));
@@ -627,7 +553,7 @@ pocl_basic_run
           dev_sampler_t ds;
           fill_dev_sampler_t(&ds, al);
           
-          void* devptr = pocl_basic_malloc (data, 0, sizeof(dev_sampler_t), NULL);
+          void* devptr = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, sizeof(dev_sampler_t));
           arguments[i] = malloc (sizeof (void *));
           *(void **)(arguments[i]) = devptr;
           pocl_basic_write (data, &ds, devptr, 0, sizeof(dev_sampler_t));
@@ -643,7 +569,7 @@ pocl_basic_run
     {
       al = &(cmd->command.run.arguments[i]);
       arguments[i] = malloc (sizeof (void *));
-      *(void **)(arguments[i]) = pocl_basic_malloc (data, 0, al->size, NULL);
+      *(void **)(arguments[i]) = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, al->size);
     }
 
   for (z = 0; z < pc->num_groups[2]; ++z)
@@ -665,13 +591,13 @@ pocl_basic_run
     {
       if (kernel->arg_info[i].is_local)
         {
-          pocl_basic_free (data, 0, *(void **)(arguments[i]));
+          POCL_MEM_FREE(*(void **)(arguments[i]));
           POCL_MEM_FREE(arguments[i]);
         }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE ||
                 kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
         {
-          pocl_basic_free (data, 0, *(void **)(arguments[i]));
+          POCL_MEM_FREE(*(void **)(arguments[i]));
           POCL_MEM_FREE(arguments[i]);
         }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER && *(void**)arguments[i] == NULL)
@@ -683,7 +609,7 @@ pocl_basic_run
        i < kernel->num_args + kernel->num_locals;
        ++i)
     {
-      pocl_basic_free(data, 0, *(void **)(arguments[i]));
+      POCL_MEM_FREE(*(void **)(arguments[i]));
       POCL_MEM_FREE(arguments[i]);
     }
   free(arguments);
@@ -824,6 +750,83 @@ pocl_basic_fill_rect (void *data,
                 + buffer_slice_pitch * k, fill_pixel, pixel_size);
 }
 
+void pocl_basic_memfill(void *ptr,
+                        size_t size,
+                        size_t offset,
+                        const void* pattern,
+                        size_t pattern_size)
+{
+  size_t i;
+  unsigned j;
+
+  switch (pattern_size)
+    {
+    case 1:
+      {
+      uint8_t * p = (uint8_t*)ptr + offset;
+      for (i = 0; i < size; i++)
+        p[i] = *(uint8_t*)pattern;
+      }
+      break;
+    case 2:
+      {
+      uint16_t * p = (uint16_t*)ptr + offset;
+      for (i = 0; i < size; i++)
+        p[i] = *(uint16_t*)pattern;
+      }
+      break;
+    case 4:
+      {
+      uint32_t * p = (uint32_t*)ptr + offset;
+      for (i = 0; i < size; i++)
+        p[i] = *(uint32_t*)pattern;
+      }
+      break;
+    case 8:
+      {
+      uint64_t * p = (uint64_t*)ptr + offset;
+      for (i = 0; i < size; i++)
+        p[i] = *(uint64_t*)pattern;
+      }
+      break;
+    case 16:
+      {
+      uint64_t * p = (uint64_t*)ptr + offset;
+      for (i = 0; i < size; i++)
+        for (j = 0; j < 2; j++)
+          p[(i<<1) + j] = *((uint64_t*)pattern + j);
+      }
+      break;
+    case 32:
+      {
+      uint64_t * p = (uint64_t*)ptr + offset;
+      for (i = 0; i < size; i++)
+        for (j = 0; j < 4; j++)
+          p[(i<<2) + j] = *((uint64_t*)pattern + j);
+      }
+      break;
+    case 64:
+      {
+      uint64_t * p = (uint64_t*)ptr + offset;
+      for (i = 0; i < size; i++)
+        for (j = 0; j < 8; j++)
+          p[(i<<3) + j] = *((uint64_t*)pattern + j);
+      }
+      break;
+    case 128:
+      {
+      uint64_t * p = (uint64_t*)ptr + offset;
+      for (i = 0; i < size; i++)
+        for (j = 0; j < 16; j++)
+          p[(i<<4) + j] = *((uint64_t*)pattern + j);
+      }
+      break;
+    default:
+      assert (0 && "Invalid pattern size");
+      break;
+    }
+}
+
 void *
 pocl_basic_map_mem (void *data, void *buf_ptr, 
                       size_t offset, size_t size,
@@ -835,6 +838,14 @@ pocl_basic_map_mem (void *data, void *buf_ptr,
   return (char*)buf_ptr + offset;
 }
 
+void* pocl_basic_unmap_mem(void *data, void *host_ptr,
+                           void *device_start_ptr,
+                           size_t size)
+{
+  return host_ptr;
+}
+
+
 void
 pocl_basic_uninit (cl_device_id device)
 {
@@ -846,21 +857,7 @@ pocl_basic_uninit (cl_device_id device)
 cl_ulong
 pocl_basic_get_timer_value (void *data) 
 {
-#ifndef _MSC_VER
-  struct timeval current;
-  gettimeofday(&current, NULL);  
-  return (current.tv_sec * 1000000 + current.tv_usec)*1000;
-#else
-  FILETIME ft;
-  cl_ulong tmpres = 0;
-  GetSystemTimeAsFileTime(&ft);
-  tmpres |= ft.dwHighDateTime;
-  tmpres <<= 32;
-  tmpres |= ft.dwLowDateTime;
-  tmpres -= 11644473600000000Ui64;
-  tmpres /= 10;
-  return tmpres;
-#endif
+  return pocl_gettimemono_ns();
 }
 
 cl_int 
@@ -910,10 +907,6 @@ void check_compiler_cache (_cl_command_node *cmd)
           return;
         }
     }
-  cl_program program = cmd->command.run.kernel->program;
-
-  void* cache_lock = pocl_cache_acquire_writer_lock(program, cmd->device);
-  assert(cache_lock);
 
   ci = (compiler_cache_item*) malloc (sizeof (compiler_cache_item));
   ci->next = NULL;
@@ -936,7 +929,6 @@ void check_compiler_cache (_cl_command_node *cmd)
   cmd->command.run.wg = ci->wg = 
     (pocl_workgroup) lt_dlsym (dlhandle, workgroup_string);
 
-  pocl_cache_release_lock(cache_lock);
   LL_APPEND (compiler_cache, ci);
   POCL_UNLOCK (compiler_cache_lock);
 
diff --git a/lib/CL/devices/bufalloc.c b/lib/CL/devices/bufalloc.c
index af609bb..e04cc41 100644
--- a/lib/CL/devices/bufalloc.c
+++ b/lib/CL/devices/bufalloc.c
@@ -1,17 +1,17 @@
 /* OpenCL runtime/device driver library: custom buffer allocator
 
    Copyright (c) 2011 Tampere University of Technology
-   
+
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -29,7 +29,7 @@
  * dynamically allocate local memory chunks. The interface is address space
  * agnostic; it treats memory addresses and regions of memory as integers.
  *
- * Certain assumptions of OpenCL allocation patterns are made to optimize and 
+ * Certain assumptions of OpenCL allocation patterns are made to optimize and
  * simplify the implementation:
  *
  * 1) The allocations are often quite large and the "lifetimes" of the
@@ -48,7 +48,7 @@
  * Traversing through the list of chunk infos when searching for an
  * available freed chunk can be considered to be not very costly.
  *
- * 3a) There is no lack of (global) memory. 
+ * 3a) There is no lack of (global) memory.
  *
  * A wasteful but fast strategy can be used. Here the chunk is always
  * tried to be allocated to the end of the region to enforce "sequential
@@ -58,7 +58,7 @@
  *
  * A slower but less wasteful strategy should be used. In this version
  * the list of old chunks should be traversed first and reused in case
- * a large enough unallocated one is found. This version can be used 
+ * a large enough unallocated one is found. This version can be used
  * also for the case where there's a single region (basically heap) that
  * grows towards the stack or the global data area of the memory.
  *
@@ -79,7 +79,7 @@
 void
 print_chunk (chunk_info_t *chunk)
 {
-  printf ("### chunk %p: allocated: %d start: %zx size: %zu prev: %p next: %p\n", 
+  printf ("### chunk %p: allocated: %d start: %zx size: %zu prev: %p next: %p\n",
           chunk, chunk->is_allocated, chunk->start_address,
           chunk->size, chunk->prev, chunk->next);
 }
@@ -88,7 +88,7 @@ void
 print_chunks (chunk_info_t *first)
 {
   chunk_info_t *chunk;
-  DL_FOREACH (first, chunk) 
+  DL_FOREACH (first, chunk)
     {
       print_chunk (chunk);
     }
@@ -97,7 +97,7 @@ print_chunks (chunk_info_t *first)
 static int
 chunk_slack (chunk_info_t* chunk, size_t size, size_t* last_chunk_size)
 {
-  memory_address_t aligned_start_addr = 
+  memory_address_t aligned_start_addr =
     (chunk->start_address + chunk->parent_region->alignment - 1) &
     ~(chunk->parent_region->alignment - 1);
   size_t end_chunk = chunk->start_address + chunk->size;
@@ -115,9 +115,9 @@ chunk_slack (chunk_info_t* chunk, size_t size, size_t* last_chunk_size)
  *
  * @return The address of the chunk if it fits, 0 otherwise.
  */
-static chunk_info_t * 
-append_new_chunk (memory_region_t *region, 
-                  size_t size) 
+static chunk_info_t *
+append_new_chunk (memory_region_t *region,
+                  size_t size)
 {
 
   chunk_info_t* new_chunk = NULL;
@@ -140,24 +140,24 @@ append_new_chunk (memory_region_t *region,
     {
       BA_UNLOCK (region->lock);
       return NULL;
-    } 
-  else 
+    }
+  else
     {
       DL_DELETE (region->free_chunks, new_chunk);
     }
 
   /* Round the start address up towards the closest aligned
      address. */
-  new_chunk->start_address = 
+  new_chunk->start_address =
     (region->last_chunk->start_address + region->alignment - 1) &
-    ~(region->alignment - 1); 
+    ~(region->alignment - 1);
   new_chunk->parent_region = region;
   new_chunk->size = size;
   new_chunk->is_allocated = 1;
   new_chunk->children = NULL;
 
   chunk_slack (region->last_chunk, size, (size_t*)&region->last_chunk->size);
-  region->last_chunk->start_address = 
+  region->last_chunk->start_address =
     new_chunk->start_address + new_chunk->size;
 
   DL_DELETE (region->chunks, region->last_chunk);
@@ -169,7 +169,7 @@ append_new_chunk (memory_region_t *region,
   print_chunks (region->chunks);
   printf ("\n");
 #endif
-  
+
   BA_UNLOCK (region->lock);
 
   return new_chunk;
@@ -180,8 +180,8 @@ append_new_chunk (memory_region_t *region,
  *
  * @return The chunk, or NULL if no space available in the region.
  */
-chunk_info_t* 
-alloc_buffer_from_region (memory_region_t *region, size_t size) 
+chunk_info_t*
+alloc_buffer_from_region (memory_region_t *region, size_t size)
 {
   assert (region != NULL);
   /* The memory-wasteful but fast strategy:
@@ -197,11 +197,11 @@ alloc_buffer_from_region (memory_region_t *region, size_t size)
     }
 
   BA_LOCK (region->lock);
-  
-  DL_FOREACH (region->chunks, cursor) 
+
+  DL_FOREACH (region->chunks, cursor)
     {
       if (cursor == region->last_chunk ||
-          cursor->is_allocated || 
+          cursor->is_allocated ||
           !chunk_slack (cursor, size, NULL))
         continue; /* doesn't fit */
       /* found one */
@@ -218,10 +218,10 @@ alloc_buffer_from_region (memory_region_t *region, size_t size)
 
   BA_UNLOCK (region->lock);
 
-  if (chunk == NULL && region->strategy != BALLOCS_WASTEFUL) 
+  if (chunk == NULL && region->strategy != BALLOCS_WASTEFUL)
     {
       return append_new_chunk (region, size);
-    } 
+    }
   return chunk;
 }
 
@@ -241,7 +241,7 @@ alloc_buffer (memory_region_t *regions, size_t size)
 {
   chunk_info_t *chunk = NULL;
   memory_region_t *region = NULL;
-  LL_FOREACH(regions, region) 
+  LL_FOREACH(regions, region)
     {
       chunk = alloc_buffer_from_region (region, size);
       if (chunk != NULL)
@@ -258,7 +258,7 @@ alloc_buffer (memory_region_t *regions, size_t size)
  */
 chunk_info_t *
 create_sub_chunk (chunk_info_t *parent, size_t offset, size_t size)
-{    
+{
   chunk_info_t *subchunk = (chunk_info_t*)malloc(sizeof(struct chunk_info));
   subchunk->start_address = parent->start_address + offset;
   subchunk->size = size;
@@ -276,10 +276,10 @@ create_sub_chunk (chunk_info_t *parent, size_t offset, size_t size)
  * Must be called inside a locked region.
  *
  * @return A pointer to the coalesced chunk, or the second chunk in case
- * coalsecing could not be done. 
+ * coalsecing could not be done.
  */
-static chunk_info_t * 
-coalesce_chunks (chunk_info_t* first, 
+static chunk_info_t *
+coalesce_chunks (chunk_info_t* first,
                  chunk_info_t* second)
 {
   if (first == NULL) return second;
@@ -320,19 +320,19 @@ free_buffer (memory_region_t *regions, memory_address_t addr)
   printf ("#### free_buffer(%p, %x)\n", regions, addr);
 #endif
 
-  LL_FOREACH (regions, region) 
+  LL_FOREACH (regions, region)
     {
       chunk_info_t *chunk = NULL;
       BA_LOCK (region->lock);
       DL_FOREACH (region->chunks, chunk)
         {
-          if (chunk->start_address == addr) 
+          if (chunk->start_address == addr)
             {
               chunk->is_allocated = 0;
               coalesce_chunks (coalesce_chunks (chunk->prev, chunk), chunk->next);
               BA_UNLOCK (region->lock);
 #ifdef DEBUG_BUFALLOC
-              printf ("#### region %x after free_buffer at addr %x\n", 
+              printf ("#### region %x after free_buffer at addr %x\n",
                       region, addr);
               print_chunks (region->chunks);
               printf ("\n");
@@ -351,8 +351,8 @@ free_buffer (memory_region_t *regions, memory_address_t addr)
  * Successive unallocated chunks in the region, if found, are merged to
  * form larger unallocated chunks.
  */
-void 
-free_chunk (chunk_info_t* chunk) 
+void
+free_chunk (chunk_info_t* chunk)
 {
   memory_region_t *region = chunk->parent_region;
   BA_LOCK (region->lock);
@@ -368,12 +368,12 @@ free_chunk (chunk_info_t* chunk)
 
 }
 
-/** Initialize a memory_region_t. 
+/** Initialize a memory_region_t.
  * @param region is a pointer to a existing memory_region_t data structure.
  * @param start the base address of the memory region to be managed.
  * @Param size  the size of the region (in bytes?)
  */
-void 
+void
 init_mem_region (memory_region_t *region, memory_address_t start, size_t size)
 {
   int i;
@@ -398,7 +398,7 @@ init_mem_region (memory_region_t *region, memory_address_t start, size_t size)
     DL_APPEND (region->free_chunks, &region->all_chunks[i]);
 
 #ifdef DEBUG_BUFALLOC
-  printf ("#### memory region %x created. start: %x size: %u\n", 
+  printf ("#### memory region %x created. start: %x size: %u\n",
           region, start, size);
 #endif
 }
diff --git a/lib/CL/devices/bufalloc.h b/lib/CL/devices/bufalloc.h
index 6312148..13c5e2a 100644
--- a/lib/CL/devices/bufalloc.h
+++ b/lib/CL/devices/bufalloc.h
@@ -1,17 +1,17 @@
 /* OpenCL runtime/device driver library: custom buffer allocator
 
    Copyright (c) 2011 Tampere University of Technology
-   
+
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -71,7 +71,7 @@ typedef tce_sm_lock ba_lock_t;
 #endif
 
 /* The number of chunks in a region should be scaled to an approximate
-   maximum number of kernel buffer arguments. Running out of chunk 
+   maximum number of kernel buffer arguments. Running out of chunk
    data structures might leave region space unused due to that only. */
 #ifndef MAX_CHUNKS_IN_REGION
 #define MAX_CHUNKS_IN_REGION 64
@@ -81,12 +81,12 @@ typedef tce_sm_lock ba_lock_t;
 typedef size_t memory_address_t;
 
 /* the different strategies for how to allocate buffers from a memory region */
-enum allocation_strategy 
+enum allocation_strategy
   {
-    BALLOCS_WASTEFUL, /* try to fit to the end of the region first 
+    BALLOCS_WASTEFUL, /* try to fit to the end of the region first
                          (consumes the whole region quicker) */
-    BALLOCS_TIGHT     /* try to reuse old freed chunks first 
-                         (for the case when the region grows dynamically e.g. towards stack) 
+    BALLOCS_TIGHT     /* try to reuse old freed chunks first
+                         (for the case when the region grows dynamically e.g. towards stack)
                       */
   };
 
@@ -94,14 +94,14 @@ typedef AS_QUALIFIER struct chunk_info chunk_info_t;
 
 typedef AS_QUALIFIER struct memory_region memory_region_t;
 
-/* Info of a single "chunk" inside a memory region. Chunk is a piece 
-   of memory that has been allocated to a buffer (but might have been 
+/* Info of a single "chunk" inside a memory region. Chunk is a piece
+   of memory that has been allocated to a buffer (but might have been
    unallocated). Initially there's only one special chunk representing
    the whole region as one unallocated chunk. */
-struct chunk_info 
+struct chunk_info
 {
   memory_address_t start_address;
-  int is_allocated; 
+  int is_allocated;
   size_t size; /* size in bytes */
   chunk_info_t* next;
   chunk_info_t* prev;
@@ -113,25 +113,25 @@ struct chunk_info
 /* Represents a single continuous region of memory from which smaller
    "chunks" are allocated. Note: this doesn't include the memory space
    itself. */
-struct memory_region 
+struct memory_region
 {
   chunk_info_t all_chunks[MAX_CHUNKS_IN_REGION];
   chunk_info_t *chunks;
-  chunk_info_t *free_chunks; /* A pointer to a head of a linked list of 
-                                chunk_info records that can be used for 
-                                new allocations. This enables allocating 
-                                the chunk infos statically at compile time, 
-                                or dynamically. In the dynamic case, the 
-                                client of the bufalloc should first ensure 
-                                there is at least one free chunk info before 
+  chunk_info_t *free_chunks; /* A pointer to a head of a linked list of
+                                chunk_info records that can be used for
+                                new allocations. This enables allocating
+                                the chunk infos statically at compile time,
+                                or dynamically. In the dynamic case, the
+                                client of the bufalloc should first ensure
+                                there is at least one free chunk info before
                                 trying the allocation. If not, create one. */
   chunk_info_t *last_chunk; /* The last chunk in the region (a "sentinel"). In case
-                               the last chunk is allocated, the region 
+                               the last chunk is allocated, the region
                                is completely full. New chunks should be inserted
                                before this chunk. */
   memory_region_t *next;
   memory_region_t *prev;
-  enum allocation_strategy strategy; 
+  enum allocation_strategy strategy;
   unsigned short alignment; /* alignment of the returned chunks in a 2's exponent byte count */
   ba_lock_t lock;
 };
@@ -158,7 +158,7 @@ void
 print_chunks (chunk_info_t *first);
 
 #ifdef __GNUC__
-#pragma GCC visibility pop 
+#pragma GCC visibility pop
 #endif
 
 #ifdef __cplusplus
diff --git a/lib/CL/devices/cellspu/Makefile.am b/lib/CL/devices/cellspu/Makefile.am
deleted file mode 100644
index 879a35c..0000000
--- a/lib/CL/devices/cellspu/Makefile.am
+++ /dev/null
@@ -1,32 +0,0 @@
-# Process this file with automake to produce Makefile.in (in this,
-# and all subdirectories).
-# Makefile.am for pocl/lib/CL/devices/cellspu
-# 
-# Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-noinst_LTLIBRARIES = libpocl-devices-cellspu.la
-
-libpocl_devices_cellspu_la_SOURCES = cellspu.h cellspu.c
-
-libpocl_devices_cellspu_la_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -I$(top_srcdir)/lib/CL/devices -I$(top_srcdir)/lib/CL $(OCL_ICD_CFLAGS)
-libpocl_devices_cellspu_la_LDFLAGS = -lltdl @PTHREAD_CFLAGS@ --version-info ${LIB_VERSION}
-
-EXTRA_DIST = CMakeLists.txt
diff --git a/lib/CL/devices/cellspu/Makefile.in b/lib/CL/devices/cellspu/Makefile.in
deleted file mode 100644
index 7ee7e58..0000000
--- a/lib/CL/devices/cellspu/Makefile.in
+++ /dev/null
@@ -1,713 +0,0 @@
-# Makefile.in generated by automake 1.15 from Makefile.am.
-# @configure_input@
-
-# Copyright (C) 1994-2014 Free Software Foundation, Inc.
-
-# This Makefile.in is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
-# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE.
-
-@SET_MAKE@
-
-# Process this file with automake to produce Makefile.in (in this,
-# and all subdirectories).
-# Makefile.am for pocl/lib/CL/devices/cellspu
-# 
-# Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-VPATH = @srcdir@
-am__is_gnu_make = { \
-  if test -z '$(MAKELEVEL)'; then \
-    false; \
-  elif test -n '$(MAKE_HOST)'; then \
-    true; \
-  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
-    true; \
-  else \
-    false; \
-  fi; \
-}
-am__make_running_with_option = \
-  case $${target_option-} in \
-      ?) ;; \
-      *) echo "am__make_running_with_option: internal error: invalid" \
-              "target option '$${target_option-}' specified" >&2; \
-         exit 1;; \
-  esac; \
-  has_opt=no; \
-  sane_makeflags=$$MAKEFLAGS; \
-  if $(am__is_gnu_make); then \
-    sane_makeflags=$$MFLAGS; \
-  else \
-    case $$MAKEFLAGS in \
-      *\\[\ \	]*) \
-        bs=\\; \
-        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
-          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
-    esac; \
-  fi; \
-  skip_next=no; \
-  strip_trailopt () \
-  { \
-    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
-  }; \
-  for flg in $$sane_makeflags; do \
-    test $$skip_next = yes && { skip_next=no; continue; }; \
-    case $$flg in \
-      *=*|--*) continue;; \
-        -*I) strip_trailopt 'I'; skip_next=yes;; \
-      -*I?*) strip_trailopt 'I';; \
-        -*O) strip_trailopt 'O'; skip_next=yes;; \
-      -*O?*) strip_trailopt 'O';; \
-        -*l) strip_trailopt 'l'; skip_next=yes;; \
-      -*l?*) strip_trailopt 'l';; \
-      -[dEDm]) skip_next=yes;; \
-      -[JT]) skip_next=yes;; \
-    esac; \
-    case $$flg in \
-      *$$target_option*) has_opt=yes; break;; \
-    esac; \
-  done; \
-  test $$has_opt = yes
-am__make_dryrun = (target_option=n; $(am__make_running_with_option))
-am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
-pkgdatadir = $(datadir)/@PACKAGE@
-pkgincludedir = $(includedir)/@PACKAGE@
-pkglibdir = $(libdir)/@PACKAGE@
-pkglibexecdir = $(libexecdir)/@PACKAGE@
-am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
-install_sh_DATA = $(install_sh) -c -m 644
-install_sh_PROGRAM = $(install_sh) -c
-install_sh_SCRIPT = $(install_sh) -c
-INSTALL_HEADER = $(INSTALL_DATA)
-transform = $(program_transform_name)
-NORMAL_INSTALL = :
-PRE_INSTALL = :
-POST_INSTALL = :
-NORMAL_UNINSTALL = :
-PRE_UNINSTALL = :
-POST_UNINSTALL = :
-build_triplet = @build@
-host_triplet = @host@
-target_triplet = @target@
-subdir = lib/CL/devices/cellspu
-ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
-	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
-	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
-	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \
-	$(top_srcdir)/configure.ac
-am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
-	$(ACLOCAL_M4)
-DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
-mkinstalldirs = $(install_sh) -d
-CONFIG_HEADER = $(top_builddir)/config.h
-CONFIG_CLEAN_FILES =
-CONFIG_CLEAN_VPATH_FILES =
-LTLIBRARIES = $(noinst_LTLIBRARIES)
-libpocl_devices_cellspu_la_LIBADD =
-am_libpocl_devices_cellspu_la_OBJECTS =  \
-	libpocl_devices_cellspu_la-cellspu.lo
-libpocl_devices_cellspu_la_OBJECTS =  \
-	$(am_libpocl_devices_cellspu_la_OBJECTS)
-AM_V_lt = $(am__v_lt_@AM_V@)
-am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
-am__v_lt_0 = --silent
-am__v_lt_1 = 
-libpocl_devices_cellspu_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
-	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
-	$(AM_CFLAGS) $(CFLAGS) $(libpocl_devices_cellspu_la_LDFLAGS) \
-	$(LDFLAGS) -o $@
-AM_V_P = $(am__v_P_@AM_V@)
-am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
-am__v_P_0 = false
-am__v_P_1 = :
-AM_V_GEN = $(am__v_GEN_@AM_V@)
-am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
-am__v_GEN_0 = @echo "  GEN     " $@;
-am__v_GEN_1 = 
-AM_V_at = $(am__v_at_@AM_V@)
-am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
-am__v_at_0 = @
-am__v_at_1 = 
-DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
-depcomp = $(SHELL) $(top_srcdir)/config/depcomp
-am__depfiles_maybe = depfiles
-am__mv = mv -f
-COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
-	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
-	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
-	$(AM_CFLAGS) $(CFLAGS)
-AM_V_CC = $(am__v_CC_@AM_V@)
-am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
-am__v_CC_0 = @echo "  CC      " $@;
-am__v_CC_1 = 
-CCLD = $(CC)
-LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
-	$(AM_LDFLAGS) $(LDFLAGS) -o $@
-AM_V_CCLD = $(am__v_CCLD_@AM_V@)
-am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
-am__v_CCLD_0 = @echo "  CCLD    " $@;
-am__v_CCLD_1 = 
-SOURCES = $(libpocl_devices_cellspu_la_SOURCES)
-DIST_SOURCES = $(libpocl_devices_cellspu_la_SOURCES)
-am__can_run_installinfo = \
-  case $$AM_UPDATE_INFO_DIR in \
-    n|no|NO) false;; \
-    *) (install-info --version) >/dev/null 2>&1;; \
-  esac
-am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
-# Read a list of newline-separated strings from the standard input,
-# and print each of them once, without duplicates.  Input order is
-# *not* preserved.
-am__uniquify_input = $(AWK) '\
-  BEGIN { nonempty = 0; } \
-  { items[$$0] = 1; nonempty = 1; } \
-  END { if (nonempty) { for (i in items) print i; }; } \
-'
-# Make sure the list of sources is unique.  This is necessary because,
-# e.g., the same source file might be shared among _SOURCES variables
-# for different programs/libraries.
-am__define_uniq_tagged_files = \
-  list='$(am__tagged_files)'; \
-  unique=`for i in $$list; do \
-    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-  done | $(am__uniquify_input)`
-ETAGS = etags
-CTAGS = ctags
-am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/config/depcomp
-DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
-ACLOCAL = @ACLOCAL@
-AMTAR = @AMTAR@
-AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
-AR = @AR@
-AUTOCONF = @AUTOCONF@
-AUTOHEADER = @AUTOHEADER@
-AUTOMAKE = @AUTOMAKE@
-AWK = @AWK@
-BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
-BOOST_LDFLAGS = @BOOST_LDFLAGS@
-BUILD_TIMESTAMP = @BUILD_TIMESTAMP@
-CC = @CC@
-CCDEPMODE = @CCDEPMODE@
-CFLAGS = @CFLAGS@
-CLANG = @CLANG@
-CLANGXX = @CLANGXX@
-CLANGXX_FLAGS = @CLANGXX_FLAGS@
-CLFLAGS = @CLFLAGS@
-CPP = @CPP@
-CPPFLAGS = @CPPFLAGS@
-CXX = @CXX@
-CXXCPP = @CXXCPP@
-CXXDEPMODE = @CXXDEPMODE@
-CXXFLAGS = @CXXFLAGS@
-CYGPATH_W = @CYGPATH_W@
-DEFS = @DEFS@
-DEPDIR = @DEPDIR@
-DLLTOOL = @DLLTOOL@
-DSYMUTIL = @DSYMUTIL@
-DUMPBIN = @DUMPBIN@
-ECHO_C = @ECHO_C@
-ECHO_N = @ECHO_N@
-ECHO_T = @ECHO_T@
-EGREP = @EGREP@
-EXEEXT = @EXEEXT@
-FGREP = @FGREP@
-FORCED_CLFLAGS = @FORCED_CLFLAGS@
-GLEW_CFLAGS = @GLEW_CFLAGS@
-GLEW_LIBS = @GLEW_LIBS@
-GREP = @GREP@
-HOST = @HOST@
-HOST_AS_FLAGS = @HOST_AS_FLAGS@
-HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
-HOST_CPU = @HOST_CPU@
-HOST_LD_FLAGS = @HOST_LD_FLAGS@
-HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
-HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
-HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
-HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
-HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
-HSAILASM = @HSAILASM@
-HSA_INCLUDES = @HSA_INCLUDES@
-HSA_LIBS = @HSA_LIBS@
-HWLOC_CFLAGS = @HWLOC_CFLAGS@
-HWLOC_LIBS = @HWLOC_LIBS@
-ICD_LD_FLAGS = @ICD_LD_FLAGS@
-INSTALL = @INSTALL@
-INSTALL_DATA = @INSTALL_DATA@
-INSTALL_PROGRAM = @INSTALL_PROGRAM@
-INSTALL_SCRIPT = @INSTALL_SCRIPT@
-INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
-KERNEL_COMPILER_LIB_VERSION = @KERNEL_COMPILER_LIB_VERSION@
-LD = @LD@
-LDFLAGS = @LDFLAGS@
-LD_FLAGS_BIN = @LD_FLAGS_BIN@
-LIBOBJS = @LIBOBJS@
-LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
-LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
-LIBTOOL = @LIBTOOL@
-LIB_AGE_VERSION = @LIB_AGE_VERSION@
-LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
-LIB_FIRST_VERSION = @LIB_FIRST_VERSION@
-LIB_REVISION_VERSION = @LIB_REVISION_VERSION@
-LIB_VERSION = @LIB_VERSION@
-LIPO = @LIPO@
-LLC = @LLC@
-LLVM_AS = @LLVM_AS@
-LLVM_CONFIG = @LLVM_CONFIG@
-LLVM_CXX_FLAGS = @LLVM_CXX_FLAGS@
-LLVM_LDFLAGS = @LLVM_LDFLAGS@
-LLVM_LIBS = @LLVM_LIBS@
-LLVM_LINK = @LLVM_LINK@
-LLVM_OPT = @LLVM_OPT@
-LLVM_VERSION = @LLVM_VERSION@
-LN_S = @LN_S@
-LTDL_LIBS = @LTDL_LIBS@
-LTLIBOBJS = @LTLIBOBJS@
-LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
-MAKEINFO = @MAKEINFO@
-MANIFEST_TOOL = @MANIFEST_TOOL@
-MKDIR_P = @MKDIR_P@
-NM = @NM@
-NMEDIT = @NMEDIT@
-OBJDUMP = @OBJDUMP@
-OBJEXT = @OBJEXT@
-OCL_ICD_CFLAGS = @OCL_ICD_CFLAGS@
-OCL_ICD_LIBS = @OCL_ICD_LIBS@
-OCL_KERNEL_ARCH = @OCL_KERNEL_ARCH@
-OCL_KERNEL_TARGET = @OCL_KERNEL_TARGET@
-OCL_KERNEL_TARGET_CPU = @OCL_KERNEL_TARGET_CPU@
-OCL_TARGETS = @OCL_TARGETS@
-OPENCL_CFLAGS = @OPENCL_CFLAGS@
-OPENCL_CMAKE = @OPENCL_CMAKE@
-OPENCL_EXTLIBS = @OPENCL_EXTLIBS@
-OPENCL_LIBS = @OPENCL_LIBS@
-OPT = @OPT@
-OTOOL = @OTOOL@
-OTOOL64 = @OTOOL64@
-PACKAGE = @PACKAGE@
-PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
-PACKAGE_NAME = @PACKAGE_NAME@
-PACKAGE_STRING = @PACKAGE_STRING@
-PACKAGE_TARNAME = @PACKAGE_TARNAME@
-PACKAGE_URL = @PACKAGE_URL@
-PACKAGE_VERSION = @PACKAGE_VERSION@
-PATH_SEPARATOR = @PATH_SEPARATOR@
-PKG_CONFIG = @PKG_CONFIG@
-PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
-PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
-POAT_TESTSUITES = @POAT_TESTSUITES@
-POCL_DEVICE_ADDRESS_BITS = @POCL_DEVICE_ADDRESS_BITS@
-PTHREAD_CC = @PTHREAD_CC@
-PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
-PTHREAD_LIBS = @PTHREAD_LIBS@
-RANLIB = @RANLIB@
-SDL_CFLAGS = @SDL_CFLAGS@
-SDL_LIBS = @SDL_LIBS@
-SED = @SED@
-SET_MAKE = @SET_MAKE@
-SHELL = @SHELL@
-STRIP = @STRIP@
-TARGET = @TARGET@
-TARGET_CLANG_FLAGS = @TARGET_CLANG_FLAGS@
-TARGET_CPU = @TARGET_CPU@
-TARGET_LLC_FLAGS = @TARGET_LLC_FLAGS@
-TARGET_SIZEOF_DOUBLE = @TARGET_SIZEOF_DOUBLE@
-TARGET_SIZEOF_HALF = @TARGET_SIZEOF_HALF@
-TARGET_SIZEOF_LONG = @TARGET_SIZEOF_LONG@
-TARGET_SIZEOF_VOID_P = @TARGET_SIZEOF_VOID_P@
-TCECC = @TCECC@
-TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
-TCE_AVAILABLE = @TCE_AVAILABLE@
-TCE_CONFIG = @TCE_CONFIG@
-VERSION = @VERSION@
-abs_builddir = @abs_builddir@
-abs_srcdir = @abs_srcdir@
-abs_top_builddir = @abs_top_builddir@
-abs_top_srcdir = @abs_top_srcdir@
-ac_ct_AR = @ac_ct_AR@
-ac_ct_CC = @ac_ct_CC@
-ac_ct_CXX = @ac_ct_CXX@
-ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
-acx_pthread_config = @acx_pthread_config@
-am__include = @am__include@
-am__leading_dot = @am__leading_dot@
-am__quote = @am__quote@
-am__tar = @am__tar@
-am__untar = @am__untar@
-bindir = @bindir@
-build = @build@
-build_alias = @build_alias@
-build_cpu = @build_cpu@
-build_os = @build_os@
-build_vendor = @build_vendor@
-builddir = @builddir@
-datadir = @datadir@
-datarootdir = @datarootdir@
-docdir = @docdir@
-dvidir = @dvidir@
-exec_prefix = @exec_prefix@
-host = @host@
-host_alias = @host_alias@
-host_cpu = @host_cpu@
-host_os = @host_os@
-host_vendor = @host_vendor@
-htmldir = @htmldir@
-includedir = @includedir@
-infodir = @infodir@
-install_sh = @install_sh@
-libdir = @libdir@
-libexecdir = @libexecdir@
-localedir = @localedir@
-localstatedir = @localstatedir@
-mandir = @mandir@
-mkdir_p = @mkdir_p@
-oldincludedir = @oldincludedir@
-pdfdir = @pdfdir@
-prefix = @prefix@
-program_transform_name = @program_transform_name@
-psdir = @psdir@
-sbindir = @sbindir@
-sharedstatedir = @sharedstatedir@
-srcdir = @srcdir@
-sysconfdir = @sysconfdir@
-target = @target@
-target_alias = @target_alias@
-target_cpu = @target_cpu@
-target_os = @target_os@
-target_vendor = @target_vendor@
-top_build_prefix = @top_build_prefix@
-top_builddir = @top_builddir@
-top_srcdir = @top_srcdir@
-noinst_LTLIBRARIES = libpocl-devices-cellspu.la
-libpocl_devices_cellspu_la_SOURCES = cellspu.h cellspu.c
-libpocl_devices_cellspu_la_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -I$(top_srcdir)/lib/CL/devices -I$(top_srcdir)/lib/CL $(OCL_ICD_CFLAGS)
-libpocl_devices_cellspu_la_LDFLAGS = -lltdl @PTHREAD_CFLAGS@ --version-info ${LIB_VERSION}
-EXTRA_DIST = CMakeLists.txt
-all: all-am
-
-.SUFFIXES:
-.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
-	@for dep in $?; do \
-	  case '$(am__configure_deps)' in \
-	    *$$dep*) \
-	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
-	        && { if test -f $@; then exit 0; else break; fi; }; \
-	      exit 1;; \
-	  esac; \
-	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign lib/CL/devices/cellspu/Makefile'; \
-	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign lib/CL/devices/cellspu/Makefile
-Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
-	@case '$?' in \
-	  *config.status*) \
-	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
-	  *) \
-	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
-	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
-	esac;
-
-$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-
-$(top_srcdir)/configure:  $(am__configure_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(am__aclocal_m4_deps):
-
-clean-noinstLTLIBRARIES:
-	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
-	@list='$(noinst_LTLIBRARIES)'; \
-	locs=`for p in $$list; do echo $$p; done | \
-	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
-	      sort -u`; \
-	test -z "$$locs" || { \
-	  echo rm -f $${locs}; \
-	  rm -f $${locs}; \
-	}
-
-libpocl-devices-cellspu.la: $(libpocl_devices_cellspu_la_OBJECTS) $(libpocl_devices_cellspu_la_DEPENDENCIES) $(EXTRA_libpocl_devices_cellspu_la_DEPENDENCIES) 
-	$(AM_V_CCLD)$(libpocl_devices_cellspu_la_LINK)  $(libpocl_devices_cellspu_la_OBJECTS) $(libpocl_devices_cellspu_la_LIBADD) $(LIBS)
-
-mostlyclean-compile:
-	-rm -f *.$(OBJEXT)
-
-distclean-compile:
-	-rm -f *.tab.c
-
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpocl_devices_cellspu_la-cellspu.Plo@am__quote@
-
-.c.o:
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
-
-.c.obj:
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
-
-.c.lo:
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
-
-libpocl_devices_cellspu_la-cellspu.lo: cellspu.c
-@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_devices_cellspu_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpocl_devices_cellspu_la-cellspu.lo -MD -MP -MF $(DEPDIR)/libpocl_devices_cellspu_la-cellspu.Tpo -c -o libpocl_devices_cellspu_la-cellspu.lo `test -f 'cellspu.c' || echo '$(srcdir)/'`cellspu.c
-@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpocl_devices_cellspu_la-cellspu.Tpo $(DEPDIR)/libpocl_devices_cellspu_la-cellspu.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='cellspu.c' object='libpocl_devices_cellspu_la-cellspu.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpocl_devices_cellspu_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpocl_devices_cellspu_la-cellspu.lo `test -f 'cellspu.c' || echo '$(srcdir)/'`cellspu.c
-
-mostlyclean-libtool:
-	-rm -f *.lo
-
-clean-libtool:
-	-rm -rf .libs _libs
-
-ID: $(am__tagged_files)
-	$(am__define_uniq_tagged_files); mkid -fID $$unique
-tags: tags-am
-TAGS: tags
-
-tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
-	set x; \
-	here=`pwd`; \
-	$(am__define_uniq_tagged_files); \
-	shift; \
-	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
-	  test -n "$$unique" || unique=$$empty_fix; \
-	  if test $$# -gt 0; then \
-	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
-	      "$$@" $$unique; \
-	  else \
-	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
-	      $$unique; \
-	  fi; \
-	fi
-ctags: ctags-am
-
-CTAGS: ctags
-ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
-	$(am__define_uniq_tagged_files); \
-	test -z "$(CTAGS_ARGS)$$unique" \
-	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
-	     $$unique
-
-GTAGS:
-	here=`$(am__cd) $(top_builddir) && pwd` \
-	  && $(am__cd) $(top_srcdir) \
-	  && gtags -i $(GTAGS_ARGS) "$$here"
-cscopelist: cscopelist-am
-
-cscopelist-am: $(am__tagged_files)
-	list='$(am__tagged_files)'; \
-	case "$(srcdir)" in \
-	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
-	  *) sdir=$(subdir)/$(srcdir) ;; \
-	esac; \
-	for i in $$list; do \
-	  if test -f "$$i"; then \
-	    echo "$(subdir)/$$i"; \
-	  else \
-	    echo "$$sdir/$$i"; \
-	  fi; \
-	done >> $(top_builddir)/cscope.files
-
-distclean-tags:
-	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
-
-distdir: $(DISTFILES)
-	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	list='$(DISTFILES)'; \
-	  dist_files=`for file in $$list; do echo $$file; done | \
-	  sed -e "s|^$$srcdirstrip/||;t" \
-	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
-	case $$dist_files in \
-	  */*) $(MKDIR_P) `echo "$$dist_files" | \
-			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
-			   sort -u` ;; \
-	esac; \
-	for file in $$dist_files; do \
-	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
-	  if test -d $$d/$$file; then \
-	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
-	    if test -d "$(distdir)/$$file"; then \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
-	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
-	  else \
-	    test -f "$(distdir)/$$file" \
-	    || cp -p $$d/$$file "$(distdir)/$$file" \
-	    || exit 1; \
-	  fi; \
-	done
-check-am: all-am
-check: check-am
-all-am: Makefile $(LTLIBRARIES)
-installdirs:
-install: install-am
-install-exec: install-exec-am
-install-data: install-data-am
-uninstall: uninstall-am
-
-install-am: all-am
-	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
-
-installcheck: installcheck-am
-install-strip:
-	if test -z '$(STRIP)'; then \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	      install; \
-	else \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
-	fi
-mostlyclean-generic:
-
-clean-generic:
-
-distclean-generic:
-	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
-
-maintainer-clean-generic:
-	@echo "This command is intended for maintainers to use"
-	@echo "it deletes files that may require special tools to rebuild."
-clean: clean-am
-
-clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
-	mostlyclean-am
-
-distclean: distclean-am
-	-rm -rf ./$(DEPDIR)
-	-rm -f Makefile
-distclean-am: clean-am distclean-compile distclean-generic \
-	distclean-tags
-
-dvi: dvi-am
-
-dvi-am:
-
-html: html-am
-
-html-am:
-
-info: info-am
-
-info-am:
-
-install-data-am:
-
-install-dvi: install-dvi-am
-
-install-dvi-am:
-
-install-exec-am:
-
-install-html: install-html-am
-
-install-html-am:
-
-install-info: install-info-am
-
-install-info-am:
-
-install-man:
-
-install-pdf: install-pdf-am
-
-install-pdf-am:
-
-install-ps: install-ps-am
-
-install-ps-am:
-
-installcheck-am:
-
-maintainer-clean: maintainer-clean-am
-	-rm -rf ./$(DEPDIR)
-	-rm -f Makefile
-maintainer-clean-am: distclean-am maintainer-clean-generic
-
-mostlyclean: mostlyclean-am
-
-mostlyclean-am: mostlyclean-compile mostlyclean-generic \
-	mostlyclean-libtool
-
-pdf: pdf-am
-
-pdf-am:
-
-ps: ps-am
-
-ps-am:
-
-uninstall-am:
-
-.MAKE: install-am install-strip
-
-.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
-	clean-libtool clean-noinstLTLIBRARIES cscopelist-am ctags \
-	ctags-am distclean distclean-compile distclean-generic \
-	distclean-libtool distclean-tags distdir dvi dvi-am html \
-	html-am info info-am install install-am install-data \
-	install-data-am install-dvi install-dvi-am install-exec \
-	install-exec-am install-html install-html-am install-info \
-	install-info-am install-man install-pdf install-pdf-am \
-	install-ps install-ps-am install-strip installcheck \
-	installcheck-am installdirs maintainer-clean \
-	maintainer-clean-generic mostlyclean mostlyclean-compile \
-	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
-	tags tags-am uninstall uninstall-am
-
-.PRECIOUS: Makefile
-
-
-# Tell versions [3.59,3.63) of GNU make to not export all variables.
-# Otherwise a system limit (for SysV at least) may be exceeded.
-.NOEXPORT:
diff --git a/lib/CL/devices/cellspu/cellspu.c b/lib/CL/devices/cellspu/cellspu.c
deleted file mode 100644
index cbb82c2..0000000
--- a/lib/CL/devices/cellspu/cellspu.c
+++ /dev/null
@@ -1,649 +0,0 @@
-/* cellspu.c - a pocl device driver for Cell SPU.
-
-   Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
-   
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-   
-   The above copyright notice and this permission notice shall be included in
-   all copies or substantial portions of the Software.
-   
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-   THE SOFTWARE.
-*/
-
-#include "cellspu.h"
-#include "config.h"
-#include <assert.h>
-#include <string.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <../dev_image.h>
-#include <sys/time.h>
-
-#include <libspe2.h>
-#include "pocl_device.h"
-#include "common.h"
-
-#define max(a,b) (((a) > (b)) ? (a) : (b))
-
-#define COMMAND_LENGTH 2048
-#define WORKGROUP_STRING_LENGTH 1024
-
-//#define DEBUG_CELLSPU_DRIVER
-
-struct data {
-  /* Currently loaded kernel. */
-  cl_kernel current_kernel;
-  /* Loaded kernel dynamic library handle. */
-  lt_dlhandle current_dlhandle;
-};
-
-//TODO: global, or per-device?
-spe_context_ptr_t spe_context;
-//TODO: this certainly should be per-program (per kernel?)
-spe_program_handle_t *hello_spu;
-//TODO: again - not global...
-memory_region_t spe_local_mem;
-
-
-void
-pocl_cellspu_init_device_ops(struct pocl_device_ops *ops)
-{
-  ops->device_name = "cellspu";
-        
-  ops->probe = pocl_basic_probe;
-  ops->init_device_infos = pocl_cellspu_init_device_infos;
-  ops->uninit = pocl_cellspu_uninit;
-  ops->init = pocl_cellspu_init;
-  ops->malloc = pocl_cellspu_malloc;
-  ops->create_sub_buffer = pocl_cellspu_create_sub_buffer;
-  ops->free = pocl_cellspu_free;
-  ops->read = pocl_cellspu_read;
-  ops->read_rect = pocl_cellspu_read_rect;
-  ops->write = pocl_cellspu_write;
-  ops->write_rect = pocl_cellspu_write_rect;
-  ops->copy = pocl_cellspu_copy;
-  ops->copy_rect = pocl_cellspu_copy_rect;
-  ops->map_mem = pocl_cellspu_map_mem;
-  ops->run = pocl_cellspu_run;
-  ops->get_timer_value = pocl_cellspu_get_timer_value;
-}
-
-
-void
-pocl_cellspu_init_device_infos(struct _cl_device_id* dev)
-{
-  dev->type = CL_DEVICE_TYPE_ACCELERATOR;
-  dev->max_compute_units = 1;
-  dev->max_work_item_dimensions = 3;
-  dev->max_work_item_sizes[0] = dev->max_work_item_sizes[1] =
-	  dev->max_work_item_sizes[2] = dev->max_work_group_size = 8192;
-  dev->preferred_wg_size_multiple = 8;
-  dev->preferred_vector_width_char = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR;
-  dev->preferred_vector_width_short = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT;
-  dev->preferred_vector_width_int = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT;
-  dev->preferred_vector_width_long = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_LONG;
-  dev->preferred_vector_width_float = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_FLOAT;
-  dev->preferred_vector_width_double = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE;
-  dev->preferred_vector_width_half = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_HALF;
-  /* TODO: figure out what the difference between preferred and native widths are. */
-  dev->native_vector_width_char = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR;
-  dev->native_vector_width_short = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT;
-  dev->native_vector_width_int = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT;
-  dev->native_vector_width_long = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_LONG;
-  dev->native_vector_width_float = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_FLOAT;
-  dev->native_vector_width_double = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE;
-  dev->native_vector_width_half = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_HALF;
-  dev->max_clock_frequency = 100;
-
-  dev->image_support = CL_FALSE;
-  dev->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN;
-  dev->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN;
-  dev->global_mem_cache_type = CL_NONE;
-  dev->local_mem_type = CL_GLOBAL;
-  dev->error_correction_support = CL_FALSE;
-  dev->host_unified_memory = CL_TRUE;
-  dev->endian_little = CL_FALSE;
-  dev->available = CL_TRUE;
-  dev->compiler_available = CL_TRUE;
-  dev->execution_capabilities = CL_EXEC_KERNEL;
-  dev->queue_properties = CL_QUEUE_PROFILING_ENABLE;
-  dev->vendor = "STI";
-  dev->profile = "EMBEDDED_PROFILE";
-  dev->extensions = "";
-  dev->llvm_target_triplet = "cellspu-v0";
-  dev->llvm_cpu = "cellspu";
-
-  dev->parent_device = NULL;
-  // cellspu does not support partitioning
-  dev->max_sub_devices = 1;
-  dev->num_partition_properties = 1;
-  dev->partition_properties = calloc(dev->num_partition_properties,
-    sizeof(cl_device_partition_property));
-  dev->num_partition_types = 0;
-  dev->partition_type = NULL;
-
-}
-
-void
-pocl_cellspu_init (cl_device_id device, const char* parameters)
-{
-  struct data *d;
-
-  d = (struct data *) malloc (sizeof (struct data));
-  device->data = d;
-  
-  d->current_kernel = NULL;
-  d->current_dlhandle = 0;
-
-  device->global_mem_size = 256*1024;
-  device->max_mem_alloc_size = device->global_mem_size / 2;
-
-  // TODO: find the API docs. what are the params?
-  spe_context = spe_context_create(0,NULL);
-  if (spe_context == NULL) perror("spe_context_create fails");
-  
-  // initialize the SPE local storage allocator. 
-  init_mem_region( &spe_local_mem, CELLSPU_OCL_BUFFERS_START, device->max_mem_alloc_size); 
-
-}
-
-/* 
- * Allocate a chunk for kernel local variables.
- */
-void *
-cellspu_malloc_local (void *device_data, size_t size)
-{
-  struct data* d = (struct data*)device_data;
-  chunk_info_t *chunk = alloc_buffer (&spe_local_mem, size);
-  return (void*) chunk;
-
-}
-void *
-pocl_cellspu_malloc (void *device_data, cl_mem_flags flags,
-		     size_t size, void *host_ptr)
-{
-  void *b;
-  struct data* d = (struct data*)device_data;
-
-  //TODO: unglobalify spe_local_mem
-  chunk_info_t *chunk = alloc_buffer (&spe_local_mem, size);
-  if (chunk == NULL) return NULL;
-
-#ifdef DEBUG_CELLSPU_DRIVER
-  printf("host: malloc %x (host) %x (device) size: %u\n", host_ptr, chunk->start_address, size);
-#endif
-#if 0
-  if ((flags & CL_MEM_COPY_HOST_PTR) ||  
-      ((flags & CL_MEM_USE_HOST_PTR) && host_ptr != NULL))
-    {
-      /* TODO: 
-         CL_MEM_USE_HOST_PTR must synch the buffer after execution 
-         back to the host's memory in case it's used as an output (?). */
-      d->copyHostToDevice(host_ptr, chunk->start_address, size);
-      return (void*) chunk;
-    }
-#endif
-  return (void*) chunk;
-
-}
-
-void
-pocl_cellspu_free (void *data, cl_mem_flags flags, void *ptr)
-{
-  POCL_ABORT_UNIMPLEMENTED();
-
-  if (flags & CL_MEM_USE_HOST_PTR)
-    return;
-  
-  POCL_MEM_FREE(ptr);
-}
-
-void
-pocl_cellspu_read (void *data, void *host_ptr, const void *device_ptr, size_t cb)
-{
-	chunk_info_t *chunk = (chunk_info_t*)device_ptr;
-	assert( chunk->is_allocated  && "cellspu: writing to an ullacoated memory?");
-
-#ifdef DEBUG_CELLSPU_DRIVER
-	printf("cellspu: read %d bytes to %x (host) from %x (device)\n", cb, host_ptr,chunk->start_address);
-#endif
-	void *mmap_base=spe_ls_area_get( spe_context );
-	memcpy( host_ptr, mmap_base+(chunk->start_address), cb);
-
-}
-
-/* write 'bytes' of bytes from *host_a to SPU local storage area. */
-void cellspu_memwrite( void *lsa, const void *host_a, size_t bytes )
-{	
-#ifdef DEBUG_CELLSPU_DRIVER
-	printf("cellspu: write %d bytes from %x (host) to %x (device)\n", bytes, host_a,lsa);
-#endif
-	void *mmap_base=spe_ls_area_get( spe_context );
-	memcpy( (void*)(mmap_base+(int)lsa), (const void*)host_a, bytes);
-}
-
-void
-pocl_cellspu_write (void *data, const void *host_ptr, void *device_ptr, size_t cb)
-{
-	chunk_info_t *chunk = (chunk_info_t*)device_ptr;
-	assert( chunk->is_allocated  && "cellspu: writing to an ullacoated memory?");
-        cellspu_memwrite( (void*)(chunk->start_address), host_ptr, cb );
-}
-
-
-void
-pocl_cellspu_run 
-(void *data, 
- _cl_command_node* cmd)
-{
-  struct data *d;
-  int error;
-  char bytecode[POCL_FILENAME_LENGTH];
-  char assembly[POCL_FILENAME_LENGTH];
-  char module[POCL_FILENAME_LENGTH];
-  char command[COMMAND_LENGTH];
-  char workgroup_string[WORKGROUP_STRING_LENGTH];
-  unsigned device;
-  struct pocl_argument *al;
-  size_t x, y, z;
-  unsigned i;
-  pocl_workgroup w;
-  char* tmpdir = cmd->command.run.tmp_dir;
-  cl_kernel kernel = cmd->command.run.kernel;
-  struct pocl_context *pc = &cmd->command.run.pc;
-  const char* kern_func = kernel->function_name;
-  unsigned int entry = SPE_DEFAULT_ENTRY;
-
-  assert (data != NULL);
-  d = (struct data *) data;
-
-  error = snprintf 
-    (module, POCL_FILENAME_LENGTH,
-     "%s/parallel.so", tmpdir);
-  assert (error >= 0);
-
-  // This is the entry to the kenrel. We currently hard-code it
-  // into the SPU binary. Resulting in only one entry-point per 
-  // SPU image.
-  // TODO: figure out which function to call given what conditions
-  snprintf (workgroup_string, WORKGROUP_STRING_LENGTH,
-            "_pocl_launcher_%s_workgroup_fast", kernel->function_name);
-
-
-  if ( access (module, F_OK) != 0)
-    {
-      char *llvm_ld;
-      error = snprintf (bytecode, POCL_FILENAME_LENGTH,
-                        "%s/linked.bc", tmpdir);
-      assert (error >= 0);
-      
-      if (getenv("POCL_BUILDING") != NULL)
-        llvm_ld = BUILDDIR "/tools/llvm-ld/pocl-llvm-ld";
-      else if (access(PKGLIBEXECDIR "/pocl-llvm-ld", X_OK) == 0)
-        llvm_ld = PKGLIBEXECDIR "/pocl-llvm-ld";
-      else
-        llvm_ld = "pocl-llvm-ld";
-
-      error = snprintf (command, COMMAND_LENGTH,
-			"%s --disable-opt -link-as-library -o %s %s/%s",
-                        llvm_ld, bytecode, tmpdir, POCL_PARALLEL_BC_FILENAME);
-      assert (error >= 0);
-      
-      error = system(command);
-      assert (error == 0);
-      
-      error = snprintf (assembly, POCL_FILENAME_LENGTH,
-			"%s/parallel.s",
-			tmpdir);
-      assert (error >= 0);
-      
-      // "-relocation-model=dynamic-no-pic" is a magic string,
-      // I do not know why it has to be there to produce valid
-      // sos on x86_64
-      error = snprintf (command, COMMAND_LENGTH,
-			LLC " " HOST_LLC_FLAGS " -o %s %s",
-			assembly,
-			bytecode);
-      assert (error >= 0);
-      error = system (command);
-      assert (error == 0);
-           
-
-      // Compile the assembly version of the OCL kernel with the
-      // C wrapper to get a spulet
-      error = snprintf (command, COMMAND_LENGTH,
-			"spu-gcc lib/CL/devices/cellspu/spe_wrap.c -o %s %s "
-			" -Xlinker --defsym -Xlinker _ocl_buffer=%d"
-			" -Xlinker --defsym -Xlinker kernel_command=%d"
-			" -I . -D_KERNEL=%s -std=c99",
-			module,
-			assembly, 
-			CELLSPU_OCL_BUFFERS_START,
-			CELLSPU_KERNEL_CMD_ADDR,
-			workgroup_string);
-      assert (error >= 0);
-#ifdef DEBUG_CELLSPU_DRIVER
-      printf("compiling: %s\n", command); fflush(stdout); 
-#endif
-      error = system (command);
-      assert (error == 0);
-
-    }
-      
-    // Load the SPU with the newly generated binary
-    hello_spu = spe_image_open( (const char*)module );
-    if( spe_program_load( spe_context, hello_spu) )
-        perror("spe_program_load fails");
-    
-//
-//  /* Find which device number within the context correspond
-//     to current device.  */
-//  for (i = 0; i < kernel->context->num_devices; ++i)
-//    {
-//      if (kernel->context->devices[i]->data == data)
-//	{
-//	  device = i;
-//	  break;
-//	}
-//    }
-//
-
-  // This structure gets passed to the device.
-  // It contains all the info needed to run a kernel  
-  __kernel_exec_cmd dev_cmd;
-  dev_cmd.work_dim = cmd->command.run.pc.work_dim;
-  dev_cmd.num_groups[0] = cmd->command.run.pc.num_groups[0];
-  dev_cmd.num_groups[1] = cmd->command.run.pc.num_groups[1];
-  dev_cmd.num_groups[2] = cmd->command.run.pc.num_groups[2];
-
-  dev_cmd.global_offset[0] = cmd->command.run.pc.global_offset[0];
-  dev_cmd.global_offset[1] = cmd->command.run.pc.global_offset[1];
-  dev_cmd.global_offset[2] = cmd->command.run.pc.global_offset[2];
-
-
-  // the code below is lifted from pthreads :) 
-  uint32_t *arguments = dev_cmd.args;
-
-  for (i = 0; i < kernel->num_args; ++i)
-    {
-      al = &(kernel->dyn_arguments[i]);
-      if (kernel->arg_info[i].is_local)
-        {
-          chunk_info_t* local_chunk = cellspu_malloc_local (d, al->size);
-          if (local_chunk == NULL)
-            POCL_ABORT ("Could not allocate memory for a local argument. Out of local mem?\n");
-
-          dev_cmd.args[i] = local_chunk->start_address;
-
-        }
-      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
-        {
-          /* It's legal to pass a NULL pointer to clSetKernelArguments. In 
-             that case we must pass the same NULL forward to the kernel.
-             Otherwise, the user must have created a buffer with per device
-             pointers stored in the cl_mem. */
-          if (al->value == NULL)
-            arguments[i] = (uint32_t)NULL;
-          else
-            arguments[i] = \
-              ((chunk_info_t*)((*(cl_mem *)\
-                (al->value))->device_ptrs[0]))->start_address;
-		//TODO: '0' above is the device number... don't hard-code!
-        }
-      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
-        {
-          POCL_ABORT_UNIMPLEMENTED();
-//          dev_image2d_t di;      
-//          cl_mem mem = *(cl_mem*)al->value;
-//          di.data = &((*(cl_mem *) (al->value))->device_ptrs[device]);
-//          di.data = ((*(cl_mem *) (al->value))->device_ptrs[device]);
-//          di.width = mem->image_width;
-//          di.height = mem->image_height;
-//          di.rowpitch = mem->image_row_pitch;
-//          di.order = mem->image_channel_order;
-//          di.data_type = mem->image_channel_data_type;
-//          void* devptr = pocl_cellspu_malloc(data, 0, sizeof(dev_image2d_t), NULL);
-//          arguments[i] = malloc (sizeof (void *));
-//          *(void **)(arguments[i]) = devptr; 
-//          pocl_cellspu_write (data, &di, devptr, sizeof(dev_image2d_t));
-        }
-      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
-        {
-          POCL_ABORT_UNIMPLEMENTED();
-//          dev_sampler_t ds;
-//          
-//          arguments[i] = malloc (sizeof (void *));
-//          *(void **)(arguments[i]) = pocl_cellspu_malloc(data, 0, sizeof(dev_sampler_t), NULL);
-//          pocl_cellspu_write (data, &ds, *(void**)arguments[i], sizeof(dev_sampler_t));
-        }
-      else
-        {
-          arguments[i] = (uint32_t)al->value;
-        }
-    }
-
-  // allocate memory for kernel local variables
-  for (i = kernel->num_args;
-       i < kernel->num_args + kernel->num_locals;
-       ++i)
-    {
-      al = &(kernel->dyn_arguments[i]);
-      arguments[i] = (uint32_t)malloc (sizeof (void *));
-      *(void **)(arguments[i]) = cellspu_malloc_local(data, al->size);
-    }
-
-  // the main loop on the spe needs an auxiliary struct for to get the 
-  // number of arguments and such. 
-  __kernel_metadata kmd;
-  strncpy( (char*) kmd.name, workgroup_string, sizeof( kmd.name ) );  
-  kmd.num_args = kernel->num_args;
-  kmd.num_locals = kernel->num_locals;
-  // TODO: fill in the rest, if used by the spu main function.
-
-  // TODO malloc_local should be given the 'device data'. as long as teh 
-  // spu context is global this is ok.
-  void *chunk = cellspu_malloc_local( NULL, sizeof(__kernel_metadata) ); 
-  void *kernel_area = (void*)((chunk_info_t*)chunk)->start_address;
-  cellspu_memwrite( kernel_area, &kmd, sizeof(__kernel_metadata) );
-  dev_cmd.kernel = (int) kernel_area;
-  
-  // finish up the command, send it to SPE
-  dev_cmd.status =POCL_KST_READY;
-  cellspu_memwrite( (void*)CELLSPU_KERNEL_CMD_ADDR, &dev_cmd, sizeof(__kernel_exec_cmd) );
-       
-  // Execute code on SPU. This starts with the main() in the spu - see spe_wrap.c
-  if (spe_context_run(spe_context,&entry,0,NULL,NULL,NULL) < 0)
-    perror("context_run error");
-
-//  for (z = 0; z < pc->num_groups[2]; ++z)
-//    {
-//      for (y = 0; y < pc->num_groups[1]; ++y)
-//        {
-//          for (x = 0; x < pc->num_groups[0]; ++x)
-//            {
-//              pc->group_id[0] = x;
-//              pc->group_id[1] = y;
-//              pc->group_id[2] = z;
-//
-//              w (arguments, pc);
-//
-//            }
-//        }
-//    }
-
-
-  // Clean-up ? 
-  for (i = 0; i < kernel->num_args; ++i)
-    {
-      if (kernel->arg_info[i].is_local)
-        pocl_cellspu_free(data, 0, *(void **)(arguments[i]));
-    }
-  for (i = kernel->num_args;
-       i < kernel->num_args + kernel->num_locals;
-       ++i)
-    pocl_cellspu_free(data, 0, *(void **)(arguments[i]));
-}
-
-void
-pocl_cellspu_copy (void *data, const void *src_ptr, void *__restrict__ dst_ptr, size_t cb)
-{
-  POCL_ABORT_UNIMPLEMENTED();
-
-  if (src_ptr == dst_ptr)
-    return;
-  
-  memcpy (dst_ptr, src_ptr, cb);
-}
-
-void
-pocl_cellspu_copy_rect (void *data,
-                      const void *__restrict const src_ptr,
-                      void *__restrict__ const dst_ptr,
-                      const size_t *__restrict__ const src_origin,
-                      const size_t *__restrict__ const dst_origin, 
-                      const size_t *__restrict__ const region,
-                      size_t const src_row_pitch,
-                      size_t const src_slice_pitch,
-                      size_t const dst_row_pitch,
-                      size_t const dst_slice_pitch)
-{
-  char const *__restrict const adjusted_src_ptr = 
-    (char const*)src_ptr +
-    src_origin[0] + src_row_pitch * (src_origin[1] + src_slice_pitch * src_origin[2]);
-  char *__restrict__ const adjusted_dst_ptr = 
-    (char*)dst_ptr +
-    dst_origin[0] + dst_row_pitch * (dst_origin[1] + dst_slice_pitch * dst_origin[2]);
-  
-  size_t j, k;
-  POCL_ABORT_UNIMPLEMENTED();
-
-  /* TODO: handle overlaping regions */
-  
-  for (k = 0; k < region[2]; ++k)
-    for (j = 0; j < region[1]; ++j)
-      memcpy (adjusted_dst_ptr + dst_row_pitch * j + dst_slice_pitch * k,
-              adjusted_src_ptr + src_row_pitch * j + src_slice_pitch * k,
-              region[0]);
-}
-
-void
-pocl_cellspu_write_rect (void *data,
-                       const void *__restrict__ const host_ptr,
-                       void *__restrict__ const device_ptr,
-                       const size_t *__restrict__ const buffer_origin,
-                       const size_t *__restrict__ const host_origin, 
-                       const size_t *__restrict__ const region,
-                       size_t const buffer_row_pitch,
-                       size_t const buffer_slice_pitch,
-                       size_t const host_row_pitch,
-                       size_t const host_slice_pitch)
-{
-  char *__restrict const adjusted_device_ptr = 
-    (char*)device_ptr +
-    buffer_origin[0] + buffer_row_pitch * (buffer_origin[1] + buffer_slice_pitch * buffer_origin[2]);
-  char const *__restrict__ const adjusted_host_ptr = 
-    (char const*)host_ptr +
-    host_origin[0] + host_row_pitch * (host_origin[1] + host_slice_pitch * host_origin[2]);
-  
-  size_t j, k;
-  POCL_ABORT_UNIMPLEMENTED();
-
-  /* TODO: handle overlaping regions */
-  
-  for (k = 0; k < region[2]; ++k)
-    for (j = 0; j < region[1]; ++j)
-      memcpy (adjusted_device_ptr + buffer_row_pitch * j + buffer_slice_pitch * k,
-              adjusted_host_ptr + host_row_pitch * j + host_slice_pitch * k,
-              region[0]);
-}
-
-void
-pocl_cellspu_read_rect (void *data,
-                      void *__restrict__ const host_ptr,
-                      void *__restrict__ const device_ptr,
-                      const size_t *__restrict__ const buffer_origin,
-                      const size_t *__restrict__ const host_origin, 
-                      const size_t *__restrict__ const region,
-                      size_t const buffer_row_pitch,
-                      size_t const buffer_slice_pitch,
-                      size_t const host_row_pitch,
-                      size_t const host_slice_pitch)
-{
-  char const *__restrict const adjusted_device_ptr = 
-    (char const*)device_ptr +
-    buffer_origin[0] + buffer_row_pitch * (buffer_origin[1] + buffer_slice_pitch * buffer_origin[2]);
-  char *__restrict__ const adjusted_host_ptr = 
-    (char*)host_ptr +
-    host_origin[0] + host_row_pitch * (host_origin[1] + host_slice_pitch * host_origin[2]);
-  
-  size_t j, k;
-  POCL_ABORT_UNIMPLEMENTED();
-  
-  /* TODO: handle overlaping regions */
-  
-  for (k = 0; k < region[2]; ++k)
-    for (j = 0; j < region[1]; ++j)
-      memcpy (adjusted_host_ptr + host_row_pitch * j + host_slice_pitch * k,
-              adjusted_device_ptr + buffer_row_pitch * j + buffer_slice_pitch * k,
-              region[0]);
-}
-
-
-void *
-pocl_cellspu_map_mem (void *data, void *buf_ptr, 
-                      size_t offset, size_t size,
-                      void *host_ptr) 
-{
-  /* All global pointers of the pthread/CPU device are in 
-     the host address space already, and up to date. */
-  POCL_ABORT_UNIMPLEMENTED();
-
-  if (host_ptr != NULL) return host_ptr;
-  return buf_ptr + offset;
-}
-
-void
-pocl_cellspu_uninit (cl_device_id device)
-{
-  struct data *d = (struct data*)device->data;
-  POCL_ABORT_UNIMPLEMENTED();
-
-  POCL_MEM_FREE(d);
-  device->data = NULL;
-}
-
-cl_ulong
-pocl_cellspu_get_timer_value (void *data) 
-{
-  POCL_ABORT_UNIMPLEMENTED();
-
-  struct timeval current;
-  gettimeofday(&current, NULL);  
-  return (current.tv_sec * 1000000 + current.tv_usec)*1000;
-}
-
-int 
-pocl_cellspu_build_program (void *data, const char *source_fn, const char *binary_fn, 
-			    const char *default_cmd, const char *user_opts, const char *dev_tmpdir) 
-{
-  POCL_ABORT_UNIMPLEMENTED();
-
-}
-
-void *
-pocl_cellspu_create_sub_buffer (void *device_data, void* buffer, size_t origin, size_t size)
-{
-  POCL_ABORT_UNIMPLEMENTED();
-  return NULL;
-}
diff --git a/lib/CL/devices/cellspu/cellspu.h b/lib/CL/devices/cellspu/cellspu.h
deleted file mode 100644
index d7f6d38..0000000
--- a/lib/CL/devices/cellspu/cellspu.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* cellspu.h - a pocl device driver for Cell SPU.
-
-   Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
-   
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-   
-   The above copyright notice and this permission notice shall be included in
-   all copies or substantial portions of the Software.
-   
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-   THE SOFTWARE.
-*/
-
-#ifndef POCL_CELLSPU_H
-#define POCL_CELLSPU_H
-
-#include "pocl_cl.h"
-#include "pocl_icd.h"
-#include "bufalloc.h"
-
-#include "prototypes.inc"
-
-/* simplistic linker script: 
- * this is the SPU local address where 'OpenCL global' memory starts.
- * (if we merge the spus to a single device, this is the 'OpenCL local' memory
- * 
- * The idea is to allocate
- * 64k (0-64k) for text.
- * 128k (64k-192k) for Opencl local memory.
- * 64k (192k-256k) for stack + heap (if any)
- * 
- * I was unable to place the stack to start at 0x20000, thus the "unclean" division.
- */
-#define CELLSPU_OCL_BUFFERS_START 0x10000
-#define CELLSPU_OCL_BUFFERS_SIZE  0x20000
-#define CELLSPU_KERNEL_CMD_ADDR   0x30000
-//#define CELLSPU_OCL_KERNEL_ADDRESS 0x2000
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-GEN_PROTOTYPES (cellspu)
-GEN_PROTOTYPES (basic)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* POCL_CELLSPU_H */
diff --git a/lib/CL/devices/common.c b/lib/CL/devices/common.c
index c598b09..18a431e 100644
--- a/lib/CL/devices/common.c
+++ b/lib/CL/devices/common.c
@@ -28,6 +28,8 @@
 #include <string.h>
 
 #ifndef _MSC_VER
+#  include <sys/time.h>
+#  include <sys/resource.h>
 #  include <unistd.h>
 #else
 #  include "vccompat.hpp"
@@ -42,6 +44,7 @@
 #include "pocl_mem_management.h"
 #include "pocl_runtime_config.h"
 #include "pocl_llvm.h"
+#include "pocl_debug.h"
 
 #include "_kernel_constants.h"
 
@@ -80,6 +83,9 @@ llvm_codegen (const char* tmpdir, cl_kernel kernel, cl_device_id device) {
   if (pocl_exists(module))
     return module;
 
+  void* write_lock = pocl_cache_acquire_writer_lock(kernel->program, device);
+  assert(write_lock);
+
       error = snprintf (bytecode, POCL_FILENAME_LENGTH,
                         "%s%s", tmpdir, POCL_PARALLEL_BC_FILENAME);
       assert (error >= 0);
@@ -108,6 +114,8 @@ llvm_codegen (const char* tmpdir, cl_kernel kernel, cl_device_id device) {
           pocl_remove(bytecode);
         }
 
+  pocl_cache_release_lock(write_lock);
+
   return module;
 }
 
@@ -181,3 +189,151 @@ pocl_memalign_alloc(size_t align_width, size_t size)
 }
 
 
+#define MIN_MAX_MEM_ALLOC_SIZE (128*1024*1024)
+
+/* accounting object for the main memory */
+static pocl_global_mem_t system_memory;
+
+void pocl_setup_device_for_system_memory(cl_device_id device)
+{
+  /* set up the shared system memory limits, on the first call only */
+  if (system_memory.total_alloc_limit == 0)
+  {
+      /* global_mem_size contains the entire memory size, and we need to
+       * leave some available for OS & other programs: this sets it to
+       * 3/4 for systems with <=7gig mem, for >7 it sets to (total-2gigs).
+       * The 2gig constant is built in size_t; (1 << 31) overflows an int.
+       */
+      size_t alloc_limit = device->global_mem_size;
+      if ((alloc_limit >> 20) > (7 << 10))
+        system_memory.total_alloc_limit = alloc_limit - ((size_t)1 << 31);
+      else
+        {
+          size_t temp = (alloc_limit >> 2);
+          system_memory.total_alloc_limit = alloc_limit - temp;
+        }
+
+      system_memory.max_ever_allocated =
+          system_memory.currently_allocated = 0;
+  }
+
+  device->global_mem_size = system_memory.total_alloc_limit;
+  if (device->global_mem_size < MIN_MAX_MEM_ALLOC_SIZE)
+    POCL_ABORT("Not enough memory to run on this device.\n");
+
+  /* Maximum allocation size: we don't have hardware limits, so we
+   * can potentially allocate the whole memory for a single buffer, unless
+   * of course there are limits set at the operating system level. Of course
+   * we still have to respect the OpenCL-commanded minimum */
+  size_t alloc_limit = SIZE_MAX;
+
+#ifndef _MSC_VER
+  // TODO getrlimit equivalent under Windows
+  struct rlimit limits;
+  int ret = getrlimit(RLIMIT_DATA, &limits);
+  if (ret == 0)
+    alloc_limit = limits.rlim_cur;
+  else
+#endif
+    alloc_limit = MIN_MAX_MEM_ALLOC_SIZE;
+
+  if (alloc_limit > device->global_mem_size)
+    alloc_limit = device->global_mem_size;
+
+  if (alloc_limit < MIN_MAX_MEM_ALLOC_SIZE)
+    alloc_limit = MIN_MAX_MEM_ALLOC_SIZE;
+
+  // set up device properties..
+  device->global_memory = &system_memory;
+  device->max_mem_alloc_size = alloc_limit;
+
+  // TODO in theory now if alloc_limit was > rlim_cur and < rlim_max
+  // we should try and setrlimit to alloc_limit, or allocations might fail
+}
+
+
+/* set maximum allocation sizes for buffers and images */
+void
+pocl_set_buffer_image_limits(cl_device_id device)
+{
+  pocl_setup_device_for_system_memory(device);
+  /* these aren't set up in pocl_setup_device_for_system_memory,
+   * because some devices (HSA) set them up themselves */
+  device->local_mem_size = device->max_constant_buffer_size =
+      device->max_mem_alloc_size;
+
+  /* We don't have hardware limitations on the buffer-backed image sizes,
+   * so we set the maximum size in terms of the maximum amount of pixels
+   * that fit in max_mem_alloc_size. A single pixel can take up to 4 32-bit channels,
+   * i.e. 16 bytes.
+   */
+  size_t max_pixels = device->max_mem_alloc_size/16;
+  if (max_pixels > device->image_max_buffer_size)
+    device->image_max_buffer_size = max_pixels;
+
+  /* Similarly, we can take the 2D image size limit to be the largest power of 2
+   * whose square fits in image_max_buffer_size; since the 2D image size limit
+   * starts at a power of 2, it's a simple matter of doubling.
+   * This is actually completely arbitrary, another equally valid option
+   * would be to have each maximum dimension match the image_max_buffer_size.
+   */
+  max_pixels = device->image2d_max_width;
+  // keep doubling until we go over
+  while (max_pixels <= device->image_max_buffer_size/max_pixels)
+    max_pixels *= 2;
+  // halve before assignment
+  max_pixels /= 2;
+  if (max_pixels > device->image2d_max_width)
+    device->image2d_max_width = device->image2d_max_height = max_pixels;
+
+  /* Same thing for 3D images, of course with cubes. Again, totally arbitrary. */
+  max_pixels = device->image3d_max_width;
+  // keep doubling until we go over
+  while (max_pixels*max_pixels <= device->image_max_buffer_size/max_pixels)
+    max_pixels *= 2;
+  // halve before assignment
+  max_pixels /= 2;
+  if (max_pixels > device->image3d_max_width)
+  device->image3d_max_width = device->image3d_max_height =
+    device->image3d_max_depth = max_pixels;
+
+}
+
+void* pocl_memalign_alloc_global_mem(cl_device_id device, size_t align, size_t size)
+{
+  pocl_global_mem_t *mem = device->global_memory;
+  if ((mem->total_alloc_limit - mem->currently_allocated) < size)
+    return NULL;
+
+  void* ptr = pocl_memalign_alloc(align, size);
+  if (!ptr)
+    return NULL;
+
+  mem->currently_allocated += size;
+  if (mem->max_ever_allocated < mem->currently_allocated)
+    mem->max_ever_allocated = mem->currently_allocated;
+
+  assert(mem->currently_allocated <= mem->total_alloc_limit);
+  return ptr;
+}
+
+void pocl_free_global_mem(cl_device_id device, void* ptr, size_t size)
+{
+  pocl_global_mem_t *mem = device->global_memory;
+
+  assert(mem->currently_allocated >= size);
+  mem->currently_allocated -= size;
+
+  POCL_MEM_FREE(ptr);
+}
+
+void pocl_print_system_memory_stats()
+{
+  POCL_MSG_PRINT("MEM STATS:\n", "",
+  "____ Total available system memory  : %10zu KB\n"
+  " ____ Currently used system memory   : %10zu KB\n"
+  " ____ Max used system memory         : %10zu KB\n",
+  system_memory.total_alloc_limit >> 10,
+  system_memory.currently_allocated >> 10,
+  system_memory.max_ever_allocated >> 10);
+}
diff --git a/lib/CL/devices/common.h b/lib/CL/devices/common.h
index 02bd7a6..73f01c2 100644
--- a/lib/CL/devices/common.h
+++ b/lib/CL/devices/common.h
@@ -28,6 +28,16 @@
 #include "pocl_cl.h"
 #include "dev_image.h"
 
+#define XSETUP_DEVICE_CL_VERSION(A, B)             \
+  dev->cl_version_major = A;                      \
+  dev->cl_version_minor = B;                      \
+  dev->cl_version_int = (A * 100) + (B * 10);     \
+  dev->cl_version_std = "CL" # A "." # B;         \
+  dev->version = "OpenCL " # A "." # B " pocl";
+
+#define SETUP_DEVICE_CL_VERSION(a, b) XSETUP_DEVICE_CL_VERSION(a, b)
+
+
 /* Determine preferred vector sizes */
 #if defined(__AVX__)
 #  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR   16
@@ -84,4 +94,15 @@ void fill_dev_sampler_t (dev_sampler_t *ds, struct pocl_argument *parg);
 
 void* pocl_memalign_alloc(size_t align_width, size_t size);
 
+
+void pocl_setup_device_for_system_memory(cl_device_id device);
+
+void pocl_set_buffer_image_limits(cl_device_id device);
+
+void* pocl_memalign_alloc_global_mem(cl_device_id device, size_t align, size_t size);
+
+void pocl_free_global_mem(cl_device_id device, void *ptr, size_t size);
+
+void pocl_print_system_memory_stats();
+
 #endif
diff --git a/lib/CL/devices/devices.c b/lib/CL/devices/devices.c
index 9e902a2..aa2781f 100644
--- a/lib/CL/devices/devices.c
+++ b/lib/CL/devices/devices.c
@@ -40,10 +40,6 @@
 #include "pocl_cache.h"
 #include "pocl_queue_util.h"
 
-#if defined(BUILD_SPU)
-#include "cellspu/cellspu.h"
-#endif
-
 #if defined(TCE_AVAILABLE)
 #include "tce/ttasim/ttasim.h"
 #endif
@@ -64,9 +60,6 @@ typedef void (*init_device_ops)(struct pocl_device_ops*);
 static init_device_ops pocl_devices_init_ops[] = {
   pocl_pthread_init_device_ops,
   pocl_basic_init_device_ops,
-#if defined(BUILD_SPU)
-  pocl_cellspu_init_device_ops,
-#endif
 #if defined(TCE_AVAILABLE)
   pocl_ttasim_init_device_ops,
 #endif
@@ -151,7 +144,7 @@ pocl_device_common_init(struct _cl_device_id* dev)
   POCL_INIT_OBJECT(dev);
   dev->driver_version = PACKAGE_VERSION;
   if(dev->version == NULL)
-    dev->version = "OpenCL 1.2 pocl";
+    dev->version = "OpenCL 2.0 pocl";
 
   dev->short_name = strdup(dev->ops->device_name);
   if(dev->long_name == NULL)
@@ -217,6 +210,7 @@ pocl_init_devices()
    * everytime we use the debug macros */
 #ifdef POCL_DEBUG_MESSAGES
   pocl_debug_messages = pocl_get_bool_option("POCL_DEBUG", 0);
+  stderr_is_a_tty = isatty(fileno(stderr));
 #endif
 
   pocl_cache_init_topdir();
diff --git a/lib/CL/devices/hsa/Makefile.in b/lib/CL/devices/hsa/Makefile.in
index 6f5b285..cc0bf93 100644
--- a/lib/CL/devices/hsa/Makefile.in
+++ b/lib/CL/devices/hsa/Makefile.in
@@ -246,6 +246,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -253,6 +254,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -270,8 +272,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -347,6 +347,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/lib/CL/devices/hsa/pocl-hsa.c b/lib/CL/devices/hsa/pocl-hsa.c
index f6173ca..b8ba4dc 100644
--- a/lib/CL/devices/hsa/pocl-hsa.c
+++ b/lib/CL/devices/hsa/pocl-hsa.c
@@ -49,11 +49,25 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
  */
 
+#ifndef _BSD_SOURCE
+#define _BSD_SOURCE
+#endif
+
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE
+#endif
+
+
 #include "hsa.h"
 #include "hsa_ext_finalize.h"
-#include "hsa_ext_amd.h"
 #include "hsa_ext_image.h"
 
+#include "config.h"
+
+#ifdef HAVE_HSA_EXT_AMD_H
+#include "hsa_ext_amd.h"
+#endif
+
 #include "pocl-hsa.h"
 #include "common.h"
 #include "devices.h"
@@ -69,6 +83,7 @@
 #ifndef _MSC_VER
 #  include <sys/wait.h>
 #  include <sys/time.h>
+#  include <sys/types.h>
 #  include <unistd.h>
 #else
 #  include "vccompat.hpp"
@@ -121,6 +136,10 @@ typedef struct pocl_hsa_device_data_s {
   /* Per-program data cache to simplify program compiling stage */
   pocl_hsa_kernel_cache_t kernel_cache[HSA_KERNEL_CACHE_SIZE];
   unsigned kernel_cache_lastptr;
+  /* kernel signal wait timeout hint, in HSA runtime units */
+  uint64_t timeout;
+  /* length of a timestamp unit expressed in nanoseconds */
+  double timestamp_unit;
 } pocl_hsa_device_data_t;
 
 struct pocl_supported_hsa_device_properties
@@ -159,9 +178,9 @@ pocl_hsa_init_device_ops(struct pocl_device_ops *ops)
   ops->read_rect = pocl_basic_read_rect;
   ops->write = pocl_basic_write;
   ops->write_rect = pocl_basic_write_rect;
-  ops->copy = pocl_basic_copy;
+  ops->copy = pocl_hsa_copy;
   ops->copy_rect = pocl_basic_copy_rect;
-  ops->get_timer_value = pocl_basic_get_timer_value;
+  ops->get_timer_value = pocl_hsa_get_timer_value;
 }
 
 #define MAX_HSA_AGENTS 16
@@ -176,7 +195,7 @@ static void pocl_hsa_abort_on_error(hsa_status_t status,
     {
       hsa_status_string(status, &str);
       POCL_MSG_PRINT2(func, line, "Error from HSA Runtime call:\n");
-      POCL_ABORT(str);
+      POCL_ABORT("%s", str);
     }
 }
 
@@ -249,16 +268,10 @@ supported_hsa_devices[MAX_HSA_AGENTS] =
         .has_64bit_long = 1,
         .vendor_id = 0x1002,
         .global_mem_cache_type = CL_READ_WRITE_CACHE,
-	.global_mem_cacheline_size = 64,
-	.max_compute_units = 8,
-	.max_clock_frequency = 720,
-	.max_constant_buffer_size = 65536,
+        .max_constant_buffer_size = 65536,
     .local_mem_type = CL_LOCAL,
     .endian_little = CL_TRUE,
-    .extensions = "cl_khr_fp64 cl_khr_byte_addressable_store"
-      " cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics"
-      " cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics"
-      " cl_khr_int64_base_atomics cl_khr_int64_extended_atomics",
+    .extensions = HSA_DEVICE_EXTENSIONS,
     .preferred_wg_size_multiple = 64, // wavefront size on Kaveri
     .preferred_vector_width_char = 4,
     .preferred_vector_width_short = 2,
@@ -299,9 +312,6 @@ get_hsa_device_features(char* dev_name, struct _cl_device_id* dev)
           COPY_ATTR (has_64bit_long);
           COPY_ATTR (vendor_id);
           COPY_ATTR (global_mem_cache_type);
-          COPY_ATTR (global_mem_cacheline_size);
-          COPY_ATTR (max_compute_units);
-          COPY_ATTR (max_clock_frequency);
           COPY_ATTR (max_constant_buffer_size);
           COPY_ATTR (local_mem_type);
           COPY_ATTR (endian_little);
@@ -331,6 +341,8 @@ pocl_hsa_init_device_infos(struct _cl_device_id* dev)
 {
   pocl_basic_init_device_infos (dev);
 
+  SETUP_DEVICE_CL_VERSION(HSA_DEVICE_CL_VERSION_MAJOR, HSA_DEVICE_CL_VERSION_MINOR)
+
   dev->spmd = CL_TRUE;
   dev->autolocals_to_args = 0;
 
@@ -368,8 +380,26 @@ pocl_hsa_init_device_infos(struct _cl_device_id* dev)
   dev->max_work_item_sizes[1] = wg_sizes[1];
   dev->max_work_item_sizes[2] = wg_sizes[2];
 
+#ifdef HAVE_HSA_EXT_AMD_H
+  uint32_t temp;
+  HSA_CHECK(hsa_agent_get_info(agent, HSA_AMD_AGENT_INFO_CACHELINE_SIZE, &temp));
+  dev->global_mem_cacheline_size = temp;
+
+  HSA_CHECK(hsa_agent_get_info(agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &temp));
+  dev->max_compute_units = temp;
+
+  HSA_CHECK(hsa_agent_get_info(agent, HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &temp));
+  dev->max_clock_frequency = temp;
+#else
+#warning "Could not use AMD headers to find out CU/frequency of your device. Using some default values which are probably wrong..."
+  dev->global_mem_cacheline_size = 64;
+  dev->max_compute_units = 4;
+  dev->max_clock_frequency = 700;
+#endif
+
   HSA_CHECK(hsa_agent_get_info
     (agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &dev->max_work_group_size));
+
   /*Image features*/
   hsa_dim3_t image_size;
   HSA_CHECK(hsa_agent_get_info (agent, HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS, &image_size));
@@ -389,6 +419,24 @@ pocl_hsa_init_device_infos(struct _cl_device_id* dev)
     (agent, HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES, &dev->max_write_image_args));
   HSA_CHECK(hsa_agent_get_info
     (agent, HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS, &dev->max_samplers));
+
+  dev->should_allocate_svm = 1;
+  /* OpenCL 2.0 properties */
+  dev->svm_caps = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER
+                  | CL_DEVICE_SVM_FINE_GRAIN_BUFFER
+                  | CL_DEVICE_SVM_ATOMICS;
+  /* This is from clinfo output ran on AMD Catalyst drivers */
+  dev->max_events = 1024;
+  dev->max_queues = 1;
+  dev->max_pipe_args = 16;
+  dev->max_pipe_active_res = 16;
+  dev->max_pipe_packet_size = 1024 * 1024;
+  dev->dev_queue_pref_size = 256 * 1024;
+  dev->dev_queue_max_size = 512 * 1024;
+  dev->on_dev_queue_props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+                               | CL_QUEUE_PROFILING_ENABLE;
+  dev->on_host_queue_props = CL_QUEUE_PROFILING_ENABLE;
+
 }
 
 unsigned int
@@ -423,7 +471,6 @@ pocl_hsa_init (cl_device_id device, const char* parameters)
   pocl_hsa_device_data_t *d;
   static int global_mem_id;
   static int first_hsa_init = 1;
-  hsa_device_type_t dev_type;
 
   if (first_hsa_init)
     {
@@ -445,6 +492,13 @@ pocl_hsa_init (cl_device_id device, const char* parameters)
                                HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED, &boolarg));
   assert(boolarg != 0);
 
+#ifdef HAVE_HSA_EXT_AMD_H
+  char booltest = 0;
+  HSA_CHECK(hsa_region_get_info(d->global_region,
+                               HSA_AMD_REGION_INFO_HOST_ACCESSIBLE, &booltest));
+  assert(booltest != 0);
+#endif
+
   size_t sizearg;
   HSA_CHECK(hsa_region_get_info(d->global_region,
                                HSA_REGION_INFO_ALLOC_MAX_SIZE, &sizearg));
@@ -452,10 +506,12 @@ pocl_hsa_init (cl_device_id device, const char* parameters)
 
   /* For some reason, the global region size returned is 128 Terabytes...
    * for now, use the max alloc size, it seems to be a much more reasonable value.
-  HSA_CHECK(hsa_region_get_info(d->global_region, HSA_REGION_INFO_SIZE, &sizearg));
-  */
+   * HSA_CHECK(hsa_region_get_info(d->global_region, HSA_REGION_INFO_SIZE, &sizearg));
+   */
   device->global_mem_size = sizearg;
 
+  pocl_setup_device_for_system_memory(device);
+
   HSA_CHECK(hsa_region_get_info(d->group_region, HSA_REGION_INFO_SIZE, &sizearg));
   device->local_mem_size = sizearg;
 
@@ -467,27 +523,63 @@ pocl_hsa_init (cl_device_id device, const char* parameters)
   device->profile = (
       (d->agent_profile == HSA_PROFILE_FULL) ? "FULL_PROFILE" : "EMBEDDED_PROFILE");
 
+  uint64_t hsa_freq;
+  HSA_CHECK(hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &hsa_freq));
+  d->timeout = (uint64_t)((double)hsa_freq * 0.008);
+  d->timestamp_unit = (1000000000.0 / (double)hsa_freq);
+  POCL_MSG_PRINT_INFO("HSA timestamp frequency: %" PRIu64 "\n", hsa_freq);
+  POCL_MSG_PRINT_INFO("HSA timeout: %" PRIu64 "\n", d->timeout);
+  POCL_MSG_PRINT_INFO("HSA timestamp unit: %g\n", d->timestamp_unit);
+  /* Nanoseconds per tick, truncated; fall back to 1 when a tick is shorter than 1 ns. */
+  device->profiling_timer_resolution = (d->timestamp_unit >= 1.0) ? (size_t)d->timestamp_unit : 1;
+
   HSA_CHECK(hsa_queue_create(*d->agent, 4, HSA_QUEUE_TYPE_MULTI,
                        hsa_queue_callback, device->short_name,
                        -1, -1, &d->queue));
+
+}
+
+/* Allocates `size` bytes from HSA region `r`, charging them to `mem`.
+ * Returns NULL when the allocation limit would be exceeded or HSA fails. */
+static void* pocl_hsa_malloc_account(pocl_global_mem_t *mem, size_t size, hsa_region_t r)
+{
+  void *buf = NULL;
+  if (size > (mem->total_alloc_limit - mem->currently_allocated))
+    return NULL;
+  if (hsa_memory_allocate(r, size, &buf) != HSA_STATUS_SUCCESS)
+    return NULL;
+
+  mem->currently_allocated += size;
+  if (mem->currently_allocated > mem->max_ever_allocated)
+    mem->max_ever_allocated = mem->currently_allocated;
+  assert(mem->currently_allocated <= mem->total_alloc_limit);
+
+  if (buf != NULL)
+    POCL_MSG_PRINT_INFO("HSA malloc'ed : size %" PRIuS "\n", size);
+  return buf;
+}
 
 static void *
-pocl_hsa_malloc (pocl_hsa_device_data_t* d, cl_mem_flags flags, size_t size, void *host_ptr)
+pocl_hsa_malloc (cl_device_id device, cl_mem_flags flags, size_t size, void *host_ptr)
 {
-  void *b;
+  pocl_hsa_device_data_t* d = device->data;
+  void *b = NULL;
+  pocl_global_mem_t *mem = device->global_memory;
 
   if (flags & CL_MEM_COPY_HOST_PTR)
     {
+      POCL_MSG_PRINT_INFO("HSA: hsa_memory_allocate + hsa_memory_copy (CL_MEM_COPY_HOST_PTR)\n");
       assert(host_ptr != NULL);
-      if (hsa_memory_allocate(d->global_region, size, &b) != HSA_STATUS_SUCCESS)
-          return NULL;
-      hsa_memory_copy(b, host_ptr, size);
+
+      b = pocl_hsa_malloc_account(mem, size, d->global_region);
+      if (b)
+        hsa_memory_copy(b, host_ptr, size);
       return b;
     }
 
   if (flags & CL_MEM_USE_HOST_PTR)
     {
+      POCL_MSG_PRINT_INFO("HSA: hsa_memory_register (CL_MEM_USE_HOST_PTR)\n");
       assert(host_ptr != NULL);
       // TODO bookkeeping of mem registrations
       hsa_memory_register(host_ptr, size);
@@ -495,18 +587,34 @@ pocl_hsa_malloc (pocl_hsa_device_data_t* d, cl_mem_flags flags, size_t size, voi
     }
 
   assert(host_ptr == NULL);
-  if (hsa_memory_allocate(d->global_region, size, &b) != HSA_STATUS_SUCCESS)
-      return NULL;
-  return b;
+  //POCL_MSG_PRINT_INFO("HSA: hsa_memory_allocate (ALLOC_HOST_PTR)\n");
+  return pocl_hsa_malloc_account(mem, size, d->global_region);
 }
 
 void
-pocl_hsa_free (void *data, cl_mem_flags flags, void *ptr)
+pocl_hsa_free (cl_device_id device, cl_mem memobj)
 {
+  cl_mem_flags flags = memobj->flags;
+  void* ptr = memobj->device_ptrs[device->dev_id].mem_ptr;
+  size_t size = memobj->size;
+
   if (flags & CL_MEM_USE_HOST_PTR)
-    return; // TODO: hsa_memory_deregister() (needs size)
+    hsa_memory_deregister(ptr, size);
+  else
+    {
+      pocl_global_mem_t *mem = device->global_memory;
+      assert(mem->currently_allocated >= size);
+      mem->currently_allocated -= size;
+      hsa_memory_free(ptr);
+    }
+}
 
-  hsa_memory_free(ptr);
+/* Copies cb bytes from src_ptr to dst_ptr with hsa_memory_copy; both offsets must be 0. */
+void pocl_hsa_copy (void *data, const void *src_ptr, size_t src_offset,
+               void *__restrict__ dst_ptr, size_t dst_offset, size_t cb)
+{
+  assert(src_offset == 0 && dst_offset == 0);
+  HSA_CHECK(hsa_memory_copy(dst_ptr, src_ptr, cb));
 }
 
 cl_int pocl_hsa_alloc_mem_obj(cl_device_id device, cl_mem mem_obj)
@@ -517,7 +625,7 @@ cl_int pocl_hsa_alloc_mem_obj(cl_device_id device, cl_mem mem_obj)
   /* if memory for this global memory is not yet allocated -> do it */
   if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
     {
-      b = pocl_hsa_malloc(device->data, flags, mem_obj->size, mem_obj->mem_host_ptr);
+      b = pocl_hsa_malloc(device, flags, mem_obj->size, mem_obj->mem_host_ptr);
       if (b == NULL)
         return CL_MEM_OBJECT_ALLOCATION_FAILURE;
 
@@ -552,7 +660,8 @@ setup_kernel_args (pocl_hsa_device_data_t *d,
     if (unaligned > 0) write_pos += (DSIZE - unaligned);     \
   } while (0)
 
-  for (size_t i = 0; i < cmd->command.run.kernel->num_args; ++i)
+  size_t i;
+  for (i = 0; i < cmd->command.run.kernel->num_args; ++i)
     {
       struct pocl_argument *al = &(cmd->command.run.arguments[i]);
       if (cmd->command.run.kernel->arg_info[i].is_local)
@@ -575,8 +684,12 @@ setup_kernel_args (pocl_hsa_device_data_t *d,
             }
           else
             {
-        	  uint64_t temp = (uint64_t)(*(cl_mem *)
-				  (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr;
+              cl_mem m = *(cl_mem *)al->value;
+              uint64_t temp = 0;
+              if (m->device_ptrs)
+                temp = (uint64_t)m->device_ptrs[cmd->device->dev_id].mem_ptr;
+              else
+                temp = (uint64_t)m->mem_host_ptr;
               memcpy (write_pos, &temp, sizeof(uint64_t));
             }
           write_pos += sizeof(uint64_t);
@@ -641,7 +754,8 @@ static pocl_hsa_kernel_cache_t* cache_kernel_dispatch_data(cl_kernel kernel,
   assert(stack_cache != NULL);
   assert(d != NULL);
 
-  for (unsigned i = 0; i<HSA_KERNEL_CACHE_SIZE; i++)
+  unsigned i;
+  for (i = 0; i<HSA_KERNEL_CACHE_SIZE; i++)
     {
       if (d->kernel_cache[i].kernel == kernel)
         return &d->kernel_cache[i];
@@ -708,8 +822,6 @@ pocl_hsa_run(void *dptr, _cl_command_node* cmd)
   cl_kernel kernel = cmd->command.run.kernel;
   struct pocl_context *pc = &cmd->command.run.pc;
   hsa_kernel_dispatch_packet_t *kernel_packet;
-  hsa_signal_t kernel_completion_signal;
-  hsa_region_t region;
   pocl_hsa_kernel_cache_t stack_cache, *cached_data;
 
   assert (dptr != NULL);
@@ -726,6 +838,9 @@ pocl_hsa_run(void *dptr, _cl_command_node* cmd)
 
   const uint32_t queueMask = d->queue->size - 1;
 
+  /* Launch the kernel by allocating a slot in the queue, writing the
+     command to it, signaling the update with a door bell and finally,
+     block waiting until finish signalled with the completion_signal. */
   uint64_t queue_index =
     hsa_queue_load_write_index_relaxed (d->queue);
   kernel_packet =
@@ -773,26 +888,31 @@ pocl_hsa_run(void *dptr, _cl_command_node* cmd)
   } hsa_header_union_t;
 
   hsa_header_union_t h;
+  h.a.header = (uint16_t)HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+  h.a.header |= (uint16_t)HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+  h.a.header |= (uint16_t)HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
   h.a.setup = (uint16_t)cmd->command.run.pc.work_dim << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-  h.a.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-  h.a.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
-  h.a.header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
   __atomic_store_n((uint32_t*)(&kernel_packet->header), h.header_setup, __ATOMIC_RELEASE);
 
-   /*
-    * Increment the write index and ring the doorbell to dispatch the kernel.
-    */
-   hsa_queue_store_write_index_relaxed (d->queue, queue_index + 1);
-   hsa_signal_store_relaxed (d->queue->doorbell_signal, queue_index);
-
-  /* Launch the kernel by allocating a slot in the queue, writing the
-     command to it, signaling the update with a door bell and finally,
-     block waiting until finish signalled with the completion_signal. */
+  /*
+   * Increment the write index and ring the doorbell to dispatch the kernel.
+   */
+  hsa_queue_store_write_index_relaxed (d->queue, queue_index + 1);
+  hsa_signal_store_relaxed (d->queue->doorbell_signal, queue_index);
 
+  /* Wait the first interval actively (should improve latency a bit for small jobs);
+   * if a longer wait is required, use a blocking wait */
   hsa_signal_value_t sigval =
     hsa_signal_wait_acquire
     (cached_data->kernel_completion_signal, HSA_SIGNAL_CONDITION_LT, 1,
-     (uint64_t)(-1), HSA_WAIT_STATE_ACTIVE);
+     d->timeout, HSA_WAIT_STATE_ACTIVE);
+
+  while (sigval > 0)
+    {
+      sigval = hsa_signal_wait_acquire
+        (cached_data->kernel_completion_signal, HSA_SIGNAL_CONDITION_LT, 1,
+         d->timeout, HSA_WAIT_STATE_BLOCKED);
+    }
 
   /* if the cache is full, release stuff */
   if (cached_data == &stack_cache)
@@ -839,11 +959,44 @@ pocl_hsa_run(void *dptr, _cl_command_node* cmd)
     }
 }
 
+/*
+ * Runs args[0] with the NULL-terminated vector args and waits for it.
+ * Returns the child's exit status (127 if execv() failed), -1 if no child
+ * could be created, or -2 if the child did not exit normally.
+ * Replaces system(), which was causing issues (gpu lockups) when compiling
+ * code (via compile_parallel_bc_to_brig) with OpenCL 2.0 atomics (like
+ * CalcPie from AMD SDK). The reason of lockups is unknown (yet).
+ */
+static int run_command(char* args[])
+{
+  POCL_MSG_PRINT_INFO("Launching: %s", args[0]);
+#ifdef HAVE_VFORK
+  pid_t p = vfork();
+#elif defined(HAVE_FORK)
+  pid_t p = fork();
+#else
+#error Must have fork() or vfork() system calls for HSA
+#endif
+  if (p < 0)
+    return -1;
+  if (p == 0)
+    {
+      /* Child: never return from here (undefined after vfork()). */
+      execv(args[0], args);
+      _exit(127);
+    }
+  int status;
+  if (waitpid(p, &status, 0) < 0)
+    POCL_ABORT("pocl-hsa: waitpid() itself failed.\n");
+  if (WIFEXITED(status))
+    return WEXITSTATUS(status);
+  return -2;
+}
+
 static int compile_parallel_bc_to_brig(const char* tmpdir, char* brigfile) {
   int error;
   char hsailfile[POCL_FILENAME_LENGTH];
   char bytecode[POCL_FILENAME_LENGTH];
-  char command[4096];
 
   error = snprintf (bytecode, POCL_FILENAME_LENGTH,
                     "%s%s", tmpdir, POCL_PARALLEL_BC_FILENAME);
@@ -857,30 +1010,26 @@ static int compile_parallel_bc_to_brig(const char* tmpdir, char* brigfile) {
     POCL_MSG_PRINT_INFO("pocl-hsa: using existing BRIG file: \n%s\n", brigfile);
   else
     {
+      // TODO call llvm via c++ interface like pocl_llvm_codegen()
       POCL_MSG_PRINT_INFO("pocl-hsa: BRIG file not found, compiling parallel.bc "
                           "to brig file: \n%s\n", bytecode);
 
-      // TODO call llvm via c++ interface like pocl_llvm_codegen()
       error = snprintf (hsailfile, POCL_FILENAME_LENGTH,
                     "%s%s.hsail", tmpdir, POCL_PARALLEL_BC_FILENAME);
       assert (error >= 0);
 
-      error = snprintf (command, 4096, LLC " -O2 -march=hsail64 -filetype=asm "
-                        "-o %s %s", hsailfile, bytecode);
-      assert (error >= 0);
-      error = system(command);
-      if (error != 0)
+      char* args1[] = { LLVM_LLC, "-O2", "-march=hsail64", "-filetype=asm", "-o",
+                        hsailfile, bytecode, NULL };
+      if ((error = run_command(args1)))
         {
-          POCL_MSG_PRINT_INFO("pocl-hsa: llc exit status %i\n", WEXITSTATUS(error));
+          POCL_MSG_PRINT_INFO("pocl-hsa: llc exit status %i\n", error);
           return error;
         }
 
-      error = snprintf (command, 4096, HSAIL_ASM " -o %s %s", brigfile, hsailfile);
-      assert (error >= 0);
-      error = system(command);
-      if (error != 0)
+      char* args2[] = { HSAIL_ASM, "-o", brigfile, hsailfile, NULL };
+      if ((error = run_command(args2)))
         {
-          POCL_MSG_PRINT_INFO("pocl-hsa: HSAILasm exit status %i\n", WEXITSTATUS(error));
+          POCL_MSG_PRINT_INFO("pocl-hsa: HSAILasm exit status %i\n", error);
           return error;
         }
     }
@@ -894,7 +1043,6 @@ pocl_hsa_compile_submitted_kernels (_cl_command_node *cmd)
   if (cmd->type != CL_COMMAND_NDRANGE_KERNEL)
     return;
 
-  int error;
   char brigfile[POCL_FILENAME_LENGTH];
   char *brig_blob;
 
@@ -904,7 +1052,8 @@ pocl_hsa_compile_submitted_kernels (_cl_command_node *cmd)
   hsa_executable_t *out = malloc(sizeof(hsa_executable_t));
   cmd->command.run.device_data = (void**)out;
 
-  for (unsigned i = 0; i<HSA_KERNEL_CACHE_SIZE; i++)
+  unsigned i;
+  for (i = 0; i<HSA_KERNEL_CACHE_SIZE; i++)
     if (d->kernel_cache[i].kernel == cmd->command.run.kernel)
       {
         *out = d->kernel_cache[i].hsa_exe;
@@ -977,7 +1126,8 @@ pocl_hsa_uninit (cl_device_id device)
 {
   pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t*)device->data;
 
-  for (unsigned i = 0; i < HSA_KERNEL_CACHE_SIZE; i++)
+  unsigned i;
+  for (i = 0; i < HSA_KERNEL_CACHE_SIZE; i++)
     if (d->kernel_cache[i].kernel)
       {
         HSA_CHECK(hsa_executable_destroy(d->kernel_cache[i].hsa_exe));
@@ -988,3 +1138,13 @@ pocl_hsa_uninit (cl_device_id device)
   POCL_MEM_FREE(d);
   device->data = NULL;
 }
+
+
+/* Returns the HSA system timestamp scaled by the device's timestamp_unit (ns per tick). */
+cl_ulong pocl_hsa_get_timer_value(void *data)
+{
+  pocl_hsa_device_data_t *d = (pocl_hsa_device_data_t*)data;
+  uint64_t ticks;
+  HSA_CHECK(hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ticks));
+  return (cl_ulong)(ticks * d->timestamp_unit);
+}
diff --git a/lib/CL/devices/prototypes.inc b/lib/CL/devices/prototypes.inc
index 0314022..67241e6 100644
--- a/lib/CL/devices/prototypes.inc
+++ b/lib/CL/devices/prototypes.inc
@@ -39,7 +39,8 @@
   cl_int pocl_##__DRV__##_alloc_mem_obj (cl_device_id device, cl_mem mem_obj); \
   void *pocl_##__DRV__##_create_sub_buffer (void *device_data, void* buffer, \
                                             size_t origin, size_t size); \
-  void pocl_##__DRV__##_free (void *data, cl_mem_flags flags, void *ptr);   \
+  void pocl_##__DRV__##_free (cl_device_id device, cl_mem mem_obj);   \
+  void pocl_##__DRV__##_free_ptr (cl_device_id device, void* mem_ptr);   \
   void pocl_##__DRV__##_read (void *data, void *host_ptr,                   \
                           const void *device_ptr, size_t offset, size_t cb); \
   void pocl_##__DRV__##_read_rect (void *data, void *host_ptr,          \
@@ -82,6 +83,11 @@
                            size_t const buffer_slice_pitch,    \
                            void *fill_pixel,    \
                            size_t pixel_size);  \
+  void pocl_##__DRV__##_memfill (void *ptr,           \
+                            size_t size,              \
+                            size_t offset,            \
+                            const void* pattern,      \
+                            size_t pattern_size);     \
   void pocl_##__DRV__##_compile_submitted_kernels (_cl_command_node *node);  \
   void pocl_##__DRV__##_run (void *data, _cl_command_node* cmd);        \
   void pocl_##__DRV__##_run_native (void *data, _cl_command_node* cmd); \
diff --git a/lib/CL/devices/pthread/Makefile.in b/lib/CL/devices/pthread/Makefile.in
index 3d5fc4f..f0abf1a 100644
--- a/lib/CL/devices/pthread/Makefile.in
+++ b/lib/CL/devices/pthread/Makefile.in
@@ -247,6 +247,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -254,6 +255,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -271,8 +273,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -348,6 +348,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/lib/CL/devices/pthread/pthread.c b/lib/CL/devices/pthread/pthread.c
index 9a4916f..6725ba4 100644
--- a/lib/CL/devices/pthread/pthread.c
+++ b/lib/CL/devices/pthread/pthread.c
@@ -45,39 +45,6 @@
 #include "pocl_util.h"
 #include "pocl_mem_management.h"
 
-#ifdef CUSTOM_BUFFER_ALLOCATOR
-
-#include "bufalloc.h"
-
-/* Instead of mallocing a buffer size for a region, try to allocate 
-   this many times the buffer size to hopefully avoid mallocs for 
-   the next buffer allocations.
-   
-   Falls back to single multiple allocation if fails to allocate a
-   larger region. */
-#define ALLOCATION_MULTIPLE 32
-
-/* To avoid memory hogging in case of larger buffers, limit the
-   extra allocation margin to this number of megabytes.
-
-   The extra allocation should be done to avoid repetitive calls and
-   memory fragmentation for smaller buffers only. 
- */
-#define ADDITIONAL_ALLOCATION_MAX_MB 100
-
-/* Always create regions with at least this size to avoid allocating
-   small regions when there are lots of small buffers, which would counter 
-   a purpose of having own buffer management. It would end up having a lot of
-   small regions with linear searches over them.  */
-#define NEW_REGION_MIN_MB 10
-
-/* Whether to immediately free a region in case the last chunk was
-   deallocated. If 0, it can reuse the same region over multiple kernels. */
-#define FREE_EMPTY_REGIONS 0
-
-/* CUSTOM_BUFFER_ALLOCATOR */
-#endif
-
 #define COMMAND_LENGTH 2048
 #define WORKGROUP_STRING_LENGTH 1024
 
@@ -99,25 +66,11 @@ struct thread_arguments
 };
 
 
-#ifdef CUSTOM_BUFFER_ALLOCATOR
-typedef struct _mem_regions_management{
-  ba_lock_t mem_regions_lock;
-  struct memory_region *mem_regions;
-} mem_regions_management;
-#endif
-
 struct data {
   /* Currently loaded kernel. */
   cl_kernel current_kernel;
   /* Loaded kernel dynamic library handle. */
   lt_dlhandle current_dlhandle;
-
-#ifdef CUSTOM_BUFFER_ALLOCATOR
-  /* Lock for protecting the mem_regions linked list. Held when new mem_regions
-     are created or old ones freed. */
-  mem_regions_management* mem_regions;
-#endif
-
 };
 
 
@@ -127,10 +80,6 @@ pocl_lock_t ta_pool_lock;
 static size_t get_max_thread_count(cl_device_id device);
 static void * workgroup_thread (void *p);
 
-/* TODO: Declare this in a header file */
-void
-pocl_basic_set_buffer_image_limits(cl_device_id device);
-
 static void pocl_init_thread_argument_manager (void)
 {
   if (!argument_pool_initialized)
@@ -212,9 +161,6 @@ pocl_pthread_init (cl_device_id device, const char* parameters)
 {
   static int device_number = 0;
   struct data *d; 
-#ifdef CUSTOM_BUFFER_ALLOCATOR  
-  static mem_regions_management* mrm = NULL;
-#endif
 
   // TODO: this checks if the device was already initialized previously.
   // Should we instead have a separate bool field in device, or do the
@@ -228,15 +174,6 @@ pocl_pthread_init (cl_device_id device, const char* parameters)
   d->current_dlhandle = 0;
 
   device->data = d;
-#ifdef CUSTOM_BUFFER_ALLOCATOR  
-  if (mrm == NULL)
-    {
-      mrm = (mem_regions_management*)malloc (sizeof (mem_regions_management));
-      BA_INIT_LOCK (mrm->mem_regions_lock);
-      mrm->mem_regions = NULL;
-    }
-  d->mem_regions = mrm;
-#endif  
 
   device->address_bits = sizeof(void*) * 8;
 
@@ -251,7 +188,7 @@ pocl_pthread_init (cl_device_id device, const char* parameters)
   device->global_mem_size = 1;
   pocl_topology_detect_device_info(device);
   pocl_cpuinfo_detect_device_info(device);
-  pocl_basic_set_buffer_image_limits(device);
+  pocl_set_buffer_image_limits(device);
 
   /* in case hwloc doesn't provide a PCI ID, let's generate
      a vendor id that hopefully is unique across vendors. */
@@ -282,136 +219,21 @@ pocl_pthread_init (cl_device_id device, const char* parameters)
   #endif
 
   pocl_init_thread_argument_manager();
-  
 }
 
 void
 pocl_pthread_uninit (cl_device_id device)
 {
   struct data *d = (struct data*)device->data;
-#ifdef CUSTOM_BUFFER_ALLOCATOR
-  memory_region_t *region, *temp;
-  DL_FOREACH_SAFE(d->mem_regions->mem_regions, region, temp)
-    {
-      DL_DELETE(d->mem_regions->mem_regions, region);
-      free((void*)region->chunks->start_address);
-      region->chunks->start_address = 0;
-      POCL_MEM_FREE(region);
-    }
-  d->mem_regions->mem_regions = NULL;
-#endif  
   POCL_MEM_FREE(d);
   device->data = NULL;
 }
 
 
-#ifdef CUSTOM_BUFFER_ALLOCATOR
-static int
-allocate_aligned_buffer (struct data* d, void **memptr, size_t alignment, size_t size) 
-{
-  BA_LOCK(d->mem_regions->mem_regions_lock);
-  chunk_info_t *chunk = alloc_buffer (d->mem_regions->mem_regions, size);
-  if (chunk == NULL)
-    {
-      memory_region_t *new_mem_region =
-        (memory_region_t*)malloc (sizeof (memory_region_t));
-
-      if (new_mem_region == NULL) 
-        {
-          BA_UNLOCK (d->mem_regions->mem_regions_lock);
-          return ENOMEM;
-        }
-
-      /* Fallback to the minimum size in case of overflow. 
-         Allocate a larger chunk to avoid allocation overheads
-         later on. */
-      size_t region_size = 
-          max(max(min(size + ADDITIONAL_ALLOCATION_MAX_MB * 1024 * 1024, 
-                      size * ALLOCATION_MULTIPLE), size),
-              NEW_REGION_MIN_MB * 1024 * 1024);
-
-      assert (region_size >= size);
-
-      void* space = NULL;
-      space = pocl_memalign_alloc(alignment, region_size);
-      if (space == NULL)
-        {
-          /* Failed to allocate a large region. Fall back to allocating 
-             the smallest possible region for the buffer. */
-	        space = pocl_memalign_alloc(alignment, size);
-          if (space == NULL) 
-            {
-              BA_UNLOCK (d->mem_regions->mem_regions_lock);
-              return ENOMEM;
-            }
-          region_size = size;
-        }
-
-      init_mem_region (new_mem_region, (memory_address_t)space, region_size);
-      new_mem_region->alignment = (unsigned short)(alignment);
-      DL_APPEND (d->mem_regions->mem_regions, new_mem_region);
-      chunk = alloc_buffer_from_region (new_mem_region, size);
-
-      if (chunk == NULL)
-      {
-        printf("pocl error: could not allocate a buffer of size %zu from the newly created region of size %zu.\n",
-               size, region_size);
-        print_chunks(new_mem_region->chunks);
-        /* In case the malloc didn't fail it should have been able to allocate 
-           the buffer to a newly created Region. */
-        assert (chunk != NULL);
-      }
-    }
-  BA_UNLOCK (d->mem_regions->mem_regions_lock);
-  
-  *memptr = (void*) chunk->start_address;
-  return 0;
-}
-
-#else
-
-static int
-allocate_aligned_buffer (struct data* d, void **memptr, size_t alignment, size_t size) 
-{
-  *memptr = pocl_memalign_alloc(alignment, size);
-  return (((*memptr) == NULL)? -1: 0);
-}
-
-#endif
-
-void *
-pocl_pthread_malloc (void *device_data, cl_mem_flags flags, size_t size, void *host_ptr)
-{
-  void *b;
-  struct data* d = (struct data*)device_data;
-
-  if (flags & CL_MEM_COPY_HOST_PTR)
-    {
-      if (allocate_aligned_buffer (d, &b, MAX_EXTENDED_ALIGNMENT, size) == 0)
-        {
-          memcpy (b, host_ptr, size);
-          return b;
-        }
-      
-      return NULL;
-    }
-  
-  if (flags & CL_MEM_USE_HOST_PTR && host_ptr != NULL)
-    {
-      return host_ptr;
-    }
-
-  if (allocate_aligned_buffer (d, &b, MAX_EXTENDED_ALIGNMENT, size) == 0)
-    return b;
-  
-  return NULL;
-}
-
 cl_int
 pocl_pthread_alloc_mem_obj (cl_device_id device, cl_mem mem_obj)
 {
   void *b = NULL;
-  struct data* d = (struct data*)device->data;
   cl_mem_flags flags = mem_obj->flags;
 
   /* if memory for this global memory is not yet allocated -> do it */
@@ -423,9 +245,13 @@ pocl_pthread_alloc_mem_obj (cl_device_id device, cl_mem mem_obj)
           assert(mem_obj->mem_host_ptr != NULL);
           b = mem_obj->mem_host_ptr;
         }
-      else if (allocate_aligned_buffer (d, &b, MAX_EXTENDED_ALIGNMENT,
-                                        mem_obj->size) != 0)
-        return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+      else
+        {
+          b = pocl_memalign_alloc_global_mem( device, MAX_EXTENDED_ALIGNMENT,
+                                        mem_obj->size);
+          if (b==NULL)
+            return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+        }
 
       if (flags & CL_MEM_COPY_HOST_PTR)
         {
@@ -445,49 +271,21 @@ pocl_pthread_alloc_mem_obj (cl_device_id device, cl_mem mem_obj)
   return CL_SUCCESS;
 }
 
-#ifdef CUSTOM_BUFFER_ALLOCATOR
+
 void
-pocl_pthread_free (void *device_data, cl_mem_flags flags, void *ptr)
+pocl_pthread_free (cl_device_id device, cl_mem memobj)
 {
-  struct data* d = (struct data*) device_data;
-  memory_region_t *region = NULL;
+  cl_mem_flags flags = memobj->flags;
 
   if (flags & CL_MEM_USE_HOST_PTR)
-      return; /* The host code should free the host ptr. */
-
-  region = free_buffer (d->mem_regions->mem_regions, (memory_address_t)ptr);
+    return;
 
-  assert(region != NULL && "Unable to find the region for chunk.");
+  void* ptr = memobj->device_ptrs[device->dev_id].mem_ptr;
+  size_t size = memobj->size;
 
-#if FREE_EMPTY_REGIONS == 1
-  BA_LOCK(d->mem_regions->mem_regions_lock);
-  BA_LOCK(region->lock);
-  if (region->last_chunk == region->chunks && 
-      !region->chunks->is_allocated) 
-    {
-      /* All chunks have been deallocated. free() the whole 
-         memory region at once. */
-      DL_DELETE(d->mem_regions->mem_regions, region);
-      free((void*)region->last_chunk->start_address);
-      region->last_chunk->start_address = 0;
-      POCL_MEM_FREE(region);
-    }  
-  BA_UNLOCK(region->lock);
-  BA_UNLOCK(d->mem_regions->mem_regions_lock);
-#endif
+  pocl_free_global_mem(device, ptr, size);
 }
 
-#else
-
-void
-pocl_pthread_free (void *data, cl_mem_flags flags, void *ptr)
-{
-  if (flags & CL_MEM_USE_HOST_PTR)
-    return;
-  
-  POCL_MEM_FREE(ptr);
-}
-#endif
 
 void
 pocl_pthread_read (void *data, void *host_ptr, const void *device_ptr, 
@@ -647,7 +445,7 @@ workgroup_thread (void *p)
       if (kernel->arg_info[i].is_local)
         {
           arguments[i] = malloc (sizeof (void *));
-          *(void **)(arguments[i]) = pocl_pthread_malloc(ta->data, 0, al->size, NULL);
+          *(void **)(arguments[i]) = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, al->size);
         }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
       {
@@ -662,15 +460,18 @@ workgroup_thread (void *p)
           }
         else
           {
-            arguments[i] = 
-              &((*(cl_mem *)(al->value))->device_ptrs[ta->device->dev_id].mem_ptr);
+            cl_mem m = *(cl_mem *)al->value;
+            if (m->device_ptrs)
+              arguments[i] = &(m->device_ptrs[ta->device->dev_id].mem_ptr);
+            else
+              arguments[i] = &(m->mem_host_ptr);
           }
       }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
         {
           dev_image_t di;
           fill_dev_image_t(&di, al, ta->device);
-          void* devptr = pocl_pthread_malloc(ta->data, 0, sizeof(dev_image_t), NULL);
+          void* devptr = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, sizeof(dev_image_t));
           arguments[i] = malloc (sizeof (void *));
           *(void **)(arguments[i]) = devptr;       
           pocl_pthread_write (ta->data, &di, devptr, 0, sizeof(dev_image_t));
@@ -680,7 +481,7 @@ workgroup_thread (void *p)
           dev_sampler_t ds;
           fill_dev_sampler_t(&ds, al);
           
-          void* devptr = pocl_pthread_malloc(ta->data, 0, sizeof(dev_sampler_t), NULL);
+          void* devptr = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, sizeof(dev_sampler_t));
           arguments[i] = malloc (sizeof (void *));
           *(void **)(arguments[i]) = devptr;
           pocl_pthread_write (ta->data, &ds, devptr, 0, sizeof(dev_sampler_t));
@@ -697,8 +498,7 @@ workgroup_thread (void *p)
     {
       al = &(ta->kernel_args[i]);
       arguments[i] = malloc (sizeof (void *));
-      *(void **)(arguments[i]) = pocl_pthread_malloc (ta->data, 0, al->size, 
-                                                      NULL);
+      *(void **)(arguments[i]) = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, al->size);
     }
 
   size_t first_gid_x = ta->pc.group_id[0];
@@ -721,13 +521,13 @@ workgroup_thread (void *p)
     {
       if (kernel->arg_info[i].is_local )
         {
-          pocl_pthread_free (ta->data, 0, *(void **)(arguments[i]));
+          POCL_MEM_FREE(*(void **)(arguments[i]));
           POCL_MEM_FREE(arguments[i]);
         }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE ||
                 kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
         {
-          pocl_pthread_free (ta->data, 0, *(void **)(arguments[i]));
+          POCL_MEM_FREE(*(void **)(arguments[i]));
           POCL_MEM_FREE(arguments[i]);
         }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER && *(void**)arguments[i] == NULL)
@@ -739,7 +539,7 @@ workgroup_thread (void *p)
        i < kernel->num_args + kernel->num_locals;
        ++i)
     {
-      pocl_pthread_free (ta->data, 0, *(void **)(arguments[i]));
+      POCL_MEM_FREE(*(void **)(arguments[i]));
       POCL_MEM_FREE(arguments[i]);
     }
   free_thread_arguments (ta);
diff --git a/lib/CL/devices/tce/CMakeLists.txt b/lib/CL/devices/tce/CMakeLists.txt
index 25e50c6..c52f002 100644
--- a/lib/CL/devices/tce/CMakeLists.txt
+++ b/lib/CL/devices/tce/CMakeLists.txt
@@ -23,19 +23,20 @@
 #
 #=============================================================================
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TCE_INCLUDES}")
+add_compile_options(${TCE_INCLUDES})
+add_compile_options(${TCE_CXXFLAGS})
 
 if(ENABLE_TCE)
   add_subdirectory("ttasim")
-  set(POCL_DEVICES_OBJS "${POCL_DEVICES_OBJS};$<TARGET_OBJECTS:pocl-devices-tce-ttasim>" PARENT_SCOPE)
 endif()
 
 if(MSVC)
   set_source_files_properties( tce_common.h tce_common.cc PROPERTIES LANGUAGE CXX )
 endif(MSVC)
+
 add_library("pocl-devices-tce" OBJECT tce_common.h tce_common.cc)
 set(POCL_DEVICES_OBJS "${POCL_DEVICES_OBJS};$<TARGET_OBJECTS:pocl-devices-tce>" PARENT_SCOPE)
 
 # dist_pkgdata_DATA = tta_device_main.c
-install(FILES "tta_device_main.c" "tta_device_main_dthread.c"
+install(FILES "tta_device_main.c"
         DESTINATION ${POCL_INSTALL_PRIVATE_HEADER_DIR})
diff --git a/lib/CL/devices/tce/Makefile.in b/lib/CL/devices/tce/Makefile.in
index d5fe6e4..603bb3b 100644
--- a/lib/CL/devices/tce/Makefile.in
+++ b/lib/CL/devices/tce/Makefile.in
@@ -338,6 +338,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -345,6 +346,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -362,8 +364,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -439,6 +439,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/lib/CL/devices/tce/tce_common.cc b/lib/CL/devices/tce/tce_common.cc
index a15d8f6..336ab30 100644
--- a/lib/CL/devices/tce/tce_common.cc
+++ b/lib/CL/devices/tce/tce_common.cc
@@ -80,7 +80,7 @@ TCEDevice::~TCEDevice() {
 
 bool
 TCEDevice::isMultiCoreMachine() const {
-#if defined(TCEMC_AVAILABLE) && TCEMC_AVAILABLE == 1
+#ifdef TCEMC_AVAILABLE
   assert (machine_ != NULL);
   return machine_->coreCount() > 1;
 #else
@@ -316,7 +316,6 @@ cl_int
 pocl_tce_alloc_mem_obj (cl_device_id device, cl_mem mem_obj)
 {
   void *b = NULL;
-  TCEDevice *d = (TCEDevice*)device->data;
   cl_int flags = mem_obj->flags;
 
   /* if memory for this global memory is not yet allocated -> do it */
@@ -339,9 +338,9 @@ pocl_tce_alloc_mem_obj (cl_device_id device, cl_mem mem_obj)
 
 void
 pocl_tce_write (void *data, const void *host_ptr, void *device_ptr, 
-                size_t offset, size_t cb)
+                size_t /*offset*/, size_t cb)
 {
-  TCEDevice* d = (TCEDevice*)data;
+  TCEDevice *d = (TCEDevice*)data;
   chunk_info_t *chunk = (chunk_info_t*)device_ptr;
 #ifdef DEBUG_TTA_DRIVER
   printf("host: write %x %x %u\n", host_ptr, chunk->start_address, cb);
@@ -351,7 +350,7 @@ pocl_tce_write (void *data, const void *host_ptr, void *device_ptr,
 
 void
 pocl_tce_read (void *data, void *host_ptr, const void *device_ptr, 
-               size_t offset, size_t cb)
+               size_t /*offset*/, size_t cb)
 {
   TCEDevice* d = (TCEDevice*)data;
   chunk_info_t *chunk = (chunk_info_t*)device_ptr;
@@ -380,8 +379,9 @@ pocl_tce_malloc_local (void *device_data, size_t size)
 }
 
 void
-pocl_tce_free (void */*data*/, cl_mem_flags /*flags*/, void *ptr)
+pocl_tce_free (cl_device_id device, cl_mem mem_obj)
 {
+  void* ptr = mem_obj->device_ptrs[device->dev_id].mem_ptr;
   free_chunk ((chunk_info_t*) ptr);
 }
 
@@ -393,7 +393,6 @@ pocl_tce_run
   TCEDevice *d = (TCEDevice*)data;
   int error;
   char bytecode[POCL_FILENAME_LENGTH];
-  char command[COMMAND_LENGTH];
   uint32_t kernelAddr;
   unsigned i;
 
@@ -604,7 +603,10 @@ pocl_tce_map_mem (void *data, void *buf_ptr,
     } 
   else
     {
-      posix_memalign (&target, ALIGNMENT, size);
+        if (posix_memalign (&target, ALIGNMENT, size) != 0)
+        {
+            POCL_ABORT ("Could not allocate memory.");
+        }
     }
 
   /* Synch the device global region to the host memory. */
@@ -658,7 +660,7 @@ pocl_tce_build_hash (void *data, SHA1_CTX *build_hash)
 {
   TCEDevice *tce_dev = (TCEDevice*)data;
   FILE* adf_file = fopen (tce_dev->machine_file.c_str(), "r");
-  size_t size, n;
+  size_t size;
   uint8_t* adf_data = 0;
   const char *extra_flags = NULL;
   size_t ef_size;
@@ -667,7 +669,8 @@ pocl_tce_build_hash (void *data, SHA1_CTX *build_hash)
   size = ftell (adf_file);
   fseek (adf_file, 0, SEEK_SET);
   adf_data = (uint8_t*)malloc (size);
-  fread (adf_data, 1, size, adf_file);
+  if (fread (adf_data, 1, size, adf_file) == 0)
+      POCL_ABORT ("Could not read ADF.");
   
   //TCEString machine_hash = tce->dev->hash();
   pocl_SHA1_Update (build_hash, adf_data, size);
@@ -681,13 +684,13 @@ pocl_tce_build_hash (void *data, SHA1_CTX *build_hash)
 }
 
 void
-pocl_tce_copy (void */*data*/, const void *src_ptr, size_t src_offset, 
-               void *__restrict__ dst_ptr, size_t dst_offset, size_t cb)
+pocl_tce_copy (void */*data*/, const void *src_ptr, size_t /*src_offset*/,
+               void *__restrict__ dst_ptr, size_t /*dst_offset*/, size_t cb)
 {
   POCL_ABORT_UNIMPLEMENTED("Copy not yet supported in TCE driver.");
   if (src_ptr == dst_ptr)
     return;
-  
+
   memcpy (dst_ptr, src_ptr, cb);
 }
 
diff --git a/lib/CL/devices/tce/tta_device_main_dthread.c b/lib/CL/devices/tce/tta_device_main_dthread.c
new file mode 100644
index 0000000..331bfa0
--- /dev/null
+++ b/lib/CL/devices/tce/tta_device_main_dthread.c
@@ -0,0 +1,221 @@
+/* tta_device_main_dthread.c - the main program for the tta devices executing ocl kernels
+
+   Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+/* Note: Most of the debug code is broken because lwpr_print_str() only works
+ *       with char* to local address space */
+//#define DEBUG_TTA_DEVICE
+
+#include <malloc.h>
+#include <stdlib.h>
+#include <dthread.h>
+#include <lwpr.h>
+
+#ifdef DEBUG_TTA_DEVICE
+#include <stdio.h>
+#endif
+
+#include "pocl_device.h"
+
+#define __local__ __attribute__((address_space(0)))
+#define __global__ __attribute__((address_space(3)))
+#define __constant__ __attribute__((address_space(3)))
+
+typedef volatile __global__ __kernel_exec_cmd kernel_exec_cmd;
+typedef __global__ __kernel_metadata kernel_metadata;
+
+struct wg_thread_arg {
+  kernel_exec_cmd* cmd;
+  int first_gid_x; 
+  int last_gid_x;
+};
+
+int min(int a, int b) {
+    if (a < b) return a;
+    else return b;
+}
+
+/**
+ * Executes the work groups of the kernel command.
+ */
+static void *wg_thread(void *targ) {
+    struct wg_thread_arg *targs = (struct wg_thread_arg*)targ;
+    kernel_exec_cmd *cmd = targs->cmd;
+    int first_gidx = targs->first_gid_x;
+    int last_gidx = targs->last_gid_x;
+    kernel_metadata *kernel = (kernel_metadata*)cmd->kernel;
+
+    void* args[MAX_KERNEL_ARGS];
+
+    /* Copy the kernel function arguments from the global memory 
+       to the stack in the local memory. */
+    for (int i = 0; i < kernel->num_args + kernel->num_locals; ++i) {
+        args[i] = (void*)cmd->args[i];
+    }
+
+    const int num_groups_x = cmd->num_groups[0];
+    const int num_groups_y = (cmd->work_dim >= 2) ? (cmd->num_groups[1]) : 1;
+    const int num_groups_z = (cmd->work_dim == 3) ? (cmd->num_groups[2]) : 1;
+
+    struct pocl_context context;
+    context.work_dim = cmd->work_dim;
+    context.num_groups[0] = cmd->num_groups[0];
+    context.num_groups[1] = cmd->num_groups[1];
+    context.num_groups[2] = cmd->num_groups[2];
+    context.global_offset[0] = cmd->global_offset[0];
+    context.global_offset[1] = cmd->global_offset[1];
+    context.global_offset[2] = cmd->global_offset[2];
+                
+    for (unsigned gid_x = first_gidx; gid_x <= last_gidx; gid_x++) { 
+        for (unsigned gid_y = 0; gid_y < num_groups_y; gid_y++) { 
+            for (unsigned gid_z = 0; gid_z < num_groups_z; gid_z++) {
+                context.group_id[0] = gid_x;
+                context.group_id[1] = gid_y;
+                context.group_id[2] = gid_z;
+#ifdef DEBUG_TTA_DEVICE
+                lwpr_print_str("tta: ------------------- launching WG ");
+                lwpr_print_int(gid_x); lwpr_print_str("-");
+                lwpr_print_int(gid_y); lwpr_print_str("-");
+                lwpr_print_int(gid_z); lwpr_print_str(" @ ");
+                lwpr_print_int((unsigned)kernel->work_group_func);
+                lwpr_newline();
+#endif
+                kernel->work_group_func (args, &context);
+            } 
+        }
+    }
+    return NULL;
+}
+
+#define MAX_WG_THREADS 128
+dthread_t wg_threads[MAX_WG_THREADS];
+
+/**
+ * Splits the command's work groups along x across dthreads, runs them, and joins.
+ */
+static void tta_opencl_wg_launch(kernel_exec_cmd* cmd) {
+
+    int num_groups_x = cmd->num_groups[0];
+    int i, first_gid_x, last_gid_x;
+    int thread_count = min(min(num_groups_x, dthread_get_core_count()), MAX_WG_THREADS);
+    int wgs_per_thread = num_groups_x / thread_count;
+    int leftover_wgs = num_groups_x - wgs_per_thread * thread_count;
+    wgs_per_thread += leftover_wgs / thread_count;
+    leftover_wgs = num_groups_x - wgs_per_thread * thread_count;
+
+#ifdef DEBUG_TTA_DEVICE
+    lwpr_print_str("tta: ------------------- starting kernel ");
+    puts(kernel->name);
+#endif
+    first_gid_x = 0;
+    last_gid_x = wgs_per_thread - 1;
+    for (i = 0; i < thread_count; 
+         ++i, first_gid_x += wgs_per_thread, last_gid_x += wgs_per_thread) {
+        int status;
+        struct wg_thread_arg arg;
+        dthread_attr_t attr;
+
+        if (i + 1 == thread_count) last_gid_x += leftover_wgs;
+
+        arg.cmd = cmd;
+        arg.first_gid_x = first_gid_x;
+        arg.last_gid_x = last_gid_x;
+
+        dthread_attr_init(&attr);
+        dthread_attr_setargs(&attr, &arg, sizeof(arg));
+        status = dthread_create(&wg_threads[i], &attr, wg_thread);
+        /* Assume there's always enough space in the STT. */
+        if (status) {
+            exit(-1);
+        } 
+    }
+
+   for (int i = 0; i < thread_count; i++){ 
+       dthread_join(wg_threads[i], NULL);
+   }
+
+#ifdef DEBUG_TTA_DEVICE
+    lwpr_print_str("\ntta: ------------------- kernel finished\n");
+#endif
+}
+
+extern kernel_metadata _test_kernel_md;
+
+/* The shared kernel_command object using which the device is controlled. */
+kernel_exec_cmd kernel_command;
+
+static kernel_exec_cmd* wait_for_command() {
+    while (kernel_command.status != POCL_KST_READY) 
+        ;
+    kernel_command.status = POCL_KST_RUNNING;
+    return &kernel_command;
+}
+
+int main() {
+    kernel_exec_cmd *next_command;
+    kernel_metadata *next_kernel;
+    size_t dynamic_local_arg_sizes[MAX_KERNEL_ARGS];
+    int work_dim = 1;
+    size_t local_work_sizes[3] = {1, 0, 0};
+    size_t global_work_sizes[3] = {2, 0, 0};
+
+#ifdef DEBUG_TTA_DEVICE
+    lwpr_print_str("tta: Hello from a TTA device\n");
+    lwpr_print_str("tta: initializing the command objects\n");
+#endif
+
+    do {
+
+#ifdef DEBUG_TTA_DEVICE
+        lwpr_print_str("tta: waiting for commands\n");
+#endif
+
+        next_command = wait_for_command();
+
+        next_kernel = (kernel_metadata*)next_command->kernel;
+
+#ifdef DEBUG_TTA_DEVICE
+        lwpr_print_str("tta: got a command to execute: ");
+        lwpr_print_str(next_kernel->name);
+        lwpr_print_str(" with ");
+        lwpr_print_int(next_command->work_dim);
+        lwpr_print_str(" dimensions. num_groups ");
+        lwpr_print_int(next_command->num_groups[0]);
+        lwpr_print_str("-"),
+        lwpr_print_int(next_command->num_groups[1]);
+        lwpr_print_str("-"),
+        lwpr_print_int(next_command->num_groups[2]);
+        lwpr_print_str(" dimensions. global offset ");
+        lwpr_print_int(next_command->global_offset[0]);
+        lwpr_print_str("-"),
+        lwpr_print_int(next_command->global_offset[1]);
+        lwpr_print_str("-"),
+        lwpr_print_int(next_command->global_offset[2]);
+        lwpr_newline();
+#endif
+        tta_opencl_wg_launch(next_command);
+        kernel_command.status = POCL_KST_FINISHED;   
+
+    } while (1);
+
+    return 0;
+}
diff --git a/lib/CL/devices/tce/ttasim/CMakeLists.txt b/lib/CL/devices/tce/ttasim/CMakeLists.txt
index 7380c31..0452867 100644
--- a/lib/CL/devices/tce/ttasim/CMakeLists.txt
+++ b/lib/CL/devices/tce/ttasim/CMakeLists.txt
@@ -28,5 +28,6 @@ include_directories(BEFORE "..")
 if(MSVC)
   set_source_files_properties( ttasim.h ttasim.cc PROPERTIES LANGUAGE CXX )
 endif(MSVC)
-add_library("pocl-devices-tce-ttasim" OBJECT ttasim.h ttasim.cc ../tce_common.h)
+add_library("pocl-devices-tce-ttasim" OBJECT ttasim.h ttasim.cc)
+set(POCL_DEVICES_OBJS "${POCL_DEVICES_OBJS};$<TARGET_OBJECTS:pocl-devices-tce-ttasim>" PARENT_SCOPE)
 
diff --git a/lib/CL/devices/tce/ttasim/Makefile.in b/lib/CL/devices/tce/ttasim/Makefile.in
index 71041c7..ca88f5c 100644
--- a/lib/CL/devices/tce/ttasim/Makefile.in
+++ b/lib/CL/devices/tce/ttasim/Makefile.in
@@ -265,6 +265,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -272,6 +273,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -289,8 +291,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -366,6 +366,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/lib/CL/devices/tce/ttasim/todo.txt b/lib/CL/devices/tce/ttasim/todo.txt
new file mode 100644
index 0000000..d31ff08
--- /dev/null
+++ b/lib/CL/devices/tce/ttasim/todo.txt
@@ -0,0 +1,12 @@
+- reorganize dir structure to 
+  - tce/ttasim
+  - tce/plda-tta
+- plda-tta:
+  - refactor common code between ttasim and plda-tta
+  - implement plda driver hooks
+- clEnqueueCopyBuffer:
+  - inside the same device 
+  - between multiple devices
+- proper schedule generation (process the command queue
+  as an entity)
+- out of order command queues
diff --git a/lib/CL/devices/tce/ttasim/tta.txt b/lib/CL/devices/tce/ttasim/tta.txt
new file mode 100644
index 0000000..a50e8cd
--- /dev/null
+++ b/lib/CL/devices/tce/ttasim/tta.txt
@@ -0,0 +1,36 @@
+This device uses ttasim for simulating the TTA devices. However,
+some care has been taken not to rely on ttasim too much, so that it can
+be more easily converted to a device driver for the PCIe card.
+
+memories
+--------
+The TTAs are assumed to have at least two disjoint address spaces:
+
+* one dedicated for shared data 
+  - numerical id 3 (global) and numerical id 4 (const)
+
+* one for local data + the C data
+  - used for stack + heap of the kernel program
+    + the __local and __private address spaces of OpenCL
+  - numerical id 4 and numerical id 0 (the C default address space)
+
+memory allocation
+-----------------
+The device driver keeps track of all shared resources, including the
+global memory space and the local space.
+
+As global space is assumed always to be dedicated, the bufalloc
+can be used to allocate chunks from it and assume the addresses
+also map correctly.
+
+The local space is a bit more problematic, though, because it's
+shared by the stack, the heap and the global variables in the
+kernel runtime. However, we do not need address mapping or
+data transfers between the host and the device for local pointers,
+thus it's enough to ensure that we do not overallocate it. 
+Therefore, a conservative estimate of the local space available can 
+be made, e.g. the AS size - 32KB.
+
+
+
+  
diff --git a/lib/CL/devices/tce/ttasim/ttasim.cc b/lib/CL/devices/tce/ttasim/ttasim.cc
index 79b7602..3318a86 100644
--- a/lib/CL/devices/tce/ttasim/ttasim.cc
+++ b/lib/CL/devices/tce/ttasim/ttasim.cc
@@ -1,6 +1,6 @@
 /* ttasim.cc - a pocl device driver for simulating TTA devices using TCE's ttasim
 
-   Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
+   Copyright (c) 2012-2015 Pekka Jääskeläinen / Tampere University of Technology
    
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -134,10 +134,12 @@ pocl_ttasim_init_device_infos(struct _cl_device_id* dev)
   dev->queue_properties = CL_QUEUE_PROFILING_ENABLE;
   dev->vendor = "TTA-Based Co-design Environment";
   dev->profile = "EMBEDDED_PROFILE";
-  dev->extensions = "";
+  dev->extensions = TCE_DEVICE_EXTENSIONS;
   dev->llvm_target_triplet = "tce-tut-llvm";
   dev->has_64bit_long = 1;
 
+  SETUP_DEVICE_CL_VERSION(TCE_DEVICE_CL_VERSION_MAJOR, TCE_DEVICE_CL_VERSION_MINOR);
+
   dev->parent_device = NULL;
   // ttasim does not support partitioning
   dev->max_sub_devices = 1;
@@ -170,7 +172,7 @@ public:
     char dev_name[256];
 
     const char *adf = strrchr(adfName, '/');
-    if (adf != NULL && *adf != NULL) adf++;
+    if (adf != NULL) adf++;
     if (snprintf (dev_name, 256, "ttasim-%s", adf) < 0)
       POCL_ABORT("Unable to generate the device name string.");
     dev->long_name = strdup(dev_name);  
diff --git a/lib/CL/devices/topology/Makefile.in b/lib/CL/devices/topology/Makefile.in
index 988418b..d1f383a 100644
--- a/lib/CL/devices/topology/Makefile.in
+++ b/lib/CL/devices/topology/Makefile.in
@@ -224,6 +224,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -231,6 +232,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -248,8 +250,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -325,6 +325,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/lib/CL/dummy.c b/lib/CL/dummy.c
new file mode 100644
index 0000000..374b4bc
--- /dev/null
+++ b/lib/CL/dummy.c
@@ -0,0 +1 @@
+/* Empty source file to keep "ar" happy */
diff --git a/lib/CL/pocl_cache.c b/lib/CL/pocl_cache.c
index be485c2..f626dea 100644
--- a/lib/CL/pocl_cache.c
+++ b/lib/CL/pocl_cache.c
@@ -65,7 +65,7 @@ int pocl_cl_device_to_index(cl_program   program,
 static void program_device_dir(char*        path,
                               cl_program   program,
                               unsigned     device_i,
-                              char*        append_path) {
+                              const char*        append_path) {
     assert(path);
     assert(program);
     assert(device_i < program->num_devices);
@@ -159,16 +159,24 @@ void pocl_cache_final_binary_path(char* final_binary_path, cl_program program,
 
 static void* acquire_program_lock(cl_program program,
                                   unsigned device_i,
+                                  const char* lock_type,
                                   int shared) {
     char lock_path[POCL_FILENAME_LENGTH];
-    program_device_dir(lock_path, program, device_i, "_rw");
+    program_device_dir(lock_path, program, device_i, lock_type);
 
     return acquire_lock(lock_path, shared);
 }
 
+// EXCLUSIVE writer lock
 void* pocl_cache_acquire_writer_lock_i(cl_program program,
                                        unsigned device_i) {
-    return acquire_program_lock(program, device_i, 0);
+    return acquire_program_lock(program, device_i, "_write", 0);
+}
+
+// SHARED reader lock (on clReleaseProgram, request EXCLUSIVE reader..)
+void* pocl_cache_acquire_reader_lock_i(cl_program program,
+                                       unsigned device_i) {
+    return acquire_program_lock(program, device_i, "_read", 1);
 }
 
 void pocl_cache_release_lock(void* lock) {
@@ -182,6 +190,12 @@ void* pocl_cache_acquire_writer_lock(cl_program program,
     return pocl_cache_acquire_writer_lock_i(program, (unsigned)index);
 }
 
+void* pocl_cache_acquire_reader_lock(cl_program program,
+                                     cl_device_id device) {
+    int index = pocl_cl_device_to_index(program, device);
+    assert(index >= 0);
+    return pocl_cache_acquire_reader_lock_i(program, (unsigned)index);
+}
 
 /******************************************************************************/
 
@@ -510,8 +524,7 @@ pocl_cache_create_program_cachedir(cl_program program,
                                    unsigned device_i,
                                    const char* preprocessed_source,
                                    size_t source_len,
-                                   char* program_bc_path,
-                                   void** cache_lock)
+                                   char* program_bc_path)
 {
     const char *hash_source = NULL;
     uint8_t old_build_hash[SHA1_DIGEST_SIZE] = {0};
@@ -542,6 +555,8 @@ pocl_cache_create_program_cachedir(cl_program program,
             program->binary_sizes[device_i] = 0;
         }
         pocl_free_llvm_irs(program, device_i);
+        pocl_cache_release_lock(program->read_locks[device_i]);
+        program->read_locks[device_i] = NULL;
     }
 
     program_device_dir(program_bc_path, program, device_i, "");
@@ -551,7 +566,8 @@ pocl_cache_create_program_cachedir(cl_program program,
 
     pocl_cache_program_bc_path(program_bc_path, program, device_i);
 
-    *cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
+    program->read_locks[device_i] = pocl_cache_acquire_reader_lock_i(program, device_i);
+    assert(program->read_locks[device_i]);
 
     return 0;
 }
@@ -560,15 +576,23 @@ void pocl_cache_cleanup_cachedir(cl_program program) {
 
     unsigned i;
 
-    if (!pocl_get_bool_option("POCL_KERNEL_CACHE", POCL_BUILD_KERNEL_CACHE)) {
+    for (i = 0; i < program->num_devices; ++i)
+      pocl_cache_release_lock(program->read_locks[i]);
+    POCL_MEM_FREE(program->read_locks);
+
+    if (!pocl_get_bool_option("POCL_KERNEL_CACHE", POCL_KERNEL_CACHE_DEFAULT)) {
 
         for (i=0; i< program->num_devices; i++) {
             if (program->build_hash[i][0] == 0)
                 continue;
 
-            void* lock = acquire_program_lock(program, i, 0);
+            void* lock = acquire_program_lock(program, i, "_read", 0);
             if (!lock)
-                return;
+              {
+                POCL_MSG_PRINT(" *** WARNING *** ", "",
+                "Could not get an exclusive lock to remove program cachedir");
+                continue;
+              }
             char cachedir[POCL_FILENAME_LENGTH];
             program_device_dir(cachedir, program, i, "");
             pocl_rm_rf(cachedir);
diff --git a/lib/CL/pocl_cl.h b/lib/CL/pocl_cl.h
index 38d6754..6e874d2 100644
--- a/lib/CL/pocl_cl.h
+++ b/lib/CL/pocl_cl.h
@@ -216,8 +216,9 @@ struct pocl_device_ops {
   void (*init) (cl_device_id device, const char *parameters);
   cl_int (*alloc_mem_obj) (cl_device_id device, cl_mem mem_obj);
   void *(*create_sub_buffer) (void *data, void* buffer, size_t origin, size_t size);
-  void (*free) (void *data, cl_mem_flags flags, void *ptr);
-  void (*read) (void *data, void *host_ptr, const void *device_ptr, 
+  void (*free) (cl_device_id device, cl_mem mem_obj);
+  void (*free_ptr) (cl_device_id device, void* mem_ptr);
+  void (*read) (void *data, void *host_ptr, const void *device_ptr,
                 size_t offset, size_t cb);
   void (*read_rect) (void *data, void *host_ptr, void *device_ptr,
                      const size_t *buffer_origin,
@@ -257,6 +258,12 @@ void (*fill_rect) (void *data,
                    void *fill_pixel,
                    size_t pixel_size);
 
+void (*memfill) (void *ptr,
+                 size_t size,
+                 size_t offset,
+                 const void* pattern,
+                 size_t pattern_size);
+
   /* Maps 'size' bytes of device global memory at buf_ptr + offset to 
      host-accessible memory. This might or might not involve copying 
      the block from the device. */
@@ -282,6 +289,12 @@ void (*fill_rect) (void *data,
                                          cl_uint *num_image_formats);
 };
 
+typedef struct pocl_global_mem_t {
+  size_t max_ever_allocated;
+  size_t currently_allocated;
+  size_t total_alloc_limit;
+} pocl_global_mem_t;
+
 struct _cl_device_id {
   POCL_ICD_OBJECT
   POCL_OBJECT;
@@ -331,6 +344,8 @@ struct _cl_device_id {
   cl_uint global_mem_cacheline_size;
   cl_ulong global_mem_cache_size;
   cl_ulong global_mem_size;
+  size_t global_var_pref_size;
+  size_t global_var_max_size;
   cl_ulong max_constant_buffer_size;
   cl_uint max_constant_args;
   cl_device_local_mem_type local_mem_type;
@@ -364,6 +379,10 @@ struct _cl_device_id {
   const char *profile;
   const char *version;
   const char *extensions;
+  cl_uint cl_version_major;    // 2
+  cl_uint cl_version_minor;    // 0
+  const char *cl_version_std;  // "CL2.0"
+  cl_uint cl_version_int;      // 200
  
   void *data;
   const char* llvm_target_triplet; /* the llvm target triplet to use */
@@ -372,13 +391,38 @@ struct _cl_device_id {
      indexing  arrays in data structures with device specific entries. */
   int dev_id;
   int global_mem_id; /* identifier for device global memory */
+  /* pointer to an accounting struct for global memory */
+  pocl_global_mem_t *global_memory;
   int has_64bit_long;  /* Does the device have 64bit longs */
   /* Convert automatic local variables to kernel arguments? */
   int autolocals_to_args;
 
+  // True if the device supports SVM and has priority
+  // when allocating Shared Virtual Memory
+  cl_bool should_allocate_svm;
+  /* OpenCL 2.0 properties */
+  cl_device_svm_capabilities svm_caps;
+  cl_uint max_events;
+  cl_uint max_queues;
+  cl_uint max_pipe_args;
+  cl_uint max_pipe_active_res;
+  cl_uint max_pipe_packet_size;
+  cl_uint dev_queue_pref_size;
+  cl_uint dev_queue_max_size;
+  cl_command_queue_properties on_dev_queue_props;
+  cl_command_queue_properties on_host_queue_props;
+
   struct pocl_device_ops *ops; /* Device operations, shared amongst same devices */
 };
 
+#define DEVICE_SVM_FINEGR(dev) (dev->svm_caps & (CL_DEVICE_SVM_FINE_GRAIN_BUFFER \
+                                              | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM))
+#define DEVICE_SVM_ATOM(dev) (dev->svm_caps & CL_DEVICE_SVM_ATOMICS)
+
+#define DEVICE_IS_SVM_CAPABLE(dev) (dev->svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)
+
+#define DEVICE_MMAP_IS_NOP(dev) (DEVICE_SVM_FINEGR(dev) && DEVICE_SVM_ATOM(dev))
+
 struct _cl_platform_id {
   POCL_ICD_OBJECT
 }; 
@@ -397,6 +441,13 @@ struct _cl_context {
      clReleaseContext for the result regardless if it failed or not. 
      Returns a valid = 0 context in that case.  */
   char valid;
+
+  /* The minimal value of max_mem_alloc_size of all devices in context */
+  size_t min_max_mem_alloc_size;
+  /* The device that should allocate SVM (might be == host);
+   * NULL if none of the devices in the context is SVM-capable */
+  cl_device_id svm_allocdev;
+
 };
 
 struct _cl_command_queue {
@@ -459,6 +510,9 @@ struct _cl_mem {
   cl_uint                 num_mip_levels;
   cl_uint                 num_samples;
   cl_mem                  buffer;
+  /* Pipe specific */
+  cl_uint packet_size;
+  cl_uint max_packets;
 };
 
 typedef uint8_t SHA1_digest_t[SHA1_DIGEST_SIZE * 2 + 1];
@@ -488,6 +542,8 @@ struct _cl_program {
   char main_build_log[640];
   /* Used to store the llvm IR of the build to save disk I/O. */
   void **llvm_irs;
+  /* Read locks */
+  void** read_locks;
   /* Use to store build status */
   cl_build_status build_status;
 };
diff --git a/lib/CL/pocl_debug.c b/lib/CL/pocl_debug.c
index 8011b53..c301f9b 100644
--- a/lib/CL/pocl_debug.c
+++ b/lib/CL/pocl_debug.c
@@ -1,9 +1,10 @@
 #include "pocl_debug.h"
+#include "pocl_timing.h"
 
 #ifdef POCL_DEBUG_MESSAGES
 int pocl_debug_messages;
+int stderr_is_a_tty;
 
-#ifdef HAVE_CLOCK_GETTIME
 
   #if !defined(_MSC_VER) && !defined(__MINGW32__)
 
@@ -11,22 +12,90 @@ int pocl_debug_messages;
     #include <stdio.h>
 
     void pocl_debug_print_header(const char* func, unsigned line) {
-        struct tm t;
-        long tm_nanosec;
-        struct timespec timespec;
 
-        clock_gettime(CLOCK_REALTIME, &timespec);
-        tm_nanosec = timespec.tv_nsec;
-        gmtime_r(&timespec.tv_sec, &t);
+        int year, mon, day, hour, min, sec, nanosec; /* all plain int: use %i below */
+        pocl_gettimereal(&year, &mon, &day, &hour, &min, &sec, &nanosec);
+
+        const char* formatstring; /* colored variant only if stderr is a tty */
+        if (stderr_is_a_tty)
+          formatstring = POCL_COLOR_BLUE
+              "[%04i-%02i-%02i %02i:%02i:%02i.%09i] "
+              POCL_COLOR_RESET "POCL: in fn"
+              POCL_COLOR_CYAN " %s "
+              POCL_COLOR_RESET "at line %u:\n";
+        else
+          formatstring = "[%04i-%02i-%02i %02i:%02i:%02i.%09i] "
+              "POCL: in fn %s at line %u:\n";
         fprintf(stderr,
-            "[%04i-%02i-%02i %02i:%02i:%02i.%09li] POCL: "
-            "in fn %s at line %u:\n", (t.tm_year + 1900),
-            t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min,
-            t.tm_sec, tm_nanosec,  func, line);
+            formatstring, year, mon, day, hour, min,
+            sec, nanosec, func, line);
+    }
+
+    void pocl_debug_measure_start(uint64_t *start) {
+      if (!pocl_debug_messages)
+        return;
+      *start = pocl_gettimemono_ns();
+    }
+
+    void pocl_debug_print_duration(const char* func, unsigned line,
+                                   const char* msg, uint64_t nanosecs)
+    {
+      if (!pocl_debug_messages)
+        return;
+      const char* formatstring;
+      if (stderr_is_a_tty)
+        formatstring = "      >>>  " POCL_COLOR_MAGENTA "     %3" PRIu64
+                       ".%03" PRIu64 " " POCL_COLOR_RESET " %s    %s\n";
+      else
+        formatstring = "      >>>       %3" PRIu64 ".%03"
+                       PRIu64 "  %s    %s\n";
+
+      uint64_t nsec = nanosecs % 1000000000;
+      uint64_t sec = nanosecs / 1000000000;
+      uint64_t a, b;
+
+      if ((sec == 0) && (nsec < 1000))
+        {
+          b = nsec % 1000;
+          if (stderr_is_a_tty)
+            formatstring = "      >>>      " POCL_COLOR_MAGENTA
+                    "     %3" PRIu64 " " POCL_COLOR_RESET " ns    %s\n";
+          else
+            formatstring = "      >>>           %3" PRIu64 "  ns    %s\n";
+          POCL_MSG_PRINT2(func, line, formatstring, b, msg);
+        }
+      else if ((sec == 0) && (nsec < 1000000))
+        {
+          a = nsec / 1000;
+          b = nsec % 1000;
+          POCL_MSG_PRINT2(func, line, formatstring, a, b, "us", msg);
+        }
+      else if (sec == 0)
+        {
+          a = nsec / 1000000;
+          b = (nsec % 1000000) / 1000;
+          POCL_MSG_PRINT2(func, line, formatstring, a, b, "ms", msg);
+        }
+      else
+          POCL_MSG_PRINT2(func, line, formatstring, sec, nsec, "s", msg);
+
+    }
+
+    void pocl_debug_measure_finish(uint64_t *start, uint64_t *finish,
+                                   const char* msg,
+                                   const char* func,
+                                   unsigned line) {
+      if (!pocl_debug_messages)
+        return;
+      *finish = pocl_gettimemono_ns();
+      pocl_debug_print_duration(func, line, msg, (*finish - *start) );
     }
 
   #else
 
+/* Doesn't work, haven't been able to get it working.
+ * Needs someone with experience in Win programming. */
+
     #include <windows.h>
     #include <stdio.h>
 
@@ -51,6 +120,5 @@ int pocl_debug_messages;
   #endif
 
 
-#endif
 
 #endif
diff --git a/lib/CL/pocl_debug.h b/lib/CL/pocl_debug.h
index 6040bbb..6f1e2b3 100644
--- a/lib/CL/pocl_debug.h
+++ b/lib/CL/pocl_debug.h
@@ -1,10 +1,47 @@
 #ifndef POCL_DEBUG_H
 #define POCL_DEBUG_H
 
+#ifdef _WIN32
+#  include <stdint.h>
+#  include <stddef.h> // size_t
+#  define PRIu64 "I64u"
+#  define PRIX64 "I64x"
+#  define PRIXPTR "p"
+#  define PRIuS "Iu"
+#else
+# ifndef __STDC_FORMAT_MACROS
+# define __STDC_FORMAT_MACROS
+# endif
+# include <inttypes.h>
+#endif
+
+// size_t print spec
+#ifndef PRIuS
+# define PRIuS "zu"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+// ANSI color escapes are hard-coded here; a terminfo library would be more correct.
+#define POCL_COLOR_RESET   "\033[0m"
+#define POCL_COLOR_BLACK   "\033[30m"      /* Black */
+#define POCL_COLOR_RED     "\033[31m"      /* Red */
+#define POCL_COLOR_GREEN   "\033[32m"      /* Green */
+#define POCL_COLOR_YELLOW  "\033[33m"      /* Yellow */
+#define POCL_COLOR_BLUE    "\033[34m"      /* Blue */
+#define POCL_COLOR_MAGENTA "\033[35m"      /* Magenta */
+#define POCL_COLOR_CYAN    "\033[36m"      /* Cyan */
+#define POCL_COLOR_WHITE   "\033[37m"      /* White */
+#define POCL_COLOR_BOLDBLACK   "\033[1m\033[30m"      /* Bold Black */
+#define POCL_COLOR_BOLDRED     "\033[1m\033[31m"      /* Bold Red */
+#define POCL_COLOR_BOLDGREEN   "\033[1m\033[32m"      /* Bold Green */
+#define POCL_COLOR_BOLDYELLOW  "\033[1m\033[33m"      /* Bold Yellow */
+#define POCL_COLOR_BOLDBLUE    "\033[1m\033[34m"      /* Bold Blue */
+#define POCL_COLOR_BOLDMAGENTA "\033[1m\033[35m"      /* Bold Magenta */
+#define POCL_COLOR_BOLDCYAN    "\033[1m\033[36m"      /* Bold Cyan */
+#define POCL_COLOR_BOLDWHITE   "\033[1m\033[37m"      /* Bold White */
 
 #ifdef __GNUC__
 #pragma GCC visibility push(hidden)
@@ -34,9 +71,9 @@ extern "C" {
             " in %s:%d\n", __FILE__, __LINE__);                         \
     } while (0)
 
-#define POCL_ABORT(__MSG__)                                             \
+#define POCL_ABORT(...)                                                 \
     do {                                                                \
-        fprintf(stderr, __MSG__);                                       \
+        fprintf(stderr, __VA_ARGS__);                                  \
         exit(2);                                                        \
     } while (0)
 
@@ -55,6 +92,7 @@ extern "C" {
 #ifdef POCL_DEBUG_MESSAGES
 
     extern int pocl_debug_messages;
+    extern int stderr_is_a_tty;
 
     #if __GNUC__ >= 2
     #define __func__ __PRETTY_FUNCTION__
@@ -62,20 +100,32 @@ extern "C" {
     #define __func__ __FUNCTION__
     #endif
 
-    #ifdef HAVE_CLOCK_GETTIME
         #define POCL_DEBUG_HEADER pocl_debug_print_header(__func__, __LINE__);
         extern void pocl_debug_print_header(const char * func, unsigned line);
-    #else
-        #define POCL_DEBUG_HEADER                                           \
-            fprintf(stderr, "** POCL ** : in function %s"                   \
-            " at line %u:\n", __func__, __LINE__);
-    #endif
+        extern void pocl_debug_measure_start(uint64_t* start);
+        extern void pocl_debug_measure_finish(uint64_t* start, uint64_t* finish,
+                                              const char* msg,
+                                              const char *func,
+                                              unsigned line);
+        extern void pocl_debug_print_duration(const char* func, unsigned line,
+                                              const char* msg, uint64_t nanosecs);
+        #define POCL_MEASURE_START(SUFFIX) \
+          uint64_t pocl_time_start_ ## SUFFIX, pocl_time_finish_ ## SUFFIX; \
+          pocl_debug_measure_start(&pocl_time_start_ ## SUFFIX);
+
+        #define POCL_MEASURE_FINISH(SUFFIX) \
+          pocl_debug_measure_finish(&pocl_time_start_ ## SUFFIX, \
+                         &pocl_time_finish_ ## SUFFIX, "API: " #SUFFIX, \
+                         __func__, __LINE__);
 
     #define POCL_MSG_PRINT(TYPE, ERRCODE, ...)                              \
         do {                                                                \
             if (pocl_debug_messages) {                                      \
                 POCL_DEBUG_HEADER                                           \
-                fprintf(stderr, TYPE  ERRCODE " ");                         \
+                if (stderr_is_a_tty)                                        \
+                  fprintf(stderr, TYPE POCL_COLOR_CYAN ERRCODE " "  POCL_COLOR_RESET);            \
+                else                                                        \
+                  fprintf(stderr, TYPE ERRCODE " ");                        \
                 fprintf(stderr, __VA_ARGS__);                               \
             }                                                               \
         } while (0)
@@ -88,9 +138,23 @@ extern "C" {
             }                                                               \
         } while (0)
 
-    #define POCL_MSG_WARN(...)    POCL_MSG_PRINT(" *** WARNING *** ", "", __VA_ARGS__)
-    #define POCL_MSG_ERR(...)     POCL_MSG_PRINT(" *** ERROR *** ", "", __VA_ARGS__)
-    #define POCL_MSG_PRINT_INFO(...) POCL_MSG_PRINT(" *** INFO *** ", "", __VA_ARGS__)
+    #define POCL_MSG_WARN2(errcode, ...)   do { if (stderr_is_a_tty) \
+          POCL_MSG_PRINT(POCL_COLOR_YELLOW " *** WARNING *** ", errcode, __VA_ARGS__); \
+          else POCL_MSG_PRINT(" *** WARNING *** ", errcode, __VA_ARGS__); } while(0)
+    #define POCL_MSG_WARN(...)  POCL_MSG_WARN2("", __VA_ARGS__)
+
+    #define POCL_MSG_ERR2(errcode, ...)    do { if (stderr_is_a_tty) \
+          POCL_MSG_PRINT(POCL_COLOR_RED " *** ERROR *** ", errcode, __VA_ARGS__); \
+          else POCL_MSG_PRINT(" *** ERROR *** ", errcode, __VA_ARGS__); } while (0)
+    #define POCL_MSG_ERR(...)  POCL_MSG_ERR2("", __VA_ARGS__)
+
+    #define POCL_MSG_PRINT_INFO2(errcode, ...) do { if (stderr_is_a_tty) \
+          POCL_MSG_PRINT(POCL_COLOR_GREEN " *** INFO *** ", errcode, __VA_ARGS__); \
+          else POCL_MSG_PRINT(" *** INFO *** ", errcode, __VA_ARGS__); } while (0)
+    #define POCL_MSG_PRINT_INFO(...) POCL_MSG_PRINT_INFO2("", __VA_ARGS__)
+
+    #define POCL_DEBUG_EVENT_TIME(eventp, msg) \
+        pocl_debug_print_duration(__func__, __LINE__, "Event " msg, (uint64_t)((*eventp)->time_end - (*eventp)->time_start))
 
 #else
 
@@ -99,39 +163,60 @@ extern "C" {
     #define POCL_MSG_PRINT(...)
     #define POCL_MSG_PRINT2(...)
     #define POCL_MSG_PRINT_INFO(...)
+    #define POCL_MSG_PRINT_INFO2(...)
+    #define POCL_DEBUG_HEADER
+    #define POCL_MEASURE_START(...)
+    #define POCL_MEASURE_FINISH(...)
+    #define POCL_DEBUG_EVENT_TIME(...)
 
 #endif
 
 
 #define POCL_GOTO_ERROR_ON(cond, err_code, ...)                             \
-    if (cond)                                                               \
+  do                                                                        \
     {                                                                       \
-        POCL_MSG_PRINT(" *** ERROR *** ", # err_code, __VA_ARGS__);         \
-        errcode = err_code;                                                 \
-        goto ERROR;                                                         \
+      if (cond)                                                             \
+        {                                                                   \
+            POCL_MSG_ERR2(#err_code, __VA_ARGS__);                          \
+            errcode = err_code;                                             \
+            goto ERROR;                                                     \
+        }                                                                   \
     }                                                                       \
+  while (0)
 
 #define POCL_RETURN_ERROR_ON(cond, err_code, ...)                           \
-    if (cond)                                                               \
+  do                                                                        \
     {                                                                       \
-        POCL_MSG_PRINT(" *** ERROR *** ", # err_code, __VA_ARGS__);         \
-        return err_code;                                                    \
+      if (cond)                                                             \
+        {                                                                   \
+            POCL_MSG_ERR2(#err_code, __VA_ARGS__);                          \
+            return err_code;                                                \
+        }                                                                   \
     }                                                                       \
+  while (0)
 
 #define POCL_RETURN_ERROR_COND(cond, err_code)                              \
-    if (cond)                                                               \
+  do                                                                        \
     {                                                                       \
-        POCL_MSG_PRINT(" *** ERROR *** ", #err_code, "%s\n", #cond);        \
-        return err_code;                                                    \
+      if (cond)                                                             \
+        {                                                                   \
+          POCL_MSG_ERR2(#err_code, "%s\n", #cond);                          \
+          return err_code;                                                  \
+        }                                                                   \
     }                                                                       \
+  while (0)
 
 #define POCL_GOTO_ERROR_COND(cond, err_code)                                \
-    if (cond)                                                               \
+  do                                                                        \
     {                                                                       \
-        POCL_MSG_PRINT(" *** ERROR *** ", #err_code, "%s\n", #cond);        \
-        errcode = err_code;                                                 \
-        goto ERROR;                                                         \
+      if (cond)                                                             \
+        {                                                                   \
+          POCL_MSG_ERR2(#err_code, "%s\n", #cond);                          \
+          errcode = err_code;                                               \
+          goto ERROR;                                                       \
+        }                                                                   \
     }                                                                       \
+  while (0)
 
 
 
diff --git a/lib/CL/pocl_icd.h b/lib/CL/pocl_icd.h
index c1b8b46..df88161 100644
--- a/lib/CL/pocl_icd.h
+++ b/lib/CL/pocl_icd.h
@@ -38,128 +38,12 @@ extern struct _cl_icd_dispatch pocl_dispatch;  //from clGetPlatformIDs.c
 #ifdef HAVE_OCL_ICD 
 #include <ocl_icd.h>
 #else
+#define OCL_ICD_IDENTIFIED_FUNCTIONS 116
 struct _cl_icd_dispatch {
-	void *funcptr[122];
+        void *funcptr[166];
 };
 #endif
 
-/* The "implementation" of the _cl_device_id struct. 
- * Instantiated in clGetPlatformIDs.c
- *
- * TODO: the NULL entries are functions that lack implementation (or even stubs) in pocl
- */
-#define POCL_ICD_DISPATCH {           \
-  &POclGetPlatformIDs,          \
-  &POclGetPlatformInfo,         \
-  &POclGetDeviceIDs,            \
-  &POclGetDeviceInfo,           \
-  &POclCreateContext,           \
-  &POclCreateContextFromType,   \
-  &POclRetainContext,           \
-  &POclReleaseContext,          \
-  &POclGetContextInfo,          \
-  &POclCreateCommandQueue,      \
-  &POclRetainCommandQueue, /* 10 */           \
-  &POclReleaseCommandQueue,     \
-  &POclGetCommandQueueInfo,     \
-  NULL /*clSetCommandQueueProperty*/, \
-  &POclCreateBuffer,            \
-  &POclCreateImage2D,           \
-  &POclCreateImage3D,           \
-  &POclRetainMemObject,         \
-  &POclReleaseMemObject,        \
-  &POclGetSupportedImageFormats,\
-  &POclGetMemObjectInfo, /* 20 */             \
-  &POclGetImageInfo,            \
-  &POclCreateSampler,           \
-  &POclRetainSampler,           \
-  &POclReleaseSampler,          \
-  &POclGetSamplerInfo,          \
-  &POclCreateProgramWithSource, \
-  &POclCreateProgramWithBinary, \
-  &POclRetainProgram,           \
-  &POclReleaseProgram,          \
-  &POclBuildProgram, /* 30 */ \
-  &POclUnloadCompiler,          \
-  &POclGetProgramInfo,          \
-  &POclGetProgramBuildInfo,     \
-  &POclCreateKernel,            \
-  &POclCreateKernelsInProgram,  \
-  &POclRetainKernel,            \
-  &POclReleaseKernel,           \
-  &POclSetKernelArg,            \
-  &POclGetKernelInfo,           \
-  &POclGetKernelWorkGroupInfo, /* 40 */       \
-  &POclWaitForEvents,           \
-  &POclGetEventInfo,            \
-  &POclRetainEvent,             \
-  &POclReleaseEvent,            \
-  &POclGetEventProfilingInfo,   \
-  &POclFlush,                   \
-  &POclFinish,                  \
-  &POclEnqueueReadBuffer,       \
-  &POclEnqueueWriteBuffer,      \
-  &POclEnqueueCopyBuffer, /* 50 */  \
-  &POclEnqueueReadImage,        \
-  &POclEnqueueWriteImage,       \
-  &POclEnqueueCopyImage,        \
-  &POclEnqueueCopyImageToBuffer,\
-  &POclEnqueueCopyBufferToImage,\
-  &POclEnqueueMapBuffer,        \
-  &POclEnqueueMapImage,         \
-  &POclEnqueueUnmapMemObject,   \
-  &POclEnqueueNDRangeKernel,    \
-  &POclEnqueueTask, /* 60 */  \
-  &POclEnqueueNativeKernel,     \
-  &POclEnqueueMarker,           \
-  &POclEnqueueWaitForEvents,    \
-  &POclEnqueueBarrier,          \
-  &POclGetExtensionFunctionAddress, \
-  NULL, /* &POclCreateFromGLBuffer,      */ \
-  &POclCreateFromGLTexture2D,   \
-  &POclCreateFromGLTexture3D,   \
-  NULL, /* &POclCreateFromGLRenderbuffer, */ \
-  NULL, /* &POclGetGLObjectInfo,  70       */ \
-  NULL, /* &POclGetGLTextureInfo,        */ \
-  NULL, /* &POclEnqueueAcquireGLObjects, */ \
-  NULL, /* &POclEnqueueReleaseGLObjects, */ \
-  NULL, /* &POclGetGLContextInfoKHR,     */ \
-  NULL, /* &clUnknown75 */      \
-  NULL, /* &clUnknown76 */      \
-  NULL, /* &clUnknown77 */      \
-  NULL, /* &clUnknown78 */      \
-  NULL, /* &clUnknown79 */      \
-  NULL, /* &clUnknown80 */      \
-  &POclSetEventCallback,        \
-  &POclCreateSubBuffer,         \
-  &POclSetMemObjectDestructorCallback, \
-  &POclCreateUserEvent,         \
-  &POclSetUserEventStatus,      \
-  &POclEnqueueReadBufferRect,   \
-  &POclEnqueueWriteBufferRect,  \
-  &POclEnqueueCopyBufferRect,   \
-  NULL, /* &POclCreateSubDevicesEXT,     */ \
-  &POclRetainDevice, /* &POclRetainDeviceEXT,         */ \
-  &POclReleaseDevice, /* &POclReleaseDeviceEXT,        */ \
-  NULL, /* &clUnknown92 */      \
-  &POclCreateSubDevices,        \
-  &POclRetainDevice,                      \
-  &POclReleaseDevice,                     \
-  &POclCreateImage,                               \
-  NULL, /* &POclCreateProgramWithBuiltInKernels, */ \
-  NULL, /* &POclCompileProgram,          */ \
-  NULL, /* &POclLinkProgram,             */ \
-  NULL, /* &POclUnloadPlatformCompiler,  */ \
-  &POclGetKernelArgInfo,   \
-  NULL, /* &POclEnqueueFillBuffer,        */ \
-  &POclEnqueueFillImage,         \
-  NULL, /* &POclEnqueueMigrateMemObjects, */ \
-  &POclEnqueueMarkerWithWaitList,  \
-  NULL, /* &POclEnqueueBarrierWithWaitList, */ \
-  NULL, /* &POclGetExtensionFunctionAddressForPlatform, */ \
-  NULL, /* &POclCreateFromGLTexture,     */ \
-}
-
 #endif
 #endif
 
diff --git a/lib/CL/pocl_img_buf_cpy.c b/lib/CL/pocl_img_buf_cpy.c
new file mode 100644
index 0000000..d30c59c
--- /dev/null
+++ b/lib/CL/pocl_img_buf_cpy.c
@@ -0,0 +1,217 @@
+/* pocl_img_buf_cpy.c: common parts of image and buffer copying
+
+   Copyright (c) 2011 Universidad Rey Juan Carlos
+   Copyright (c) 2015 Giuseppe Bilotta
+   
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_cl.h"
+#include <assert.h>
+#include "pocl_image_util.h"
+#include "pocl_util.h"
+
+/* Shared back end for rectangular copies: image copies and rectangular buffer
+   copies share most of the code, so the API entry points defer here and pass
+   src_is_image / dst_is_image to select the object-specific checks. Failed
+   checks leave through the POCL_RETURN_ERROR_* macros or an explicit error
+   return; otherwise the device's copy_rect op is invoked directly (not queued). */
+
+cl_int pocl_rect_copy(cl_command_queue command_queue,
+                      cl_mem src,
+                      cl_int src_is_image,      /* non-zero: src is validated as an image */
+                      cl_mem dst,
+                      cl_int dst_is_image,      /* non-zero: dst is validated as an image */
+                      const size_t *src_origin, /* 3 elements are read */
+                      const size_t *dst_origin, /* 3 elements are read */
+                      const size_t *region,     /* 3 elements are read */
+                      size_t src_row_pitch,     /* replaced by src's image pitch when src_is_image */
+                      size_t src_slice_pitch,   /* replaced by src's image pitch when src_is_image */
+                      size_t dst_row_pitch,     /* replaced by dst's image pitch when dst_is_image */
+                      size_t dst_slice_pitch,   /* replaced by dst's image pitch when dst_is_image */
+                      cl_uint num_events_in_wait_list,
+                      const cl_event *event_wait_list,
+                      cl_event *event)
+{
+  cl_int errcode;
+  cl_device_id device; /* NOTE(review): never assigned in this body; presumably set by POCL_CHECK_DEV_IN_CMDQ -- confirm */
+  unsigned i; /* NOTE(review): not referenced in this body; presumably used by POCL_CHECK_DEV_IN_CMDQ -- confirm */
+
+  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
+
+  if (src_is_image || dst_is_image)
+    {
+      POCL_RETURN_ERROR_ON((!command_queue->device->image_support), CL_INVALID_OPERATION,
+        "Device %s does not support images\n", command_queue->device->long_name);
+    }
+
+  POCL_RETURN_ERROR_COND((src == NULL), CL_INVALID_MEM_OBJECT);
+  POCL_RETURN_ERROR_COND((dst == NULL), CL_INVALID_MEM_OBJECT);
+  POCL_RETURN_ERROR_COND((src_origin == NULL), CL_INVALID_VALUE);
+  POCL_RETURN_ERROR_COND((dst_origin == NULL), CL_INVALID_VALUE);
+  POCL_RETURN_ERROR_COND((region == NULL), CL_INVALID_VALUE);
+
+  if (src_is_image)
+    {
+      POCL_RETURN_ERROR_ON((!src->is_image),
+        CL_INVALID_MEM_OBJECT, "src_image is not an image\n");
+      POCL_RETURN_ERROR_ON((src->type == CL_MEM_OBJECT_IMAGE2D && src_origin[2] != 0),
+        CL_INVALID_VALUE, "src_origin[2] must be 0 for 2D src_image\n");
+      errcode = pocl_check_device_supports_image(src, command_queue);
+      if (errcode != CL_SUCCESS)
+        return errcode;
+    }
+  else
+    {
+      POCL_RETURN_ERROR_ON((src->type != CL_MEM_OBJECT_BUFFER),
+        CL_INVALID_MEM_OBJECT, "src is not a CL_MEM_OBJECT_BUFFER\n");
+    }
+
+  if (dst_is_image)
+    {
+      POCL_RETURN_ERROR_ON((!dst->is_image),
+        CL_INVALID_MEM_OBJECT, "dst is not an image\n");
+      POCL_RETURN_ERROR_ON((dst->type == CL_MEM_OBJECT_IMAGE2D && dst_origin[2] != 0),
+        CL_INVALID_VALUE, "dst_origin[2] must be 0 for 2D dst_image\n");
+      errcode = pocl_check_device_supports_image(dst, command_queue);
+      if (errcode != CL_SUCCESS)
+        return errcode;
+    }
+  else
+    {
+      POCL_RETURN_ERROR_ON((dst->type != CL_MEM_OBJECT_BUFFER),
+        CL_INVALID_MEM_OBJECT, "dst is not a CL_MEM_OBJECT_BUFFER\n");
+    }
+
+  if (src_is_image && dst_is_image)
+    {
+      POCL_RETURN_ERROR_ON((src->image_channel_order != dst->image_channel_order),
+        CL_IMAGE_FORMAT_MISMATCH, "src and dst have different image channel order\n");
+      POCL_RETURN_ERROR_ON((src->image_channel_data_type != dst->image_channel_data_type),
+        CL_IMAGE_FORMAT_MISMATCH, "src and dst have different image channel data type\n");
+      POCL_RETURN_ERROR_ON((
+          (dst->type == CL_MEM_OBJECT_IMAGE2D || src->type == CL_MEM_OBJECT_IMAGE2D) &&
+          region[2] != 1),
+        CL_INVALID_VALUE, "for any 2D image copy, region[2] must be 1\n");
+   }
+
+
+  /* Work on local copies so the x components can be recomputed in bytes for images */
+  size_t mod_region[3], mod_src_origin[3], mod_dst_origin[3];
+  memcpy(mod_region, region, 3*sizeof(size_t));
+  memcpy(mod_src_origin, src_origin, 3*sizeof(size_t));
+  memcpy(mod_dst_origin, dst_origin, 3*sizeof(size_t));
+  /* NOTE(review): the rescaling below is applied only when BOTH sides are images. */
+  if (src_is_image && dst_is_image)
+    {
+      mod_region[0] *= src->image_elem_size * src->image_channels;
+      mod_src_origin[0] *= src->image_elem_size * src->image_channels;
+      mod_dst_origin[0] *= dst->image_elem_size * dst->image_channels;
+    }
+  /* Images use their own stored pitches; the pitch arguments are overwritten. */
+  if (src_is_image)
+    {
+      src_row_pitch = src->image_row_pitch;
+      src_slice_pitch = src->image_slice_pitch;
+    }
+
+  if (dst_is_image)
+    {
+      dst_row_pitch = dst->image_row_pitch;
+      dst_slice_pitch = dst->image_slice_pitch;
+    }
+
+  POCL_RETURN_ERROR_ON(((command_queue->context != src->context)
+      || (command_queue->context != dst->context)),
+    CL_INVALID_CONTEXT,
+      "src, dst and command_queue are not from the same context\n");
+
+  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
+    CL_INVALID_EVENT_WAIT_LIST);
+
+  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
+    CL_INVALID_EVENT_WAIT_LIST);
+
+  size_t region_bytes = mod_region[0] * mod_region[1] * mod_region[2]; /* unsigned, so "<= 0" below means "== 0" */
+  POCL_RETURN_ERROR_ON((region_bytes <= 0), CL_INVALID_VALUE, "All items in region must be >0\n");
+  /* Pitches go in by address; NOTE(review): presumably the check fills in zero pitches -- confirm. */
+  if (pocl_buffer_boundcheck_3d(src->size, mod_src_origin, mod_region,
+      &src_row_pitch, &src_slice_pitch, "src_") != CL_SUCCESS)
+    return CL_INVALID_VALUE;
+
+  if (pocl_buffer_boundcheck_3d(dst->size, mod_dst_origin, mod_region,
+      &dst_row_pitch, &dst_slice_pitch, "dst_") != CL_SUCCESS)
+    return CL_INVALID_VALUE;
+
+  if (src == dst)
+    {
+      POCL_RETURN_ERROR_ON((src_slice_pitch != dst_slice_pitch),
+        CL_INVALID_VALUE, "src and dst are the same object,"
+        " but the given dst & src slice pitch differ\n");
+
+      POCL_RETURN_ERROR_ON((src_row_pitch != dst_row_pitch),
+        CL_INVALID_VALUE, "src and dst are the same object,"
+        " but the given dst & src row pitch differ\n");
+
+      POCL_RETURN_ERROR_ON(
+        (check_copy_overlap(mod_src_origin, mod_dst_origin, mod_region,
+          src_row_pitch, src_slice_pitch)),
+        CL_MEM_COPY_OVERLAP, "src and dst are the same object,"
+        "and source and destination regions overlap\n");
+
+    }
+
+  POCL_CHECK_DEV_IN_CMDQ;
+
+  /* execute directly */
+  /* TODO: enqueue the copy instead of executing it here (compare the
+     non-blocking path of clEnqueueReadBuffer) */
+  if (command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    {
+      POCL_ABORT_UNIMPLEMENTED("clEnqueueCopyBufferRect: Out-of-order queue");
+      /* wait for the event in event_wait_list to finish */
+    }
+  else
+    {
+      /* in-order queue - all previously enqueued commands must
+       * finish before this copy, hence the clFinish below */
+      // retain src/dst across the copy; both are released after copy_rect below
+      POname(clRetainMemObject) (src);
+      POname(clRetainMemObject) (dst);
+      POname(clFinish)(command_queue);
+    }
+  POCL_UPDATE_EVENT_SUBMITTED(event);
+  POCL_UPDATE_EVENT_RUNNING(event);
+
+  /* TODO: offset computation doesn't work in case the ptr is not
+     a direct pointer */
+  device->ops->copy_rect(device->data,
+                       src->device_ptrs[device->dev_id].mem_ptr,
+                       dst->device_ptrs[device->dev_id].mem_ptr,
+                       mod_src_origin, mod_dst_origin, mod_region,
+                       src_row_pitch, src_slice_pitch,
+                       dst_row_pitch, dst_slice_pitch);
+
+  POCL_UPDATE_EVENT_COMPLETE(event);
+
+  POname(clReleaseMemObject) (src);
+  POname(clReleaseMemObject) (dst);
+
+  return CL_SUCCESS;
+}
diff --git a/lib/CL/clReleaseCommandQueue.c b/lib/CL/pocl_img_buf_cpy.h
similarity index 56%
copy from lib/CL/clReleaseCommandQueue.c
copy to lib/CL/pocl_img_buf_cpy.h
index c79465c..56dbd7a 100644
--- a/lib/CL/clReleaseCommandQueue.c
+++ b/lib/CL/pocl_img_buf_cpy.h
@@ -1,6 +1,7 @@
-/* OpenCL runtime library: clReleaseCommandQueue()
+/* pocl_img_buf_cpy.h: common parts of image and buffer copying
 
-   Copyright (c) 2011-2012 Universidad Rey Juan Carlos and Pekka Jääskeläinen
+   Copyright (c) 2011 Universidad Rey Juan Carlos
+   Copyright (c) 2015 Giuseppe Bilotta
    
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -22,23 +23,21 @@
 */
 
 #include "pocl_cl.h"
+#include <assert.h>
 #include "pocl_util.h"
-#include "pocl_queue_util.h"
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clReleaseCommandQueue)(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0
-{
-  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
-
-  int new_refcount;
-  POname(clFlush)(command_queue);
-  POCL_RELEASE_OBJECT(command_queue, new_refcount);
-  if (new_refcount == 0)
-    {
-      pocl_queue_list_delete(command_queue);
-      POCL_MEM_FREE(command_queue);
-      /* TODO: should clReleaseContext()? */
-    }
-  return CL_SUCCESS;
-}
-POsym(clReleaseCommandQueue)
+cl_int pocl_rect_copy(cl_command_queue command_queue,
+                      cl_mem src,
+                      cl_int src_is_image,      /* non-zero: src is validated as an image */
+                      cl_mem dst,
+                      cl_int dst_is_image,      /* non-zero: dst is validated as an image */
+                      const size_t *src_origin, /* 3 elements, must not be NULL */
+                      const size_t *dst_origin, /* 3 elements, must not be NULL */
+                      const size_t *region,     /* 3 elements, must not be NULL */
+                      size_t src_row_pitch,     /* ignored when src_is_image */
+                      size_t src_slice_pitch,   /* ignored when src_is_image */
+                      size_t dst_row_pitch,     /* ignored when dst_is_image */
+                      size_t dst_slice_pitch,   /* ignored when dst_is_image */
+                      cl_uint num_events_in_wait_list,
+                      const cl_event *event_wait_list,
+                      cl_event *event);
diff --git a/lib/CL/pocl_intfn.h b/lib/CL/pocl_intfn.h
index a9f4553..7af08fd 100644
--- a/lib/CL/pocl_intfn.h
+++ b/lib/CL/pocl_intfn.h
@@ -68,6 +68,7 @@ POdeclsym(clEnqueueWriteBuffer)
 POdeclsym(clEnqueueWriteBufferRect)
 POdeclsym(clEnqueueWriteImage)
 POdeclsym(clEnqueueFillImage)
+POdeclsym(clEnqueueFillBuffer)
 POdeclsym(clFinish)
 POdeclsym(clFlush)
 POdeclsym(clGetCommandQueueInfo)
@@ -111,5 +112,15 @@ POdeclsym(clSetMemObjectDestructorCallback)
 POdeclsym(clSetUserEventStatus)
 POdeclsym(clUnloadCompiler)
 POdeclsym(clWaitForEvents)
+POdeclsym(clEnqueueSVMFree)
+POdeclsym(clEnqueueSVMMap)
+POdeclsym(clEnqueueSVMMemcpy)
+POdeclsym(clEnqueueSVMMemFill)
+POdeclsym(clEnqueueSVMUnmap)
+POdeclsym(clSVMFree)
+POdeclsym(clSVMAlloc)
+POdeclsym(clSetKernelArgSVMPointer)
+POdeclsym(clSetKernelExecInfo)
+POdeclsym(clCreateCommandQueueWithProperties)
 
 #endif
diff --git a/lib/CL/pocl_llvm.h b/lib/CL/pocl_llvm.h
index 90d1521..581f639 100644
--- a/lib/CL/pocl_llvm.h
+++ b/lib/CL/pocl_llvm.h
@@ -29,13 +29,15 @@
 extern "C" {
 #endif
 
+/* Returns the cpu name as reported by LLVM. */
+char* get_cpu_name();
+
 /* Compiles an .cl file into LLVM IR.
  */
 int pocl_llvm_build_program
 (cl_program program,
  unsigned device_i,
- const char* user_options,
- void **cache_lock, char *program_bc_path);
+ const char* user_options_cstr, char *program_bc_path);
 
 
 /* Retrieve metadata of the given kernel in the program to populate the
diff --git a/lib/CL/pocl_llvm_api.cc b/lib/CL/pocl_llvm_api.cc
index c99243a..ac4ec4b 100644
--- a/lib/CL/pocl_llvm_api.cc
+++ b/lib/CL/pocl_llvm_api.cc
@@ -53,26 +53,14 @@ using llvm::legacy::PassManager;
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-#include "llvm/Linker.h"
-#else
 #include "llvm/Linker/Linker.h"
 #include "llvm/PassAnalysisSupport.h"
-#endif
 
-#ifdef LLVM_3_2
-#include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Support/IRReader.h"
-#include "llvm/DataLayout.h"
-#else
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IRReader/IRReader.h"
-#endif
 
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormattedStream.h"
@@ -88,6 +76,7 @@ using llvm::legacy::PassManager;
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Support/Host.h"
 
 #include <iostream>
 #include <fstream>
@@ -178,16 +167,15 @@ unlink_source(FrontendOptions &fe)
 
 }
 
-// Compatibility function: this function existed up to LLVM 3.5
-// With 3.6 its name & signature changed
-#if !(defined LLVM_3_2 || defined LLVM_3_3 || \
-      defined LLVM_3_4 || defined LLVM_3_5)
+#ifndef LLVM_OLDER_THAN_3_8
+#define PassManager legacy::PassManager
+#endif
+
 static llvm::Module*
 ParseIRFile(const char* fname, SMDiagnostic &Err, llvm::LLVMContext &ctx)
 {
     return parseIRFile(fname, Err, ctx).release();
 }
-#endif
 
 static void get_build_log(cl_program program,
                          unsigned device_i,
@@ -223,11 +211,19 @@ static void get_build_log(cl_program program,
 
 int pocl_llvm_build_program(cl_program program, 
                             unsigned device_i,
-                            const char* user_options,
-                            void** cache_lock,
+                            const char* user_options_cstr,
                             char* program_bc_path)
 
 {
+  void* write_lock = NULL;
+  char tempfile[POCL_FILENAME_LENGTH];
+  tempfile[0] = 0;
+  llvm::Module **mod = NULL;
+  std::string user_options(user_options_cstr ? user_options_cstr : "");
+  std::string content;
+  llvm::raw_string_ostream sos(content);
+  size_t n = 0;
+
   llvm::MutexGuard lockHolder(kernelCompilerLock);
   InitializeLLVM();
 
@@ -263,6 +259,18 @@ int pocl_llvm_build_program(cl_program program,
       POCL_MEM_FREE(device_switches);
     }
 
+  llvm::StringRef extensions(device->extensions);
+
+  if (extensions.size() > 0) {
+    size_t e_start = 0, e_end = 0;
+    while (e_end < std::string::npos) {
+      e_end = extensions.find(' ', e_start);
+      llvm::StringRef tok = extensions.slice(e_start, e_end);
+      e_start = e_end + 1;
+      ss << "-D" << tok.str() << " ";
+    }
+  }
+
   // This can cause illegal optimizations when unaware
   // of the barrier semantics. -O2 is the default opt level in
   // Clang for OpenCL C and seems to affect the performance
@@ -279,7 +287,34 @@ int pocl_llvm_build_program(cl_program program,
   // The current directory is a standard search path.
   ss << "-I. ";
 
-   /* With fp-contract we get calls to fma with processors which do not
+  ss << user_options << " ";
+
+  if (device->endian_little)
+    ss << "-D__ENDIAN_LITTLE__=1 ";
+
+  if (device->image_support)
+    ss << "-D__IMAGE_SUPPORT__=1 ";
+
+  ss << "-DCL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" << device->global_var_max_size << " ";
+
+  if (user_options.find("cl-fast-relaxed-math") != std::string::npos)
+    ss << "-D__FAST_RELAXED_MATH__=1 ";
+
+  ss << "-D__OPENCL_VERSION__=" << device->cl_version_int << " ";
+
+  if (user_options.find("-cl-std=") == std::string::npos)
+    ss << "-cl-std=" << device->cl_version_std << " ";
+
+  std::string temp(ss.str());
+  size_t pos = temp.find("-cl-std=CL");
+  pos += 10;
+  int cl_std_major = temp.c_str()[pos] - '0';
+  int cl_std_minor = temp.c_str()[pos+2] - '0';
+  int cl_std_i = cl_std_major * 100 + cl_std_minor * 10;
+  // if (cl_std_i != 10) && (cl_std != 11) && (cl_std != 12) (cl_std != 20)
+  ss << "-D__OPENCL_C_VERSION__=" << cl_std_i << " ";
+
+  /* With fp-contract we get calls to fma with processors which do not
       have fma instructions. These ruin the performance. Better to have
       the mul+add separated in the IR. */
   ss << "-fno-builtin -ffp-contract=off ";
@@ -288,7 +323,11 @@ int pocl_llvm_build_program(cl_program program,
   ss << "-triple=" << device->llvm_target_triplet << " ";
   if (device->llvm_cpu != NULL)
     ss << "-target-cpu " << device->llvm_cpu << " ";
-  ss << user_options << " ";
+
+#ifdef DEBUG_POCL_LLVM_API
+  std::cout << "pocl_llvm_build_program: Final options: " << ss.str() << std::endl;
+#endif
+
   std::istream_iterator<std::string> begin(ss);
   std::istream_iterator<std::string> end;
   std::istream_iterator<std::string> i = begin;
@@ -320,7 +359,7 @@ int pocl_llvm_build_program(cl_program program,
        diags))
     {
       pocl_cache_create_program_cachedir(program, device_i, NULL, 0,
-        program_bc_path, cache_lock);
+        program_bc_path);
       get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
       return CL_INVALID_BUILD_OPTIONS;
     }
@@ -334,22 +373,14 @@ int pocl_llvm_build_program(cl_program program,
   la->CharIsSigned = true;
 
   // the per-file types don't seem to override this 
-  la->OpenCLVersion = 120;
+  la->OpenCLVersion = cl_std_i;
   la->FakeAddressSpaceMap = true;
   la->Blocks = true; //-fblocks
   la->MathErrno = false; // -fno-math-errno
   la->NoBuiltin = true;  // -fno-builtin
-#ifndef LLVM_3_2
   la->AsmBlocks = true;  // -fasm (?)
-#endif
 
   PreprocessorOptions &po = pocl_build.getPreprocessorOpts();
-  /* configure.ac sets a a few host specific flags for pthreads and
-     basic devices. */
-  if (device->has_64bit_long == 0)
-    po.addMacroDef("_CL_DISABLE_LONG");
-
-  po.addMacroDef("__OPENCL_VERSION__=120"); // -D__OPENCL_VERSION_=120
 
   std::string kernelh;
   if (pocl_get_bool_option("POCL_BUILDING", 0))
@@ -364,20 +395,15 @@ int pocl_llvm_build_program(cl_program program,
     }
   po.Includes.push_back(kernelh);
 
-  // TODO: user_options (clBuildProgram options) are not passed
-
   clang::TargetOptions &ta = pocl_build.getTargetOpts();
   ta.Triple = device->llvm_target_triplet;
   if (device->llvm_cpu != NULL)
     ta.CPU = device->llvm_cpu;
 
-  // printf("### Triple: %s, CPU: %s\n", ta.Triple.c_str(), ta.CPU.c_str());
-
-#ifdef LLVM_3_2
-  CI.createDiagnostics(0, NULL, diagsBuffer, false);
-#else
+#ifdef DEBUG_POCL_LLVM_API
+  std::cout << "### Triple: " << ta.Triple.c_str() <<  ", CPU: " << ta.CPU.c_str();
+#endif
   CI.createDiagnostics(diagsBuffer, false);
-#endif 
 
   FrontendOptions &fe = pocl_build.getFrontendOpts();
   // The CreateFromArgs created an stdin input which we should remove first.
@@ -397,7 +423,6 @@ int pocl_llvm_build_program(cl_program program,
   cg.VerifyModule = false;
 
   PreprocessorOutputOptions &poo = pocl_build.getPreprocessorOutputOpts();
-  //PreprocessorOutputOptions prep = CI.getPreprocessorOutputOpts();
   poo.ShowCPP = 1;
   poo.ShowComments = 0;
   poo.ShowLineMarkers = 0;
@@ -407,101 +432,87 @@ int pocl_llvm_build_program(cl_program program,
   poo.RewriteIncludes = 0;
 
   std::string saved_output(fe.OutputFile);
-
-  char tempfile[POCL_FILENAME_LENGTH];
   pocl_cache_mk_temp_name(tempfile);
-
   fe.OutputFile = tempfile;
 
   bool success = true;
-  clang::FrontendAction *action2 = NULL;
-  action2 = new clang::PrintPreprocessedAction();
-  success = CI.ExecuteAction(*action2);
+  clang::PrintPreprocessedAction action2;
+  clang::EmitLLVMOnlyAction action(GlobalContext());
+  success = CI.ExecuteAction(action2);
   if (!success)
-    {
-      pocl_cache_create_program_cachedir(program, device_i, NULL, 0,
-        program_bc_path, cache_lock);
-      get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
-      return CL_BUILD_PROGRAM_FAILURE;
-    }
+    goto ERROR_BUILDLOG;
 
   char *preprocessed_out;
   uint64_t size;
   pocl_read_file(tempfile, &preprocessed_out, &size);
+  pocl_remove(tempfile);
+  fe.OutputFile = saved_output;
+
   if (!preprocessed_out)
-    {
-      pocl_cache_create_program_cachedir(program, device_i, NULL, 0,
-        program_bc_path, cache_lock);
-      get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
-      return CL_BUILD_PROGRAM_FAILURE;
-    }
+    goto ERROR_BUILDLOG;
 
   pocl_cache_create_program_cachedir(program, device_i, preprocessed_out,
-                                     size, program_bc_path, cache_lock);
+                                     size, program_bc_path);
 
   POCL_MEM_FREE(preprocessed_out);
-  pocl_remove(tempfile);
 
   if (pocl_exists(program_bc_path)) {
     unlink_source(fe);
     return CL_SUCCESS;
   }
 
-  fe.OutputFile = saved_output;
-
   // TODO: use pch: it is possible to disable the strict checking for
   // the compilation flags used to compile it and the current translation
   // unit via the preprocessor options directly.
 
-  clang::CodeGenAction *action = NULL;
-  action = new clang::EmitLLVMOnlyAction(GlobalContext());
-  success = CI.ExecuteAction(*action);
+  success = CI.ExecuteAction(action);
+
+  unlink_source(fe);
 
   get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
 
-  // FIXME: memleak, see FIXME below
-  if (!success) return CL_BUILD_PROGRAM_FAILURE;
+  if (!success)
+    return CL_BUILD_PROGRAM_FAILURE;
 
-  llvm::Module **mod = (llvm::Module **)&program->llvm_irs[device_i];
+  mod = (llvm::Module **)&program->llvm_irs[device_i];
   if (*mod != NULL)
     delete (llvm::Module*)*mod;
 
-#if LLVM_VERSION_MAJOR==3 && LLVM_VERSION_MINOR<6
-  *mod = action->takeModule();
-#else
-  *mod = action->takeModule().release();
-#endif
+  *mod = action.takeModule().release();
 
   if (*mod == NULL)
     return CL_BUILD_PROGRAM_FAILURE;
 
+  write_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
+  assert(write_lock);
+
   /* Always retain program.bc. Its required in clBuildProgram */
   pocl_write_module(*mod, program_bc_path, 0);
 
   /* To avoid writing & reading the same back,
    * save program->binaries[i]
    */
-
-  std::string content;
-  llvm::raw_string_ostream sos(content);
   WriteBitcodeToFile(*mod, sos);
   sos.str(); // flush
 
   if (program->binaries[device_i])
     POCL_MEM_FREE(program->binaries[device_i]);
 
-  size_t n = content.size();
+  n = content.size();
   program->binary_sizes[device_i] = n;
   program->binaries[device_i] = (unsigned char *) malloc(n);
   std::memcpy(program->binaries[device_i], content.c_str(), n);
 
-  unlink_source(fe);
-
-  // FIXME: cannot delete action as it contains something the llvm::Module
-  // refers to. We should create it globally, at compiler initialization time.
-  //delete action;
+  pocl_cache_release_lock(write_lock);
 
   return CL_SUCCESS;
+
+ERROR_BUILDLOG:
+  pocl_cache_create_program_cachedir(program, device_i, NULL, 0,
+    program_bc_path);
+  get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
+  return CL_BUILD_PROGRAM_FAILURE;
+
 }
 
 int pocl_llvm_get_kernel_arg_metadata(const char* kernel_name,
@@ -519,13 +530,9 @@ int pocl_llvm_get_kernel_arg_metadata(const char* kernel_name,
   for (unsigned i = 0, e = opencl_kernels->getNumOperands(); i != e; ++i) {
     llvm::MDNode *kernel_iter = opencl_kernels->getOperand(i);
 
-#ifdef LLVM_OLDER_THAN_3_6
-    llvm::Function *kernel_prototype = llvm::cast<llvm::Function>(kernel_iter->getOperand(0));
-#else
     llvm::Function *kernel_prototype = 
       llvm::cast<llvm::Function>(
         dyn_cast<llvm::ValueAsMetadata>(kernel_iter->getOperand(0))->getValue());
-#endif
     std::string name = kernel_prototype->getName().str();
     if (name == kernel_name) {
       kernel_metadata = kernel_iter;
@@ -552,9 +559,6 @@ int pocl_llvm_get_kernel_arg_metadata(const char* kernel_name,
     std::string meta_name = meta_name_node->getString().str();
 
     for (unsigned j = 1; j != arg_num; ++j) {
-#ifdef LLVM_OLDER_THAN_3_6
-      llvm::Value *meta_arg_value = meta_node->getOperand(j);
-#else
       llvm::Value *meta_arg_value = NULL;
       if (isa<ValueAsMetadata>(meta_node->getOperand(j)))
         meta_arg_value = 
@@ -562,7 +566,6 @@ int pocl_llvm_get_kernel_arg_metadata(const char* kernel_name,
       else if (isa<ConstantAsMetadata>(meta_node->getOperand(j)))
         meta_arg_value = 
           dyn_cast<ConstantAsMetadata>(meta_node->getOperand(j))->getValue(); 
-#endif
       struct pocl_argument_info* current_arg = &kernel->arg_info[j-1];
 
       if (meta_arg_value != NULL && isa<ConstantInt>(meta_arg_value) && 
@@ -594,6 +597,8 @@ int pocl_llvm_get_kernel_arg_metadata(const char* kernel_name,
               current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL; break;
             case POCL_ADDRESS_SPACE_CONSTANT:
               current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT; break;
+            case POCL_ADDRESS_SPACE_GENERIC:
+              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE; break;
           }
         }
       }
@@ -688,9 +693,7 @@ int pocl_llvm_get_kernel_metadata(cl_program program,
   }
 
   DataLayout *TD = 0;
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  const std::string &ModuleDataLayout = input->getDataLayout();
-#elif (defined LLVM_OLDER_THAN_3_7)
+#ifdef LLVM_OLDER_THAN_3_7
   const std::string &ModuleDataLayout = input->getDataLayout()->getStringRepresentation();
 #else
   const std::string &ModuleDataLayout = input->getDataLayout().getStringRepresentation();
@@ -710,7 +713,7 @@ int pocl_llvm_get_kernel_metadata(cl_program program,
     funcName = kernel_function->getName().str();
     if (pocl::is_automatic_local(funcName, *i))
       {
-        locals.push_back(i);
+        locals.push_back(&*i);
       }
   }
 
@@ -794,13 +797,6 @@ int pocl_llvm_get_kernel_metadata(cl_program program,
   if (size_info) {
     for (unsigned i = 0, e = size_info->getNumOperands(); i != e; ++i) {
       llvm::MDNode *KernelSizeInfo = size_info->getOperand(i);
-#ifdef LLVM_OLDER_THAN_3_6
-      if (KernelSizeInfo->getOperand(0) != kernel_function) 
-        continue;
-      reqdx = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(1)))->getLimitedValue();
-      reqdy = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(2)))->getLimitedValue();
-      reqdz = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(3)))->getLimitedValue();
-#else
       if (dyn_cast<ValueAsMetadata>(
         KernelSizeInfo->getOperand(0).get())->getValue() != kernel_function) 
         continue;
@@ -813,7 +809,6 @@ int pocl_llvm_get_kernel_metadata(cl_program program,
       reqdz = (llvm::cast<ConstantInt>(
                  llvm::dyn_cast<ConstantAsMetadata>(
                    KernelSizeInfo->getOperand(3))->getValue()))->getLimitedValue();
-#endif
       break;
     }
   }
@@ -856,6 +851,15 @@ int pocl_llvm_get_kernel_metadata(cl_program program,
   return 0;
 }
 
+char* get_cpu_name() { /* hands back a malloc'd, NUL-terminated copy of LLVM's host CPU name */
+  const StringRef host_cpu = llvm::sys::getHostCPUName();
+  assert(!host_cpu.empty());
+  char* name_copy = static_cast<char*>(malloc(host_cpu.size() + 1));
+  strncpy(name_copy, host_cpu.data(), host_cpu.size());
+  name_copy[host_cpu.size()] = '\0'; /* the count given to strncpy leaves no room for a terminator */
+  return name_copy;
+}
+
 /* helpers copied from LLVM opt START */
 
 /* FIXME: these options should come from the cl_device, and
@@ -893,24 +897,51 @@ static llvm::TargetOptions GetTargetOptions() {
 #endif
   return Options;
 }
+
+/* For "distro" style kernel libs: pick, at runtime, the kernellib name matching the host CPU features. */
+const char* getX86KernelLibName() {
+  StringMap<bool> host;
+  llvm::sys::getHostCPUFeatures(host);
+  auto has = [&host](const char *feature) -> bool { return host[feature]; };
+
+  const bool sse2 = has("sse2");
+  if (!sse2)
+    POCL_ABORT("Pocl on x86_64 requires at least SSE2");
+
+  /* Prerequisites shared by several tiers; tiers are tested best-first. */
+  const bool cx16 = has("cx16");
+  const bool avx_base = has("avx") && cx16 && has("popcnt");
+  if (has("avx512f"))
+    return "avx512";
+  if (avx_base && has("avx2") && has("lzcnt") && has("f16c")
+      && has("fma") && has("bmi") && has("bmi2"))
+    return "avx2";
+  if (avx_base && has("xop") && has("fma4"))
+    return "avx_fma4";
+  if (avx_base)
+    return "avx";
+  if (has("sse4.1") && cx16)
+    return "sse41";
+  if (has("ssse3") && cx16)
+    return "ssse3";
+  return sse2 ? "sse2" : NULL; /* NULL only if POCL_ABORT above returned */
+}
+
+
 // Returns the TargetMachine instance or zero if no triple is provided.
 static TargetMachine* GetTargetMachine(cl_device_id device,
  const std::vector<std::string>& MAttrs=std::vector<std::string>()) {
 
   std::string Error;
   Triple TheTriple(device->llvm_target_triplet);
+
   std::string MCPU =  device->llvm_cpu ? device->llvm_cpu : "";
+
   const Target *TheTarget = 
     TargetRegistry::lookupTarget("", TheTriple, Error);
-  
-  // In LLVM 3.4 and earlier, the target registry falls back to 
-  // the cpp backend in case a proper match was not found. In 
-  // that case simply do not use target info in the compilation 
-  // because it can be an off-tree target not registered at
-  // this point (read: TCE).
-  if (!TheTarget || TheTarget->getName() == std::string("cpp")) {
-    return 0;
-  }
+  if (!TheTarget)
+    return nullptr;
+  assert(TheTarget->getName() != std::string("cpp"));
   // Package up features to be passed to target/subtarget
   std::string FeaturesStr;
   if (MAttrs.size()) {
@@ -974,33 +1005,28 @@ static PassManager& kernel_compiler_passes
     initializeVectorization(Registry);
     initializeIPO(Registry);
     initializeAnalysis(Registry);
+#ifdef LLVM_OLDER_THAN_3_8
     initializeIPA(Registry);
+#endif
     initializeTransformUtils(Registry);
     initializeInstCombine(Registry);
     initializeInstrumentation(Registry);
     initializeTarget(Registry);
   }
 
-#if !(defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
   // Scalarizer is in LLVM upstream since 3.4.
   const bool SCALARIZE = pocl_is_option_set("POCL_SCALARIZE_KERNELS");
-#else
-  const bool SCALARIZE = false;
-#endif
 
-#ifndef LLVM_3_2
 # ifdef LLVM_OLDER_THAN_3_7
   StringMap<llvm::cl::Option*> opts;
   llvm::cl::getRegisteredOptions(opts);
 # else
   StringMap<llvm::cl::Option *>& opts = llvm::cl::getRegisteredOptions();
 # endif
-#endif
 
   PassManager *Passes = new PassManager();
 
-#if defined LLVM_3_2
-#elif defined LLVM_OLDER_THAN_3_7
+#ifdef LLVM_OLDER_THAN_3_7
   // Need to setup the target info for target specific passes. */
   TargetMachine *Machine = GetTargetMachine(device);
 
@@ -1015,11 +1041,7 @@ static PassManager& kernel_compiler_passes
 
 
   if (module_data_layout != "") {
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-    Passes->add(new DataLayout(module_data_layout));
-#elif (defined LLVM_3_5)
-    Passes->add(new DataLayoutPass(DataLayout(module_data_layout)));
-#elif (defined LLVM_OLDER_THAN_3_7)
+#if (defined LLVM_OLDER_THAN_3_7)
     Passes->add(new DataLayoutPass());
 #endif
   }
@@ -1096,7 +1118,6 @@ static PassManager& kernel_compiler_passes
   const std::string wg_method = 
     pocl_get_string_option("POCL_WORK_GROUP_METHOD", "loopvec");
 
-#ifndef LLVM_3_2
   if (wg_method == "loopvec")
     {
 
@@ -1134,7 +1155,6 @@ static PassManager& kernel_compiler_passes
           O->addOccurrence(1, StringRef("debug-only"), StringRef("loop-vectorize"), false); 
 #endif
 
-#if !(defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
           if (pocl_get_bool_option("POCL_VECTORIZER_REMARKS", 0) == 1) {
             // Enable diagnostics from the loop vectorizer.
             O = opts["pass-remarks-missed"];
@@ -1152,14 +1172,12 @@ static PassManager& kernel_compiler_passes
             O->addOccurrence(1, StringRef("pass-remarks"), StringRef("loop-vectorize"), 
                              false); 
           }
-#endif
 
           O = opts["unroll-threshold"];
           assert(O && "could not find LLVM option 'unroll-threshold'");
           O->addOccurrence(1, StringRef("unroll-threshold"), StringRef("1"), false); 
         }
     } 
-#endif
 
   passes.push_back("instcombine");
   passes.push_back("STANDARD_OPTS");
@@ -1176,7 +1194,6 @@ static PassManager& kernel_compiler_passes
           Builder.OptLevel = 3;
           Builder.SizeLevel = 0;
 
-#if !(defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
           // These need to be setup in addition to invoking the passes
           // to get the vectorizers initialized properly.
           if (wg_method == "loopvec") {
@@ -1193,12 +1210,7 @@ static PassManager& kernel_compiler_passes
             Builder.BBVectorize = pocl_get_bool_option ("POCL_BBVECTORIZE", 0);
 #endif
           }
-#endif
 
-#if defined(LLVM_3_2) || defined(LLVM_3_3)
-          // SimplifyLibCalls has been removed in LLVM 3.4.
-          Builder.DisableSimplifyLibCalls = true;
-#endif
           Builder.populateModulePassManager(*Passes);
      
           continue;
@@ -1244,57 +1256,95 @@ kernel_library
   Triple triple(device->llvm_target_triplet);
 
   if (libs.find(device) != libs.end())
-    {
-      return libs[device];
-    }
+    return libs[device];
+
+  const char *subdir = "host";
+  bool is_host = true;
+#ifdef TCE_AVAILABLE
+  if (triple.getArch() == Triple::tce) {
+    subdir = "tce";
+    is_host = false;
+  }
+#endif
+#ifdef BUILD_HSA
+  if (triple.getArch() == Triple::hsail64) {
+    subdir = "hsail64";
+    is_host = false;
+  }
+#endif
+#ifdef AMDGCN_ENABLED
+  if (triple.getArch() == Triple::amdgcn) {
+    subdir = "amdgcn";
+    is_host = false;
+  }
+#endif
 
   // TODO sync with Nat Ferrus' indexed linking
   std::string kernellib;
-  if (pocl_get_bool_option("POCL_BUILDING", 0))
-    {
-      kernellib = BUILDDIR;
-      kernellib += "/lib/kernel/";
-      // TODO: get this from the target triplet: TCE, cellspu
-      if (triple.getArch() == Triple::tce) 
-        {
-          kernellib += "tce";
-        }
-#ifdef LLVM_3_2 
-      else if (triple.getArch() == Triple::cellspu) 
-        {
-          kernellib += "cellspu";
-        }
+  std::string kernellib_fallback;
+  if (pocl_get_bool_option("POCL_BUILDING", 0)) {
+    kernellib = BUILDDIR;
+    kernellib += "/lib/kernel/";
+    kernellib += subdir;
+    // TODO: get this from the TCE target triplet
+    kernellib += "/kernel-";
+    kernellib += device->llvm_target_triplet;
+    if (is_host) {
+#ifdef POCL_BUILT_WITH_CMAKE
+    kernellib += '-';
+    kernellib_fallback = kernellib;
+    kernellib_fallback += OCL_KERNEL_TARGET_CPU;
+    kernellib_fallback += ".bc";
+#ifdef KERNELLIB_HOST_DISTRO_VARIANTS
+    if (triple.getArch() == Triple::x86_64 ||
+        triple.getArch() == Triple::x86)
+      kernellib += getX86KernelLibName();
+    else
 #endif
-#ifdef BUILD_HSA
-      else if (triple.getArch() == Triple::hsail64) {
-          kernellib += "hsail64";
-      }
+      kernellib += device->llvm_cpu;
 #endif
-#ifdef AMDGCN_ENABLED
-      else if (triple.getArch() == Triple::amdgcn) {
-          kernellib += "amdgcn";
-      }
+    }
+  } else { // POCL_BUILDING == 0, use install dir
+    kernellib = PKGDATADIR;
+    kernellib += "/kernel-";
+    kernellib += device->llvm_target_triplet;
+    if (is_host) {
+#ifdef POCL_BUILT_WITH_CMAKE
+    kernellib += '-';
+    kernellib_fallback = kernellib;
+    kernellib_fallback += OCL_KERNEL_TARGET_CPU;
+    kernellib_fallback += ".bc";
+#ifdef KERNELLIB_HOST_DISTRO_VARIANTS
+    if (triple.getArch() == Triple::x86_64 ||
+        triple.getArch() == Triple::x86)
+      kernellib += getX86KernelLibName();
+    else
+#endif
+      kernellib += device->llvm_cpu;
 #endif
-      else 
-        {
-          kernellib += "host";
-        }
-      kernellib += "/kernel-"; 
-      kernellib += device->llvm_target_triplet;
-      kernellib +=".bc";   
+    }
+  }
+  kernellib += ".bc";
+
+  llvm::Module *lib;
+  SMDiagnostic Err;
+
+  if (pocl_exists(kernellib.c_str()))
+    {
+      POCL_MSG_PRINT_INFO("Using %s as the built-in lib.\n", kernellib.c_str());
+      lib = ParseIRFile(kernellib.c_str(), Err, *GlobalContext());
     }
   else
     {
-      kernellib = PKGDATADIR;
-      kernellib += "/kernel-";
-      kernellib += device->llvm_target_triplet;
-      kernellib += ".bc";
+      if (is_host && pocl_exists(kernellib_fallback.c_str()))
+        {
+          POCL_MSG_WARN("Using fallback %s as the built-in lib.\n",
+                        kernellib_fallback.c_str());
+          lib = ParseIRFile(kernellib_fallback.c_str(), Err, *GlobalContext());
+        }
+      else
+        POCL_ABORT("Kernel library file %s doesn't exist.", kernellib.c_str());
     }
-
-  POCL_MSG_PRINT_INFO("using %s as the built-in lib.\n", kernellib.c_str());
-
-  SMDiagnostic Err;
-  llvm::Module *lib = ParseIRFile(kernellib.c_str(), Err, *GlobalContext());
   assert (lib != NULL);
   libs[device] = lib;
 
@@ -1345,9 +1395,12 @@ int pocl_llvm_generate_workgroup_function(cl_device_id device, cl_kernel kernel,
 #ifdef DEBUG_POCL_LLVM_API        
       printf("### cloning the preloaded LLVM IR\n");
 #endif
-      input = 
-        llvm::CloneModule
-        ((llvm::Module*)kernel->program->llvm_irs[device_i]);
+      llvm::Module* p = (llvm::Module*)kernel->program->llvm_irs[device_i];
+#ifdef LLVM_OLDER_THAN_3_8
+      input = llvm::CloneModule(p);
+#else
+      input = (llvm::CloneModule(p)).release();
+#endif
     }
   else
     {
@@ -1359,6 +1412,21 @@ int pocl_llvm_generate_workgroup_function(cl_device_id device, cl_kernel kernel,
       input = ParseIRFile(program_bc_path, Err, *GlobalContext());
     }
 
+  /* Note this is a hack to get SPIR working. We'll be linking the
+   * host kernel library (plain LLVM IR) to the SPIR program.bc,
+   * so LLVM complains about incompatible DataLayouts. The proper solution
+   * would be to generate a SPIR kernel library
+   */
+  if (triple.getArch() == Triple::x86 || triple.getArch() == Triple::x86_64) {
+      if (input->getTargetTriple().substr(0, 6) == std::string("spir64")) {
+          input->setTargetTriple(triple.getTriple());
+          input->setDataLayout("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
+      } else if (input->getTargetTriple().substr(0, 4) == std::string("spir")) {
+          input->setTargetTriple(triple.getTriple());
+          input->setDataLayout("e-m:e-p:32:32-i64:64-f80:32-n8:16:32-S32");
+      }
+  }
+
   // Later this should be replaced with indexed linking of source code
   // and/or bitcode for each kernel.
   llvm::Module *libmodule = kernel_library(device);
@@ -1372,9 +1440,7 @@ int pocl_llvm_generate_workgroup_function(cl_device_id device, cl_kernel kernel,
   pocl::WGLocalSizeZ = local_z;
   KernelName = kernel->name;
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  kernel_compiler_passes(device, input->getDataLayout()).run(*input);
-#elif (defined LLVM_OLDER_THAN_3_7)
+#ifdef LLVM_OLDER_THAN_3_7
   kernel_compiler_passes(
       device,
       input->getDataLayout()->getStringRepresentation()).run(*input);
@@ -1388,6 +1454,7 @@ int pocl_llvm_generate_workgroup_function(cl_device_id device, cl_kernel kernel,
   pocl_cache_write_kernel_parallel_bc(input, program, device_i, kernel,
                                   local_x, local_y, local_z);
 
+  delete input;
   return 0;
 }
 
@@ -1500,13 +1567,9 @@ pocl_llvm_get_kernel_names( cl_program program, const char **knames, unsigned ma
 
   for (i=0; i<n; i++) {
     assert( md->getOperand(i)->getOperand(0) != NULL);
-#ifdef LLVM_OLDER_THAN_3_6
-    llvm::Function *k = cast<Function>(md->getOperand(i)->getOperand(0));
-#else
     llvm::Function *k = 
       cast<Function>(
         dyn_cast<llvm::ValueAsMetadata>(md->getOperand(i)->getOperand(0))->getValue());
-#endif
     if (i < max_num_krn)
       knames[i]= k->getName().data();
   }
@@ -1529,6 +1592,7 @@ pocl_llvm_codegen(cl_kernel kernel,
 
     llvm::Triple triple(device->llvm_target_triplet);
     llvm::TargetMachine *target = GetTargetMachine(device);
+
     llvm::Module *input = ParseIRFile(infilename, Err, *GlobalContext());
 
     PassManager PM;
@@ -1539,25 +1603,13 @@ pocl_llvm_codegen(cl_kernel kernel,
     llvm::TargetLibraryInfoWrapperPass *TLIPass = new TargetLibraryInfoWrapperPass(triple);
     PM.add(TLIPass);
 #endif
+#ifdef LLVM_OLDER_THAN_3_7
     if (target != NULL) {
-#if defined LLVM_3_2
-      PM.add(new TargetTransformInfo(target->getScalarTargetTransformInfo(),
-                                     target->getVectorTargetTransformInfo()));
-#elif (defined LLVM_OLDER_THAN_3_7)
       target->addAnalysisPasses(PM);
-#endif
     }
+#endif
 
     // TODO: get DataLayout from the 'device'
-#if defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4
-    const DataLayout *TD = NULL;
-    if (target != NULL)
-      TD = target->getDataLayout();
-    if (TD != NULL)
-        PM.add(new DataLayout(*TD));
-    else
-        PM.add(new DataLayout(input));
-#endif
     // TODO: better error check
 #ifdef LLVM_OLDER_THAN_3_7
     std::string data;
diff --git a/lib/CL/pocl_queue_util.c b/lib/CL/pocl_queue_util.c
index 6e3c6b4..7826f9e 100644
--- a/lib/CL/pocl_queue_util.c
+++ b/lib/CL/pocl_queue_util.c
@@ -31,6 +31,7 @@
 #include <string.h>
 #include "pocl_debug.h"
 #include "pocl_queue_util.h"
+#include "common.h"
 
 static pocl_lock_t queue_lock = POCL_LOCK_INITIALIZER;
 static size_t queue_size = 0;
@@ -46,6 +47,7 @@ void pocl_finish_all_queues()
     if (queue_list[i])
       POname(clFinish)(queue_list[i]);
   }
+  pocl_print_system_memory_stats();
 }
 
 void pocl_init_queue_list()
diff --git a/lib/CL/pocl_timing.c b/lib/CL/pocl_timing.c
new file mode 100644
index 0000000..6b97b8d
--- /dev/null
+++ b/lib/CL/pocl_timing.c
@@ -0,0 +1,153 @@
+/* OpenCL runtime library: OS-dependent time routines
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "config.h"
+
+#ifndef _MSC_VER
+#  ifndef __STDC_FORMAT_MACROS
+#    define __STDC_FORMAT_MACROS
+#  endif
+#  include <inttypes.h>
+#  ifdef HAVE_CLOCK_GETTIME
+#    include <time.h>
+#  else
+#    include <sys/time.h>
+#  endif
+#  ifdef __MACH__
+#    include <mach/clock.h>
+#    include <mach/mach.h>
+#  endif
+#  include <sys/resource.h>
+#  include <unistd.h>
+#else
+#  include "vccompat.hpp"
+#  include <stdint.h>
+#  include <stddef.h> // size_t
+#endif
+
+#include "pocl_timing.h"
+
+#ifdef HAVE_CLOCK_GETTIME
+// clock_gettime is (at best) nanosec res
+const unsigned pocl_timer_resolution = 1;
+#else
+#  ifndef _MSC_VER
+// gettimeofday() has (at best) microsec res
+const unsigned pocl_timer_resolution = 1000;
+#  else
+// the resolution of windows clock is "it depends"...
+const unsigned pocl_timer_resolution = 1000;
+#  endif
+#endif
+
+
+uint64_t pocl_gettimemono_ns() {
+  /* Nanosecond timestamp; monotonic where the platform provides such a clock. */
+#ifdef HAVE_CLOCK_GETTIME
+  struct timespec timespec;
+# ifdef __linux__
+  clock_gettime(CLOCK_MONOTONIC_RAW, &timespec);
+# elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
+  clock_gettime(CLOCK_UPTIME_FAST, &timespec);
+# else
+# warning Using clock_gettime with CLOCK_REALTIME for monotonic clocks
+  clock_gettime(CLOCK_REALTIME, &timespec);
+# endif
+  /* Widen before multiplying: tv_sec * 1e9 does not fit a 32-bit long. */
+  return (((uint64_t)timespec.tv_sec * 1000000000ULL) + timespec.tv_nsec);
+
+#elif defined(__APPLE__)
+  clock_serv_t cclock;
+  mach_timespec_t mts;
+  host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &cclock);
+  clock_get_time(cclock, &mts);
+  mach_port_deallocate(mach_task_self(), cclock);
+  return (((uint64_t)mts.tv_sec * 1000000000ULL) + mts.tv_nsec);
+
+#elif defined(_WIN32)
+  /* FILETIME is 100 ns ticks since 1601-01-01: rebase to 1970, scale to ns. */
+  FILETIME ft;
+  uint64_t res;
+  GetSystemTimeAsFileTime(&ft);
+  res = ((uint64_t)ft.dwHighDateTime << 32) | ft.dwLowDateTime;
+  res -= 116444736000000000ULL;
+  res *= 100;
+  return res;
+
+#else
+  struct timeval current;
+  gettimeofday(&current, NULL);
+  return ((uint64_t)current.tv_sec * 1000000 + current.tv_usec)*1000;
+
+#endif
+}
+
+int pocl_gettimereal(int *year, int *mon, int *day, int *hour, int *min, int *sec, int* nanosec)
+{
+#if defined(HAVE_CLOCK_GETTIME) || defined(__APPLE__) || defined(HAVE_GETTIMEOFDAY)
+  struct tm t;
+  time_t sec_input;
+  /* Outputs are broken-down UTC; *mon is 0-11 as in struct tm. Returns 0. */
+#if defined(HAVE_CLOCK_GETTIME)
+  struct timespec timespec;
+  clock_gettime(CLOCK_REALTIME, &timespec);
+  *nanosec = timespec.tv_nsec;
+  sec_input = timespec.tv_sec;
+#elif defined(__APPLE__)
+  clock_serv_t cclock;
+  mach_timespec_t mts;
+  host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+  clock_get_time(cclock, &mts);
+  mach_port_deallocate(mach_task_self(), cclock);
+  *nanosec = mts.tv_nsec;
+  sec_input = mts.tv_sec;
+#else /* gettimeofday */
+  struct timeval current;
+  gettimeofday(&current, NULL);
+  *nanosec = (int)current.tv_usec * 1000; /* microseconds -> nanoseconds */
+  sec_input = current.tv_sec;
+#endif
+  gmtime_r(&sec_input, &t);
+  *year = (t.tm_year + 1900);
+  *mon = t.tm_mon;
+  *day = t.tm_mday;
+  *hour = t.tm_hour;
+  *min = t.tm_min;
+  *sec = t.tm_sec;
+  return 0;
+
+#elif defined(_WIN32)
+  SYSTEMTIME st;
+  GetSystemTime(&st); /* UTC, like gmtime_r() in the POSIX branch */
+  *year = st.wYear;
+  *mon = st.wMonth - 1; /* SYSTEMTIME months are 1-12 */
+  *day = st.wDay;
+  *hour = st.wHour;
+  *min = st.wMinute;
+  *sec = st.wSecond;
+  *nanosec = st.wMilliseconds * 1000000;
+  return 0;
+#else
+#error Unknown system variant
+#endif
+}
diff --git a/lib/CL/pocl_timing.h b/lib/CL/pocl_timing.h
new file mode 100644
index 0000000..e01050c
--- /dev/null
+++ b/lib/CL/pocl_timing.h
@@ -0,0 +1,26 @@
+#ifndef POCL_TIMING_H
+#define POCL_TIMING_H
+
+#include <stdint.h> /* uint64_t, so this header is self-contained */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GNUC__
+#pragma GCC visibility push(hidden)
+#endif
+
+extern const unsigned pocl_timer_resolution; /* in nanoseconds */
+uint64_t pocl_gettimemono_ns();
+int pocl_gettimereal(int *year, int *mon, int *day, int *hour, int *min, int *sec, int* nanosec);
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/CL/pocl_util.c b/lib/CL/pocl_util.c
index 1edaf73..108410d 100644
--- a/lib/CL/pocl_util.c
+++ b/lib/CL/pocl_util.c
@@ -311,11 +311,11 @@ void pocl_command_enqueue (cl_command_queue command_queue,
 
 int pocl_buffer_boundcheck(cl_mem buffer, size_t offset, size_t size) {
   POCL_RETURN_ERROR_ON((offset > buffer->size), CL_INVALID_VALUE,
-            "offset(%zu) > buffer->size(%zu)", offset, buffer->size)
+            "offset(%zu) > buffer->size(%zu)", offset, buffer->size);
   POCL_RETURN_ERROR_ON((size > buffer->size), CL_INVALID_VALUE,
-            "size(%zu) > buffer->size(%zu)", size, buffer->size)
+            "size(%zu) > buffer->size(%zu)", size, buffer->size);
   POCL_RETURN_ERROR_ON((offset + size > buffer->size), CL_INVALID_VALUE,
-            "offset + size (%zu) > buffer->size(%zu)", (offset+size), buffer->size)
+            "offset + size (%zu) > buffer->size(%zu)", (offset+size), buffer->size);
   return CL_SUCCESS;
 }
 
@@ -374,18 +374,18 @@ int pocl_buffers_boundcheck(cl_mem src_buffer,
                             size_t dst_offset,
                             size_t size) {
   POCL_RETURN_ERROR_ON((src_offset > src_buffer->size), CL_INVALID_VALUE,
-            "src_offset(%zu) > src_buffer->size(%zu)", src_offset, src_buffer->size)
+            "src_offset(%zu) > src_buffer->size(%zu)", src_offset, src_buffer->size);
   POCL_RETURN_ERROR_ON((size > src_buffer->size), CL_INVALID_VALUE,
-            "size(%zu) > src_buffer->size(%zu)", size, src_buffer->size)
+            "size(%zu) > src_buffer->size(%zu)", size, src_buffer->size);
   POCL_RETURN_ERROR_ON((src_offset + size > src_buffer->size), CL_INVALID_VALUE,
-            "src_offset + size (%zu) > src_buffer->size(%zu)", (src_offset+size), src_buffer->size)
+            "src_offset + size (%zu) > src_buffer->size(%zu)", (src_offset+size), src_buffer->size);
 
   POCL_RETURN_ERROR_ON((dst_offset > dst_buffer->size), CL_INVALID_VALUE,
-            "dst_offset(%zu) > dst_buffer->size(%zu)", dst_offset, dst_buffer->size)
+            "dst_offset(%zu) > dst_buffer->size(%zu)", dst_offset, dst_buffer->size);
   POCL_RETURN_ERROR_ON((size > dst_buffer->size), CL_INVALID_VALUE,
-            "size(%zu) > dst_buffer->size(%zu)", size, dst_buffer->size)
+            "size(%zu) > dst_buffer->size(%zu)", size, dst_buffer->size);
   POCL_RETURN_ERROR_ON((dst_offset + size > dst_buffer->size), CL_INVALID_VALUE,
-            "dst_offset + size (%zu) > dst_buffer->size(%zu)", (dst_offset+size), dst_buffer->size)
+            "dst_offset + size (%zu) > dst_buffer->size(%zu)", (dst_offset+size), dst_buffer->size);
   return CL_SUCCESS;
 }
 
@@ -400,10 +400,10 @@ int pocl_buffers_overlap(cl_mem src_buffer,
   if (src_buffer == dst_buffer) {
     POCL_RETURN_ERROR_ON(((src_offset <= dst_offset) && (dst_offset <=
       (src_offset + size - 1))), CL_MEM_COPY_OVERLAP, "dst_offset lies inside \
-      the src region and the src_buffer == dst_buffer")
+      the src region and the src_buffer == dst_buffer");
     POCL_RETURN_ERROR_ON(((dst_offset <= src_offset) && (src_offset <=
       (dst_offset + size - 1))), CL_MEM_COPY_OVERLAP, "src_offset lies inside \
-      the dst region and the src_buffer == dst_buffer")
+      the dst region and the src_buffer == dst_buffer");
   }
 
   /* sub buffers overlap check  */
@@ -416,10 +416,10 @@ int pocl_buffers_overlap(cl_mem src_buffer,
 
     POCL_RETURN_ERROR_ON(((src_offset <= dst_offset) && (dst_offset <=
       (src_offset + size - 1))), CL_MEM_COPY_OVERLAP, "dst_offset lies inside \
-      the src region and src_buffer + dst_buffer are subbuffers of the same buffer")
+      the src region and src_buffer + dst_buffer are subbuffers of the same buffer");
     POCL_RETURN_ERROR_ON(((dst_offset <= src_offset) && (src_offset <=
       (dst_offset + size - 1))), CL_MEM_COPY_OVERLAP, "src_offset lies inside \
-      the dst region and src_buffer + dst_buffer are subbuffers of the same buffer")
+      the dst region and src_buffer + dst_buffer are subbuffers of the same buffer");
 
   }
 
@@ -551,3 +551,26 @@ cl_device_id * pocl_unique_device_list(const cl_device_id * in, cl_uint num, cl_
   *real = real_num;
   return out;
 }
+
+/* Fill in the context fields that are derived from its device list:
+ * min_max_mem_alloc_size (smallest max_mem_alloc_size of any device)
+ * and svm_allocdev (the device picked for SVM allocations, or NULL). */
+void pocl_setup_context(cl_context context)
+{
+  unsigned i;
+  cl_device_id dev;
+  context->min_max_mem_alloc_size = SIZE_MAX;
+  context->svm_allocdev = NULL;
+  for (i = 0; i < context->num_devices; ++i)
+    {
+      dev = context->devices[i];
+      if (dev->should_allocate_svm) /* the last such device wins */
+        context->svm_allocdev = dev;
+      if (dev->max_mem_alloc_size < context->min_max_mem_alloc_size)
+        context->min_max_mem_alloc_size = dev->max_mem_alloc_size;
+    }
+  /* Nothing asked for SVM: fall back to the first SVM-capable device. */
+  for (i = 0; context->svm_allocdev == NULL && i < context->num_devices; ++i)
+    if (DEVICE_IS_SVM_CAPABLE(context->devices[i]))
+      context->svm_allocdev = context->devices[i];
+}
diff --git a/lib/CL/pocl_util.h b/lib/CL/pocl_util.h
index 0be780c..eb29bb9 100644
--- a/lib/CL/pocl_util.h
+++ b/lib/CL/pocl_util.h
@@ -93,6 +93,9 @@ check_copy_overlap(const size_t src_offset[3],
                    const size_t region[3],
                    const size_t row_pitch, const size_t slice_pitch);
 
+void pocl_setup_context(cl_context context);
+
+
 /* Helpers for dealing with devices / subdevices */
 
 #define POCL_REAL_DEV(dev) (dev->parent_device ? dev->parent_device : dev)
@@ -100,13 +103,17 @@ check_copy_overlap(const size_t src_offset[3],
 cl_device_id * pocl_unique_device_list(const cl_device_id * in, cl_uint num, cl_uint *real);
 
 #define POCL_CHECK_DEV_IN_CMDQ                                               \
-  device = command_queue->device;                                            \
-  for (i = 0; i < command_queue->context->num_devices; ++i)                  \
+  do                                                                         \
     {                                                                        \
-      if (command_queue->context->devices[i] == POCL_REAL_DEV(device))       \
-        break;                                                               \
+      device = command_queue->device;                                        \
+      for (i = 0; i < command_queue->context->num_devices; ++i)              \
+        {                                                                    \
+          if (command_queue->context->devices[i] == POCL_REAL_DEV(device))   \
+            break;                                                           \
+        }                                                                    \
+      assert(i < command_queue->context->num_devices);                       \
     }                                                                        \
-  assert(i < command_queue->context->num_devices);
+  while (0)
 
 
 #ifdef __cplusplus
@@ -124,39 +131,48 @@ cl_device_id * pocl_unique_device_list(const cl_device_id * in, cl_uint num, cl_
  */
 
 #define POCL_RETURN_GETINFO_INNER(__SIZE__, MEMASSIGN)                  \
-    if (param_value) {                                                  \
-      if (param_value_size < __SIZE__) return CL_INVALID_VALUE;         \
-      MEMASSIGN;                                                        \
+  do                                                                    \
+    {                                                                   \
+      if (param_value)                                                  \
+        {                                                               \
+          if (param_value_size < __SIZE__) return CL_INVALID_VALUE;     \
+          MEMASSIGN;                                                    \
+        }                                                               \
+      if (param_value_size_ret)                                         \
+        *param_value_size_ret = __SIZE__;                               \
+      return CL_SUCCESS;                                                \
     }                                                                   \
-    if (param_value_size_ret)                                           \
-      *param_value_size_ret = __SIZE__;                                 \
-    return CL_SUCCESS;                                                  \
+  while (0)
 
 #define POCL_RETURN_GETINFO_SIZE(__SIZE__, __POINTER__)                 \
-  {                                                                     \
-    POCL_RETURN_GETINFO_INNER(__SIZE__,                                 \
-                memcpy(param_value, __POINTER__, __SIZE__))             \
-  }
+  POCL_RETURN_GETINFO_INNER(__SIZE__,                                   \
+    memcpy(param_value, __POINTER__, __SIZE__))
 
 #define POCL_RETURN_GETINFO_STR(__STR__)                                \
-  {                                                                     \
-    size_t const value_size = strlen(__STR__) + 1;                      \
-    POCL_RETURN_GETINFO_INNER(value_size,                               \
-                memcpy(param_value, __STR__, value_size))               \
-  }
+  do                                                                    \
+    {                                                                   \
+      size_t const value_size = strlen(__STR__) + 1;                    \
+      POCL_RETURN_GETINFO_INNER(value_size,                             \
+                  memcpy(param_value, __STR__, value_size));            \
+    }                                                                   \
+  while (0)
 
 #define POCL_RETURN_GETINFO(__TYPE__, __VALUE__)                        \
-  {                                                                     \
-    size_t const value_size = sizeof(__TYPE__);                         \
-    POCL_RETURN_GETINFO_INNER(value_size,                               \
-                *(__TYPE__*)param_value=__VALUE__)                      \
-  }
+  do                                                                    \
+    {                                                                   \
+      size_t const value_size = sizeof(__TYPE__);                       \
+      POCL_RETURN_GETINFO_INNER(value_size,                             \
+                  *(__TYPE__*)param_value=__VALUE__);                   \
+    }                                                                   \
+  while (0)
 
 #define POCL_RETURN_GETINFO_ARRAY(__TYPE__, __NUM__, __VALUE__)         \
-  {                                                                     \
-    size_t const value_size = __NUM__*sizeof(__TYPE__);                 \
-    POCL_RETURN_GETINFO_INNER(value_size,                               \
-                memcpy(param_value, __VALUE__, value_size))             \
-  }
+  do                                                                    \
+    {                                                                   \
+      size_t const value_size = __NUM__*sizeof(__TYPE__);               \
+      POCL_RETURN_GETINFO_INNER(value_size,                             \
+                  memcpy(param_value, __VALUE__, value_size));          \
+    }                                                                   \
+  while (0)
 
 #endif
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 091a448..4680166 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -26,18 +26,19 @@
 separate_arguments(HOST_CLANG_FLAGS)
 separate_arguments(HOST_LLC_FLAGS)
 
+add_subdirectory("kernel")
+
 add_subdirectory("llvmopencl")
 
 add_subdirectory("CL")
 
 #############################################################
 
-# must be after cell / tce b/c of LD_FLAGS_BIN
+# must be after tce b/c of LD_FLAGS_BIN
 # and after adding lib/CL subdir b/c of libpocl location,
 # but before poclu & examples & tests (they need these variables)
 
-# TODO possibly required
-set(OPENCL_CFLAGS "") #'$(PTHREAD_CFLAGS)'
+set(OPENCL_CFLAGS "${PTHREAD_CFLAGS}")
 
 if(TESTS_USE_ICD)
 
@@ -45,9 +46,11 @@ if(TESTS_USE_ICD)
     set(OPENCL_LIBS ${OPENCL_LIBRARIES})
   else()
     message(WARNING "Tests-use-ICD requested, but cannot find an OpenCL library; adding -lOpenCL to LDFLAGS")
-    set(OPENCL_LIBS "-lOpenCL")
+    set(OPENCL_LIBS "OpenCL")
   endif()
 
+  set(OPENCL_LIBS "${PTHREAD_LDFLAGS};${OPENCL_LIBS}")
+
   set(OPENCL_EXTLIBS ${OPENCL_LIBS})
   set(OPENCL_CMAKE_OPTIONS "")
 
@@ -82,7 +85,5 @@ set(OPENCL_EXTLIBS "${OPENCL_EXTLIBS}" PARENT_SCOPE)
 set(OPENCL_CMAKE_OPTIONS "${OPENCL_CMAKE_OPTIONS}" PARENT_SCOPE)
 set(OPENCL_CFLAGS "${OPENCL_CFLAGS}" PARENT_SCOPE)
 
-add_subdirectory("kernel")
-
 add_subdirectory("poclu")
 
diff --git a/lib/Makefile.in b/lib/Makefile.in
index 33a473a..3db83a9 100644
--- a/lib/Makefile.in
+++ b/lib/Makefile.in
@@ -252,6 +252,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -259,6 +260,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -276,8 +278,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -353,6 +353,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/lib/kernel/CMakeLists.txt b/lib/kernel/CMakeLists.txt
index 1ff09a7..7c68546 100644
--- a/lib/kernel/CMakeLists.txt
+++ b/lib/kernel/CMakeLists.txt
@@ -139,6 +139,7 @@ native_powr.cl
 native_recip.cl
 native_rsqrt.cl
 native_sin.cl
+native_sqrt.cl
 native_tan.cl
 nextafter.cl
 normalize.cl
@@ -146,6 +147,7 @@ popcount.cl
 pow.cl
 pown.cl
 powr.cl
+printf.c
 radians.cl
 read_image.cl
 recip.cl
@@ -222,6 +224,7 @@ mul24.cl
 mul_hi.cl
 nextafter.cl
 popcount.cl
+printf.c
 read_image.cl
 rhadd.cl
 rotate.cl
@@ -332,6 +335,7 @@ vecmathlib-pocl/native_powr.cl
 vecmathlib-pocl/native_recip.cl
 vecmathlib-pocl/native_rsqrt.cl
 vecmathlib-pocl/native_sin.cl
+vecmathlib-pocl/native_sqrt.cl
 vecmathlib-pocl/native_tan.cl
 vecmathlib-pocl/normalize.cl
 vecmathlib-pocl/pow.cc
@@ -357,11 +361,6 @@ vecmathlib-pocl/tanh.cc
 vecmathlib-pocl/tanpi.cl
 vecmathlib-pocl/trunc.cc)
 
-if(NEW_PRINTF_WORKS)
-  list(APPEND SOURCES_WITH_VML printf.c)
-  list(APPEND SOURCES_WITHOUT_VML printf.c)
-endif()
-
 #LKERNEL_HDRS_EXTRA - headers that should be dependencies
 set(KERNEL_DEPEND_HEADERS
 "${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib/vec_test.h"
@@ -403,6 +402,9 @@ set(KERNEL_DEPEND_HEADERS
 "${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib/vec_sse_float4.h"
 )
 
+set(KERNEL_BC_LIST "")
+set(KERNEL_TARGET_LIST "")
+
 #*********************************************************************
 
 if(OCL_TARGETS MATCHES "host")
@@ -411,15 +413,10 @@ endif()
 
 #*********************************************************************
 
-if(OCL_TARGETS MATCHES "cellspu")
-  add_subdirectory("cellspu")
-endif()
-
-#*********************************************************************
-
 if(OCL_TARGETS MATCHES "tce")
   add_subdirectory("tce")
 endif()
+
 #*********************************************************************
 
 if(OCL_TARGETS MATCHES "amdgcn")
@@ -432,5 +429,28 @@ endif()
 
 #*********************************************************************
 
+# "Escape" a list before passing to an external command
+string(REPLACE ";" "****" KERNEL_BC_LIST_ESCAPED "${KERNEL_BC_LIST}")
+
+add_custom_command( OUTPUT "${CMAKE_BINARY_DIR}/kernellib_hash.h"
+  COMMAND "${CMAKE_COMMAND}"
+      -DKERNEL_BC_LIST_ESCAPED='${KERNEL_BC_LIST_ESCAPED}'
+      -DINCLUDEDIR='${CMAKE_SOURCE_DIR}/include'
+      -DOUTPUT='${CMAKE_BINARY_DIR}/kernellib_hash.h'
+      -P "${CMAKE_SOURCE_DIR}/cmake/kernellib_hash.cmake"
+  DEPENDS ${KERNEL_BC_LIST}
+      "${CMAKE_SOURCE_DIR}/include/_kernel.h"
+      "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
+      "${CMAKE_SOURCE_DIR}/include/pocl_types.h"
+  COMMENT "Generating SHA1 of all kernel libs..."
+  VERBATIM)
+
+
+add_custom_target("kernellib_hash" ALL
+    DEPENDS "${CMAKE_BINARY_DIR}/kernellib_hash.h")
+foreach(DEP IN LISTS KERNEL_TARGET_LIST)
+  add_dependencies("kernellib_hash" ${DEP})
+endforeach()
+
 # TODO extra_dist in lib/kernel/Makefile.am:
 # EXTRA_DIST = *.c *.cl *.h *.ll      \
diff --git a/lib/kernel/Makefile.am b/lib/kernel/Makefile.am
index b4451af..a1d4071 100644
--- a/lib/kernel/Makefile.am
+++ b/lib/kernel/Makefile.am
@@ -37,7 +37,6 @@ EXTRA_DIST = $(srcdir)/*.c $(srcdir)/*.cl $(srcdir)/*.h $(srcdir)/*.ll			\
 	$(srcdir)/vecmathlib-pocl/*.cl			\
 	$(srcdir)/vecmathlib-pocl/*.h			\
 	$(srcdir)/vecmathlib-pocl/*.py			\
-	$(srcdir)/cellspu/* \
 	$(srcdir)/tce/* \
 	CMakeLists.txt
 
@@ -45,6 +44,6 @@ EXTRA_DIST = $(srcdir)/*.c $(srcdir)/*.cl $(srcdir)/*.h $(srcdir)/*.ll			\
 # Distclean the dirs regardless if the target was enabled or not
 # to cleanup the Makefiles.
 distclean-local:
-	for dir in host tce cellspu; do \
+	for dir in amdgcn host hsail tce; do \
 		make -C $$dir distclean;\
 	done;
diff --git a/lib/kernel/Makefile.in b/lib/kernel/Makefile.in
index 405036f..8344243 100644
--- a/lib/kernel/Makefile.in
+++ b/lib/kernel/Makefile.in
@@ -252,6 +252,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -259,6 +260,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -276,8 +278,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -353,6 +353,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -427,7 +428,6 @@ EXTRA_DIST = $(srcdir)/*.c $(srcdir)/*.cl $(srcdir)/*.h $(srcdir)/*.ll			\
 	$(srcdir)/vecmathlib-pocl/*.cl			\
 	$(srcdir)/vecmathlib-pocl/*.h			\
 	$(srcdir)/vecmathlib-pocl/*.py			\
-	$(srcdir)/cellspu/* \
 	$(srcdir)/tce/* \
 	CMakeLists.txt
 
@@ -748,7 +748,7 @@ uninstall-am:
 # Distclean the dirs regardless if the target was enabled or not
 # to cleanup the Makefiles.
 distclean-local:
-	for dir in host tce cellspu; do \
+	for dir in amdgcn host hsail tce; do \
 		make -C $$dir distclean;\
 	done;
 
diff --git a/lib/kernel/cellspu/CMakeLists.txt b/lib/kernel/cellspu/CMakeLists.txt
deleted file mode 100644
index 402cb0f..0000000
--- a/lib/kernel/cellspu/CMakeLists.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-#=============================================================================
-#   CMake build system files
-#
-#   Copyright (c) 2014 pocl developers
-#
-#   Permission is hereby granted, free of charge, to any person obtaining a copy
-#   of this software and associated documentation files (the "Software"), to deal
-#   in the Software without restriction, including without limitation the rights
-#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#   copies of the Software, and to permit persons to whom the Software is
-#   furnished to do so, subject to the following conditions:
-#
-#   The above copyright notice and this permission notice shall be included in
-#   all copies or substantial portions of the Software.
-#
-#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-#   THE SOFTWARE.
-#
-#=============================================================================
-
-include("bitcode_rules")
-
-# Use TARGET flags:
-#CLANG_FLAGS = @TARGET_CLANG_FLAGS@ -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off
-separate_arguments(CELL_TARGET_CLANG_FLAGS)
-set(CLANG_FLAGS ${CELL_TARGET_CLANG_FLAGS} "-Xclang" "-ffake-address-space-map" "-emit-llvm" "-ffp-contract=off")
-
-#LLC_FLAGS   = @TARGET_LLC_FLAGS@
-separate_arguments(CELL_TARGET_LLC_FLAGS)
-set(LLC_FLAGS ${CELL_TARGET_LLC_FLAGS})
-
-# TODO LLC_flags is used by kernel.bc target, but ld is unused
-#LD_FLAGS    = @TARGET_LD_FLAGS@
-
-#KERNEL_TARGET = tce (WRONG)
-make_kernel_bc(KERNEL_BC "cellspu" ${SOURCES_WITHOUT_VML})
-
-# just debug
-message(STATUS "Cell SPU Kernel BC: ${KERNEL_BC}")
-
-# a target is needed...
-add_custom_target("kernel_cell" ALL DEPENDS ${KERNEL_BC})
-
-install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_BC}"
-        DESTINATION "${POCL_INSTALL_PRIVATE_DATADIR}")
diff --git a/lib/kernel/cellspu/Makefile b/lib/kernel/cellspu/Makefile
deleted file mode 100644
index c166bdc..0000000
--- a/lib/kernel/cellspu/Makefile
+++ /dev/null
@@ -1,751 +0,0 @@
-# Makefile.in generated by automake 1.15 from Makefile.am.
-# lib/kernel/cellspu/Makefile.  Generated from Makefile.in by configure.
-
-# Copyright (C) 1994-2014 Free Software Foundation, Inc.
-
-# This Makefile.in is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
-# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE.
-
-
-
-# Process this file with automake to produce Makefile.in
-# 
-# Copyright (c) 2011 Universidad Rey Juan Carlos
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-# rules.mk - the make rules for building the kernel library
-# 
-# Copyright (c) 2013 Erik Schnetter
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-# The caller (the Makefile which includes this file) needs to set the
-# following variables:
-# 
-# KERNEL_TARGET
-# CLANG_FLAGS
-# LLC_FLAGS
-# LD_FLAGS
-
-# sources.mk - a list of all kernel source files
-# 
-# Copyright (c) 2011-2013 Universidad Rey Juan Carlos
-#                         Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-
-am__is_gnu_make = { \
-  if test -z '$(MAKELEVEL)'; then \
-    false; \
-  elif test -n '$(MAKE_HOST)'; then \
-    true; \
-  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
-    true; \
-  else \
-    false; \
-  fi; \
-}
-am__make_running_with_option = \
-  case $${target_option-} in \
-      ?) ;; \
-      *) echo "am__make_running_with_option: internal error: invalid" \
-              "target option '$${target_option-}' specified" >&2; \
-         exit 1;; \
-  esac; \
-  has_opt=no; \
-  sane_makeflags=$$MAKEFLAGS; \
-  if $(am__is_gnu_make); then \
-    sane_makeflags=$$MFLAGS; \
-  else \
-    case $$MAKEFLAGS in \
-      *\\[\ \	]*) \
-        bs=\\; \
-        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
-          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
-    esac; \
-  fi; \
-  skip_next=no; \
-  strip_trailopt () \
-  { \
-    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
-  }; \
-  for flg in $$sane_makeflags; do \
-    test $$skip_next = yes && { skip_next=no; continue; }; \
-    case $$flg in \
-      *=*|--*) continue;; \
-        -*I) strip_trailopt 'I'; skip_next=yes;; \
-      -*I?*) strip_trailopt 'I';; \
-        -*O) strip_trailopt 'O'; skip_next=yes;; \
-      -*O?*) strip_trailopt 'O';; \
-        -*l) strip_trailopt 'l'; skip_next=yes;; \
-      -*l?*) strip_trailopt 'l';; \
-      -[dEDm]) skip_next=yes;; \
-      -[JT]) skip_next=yes;; \
-    esac; \
-    case $$flg in \
-      *$$target_option*) has_opt=yes; break;; \
-    esac; \
-  done; \
-  test $$has_opt = yes
-am__make_dryrun = (target_option=n; $(am__make_running_with_option))
-am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
-pkgdatadir = $(datadir)/pocl
-pkgincludedir = $(includedir)/pocl
-pkglibdir = $(libdir)/pocl
-pkglibexecdir = $(libexecdir)/pocl
-am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
-install_sh_DATA = $(install_sh) -c -m 644
-install_sh_PROGRAM = $(install_sh) -c
-install_sh_SCRIPT = $(install_sh) -c
-INSTALL_HEADER = $(INSTALL_DATA)
-transform = $(program_transform_name)
-NORMAL_INSTALL = :
-PRE_INSTALL = :
-POST_INSTALL = :
-NORMAL_UNINSTALL = :
-PRE_UNINSTALL = :
-POST_UNINSTALL = :
-build_triplet = x86_64-unknown-linux-gnu
-host_triplet = x86_64-unknown-linux-gnu
-target_triplet = x86_64-unknown-linux-gnu
-am__append_1 = printf.c
-subdir = lib/kernel/cellspu
-ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
-	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
-	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
-	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \
-	$(top_srcdir)/configure.ac
-am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
-	$(ACLOCAL_M4)
-DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
-mkinstalldirs = $(install_sh) -d
-CONFIG_HEADER = $(top_builddir)/config.h
-CONFIG_CLEAN_FILES =
-CONFIG_CLEAN_VPATH_FILES =
-AM_V_P = $(am__v_P_$(V))
-am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY))
-am__v_P_0 = false
-am__v_P_1 = :
-AM_V_GEN = $(am__v_GEN_$(V))
-am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY))
-am__v_GEN_0 = @echo "  GEN     " $@;
-am__v_GEN_1 = 
-AM_V_at = $(am__v_at_$(V))
-am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY))
-am__v_at_0 = @
-am__v_at_1 = 
-SOURCES =
-DIST_SOURCES =
-am__can_run_installinfo = \
-  case $$AM_UPDATE_INFO_DIR in \
-    n|no|NO) false;; \
-    *) (install-info --version) >/dev/null 2>&1;; \
-  esac
-am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
-am__vpath_adj = case $$p in \
-    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
-    *) f=$$p;; \
-  esac;
-am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
-am__install_max = 40
-am__nobase_strip_setup = \
-  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
-am__nobase_strip = \
-  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
-am__nobase_list = $(am__nobase_strip_setup); \
-  for p in $$list; do echo "$$p $$p"; done | \
-  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
-  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
-    if (++n[$$2] == $(am__install_max)) \
-      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
-    END { for (dir in files) print dir, files[dir] }'
-am__base_list = \
-  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
-  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
-am__uninstall_files_from_dir = { \
-  test -z "$$files" \
-    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
-    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
-         $(am__cd) "$$dir" && rm -f $$files; }; \
-  }
-am__installdirs = "$(DESTDIR)$(pkgdatadir)"
-DATA = $(nodist_pkgdata_DATA)
-am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
-am__DIST_COMMON = $(srcdir)/../rules.mk $(srcdir)/../sources.mk \
-	$(srcdir)/Makefile.in
-DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
-ACLOCAL = ${SHELL} /tmp/pocl/config/missing aclocal-1.15
-AMTAR = $${TAR-tar}
-AM_DEFAULT_VERBOSITY = 1
-AR = ar
-AUTOCONF = ${SHELL} /tmp/pocl/config/missing autoconf
-AUTOHEADER = ${SHELL} /tmp/pocl/config/missing autoheader
-AUTOMAKE = ${SHELL} /tmp/pocl/config/missing automake-1.15
-AWK = gawk
-BOOST_CPPFLAGS = 
-BOOST_LDFLAGS = 
-BUILD_TIMESTAMP = 201510261514380399935
-CC = gcc
-CCDEPMODE = depmode=gcc3
-CFLAGS = -g -O2
-CLANG = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang
-CLANGXX = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang++
-CLANGXX_FLAGS = --target=x86_64-unknown-linux-gnu  -DVML_NO_IOSTREAM
-CLFLAGS =  -D__OPENCL_VERSION__=120
-CPP = gcc -E
-CPPFLAGS = 
-CXX = g++
-CXXCPP = g++ -E
-CXXDEPMODE = depmode=gcc3
-CXXFLAGS = -g -O2
-CYGPATH_W = echo
-DEFS = -DHAVE_CONFIG_H
-DEPDIR = .deps
-DLLTOOL = false
-DSYMUTIL = 
-DUMPBIN = 
-ECHO_C = 
-ECHO_N = -n
-ECHO_T = 
-EGREP = /usr/bin/grep -E
-EXEEXT = 
-FGREP = /usr/bin/grep -F
-FORCED_CLFLAGS = -Xclang -ffake-address-space-map -fno-math-errno -fblocks -fno-builtin -fasm -Wno-format
-GLEW_CFLAGS = -I/usr/include/libdrm 
-GLEW_LIBS = -lGLEW -lGLU -lGL 
-GREP = /usr/bin/grep
-HOST = x86_64-unknown-linux-gnu
-HOST_AS_FLAGS = 
-HOST_CLANG_FLAGS =  --target=x86_64-unknown-linux-gnu -march=bdver3 -D_CL_DISABLE_HALF
-HOST_CPU = x86_64
-HOST_LD_FLAGS = -shared -lm
-HOST_LLC_FLAGS = -relocation-model=pic -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver3
-HOST_SIZEOF_DOUBLE = 8
-HOST_SIZEOF_HALF = 2
-HOST_SIZEOF_LONG = 8
-HOST_SIZEOF_VOID_P = 8
-HSAILASM = /opt/HSA/bin/HSAILasm
-HSA_INCLUDES = -I/opt/HSA/include
-HSA_LIBS = -lhsa-runtime64 
-HWLOC_CFLAGS = -I/usr/include/libxml2 
-HWLOC_LIBS = -lhwloc 
-ICD_LD_FLAGS = -Wl,-Bsymbolic
-INSTALL = /usr/bin/install -c
-INSTALL_DATA = ${INSTALL} -m 644
-INSTALL_PROGRAM = ${INSTALL}
-INSTALL_SCRIPT = ${INSTALL}
-INSTALL_STRIP_PROGRAM = $(install_sh) -c -s
-KERNEL_COMPILER_LIB_VERSION = 6:0:0
-LD = /usr/bin/ld -m elf_x86_64
-LDFLAGS =  -L/opt/HSA/lib
-LD_FLAGS_BIN =  
-LIBOBJS = 
-LIBRARY_SUFFIX = .so
-LIBS = 
-LIBSPE_CFLAGS = 
-LIBSPE_LIBS = 
-LIBTOOL = $(SHELL) $(top_builddir)/libtool
-LIB_AGE_VERSION = 5
-LIB_CURRENT_VERSION = 6
-LIB_FIRST_VERSION = 1
-LIB_REVISION_VERSION = 0
-LIB_VERSION = 6:0:5
-LIPO = 
-LLC = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llc
-LLVM_AS = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-as
-LLVM_CONFIG = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-config
-LLVM_CXX_FLAGS = -I/home/LLVM_370_HSAIL_rwdi_NA_rtti/include  -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wnon-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -g -DNDEBUG  -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
-LLVM_LDFLAGS = -L/home/LLVM_370_HSAIL_rwdi_NA_rtti/lib  -lrt -ldl -lcurses -lpthread -lz -lm
-LLVM_LIBS = -lLLVMLTO -lLLVMObjCARCOpts -lLLVMLinker -lLLVMBitWriter -lLLVMTableGen -lLLVMMIRParser -lLLVMDebugInfoPDB -lLLVMOrcJIT -lLLVMIRReader -lLLVMAsmParser -lLLVMHSAILCodeGen -lLLVMHSAILDesc -lLLVMHSAILInfo -lLLVMHSAILAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUUtils -lLLVMAMDGPUDesc -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86I [...]
-LLVM_LINK = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-link
-LLVM_OPT = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/opt
-LLVM_VERSION = 3.7.0svn
-LN_S = ln -s
-LTDL_LIBS = -lltdl 
-LTLIBOBJS = 
-LT_SYS_LIBRARY_PATH = 
-MAKEINFO = ${SHELL} /tmp/pocl/config/missing makeinfo
-MANIFEST_TOOL = :
-MKDIR_P = /usr/bin/mkdir -p
-NM = /usr/bin/nm -B
-NMEDIT = 
-OBJDUMP = objdump
-OBJEXT = o
-OCL_ICD_CFLAGS = 
-OCL_ICD_LIBS = 
-OCL_KERNEL_ARCH = 
-OCL_KERNEL_TARGET = x86_64-unknown-linux-gnu
-OCL_KERNEL_TARGET_CPU = bdver3
-OCL_TARGETS = host hsail64
-OPENCL_CFLAGS = 
-OPENCL_CMAKE = 
-OPENCL_EXTLIBS = -lOpenCL 
-OPENCL_LIBS = -lOpenCL 
-OPT = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/opt
-OTOOL = 
-OTOOL64 = 
-PACKAGE = pocl
-PACKAGE_BUGREPORT = pocl-devel@lists.sourceforge.net
-PACKAGE_NAME = pocl
-PACKAGE_STRING = pocl 0.12
-PACKAGE_TARNAME = pocl
-PACKAGE_URL = 
-PACKAGE_VERSION = 0.12
-PATH_SEPARATOR = :
-PKG_CONFIG = /usr/bin/pkg-config
-PKG_CONFIG_LIBDIR = 
-PKG_CONFIG_PATH = 
-POAT_TESTSUITES =  hsa
-POCL_DEVICE_ADDRESS_BITS = 64
-PTHREAD_CC = gcc
-PTHREAD_CFLAGS = -pthread
-PTHREAD_LIBS = 
-RANLIB = ranlib
-SDL_CFLAGS = 
-SDL_LIBS = 
-SED = /usr/bin/sed
-SET_MAKE = 
-SHELL = /bin/sh
-STRIP = strip
-TARGET = x86_64-unknown-linux-gnu
-TARGET_CLANG_FLAGS = 
-TARGET_CPU = x86_64
-TARGET_LLC_FLAGS = 
-TARGET_SIZEOF_DOUBLE = 8
-TARGET_SIZEOF_HALF = 2
-TARGET_SIZEOF_LONG = 8
-TARGET_SIZEOF_VOID_P = 8
-TCECC = 
-TCEMC_AVAILABLE = 
-TCE_AVAILABLE = 
-TCE_CONFIG = 
-VERSION = 0.12
-abs_builddir = /tmp/pocl/lib/kernel/cellspu
-abs_srcdir = /tmp/pocl/lib/kernel/cellspu
-abs_top_builddir = /tmp/pocl
-abs_top_srcdir = /tmp/pocl
-ac_ct_AR = ar
-ac_ct_CC = gcc
-ac_ct_CXX = g++
-ac_ct_DUMPBIN = 
-acx_pthread_config = 
-am__include = include
-am__leading_dot = .
-am__quote = 
-am__tar = $${TAR-tar} chof - "$$tardir"
-am__untar = $${TAR-tar} xf -
-bindir = ${exec_prefix}/bin
-build = x86_64-unknown-linux-gnu
-build_alias = 
-build_cpu = x86_64
-build_os = linux-gnu
-build_vendor = unknown
-builddir = .
-datadir = ${datarootdir}
-datarootdir = ${prefix}/share
-docdir = ${datarootdir}/doc/${PACKAGE_TARNAME}
-dvidir = ${docdir}
-exec_prefix = ${prefix}
-host = x86_64-unknown-linux-gnu
-host_alias = 
-host_cpu = x86_64
-host_os = linux-gnu
-host_vendor = unknown
-htmldir = ${docdir}
-includedir = ${prefix}/include
-infodir = ${datarootdir}/info
-install_sh = ${SHELL} /tmp/pocl/config/install-sh
-libdir = ${exec_prefix}/lib
-libexecdir = ${exec_prefix}/libexec
-localedir = ${datarootdir}/locale
-localstatedir = ${prefix}/var
-mandir = ${datarootdir}/man
-mkdir_p = $(MKDIR_P)
-oldincludedir = /usr/include
-pdfdir = ${docdir}
-prefix = /usr/local
-program_transform_name = s,x,x,
-psdir = ${docdir}
-sbindir = ${exec_prefix}/sbin
-sharedstatedir = ${prefix}/com
-srcdir = .
-sysconfdir = /etc
-target = x86_64-unknown-linux-gnu
-target_alias = 
-target_cpu = x86_64
-target_os = linux-gnu
-target_vendor = unknown
-top_build_prefix = ../../../
-top_builddir = ../../..
-top_srcdir = ../../..
-KERNEL_TARGET = tce
-
-# Use TARGET flags:
-CLANG_FLAGS =  -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off
-LLC_FLAGS = 
-LD_FLAGS = @TARGET_LD_FLAGS@
-KERNEL_BC = kernel-${KERNEL_TARGET}.bc
-nodist_pkgdata_DATA = ${KERNEL_BC}
-
-# The standard list of kernel sources can be modified with
-# LKERNEL_SRCS_EXCLUDE, which removes files from the standard list,
-# and LKERNEL_SRCS_EXTRA, which adds extra files to the source list.
-LKERNEL_SRCS = \
-	$(filter-out ${LKERNEL_SRCS_EXCLUDE}, ${LKERNEL_SRCS_DEFAULT})	\
-	${LKERNEL_SRCS_EXTRA}
-
-OBJ = $(LKERNEL_SRCS:%=%.bc)
-CLEANFILES = kernel-${KERNEL_TARGET}.bc ${OBJ}
-LKERNEL_HDRS = image.h pocl_image_rw_utils.h templates.h
-LKERNEL_SRCS_DEFAULT = abs.cl abs_diff.cl acos.cl acosh.cl acospi.cl \
-	add_sat.cl all.cl any.cl as_type.cl asin.cl asinh.cl asinpi.cl \
-	async_work_group_copy.cl atan.cl atan2.cl atan2pi.cl atanh.cl \
-	atanpi.cl atomics.cl barrier.ll bitselect.cl cbrt.cl ceil.cl \
-	clamp.cl clamp_int.cl clz.cl convert_type.cl copysign.cl \
-	cos.cl cosh.cl cospi.cl cross.cl degrees.cl distance.cl \
-	divide.cl dot.cl erf.cl erfc.cl exp.cl exp10.cl exp2.cl \
-	expm1.cl fabs.cl fast_distance.cl fast_length.cl \
-	fast_normalize.cl fdim.cl floor.cl fma.cl fmax.cl fmin.cl \
-	fmod.cl fract.cl get_global_id.c get_global_offset.c \
-	get_global_size.c get_group_id.c get_image_depth.cl \
-	get_image_height.cl get_image_width.cl get_image_dim.cl \
-	get_local_id.c get_local_size.c get_num_groups.c \
-	get_work_dim.c hadd.cl hypot.cl ilogb.cl isequal.cl \
-	isfinite.cl isgreater.cl isgreaterequal.cl isinf.cl isless.cl \
-	islessequal.cl islessgreater.cl isnan.cl isnormal.cl \
-	isnotequal.cl isordered.cl isunordered.cl ldexp.cl length.cl \
-	lgamma.cl log.cl log10.cl log1p.cl log2.cl logb.cl mad.cl \
-	mad24.cl mad_hi.cl mad_sat.cl max.cl max_i.cl maxmag.cl min.cl \
-	min_i.cl minmag.cl mix.cl mul24.cl mul_hi.cl nan.cl \
-	native_cos.cl native_exp.cl native_exp10.cl native_exp2.cl \
-	native_log.cl native_log10.cl native_log2.cl native_powr.cl \
-	native_recip.cl native_rsqrt.cl native_sin.cl native_sqrt.cl \
-	native_tan.cl nextafter.cl normalize.cl popcount.cl pow.cl \
-	pown.cl powr.cl radians.cl read_image.cl recip.cl remainder.cl \
-	rhadd.cl rint.cl rootn.cl rotate.cl round.cl rsqrt.cl \
-	select.cl shuffle.cl sign.cl signbit.cl sin.cl sincos.cl \
-	sinh.cl sinpi.cl smoothstep.cl sqrt.cl step.cl sub_sat.cl \
-	tan.cl tanh.cl tanpi.cl tgamma.cl trunc.cl upsample.cl \
-	vload.cl vload_half.cl vstore.cl vstore_half.cl \
-	wait_group_events.cl write_image.cl $(am__append_1)
-EXTRA_DIST = CMakeLists.txt
-all: all-am
-
-.SUFFIXES:
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(srcdir)/../rules.mk $(srcdir)/../sources.mk $(am__configure_deps)
-	@for dep in $?; do \
-	  case '$(am__configure_deps)' in \
-	    *$$dep*) \
-	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
-	        && { if test -f $@; then exit 0; else break; fi; }; \
-	      exit 1;; \
-	  esac; \
-	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign lib/kernel/cellspu/Makefile'; \
-	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign lib/kernel/cellspu/Makefile
-Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
-	@case '$?' in \
-	  *config.status*) \
-	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
-	  *) \
-	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
-	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
-	esac;
-$(srcdir)/../rules.mk $(srcdir)/../sources.mk $(am__empty):
-
-$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-
-$(top_srcdir)/configure:  $(am__configure_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(am__aclocal_m4_deps):
-
-mostlyclean-libtool:
-	-rm -f *.lo
-
-clean-libtool:
-	-rm -rf .libs _libs
-install-nodist_pkgdataDATA: $(nodist_pkgdata_DATA)
-	@$(NORMAL_INSTALL)
-	@list='$(nodist_pkgdata_DATA)'; test -n "$(pkgdatadir)" || list=; \
-	if test -n "$$list"; then \
-	  echo " $(MKDIR_P) '$(DESTDIR)$(pkgdatadir)'"; \
-	  $(MKDIR_P) "$(DESTDIR)$(pkgdatadir)" || exit 1; \
-	fi; \
-	for p in $$list; do \
-	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
-	  echo "$$d$$p"; \
-	done | $(am__base_list) | \
-	while read files; do \
-	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pkgdatadir)'"; \
-	  $(INSTALL_DATA) $$files "$(DESTDIR)$(pkgdatadir)" || exit $$?; \
-	done
-
-uninstall-nodist_pkgdataDATA:
-	@$(NORMAL_UNINSTALL)
-	@list='$(nodist_pkgdata_DATA)'; test -n "$(pkgdatadir)" || list=; \
-	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
-	dir='$(DESTDIR)$(pkgdatadir)'; $(am__uninstall_files_from_dir)
-tags TAGS:
-
-ctags CTAGS:
-
-cscope cscopelist:
-
-
-distdir: $(DISTFILES)
-	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	list='$(DISTFILES)'; \
-	  dist_files=`for file in $$list; do echo $$file; done | \
-	  sed -e "s|^$$srcdirstrip/||;t" \
-	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
-	case $$dist_files in \
-	  */*) $(MKDIR_P) `echo "$$dist_files" | \
-			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
-			   sort -u` ;; \
-	esac; \
-	for file in $$dist_files; do \
-	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
-	  if test -d $$d/$$file; then \
-	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
-	    if test -d "$(distdir)/$$file"; then \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
-	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
-	  else \
-	    test -f "$(distdir)/$$file" \
-	    || cp -p $$d/$$file "$(distdir)/$$file" \
-	    || exit 1; \
-	  fi; \
-	done
-check-am: all-am
-check: check-am
-all-am: Makefile $(DATA)
-installdirs:
-	for dir in "$(DESTDIR)$(pkgdatadir)"; do \
-	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
-	done
-install: install-am
-install-exec: install-exec-am
-install-data: install-data-am
-uninstall: uninstall-am
-
-install-am: all-am
-	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
-
-installcheck: installcheck-am
-install-strip:
-	if test -z '$(STRIP)'; then \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	      install; \
-	else \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
-	fi
-mostlyclean-generic:
-
-clean-generic:
-	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
-
-distclean-generic:
-	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
-
-maintainer-clean-generic:
-	@echo "This command is intended for maintainers to use"
-	@echo "it deletes files that may require special tools to rebuild."
-clean: clean-am
-
-clean-am: clean-generic clean-libtool mostlyclean-am
-
-distclean: distclean-am
-	-rm -f Makefile
-distclean-am: clean-am distclean-generic
-
-dvi: dvi-am
-
-dvi-am:
-
-html: html-am
-
-html-am:
-
-info: info-am
-
-info-am:
-
-install-data-am: install-nodist_pkgdataDATA
-
-install-dvi: install-dvi-am
-
-install-dvi-am:
-
-install-exec-am:
-
-install-html: install-html-am
-
-install-html-am:
-
-install-info: install-info-am
-
-install-info-am:
-
-install-man:
-
-install-pdf: install-pdf-am
-
-install-pdf-am:
-
-install-ps: install-ps-am
-
-install-ps-am:
-
-installcheck-am:
-
-maintainer-clean: maintainer-clean-am
-	-rm -f Makefile
-maintainer-clean-am: distclean-am maintainer-clean-generic
-
-mostlyclean: mostlyclean-am
-
-mostlyclean-am: mostlyclean-generic mostlyclean-libtool
-
-pdf: pdf-am
-
-pdf-am:
-
-ps: ps-am
-
-ps-am:
-
-uninstall-am: uninstall-nodist_pkgdataDATA
-
-.MAKE: install-am install-strip
-
-.PHONY: all all-am check check-am clean clean-generic clean-libtool \
-	cscopelist-am ctags-am distclean distclean-generic \
-	distclean-libtool distdir dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am install-dvi \
-	install-dvi-am install-exec install-exec-am install-html \
-	install-html-am install-info install-info-am install-man \
-	install-nodist_pkgdataDATA install-pdf install-pdf-am \
-	install-ps install-ps-am install-strip installcheck \
-	installcheck-am installdirs maintainer-clean \
-	maintainer-clean-generic mostlyclean mostlyclean-generic \
-	mostlyclean-libtool pdf pdf-am ps ps-am tags-am uninstall \
-	uninstall-am uninstall-nodist_pkgdataDATA
-
-.PRECIOUS: Makefile
-
-
-all: ${KERNEL_BC}
-
-vpath %.c  ../../../lib/kernel
-vpath %.cc ../../../lib/kernel
-vpath %.cl ../../../lib/kernel
-vpath %.ll ../../../lib/kernel
-
-# Generate a precompiled header for the built-in function
-# declarations, in case supported by the target.
-
-# Note: the precompiled header must be compiled with the same features
-# as the kernels will be. That is, use exactly the same frontend
-# feature switches. Otherwise it will fail when compiling the kernel
-# against the precompiled header.
-_kernel.h.pch: ../../../include/${TARGET_DIR}/types.h ../../../include/_kernel.h
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang -Xclang -ffake-address-space-map -fno-math-errno -fblocks -fno-builtin -fasm -Wno-format  -D__OPENCL_VERSION__=120 -Xclang -ffake-address-space-map -c -target ${KERNEL_TARGET} -x cl \
-	-include ../../../include/${TARGET_DIR}/types.h \
-	-Xclang -emit-pch ../../../include/_kernel.h -o _kernel.h.pch 
-
-# Rules to compile the different kernel library source file types into
-# LLVM bitcode
-%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
-	mkdir -p ${dir $@}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang ${CLANG_FLAGS} ${CLFLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $< 
-%.cc.bc: %.cc ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
-	mkdir -p ${dir $@}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang++ ${CLANG_FLAGS} ${CLANGXX_FLAGS} -c -o $@ $< -include ${abs_top_srcdir}/include/pocl_features.h
-%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
-	mkdir -p ${dir $@}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang ${CLANG_FLAGS} -x cl ${CLFLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
-%.ll.bc: %.ll
-	mkdir -p ${dir $@}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-as -o $@ $<
-
-# Optimize the bitcode library to speed up optimization times for the
-# OpenCL kernels
-${KERNEL_BC}: ${OBJ}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-link $^ -o - | /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/opt ${LLC_FLAGS} ${KERNEL_LIB_OPT_FLAGS} -O3 -fp-contract=off -o $@
-
-# vim: set noexpandtab ts=8:
-
-# Tell versions [3.59,3.63) of GNU make to not export all variables.
-# Otherwise a system limit (for SysV at least) may be exceeded.
-.NOEXPORT:
diff --git a/lib/kernel/cellspu/Makefile.in b/lib/kernel/cellspu/Makefile.in
deleted file mode 100644
index a6f15af..0000000
--- a/lib/kernel/cellspu/Makefile.in
+++ /dev/null
@@ -1,751 +0,0 @@
-# Makefile.in generated by automake 1.15 from Makefile.am.
-# @configure_input@
-
-# Copyright (C) 1994-2014 Free Software Foundation, Inc.
-
-# This Makefile.in is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
-# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE.
-
-@SET_MAKE@
-
-# Process this file with automake to produce Makefile.in
-# 
-# Copyright (c) 2011 Universidad Rey Juan Carlos
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-# rules.mk - the make rules for building the kernel library
-# 
-# Copyright (c) 2013 Erik Schnetter
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-# The caller (the Makefile which includes this file) needs to set the
-# following variables:
-# 
-# KERNEL_TARGET
-# CLANG_FLAGS
-# LLC_FLAGS
-# LD_FLAGS
-
-# sources.mk - a list of all kernel source files
-# 
-# Copyright (c) 2011-2013 Universidad Rey Juan Carlos
-#                         Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-VPATH = @srcdir@
-am__is_gnu_make = { \
-  if test -z '$(MAKELEVEL)'; then \
-    false; \
-  elif test -n '$(MAKE_HOST)'; then \
-    true; \
-  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
-    true; \
-  else \
-    false; \
-  fi; \
-}
-am__make_running_with_option = \
-  case $${target_option-} in \
-      ?) ;; \
-      *) echo "am__make_running_with_option: internal error: invalid" \
-              "target option '$${target_option-}' specified" >&2; \
-         exit 1;; \
-  esac; \
-  has_opt=no; \
-  sane_makeflags=$$MAKEFLAGS; \
-  if $(am__is_gnu_make); then \
-    sane_makeflags=$$MFLAGS; \
-  else \
-    case $$MAKEFLAGS in \
-      *\\[\ \	]*) \
-        bs=\\; \
-        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
-          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
-    esac; \
-  fi; \
-  skip_next=no; \
-  strip_trailopt () \
-  { \
-    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
-  }; \
-  for flg in $$sane_makeflags; do \
-    test $$skip_next = yes && { skip_next=no; continue; }; \
-    case $$flg in \
-      *=*|--*) continue;; \
-        -*I) strip_trailopt 'I'; skip_next=yes;; \
-      -*I?*) strip_trailopt 'I';; \
-        -*O) strip_trailopt 'O'; skip_next=yes;; \
-      -*O?*) strip_trailopt 'O';; \
-        -*l) strip_trailopt 'l'; skip_next=yes;; \
-      -*l?*) strip_trailopt 'l';; \
-      -[dEDm]) skip_next=yes;; \
-      -[JT]) skip_next=yes;; \
-    esac; \
-    case $$flg in \
-      *$$target_option*) has_opt=yes; break;; \
-    esac; \
-  done; \
-  test $$has_opt = yes
-am__make_dryrun = (target_option=n; $(am__make_running_with_option))
-am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
-pkgdatadir = $(datadir)/@PACKAGE@
-pkgincludedir = $(includedir)/@PACKAGE@
-pkglibdir = $(libdir)/@PACKAGE@
-pkglibexecdir = $(libexecdir)/@PACKAGE@
-am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
-install_sh_DATA = $(install_sh) -c -m 644
-install_sh_PROGRAM = $(install_sh) -c
-install_sh_SCRIPT = $(install_sh) -c
-INSTALL_HEADER = $(INSTALL_DATA)
-transform = $(program_transform_name)
-NORMAL_INSTALL = :
-PRE_INSTALL = :
-POST_INSTALL = :
-NORMAL_UNINSTALL = :
-PRE_UNINSTALL = :
-POST_UNINSTALL = :
-build_triplet = @build@
-host_triplet = @host@
-target_triplet = @target@
-@NEW_PRINTF_WORKS_TRUE@am__append_1 = printf.c
-subdir = lib/kernel/cellspu
-ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
-	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
-	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
-	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \
-	$(top_srcdir)/configure.ac
-am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
-	$(ACLOCAL_M4)
-DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
-mkinstalldirs = $(install_sh) -d
-CONFIG_HEADER = $(top_builddir)/config.h
-CONFIG_CLEAN_FILES =
-CONFIG_CLEAN_VPATH_FILES =
-AM_V_P = $(am__v_P_@AM_V@)
-am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
-am__v_P_0 = false
-am__v_P_1 = :
-AM_V_GEN = $(am__v_GEN_@AM_V@)
-am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
-am__v_GEN_0 = @echo "  GEN     " $@;
-am__v_GEN_1 = 
-AM_V_at = $(am__v_at_@AM_V@)
-am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
-am__v_at_0 = @
-am__v_at_1 = 
-SOURCES =
-DIST_SOURCES =
-am__can_run_installinfo = \
-  case $$AM_UPDATE_INFO_DIR in \
-    n|no|NO) false;; \
-    *) (install-info --version) >/dev/null 2>&1;; \
-  esac
-am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
-am__vpath_adj = case $$p in \
-    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
-    *) f=$$p;; \
-  esac;
-am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
-am__install_max = 40
-am__nobase_strip_setup = \
-  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
-am__nobase_strip = \
-  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
-am__nobase_list = $(am__nobase_strip_setup); \
-  for p in $$list; do echo "$$p $$p"; done | \
-  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
-  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
-    if (++n[$$2] == $(am__install_max)) \
-      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
-    END { for (dir in files) print dir, files[dir] }'
-am__base_list = \
-  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
-  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
-am__uninstall_files_from_dir = { \
-  test -z "$$files" \
-    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
-    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
-         $(am__cd) "$$dir" && rm -f $$files; }; \
-  }
-am__installdirs = "$(DESTDIR)$(pkgdatadir)"
-DATA = $(nodist_pkgdata_DATA)
-am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
-am__DIST_COMMON = $(srcdir)/../rules.mk $(srcdir)/../sources.mk \
-	$(srcdir)/Makefile.in
-DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
-ACLOCAL = @ACLOCAL@
-AMTAR = @AMTAR@
-AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
-AR = @AR@
-AUTOCONF = @AUTOCONF@
-AUTOHEADER = @AUTOHEADER@
-AUTOMAKE = @AUTOMAKE@
-AWK = @AWK@
-BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
-BOOST_LDFLAGS = @BOOST_LDFLAGS@
-BUILD_TIMESTAMP = @BUILD_TIMESTAMP@
-CC = @CC@
-CCDEPMODE = @CCDEPMODE@
-CFLAGS = @CFLAGS@
-CLANG = @CLANG@
-CLANGXX = @CLANGXX@
-CLANGXX_FLAGS = @CLANGXX_FLAGS@
-CLFLAGS = @CLFLAGS@
-CPP = @CPP@
-CPPFLAGS = @CPPFLAGS@
-CXX = @CXX@
-CXXCPP = @CXXCPP@
-CXXDEPMODE = @CXXDEPMODE@
-CXXFLAGS = @CXXFLAGS@
-CYGPATH_W = @CYGPATH_W@
-DEFS = @DEFS@
-DEPDIR = @DEPDIR@
-DLLTOOL = @DLLTOOL@
-DSYMUTIL = @DSYMUTIL@
-DUMPBIN = @DUMPBIN@
-ECHO_C = @ECHO_C@
-ECHO_N = @ECHO_N@
-ECHO_T = @ECHO_T@
-EGREP = @EGREP@
-EXEEXT = @EXEEXT@
-FGREP = @FGREP@
-FORCED_CLFLAGS = @FORCED_CLFLAGS@
-GLEW_CFLAGS = @GLEW_CFLAGS@
-GLEW_LIBS = @GLEW_LIBS@
-GREP = @GREP@
-HOST = @HOST@
-HOST_AS_FLAGS = @HOST_AS_FLAGS@
-HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
-HOST_CPU = @HOST_CPU@
-HOST_LD_FLAGS = @HOST_LD_FLAGS@
-HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
-HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
-HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
-HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
-HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
-HSAILASM = @HSAILASM@
-HSA_INCLUDES = @HSA_INCLUDES@
-HSA_LIBS = @HSA_LIBS@
-HWLOC_CFLAGS = @HWLOC_CFLAGS@
-HWLOC_LIBS = @HWLOC_LIBS@
-ICD_LD_FLAGS = @ICD_LD_FLAGS@
-INSTALL = @INSTALL@
-INSTALL_DATA = @INSTALL_DATA@
-INSTALL_PROGRAM = @INSTALL_PROGRAM@
-INSTALL_SCRIPT = @INSTALL_SCRIPT@
-INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
-KERNEL_COMPILER_LIB_VERSION = @KERNEL_COMPILER_LIB_VERSION@
-LD = @LD@
-LDFLAGS = @LDFLAGS@
-LD_FLAGS_BIN = @LD_FLAGS_BIN@
-LIBOBJS = @LIBOBJS@
-LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
-LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
-LIBTOOL = @LIBTOOL@
-LIB_AGE_VERSION = @LIB_AGE_VERSION@
-LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
-LIB_FIRST_VERSION = @LIB_FIRST_VERSION@
-LIB_REVISION_VERSION = @LIB_REVISION_VERSION@
-LIB_VERSION = @LIB_VERSION@
-LIPO = @LIPO@
-LLC = @LLC@
-LLVM_AS = @LLVM_AS@
-LLVM_CONFIG = @LLVM_CONFIG@
-LLVM_CXX_FLAGS = @LLVM_CXX_FLAGS@
-LLVM_LDFLAGS = @LLVM_LDFLAGS@
-LLVM_LIBS = @LLVM_LIBS@
-LLVM_LINK = @LLVM_LINK@
-LLVM_OPT = @LLVM_OPT@
-LLVM_VERSION = @LLVM_VERSION@
-LN_S = @LN_S@
-LTDL_LIBS = @LTDL_LIBS@
-LTLIBOBJS = @LTLIBOBJS@
-LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
-MAKEINFO = @MAKEINFO@
-MANIFEST_TOOL = @MANIFEST_TOOL@
-MKDIR_P = @MKDIR_P@
-NM = @NM@
-NMEDIT = @NMEDIT@
-OBJDUMP = @OBJDUMP@
-OBJEXT = @OBJEXT@
-OCL_ICD_CFLAGS = @OCL_ICD_CFLAGS@
-OCL_ICD_LIBS = @OCL_ICD_LIBS@
-OCL_KERNEL_ARCH = @OCL_KERNEL_ARCH@
-OCL_KERNEL_TARGET = @OCL_KERNEL_TARGET@
-OCL_KERNEL_TARGET_CPU = @OCL_KERNEL_TARGET_CPU@
-OCL_TARGETS = @OCL_TARGETS@
-OPENCL_CFLAGS = @OPENCL_CFLAGS@
-OPENCL_CMAKE = @OPENCL_CMAKE@
-OPENCL_EXTLIBS = @OPENCL_EXTLIBS@
-OPENCL_LIBS = @OPENCL_LIBS@
-OPT = @OPT@
-OTOOL = @OTOOL@
-OTOOL64 = @OTOOL64@
-PACKAGE = @PACKAGE@
-PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
-PACKAGE_NAME = @PACKAGE_NAME@
-PACKAGE_STRING = @PACKAGE_STRING@
-PACKAGE_TARNAME = @PACKAGE_TARNAME@
-PACKAGE_URL = @PACKAGE_URL@
-PACKAGE_VERSION = @PACKAGE_VERSION@
-PATH_SEPARATOR = @PATH_SEPARATOR@
-PKG_CONFIG = @PKG_CONFIG@
-PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
-PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
-POAT_TESTSUITES = @POAT_TESTSUITES@
-POCL_DEVICE_ADDRESS_BITS = @POCL_DEVICE_ADDRESS_BITS@
-PTHREAD_CC = @PTHREAD_CC@
-PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
-PTHREAD_LIBS = @PTHREAD_LIBS@
-RANLIB = @RANLIB@
-SDL_CFLAGS = @SDL_CFLAGS@
-SDL_LIBS = @SDL_LIBS@
-SED = @SED@
-SET_MAKE = @SET_MAKE@
-SHELL = @SHELL@
-STRIP = @STRIP@
-TARGET = @TARGET@
-TARGET_CLANG_FLAGS = @TARGET_CLANG_FLAGS@
-TARGET_CPU = @TARGET_CPU@
-TARGET_LLC_FLAGS = @TARGET_LLC_FLAGS@
-TARGET_SIZEOF_DOUBLE = @TARGET_SIZEOF_DOUBLE@
-TARGET_SIZEOF_HALF = @TARGET_SIZEOF_HALF@
-TARGET_SIZEOF_LONG = @TARGET_SIZEOF_LONG@
-TARGET_SIZEOF_VOID_P = @TARGET_SIZEOF_VOID_P@
-TCECC = @TCECC@
-TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
-TCE_AVAILABLE = @TCE_AVAILABLE@
-TCE_CONFIG = @TCE_CONFIG@
-VERSION = @VERSION@
-abs_builddir = @abs_builddir@
-abs_srcdir = @abs_srcdir@
-abs_top_builddir = @abs_top_builddir@
-abs_top_srcdir = @abs_top_srcdir@
-ac_ct_AR = @ac_ct_AR@
-ac_ct_CC = @ac_ct_CC@
-ac_ct_CXX = @ac_ct_CXX@
-ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
-acx_pthread_config = @acx_pthread_config@
-am__include = @am__include@
-am__leading_dot = @am__leading_dot@
-am__quote = @am__quote@
-am__tar = @am__tar@
-am__untar = @am__untar@
-bindir = @bindir@
-build = @build@
-build_alias = @build_alias@
-build_cpu = @build_cpu@
-build_os = @build_os@
-build_vendor = @build_vendor@
-builddir = @builddir@
-datadir = @datadir@
-datarootdir = @datarootdir@
-docdir = @docdir@
-dvidir = @dvidir@
-exec_prefix = @exec_prefix@
-host = @host@
-host_alias = @host_alias@
-host_cpu = @host_cpu@
-host_os = @host_os@
-host_vendor = @host_vendor@
-htmldir = @htmldir@
-includedir = @includedir@
-infodir = @infodir@
-install_sh = @install_sh@
-libdir = @libdir@
-libexecdir = @libexecdir@
-localedir = @localedir@
-localstatedir = @localstatedir@
-mandir = @mandir@
-mkdir_p = @mkdir_p@
-oldincludedir = @oldincludedir@
-pdfdir = @pdfdir@
-prefix = @prefix@
-program_transform_name = @program_transform_name@
-psdir = @psdir@
-sbindir = @sbindir@
-sharedstatedir = @sharedstatedir@
-srcdir = @srcdir@
-sysconfdir = @sysconfdir@
-target = @target@
-target_alias = @target_alias@
-target_cpu = @target_cpu@
-target_os = @target_os@
-target_vendor = @target_vendor@
-top_build_prefix = @top_build_prefix@
-top_builddir = @top_builddir@
-top_srcdir = @top_srcdir@
-KERNEL_TARGET = tce
-
-# Use TARGET flags:
-CLANG_FLAGS = @TARGET_CLANG_FLAGS@ -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off
-LLC_FLAGS = @TARGET_LLC_FLAGS@
-LD_FLAGS = @TARGET_LD_FLAGS@
-KERNEL_BC = kernel-${KERNEL_TARGET}.bc
-nodist_pkgdata_DATA = ${KERNEL_BC}
-
-# The standard list of kernel sources can be modified with
-# LKERNEL_SRCS_EXCLUDE, which removes files from the standard list,
-# and LKERNEL_SRCS_EXTRA, which adds extra files to the source list.
-LKERNEL_SRCS = \
-	$(filter-out ${LKERNEL_SRCS_EXCLUDE}, ${LKERNEL_SRCS_DEFAULT})	\
-	${LKERNEL_SRCS_EXTRA}
-
-OBJ = $(LKERNEL_SRCS:%=%.bc)
-CLEANFILES = kernel-${KERNEL_TARGET}.bc ${OBJ}
-LKERNEL_HDRS = image.h pocl_image_rw_utils.h templates.h
-LKERNEL_SRCS_DEFAULT = abs.cl abs_diff.cl acos.cl acosh.cl acospi.cl \
-	add_sat.cl all.cl any.cl as_type.cl asin.cl asinh.cl asinpi.cl \
-	async_work_group_copy.cl atan.cl atan2.cl atan2pi.cl atanh.cl \
-	atanpi.cl atomics.cl barrier.ll bitselect.cl cbrt.cl ceil.cl \
-	clamp.cl clamp_int.cl clz.cl convert_type.cl copysign.cl \
-	cos.cl cosh.cl cospi.cl cross.cl degrees.cl distance.cl \
-	divide.cl dot.cl erf.cl erfc.cl exp.cl exp10.cl exp2.cl \
-	expm1.cl fabs.cl fast_distance.cl fast_length.cl \
-	fast_normalize.cl fdim.cl floor.cl fma.cl fmax.cl fmin.cl \
-	fmod.cl fract.cl get_global_id.c get_global_offset.c \
-	get_global_size.c get_group_id.c get_image_depth.cl \
-	get_image_height.cl get_image_width.cl get_image_dim.cl \
-	get_local_id.c get_local_size.c get_num_groups.c \
-	get_work_dim.c hadd.cl hypot.cl ilogb.cl isequal.cl \
-	isfinite.cl isgreater.cl isgreaterequal.cl isinf.cl isless.cl \
-	islessequal.cl islessgreater.cl isnan.cl isnormal.cl \
-	isnotequal.cl isordered.cl isunordered.cl ldexp.cl length.cl \
-	lgamma.cl log.cl log10.cl log1p.cl log2.cl logb.cl mad.cl \
-	mad24.cl mad_hi.cl mad_sat.cl max.cl max_i.cl maxmag.cl min.cl \
-	min_i.cl minmag.cl mix.cl mul24.cl mul_hi.cl nan.cl \
-	native_cos.cl native_exp.cl native_exp10.cl native_exp2.cl \
-	native_log.cl native_log10.cl native_log2.cl native_powr.cl \
-	native_recip.cl native_rsqrt.cl native_sin.cl native_sqrt.cl \
-	native_tan.cl nextafter.cl normalize.cl popcount.cl pow.cl \
-	pown.cl powr.cl radians.cl read_image.cl recip.cl remainder.cl \
-	rhadd.cl rint.cl rootn.cl rotate.cl round.cl rsqrt.cl \
-	select.cl shuffle.cl sign.cl signbit.cl sin.cl sincos.cl \
-	sinh.cl sinpi.cl smoothstep.cl sqrt.cl step.cl sub_sat.cl \
-	tan.cl tanh.cl tanpi.cl tgamma.cl trunc.cl upsample.cl \
-	vload.cl vload_half.cl vstore.cl vstore_half.cl \
-	wait_group_events.cl write_image.cl $(am__append_1)
-EXTRA_DIST = CMakeLists.txt
-all: all-am
-
-.SUFFIXES:
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(srcdir)/../rules.mk $(srcdir)/../sources.mk $(am__configure_deps)
-	@for dep in $?; do \
-	  case '$(am__configure_deps)' in \
-	    *$$dep*) \
-	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
-	        && { if test -f $@; then exit 0; else break; fi; }; \
-	      exit 1;; \
-	  esac; \
-	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign lib/kernel/cellspu/Makefile'; \
-	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign lib/kernel/cellspu/Makefile
-Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
-	@case '$?' in \
-	  *config.status*) \
-	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
-	  *) \
-	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
-	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
-	esac;
-$(srcdir)/../rules.mk $(srcdir)/../sources.mk $(am__empty):
-
-$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-
-$(top_srcdir)/configure:  $(am__configure_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(am__aclocal_m4_deps):
-
-mostlyclean-libtool:
-	-rm -f *.lo
-
-clean-libtool:
-	-rm -rf .libs _libs
-install-nodist_pkgdataDATA: $(nodist_pkgdata_DATA)
-	@$(NORMAL_INSTALL)
-	@list='$(nodist_pkgdata_DATA)'; test -n "$(pkgdatadir)" || list=; \
-	if test -n "$$list"; then \
-	  echo " $(MKDIR_P) '$(DESTDIR)$(pkgdatadir)'"; \
-	  $(MKDIR_P) "$(DESTDIR)$(pkgdatadir)" || exit 1; \
-	fi; \
-	for p in $$list; do \
-	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
-	  echo "$$d$$p"; \
-	done | $(am__base_list) | \
-	while read files; do \
-	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pkgdatadir)'"; \
-	  $(INSTALL_DATA) $$files "$(DESTDIR)$(pkgdatadir)" || exit $$?; \
-	done
-
-uninstall-nodist_pkgdataDATA:
-	@$(NORMAL_UNINSTALL)
-	@list='$(nodist_pkgdata_DATA)'; test -n "$(pkgdatadir)" || list=; \
-	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
-	dir='$(DESTDIR)$(pkgdatadir)'; $(am__uninstall_files_from_dir)
-tags TAGS:
-
-ctags CTAGS:
-
-cscope cscopelist:
-
-
-distdir: $(DISTFILES)
-	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	list='$(DISTFILES)'; \
-	  dist_files=`for file in $$list; do echo $$file; done | \
-	  sed -e "s|^$$srcdirstrip/||;t" \
-	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
-	case $$dist_files in \
-	  */*) $(MKDIR_P) `echo "$$dist_files" | \
-			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
-			   sort -u` ;; \
-	esac; \
-	for file in $$dist_files; do \
-	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
-	  if test -d $$d/$$file; then \
-	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
-	    if test -d "$(distdir)/$$file"; then \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
-	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
-	  else \
-	    test -f "$(distdir)/$$file" \
-	    || cp -p $$d/$$file "$(distdir)/$$file" \
-	    || exit 1; \
-	  fi; \
-	done
-check-am: all-am
-check: check-am
-all-am: Makefile $(DATA)
-installdirs:
-	for dir in "$(DESTDIR)$(pkgdatadir)"; do \
-	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
-	done
-install: install-am
-install-exec: install-exec-am
-install-data: install-data-am
-uninstall: uninstall-am
-
-install-am: all-am
-	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
-
-installcheck: installcheck-am
-install-strip:
-	if test -z '$(STRIP)'; then \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	      install; \
-	else \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
-	fi
-mostlyclean-generic:
-
-clean-generic:
-	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
-
-distclean-generic:
-	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
-
-maintainer-clean-generic:
-	@echo "This command is intended for maintainers to use"
-	@echo "it deletes files that may require special tools to rebuild."
-clean: clean-am
-
-clean-am: clean-generic clean-libtool mostlyclean-am
-
-distclean: distclean-am
-	-rm -f Makefile
-distclean-am: clean-am distclean-generic
-
-dvi: dvi-am
-
-dvi-am:
-
-html: html-am
-
-html-am:
-
-info: info-am
-
-info-am:
-
-install-data-am: install-nodist_pkgdataDATA
-
-install-dvi: install-dvi-am
-
-install-dvi-am:
-
-install-exec-am:
-
-install-html: install-html-am
-
-install-html-am:
-
-install-info: install-info-am
-
-install-info-am:
-
-install-man:
-
-install-pdf: install-pdf-am
-
-install-pdf-am:
-
-install-ps: install-ps-am
-
-install-ps-am:
-
-installcheck-am:
-
-maintainer-clean: maintainer-clean-am
-	-rm -f Makefile
-maintainer-clean-am: distclean-am maintainer-clean-generic
-
-mostlyclean: mostlyclean-am
-
-mostlyclean-am: mostlyclean-generic mostlyclean-libtool
-
-pdf: pdf-am
-
-pdf-am:
-
-ps: ps-am
-
-ps-am:
-
-uninstall-am: uninstall-nodist_pkgdataDATA
-
-.MAKE: install-am install-strip
-
-.PHONY: all all-am check check-am clean clean-generic clean-libtool \
-	cscopelist-am ctags-am distclean distclean-generic \
-	distclean-libtool distdir dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am install-dvi \
-	install-dvi-am install-exec install-exec-am install-html \
-	install-html-am install-info install-info-am install-man \
-	install-nodist_pkgdataDATA install-pdf install-pdf-am \
-	install-ps install-ps-am install-strip installcheck \
-	installcheck-am installdirs maintainer-clean \
-	maintainer-clean-generic mostlyclean mostlyclean-generic \
-	mostlyclean-libtool pdf pdf-am ps ps-am tags-am uninstall \
-	uninstall-am uninstall-nodist_pkgdataDATA
-
-.PRECIOUS: Makefile
-
-
-all: ${KERNEL_BC}
-
-vpath %.c  @top_srcdir@/lib/kernel
-vpath %.cc @top_srcdir@/lib/kernel
-vpath %.cl @top_srcdir@/lib/kernel
-vpath %.ll @top_srcdir@/lib/kernel
-
-# Generate a precompiled header for the built-in function
-# declarations, in case supported by the target.
-
-# Note: the precompiled header must be compiled with the same features
-# as the kernels will be. That is, use exactly the same frontend
-# feature switches. Otherwise it will fail when compiling the kernel
-# against the precompiled header.
-_kernel.h.pch: @top_builddir@/include/${TARGET_DIR}/types.h @top_srcdir@/include/_kernel.h
-	@CLANG@ @FORCED_CLFLAGS@ @CLFLAGS@ -Xclang -ffake-address-space-map -c -target ${KERNEL_TARGET} -x cl \
-	-include @top_builddir@/include/${TARGET_DIR}/types.h \
-	-Xclang -emit-pch @top_srcdir@/include/_kernel.h -o _kernel.h.pch 
-
-# Rules to compile the different kernel library source file types into
-# LLVM bitcode
-%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
-	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} ${CLFLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $< 
-%.cc.bc: %.cc ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
-	mkdir -p ${dir $@}
-	@CLANGXX@ ${CLANG_FLAGS} ${CLANGXX_FLAGS} -c -o $@ $< -include ${abs_top_srcdir}/include/pocl_features.h
-%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
-	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} -x cl ${CLFLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
-%.ll.bc: %.ll
-	mkdir -p ${dir $@}
-	@LLVM_AS@ -o $@ $<
-
-# Optimize the bitcode library to speed up optimization times for the
-# OpenCL kernels
-${KERNEL_BC}: ${OBJ}
-	@LLVM_LINK@ $^ -o - | @LLVM_OPT@ ${LLC_FLAGS} ${KERNEL_LIB_OPT_FLAGS} -O3 -fp-contract=off -o $@
-
-# vim: set noexpandtab ts=8:
-
-# Tell versions [3.59,3.63) of GNU make to not export all variables.
-# Otherwise a system limit (for SysV at least) may be exceeded.
-.NOEXPORT:
diff --git a/lib/kernel/host/CMakeLists.txt b/lib/kernel/host/CMakeLists.txt
index 3c99dc2..b08e789 100644
--- a/lib/kernel/host/CMakeLists.txt
+++ b/lib/kernel/host/CMakeLists.txt
@@ -31,28 +31,99 @@ else()
   set(KERNEL_SOURCES ${SOURCES_WITHOUT_VML})
 endif()
 
+if(X86_64)
+  list(APPEND KERNEL_SOURCES svm_atomics_x86_64.ll svm_atomics.cl)
+elseif(MIPS)
+  message(STATUS "OpenCL 2.0 atomics are currently broken on MIPS")
+else()
+  message(STATUS "Using generic OpenCL 2.0 atomics. Might or might not break your build.")
+  list(APPEND KERNEL_SOURCES svm_atomics_host.cl svm_atomics.cl)
+endif()
 
-# Use HOST flags:
-#~ CLANG_FLAGS = @HOST_CLANG_FLAGS@ -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off
 separate_arguments(HOST_CLANG_FLAGS)
-set(CLANG_FLAGS ${HOST_CLANG_FLAGS} "-Xclang" "-ffake-address-space-map" "-emit-llvm" "-ffp-contract=off")
-
-# LLC_FLAGS   = @HOST_LLC_FLAGS@
 separate_arguments(HOST_LLC_FLAGS)
-set(LLC_FLAGS ${HOST_LLC_FLAGS})
+set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${HOST_DEVICE_CL_VERSION} ${HOST_DEVICE_EXTENSION_DEFINES}")
+separate_arguments(DEVICE_CL_FLAGS)
+
+
+function(x86_distro_variant_to_flags VARIANT OUT_LLC_FLAGS OUT_CLANG_FLAGS)
+
+  if("${VARIANT}" STREQUAL "sse2")
+    set(CLANG_F "${CLANG_MARCH_FLAG}athlon64")
+    set(LLC_F "-mcpu=athlon64")
+
+  elseif("${VARIANT}" STREQUAL "ssse3")
+    set(CLANG_F "${CLANG_MARCH_FLAG}core2")
+    set(LLC_F "-mcpu=core2")
+
+  elseif("${VARIANT}" STREQUAL "sse41")
+    set(CLANG_F "${CLANG_MARCH_FLAG}penryn")
+    set(LLC_F "-mcpu=penryn")
+
+  elseif("${VARIANT}" STREQUAL "avx")
+    set(CLANG_F "${CLANG_MARCH_FLAG}sandybridge")
+    set(LLC_F "-mcpu=sandybridge")
+
+  elseif("${VARIANT}" STREQUAL "avx_fma4")
+    set(CLANG_F "${CLANG_MARCH_FLAG}bdver1")
+    set(LLC_F "-mcpu=bdver1")
+
+  elseif("${VARIANT}" STREQUAL "avx2")
+    set(CLANG_F "${CLANG_MARCH_FLAG}haswell")
+    set(LLC_F "-mcpu=haswell")
+
+  elseif("${VARIANT}" STREQUAL "avx512")
+    set(CLANG_F "${CLANG_MARCH_FLAG}skylake")
+    set(LLC_F "-mcpu=skylake")
+
+  else()
+    set(CLANG_F "${CLANG_MARCH_FLAG}${VARIANT}")
+    set(LLC_F "-mcpu=${VARIANT}")
 
-# TODO LLC_flags is used by kernel.bc target, but ld is unused
-#LD_FLAGS    = @HOST_LD_FLAGS@
+  endif()
+
+  set(${OUT_LLC_FLAGS} "${LLC_F}" PARENT_SCOPE)
+  set(${OUT_CLANG_FLAGS} "${CLANG_F}" PARENT_SCOPE)
+endfunction()
+
+###############################################################################
+
+foreach(CPU_VARIANT IN LISTS KERNELLIB_HOST_CPU_VARIANTS)
+
+if(CPU_VARIANT MATCHES "native")
+  set(VARIANT "${LLC_HOST_CPU}")
+else()
+  set(VARIANT "${CPU_VARIANT}")
+endif()
+
+if(X86_64 OR I386)
+  x86_distro_variant_to_flags("${VARIANT}" LLC_CPUFLAGS CLANG_CPUFLAGS)
+else()
+  set(CLANG_CPUFLAGS "${CLANG_MARCH_FLAG}${VARIANT}")
+  set(LLC_CPUFLAGS "-mcpu=${VARIANT}")
+endif()
+
+separate_arguments(CLANG_CPUFLAGS)
+separate_arguments(LLC_CPUFLAGS)
+set(CLANG_FLAGS ${HOST_CLANG_FLAGS} ${CLANG_CPUFLAGS} "-Xclang" "-ffake-address-space-map" "-emit-llvm" "-ffp-contract=off")
+set(LLC_FLAGS ${HOST_LLC_FLAGS} ${LLC_CPUFLAGS})
 
 # KERNEL_TARGET = @OCL_KERNEL_TARGET@
-make_kernel_bc(KERNEL_BC "${OCL_KERNEL_TARGET}" ${KERNEL_SOURCES})
+make_kernel_bc(KERNEL_BC "${OCL_KERNEL_TARGET}-${VARIANT}" "${VARIANT}" ${KERNEL_SOURCES})
 
 # just debug
-message(STATUS "Host Kernel BC: ${KERNEL_BC}")
+message(STATUS "Host Kernel BC for \"${VARIANT}\": ${KERNEL_BC}")
+
+list(APPEND KERNEL_BC_LIST "${KERNEL_BC}")
+set(KERNEL_BC_LIST "${KERNEL_BC_LIST}" PARENT_SCOPE)
 
 # a target is needed...
-add_custom_target("kernel_host" ALL
-    DEPENDS ${KERNEL_BC} "${CMAKE_BINARY_DIR}/kernellib_hash.h")
+add_custom_target("kernel_host_${VARIANT}" DEPENDS ${KERNEL_BC})
+
+list(APPEND KERNEL_TARGET_LIST "kernel_host_${VARIANT}")
+set(KERNEL_TARGET_LIST "${KERNEL_TARGET_LIST}" PARENT_SCOPE)
 
 install(FILES "${KERNEL_BC}"
         DESTINATION "${POCL_INSTALL_PRIVATE_DATADIR}")
+
+endforeach()
diff --git a/lib/kernel/host/Makefile.am b/lib/kernel/host/Makefile.am
index ea8d74d..04ec347 100644
--- a/lib/kernel/host/Makefile.am
+++ b/lib/kernel/host/Makefile.am
@@ -28,6 +28,7 @@ KERNEL_TARGET = @OCL_KERNEL_TARGET@
 CLANG_FLAGS = @HOST_CLANG_FLAGS@ -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off
 LLC_FLAGS   = @HOST_LLC_FLAGS@
 LD_FLAGS    = @HOST_LD_FLAGS@
+DEVICE_CL_FLAGS = -D__OPENCL_VERSION__=200 @HOST_DEVICE_EXTENSION_DEFINES@
 
 include ../rules.mk
 include ../sources.mk
@@ -35,4 +36,12 @@ if USE_VECMATHLIB
 include ../sources-vml.mk
 endif
 
+if HOST_CPU_IS_X86_64
+  LKERNEL_SRCS_EXTRA2 = svm_atomics_x86_64.ll svm_atomics.cl
+else
+  LKERNEL_SRCS_EXTRA2 = svm_atomics_host.cl svm_atomics.cl
+endif
+
+
+
 EXTRA_DIST = CMakeLists.txt
diff --git a/lib/kernel/host/Makefile.in b/lib/kernel/host/Makefile.in
index 6271293..c5a64d9 100644
--- a/lib/kernel/host/Makefile.in
+++ b/lib/kernel/host/Makefile.in
@@ -65,6 +65,7 @@
 # CLANG_FLAGS
 # LLC_FLAGS
 # LD_FLAGS
+# DEVICE_CL_FLAGS
 
 # sources.mk - a list of all kernel source files
 # 
@@ -164,7 +165,6 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
-@NEW_PRINTF_WORKS_TRUE@am__append_1 = printf.c
 subdir = lib/kernel/host
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
@@ -275,6 +275,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -282,6 +283,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -299,8 +301,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -376,6 +376,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -441,6 +442,7 @@ KERNEL_TARGET = @OCL_KERNEL_TARGET@
 CLANG_FLAGS = @HOST_CLANG_FLAGS@ -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off
 LLC_FLAGS = @HOST_LLC_FLAGS@
 LD_FLAGS = @HOST_LD_FLAGS@
+DEVICE_CL_FLAGS = -D__OPENCL_VERSION__=200 @HOST_DEVICE_EXTENSION_DEFINES@
 KERNEL_BC = kernel-${KERNEL_TARGET}.bc
 nodist_pkgdata_DATA = ${KERNEL_BC}
 
@@ -449,42 +451,170 @@ nodist_pkgdata_DATA = ${KERNEL_BC}
 # and LKERNEL_SRCS_EXTRA, which adds extra files to the source list.
 LKERNEL_SRCS = \
 	$(filter-out ${LKERNEL_SRCS_EXCLUDE}, ${LKERNEL_SRCS_DEFAULT})	\
-	${LKERNEL_SRCS_EXTRA}
+	${LKERNEL_SRCS_EXTRA} ${LKERNEL_SRCS_EXTRA2}
 
 OBJ = $(LKERNEL_SRCS:%=%.bc)
 CLEANFILES = kernel-${KERNEL_TARGET}.bc ${OBJ}
 LKERNEL_HDRS = image.h pocl_image_rw_utils.h templates.h
-LKERNEL_SRCS_DEFAULT = abs.cl abs_diff.cl acos.cl acosh.cl acospi.cl \
-	add_sat.cl all.cl any.cl as_type.cl asin.cl asinh.cl asinpi.cl \
-	async_work_group_copy.cl atan.cl atan2.cl atan2pi.cl atanh.cl \
-	atanpi.cl atomics.cl barrier.ll bitselect.cl cbrt.cl ceil.cl \
-	clamp.cl clamp_int.cl clz.cl convert_type.cl copysign.cl \
-	cos.cl cosh.cl cospi.cl cross.cl degrees.cl distance.cl \
-	divide.cl dot.cl erf.cl erfc.cl exp.cl exp10.cl exp2.cl \
-	expm1.cl fabs.cl fast_distance.cl fast_length.cl \
-	fast_normalize.cl fdim.cl floor.cl fma.cl fmax.cl fmin.cl \
-	fmod.cl fract.cl get_global_id.c get_global_offset.c \
-	get_global_size.c get_group_id.c get_image_depth.cl \
-	get_image_height.cl get_image_width.cl get_image_dim.cl \
-	get_local_id.c get_local_size.c get_num_groups.c \
-	get_work_dim.c hadd.cl hypot.cl ilogb.cl isequal.cl \
-	isfinite.cl isgreater.cl isgreaterequal.cl isinf.cl isless.cl \
-	islessequal.cl islessgreater.cl isnan.cl isnormal.cl \
-	isnotequal.cl isordered.cl isunordered.cl ldexp.cl length.cl \
-	lgamma.cl log.cl log10.cl log1p.cl log2.cl logb.cl mad.cl \
-	mad24.cl mad_hi.cl mad_sat.cl max.cl max_i.cl maxmag.cl min.cl \
-	min_i.cl minmag.cl mix.cl mul24.cl mul_hi.cl nan.cl \
-	native_cos.cl native_exp.cl native_exp10.cl native_exp2.cl \
-	native_log.cl native_log10.cl native_log2.cl native_powr.cl \
-	native_recip.cl native_rsqrt.cl native_sin.cl native_sqrt.cl \
-	native_tan.cl nextafter.cl normalize.cl popcount.cl pow.cl \
-	pown.cl powr.cl radians.cl read_image.cl recip.cl remainder.cl \
-	rhadd.cl rint.cl rootn.cl rotate.cl round.cl rsqrt.cl \
-	select.cl shuffle.cl sign.cl signbit.cl sin.cl sincos.cl \
-	sinh.cl sinpi.cl smoothstep.cl sqrt.cl step.cl sub_sat.cl \
-	tan.cl tanh.cl tanpi.cl tgamma.cl trunc.cl upsample.cl \
-	vload.cl vload_half.cl vstore.cl vstore_half.cl \
-	wait_group_events.cl write_image.cl $(am__append_1)
+LKERNEL_SRCS_DEFAULT = \
+	abs.cl					\
+	abs_diff.cl				\
+	acos.cl					\
+	acosh.cl				\
+	acospi.cl				\
+	add_sat.cl				\
+	all.cl					\
+	any.cl					\
+	as_type.cl				\
+	asin.cl					\
+	asinh.cl				\
+	asinpi.cl				\
+	async_work_group_copy.cl		\
+	atan.cl					\
+	atan2.cl				\
+	atan2pi.cl				\
+	atanh.cl				\
+	atanpi.cl				\
+	atomics.cl				\
+	barrier.ll				\
+	bitselect.cl				\
+	cbrt.cl					\
+	ceil.cl					\
+	clamp.cl				\
+	clamp_int.cl				\
+	clz.cl					\
+	convert_type.cl				\
+	copysign.cl				\
+	cos.cl					\
+	cosh.cl					\
+	cospi.cl				\
+	cross.cl				\
+	degrees.cl				\
+	distance.cl				\
+	divide.cl				\
+	dot.cl					\
+	erf.cl					\
+	erfc.cl					\
+	exp.cl					\
+	exp10.cl				\
+	exp2.cl					\
+	expm1.cl				\
+	fabs.cl					\
+	fast_distance.cl			\
+	fast_length.cl				\
+	fast_normalize.cl			\
+	fdim.cl					\
+	floor.cl				\
+	fma.cl					\
+	fmax.cl					\
+	fmin.cl					\
+	fmod.cl					\
+	fract.cl				\
+	get_global_id.c				\
+	get_global_offset.c			\
+	get_global_size.c			\
+	get_group_id.c				\
+	get_image_depth.cl			\
+	get_image_height.cl			\
+	get_image_width.cl			\
+	get_image_dim.cl			\
+	get_local_id.c				\
+	get_local_size.c			\
+	get_num_groups.c			\
+	get_work_dim.c				\
+	hadd.cl					\
+	hypot.cl				\
+	ilogb.cl				\
+	isequal.cl				\
+	isfinite.cl				\
+	isgreater.cl				\
+	isgreaterequal.cl			\
+	isinf.cl				\
+	isless.cl				\
+	islessequal.cl				\
+	islessgreater.cl			\
+	isnan.cl				\
+	isnormal.cl				\
+	isnotequal.cl				\
+	isordered.cl				\
+	isunordered.cl				\
+	ldexp.cl				\
+	length.cl				\
+	lgamma.cl				\
+	log.cl					\
+	log10.cl				\
+	log1p.cl				\
+	log2.cl					\
+	logb.cl					\
+	mad.cl					\
+	mad24.cl				\
+	mad_hi.cl				\
+	mad_sat.cl				\
+	max.cl					\
+	max_i.cl				\
+	maxmag.cl				\
+	min.cl					\
+	min_i.cl				\
+	minmag.cl				\
+	mix.cl					\
+	mul24.cl				\
+	mul_hi.cl				\
+	nan.cl					\
+	native_cos.cl				\
+	native_exp.cl				\
+	native_exp10.cl				\
+	native_exp2.cl				\
+	native_log.cl				\
+	native_log10.cl				\
+	native_log2.cl				\
+	native_powr.cl				\
+	native_recip.cl				\
+	native_rsqrt.cl				\
+	native_sin.cl				\
+	native_sqrt.cl				\
+	native_tan.cl				\
+	nextafter.cl				\
+	normalize.cl				\
+	popcount.cl				\
+	pow.cl					\
+	pown.cl					\
+	powr.cl					\
+	printf.c                                \
+	radians.cl				\
+	read_image.cl				\
+	recip.cl				\
+	remainder.cl				\
+	rhadd.cl				\
+	rint.cl					\
+	rootn.cl				\
+	rotate.cl				\
+	round.cl				\
+	rsqrt.cl				\
+	select.cl				\
+	shuffle.cl				\
+	sign.cl					\
+	signbit.cl				\
+	sin.cl					\
+	sincos.cl				\
+	sinh.cl					\
+	sinpi.cl				\
+	smoothstep.cl				\
+	sqrt.cl					\
+	step.cl					\
+	sub_sat.cl				\
+	tan.cl					\
+	tanh.cl					\
+	tanpi.cl				\
+	tgamma.cl				\
+	trunc.cl				\
+	upsample.cl				\
+	vload.cl				\
+	vload_half.cl				\
+	vstore.cl				\
+	vstore_half.cl				\
+	wait_group_events.cl			\
+	write_image.cl
+
 
 # sources-vml.mk - kernel source file overrides for vecmathlib
 # 
@@ -586,6 +716,7 @@ LKERNEL_SRCS_DEFAULT = abs.cl abs_diff.cl acos.cl acosh.cl acospi.cl \
 @USE_VECMATHLIB_TRUE@        native_recip.cl                         \
 @USE_VECMATHLIB_TRUE@        native_rsqrt.cl                         \
 @USE_VECMATHLIB_TRUE@        native_sin.cl                           \
+@USE_VECMATHLIB_TRUE@        native_sqrt.cl                          \
 @USE_VECMATHLIB_TRUE@        native_tan.cl                           \
 @USE_VECMATHLIB_TRUE@	normalize.cl				\
 @USE_VECMATHLIB_TRUE@	pow.cl					\
@@ -707,6 +838,7 @@ LKERNEL_SRCS_DEFAULT = abs.cl abs_diff.cl acos.cl acosh.cl acospi.cl \
 @USE_VECMATHLIB_TRUE@	native_recip.cl					\
 @USE_VECMATHLIB_TRUE@	native_rsqrt.cl					\
 @USE_VECMATHLIB_TRUE@	native_sin.cl					\
+@USE_VECMATHLIB_TRUE@	native_sqrt.cl  				\
 @USE_VECMATHLIB_TRUE@	native_tan.cl					\
 @USE_VECMATHLIB_TRUE@	normalize.cl					\
 @USE_VECMATHLIB_TRUE@	pow.cc						\
@@ -771,6 +903,8 @@ LKERNEL_SRCS_DEFAULT = abs.cl abs_diff.cl acos.cl acosh.cl acospi.cl \
 @USE_VECMATHLIB_TRUE@	mathfuncs_rcp.h		\
 @USE_VECMATHLIB_TRUE@	vec_sse_float4.h)
 
+@HOST_CPU_IS_X86_64_FALSE@LKERNEL_SRCS_EXTRA2 = svm_atomics_host.cl svm_atomics.cl
+@HOST_CPU_IS_X86_64_TRUE@LKERNEL_SRCS_EXTRA2 = svm_atomics_x86_64.ll svm_atomics.cl
 EXTRA_DIST = CMakeLists.txt
 all: all-am
 
@@ -993,10 +1127,10 @@ uninstall-am: uninstall-nodist_pkgdataDATA
 
 all: ${KERNEL_BC}
 
-vpath %.c  @top_srcdir@/lib/kernel
-vpath %.cc @top_srcdir@/lib/kernel
-vpath %.cl @top_srcdir@/lib/kernel
-vpath %.ll @top_srcdir@/lib/kernel
+vpath %.c @srcdir@ @top_srcdir@/lib/kernel
+vpath %.cc @srcdir@ @top_srcdir@/lib/kernel
+vpath %.cl @srcdir@ @top_srcdir@/lib/kernel
+vpath %.ll @srcdir@ @top_srcdir@/lib/kernel
 
 # Generate a precompiled header for the built-in function
 # declarations, in case supported by the target.
@@ -1012,15 +1146,15 @@ _kernel.h.pch: @top_builddir@/include/${TARGET_DIR}/types.h @top_srcdir@/include
 
 # Rules to compile the different kernel library source file types into
 # LLVM bitcode
-%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
+%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} ${CLFLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $< 
-%.cc.bc: %.cc ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
+	@CLANG@ ${CLANG_FLAGS} ${CLFLAGS} ${DEVICE_CL_FLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $<
+%.cc.bc: %.cc  ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANGXX@ ${CLANG_FLAGS} ${CLANGXX_FLAGS} -c -o $@ $< -include ${abs_top_srcdir}/include/pocl_features.h
-%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
+	@CLANGXX@ ${CLANG_FLAGS} ${CLANGXX_FLAGS} ${DEVICE_CL_FLAGS} -c -o $@ $<
+%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} -x cl ${CLFLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
+	@CLANG@ ${CLANG_FLAGS} -x cl ${CLFLAGS} ${DEVICE_CL_FLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
 %.ll.bc: %.ll
 	mkdir -p ${dir $@}
 	@LLVM_AS@ -o $@ $<
diff --git a/lib/kernel/hsail64/CMakeLists.txt b/lib/kernel/hsail64/CMakeLists.txt
index bf2f825..18d3ea6 100644
--- a/lib/kernel/hsail64/CMakeLists.txt
+++ b/lib/kernel/hsail64/CMakeLists.txt
@@ -32,6 +32,7 @@ foreach(FILE printf.c barrier.ll get_image_depth.cl get_image_dim.cl
   list(REMOVE_ITEM KERNEL_SOURCES "${FILE}")
 endforeach()
 
+list(APPEND KERNEL_SOURCES svm_atomics.cl)
 
 foreach(FILE atomics.cl atomic_impl.ll barrier.c
         get_global_id.c get_global_size.c get_group_id.c
@@ -41,12 +42,14 @@ foreach(FILE atomics.cl atomic_impl.ll barrier.c
         native_rsqrt.cl  native_sin.cl  native_sqrt.cl
         fabs.cl floor.cl rint.cl trunc.cl remainder.cl
         fma.cl mad.cl mad_hi.cl mul_hi.cl mul24.cl mad24.cl
-        sqrt.cl sqrt_default.ll cbrt.cl hypot.cl copysign.cl
+        sqrt.cl sqrt_default.ll cbrt.cl hypot.cl length.cl copysign.cl
         exp.cl exp2.cl exp10.cl expm1.cl
         log.cl log2.cl log10.cl log1p.cl
         sin.cl tan.cl cos.cl sinh.cl tanh.cl cosh.cl
         asin.cl acos.cl atan.cl asinh.cl acosh.cl atanh.cl
-        ilogb.cl ldexp.cl fract.cl frexp.cl atan2.cl pow.cl)
+        ilogb.cl ldexp.cl fract.cl frexp.cl atan2.cl pow.cl
+        lgamma.cl tgamma.cl erf.cl erfc.cl fast_normalize.cl fast_length.cl
+        svm_atomics_hsail.cl.ll)
 
   list(REMOVE_ITEM KERNEL_SOURCES "${FILE}")
   list(APPEND KERNEL_SOURCES "hsail64/${FILE}")
@@ -54,15 +57,26 @@ endforeach()
 
 set(CLANG_FLAGS "-Xclang" "-ffake-address-space-map" "-emit-llvm" "-target" "hsail64" "-D_CL_DISABLE_HALF")
 set(LLC_FLAGS "")
+set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${HSA_DEVICE_CL_VERSION}")  # start with the OpenCL version define
+separate_arguments(HSA_DEVICE_EXTENSIONS)  # split the space-separated extension string into a list
+foreach(EXT ${HSA_DEVICE_EXTENSIONS})
+  set(DEVICE_CL_FLAGS "${DEVICE_CL_FLAGS} -D${EXT}")  # append one -D<extension> define per entry
+endforeach()
+separate_arguments(DEVICE_CL_FLAGS)  # turn the accumulated flag string into a list of arguments
 
-make_kernel_bc(KERNEL_BC "hsail64" ${KERNEL_SOURCES})
+make_kernel_bc(KERNEL_BC "hsail64" "BCs" ${KERNEL_SOURCES})
 
 # just debug
 message(STATUS "HSAIL64 Kernel BC: ${KERNEL_BC}")
 
+list(APPEND KERNEL_BC_LIST "${KERNEL_BC}")
+set(KERNEL_BC_LIST "${KERNEL_BC_LIST}" PARENT_SCOPE)
+
 # a target is needed...
-add_custom_target("kernel_hsail64" ALL
-    DEPENDS ${KERNEL_BC} "${CMAKE_BINARY_DIR}/kernellib_hash.h")
+add_custom_target("kernel_hsail64" DEPENDS ${KERNEL_BC})
+
+list(APPEND KERNEL_TARGET_LIST "kernel_hsail64")
+set(KERNEL_TARGET_LIST "${KERNEL_TARGET_LIST}" PARENT_SCOPE)
 
 install(FILES "${KERNEL_BC}"
         DESTINATION "${POCL_INSTALL_PRIVATE_DATADIR}")
diff --git a/lib/kernel/hsail64/Makefile.am b/lib/kernel/hsail64/Makefile.am
index 6f37982..2df2179 100644
--- a/lib/kernel/hsail64/Makefile.am
+++ b/lib/kernel/hsail64/Makefile.am
@@ -26,12 +26,13 @@ KERNEL_TARGET = hsail64
 CLANG_FLAGS = @TARGET_CLANG_FLAGS@ -Xclang -ffake-address-space-map -emit-llvm -D_CL_DISABLE_HALF -target ${KERNEL_TARGET} -I${abs_top_srcdir}/lib/kernel/hsail64
 LLC_FLAGS   = @TARGET_LLC_FLAGS@
 LD_FLAGS    = @TARGET_LD_FLAGS@
+DEVICE_CL_FLAGS = -D__OPENCL_VERSION__=200 @HSA_DEVICE_EXTENSION_DEFINES@
 
 include ../rules.mk
 include ../sources.mk
 
 # TODO: a global memory ring buffer stdout based printf
 LKERNEL_SRCS_EXCLUDE = printf.c barrier.ll
-LKERNEL_SRCS_EXTRA = barrier.c sqrt_default.ll
+LKERNEL_SRCS_EXTRA = barrier.c sqrt_default.ll svm_atomics.cl svm_atomics_hsail.cl.ll
 
 EXTRA_DIST = CMakeLists.txt $(srcdir)/*.c $(srcdir)/*.cl $(srcdir)/*.h $(srcdir)/*.ll frexp.inc
diff --git a/lib/kernel/hsail64/Makefile.in b/lib/kernel/hsail64/Makefile.in
index 935ea63..4be9ea6 100644
--- a/lib/kernel/hsail64/Makefile.in
+++ b/lib/kernel/hsail64/Makefile.in
@@ -65,6 +65,7 @@
 # CLANG_FLAGS
 # LLC_FLAGS
 # LD_FLAGS
+# DEVICE_CL_FLAGS
 
 # sources.mk - a list of all kernel source files
 # 
@@ -164,7 +165,6 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
-@NEW_PRINTF_WORKS_TRUE@am__append_1 = printf.c
 subdir = lib/kernel/hsail64
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
@@ -275,6 +275,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -282,6 +283,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -299,8 +301,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -376,6 +376,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -441,6 +442,7 @@ KERNEL_TARGET = hsail64
 CLANG_FLAGS = @TARGET_CLANG_FLAGS@ -Xclang -ffake-address-space-map -emit-llvm -D_CL_DISABLE_HALF -target ${KERNEL_TARGET} -I${abs_top_srcdir}/lib/kernel/hsail64
 LLC_FLAGS = @TARGET_LLC_FLAGS@
 LD_FLAGS = @TARGET_LD_FLAGS@
+DEVICE_CL_FLAGS = -D__OPENCL_VERSION__=200 @HSA_DEVICE_EXTENSION_DEFINES@
 KERNEL_BC = kernel-${KERNEL_TARGET}.bc
 nodist_pkgdata_DATA = ${KERNEL_BC}
 
@@ -449,46 +451,174 @@ nodist_pkgdata_DATA = ${KERNEL_BC}
 # and LKERNEL_SRCS_EXTRA, which adds extra files to the source list.
 LKERNEL_SRCS = \
 	$(filter-out ${LKERNEL_SRCS_EXCLUDE}, ${LKERNEL_SRCS_DEFAULT})	\
-	${LKERNEL_SRCS_EXTRA}
+	${LKERNEL_SRCS_EXTRA} ${LKERNEL_SRCS_EXTRA2}
 
 OBJ = $(LKERNEL_SRCS:%=%.bc)
 CLEANFILES = kernel-${KERNEL_TARGET}.bc ${OBJ}
 LKERNEL_HDRS = image.h pocl_image_rw_utils.h templates.h
-LKERNEL_SRCS_DEFAULT = abs.cl abs_diff.cl acos.cl acosh.cl acospi.cl \
-	add_sat.cl all.cl any.cl as_type.cl asin.cl asinh.cl asinpi.cl \
-	async_work_group_copy.cl atan.cl atan2.cl atan2pi.cl atanh.cl \
-	atanpi.cl atomics.cl barrier.ll bitselect.cl cbrt.cl ceil.cl \
-	clamp.cl clamp_int.cl clz.cl convert_type.cl copysign.cl \
-	cos.cl cosh.cl cospi.cl cross.cl degrees.cl distance.cl \
-	divide.cl dot.cl erf.cl erfc.cl exp.cl exp10.cl exp2.cl \
-	expm1.cl fabs.cl fast_distance.cl fast_length.cl \
-	fast_normalize.cl fdim.cl floor.cl fma.cl fmax.cl fmin.cl \
-	fmod.cl fract.cl get_global_id.c get_global_offset.c \
-	get_global_size.c get_group_id.c get_image_depth.cl \
-	get_image_height.cl get_image_width.cl get_image_dim.cl \
-	get_local_id.c get_local_size.c get_num_groups.c \
-	get_work_dim.c hadd.cl hypot.cl ilogb.cl isequal.cl \
-	isfinite.cl isgreater.cl isgreaterequal.cl isinf.cl isless.cl \
-	islessequal.cl islessgreater.cl isnan.cl isnormal.cl \
-	isnotequal.cl isordered.cl isunordered.cl ldexp.cl length.cl \
-	lgamma.cl log.cl log10.cl log1p.cl log2.cl logb.cl mad.cl \
-	mad24.cl mad_hi.cl mad_sat.cl max.cl max_i.cl maxmag.cl min.cl \
-	min_i.cl minmag.cl mix.cl mul24.cl mul_hi.cl nan.cl \
-	native_cos.cl native_exp.cl native_exp10.cl native_exp2.cl \
-	native_log.cl native_log10.cl native_log2.cl native_powr.cl \
-	native_recip.cl native_rsqrt.cl native_sin.cl native_sqrt.cl \
-	native_tan.cl nextafter.cl normalize.cl popcount.cl pow.cl \
-	pown.cl powr.cl radians.cl read_image.cl recip.cl remainder.cl \
-	rhadd.cl rint.cl rootn.cl rotate.cl round.cl rsqrt.cl \
-	select.cl shuffle.cl sign.cl signbit.cl sin.cl sincos.cl \
-	sinh.cl sinpi.cl smoothstep.cl sqrt.cl step.cl sub_sat.cl \
-	tan.cl tanh.cl tanpi.cl tgamma.cl trunc.cl upsample.cl \
-	vload.cl vload_half.cl vstore.cl vstore_half.cl \
-	wait_group_events.cl write_image.cl $(am__append_1)
+LKERNEL_SRCS_DEFAULT = \
+	abs.cl					\
+	abs_diff.cl				\
+	acos.cl					\
+	acosh.cl				\
+	acospi.cl				\
+	add_sat.cl				\
+	all.cl					\
+	any.cl					\
+	as_type.cl				\
+	asin.cl					\
+	asinh.cl				\
+	asinpi.cl				\
+	async_work_group_copy.cl		\
+	atan.cl					\
+	atan2.cl				\
+	atan2pi.cl				\
+	atanh.cl				\
+	atanpi.cl				\
+	atomics.cl				\
+	barrier.ll				\
+	bitselect.cl				\
+	cbrt.cl					\
+	ceil.cl					\
+	clamp.cl				\
+	clamp_int.cl				\
+	clz.cl					\
+	convert_type.cl				\
+	copysign.cl				\
+	cos.cl					\
+	cosh.cl					\
+	cospi.cl				\
+	cross.cl				\
+	degrees.cl				\
+	distance.cl				\
+	divide.cl				\
+	dot.cl					\
+	erf.cl					\
+	erfc.cl					\
+	exp.cl					\
+	exp10.cl				\
+	exp2.cl					\
+	expm1.cl				\
+	fabs.cl					\
+	fast_distance.cl			\
+	fast_length.cl				\
+	fast_normalize.cl			\
+	fdim.cl					\
+	floor.cl				\
+	fma.cl					\
+	fmax.cl					\
+	fmin.cl					\
+	fmod.cl					\
+	fract.cl				\
+	get_global_id.c				\
+	get_global_offset.c			\
+	get_global_size.c			\
+	get_group_id.c				\
+	get_image_depth.cl			\
+	get_image_height.cl			\
+	get_image_width.cl			\
+	get_image_dim.cl			\
+	get_local_id.c				\
+	get_local_size.c			\
+	get_num_groups.c			\
+	get_work_dim.c				\
+	hadd.cl					\
+	hypot.cl				\
+	ilogb.cl				\
+	isequal.cl				\
+	isfinite.cl				\
+	isgreater.cl				\
+	isgreaterequal.cl			\
+	isinf.cl				\
+	isless.cl				\
+	islessequal.cl				\
+	islessgreater.cl			\
+	isnan.cl				\
+	isnormal.cl				\
+	isnotequal.cl				\
+	isordered.cl				\
+	isunordered.cl				\
+	ldexp.cl				\
+	length.cl				\
+	lgamma.cl				\
+	log.cl					\
+	log10.cl				\
+	log1p.cl				\
+	log2.cl					\
+	logb.cl					\
+	mad.cl					\
+	mad24.cl				\
+	mad_hi.cl				\
+	mad_sat.cl				\
+	max.cl					\
+	max_i.cl				\
+	maxmag.cl				\
+	min.cl					\
+	min_i.cl				\
+	minmag.cl				\
+	mix.cl					\
+	mul24.cl				\
+	mul_hi.cl				\
+	nan.cl					\
+	native_cos.cl				\
+	native_exp.cl				\
+	native_exp10.cl				\
+	native_exp2.cl				\
+	native_log.cl				\
+	native_log10.cl				\
+	native_log2.cl				\
+	native_powr.cl				\
+	native_recip.cl				\
+	native_rsqrt.cl				\
+	native_sin.cl				\
+	native_sqrt.cl				\
+	native_tan.cl				\
+	nextafter.cl				\
+	normalize.cl				\
+	popcount.cl				\
+	pow.cl					\
+	pown.cl					\
+	powr.cl					\
+	printf.c                                \
+	radians.cl				\
+	read_image.cl				\
+	recip.cl				\
+	remainder.cl				\
+	rhadd.cl				\
+	rint.cl					\
+	rootn.cl				\
+	rotate.cl				\
+	round.cl				\
+	rsqrt.cl				\
+	select.cl				\
+	shuffle.cl				\
+	sign.cl					\
+	signbit.cl				\
+	sin.cl					\
+	sincos.cl				\
+	sinh.cl					\
+	sinpi.cl				\
+	smoothstep.cl				\
+	sqrt.cl					\
+	step.cl					\
+	sub_sat.cl				\
+	tan.cl					\
+	tanh.cl					\
+	tanpi.cl				\
+	tgamma.cl				\
+	trunc.cl				\
+	upsample.cl				\
+	vload.cl				\
+	vload_half.cl				\
+	vstore.cl				\
+	vstore_half.cl				\
+	wait_group_events.cl			\
+	write_image.cl
+
 
 # TODO: a global memory ring buffer stdout based printf
 LKERNEL_SRCS_EXCLUDE = printf.c barrier.ll
-LKERNEL_SRCS_EXTRA = barrier.c sqrt_default.ll
+LKERNEL_SRCS_EXTRA = barrier.c sqrt_default.ll svm_atomics.cl svm_atomics_hsail.cl.ll
 EXTRA_DIST = CMakeLists.txt $(srcdir)/*.c $(srcdir)/*.cl $(srcdir)/*.h $(srcdir)/*.ll frexp.inc
 all: all-am
 
@@ -711,10 +841,10 @@ uninstall-am: uninstall-nodist_pkgdataDATA
 
 all: ${KERNEL_BC}
 
-vpath %.c  @top_srcdir@/lib/kernel
-vpath %.cc @top_srcdir@/lib/kernel
-vpath %.cl @top_srcdir@/lib/kernel
-vpath %.ll @top_srcdir@/lib/kernel
+vpath %.c @srcdir@ @top_srcdir@/lib/kernel
+vpath %.cc @srcdir@ @top_srcdir@/lib/kernel
+vpath %.cl @srcdir@ @top_srcdir@/lib/kernel
+vpath %.ll @srcdir@ @top_srcdir@/lib/kernel
 
 # Generate a precompiled header for the built-in function
 # declarations, in case supported by the target.
@@ -730,15 +860,15 @@ _kernel.h.pch: @top_builddir@/include/${TARGET_DIR}/types.h @top_srcdir@/include
 
 # Rules to compile the different kernel library source file types into
 # LLVM bitcode
-%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
+%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} ${CLFLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $< 
-%.cc.bc: %.cc ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
+	@CLANG@ ${CLANG_FLAGS} ${CLFLAGS} ${DEVICE_CL_FLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $<
+%.cc.bc: %.cc  ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANGXX@ ${CLANG_FLAGS} ${CLANGXX_FLAGS} -c -o $@ $< -include ${abs_top_srcdir}/include/pocl_features.h
-%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
+	@CLANGXX@ ${CLANG_FLAGS} ${CLANGXX_FLAGS} ${DEVICE_CL_FLAGS} -c -o $@ $<
+%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} -x cl ${CLFLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
+	@CLANG@ ${CLANG_FLAGS} -x cl ${CLFLAGS} ${DEVICE_CL_FLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
 %.ll.bc: %.ll
 	mkdir -p ${dir $@}
 	@LLVM_AS@ -o $@ $<
diff --git a/lib/kernel/hsail64/erf.cl b/lib/kernel/hsail64/erf.cl
new file mode 100644
index 0000000..c0cabe1
--- /dev/null
+++ b/lib/kernel/hsail64/erf.cl
@@ -0,0 +1,181 @@
+/* OpenCL built-in library: erf()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "hsail_templates.h"
+
+/*
+ * The following code is adapted from
+ * http://www.johndcook.com/blog/2009/01/19/stand-alone-error-function-erf/
+ * which is licensed Public Domain.
+ *
+ * severely limited precision, only useful for floats.
+ * TODO it's probably not accurate enough on the entire range.
+ */
+
+float _cl_builtin_erff(float g)
+{
+  /* Coefficients of Abramowitz & Stegun formula 7.1.26. */
+  const float p  =  0.3275911f;
+  const float a1 =  0.254829592f;
+  const float a2 = -0.284496736f;
+  const float a3 =  1.421413741f;
+  const float a4 = -1.453152027f;
+  const float a5 =  1.061405429f;
+
+  /* Saturate: for |g| >= 4 the result is returned as exactly +1 or -1. */
+  const float x = fabs(g);
+  if (x >= 4.0f)
+    return (g > 0.0f) ? 1.0f : -1.0f;
+
+  /* Horner evaluation of (a1 + a2 t + a3 t^2 + a4 t^3 + a5 t^4) * t. */
+  const float t = 1.0f / fma(p, x, 1.0f);
+  float poly = fma(a5, t, a4);
+  poly = fma(poly, t, a3);
+  poly = fma(poly, t, a2);
+  poly = fma(poly, t, a1);
+  poly *= t;
+  /* poly * exp(-x^2) - 1 approximates -erf(|g|); negate it for positive g. */
+  const float neg_erf = fma(poly, exp(-x*x), -1.0f);
+  return (g > 0.0f) ? -neg_erf : neg_erf;
+}
+
+float _cl_builtin_erfcf(float g);
+
+
+
+
+
+/* The following code is adapted
+ * from cpython/Modules/mathmodule.c
+ *
+ * Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012, 2013, 2014, 2015 Python Software Foundation; All Rights Reserved
+ * which has Python Software License (BSD compatible).
+ *
+ * Changes:
+ * - adapt to OpenCL builtins (isnan etc)
+ * - use fma() where possible.
+ */
+
+
+#define ERF_SERIES_CUTOFF 1.5
+#define ERF_SERIES_TERMS 25
+#define ERFC_CONTFRAC_CUTOFF 30.0
+#define ERFC_CONTFRAC_TERMS 50
+
+#define SQRTPI 1.772453850905516027298167483341145182798
+
+/*
+   Error function, via power series.
+   Given a finite double x, return an approximation to erf(x).
+   Converges reasonably fast for small x (used below ERF_SERIES_CUTOFF).
+*/
+
+double m_erf_series(double x)
+{
+    double x2, acc, fk, temp;
+    int i;
+
+    x2 = x * x;
+    acc = 0.0;
+    fk = (double)ERF_SERIES_TERMS + 0.5;
+    for (i = 0; i < ERF_SERIES_TERMS; i++) {
+        temp = acc / fk;
+        /* acc = 2.0 + x2 * temp, evaluated as one fused multiply-add */
+        acc = fma(x2, temp, 2.0);
+        fk -= 1.0;
+    }
+    return (acc * x * exp(-x2) / SQRTPI);
+}
+
+/*
+   Complementary error function, via continued fraction expansion.
+   Given a positive float x, return an approximation to erfc(x).  Converges
+   reasonably fast for x large (say, x > 2.0), and should be safe from
+   overflow if x and nterms are not too large.  On an IEEE 754 machine, with x
+   <= 30.0, we're safe up to nterms = 100.  For x >= 30.0, erfc(x) is smaller
+   than the smallest representable nonzero float.  */
+
+double m_erfc_contfrac(double x)
+{
+    double x2, a, da, p, p_last, q, q_last, b;
+    int i;
+
+    if (x >= ERFC_CONTFRAC_CUTOFF)
+        return 0.0;  /* at or beyond the cutoff the result is reported as 0 */
+
+    x2 = x*x;
+    a = 0.0;
+    da = 0.5;
+    p = 1.0; p_last = 0.0;
+    q = da + x2; q_last = 1.0;
+    for (i = 0; i < ERFC_CONTFRAC_TERMS; i++) {
+        double temp;
+        a += da;
+        da += 2.0;
+        b = da + x2;
+        /* numerator recurrence p = b*p - a*p_last, with b*p fused into the sum */
+        temp = p; p = fma(b, p, -a*p_last); p_last = temp;
+        /* denominator recurrence q = b*q - a*q_last, same form */
+        temp = q; q = fma(b, q, -a*q_last); q_last = temp;
+    }
+    return (p / q * x * exp(-x2) / SQRTPI);
+}
+
+/* Error function erf(x), for general x */
+
+double _cl_builtin_erf(double x)
+{
+    /* NaN is passed straight through. */
+    if (isnan(x))
+        return x;
+
+    const double magnitude = fabs(x);
+    /* Below the cutoff the power series is evaluated on the signed x. */
+    if (magnitude < ERF_SERIES_CUTOFF)
+        return m_erf_series(x);
+    /* Otherwise take 1 - erfc(|x|) and restore the sign of x. */
+    const double tail = m_erfc_contfrac(magnitude);
+    return (x > 0.0) ? 1.0 - tail : tail - 1.0;
+}
+
+/* Complementary error function erfc(x), for general x. */
+
+double _cl_builtin_erfc(double x)
+{
+    /* NaN is passed straight through. */
+    if (isnan(x))
+        return x;
+
+    const double magnitude = fabs(x);
+    /* Below the cutoff use the complement of the power series. */
+    if (magnitude < ERF_SERIES_CUTOFF)
+        return 1.0 - m_erf_series(x);
+    /* Otherwise erfc(|x|) from the continued fraction, mirrored for negative x. */
+    const double tail = m_erfc_contfrac(magnitude);
+    return (x > 0.0) ? tail : 2.0 - tail;
+}
+
+
+IMPLEMENT_EXPR_ALL(erf, V_V, _cl_builtin_erff(a), _cl_builtin_erf(a))
+
+IMPLEMENT_EXPR_ALL(erfc, V_V, _cl_builtin_erfcf(a), _cl_builtin_erfc(a))
diff --git a/lib/kernel/hsail64/log1p.cl b/lib/kernel/hsail64/erfc.cl
similarity index 85%
copy from lib/kernel/hsail64/log1p.cl
copy to lib/kernel/hsail64/erfc.cl
index 54e86ea..0c1ae7d 100644
--- a/lib/kernel/hsail64/log1p.cl
+++ b/lib/kernel/hsail64/erfc.cl
@@ -1,4 +1,4 @@
-/* OpenCL built-in library: log1p()
+/* OpenCL built-in library: erfc()
 
    Copyright (c) 2015 Michal Babej / Tampere University of Technology
 
@@ -23,4 +23,11 @@
 
 #include "../templates.h"
 
-DEFINE_EXPR_V_V(log1p, (log((vtype)(1.0) + a)))
+/* Single-precision erfc computed as the complement 1 - erf(x).
+ * TODO this is a simplistic version which should be fixed
+ * for values of erf(x) close to 1.0f, where the subtraction cancels. */
+float _cl_builtin_erfcf(float x)
+{
+  const float erf_x = erf(x);
+  return 1.0f - erf_x;
+}
diff --git a/lib/kernel/hsail64/expm1.cl b/lib/kernel/hsail64/expm1.cl
index 0bb8e3f..59ddce6 100644
--- a/lib/kernel/hsail64/expm1.cl
+++ b/lib/kernel/hsail64/expm1.cl
@@ -21,6 +21,30 @@
    THE SOFTWARE.
 */
 
-#include "../templates.h"
+#include "hsail_templates.h"
 
-DEFINE_EXPR_V_V(expm1, (exp(a) - (vtype)1.0))
+// public domain code from http://www.johndcook.com/blog/cpp_expm1/
+
+float _cl_builtin_expm1f(float x)
+{
+  /* Away from zero (and for NaN) the direct form exp(x) - 1 is used. */
+  if (!(fabs(x) < 1e-4f))
+    return exp(x) - 1.0f;
+
+  /* Near zero: two-term series x + x*x/2, with the sum fused. */
+  const float x_sq = x*x;
+  return fma(0.5f, x_sq, x);
+}
+
+/* expm1 for double: a short series near zero avoids cancellation in exp(x)-1. */
+double _cl_builtin_expm1(double x)
+{
+  /* |x| < 1e-5: x + x^2/2 + x^3/6. The dropped x^4/24 term is a relative
+     error of about x^3/24 < 5e-17, i.e. below double rounding. */
+  if (fabs(x) < 1e-5)
+    return fma(x * x, fma(x, 1.0 / 6.0, 0.5), x);
+  /* Elsewhere (and for NaN) the direct form loses at most ~1e-11 relative. */
+  return exp(x) - 1.0;
+}
+
+IMPLEMENT_EXPR_ALL(expm1, V_V, _cl_builtin_expm1f(a), _cl_builtin_expm1(a))
diff --git a/lib/kernel/hsail64/fast_length.cl b/lib/kernel/hsail64/fast_length.cl
new file mode 100644
index 0000000..9f09fbb
--- /dev/null
+++ b/lib/kernel/hsail64/fast_length.cl
@@ -0,0 +1,85 @@
+/* OpenCL built-in library: fast_length()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+/*
+ * TODO these should use "half_sqrt" but i'm not sure if 1) it exists in HSAIL,
+ * and 2) if it'd be any faster than native sqrt()
+ */
+
+float _CL_OVERLOADABLE fast_length(float x)
+{
+  return fabs(x);  /* length of a 1-component vector is |x|; never negative */
+}
+
+float _CL_OVERLOADABLE fast_length(float2 v)
+{
+  /* x*x first, then y*y folded in with one fused multiply-add */
+  const float sum_sq = fma(v.y, v.y, v.x * v.x);
+  return sqrt(sum_sq);
+}
+
+float _CL_OVERLOADABLE fast_length(float3 v)
+{
+  /* sum of squares: x*x, then y and z folded in by fma, in that order */
+  float sum_sq = fma(v.y, v.y, v.x * v.x);
+  sum_sq = fma(v.z, v.z, sum_sq);
+  return sqrt(sum_sq);
+}
+
+float _CL_OVERLOADABLE fast_length(float4 v)
+{
+  /* sum of squares: x*x, then y, z and w folded in by fma, in that order */
+  float sum_sq = fma(v.y, v.y, v.x * v.x);
+  sum_sq = fma(v.z, v.z, sum_sq);
+  sum_sq = fma(v.w, v.w, sum_sq);
+  return sqrt(sum_sq);
+}
+
+double _CL_OVERLOADABLE fast_length(double x)
+{
+  return fabs(x);  /* length of a 1-component vector is |x|; never negative */
+}
+
+double _CL_OVERLOADABLE fast_length(double2 v)
+{
+  /* x*x first, then y*y folded in with one fused multiply-add */
+  const double sum_sq = fma(v.y, v.y, v.x * v.x);
+  return sqrt(sum_sq);
+}
+
+double _CL_OVERLOADABLE fast_length(double3 v)
+{
+  /* sum of squares: x*x, then y and z folded in by fma, in that order */
+  double sum_sq = fma(v.y, v.y, v.x * v.x);
+  sum_sq = fma(v.z, v.z, sum_sq);
+  return sqrt(sum_sq);
+}
+
+double _CL_OVERLOADABLE fast_length(double4 v)
+{
+  /* sum of squares: x*x, then y, z and w folded in by fma, in that order */
+  double sum_sq = fma(v.y, v.y, v.x * v.x);
+  sum_sq = fma(v.z, v.z, sum_sq);
+  sum_sq = fma(v.w, v.w, sum_sq);
+  return sqrt(sum_sq);
+}
diff --git a/lib/kernel/hsail64/log1p.cl b/lib/kernel/hsail64/fast_normalize.cl
similarity index 63%
copy from lib/kernel/hsail64/log1p.cl
copy to lib/kernel/hsail64/fast_normalize.cl
index 54e86ea..f400134 100644
--- a/lib/kernel/hsail64/log1p.cl
+++ b/lib/kernel/hsail64/fast_normalize.cl
@@ -1,4 +1,4 @@
-/* OpenCL built-in library: log1p()
+/* OpenCL built-in library: fast_normalize()
 
    Copyright (c) 2015 Michal Babej / Tampere University of Technology
 
@@ -21,6 +21,14 @@
    THE SOFTWARE.
 */
 
-#include "../templates.h"
+#include "hsail_templates.h"
 
-DEFINE_EXPR_V_V(log1p, (log((vtype)(1.0) + a)))
+#define FAST_NORMALIZE ( (fast_length(a) == (stype)0.0) ? ((vtype)0.0) : (a / (vtype)fast_length(a)) )
+/* The expression above yields a / fast_length(a), or an all-zero vector when the length compares equal to 0. */
+IMPLEMENT_EXPR_V_V(fast_normalize, FAST_NORMALIZE, float2, float, int2, int)
+IMPLEMENT_EXPR_V_V(fast_normalize, FAST_NORMALIZE, float3, float, int3, int)
+IMPLEMENT_EXPR_V_V(fast_normalize, FAST_NORMALIZE, float4, float, int4, int)
+/* Same expression instantiated for the double vector types. */
+IMPLEMENT_EXPR_V_V(fast_normalize, FAST_NORMALIZE, double2, double, int2, int)
+IMPLEMENT_EXPR_V_V(fast_normalize, FAST_NORMALIZE, double3, double, int3, int)
+IMPLEMENT_EXPR_V_V(fast_normalize, FAST_NORMALIZE, double4, double, int4, int)
diff --git a/lib/kernel/hsail64/hypot.cl b/lib/kernel/hsail64/hypot.cl
index cc76e5d..b811415 100644
--- a/lib/kernel/hsail64/hypot.cl
+++ b/lib/kernel/hsail64/hypot.cl
@@ -22,6 +22,31 @@
    THE SOFTWARE.
 */
 
-#include "../templates.h"
+#include "hsail_templates.h"
 
-DEFINE_EXPR_V_VV(hypot, sqrt(a*a + b*b))
+/* sqrt(x*x + y*y) for float, evaluated as m * sqrt(1 + (n/m)^2) so the squares cannot overflow. */
+float _cl_builtin_hypotf(float x, float y)
+{
+    float a = fabs(x);
+    float b = fabs(y);
+    if (isinf(a) || isinf(b))      /* an infinite operand wins, even over NaN */
+        return INFINITY;
+    float m = (a < b) ? b : a;     /* plain compares keep a NaN operand; fmin/fmax would drop it */
+    float n = (a < b) ? a : b;
+    float d = (m == 0.0f) ? 0.0f : n / m;  /* avoid 0/0 when both inputs are zero */
+    return m * sqrt(fma(d, d, 1.0f));
+}
+/* sqrt(x*x + y*y) for double, evaluated as m * sqrt(1 + (n/m)^2) so the squares cannot overflow. */
+double _cl_builtin_hypot(double x, double y)
+{
+    double a = fabs(x);
+    double b = fabs(y);
+    if (isinf(a) || isinf(b))      /* an infinite operand wins, even over NaN */
+        return INFINITY;
+    double m = (a < b) ? b : a;    /* plain compares keep a NaN operand; fmin/fmax would drop it */
+    double n = (a < b) ? a : b;
+    double d = (m == 0.0) ? 0.0 : n / m;  /* avoid 0/0 when both inputs are zero */
+    return m * sqrt(fma(d, d, 1.0));
+}
+
+IMPLEMENT_EXPR_ALL(hypot, V_VV, _cl_builtin_hypotf(a, b), _cl_builtin_hypot(a, b))
diff --git a/lib/kernel/hsail64/expm1.cl b/lib/kernel/hsail64/length.cl
similarity index 63%
copy from lib/kernel/hsail64/expm1.cl
copy to lib/kernel/hsail64/length.cl
index 0bb8e3f..3cfd1ec 100644
--- a/lib/kernel/hsail64/expm1.cl
+++ b/lib/kernel/hsail64/length.cl
@@ -1,4 +1,4 @@
-/* OpenCL built-in library: expm1()
+/* OpenCL built-in library: length()
 
    Copyright (c) 2015 Michal Babej / Tampere University of Technology
 
@@ -21,6 +21,42 @@
    THE SOFTWARE.
 */
 
-#include "../templates.h"
+float _CL_OVERLOADABLE length(float x)
+{
+  return fabs(x);  /* length of a 1-component vector is |x|; never negative */
+}
 
-DEFINE_EXPR_V_V(expm1, (exp(a) - (vtype)1.0))
+float _CL_OVERLOADABLE length(float2 v)
+{
+  return hypot(v.x, v.y);  /* Euclidean norm of the two components */
+}
+
+float _CL_OVERLOADABLE length(float3 v)
+{
+  return hypot(hypot(v.x, v.y), v.z);  /* norm of (x, y), then combined with z */
+}
+
+float _CL_OVERLOADABLE length(float4 v)
+{
+  return hypot(hypot(hypot(v.x, v.y), v.z), v.w);  /* nested hypot: x,y then z then w */
+}
+
+double _CL_OVERLOADABLE length(double x)
+{
+  return fabs(x);  /* length of a 1-component vector is |x|; never negative */
+}
+
+double _CL_OVERLOADABLE length(double2 v)
+{
+  return hypot(v.x, v.y);  /* Euclidean norm of the two components */
+}
+
+double _CL_OVERLOADABLE length(double3 v)
+{
+  return hypot(hypot(v.x, v.y), v.z);  /* norm of (x, y), then combined with z */
+}
+
+double _CL_OVERLOADABLE length(double4 v)
+{
+  return hypot(hypot(hypot(v.x, v.y), v.z), v.w);  /* nested hypot: x,y then z then w */
+}
diff --git a/lib/kernel/hsail64/lgamma.cl b/lib/kernel/hsail64/lgamma.cl
new file mode 100644
index 0000000..f51c5c0
--- /dev/null
+++ b/lib/kernel/hsail64/lgamma.cl
@@ -0,0 +1,113 @@
+/* OpenCL built-in library: lgamma()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "hsail_templates.h"
+
+#define LN_SQRT2PI (0.91893853320467274178)
+#define LN_SQRT2PI_F (0.91893853320467274178f)
+/* log(Gamma(x)) for double via a 15-term Lanczos series (g = 607/128); x > 0 only, see below. */
+double _cl_builtin_lgamma(double x) {
+    if (x <= 0.0)  /* poles at 0, -1, -2, ... give +inf; other negative x is not supported (NAN) */
+      return (x == floor(x)) ? INFINITY : NAN;
+    /* The series is written in terms of x itself: forming x - 1 first would round tiny x to -1. */
+
+    double z = x;
+    double a = 0.99999999999999709182;
+    /* z starts at x, i.e. (x - 1) + 1 of the textbook formula, without the rounding step */
+    a += 57.156235665862923517 / z;
+    z +=1.0;
+    a += -59.597960355475491248 / z;
+    z +=1.0;
+    a += 14.136097974741747174 / z;
+    z +=1.0;
+    a += -0.49191381609762019978 / z;
+    z +=1.0;
+    a += 0.33994649984811888699e-4 / z;
+    z +=1.0;
+    a += 0.46523628927048575665e-4 / z;
+    z +=1.0;
+    a += -0.98374475304879564677e-4 / z;
+    z +=1.0;
+    a += 0.15808870322491248884e-3 / z;
+    z +=1.0;
+    a += -0.21026444172410488319e-3 / z;
+    z +=1.0;
+    a += 0.21743961811521264320e-3 / z;
+    z +=1.0;
+    a += -0.16431810653676389022e-3 / z;
+    z +=1.0;
+    a += 0.84418223983852743293e-4 / z;
+    z +=1.0;
+    a += -0.26190838401581408670e-4 / z;
+    z +=1.0;
+    a += 0.36899182659531622704e-5 / z;
+
+    double tmp = x + (607/128.0 - 0.5);  /* (x - 1) + g + 0.5 */
+    return (LN_SQRT2PI + log(a) + ((x - 0.5) * log(tmp)) - tmp);
+}
+
+
+/* log(Gamma(x)) for float via a 15-term Lanczos series (g = 607/128); x > 0 only, see below. */
+float _cl_builtin_lgammaf(float x) {
+    if (x <= 0.0f)  /* poles at 0, -1, -2, ... give +inf; other negative x is not supported (NAN) */
+      return (x == floor(x)) ? INFINITY : NAN;
+    /* The series is written in terms of x itself: forming x - 1 first would round tiny x to -1. */
+
+    float a = 0.99999999999999709182f;
+    float z = x;
+    /* z starts at x, i.e. (x - 1) + 1 of the textbook formula, without the rounding step */
+    a += 57.156235665862923517f / z;
+    z +=1.0f;
+    a += -59.597960355475491248f / z;
+    z +=1.0f;
+    a += 14.136097974741747174f / z;
+    z +=1.0f;
+    a += -0.49191381609762019978f / z;
+    z +=1.0f;
+    a += 0.33994649984811888699e-4f / z;
+    z +=1.0f;
+    a += 0.46523628927048575665e-4f / z;
+    z +=1.0f;
+    a += -0.98374475304879564677e-4f / z;
+    z +=1.0f;
+    a += 0.15808870322491248884e-3f / z;
+    z +=1.0f;
+    a += -0.21026444172410488319e-3f / z;
+    z +=1.0f;
+    a += 0.21743961811521264320e-3f / z;
+    z +=1.0f;
+    a += -0.16431810653676389022e-3f / z;
+    z +=1.0f;
+    a += 0.84418223983852743293e-4f / z;
+    z +=1.0f;
+    a += -0.26190838401581408670e-4f / z;
+    z +=1.0f;
+    a += 0.36899182659531622704e-5f / z;
+
+    float tmp = x + (607/128.0f - 0.5f);  /* (x - 1) + g + 0.5 */
+    return (LN_SQRT2PI_F + log(a) + ((x - 0.5f)*log(tmp)) - tmp);
+}
+
+
+
+IMPLEMENT_EXPR_ALL(lgamma, V_V, _cl_builtin_lgammaf(a), _cl_builtin_lgamma(a))
diff --git a/lib/kernel/hsail64/log1p.cl b/lib/kernel/hsail64/log1p.cl
index 54e86ea..2523704 100644
--- a/lib/kernel/hsail64/log1p.cl
+++ b/lib/kernel/hsail64/log1p.cl
@@ -21,6 +21,37 @@
    THE SOFTWARE.
 */
 
-#include "../templates.h"
+#include "hsail_templates.h"
 
-DEFINE_EXPR_V_V(log1p, (log((vtype)(1.0) + a)))
+// DEFINE_EXPR_V_V(log1p, (log((vtype)(1.0) + a)))
+
+// public domain from http://www.johndcook.com/blog/cpp_log_one_plus_x/
+/* Returns log(1 + x) for float; NAN for x < -1, series x - x^2/2 for small |x|. */
+float _cl_builtin_log1pf(float x) {
+  if (x < -1.0f)
+    return NAN;
+  /* Cut-over chosen so the x^2/3 series error stays below the ~eps/|x| lost when forming 1 + x. */
+  if (fabs(x) > 5e-3f)
+    return log(1.0f + x);
+  else
+    {
+      float xx = x*x;
+      return fma(-0.5f, xx, x);  /* x - x^2/2 with a single rounding */
+    }
+}
+/* Returns log(1 + x) for double; NAN for x < -1, series x - x^2/2 for small |x|. */
+double _cl_builtin_log1p(double x) {
+  if (x < -1.0)
+    return NAN;
+  /* Cut-over chosen so the x^2/3 series error stays below the ~eps/|x| lost when forming 1 + x. */
+  if (fabs(x) > 5e-6)
+    return log(1.0 + x);
+  else
+    {
+      double xx = x*x;
+      return fma(-0.5, xx, x);  /* x - x^2/2 with a single rounding */
+    }
+}
+
+
+IMPLEMENT_EXPR_ALL(log1p, V_V, _cl_builtin_log1pf(a), _cl_builtin_log1p(a))
diff --git a/lib/kernel/hsail64/svm_atomics_hsail.cl.ll b/lib/kernel/hsail64/svm_atomics_hsail.cl.ll
new file mode 100644
index 0000000..646d64b
--- /dev/null
+++ b/lib/kernel/hsail64/svm_atomics_hsail.cl.ll
@@ -0,0 +1,8078 @@
+; ModuleID = 'svm_atomics_hsail.cl.bc'
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:64:64-p8:32:32-p9:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "hsail64"
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z37pocl_atomic_flag_test_and_set__globalPVU3AS1U7_Atomici12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  %9 = icmp ne i32 %.0, 0
+  ret i1 %9
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z30pocl_atomic_flag_clear__globalPVU3AS1U7_Atomici12memory_order12memory_scope(i32 addrspace(1)* nocapture %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 0, i32 addrspace(1)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 0, i32 addrspace(1)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 0, i32 addrspace(1)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z25pocl_atomic_store__globalPVU3AS1U7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* nocapture %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z24pocl_atomic_load__globalPVU3AS1U7_Atomici12memory_order12memory_scope(i32 addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i32, i32 addrspace(1)* %object monotonic, align 4
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i32, i32 addrspace(1)* %object acquire, align 4
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i32, i32 addrspace(1)* %object seq_cst, align 4
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_exchange__globalPVU3AS1U7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU3AS1U7_AtomiciPii12memory_orderS3_12memory_scope(i32 addrspace(1)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg volatile i32 addrspace(1)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg volatile i32 addrspace(1)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg volatile i32 addrspace(1)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg volatile i32 addrspace(1)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg volatile i32 addrspace(1)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU3AS1U7_AtomiciPii12memory_orderS3_12memory_scope(i32 addrspace(1)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_add__globalPVU3AS1U7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic add of %operand into the i32 at %object (addrspace 1), int overload ("ii" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the update (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_sub__globalPVU3AS1U7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic subtract of %operand from the i32 at %object (addrspace 1), int overload ("ii" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the update (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_or__globalPVU3AS1U7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic bitwise OR of %operand into the i32 at %object (addrspace 1), int overload ("ii" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the update (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_xor__globalPVU3AS1U7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic bitwise XOR of %operand into the i32 at %object (addrspace 1), int overload ("ii" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the update (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_and__globalPVU3AS1U7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic bitwise AND of %operand into the i32 at %object (addrspace 1), int overload ("ii" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the update (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_min__globalPVU3AS1U7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic minimum of %operand and the i32 at %object (addrspace 1), int overload ("ii" in the mangled name); `atomicrmw min` compares as signed (per LLVM LangRef). Returns what atomicrmw yields, i.e. the value held before the update. %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_max__globalPVU3AS1U7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic maximum of %operand and the i32 at %object (addrspace 1), int overload ("ii" in the mangled name); `atomicrmw max` compares as signed (per LLVM LangRef). Returns what atomicrmw yields, i.e. the value held before the update. %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z25pocl_atomic_store__globalPVU3AS1U7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* nocapture %object, i32 %desired, i32 %order, i32 %scope) #0 {  ; Atomic volatile store of %desired to the i32 at %object (addrspace 1), unsigned-int overload ("jj" in the mangled name). Returns nothing. %scope is never referenced in the body.
+  switch i32 %order, label %1 [  ; choose the ordering from the runtime %order; any value not listed falls to %1 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread  ; 1 -> monotonic as well (LLVM LangRef does not allow acquire on a store)
+    i32 2, label %.thread1  ; 2 -> release
+    i32 3, label %.thread  ; 3 -> monotonic as well (LLVM LangRef does not allow acq_rel on a store)
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z24pocl_atomic_load__globalPVU3AS1U7_Atomicj12memory_order12memory_scope(i32 addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {  ; Atomic volatile load of the i32 at %object (addrspace 1), unsigned-int overload ("j" in the mangled name). Returns the loaded value. %scope is never referenced in the body.
+  switch i32 %order, label %3 [  ; choose the ordering from the runtime %order; any value not listed falls to %3 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread  ; 2 -> monotonic as well (LLVM LangRef does not allow release on a load)
+    i32 3, label %.thread  ; 3 -> monotonic as well (LLVM LangRef does not allow acq_rel on a load)
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i32, i32 addrspace(1)* %object monotonic, align 4
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i32, i32 addrspace(1)* %object acquire, align 4
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i32, i32 addrspace(1)* %object seq_cst, align 4
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]  ; value read by whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_exchange__globalPVU3AS1U7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %desired, i32 %order, i32 %scope) #0 {  ; Atomic exchange: writes %desired to the i32 at %object (addrspace 1), unsigned-int overload ("jj" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the swap (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU3AS1U7_AtomicjPjj12memory_orderS3_12memory_scope(i32 addrspace(1)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {  ; Strong compare-exchange on the i32 at %object (addrspace 1): compares it with *%expected and installs %desired on a match; on a mismatch the observed value is written back to *%expected. Returns the cmpxchg success flag. %success/%failure select the two orderings at run time. %scope is never referenced in the body.
+  %1 = icmp eq i32 %success, 0  ; entry..%6 recode %success into %10: 0->0, 1->2, 2->3, 3->4, anything else->5 (NOTE(review): looks like the __ATOMIC_* numbering; confirm against the C source)
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]  ; recoded success ordering; possible values are 0, 2, 3, 4, 5
+  %11 = icmp eq i32 %failure, 0  ; %9..%16 apply the same recoding to %failure, producing %20
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]  ; recoded failure ordering; possible values are 0, 2, 3, 4, 5
+  switch i32 %10, label %28 [  ; dispatch on the success ordering; the default (%10 == 0 given the phi above) is monotonic/monotonic at %28
+    i32 1, label %21  ; acquire success; the phi for %10 never produces 1
+    i32 2, label %21  ; acquire success
+    i32 3, label %50  ; release success
+    i32 4, label %23  ; acq_rel success
+    i32 5, label %25  ; seq_cst success
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2  ; true when %20 is 1 or 2 -> acquire failure ordering (%39); otherwise monotonic (%36)
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2  ; true when %20 is 1 or 2 -> acquire failure ordering (%61); otherwise monotonic (%58)
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [  ; seq_cst success: pick the failure ordering; default is monotonic (%72)
+    i32 1, label %76  ; acquire failure
+    i32 2, label %76  ; acquire failure
+    i32 5, label %80  ; seq_cst failure
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]  ; common exit: success flag (widened to i8) from whichever variant ran
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg volatile i32 addrspace(1)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1  ; field 1 of the cmpxchg result is the success flag
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0  ; failure path: field 0 is the value cmpxchg loaded; it is written back to *%expected (the same pattern repeats for every variant below)
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg volatile i32 addrspace(1)* %object, i32 %51, i32 %desired release monotonic  ; release success always pairs with monotonic failure; %20 is not consulted on this path
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg volatile i32 addrspace(1)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg volatile i32 addrspace(1)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg volatile i32 addrspace(1)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU3AS1U7_AtomicjPjj12memory_orderS3_12memory_scope(i32 addrspace(1)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {  ; Weak compare-exchange on the i32 at %object (addrspace 1): every variant uses `cmpxchg weak`, which per LLVM LangRef may fail spuriously. On a match %desired is installed; on failure the observed value is written back to *%expected. Returns the cmpxchg success flag. %scope is never referenced in the body.
+  %1 = icmp eq i32 %success, 0  ; entry..%6 recode %success into %10: 0->0, 1->2, 2->3, 3->4, anything else->5 (NOTE(review): looks like the __ATOMIC_* numbering; confirm against the C source)
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]  ; recoded success ordering; possible values are 0, 2, 3, 4, 5
+  %11 = icmp eq i32 %failure, 0  ; %9..%16 apply the same recoding to %failure, producing %20
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]  ; recoded failure ordering; possible values are 0, 2, 3, 4, 5
+  switch i32 %10, label %28 [  ; dispatch on the success ordering; the default (%10 == 0 given the phi above) is monotonic/monotonic at %28
+    i32 1, label %21  ; acquire success; the phi for %10 never produces 1
+    i32 2, label %21  ; acquire success
+    i32 3, label %50  ; release success
+    i32 4, label %23  ; acq_rel success
+    i32 5, label %25  ; seq_cst success
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2  ; true when %20 is 1 or 2 -> acquire failure ordering (%39); otherwise monotonic (%36)
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2  ; true when %20 is 1 or 2 -> acquire failure ordering (%61); otherwise monotonic (%58)
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [  ; seq_cst success: pick the failure ordering; default is monotonic (%72)
+    i32 1, label %76  ; acquire failure
+    i32 2, label %76  ; acquire failure
+    i32 5, label %80  ; seq_cst failure
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]  ; common exit: success flag (widened to i8) from whichever variant ran
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1  ; field 1 of the cmpxchg result is the success flag
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0  ; failure path: field 0 is the value cmpxchg loaded; it is written back to *%expected (the same pattern repeats for every variant below)
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %51, i32 %desired release monotonic  ; release success always pairs with monotonic failure; %20 is not consulted on this path
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_add__globalPVU3AS1U7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic add of %operand into the i32 at %object (addrspace 1), unsigned-int overload ("jj" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the update (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_sub__globalPVU3AS1U7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic subtract of %operand from the i32 at %object (addrspace 1), unsigned-int overload ("jj" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the update (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_or__globalPVU3AS1U7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic bitwise OR of %operand into the i32 at %object (addrspace 1), unsigned-int overload ("jj" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the update (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_xor__globalPVU3AS1U7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {  ; Atomic bitwise XOR of %operand into the i32 at %object (addrspace 1), unsigned-int overload ("jj" in the mangled name). Returns what atomicrmw yields, i.e. the value held before the update (per LLVM LangRef). %scope is never referenced in the body.
+  switch i32 %order, label %6 [  ; choose the ordering from the runtime %order; any value not listed falls to %6 (seq_cst)
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever ordering variant executed
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_fetch_and__global, 32-bit integer variant.
+; Atomically ANDs %operand into the i32 at %object (address space 1) and
+; returns the value atomicrmw yields, i.e. the contents before the update.
+;   %order  picks the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+;           2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+;   %scope  is accepted but never read in this body.
+define i32 @_Z29pocl_atomic_fetch_and__globalPVU3AS1U7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; One basic block per ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering path executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_fetch_min__global, 32-bit integer variant.
+; Atomically replaces the i32 at %object (address space 1) with the smaller
+; of its current value and %operand, using an unsigned comparison (umin),
+; and returns the value atomicrmw yields, i.e. the contents before the update.
+;   %order  picks the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+;           2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+;   %scope  is accepted but never read in this body.
+define i32 @_Z29pocl_atomic_fetch_min__globalPVU3AS1U7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; One basic block per ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering path executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_fetch_max__global, 32-bit integer variant.
+; Atomically replaces the i32 at %object (address space 1) with the larger
+; of its current value and %operand, using an unsigned comparison (umax),
+; and returns the value atomicrmw yields, i.e. the contents before the update.
+;   %order  picks the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+;           2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+;   %scope  is accepted but never read in this body.
+define i32 @_Z29pocl_atomic_fetch_max__globalPVU3AS1U7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; One basic block per ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering path executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_store__global, float variant.
+; Atomically stores %desired to the float at %object (address space 1). The
+; float is bitcast to i32 and the pointer to i32 addrspace(1)*, so the store
+; itself is an integer atomic store with 4-byte alignment.
+;   %order  picks the LLVM ordering: 2 -> release; 0, 1 and 3 -> monotonic;
+;           any other value -> seq_cst.
+;   %scope  is accepted but never read in this body.
+define void @_Z25pocl_atomic_store__globalPVU3AS1U7_Atomicff12memory_order12memory_scope(float addrspace(1)* nocapture %object, float %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %5 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+  ]
+
+.thread1:                                         ; preds = %0
+  ; order == 2: release store.
+  %1 = bitcast float %desired to i32
+  %2 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  store atomic volatile i32 %1, i32 addrspace(1)* %2 release, align 4
+  br label %13
+
+.thread:                                          ; preds = %0, %0
+  ; order == 0 or 1: prepare operands, then share the monotonic store in %9.
+  %3 = bitcast float %desired to i32
+  %4 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br label %9
+
+; <label>:5                                       ; preds = %0
+  ; Remaining orders: 3 joins the monotonic store, everything else is seq_cst.
+  %6 = icmp eq i32 %order, 3
+  %7 = bitcast float %desired to i32
+  %8 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br i1 %6, label %9, label %12
+
+; <label>:9                                       ; preds = %5, %.thread
+  ; Shared monotonic store; the phis pick the operands from the incoming path.
+  %10 = phi i32 addrspace(1)* [ %4, %.thread ], [ %8, %5 ]
+  %11 = phi i32 [ %3, %.thread ], [ %7, %5 ]
+  store atomic volatile i32 %11, i32 addrspace(1)* %10 monotonic, align 4
+  br label %13
+
+; <label>:12                                      ; preds = %5
+  store atomic volatile i32 %7, i32 addrspace(1)* %8 seq_cst, align 4
+  br label %13
+
+; <label>:13                                      ; preds = %12, %.thread1, %9
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_load__global, float variant.
+; Atomically loads the float at %object (address space 1). The pointer is
+; bitcast to i32 addrspace(1)*, the load is an integer atomic load with
+; 4-byte alignment, and the loaded bits are bitcast back to float.
+;   %order  picks the LLVM ordering: 1 -> acquire; 0, 2 and 3 -> monotonic;
+;           any other value -> seq_cst.
+;   %scope  is accepted but never read in this body.
+define float @_Z24pocl_atomic_load__globalPVU3AS1U7_Atomicf12memory_order12memory_scope(float addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %4 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+  ]
+
+.thread1:                                         ; preds = %0
+  ; order == 1: acquire load.
+  %1 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %2 = load atomic volatile i32, i32 addrspace(1)* %1 acquire, align 4
+  br label %12
+
+.thread:                                          ; preds = %0, %0
+  ; order == 0 or 2: share the monotonic load in %7.
+  %3 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br label %7
+
+; <label>:4                                       ; preds = %0
+  ; Remaining orders: 3 joins the monotonic load, everything else is seq_cst.
+  %5 = icmp eq i32 %order, 3
+  %6 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br i1 %5, label %7, label %10
+
+; <label>:7                                       ; preds = %4, %.thread
+  %8 = phi i32 addrspace(1)* [ %3, %.thread ], [ %6, %4 ]
+  %9 = load atomic volatile i32, i32 addrspace(1)* %8 monotonic, align 4
+  br label %12
+
+; <label>:10                                      ; preds = %4
+  %11 = load atomic volatile i32, i32 addrspace(1)* %6 seq_cst, align 4
+  br label %12
+
+; <label>:12                                      ; preds = %10, %.thread1, %7
+  ; Merge the loaded bits and reinterpret them as float for the return value.
+  %.sroa.0.0 = phi i32 [ %9, %7 ], [ %11, %10 ], [ %2, %.thread1 ]
+  %13 = bitcast i32 %.sroa.0.0 to float
+  ret float %13
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_exchange__global, float variant.
+; Atomically swaps %desired into the float at %object (address space 1) and
+; returns the value atomicrmw yields, i.e. the contents before the swap. The
+; float operands are bitcast to i32 so the swap is an integer
+; atomicrmw xchg; the result is bitcast back to float.
+;   %order  picks the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+;           2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+;   %scope  is accepted but never read in this body.
+define float @_Z28pocl_atomic_exchange__globalPVU3AS1U7_Atomicff12memory_order12memory_scope(float addrspace(1)* %object, float %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %10 [
+    i32 0, label %.thread
+    i32 1, label %.thread2
+    i32 2, label %.thread3
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = bitcast float %desired to i32
+  %2 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %3 = atomicrmw volatile xchg i32 addrspace(1)* %2, i32 %1 monotonic
+  br label %18
+
+.thread2:                                         ; preds = %0
+  %4 = bitcast float %desired to i32
+  %5 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %6 = atomicrmw volatile xchg i32 addrspace(1)* %5, i32 %4 acquire
+  br label %18
+
+.thread3:                                         ; preds = %0
+  %7 = bitcast float %desired to i32
+  %8 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %9 = atomicrmw volatile xchg i32 addrspace(1)* %8, i32 %7 release
+  br label %18
+
+; <label>:10                                      ; preds = %0
+  ; Remaining orders: 3 -> acq_rel (%14), everything else -> seq_cst (%16).
+  %11 = icmp eq i32 %order, 3
+  %12 = bitcast float %desired to i32
+  %13 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br i1 %11, label %14, label %16
+
+; <label>:14                                      ; preds = %10
+  %15 = atomicrmw volatile xchg i32 addrspace(1)* %13, i32 %12 acq_rel
+  br label %18
+
+; <label>:16                                      ; preds = %10
+  %17 = atomicrmw volatile xchg i32 addrspace(1)* %13, i32 %12 seq_cst
+  br label %18
+
+; <label>:18                                      ; preds = %16, %14, %.thread3, %.thread2, %.thread
+  ; Merge the exchanged bits and reinterpret them as float for the return.
+  %.sroa.0.0 = phi i32 [ %3, %.thread ], [ %17, %16 ], [ %15, %14 ], [ %9, %.thread3 ], [ %6, %.thread2 ]
+  %19 = bitcast i32 %.sroa.0.0 to float
+  ret float %19
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_compare_exchange_strong__global, float variant.
+; Compares the float at %object (address space 1) with *%expected and, when
+; they match bit-for-bit, replaces it with %desired. All payloads are bitcast
+; to i32 so the operation is an integer cmpxchg (strong form: no "weak" flag).
+;   %expected  in/out; read with a plain (non-atomic) load as the comparison
+;              value, and overwritten with the observed value when the
+;              exchange fails.
+;   %desired   value written on success.
+;   %success, %failure
+;              each is first translated to an internal code:
+;              0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5.
+;              The codes then select the cmpxchg orderings:
+;                success code 1 or 2 : acquire; failure is acquire when the
+;                                      failure code is 1 or 2, else monotonic
+;                success code 3      : release / monotonic
+;                success code 4      : acq_rel; failure is acquire when the
+;                                      failure code is 1 or 2, else monotonic
+;                success code 5      : seq_cst; failure is acquire for codes
+;                                      1 or 2, seq_cst for 5, else monotonic
+;                otherwise (code 0)  : monotonic / monotonic
+;   %scope     is accepted but never read in this body.
+; Returns i1 true when the exchange took place.
+; NOTE(review): the internal codes look like C11 memory_order numbering
+; (relaxed = 0 ... seq_cst = 5) - confirm against the generating source.
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU3AS1U7_AtomicfPff12memory_orderS3_12memory_scope(float addrspace(1)* %object, float* nocapture %expected, float %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Translate %success into the internal code collected in %10.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  ; %10 = success code. Now translate %failure the same way into %21.
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast float %desired to i32
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  ; %21 = failure code. Dispatch on the success code first.
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %23 = bitcast float* %expected to i32*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  ; Acquire success: failure code 1 or 2 -> %42, otherwise -> %39.
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i32, i32* %23, align 4
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  ; Acq_rel success: failure code 1 or 2 -> %64, otherwise -> %61.
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i32, i32* %23, align 4
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  ; Seq_cst success: pick the failure ordering from the failure code.
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  ; Common exit: turn the i8 success flag from the taken path back into i1.
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i32, i32* %23, align 4
+  %33 = cmpxchg volatile i32 addrspace(1)* %22, i32 %32, i32 %11 monotonic monotonic
+  %34 = extractvalue { i32, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  ; Exchange failed: write the observed value back to *expected.
+  %36 = extractvalue { i32, i1 } %33, 0
+  store i32 %36, i32* %23, align 4
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg volatile i32 addrspace(1)* %22, i32 %25, i32 %11 acquire monotonic
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg volatile i32 addrspace(1)* %22, i32 %25, i32 %11 acquire acquire
+  %44 = extractvalue { i32, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i32, i1 } %40, 0
+  store i32 %46, i32* %23, align 4
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i32, i1 } %43, 0
+  store i32 %50, i32* %23, align 4
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i32, i32* %23, align 4
+  %55 = cmpxchg volatile i32 addrspace(1)* %22, i32 %54, i32 %11 release monotonic
+  %56 = extractvalue { i32, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i32, i1 } %55, 0
+  store i32 %58, i32* %23, align 4
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg volatile i32 addrspace(1)* %22, i32 %27, i32 %11 acq_rel monotonic
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg volatile i32 addrspace(1)* %22, i32 %27, i32 %11 acq_rel acquire
+  %66 = extractvalue { i32, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i32, i1 } %62, 0
+  store i32 %68, i32* %23, align 4
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i32, i1 } %65, 0
+  store i32 %72, i32* %23, align 4
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i32, i32* %23, align 4
+  %77 = cmpxchg volatile i32 addrspace(1)* %22, i32 %76, i32 %11 seq_cst monotonic
+  %78 = extractvalue { i32, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i32, i32* %23, align 4
+  %81 = cmpxchg volatile i32 addrspace(1)* %22, i32 %80, i32 %11 seq_cst acquire
+  %82 = extractvalue { i32, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i32, i32* %23, align 4
+  %85 = cmpxchg volatile i32 addrspace(1)* %22, i32 %84, i32 %11 seq_cst seq_cst
+  %86 = extractvalue { i32, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i32, i1 } %77, 0
+  store i32 %88, i32* %23, align 4
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i32, i1 } %81, 0
+  store i32 %92, i32* %23, align 4
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i32, i1 } %85, 0
+  store i32 %96, i32* %23, align 4
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_compare_exchange_weak__global, float variant.
+; Compares the float at %object (address space 1) with *%expected and, when
+; they match bit-for-bit, replaces it with %desired. All payloads are bitcast
+; to i32 so the operation is an integer cmpxchg; every cmpxchg here carries
+; the "weak" flag.
+;   %expected  in/out; read with a plain (non-atomic) load as the comparison
+;              value, and overwritten with the observed value when the
+;              exchange fails.
+;   %desired   value written on success.
+;   %success, %failure
+;              each is first translated to an internal code:
+;              0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5.
+;              The codes then select the cmpxchg orderings:
+;                success code 1 or 2 : acquire; failure is acquire when the
+;                                      failure code is 1 or 2, else monotonic
+;                success code 3      : release / monotonic
+;                success code 4      : acq_rel; failure is acquire when the
+;                                      failure code is 1 or 2, else monotonic
+;                success code 5      : seq_cst; failure is acquire for codes
+;                                      1 or 2, seq_cst for 5, else monotonic
+;                otherwise (code 0)  : monotonic / monotonic
+;   %scope     is accepted but never read in this body.
+; Returns i1 true when the exchange took place.
+; NOTE(review): the internal codes look like C11 memory_order numbering
+; (relaxed = 0 ... seq_cst = 5) - confirm against the generating source.
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU3AS1U7_AtomicfPff12memory_orderS3_12memory_scope(float addrspace(1)* %object, float* nocapture %expected, float %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Translate %success into the internal code collected in %10.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  ; %10 = success code. Now translate %failure the same way into %21.
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast float %desired to i32
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  ; %21 = failure code. Dispatch on the success code first.
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %23 = bitcast float* %expected to i32*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  ; Acquire success: failure code 1 or 2 -> %42, otherwise -> %39.
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i32, i32* %23, align 4
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  ; Acq_rel success: failure code 1 or 2 -> %64, otherwise -> %61.
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i32, i32* %23, align 4
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  ; Seq_cst success: pick the failure ordering from the failure code.
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  ; Common exit: turn the i8 success flag from the taken path back into i1.
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i32, i32* %23, align 4
+  %33 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %32, i32 %11 monotonic monotonic
+  %34 = extractvalue { i32, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  ; Exchange failed: write the observed value back to *expected.
+  %36 = extractvalue { i32, i1 } %33, 0
+  store i32 %36, i32* %23, align 4
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %25, i32 %11 acquire monotonic
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %25, i32 %11 acquire acquire
+  %44 = extractvalue { i32, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i32, i1 } %40, 0
+  store i32 %46, i32* %23, align 4
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i32, i1 } %43, 0
+  store i32 %50, i32* %23, align 4
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i32, i32* %23, align 4
+  %55 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %54, i32 %11 release monotonic
+  %56 = extractvalue { i32, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i32, i1 } %55, 0
+  store i32 %58, i32* %23, align 4
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %27, i32 %11 acq_rel monotonic
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %27, i32 %11 acq_rel acquire
+  %66 = extractvalue { i32, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i32, i1 } %62, 0
+  store i32 %68, i32* %23, align 4
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i32, i1 } %65, 0
+  store i32 %72, i32* %23, align 4
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i32, i32* %23, align 4
+  %77 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %76, i32 %11 seq_cst monotonic
+  %78 = extractvalue { i32, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i32, i32* %23, align 4
+  %81 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %80, i32 %11 seq_cst acquire
+  %82 = extractvalue { i32, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i32, i32* %23, align 4
+  %85 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %84, i32 %11 seq_cst seq_cst
+  %86 = extractvalue { i32, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i32, i1 } %77, 0
+  store i32 %88, i32* %23, align 4
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i32, i1 } %81, 0
+  store i32 %92, i32* %23, align 4
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i32, i1 } %85, 0
+  store i32 %96, i32* %23, align 4
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_store__global, 64-bit integer variant.
+; Atomically stores %desired to the i64 at %object (address space 1) with
+; 8-byte alignment.
+;   %order  picks the LLVM ordering: 2 -> release; 0, 1 and 3 -> monotonic;
+;           any other value -> seq_cst.
+;   %scope  is accepted but never read in this body.
+define void @_Z25pocl_atomic_store__globalPVU3AS1U7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* nocapture %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  ; Orders 0, 1 and 3 all share this monotonic store.
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object monotonic, align 8
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object release, align 8
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object seq_cst, align 8
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_load__global, 64-bit integer variant.
+; Atomically loads and returns the i64 at %object (address space 1) with
+; 8-byte alignment.
+;   %order  picks the LLVM ordering: 1 -> acquire; 0, 2 and 3 -> monotonic;
+;           any other value -> seq_cst.
+;   %scope  is accepted but never read in this body.
+define i64 @_Z24pocl_atomic_load__globalPVU3AS1U7_Atomicl12memory_order12memory_scope(i64 addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  ; Orders 0, 2 and 3 all share this monotonic load.
+  %1 = load atomic volatile i64, i64 addrspace(1)* %object monotonic, align 8
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i64, i64 addrspace(1)* %object acquire, align 8
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i64, i64 addrspace(1)* %object seq_cst, align 8
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  ; Merge the value loaded on whichever ordering path executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; pocl_atomic_exchange__global, 64-bit integer variant.
+; Atomically swaps %desired into the i64 at %object (address space 1) and
+; returns the value atomicrmw yields, i.e. the contents before the swap.
+;   %order  picks the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+;           2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+;   %scope  is accepted but never read in this body.
+define i64 @_Z28pocl_atomic_exchange__globalPVU3AS1U7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  ; One basic block per ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering path executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU3AS1U7_AtomiclPll12memory_orderS3_12memory_scope(i64 addrspace(1)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg volatile i64 addrspace(1)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg volatile i64 addrspace(1)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg volatile i64 addrspace(1)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg volatile i64 addrspace(1)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg volatile i64 addrspace(1)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Weak 64-bit compare-and-exchange on an addrspace(1) object.
+;
+; Exactly one `cmpxchg weak volatile` runs per call. Its compare operand is the
+; value loaded from %expected, its new value is %desired, and the i1 it yields
+; (element 1 of the result pair) is what the function returns. When that bit is
+; false, element 0 of the pair is stored back through %expected first.
+;
+; %success and %failure are both renumbered before use:
+;   0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5
+; NOTE(review): this looks like a translation from the OpenCL memory_order
+; numbering to the C11 one -- confirm against the generating source.
+; The renumbered pair then selects the orderings written on the cmpxchg:
+;   success 0 (switch default) : monotonic, failure monotonic
+;   success 1 or 2             : acquire,   failure acquire if failure is 1 or 2, else monotonic
+;   success 3                  : release,   failure monotonic
+;   success 4                  : acq_rel,   failure acquire if failure is 1 or 2, else monotonic
+;   success 5                  : seq_cst,   failure acquire for 1 or 2, seq_cst for 5, else monotonic
+; %scope is not referenced anywhere in the body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU3AS1U7_AtomiclPll12memory_orderS3_12memory_scope(i64 addrspace(1)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Entry plus blocks %2, %4, %6 test %success against 0, 1, 2, 3; the phi in %9 yields the renumbered value.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  ; %10 = renumbered %success. The same comparison chain now starts for %failure.
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  ; %20 = renumbered %failure. Dispatch on the success value; %10 is never 1, and
+  ; only %10 == 0 reaches the default, which is the all-monotonic case.
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Success ordering acquire. %switch is true exactly when %20 is 1 or 2
+  ; (unsigned %20 - 1 < 2). %expected is read here, ahead of the two-way split.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Success ordering acq_rel; same failure test as in %21.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Success ordering seq_cst: choose the failure ordering from %20.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: %.0 is the zero-extended success bit of whichever cmpxchg ran.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  ; Case monotonic / monotonic. Every cmpxchg block in this function is followed
+  ; by the same pair of blocks as %32 and %34 below.
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Exchange failed: write element 0 of the cmpxchg result back through %expected.
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  ; Widen the success bit to i8 for the phi in %26.
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  ; Case acquire / monotonic.
+  %37 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  ; Case acquire / acquire.
+  %40 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  ; Case release / monotonic.
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  ; Case acq_rel / monotonic.
+  %59 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  ; Case acq_rel / acquire.
+  %62 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  ; Case seq_cst / monotonic.
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  ; Case seq_cst / acquire.
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  ; Case seq_cst / seq_cst.
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Atomic fetch-and-add on a 64-bit object in addrspace(1).
+; One `atomicrmw volatile add` of %operand is executed per call; %order selects
+; its ordering: 0 monotonic, 1 acquire, 2 release, 3 acq_rel, any other value
+; seq_cst. The result of that atomicrmw is returned. %scope is never referenced.
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_add__globalPVU3AS1U7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %0
+  %old.monotonic = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %0
+  %old.acquire = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %0
+  %old.release = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %0
+  %old.acq_rel = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %0
+  %old.seq_cst = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Atomic fetch-and-subtract on a 64-bit object in addrspace(1).
+; One `atomicrmw volatile sub` of %operand is executed per call; %order selects
+; its ordering: 0 monotonic, 1 acquire, 2 release, 3 acq_rel, any other value
+; seq_cst. The result of that atomicrmw is returned. %scope is never referenced.
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_sub__globalPVU3AS1U7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %0
+  %old.monotonic = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %0
+  %old.acquire = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %0
+  %old.release = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %0
+  %old.acq_rel = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %0
+  %old.seq_cst = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Atomic fetch-and-or on a 64-bit object in addrspace(1).
+; One `atomicrmw volatile or` of %operand is executed per call; %order selects
+; its ordering: 0 monotonic, 1 acquire, 2 release, 3 acq_rel, any other value
+; seq_cst. The result of that atomicrmw is returned. %scope is never referenced.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_or__globalPVU3AS1U7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %0
+  %old.monotonic = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %0
+  %old.acquire = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %0
+  %old.release = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %0
+  %old.acq_rel = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %0
+  %old.seq_cst = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Atomic fetch-and-xor on a 64-bit object in addrspace(1).
+; One `atomicrmw volatile xor` of %operand is executed per call; %order selects
+; its ordering: 0 monotonic, 1 acquire, 2 release, 3 acq_rel, any other value
+; seq_cst. The result of that atomicrmw is returned. %scope is never referenced.
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_xor__globalPVU3AS1U7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %0
+  %old.monotonic = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %0
+  %old.acquire = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %0
+  %old.release = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %0
+  %old.acq_rel = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %0
+  %old.seq_cst = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Atomic fetch-and-and on a 64-bit object in addrspace(1).
+; One `atomicrmw volatile and` of %operand is executed per call; %order selects
+; its ordering: 0 monotonic, 1 acquire, 2 release, 3 acq_rel, any other value
+; seq_cst. The result of that atomicrmw is returned. %scope is never referenced.
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_and__globalPVU3AS1U7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %0
+  %old.monotonic = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %0
+  %old.acquire = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %0
+  %old.release = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %0
+  %old.acq_rel = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %0
+  %old.seq_cst = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Atomic fetch-and-min on a 64-bit object in addrspace(1), using the signed
+; `atomicrmw min` operation.
+; One `atomicrmw volatile min` of %operand is executed per call; %order selects
+; its ordering: 0 monotonic, 1 acquire, 2 release, 3 acq_rel, any other value
+; seq_cst. The result of that atomicrmw is returned. %scope is never referenced.
+;
+; The definition must not be `noreturn` and %object must not be `readnone`:
+; every path below performs an atomicrmw through %object and then reaches `ret`,
+; so either attribute would contradict the body and let an optimizer discard
+; the memory access or the code following a call. It therefore uses attribute
+; group #0 and a plain %object parameter, like the other fetch_* definitions.
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_min__globalPVU3AS1U7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever atomicrmw executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Atomic fetch-and-max on a 64-bit object in addrspace(1), using the signed
+; `atomicrmw max` operation.
+; One `atomicrmw volatile max` of %operand is executed per call; %order selects
+; its ordering: 0 monotonic, 1 acquire, 2 release, 3 acq_rel, any other value
+; seq_cst. The result of that atomicrmw is returned. %scope is never referenced.
+;
+; The definition must not be `noreturn` and %object must not be `readnone`:
+; every path below performs an atomicrmw through %object and then reaches `ret`,
+; so either attribute would contradict the body and let an optimizer discard
+; the memory access or the code following a call. It therefore uses attribute
+; group #0 and a plain %object parameter, like the other fetch_* definitions.
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_max__globalPVU3AS1U7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever atomicrmw executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Atomic 64-bit store of %desired to an addrspace(1) object.
+; One `store atomic volatile` is executed per call; %order selects its ordering:
+; 2 gives release, 0, 1 and 3 all give monotonic, and any other value gives
+; seq_cst. %scope is never referenced.
+; Function Attrs: nounwind uwtable
+define void @_Z25pocl_atomic_store__globalPVU3AS1U7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* nocapture %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %st.seq_cst [
+    i32 0, label %st.monotonic
+    i32 1, label %st.monotonic
+    i32 2, label %st.release
+    i32 3, label %st.monotonic
+  ]
+
+st.monotonic:                                     ; preds = %0, %0, %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object monotonic, align 8
+  br label %done
+
+st.release:                                       ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object release, align 8
+  br label %done
+
+st.seq_cst:                                       ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object seq_cst, align 8
+  br label %done
+
+done:                                             ; preds = %st.seq_cst, %st.release, %st.monotonic
+  ret void
+}
+
+; Atomic 64-bit load from an addrspace(1) object.
+; One `load atomic volatile` is executed per call; %order selects its ordering:
+; 1 gives acquire, 0, 2 and 3 all give monotonic, and any other value gives
+; seq_cst. The loaded value is returned. %scope is never referenced.
+; Function Attrs: nounwind uwtable
+define i64 @_Z24pocl_atomic_load__globalPVU3AS1U7_Atomicm12memory_order12memory_scope(i64 addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %ld.seq_cst [
+    i32 0, label %ld.monotonic
+    i32 1, label %ld.acquire
+    i32 2, label %ld.monotonic
+    i32 3, label %ld.monotonic
+  ]
+
+ld.monotonic:                                     ; preds = %0, %0, %0
+  %val.monotonic = load atomic volatile i64, i64 addrspace(1)* %object monotonic, align 8
+  br label %done
+
+ld.acquire:                                       ; preds = %0
+  %val.acquire = load atomic volatile i64, i64 addrspace(1)* %object acquire, align 8
+  br label %done
+
+ld.seq_cst:                                       ; preds = %0
+  %val.seq_cst = load atomic volatile i64, i64 addrspace(1)* %object seq_cst, align 8
+  br label %done
+
+done:                                             ; preds = %ld.seq_cst, %ld.acquire, %ld.monotonic
+  %val = phi i64 [ %val.monotonic, %ld.monotonic ], [ %val.acquire, %ld.acquire ], [ %val.seq_cst, %ld.seq_cst ]
+  ret i64 %val
+}
+
+; Atomic 64-bit exchange on an addrspace(1) object.
+; One `atomicrmw volatile xchg` with %desired is executed per call; %order
+; selects its ordering: 0 monotonic, 1 acquire, 2 release, 3 acq_rel, any other
+; value seq_cst. The result of that atomicrmw is returned. %scope is never
+; referenced.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_exchange__globalPVU3AS1U7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %xchg.seq_cst [
+    i32 0, label %xchg.monotonic
+    i32 1, label %xchg.acquire
+    i32 2, label %xchg.release
+    i32 3, label %xchg.acq_rel
+  ]
+
+xchg.monotonic:                                   ; preds = %0
+  %old.monotonic = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired monotonic
+  br label %done
+
+xchg.acquire:                                     ; preds = %0
+  %old.acquire = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired acquire
+  br label %done
+
+xchg.release:                                     ; preds = %0
+  %old.release = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired release
+  br label %done
+
+xchg.acq_rel:                                     ; preds = %0
+  %old.acq_rel = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired acq_rel
+  br label %done
+
+xchg.seq_cst:                                     ; preds = %0
+  %old.seq_cst = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired seq_cst
+  br label %done
+
+done:                                             ; preds = %xchg.seq_cst, %xchg.acq_rel, %xchg.release, %xchg.acquire, %xchg.monotonic
+  %old = phi i64 [ %old.monotonic, %xchg.monotonic ], [ %old.acquire, %xchg.acquire ], [ %old.release, %xchg.release ], [ %old.acq_rel, %xchg.acq_rel ], [ %old.seq_cst, %xchg.seq_cst ]
+  ret i64 %old
+}
+
+; Strong 64-bit compare-and-exchange on an addrspace(1) object.
+;
+; Exactly one `cmpxchg volatile` (no `weak` flag) runs per call. Its compare
+; operand is the value loaded from %expected, its new value is %desired, and the
+; i1 it yields (element 1 of the result pair) is what the function returns. When
+; that bit is false, element 0 of the pair is stored back through %expected first.
+;
+; %success and %failure are both renumbered before use:
+;   0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5
+; NOTE(review): this looks like a translation from the OpenCL memory_order
+; numbering to the C11 one -- confirm against the generating source.
+; The renumbered pair then selects the orderings written on the cmpxchg:
+;   success 0 (switch default) : monotonic, failure monotonic
+;   success 1 or 2             : acquire,   failure acquire if failure is 1 or 2, else monotonic
+;   success 3                  : release,   failure monotonic
+;   success 4                  : acq_rel,   failure acquire if failure is 1 or 2, else monotonic
+;   success 5                  : seq_cst,   failure acquire for 1 or 2, seq_cst for 5, else monotonic
+; %scope is not referenced anywhere in the body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU3AS1U7_AtomicmPmm12memory_orderS3_12memory_scope(i64 addrspace(1)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Entry plus blocks %2, %4, %6 test %success against 0, 1, 2, 3; the phi in %9 yields the renumbered value.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  ; %10 = renumbered %success. The same comparison chain now starts for %failure.
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  ; %20 = renumbered %failure. Dispatch on the success value; %10 is never 1, and
+  ; only %10 == 0 reaches the default, which is the all-monotonic case.
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Success ordering acquire. %switch is true exactly when %20 is 1 or 2
+  ; (unsigned %20 - 1 < 2). %expected is read here, ahead of the two-way split.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Success ordering acq_rel; same failure test as in %21.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Success ordering seq_cst: choose the failure ordering from %20.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: %.0 is the zero-extended success bit of whichever cmpxchg ran.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  ; Case monotonic / monotonic. Every cmpxchg block in this function is followed
+  ; by the same pair of blocks as %32 and %34 below.
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg volatile i64 addrspace(1)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Exchange failed: write element 0 of the cmpxchg result back through %expected.
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  ; Widen the success bit to i8 for the phi in %26.
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  ; Case acquire / monotonic.
+  %37 = cmpxchg volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  ; Case acquire / acquire.
+  %40 = cmpxchg volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  ; Case release / monotonic.
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg volatile i64 addrspace(1)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  ; Case acq_rel / monotonic.
+  %59 = cmpxchg volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  ; Case acq_rel / acquire.
+  %62 = cmpxchg volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  ; Case seq_cst / monotonic.
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg volatile i64 addrspace(1)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  ; Case seq_cst / acquire.
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg volatile i64 addrspace(1)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  ; Case seq_cst / seq_cst.
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg volatile i64 addrspace(1)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+; Weak compare-and-exchange on a 64-bit integer located in addrspace(1).
+; Loads the comparand from %expected, issues exactly one
+; `cmpxchg weak volatile` on %object with %desired as the replacement value
+; and, when the exchange fails, stores the value observed in %object back to
+; %expected.  Returns the success bit of the cmpxchg.
+; %success / %failure choose the orderings of the cmpxchg through the
+; dispatch below; %scope is not referenced anywhere in the body.
+; NOTE(review): the numeric values 0..3 are presumably memory_order
+; relaxed/acquire/release/acq_rel - confirm against the pocl header.
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU3AS1U7_AtomicmPmm12memory_orderS3_12memory_scope(i64 addrspace(1)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+; Remap %success into %10: 0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, anything else -> 5.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; %10 is the remapped success code; the same remapping is now applied to
+; %failure, producing %20.
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; Dispatch on the remapped success code %10:
+;   1 or 2 -> acquire (%21), 3 -> release (%50), 4 -> acq_rel (%23),
+;   5 -> seq_cst (%25), any other value (i.e. 0) -> monotonic (%28).
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; Success ordering acquire: failure ordering is acquire when the remapped
+; failure code %20 is 1 or 2 (%39), monotonic otherwise (%36).
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; Success ordering acq_rel: failure ordering is acquire when %20 is 1 or 2
+; (%61), monotonic otherwise (%58).
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; Success ordering seq_cst: failure ordering is acquire for %20 in {1, 2}
+; (%76), seq_cst for %20 == 5 (%80), monotonic otherwise (%72).
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; Common exit: every path arrives with its success bit widened to i8; it is
+; narrowed back to i1 for the return value.
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; monotonic / monotonic
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; Failure path: publish the observed value through %expected.
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; acquire / monotonic
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; acquire / acquire
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; Failure path of the acquire / monotonic exchange.
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; Failure path of the acquire / acquire exchange.
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; release / monotonic (the failure code %20 is not consulted on this path)
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; Failure path of the release / monotonic exchange.
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; acq_rel / monotonic
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; acq_rel / acquire
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; Failure path of the acq_rel / monotonic exchange.
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; Failure path of the acq_rel / acquire exchange.
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; seq_cst / monotonic
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; seq_cst / acquire
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; seq_cst / seq_cst
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; Failure path of the seq_cst / monotonic exchange.
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; Failure path of the seq_cst / acquire exchange.
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; Failure path of the seq_cst / seq_cst exchange.
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-add on a 64-bit integer in addrspace(1).
+; Performs one `atomicrmw volatile add` of %operand on %object and returns
+; the value the instruction yields.  %order picks the ordering:
+; 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, any other
+; value -> seq_cst.  %scope is not referenced in the body.
+define i64 @_Z29pocl_atomic_fetch_add__globalPVU3AS1U7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+entry:
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %entry
+  %old.monotonic = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %entry
+  %old.acquire = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %entry
+  %old.release = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %entry
+  %old.acq_rel = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %entry
+  %old.seq_cst = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+; Merge the result of whichever ordering variant executed.
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-subtract on a 64-bit integer in addrspace(1).
+; Performs one `atomicrmw volatile sub` of %operand on %object and returns
+; the value the instruction yields.  %order picks the ordering:
+; 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, any other
+; value -> seq_cst.  %scope is not referenced in the body.
+define i64 @_Z29pocl_atomic_fetch_sub__globalPVU3AS1U7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+entry:
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %entry
+  %old.monotonic = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %entry
+  %old.acquire = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %entry
+  %old.release = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %entry
+  %old.acq_rel = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %entry
+  %old.seq_cst = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+; Merge the result of whichever ordering variant executed.
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-OR on a 64-bit integer in addrspace(1).
+; Performs one `atomicrmw volatile or` of %operand on %object and returns
+; the value the instruction yields.  %order picks the ordering:
+; 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, any other
+; value -> seq_cst.  %scope is not referenced in the body.
+define i64 @_Z28pocl_atomic_fetch_or__globalPVU3AS1U7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+entry:
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %entry
+  %old.monotonic = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %entry
+  %old.acquire = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %entry
+  %old.release = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %entry
+  %old.acq_rel = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %entry
+  %old.seq_cst = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+; Merge the result of whichever ordering variant executed.
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-XOR on a 64-bit integer in addrspace(1).
+; Performs one `atomicrmw volatile xor` of %operand on %object and returns
+; the value the instruction yields.  %order picks the ordering:
+; 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, any other
+; value -> seq_cst.  %scope is not referenced in the body.
+define i64 @_Z29pocl_atomic_fetch_xor__globalPVU3AS1U7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+entry:
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %entry
+  %old.monotonic = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %entry
+  %old.acquire = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %entry
+  %old.release = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %entry
+  %old.acq_rel = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %entry
+  %old.seq_cst = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+; Merge the result of whichever ordering variant executed.
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-AND on a 64-bit integer in addrspace(1).
+; Performs one `atomicrmw volatile and` of %operand on %object and returns
+; the value the instruction yields.  %order picks the ordering:
+; 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, any other
+; value -> seq_cst.  %scope is not referenced in the body.
+define i64 @_Z29pocl_atomic_fetch_and__globalPVU3AS1U7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+entry:
+  switch i32 %order, label %rmw.seq_cst [
+    i32 0, label %rmw.monotonic
+    i32 1, label %rmw.acquire
+    i32 2, label %rmw.release
+    i32 3, label %rmw.acq_rel
+  ]
+
+rmw.monotonic:                                    ; preds = %entry
+  %old.monotonic = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %done
+
+rmw.acquire:                                      ; preds = %entry
+  %old.acquire = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand acquire
+  br label %done
+
+rmw.release:                                      ; preds = %entry
+  %old.release = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand release
+  br label %done
+
+rmw.acq_rel:                                      ; preds = %entry
+  %old.acq_rel = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %done
+
+rmw.seq_cst:                                      ; preds = %entry
+  %old.seq_cst = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %done
+
+; Merge the result of whichever ordering variant executed.
+done:                                             ; preds = %rmw.seq_cst, %rmw.acq_rel, %rmw.release, %rmw.acquire, %rmw.monotonic
+  %old = phi i64 [ %old.monotonic, %rmw.monotonic ], [ %old.acquire, %rmw.acquire ], [ %old.release, %rmw.release ], [ %old.acq_rel, %rmw.acq_rel ], [ %old.seq_cst, %rmw.seq_cst ]
+  ret i64 %old
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-minimum (unsigned) on a 64-bit integer in addrspace(1).
+; Performs one `atomicrmw volatile umin` of %operand on %object and returns
+; the value the instruction yields.  %order picks the ordering:
+; 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, any other
+; value -> seq_cst.  %scope is not referenced in the body.
+; The function returns normally and accesses memory through %object, so it
+; uses attribute group #0 (like the other fetch_* functions) and carries no
+; `readnone` on %object; `noreturn` / `readnone` would contradict the body.
+define i64 @_Z29pocl_atomic_fetch_min__globalPVU3AS1U7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; Merge the result of whichever ordering variant executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-maximum (unsigned) on a 64-bit integer in addrspace(1).
+; Performs one `atomicrmw volatile umax` of %operand on %object and returns
+; the value the instruction yields.  %order picks the ordering:
+; 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, any other
+; value -> seq_cst.  %scope is not referenced in the body.
+; The function returns normally and accesses memory through %object, so it
+; uses attribute group #0 (like the other fetch_* functions) and carries no
+; `readnone` on %object; `noreturn` / `readnone` would contradict the body.
+define i64 @_Z29pocl_atomic_fetch_max__globalPVU3AS1U7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; Merge the result of whichever ordering variant executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic store of a double into addrspace(1).
+; The double is reinterpreted as an i64 bit pattern and written with one
+; `store atomic volatile` (align 8).  %order picks the ordering:
+; 2 -> release; 0, 1 and 3 -> monotonic; any other value -> seq_cst.
+; %scope is not referenced in the body.
+define void @_Z25pocl_atomic_store__globalPVU3AS1U7_Atomicdd12memory_order12memory_scope(double addrspace(1)* nocapture %object, double %desired, i32 %order, i32 %scope) #0 {
+entry:
+  ; Both views are needed on every path, so compute them once up front.
+  %bits = bitcast double %desired to i64
+  %slot = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  switch i32 %order, label %store.seq_cst [
+    i32 0, label %store.monotonic
+    i32 1, label %store.monotonic
+    i32 3, label %store.monotonic
+    i32 2, label %store.release
+  ]
+
+store.monotonic:                                  ; preds = %entry, %entry, %entry
+  store atomic volatile i64 %bits, i64 addrspace(1)* %slot monotonic, align 8
+  br label %done
+
+store.release:                                    ; preds = %entry
+  store atomic volatile i64 %bits, i64 addrspace(1)* %slot release, align 8
+  br label %done
+
+store.seq_cst:                                    ; preds = %entry
+  store atomic volatile i64 %bits, i64 addrspace(1)* %slot seq_cst, align 8
+  br label %done
+
+done:                                             ; preds = %store.seq_cst, %store.release, %store.monotonic
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic load of a double from addrspace(1).
+; Reads the 64-bit pattern with one `load atomic volatile` (align 8) and
+; reinterprets it as a double.  %order picks the ordering:
+; 1 -> acquire; 0, 2 and 3 -> monotonic; any other value -> seq_cst.
+; %scope is not referenced in the body.
+define double @_Z24pocl_atomic_load__globalPVU3AS1U7_Atomicd12memory_order12memory_scope(double addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+entry:
+  ; The integer view of the pointer is shared by every path.
+  %slot = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  switch i32 %order, label %load.seq_cst [
+    i32 0, label %load.monotonic
+    i32 2, label %load.monotonic
+    i32 3, label %load.monotonic
+    i32 1, label %load.acquire
+  ]
+
+load.monotonic:                                   ; preds = %entry, %entry, %entry
+  %bits.monotonic = load atomic volatile i64, i64 addrspace(1)* %slot monotonic, align 8
+  br label %done
+
+load.acquire:                                     ; preds = %entry
+  %bits.acquire = load atomic volatile i64, i64 addrspace(1)* %slot acquire, align 8
+  br label %done
+
+load.seq_cst:                                     ; preds = %entry
+  %bits.seq_cst = load atomic volatile i64, i64 addrspace(1)* %slot seq_cst, align 8
+  br label %done
+
+; Merge the loaded bit pattern and hand it back as a double.
+done:                                             ; preds = %load.seq_cst, %load.acquire, %load.monotonic
+  %bits = phi i64 [ %bits.monotonic, %load.monotonic ], [ %bits.acquire, %load.acquire ], [ %bits.seq_cst, %load.seq_cst ]
+  %value = bitcast i64 %bits to double
+  ret double %value
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic exchange of a double in addrspace(1).
+; The new value is reinterpreted as an i64 bit pattern, swapped in with one
+; `atomicrmw volatile xchg`, and the value the instruction yields is
+; returned reinterpreted as a double.  %order picks the ordering:
+; 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, any other
+; value -> seq_cst.  %scope is not referenced in the body.
+define double @_Z28pocl_atomic_exchange__globalPVU3AS1U7_Atomicdd12memory_order12memory_scope(double addrspace(1)* %object, double %desired, i32 %order, i32 %scope) #0 {
+entry:
+  ; Both integer views are needed on every path, so compute them once.
+  %bits = bitcast double %desired to i64
+  %slot = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  switch i32 %order, label %xchg.seq_cst [
+    i32 0, label %xchg.monotonic
+    i32 1, label %xchg.acquire
+    i32 2, label %xchg.release
+    i32 3, label %xchg.acq_rel
+  ]
+
+xchg.monotonic:                                   ; preds = %entry
+  %old.monotonic = atomicrmw volatile xchg i64 addrspace(1)* %slot, i64 %bits monotonic
+  br label %done
+
+xchg.acquire:                                     ; preds = %entry
+  %old.acquire = atomicrmw volatile xchg i64 addrspace(1)* %slot, i64 %bits acquire
+  br label %done
+
+xchg.release:                                     ; preds = %entry
+  %old.release = atomicrmw volatile xchg i64 addrspace(1)* %slot, i64 %bits release
+  br label %done
+
+xchg.acq_rel:                                     ; preds = %entry
+  %old.acq_rel = atomicrmw volatile xchg i64 addrspace(1)* %slot, i64 %bits acq_rel
+  br label %done
+
+xchg.seq_cst:                                     ; preds = %entry
+  %old.seq_cst = atomicrmw volatile xchg i64 addrspace(1)* %slot, i64 %bits seq_cst
+  br label %done
+
+; Merge the previous bit pattern and hand it back as a double.
+done:                                             ; preds = %xchg.seq_cst, %xchg.acq_rel, %xchg.release, %xchg.acquire, %xchg.monotonic
+  %old = phi i64 [ %old.monotonic, %xchg.monotonic ], [ %old.acquire, %xchg.acquire ], [ %old.release, %xchg.release ], [ %old.acq_rel, %xchg.acq_rel ], [ %old.seq_cst, %xchg.seq_cst ]
+  %previous = bitcast i64 %old to double
+  ret double %previous
+}
+
+; Function Attrs: nounwind uwtable
+; Strong compare-and-exchange on a double located in addrspace(1).
+; %object, %expected and %desired are reinterpreted as 64-bit integers, so
+; the comparison is on the bit pattern.  Loads the comparand from %expected,
+; issues exactly one `cmpxchg volatile` (no `weak`) on %object and, when the
+; exchange fails, stores the observed bit pattern back to %expected.
+; Returns the success bit of the cmpxchg.
+; %success / %failure choose the orderings of the cmpxchg through the
+; dispatch below; %scope is not referenced anywhere in the body.
+; NOTE(review): the numeric values 0..3 are presumably memory_order
+; relaxed/acquire/release/acq_rel - confirm against the pocl header.
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU3AS1U7_AtomicdPdd12memory_orderS3_12memory_scope(double addrspace(1)* %object, double* nocapture %expected, double %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+; Remap %success into %10: 0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, anything else -> 5.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; %10 is the remapped success code and %11 the bit pattern of %desired; the
+; same remapping is now applied to %failure, producing %21.
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast double %desired to i64
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; %22 / %23 are the integer views of %object / %expected used by every path.
+; Dispatch on the remapped success code %10:
+;   1 or 2 -> acquire (%24), 3 -> release (%53), 4 -> acq_rel (%26),
+;   5 -> seq_cst (%28), any other value (i.e. 0) -> monotonic (%31).
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  %23 = bitcast double* %expected to i64*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; Success ordering acquire: failure ordering is acquire when the remapped
+; failure code %21 is 1 or 2 (%42), monotonic otherwise (%39).
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i64, i64* %23, align 8
+  br i1 %switch, label %42, label %39
+
+; Success ordering acq_rel: failure ordering is acquire when %21 is 1 or 2
+; (%64), monotonic otherwise (%61).
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i64, i64* %23, align 8
+  br i1 %switch2, label %64, label %61
+
+; Success ordering seq_cst: failure ordering is acquire for %21 in {1, 2}
+; (%79), seq_cst for %21 == 5 (%83), monotonic otherwise (%75).
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; Common exit: every path arrives with its success bit widened to i8; it is
+; narrowed back to i1 for the return value.
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; monotonic / monotonic
+; <label>:31                                      ; preds = %20
+  %32 = load i64, i64* %23, align 8
+  %33 = cmpxchg volatile i64 addrspace(1)* %22, i64 %32, i64 %11 monotonic monotonic
+  %34 = extractvalue { i64, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; Failure path: publish the observed bit pattern through %expected.
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i64, i1 } %33, 0
+  store i64 %36, i64* %23, align 8
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; acquire / monotonic
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg volatile i64 addrspace(1)* %22, i64 %25, i64 %11 acquire monotonic
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; acquire / acquire
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg volatile i64 addrspace(1)* %22, i64 %25, i64 %11 acquire acquire
+  %44 = extractvalue { i64, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; Failure path of the acquire / monotonic exchange.
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i64, i1 } %40, 0
+  store i64 %46, i64* %23, align 8
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; Failure path of the acquire / acquire exchange.
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i64, i1 } %43, 0
+  store i64 %50, i64* %23, align 8
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; release / monotonic (the failure code %21 is not consulted on this path)
+; <label>:53                                      ; preds = %20
+  %54 = load i64, i64* %23, align 8
+  %55 = cmpxchg volatile i64 addrspace(1)* %22, i64 %54, i64 %11 release monotonic
+  %56 = extractvalue { i64, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; Failure path of the release / monotonic exchange.
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i64, i1 } %55, 0
+  store i64 %58, i64* %23, align 8
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; acq_rel / monotonic
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg volatile i64 addrspace(1)* %22, i64 %27, i64 %11 acq_rel monotonic
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; acq_rel / acquire
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg volatile i64 addrspace(1)* %22, i64 %27, i64 %11 acq_rel acquire
+  %66 = extractvalue { i64, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; Failure path of the acq_rel / monotonic exchange.
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i64, i1 } %62, 0
+  store i64 %68, i64* %23, align 8
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; Failure path of the acq_rel / acquire exchange.
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i64, i1 } %65, 0
+  store i64 %72, i64* %23, align 8
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; seq_cst / monotonic
+; <label>:75                                      ; preds = %28
+  %76 = load i64, i64* %23, align 8
+  %77 = cmpxchg volatile i64 addrspace(1)* %22, i64 %76, i64 %11 seq_cst monotonic
+  %78 = extractvalue { i64, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; seq_cst / acquire
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i64, i64* %23, align 8
+  %81 = cmpxchg volatile i64 addrspace(1)* %22, i64 %80, i64 %11 seq_cst acquire
+  %82 = extractvalue { i64, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; seq_cst / seq_cst
+; <label>:83                                      ; preds = %28
+  %84 = load i64, i64* %23, align 8
+  %85 = cmpxchg volatile i64 addrspace(1)* %22, i64 %84, i64 %11 seq_cst seq_cst
+  %86 = extractvalue { i64, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; Failure path of the seq_cst / monotonic exchange.
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i64, i1 } %77, 0
+  store i64 %88, i64* %23, align 8
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; Failure path of the seq_cst / acquire exchange.
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i64, i1 } %81, 0
+  store i64 %92, i64* %23, align 8
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; Failure path of the seq_cst / seq_cst exchange.
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i64, i1 } %85, 0
+  store i64 %96, i64* %23, align 8
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU3AS1U7_AtomicdPdd12memory_orderS3_12memory_scope(double addrspace(1)* %object, double* nocapture %expected, double %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast double %desired to i64
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  %23 = bitcast double* %expected to i64*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i64, i64* %23, align 8
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i64, i64* %23, align 8
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i64, i64* %23, align 8
+  %33 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %32, i64 %11 monotonic monotonic
+  %34 = extractvalue { i64, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i64, i1 } %33, 0
+  store i64 %36, i64* %23, align 8
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %25, i64 %11 acquire monotonic
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %25, i64 %11 acquire acquire
+  %44 = extractvalue { i64, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i64, i1 } %40, 0
+  store i64 %46, i64* %23, align 8
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i64, i1 } %43, 0
+  store i64 %50, i64* %23, align 8
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i64, i64* %23, align 8
+  %55 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %54, i64 %11 release monotonic
+  %56 = extractvalue { i64, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i64, i1 } %55, 0
+  store i64 %58, i64* %23, align 8
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %27, i64 %11 acq_rel monotonic
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %27, i64 %11 acq_rel acquire
+  %66 = extractvalue { i64, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i64, i1 } %62, 0
+  store i64 %68, i64* %23, align 8
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i64, i1 } %65, 0
+  store i64 %72, i64* %23, align 8
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i64, i64* %23, align 8
+  %77 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %76, i64 %11 seq_cst monotonic
+  %78 = extractvalue { i64, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i64, i64* %23, align 8
+  %81 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %80, i64 %11 seq_cst acquire
+  %82 = extractvalue { i64, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i64, i64* %23, align 8
+  %85 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %84, i64 %11 seq_cst seq_cst
+  %86 = extractvalue { i64, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i64, i1 } %77, 0
+  store i64 %88, i64* %23, align 8
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i64, i1 } %81, 0
+  store i64 %92, i64* %23, align 8
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i64, i1 } %85, 0
+  store i64 %96, i64* %23, align 8
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; pocl_atomic_flag_test_and_set, __local variant (i32 flag word in addrspace(2)).
+; Atomically exchanges the word at %object with 1 and returns true iff the
+; previous value was non-zero.
+;   %order selects the ordering of the exchange:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z36pocl_atomic_flag_test_and_set__localPVU3AS2U7_Atomici12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous flag value from whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  %9 = icmp ne i32 %.0, 0
+  ret i1 %9
+}
+
+; pocl_atomic_flag_clear, __local variant (i32 flag word in addrspace(2)).
+; Atomically stores 0 to the word at %object.
+;   %order selects the ordering of the store:
+;     0, 1 and 3 -> monotonic, 2 -> release, any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define void @_Z29pocl_atomic_flag_clear__localPVU3AS2U7_Atomici12memory_order12memory_scope(i32 addrspace(2)* nocapture %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 0, i32 addrspace(2)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 0, i32 addrspace(2)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 0, i32 addrspace(2)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; pocl_atomic_store, __local variant for i32 (object in addrspace(2)).
+; Atomically stores %desired to the word at %object.
+;   %order selects the ordering of the store:
+;     0, 1 and 3 -> monotonic, 2 -> release, any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define void @_Z24pocl_atomic_store__localPVU3AS2U7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* nocapture %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; pocl_atomic_load, __local variant for i32 (object in addrspace(2)).
+; Atomically loads and returns the word at %object.
+;   %order selects the ordering of the load:
+;     0, 2 and 3 -> monotonic, 1 -> acquire, any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z23pocl_atomic_load__localPVU3AS2U7_Atomici12memory_order12memory_scope(i32 addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i32, i32 addrspace(2)* %object monotonic, align 4
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i32, i32 addrspace(2)* %object acquire, align 4
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i32, i32 addrspace(2)* %object seq_cst, align 4
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  ; Merge the loaded value from whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_exchange, __local variant for i32 (object in addrspace(2)).
+; Atomically replaces the word at %object with %desired and returns the
+; previous value.
+;   %order selects the ordering of the exchange:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z27pocl_atomic_exchange__localPVU3AS2U7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value from whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_compare_exchange_strong, __local variant for i32 (object in
+; addrspace(2), %expected in the default address space).
+; Performs a volatile (non-weak) cmpxchg on %object, comparing against the
+; value loaded from %expected and installing %desired on a match. When the
+; cmpxchg reports failure, the value it observed is stored back to %expected.
+; Returns the cmpxchg success bit.
+;   %success picks the success ordering:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %failure picks the failure ordering, depending on the success ordering:
+;     success monotonic or release: always monotonic;
+;     success acquire or acq_rel:   acquire if %failure == 1, else monotonic;
+;     success seq_cst:              acquire if %failure == 1, seq_cst if
+;                                   %failure is not in 0..3, else monotonic.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU3AS2U7_AtomiciPii12memory_orderS3_12memory_scope(i32 addrspace(2)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Blocks %0..%6 translate %success into an internal code that is merged
+  ; into %10: 0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, anything else -> 5.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  ; Blocks %9..%16 apply the same translation to %failure, merged into %20.
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Dispatch on the success code: 1/2 -> acquire, 3 -> release, 4 -> acq_rel,
+  ; 5 -> seq_cst; the default target (%28) handles code 0 (monotonic).
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Success = acquire: failure codes 1..2 use acquire, everything else monotonic.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Success = acq_rel: failure codes 1..2 use acquire, everything else monotonic.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Success = seq_cst: failure codes 1..2 -> acquire, 5 -> seq_cst,
+  ; everything else -> monotonic.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: merge the zero-extended success flag of the cmpxchg that ran
+  ; and convert it back to i1 for the return value.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg volatile i32 addrspace(2)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Failure path: write the observed value back to %expected.
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg volatile i32 addrspace(2)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg volatile i32 addrspace(2)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg volatile i32 addrspace(2)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg volatile i32 addrspace(2)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_compare_exchange_weak, __local variant for i32 (object in
+; addrspace(2), %expected in the default address space).
+; Same control flow as the strong variant, but every cmpxchg carries the
+; `weak` marker. Compares %object against the value loaded from %expected and
+; installs %desired on a match. When the cmpxchg reports failure, the value it
+; observed is stored back to %expected. Returns the cmpxchg success bit.
+;   %success picks the success ordering:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %failure picks the failure ordering, depending on the success ordering:
+;     success monotonic or release: always monotonic;
+;     success acquire or acq_rel:   acquire if %failure == 1, else monotonic;
+;     success seq_cst:              acquire if %failure == 1, seq_cst if
+;                                   %failure is not in 0..3, else monotonic.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU3AS2U7_AtomiciPii12memory_orderS3_12memory_scope(i32 addrspace(2)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Blocks %0..%6 translate %success into an internal code that is merged
+  ; into %10: 0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, anything else -> 5.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  ; Blocks %9..%16 apply the same translation to %failure, merged into %20.
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Dispatch on the success code: 1/2 -> acquire, 3 -> release, 4 -> acq_rel,
+  ; 5 -> seq_cst; the default target (%28) handles code 0 (monotonic).
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Success = acquire: failure codes 1..2 use acquire, everything else monotonic.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Success = acq_rel: failure codes 1..2 use acquire, everything else monotonic.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Success = seq_cst: failure codes 1..2 -> acquire, 5 -> seq_cst,
+  ; everything else -> monotonic.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: merge the zero-extended success flag of the cmpxchg that ran
+  ; and convert it back to i1 for the return value.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Failure path: write the observed value back to %expected.
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_fetch_add, __local variant for i32 (object in addrspace(2)).
+; Atomically adds %operand to the word at %object and returns the value the
+; word held before the update.
+;   %order selects the ordering of the read-modify-write:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_add__localPVU3AS2U7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value from whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_sub, __local variant for i32 (object in addrspace(2)).
+; Atomically subtracts %operand from the word at %object and returns the
+; value the word held before the update.
+;   %order selects the ordering of the read-modify-write:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_sub__localPVU3AS2U7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value from whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_or, __local variant for i32 (object in addrspace(2)).
+; Atomically ORs %operand into the word at %object and returns the value the
+; word held before the update.
+;   %order selects the ordering of the read-modify-write:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z27pocl_atomic_fetch_or__localPVU3AS2U7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value from whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_xor, __local variant for i32 (object in addrspace(2)).
+; Atomically XORs %operand into the word at %object and returns the value the
+; word held before the update.
+;   %order selects the ordering of the read-modify-write:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_xor__localPVU3AS2U7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value from whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_and, __local variant for i32 (object in addrspace(2)).
+; Atomically ANDs %operand into the word at %object and returns the value the
+; word held before the update.
+;   %order selects the ordering of the read-modify-write:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_and__localPVU3AS2U7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value from whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_min, __local variant for i32 (object in addrspace(2)).
+; Atomically replaces the word at %object with the minimum of its current
+; value and %operand (atomicrmw `min`, the signed comparison) and returns the
+; value the word held before the update.
+;   %order selects the ordering of the read-modify-write:
+;     0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;     any other value -> seq_cst.
+;   %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_min__localPVU3AS2U7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value from whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-max on an i32 in addrspace(2), using the signed `max`
+; atomicrmw operation. Returns the value produced by the atomicrmw, i.e. the
+; contents of %object before the update.
+;   %object  - address of the atomic i32 (addrspace(2))
+;   %operand - value combined with *%object
+;   %order   - selects the ordering: 0 monotonic, 1 acquire, 2 release,
+;              3 acq_rel, any other value seq_cst
+;   %scope   - not referenced in the body
+define i32 @_Z28pocl_atomic_fetch_max__localPVU3AS2U7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on %order; the default destination %6 is the seq_cst variant.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Join point: forward whichever atomicrmw result was produced.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic volatile store of an i32 to addrspace(2).
+;   %object  - destination address (addrspace(2))
+;   %desired - value to store
+;   %order   - selects the ordering: 0, 1 and 3 all use monotonic, 2 uses
+;              release, any other value uses seq_cst
+;   %scope   - not referenced in the body
+; NOTE(review): orders 1 and 3 collapsing to monotonic presumably reflects
+; orderings that are not valid for a store - confirm against the source kernel
+; library code.
+define void @_Z24pocl_atomic_store__localPVU3AS2U7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* nocapture %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic volatile load of an i32 from addrspace(2); returns the loaded value.
+;   %object - source address (addrspace(2))
+;   %order  - selects the ordering: 0, 2 and 3 all use monotonic, 1 uses
+;             acquire, any other value uses seq_cst
+;   %scope  - not referenced in the body
+; NOTE(review): orders 2 and 3 collapsing to monotonic presumably reflects
+; orderings that are not valid for a load - confirm against the source kernel
+; library code.
+define i32 @_Z23pocl_atomic_load__localPVU3AS2U7_Atomicj12memory_order12memory_scope(i32 addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i32, i32 addrspace(2)* %object monotonic, align 4
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i32, i32 addrspace(2)* %object acquire, align 4
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i32, i32 addrspace(2)* %object seq_cst, align 4
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  ; Join point: forward whichever load was executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic exchange (atomicrmw xchg) on an i32 in addrspace(2). Returns the value
+; produced by the atomicrmw, i.e. the contents of %object before the exchange.
+;   %object  - address of the atomic i32 (addrspace(2))
+;   %desired - value written to *%object
+;   %order   - selects the ordering: 0 monotonic, 1 acquire, 2 release,
+;              3 acq_rel, any other value seq_cst
+;   %scope   - not referenced in the body
+define i32 @_Z27pocl_atomic_exchange__localPVU3AS2U7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  ; Dispatch on %order; the default destination %6 is the seq_cst variant.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Join point: forward whichever atomicrmw result was produced.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Strong compare-and-exchange on an i32 in addrspace(2).
+;   %object   - address of the atomic i32 (addrspace(2))
+;   %expected - pointer to the comparison value; overwritten with the value
+;               observed in *%object when the exchange fails
+;   %desired  - value written to *%object when the comparison succeeds
+;   %success  - ordering selector for the success case
+;   %failure  - ordering selector for the failure case
+;   %scope    - not referenced in the body
+; Returns the cmpxchg success bit (i1).
+;
+; Both selectors are first remapped to an internal code (phi nodes %10 and
+; %20): 0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, anything else -> 5. The function then
+; dispatches on the pair of codes to one of nine cmpxchg instructions:
+;   success code 0 (default) : monotonic / monotonic
+;   success code 2           : acquire   / acquire if failure code is 1 or 2,
+;                              otherwise acquire / monotonic
+;   success code 3           : release   / monotonic
+;   success code 4           : acq_rel   / acquire if failure code is 1 or 2,
+;                              otherwise acq_rel / monotonic
+;   success code 5           : seq_cst   / acquire if failure code is 1 or 2,
+;                              seq_cst / seq_cst if it is 5,
+;                              otherwise seq_cst / monotonic
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU3AS2U7_AtomicjPjj12memory_orderS3_12memory_scope(i32 addrspace(2)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; --- Remap %success to the internal ordering code (result in %10). ---
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  ; --- Remap %failure the same way (result in %20). ---
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Dispatch on the success code. %10 only takes the values 0, 2, 3, 4, 5, so
+  ; the `i32 1` case is never selected by the phi above.
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Acquire on success: %switch is true when the failure code is 1 or 2.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Acq_rel on success: %switch2 is true when the failure code is 1 or 2.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Seq_cst on success: pick the failure ordering from the failure code.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: every variant arrives here with its success flag widened to i8.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  ; Variant: monotonic / monotonic.
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg volatile i32 addrspace(2)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Failure path: write the observed value back to *%expected.
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  ; Variant: acquire / monotonic.
+  %37 = cmpxchg volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  ; Variant: acquire / acquire.
+  %40 = cmpxchg volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  ; Variant: release / monotonic.
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg volatile i32 addrspace(2)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  ; Variant: acq_rel / monotonic.
+  %59 = cmpxchg volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  ; Variant: acq_rel / acquire.
+  %62 = cmpxchg volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  ; Variant: seq_cst / monotonic.
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg volatile i32 addrspace(2)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  ; Variant: seq_cst / acquire.
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg volatile i32 addrspace(2)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  ; Variant: seq_cst / seq_cst.
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg volatile i32 addrspace(2)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+; Weak compare-and-exchange on an i32 in addrspace(2); every variant below
+; uses `cmpxchg weak`.
+;   %object   - address of the atomic i32 (addrspace(2))
+;   %expected - pointer to the comparison value; overwritten with the value
+;               observed in *%object when the exchange fails
+;   %desired  - value written to *%object when the comparison succeeds
+;   %success  - ordering selector for the success case
+;   %failure  - ordering selector for the failure case
+;   %scope    - not referenced in the body
+; Returns the cmpxchg success bit (i1).
+;
+; Both selectors are first remapped to an internal code (phi nodes %10 and
+; %20): 0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, anything else -> 5. The function then
+; dispatches on the pair of codes to one of nine cmpxchg instructions:
+;   success code 0 (default) : monotonic / monotonic
+;   success code 2           : acquire   / acquire if failure code is 1 or 2,
+;                              otherwise acquire / monotonic
+;   success code 3           : release   / monotonic
+;   success code 4           : acq_rel   / acquire if failure code is 1 or 2,
+;                              otherwise acq_rel / monotonic
+;   success code 5           : seq_cst   / acquire if failure code is 1 or 2,
+;                              seq_cst / seq_cst if it is 5,
+;                              otherwise seq_cst / monotonic
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU3AS2U7_AtomicjPjj12memory_orderS3_12memory_scope(i32 addrspace(2)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; --- Remap %success to the internal ordering code (result in %10). ---
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  ; --- Remap %failure the same way (result in %20). ---
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Dispatch on the success code. %10 only takes the values 0, 2, 3, 4, 5, so
+  ; the `i32 1` case is never selected by the phi above.
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Acquire on success: %switch is true when the failure code is 1 or 2.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Acq_rel on success: %switch2 is true when the failure code is 1 or 2.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Seq_cst on success: pick the failure ordering from the failure code.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: every variant arrives here with its success flag widened to i8.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  ; Variant: monotonic / monotonic.
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Failure path: write the observed value back to *%expected.
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  ; Variant: acquire / monotonic.
+  %37 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  ; Variant: acquire / acquire.
+  %40 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  ; Variant: release / monotonic.
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  ; Variant: acq_rel / monotonic.
+  %59 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  ; Variant: acq_rel / acquire.
+  %62 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  ; Variant: seq_cst / monotonic.
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  ; Variant: seq_cst / acquire.
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  ; Variant: seq_cst / seq_cst.
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-add (atomicrmw add) on an i32 in addrspace(2). Returns the
+; value produced by the atomicrmw, i.e. the contents of %object before the
+; update.
+;   %object  - address of the atomic i32 (addrspace(2))
+;   %operand - value added to *%object
+;   %order   - selects the ordering: 0 monotonic, 1 acquire, 2 release,
+;              3 acq_rel, any other value seq_cst
+;   %scope   - not referenced in the body
+define i32 @_Z28pocl_atomic_fetch_add__localPVU3AS2U7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on %order; the default destination %6 is the seq_cst variant.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Join point: forward whichever atomicrmw result was produced.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-subtract (atomicrmw sub) on an i32 in addrspace(2). Returns
+; the value produced by the atomicrmw, i.e. the contents of %object before the
+; update.
+;   %object  - address of the atomic i32 (addrspace(2))
+;   %operand - value subtracted from *%object
+;   %order   - selects the ordering: 0 monotonic, 1 acquire, 2 release,
+;              3 acq_rel, any other value seq_cst
+;   %scope   - not referenced in the body
+define i32 @_Z28pocl_atomic_fetch_sub__localPVU3AS2U7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on %order; the default destination %6 is the seq_cst variant.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Join point: forward whichever atomicrmw result was produced.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-or (atomicrmw or) on an i32 in addrspace(2). Returns the
+; value produced by the atomicrmw, i.e. the contents of %object before the
+; update.
+;   %object  - address of the atomic i32 (addrspace(2))
+;   %operand - value OR-ed into *%object
+;   %order   - selects the ordering: 0 monotonic, 1 acquire, 2 release,
+;              3 acq_rel, any other value seq_cst
+;   %scope   - not referenced in the body
+define i32 @_Z27pocl_atomic_fetch_or__localPVU3AS2U7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on %order; the default destination %6 is the seq_cst variant.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Join point: forward whichever atomicrmw result was produced.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-xor (atomicrmw xor) on an i32 in addrspace(2). Returns the
+; value produced by the atomicrmw, i.e. the contents of %object before the
+; update.
+;   %object  - address of the atomic i32 (addrspace(2))
+;   %operand - value XOR-ed into *%object
+;   %order   - selects the ordering: 0 monotonic, 1 acquire, 2 release,
+;              3 acq_rel, any other value seq_cst
+;   %scope   - not referenced in the body
+define i32 @_Z28pocl_atomic_fetch_xor__localPVU3AS2U7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on %order; the default destination %6 is the seq_cst variant.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Join point: forward whichever atomicrmw result was produced.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-and (atomicrmw and) on an i32 in addrspace(2). Returns the
+; value produced by the atomicrmw, i.e. the contents of %object before the
+; update.
+;   %object  - address of the atomic i32 (addrspace(2))
+;   %operand - value AND-ed into *%object
+;   %order   - selects the ordering: 0 monotonic, 1 acquire, 2 release,
+;              3 acq_rel, any other value seq_cst
+;   %scope   - not referenced in the body
+define i32 @_Z28pocl_atomic_fetch_and__localPVU3AS2U7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on %order; the default destination %6 is the seq_cst variant.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Join point: forward whichever atomicrmw result was produced.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-min on an i32 in addrspace(2), using the unsigned `umin`
+; atomicrmw operation. Returns the value produced by the atomicrmw, i.e. the
+; contents of %object before the update.
+;   %object  - address of the atomic i32 (addrspace(2))
+;   %operand - value combined with *%object
+;   %order   - selects the ordering: 0 monotonic, 1 acquire, 2 release,
+;              3 acq_rel, any other value seq_cst
+;   %scope   - not referenced in the body
+define i32 @_Z28pocl_atomic_fetch_min__localPVU3AS2U7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on %order; the default destination %6 is the seq_cst variant.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Join point: forward whichever atomicrmw result was produced.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic fetch-and-max on an i32 in addrspace(2), using the unsigned `umax`
+; atomicrmw operation. Returns the value produced by the atomicrmw, i.e. the
+; contents of %object before the update.
+;   %object  - address of the atomic i32 (addrspace(2))
+;   %operand - value combined with *%object
+;   %order   - selects the ordering: 0 monotonic, 1 acquire, 2 release,
+;              3 acq_rel, any other value seq_cst
+;   %scope   - not referenced in the body
+define i32 @_Z28pocl_atomic_fetch_max__localPVU3AS2U7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on %order; the default destination %6 is the seq_cst variant.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Join point: forward whichever atomicrmw result was produced.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic volatile store of a float to addrspace(2). The float value and the
+; pointer are bitcast to i32 / i32 addrspace(2)* and the store is performed as
+; a 32-bit integer atomic store.
+;   %object  - destination address (addrspace(2))
+;   %desired - value to store
+;   %order   - selects the ordering: 0, 1 and 3 all use monotonic, 2 uses
+;              release, any other value uses seq_cst
+;   %scope   - not referenced in the body
+define void @_Z24pocl_atomic_store__localPVU3AS2U7_Atomicff12memory_order12memory_scope(float addrspace(2)* nocapture %object, float %desired, i32 %order, i32 %scope) #0 {
+  ; Orders 0-2 are handled by the switch; everything else goes to %5, which
+  ; separates order 3 (monotonic) from the seq_cst fallback.
+  switch i32 %order, label %5 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast float %desired to i32
+  %2 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  store atomic volatile i32 %1, i32 addrspace(2)* %2 release, align 4
+  br label %13
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast float %desired to i32
+  %4 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br label %9
+
+; <label>:5                                       ; preds = %0
+  %6 = icmp eq i32 %order, 3
+  %7 = bitcast float %desired to i32
+  %8 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br i1 %6, label %9, label %12
+
+; <label>:9                                       ; preds = %5, %.thread
+  ; Shared monotonic store, reached for orders 0, 1 and 3.
+  %10 = phi i32 addrspace(2)* [ %4, %.thread ], [ %8, %5 ]
+  %11 = phi i32 [ %3, %.thread ], [ %7, %5 ]
+  store atomic volatile i32 %11, i32 addrspace(2)* %10 monotonic, align 4
+  br label %13
+
+; <label>:12                                      ; preds = %5
+  store atomic volatile i32 %7, i32 addrspace(2)* %8 seq_cst, align 4
+  br label %13
+
+; <label>:13                                      ; preds = %12, %.thread1, %9
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+; Atomic volatile load of a float from addrspace(2). The pointer is bitcast to
+; i32 addrspace(2)*, the value is loaded as a 32-bit integer atomic load, and
+; the result is bitcast back to float before returning.
+;   %object - source address (addrspace(2))
+;   %order  - selects the ordering: 0, 2 and 3 all use monotonic, 1 uses
+;             acquire, any other value uses seq_cst
+;   %scope  - not referenced in the body
+define float @_Z23pocl_atomic_load__localPVU3AS2U7_Atomicf12memory_order12memory_scope(float addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  ; Orders 0-2 are handled by the switch; everything else goes to %4, which
+  ; separates order 3 (monotonic) from the seq_cst fallback.
+  switch i32 %order, label %4 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %2 = load atomic volatile i32, i32 addrspace(2)* %1 acquire, align 4
+  br label %12
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br label %7
+
+; <label>:4                                       ; preds = %0
+  %5 = icmp eq i32 %order, 3
+  %6 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br i1 %5, label %7, label %10
+
+; <label>:7                                       ; preds = %4, %.thread
+  ; Shared monotonic load, reached for orders 0, 2 and 3.
+  %8 = phi i32 addrspace(2)* [ %3, %.thread ], [ %6, %4 ]
+  %9 = load atomic volatile i32, i32 addrspace(2)* %8 monotonic, align 4
+  br label %12
+
+; <label>:10                                      ; preds = %4
+  %11 = load atomic volatile i32, i32 addrspace(2)* %6 seq_cst, align 4
+  br label %12
+
+; <label>:12                                      ; preds = %10, %.thread1, %7
+  ; Join point: reinterpret the loaded 32 bits as a float.
+  %.sroa.0.0 = phi i32 [ %9, %7 ], [ %11, %10 ], [ %2, %.thread1 ]
+  %13 = bitcast i32 %.sroa.0.0 to float
+  ret float %13
+}
+
+; pocl_atomic_exchange for a float object in addrspace(2).
+; Atomically replaces the 32-bit word at %object with the bits of %desired
+; (atomicrmw xchg) and returns the previous contents reinterpreted as a
+; float. The memory ordering is selected at run time:
+;   %order == 0 -> monotonic
+;   %order == 1 -> acquire
+;   %order == 2 -> release
+;   %order == 3 -> acq_rel
+;   otherwise   -> seq_cst
+; %scope is accepted but not referenced in this body.
+; Function Attrs: nounwind uwtable
+define float @_Z27pocl_atomic_exchange__localPVU3AS2U7_Atomicff12memory_order12memory_scope(float addrspace(2)* %object, float %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %10 [
+    i32 0, label %.thread
+    i32 1, label %.thread2
+    i32 2, label %.thread3
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = bitcast float %desired to i32
+  %2 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %3 = atomicrmw volatile xchg i32 addrspace(2)* %2, i32 %1 monotonic
+  br label %18
+
+.thread2:                                         ; preds = %0
+  %4 = bitcast float %desired to i32
+  %5 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %6 = atomicrmw volatile xchg i32 addrspace(2)* %5, i32 %4 acquire
+  br label %18
+
+.thread3:                                         ; preds = %0
+  %7 = bitcast float %desired to i32
+  %8 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %9 = atomicrmw volatile xchg i32 addrspace(2)* %8, i32 %7 release
+  br label %18
+
+; Default switch target: distinguishes %order == 3 (acq_rel) from every
+; remaining value (seq_cst).
+; <label>:10                                      ; preds = %0
+  %11 = icmp eq i32 %order, 3
+  %12 = bitcast float %desired to i32
+  %13 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br i1 %11, label %14, label %16
+
+; <label>:14                                      ; preds = %10
+  %15 = atomicrmw volatile xchg i32 addrspace(2)* %13, i32 %12 acq_rel
+  br label %18
+
+; <label>:16                                      ; preds = %10
+  %17 = atomicrmw volatile xchg i32 addrspace(2)* %13, i32 %12 seq_cst
+  br label %18
+
+; Common exit: return the old word from whichever exchange ran, as a float.
+; <label>:18                                      ; preds = %16, %14, %.thread3, %.thread2, %.thread
+  %.sroa.0.0 = phi i32 [ %3, %.thread ], [ %17, %16 ], [ %15, %14 ], [ %9, %.thread3 ], [ %6, %.thread2 ]
+  %19 = bitcast i32 %.sroa.0.0 to float
+  ret float %19
+}
+
+; pocl_atomic_compare_exchange_strong for a float object in addrspace(2).
+; Compares the 32-bit word at %object with the word stored at %expected and,
+; if they are equal, replaces it with the bits of %desired (strong cmpxchg).
+; When the exchange fails, the value observed at %object is written back to
+; *%expected. Returns true when the exchange succeeded.
+;
+; %success and %failure are first remapped (0->0, 1->2, 2->3, 3->4,
+; anything else->5). The remapped pair selects the cmpxchg variant:
+;   success 0      : monotonic / monotonic
+;   success 1 or 2 : acquire   / acquire if failure is 1 or 2, else monotonic
+;   success 3      : release   / monotonic
+;   success 4      : acq_rel   / acquire if failure is 1 or 2, else monotonic
+;   success 5      : seq_cst   / acquire if failure is 1 or 2, seq_cst if
+;                                failure is 5, else monotonic
+; (all values in this table are the remapped ones).
+; %scope is accepted but not referenced in this body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU3AS2U7_AtomicfPff12memory_orderS3_12memory_scope(float addrspace(2)* %object, float* nocapture %expected, float %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; %10 holds the remapped %success; the same remapping is now applied to
+; %failure (result in %21).
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast float %desired to i32
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; Dispatch on the remapped success ordering; the default target (%31) is the
+; monotonic/monotonic variant.
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %23 = bitcast float* %expected to i32*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; Acquire success: %switch is true when the remapped failure ordering is 1
+; or 2, which selects the acquire failure ordering.
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i32, i32* %23, align 4
+  br i1 %switch, label %42, label %39
+
+; Acq_rel success: same failure-ordering test as above.
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i32, i32* %23, align 4
+  br i1 %switch2, label %64, label %61
+
+; Seq_cst success: choose among monotonic (default), acquire and seq_cst
+; failure orderings.
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; Common exit: convert the i8 success flag of whichever variant ran to i1.
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i32, i32* %23, align 4
+  %33 = cmpxchg volatile i32 addrspace(2)* %22, i32 %32, i32 %11 monotonic monotonic
+  %34 = extractvalue { i32, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; Failure path (repeated after every cmpxchg below): store the value that
+; was actually observed into *%expected.
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i32, i1 } %33, 0
+  store i32 %36, i32* %23, align 4
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg volatile i32 addrspace(2)* %22, i32 %25, i32 %11 acquire monotonic
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg volatile i32 addrspace(2)* %22, i32 %25, i32 %11 acquire acquire
+  %44 = extractvalue { i32, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i32, i1 } %40, 0
+  store i32 %46, i32* %23, align 4
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i32, i1 } %43, 0
+  store i32 %50, i32* %23, align 4
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i32, i32* %23, align 4
+  %55 = cmpxchg volatile i32 addrspace(2)* %22, i32 %54, i32 %11 release monotonic
+  %56 = extractvalue { i32, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i32, i1 } %55, 0
+  store i32 %58, i32* %23, align 4
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg volatile i32 addrspace(2)* %22, i32 %27, i32 %11 acq_rel monotonic
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg volatile i32 addrspace(2)* %22, i32 %27, i32 %11 acq_rel acquire
+  %66 = extractvalue { i32, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i32, i1 } %62, 0
+  store i32 %68, i32* %23, align 4
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i32, i1 } %65, 0
+  store i32 %72, i32* %23, align 4
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i32, i32* %23, align 4
+  %77 = cmpxchg volatile i32 addrspace(2)* %22, i32 %76, i32 %11 seq_cst monotonic
+  %78 = extractvalue { i32, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i32, i32* %23, align 4
+  %81 = cmpxchg volatile i32 addrspace(2)* %22, i32 %80, i32 %11 seq_cst acquire
+  %82 = extractvalue { i32, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i32, i32* %23, align 4
+  %85 = cmpxchg volatile i32 addrspace(2)* %22, i32 %84, i32 %11 seq_cst seq_cst
+  %86 = extractvalue { i32, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i32, i1 } %77, 0
+  store i32 %88, i32* %23, align 4
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i32, i1 } %81, 0
+  store i32 %92, i32* %23, align 4
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i32, i1 } %85, 0
+  store i32 %96, i32* %23, align 4
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; pocl_atomic_compare_exchange_weak for a float object in addrspace(2).
+; Compares the 32-bit word at %object with the word stored at %expected and,
+; if they are equal, replaces it with the bits of %desired. Every exchange
+; here is emitted as `cmpxchg weak`. When the exchange fails, the value
+; observed at %object is written back to *%expected. Returns true when the
+; exchange succeeded.
+;
+; %success and %failure are first remapped (0->0, 1->2, 2->3, 3->4,
+; anything else->5). The remapped pair selects the cmpxchg variant:
+;   success 0      : monotonic / monotonic
+;   success 1 or 2 : acquire   / acquire if failure is 1 or 2, else monotonic
+;   success 3      : release   / monotonic
+;   success 4      : acq_rel   / acquire if failure is 1 or 2, else monotonic
+;   success 5      : seq_cst   / acquire if failure is 1 or 2, seq_cst if
+;                                failure is 5, else monotonic
+; (all values in this table are the remapped ones).
+; %scope is accepted but not referenced in this body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU3AS2U7_AtomicfPff12memory_orderS3_12memory_scope(float addrspace(2)* %object, float* nocapture %expected, float %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; %10 holds the remapped %success; the same remapping is now applied to
+; %failure (result in %21).
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast float %desired to i32
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; Dispatch on the remapped success ordering; the default target (%31) is the
+; monotonic/monotonic variant.
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %23 = bitcast float* %expected to i32*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; Acquire success: %switch is true when the remapped failure ordering is 1
+; or 2, which selects the acquire failure ordering.
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i32, i32* %23, align 4
+  br i1 %switch, label %42, label %39
+
+; Acq_rel success: same failure-ordering test as above.
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i32, i32* %23, align 4
+  br i1 %switch2, label %64, label %61
+
+; Seq_cst success: choose among monotonic (default), acquire and seq_cst
+; failure orderings.
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; Common exit: convert the i8 success flag of whichever variant ran to i1.
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i32, i32* %23, align 4
+  %33 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %32, i32 %11 monotonic monotonic
+  %34 = extractvalue { i32, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; Failure path (repeated after every cmpxchg below): store the value that
+; was actually observed into *%expected.
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i32, i1 } %33, 0
+  store i32 %36, i32* %23, align 4
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %25, i32 %11 acquire monotonic
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %25, i32 %11 acquire acquire
+  %44 = extractvalue { i32, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i32, i1 } %40, 0
+  store i32 %46, i32* %23, align 4
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i32, i1 } %43, 0
+  store i32 %50, i32* %23, align 4
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i32, i32* %23, align 4
+  %55 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %54, i32 %11 release monotonic
+  %56 = extractvalue { i32, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i32, i1 } %55, 0
+  store i32 %58, i32* %23, align 4
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %27, i32 %11 acq_rel monotonic
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %27, i32 %11 acq_rel acquire
+  %66 = extractvalue { i32, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i32, i1 } %62, 0
+  store i32 %68, i32* %23, align 4
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i32, i1 } %65, 0
+  store i32 %72, i32* %23, align 4
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i32, i32* %23, align 4
+  %77 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %76, i32 %11 seq_cst monotonic
+  %78 = extractvalue { i32, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i32, i32* %23, align 4
+  %81 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %80, i32 %11 seq_cst acquire
+  %82 = extractvalue { i32, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i32, i32* %23, align 4
+  %85 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %84, i32 %11 seq_cst seq_cst
+  %86 = extractvalue { i32, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i32, i1 } %77, 0
+  store i32 %88, i32* %23, align 4
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i32, i1 } %81, 0
+  store i32 %92, i32* %23, align 4
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i32, i1 } %85, 0
+  store i32 %96, i32* %23, align 4
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; pocl_atomic_store for a 64-bit integer object in addrspace(2).
+; Atomically stores %desired to %object. The memory ordering is selected at
+; run time:
+;   %order == 0, 1 or 3 -> monotonic
+;   %order == 2         -> release
+;   any other value     -> seq_cst
+; %scope is accepted but not referenced in this body.
+; Function Attrs: nounwind uwtable
+define void @_Z24pocl_atomic_store__localPVU3AS2U7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* nocapture %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object monotonic, align 8
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object release, align 8
+  br label %2
+
+; Default switch target: every %order outside 0..3 stores with seq_cst.
+; <label>:1                                       ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object seq_cst, align 8
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; pocl_atomic_load for a 64-bit integer object in addrspace(2).
+; Atomically loads and returns the i64 at %object. The memory ordering is
+; selected at run time:
+;   %order == 0, 2 or 3 -> monotonic
+;   %order == 1         -> acquire
+;   any other value     -> seq_cst
+; %scope is accepted but not referenced in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z23pocl_atomic_load__localPVU3AS2U7_Atomicl12memory_order12memory_scope(i64 addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i64, i64 addrspace(2)* %object monotonic, align 8
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i64, i64 addrspace(2)* %object acquire, align 8
+  br label %5
+
+; Default switch target: every %order outside 0..3 loads with seq_cst.
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i64, i64 addrspace(2)* %object seq_cst, align 8
+  br label %5
+
+; Common exit: return the value loaded by whichever path ran.
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_exchange for a 64-bit integer object in addrspace(2).
+; Atomically replaces the i64 at %object with %desired (atomicrmw xchg) and
+; returns the previous contents. The memory ordering is selected at run
+; time:
+;   %order == 0 -> monotonic
+;   %order == 1 -> acquire
+;   %order == 2 -> release
+;   %order == 3 -> acq_rel
+;   otherwise   -> seq_cst
+; %scope is accepted but not referenced in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z27pocl_atomic_exchange__localPVU3AS2U7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired acq_rel
+  br label %8
+
+; Default switch target: every %order outside 0..3 exchanges with seq_cst.
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired seq_cst
+  br label %8
+
+; Common exit: return the old value from whichever exchange ran.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_compare_exchange_strong for a 64-bit integer object in
+; addrspace(2).
+; Compares the i64 at %object with the i64 stored at %expected and, if they
+; are equal, replaces it with %desired (strong cmpxchg). When the exchange
+; fails, the value observed at %object is written back to *%expected.
+; Returns true when the exchange succeeded.
+;
+; %success and %failure are first remapped (0->0, 1->2, 2->3, 3->4,
+; anything else->5). The remapped pair selects the cmpxchg variant:
+;   success 0      : monotonic / monotonic
+;   success 1 or 2 : acquire   / acquire if failure is 1 or 2, else monotonic
+;   success 3      : release   / monotonic
+;   success 4      : acq_rel   / acquire if failure is 1 or 2, else monotonic
+;   success 5      : seq_cst   / acquire if failure is 1 or 2, seq_cst if
+;                                failure is 5, else monotonic
+; (all values in this table are the remapped ones).
+; %scope is accepted but not referenced in this body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU3AS2U7_AtomiclPll12memory_orderS3_12memory_scope(i64 addrspace(2)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; %10 holds the remapped %success; the same remapping is now applied to
+; %failure (result in %20).
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; Dispatch on the remapped success ordering; the default target (%28) is the
+; monotonic/monotonic variant.
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; Acquire success: %switch is true when the remapped failure ordering is 1
+; or 2, which selects the acquire failure ordering.
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; Acq_rel success: same failure-ordering test as above.
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; Seq_cst success: choose among monotonic (default), acquire and seq_cst
+; failure orderings.
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; Common exit: convert the i8 success flag of whichever variant ran to i1.
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg volatile i64 addrspace(2)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; Failure path (repeated after every cmpxchg below): store the value that
+; was actually observed into *%expected.
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg volatile i64 addrspace(2)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg volatile i64 addrspace(2)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg volatile i64 addrspace(2)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg volatile i64 addrspace(2)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU3AS2U7_AtomiclPll12memory_orderS3_12memory_scope(i64 addrspace(2)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_fetch_add__local (Itanium-mangled; the "ll" in the symbol denotes
+; the signed long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically adds %operand to *%object and returns the value that was stored
+; there before the update (the result of the atomicrmw).
+; %order selects the memory ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_add__localPVU3AS2U7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value produced by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_sub__local (Itanium-mangled; the "ll" in the symbol denotes
+; the signed long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically subtracts %operand from *%object and returns the value that was
+; stored there before the update (the result of the atomicrmw).
+; %order selects the memory ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_sub__localPVU3AS2U7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value produced by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_or__local (Itanium-mangled; the "ll" in the symbol denotes
+; the signed long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically ORs %operand into *%object and returns the value that was stored
+; there before the update (the result of the atomicrmw).
+; %order selects the memory ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z27pocl_atomic_fetch_or__localPVU3AS2U7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value produced by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_xor__local (Itanium-mangled; the "ll" in the symbol denotes
+; the signed long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically XORs %operand into *%object and returns the value that was stored
+; there before the update (the result of the atomicrmw).
+; %order selects the memory ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_xor__localPVU3AS2U7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value produced by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_and__local (Itanium-mangled; the "ll" in the symbol denotes
+; the signed long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically ANDs %operand into *%object and returns the value that was stored
+; there before the update (the result of the atomicrmw).
+; %order selects the memory ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_and__localPVU3AS2U7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value produced by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_min__local (Itanium-mangled; the "ll" in the symbol denotes
+; the signed long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically replaces *%object with the signed minimum of its current value and
+; %operand, and returns the value that was stored there before the update.
+; %order selects the memory ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read in this body.
+; The function reaches `ret` on every path and dereferences %object, so it must
+; not carry `noreturn` (attribute group #1) nor mark %object `readnone`; it uses
+; the same attribute group (#0) and parameter attributes as the other
+; fetch_* functions.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_min__localPVU3AS2U7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value produced by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_max__local (Itanium-mangled; the "ll" in the symbol denotes
+; the signed long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically replaces *%object with the signed maximum of its current value and
+; %operand, and returns the value that was stored there before the update.
+; %order selects the memory ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read in this body.
+; The function reaches `ret` on every path and dereferences %object, so it must
+; not carry `noreturn` (attribute group #1) nor mark %object `readnone`; it uses
+; the same attribute group (#0) and parameter attributes as the other
+; fetch_* functions.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_max__localPVU3AS2U7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value produced by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_store__local (Itanium-mangled; the "mm" in the symbol denotes the
+; unsigned long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically stores %desired to *%object.
+; %order selects the memory ordering: 0, 1 and 3 -> monotonic, 2 -> release,
+; any other value -> seq_cst.
+; NOTE(review): orders 1 and 3 presumably fall back to monotonic because LLVM
+; does not allow acquire/acq_rel on an atomic store -- confirm against the
+; OpenCL source this was compiled from.
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define void @_Z24pocl_atomic_store__localPVU3AS2U7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* nocapture %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object monotonic, align 8
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object release, align 8
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object seq_cst, align 8
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; pocl_atomic_load__local (Itanium-mangled; the "m" in the symbol denotes the
+; unsigned long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically loads and returns *%object.
+; %order selects the memory ordering: 0, 2 and 3 -> monotonic, 1 -> acquire,
+; any other value -> seq_cst.
+; NOTE(review): orders 2 and 3 presumably fall back to monotonic because LLVM
+; does not allow release/acq_rel on an atomic load -- confirm against the
+; OpenCL source this was compiled from.
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z23pocl_atomic_load__localPVU3AS2U7_Atomicm12memory_order12memory_scope(i64 addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i64, i64 addrspace(2)* %object monotonic, align 8
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i64, i64 addrspace(2)* %object acquire, align 8
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i64, i64 addrspace(2)* %object seq_cst, align 8
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  ; Merge the value read by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_exchange__local (Itanium-mangled; the "mm" in the symbol denotes
+; the unsigned long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically writes %desired to *%object and returns the value that was stored
+; there before the exchange (the result of the atomicrmw xchg).
+; %order selects the memory ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z27pocl_atomic_exchange__localPVU3AS2U7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value produced by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_compare_exchange_strong__local (Itanium-mangled; the "m" codes in
+; the symbol denote the unsigned long overload) on a volatile _Atomic i64 in
+; addrspace(2).
+; Performs a strong cmpxchg of *%object against the value loaded from
+; *%expected: on a match %desired is written to *%object; on a mismatch the
+; value found in *%object is stored back into *%expected. Returns the cmpxchg
+; success flag.
+; Ordering selection, in terms of the raw arguments:
+;   %success == 0     -> monotonic / monotonic
+;   %success == 1     -> acquire   / (acquire if %failure == 1, else monotonic)
+;   %success == 2     -> release   / monotonic
+;   %success == 3     -> acq_rel   / (acquire if %failure == 1, else monotonic)
+;   any other value   -> seq_cst   / (acquire if %failure == 1, seq_cst if
+;                                     %failure is not in 0..3, else monotonic)
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU3AS2U7_AtomicmPmm12memory_orderS3_12memory_scope(i64 addrspace(2)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  ; %10 is %success remapped: 0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, other -> 5.
+  ; NOTE(review): the remapped codes look like the C11 __ATOMIC_* numbering -- confirm.
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  ; %20 is %failure remapped the same way as %10.
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Dispatch on the remapped success code; the default arm handles code 0.
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Acquire on success: failure code 1 or 2 picks acquire, anything else monotonic.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Acq_rel on success: failure code 1 or 2 picks acquire, anything else monotonic.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Seq_cst on success: pick the failure ordering from the remapped failure code.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: convert the merged success byte back to i1.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg volatile i64 addrspace(2)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Exchange failed: report the observed value through *%expected.
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg volatile i64 addrspace(2)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg volatile i64 addrspace(2)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg volatile i64 addrspace(2)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg volatile i64 addrspace(2)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_compare_exchange_weak__local (Itanium-mangled; the "m" codes in
+; the symbol denote the unsigned long overload) on a volatile _Atomic i64 in
+; addrspace(2).
+; Performs a weak cmpxchg of *%object against the value loaded from
+; *%expected: on success %desired is written to *%object; on failure the value
+; found in *%object is stored back into *%expected. Returns the cmpxchg
+; success flag. Every cmpxchg here carries the `weak` marker.
+; Ordering selection, in terms of the raw arguments:
+;   %success == 0     -> monotonic / monotonic
+;   %success == 1     -> acquire   / (acquire if %failure == 1, else monotonic)
+;   %success == 2     -> release   / monotonic
+;   %success == 3     -> acq_rel   / (acquire if %failure == 1, else monotonic)
+;   any other value   -> seq_cst   / (acquire if %failure == 1, seq_cst if
+;                                     %failure is not in 0..3, else monotonic)
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU3AS2U7_AtomicmPmm12memory_orderS3_12memory_scope(i64 addrspace(2)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  ; %10 is %success remapped: 0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, other -> 5.
+  ; NOTE(review): the remapped codes look like the C11 __ATOMIC_* numbering -- confirm.
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  ; %20 is %failure remapped the same way as %10.
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Dispatch on the remapped success code; the default arm handles code 0.
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Acquire on success: failure code 1 or 2 picks acquire, anything else monotonic.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Acq_rel on success: failure code 1 or 2 picks acquire, anything else monotonic.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Seq_cst on success: pick the failure ordering from the remapped failure code.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: convert the merged success byte back to i1.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Exchange failed: report the observed value through *%expected.
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_fetch_add__local (Itanium-mangled; the "mm" in the symbol denotes
+; the unsigned long overload) on a volatile _Atomic i64 in addrspace(2).
+; Atomically adds %operand to *%object and returns the value that was stored
+; there before the update (the result of the atomicrmw).
+; %order selects the memory ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read in this body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_add__localPVU3AS2U7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the previous value produced by whichever ordering variant ran.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_sub__localPVU3AS2U7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomically subtracts %operand from the i64 at %object; %scope is never read
+  switch i32 %order, label %6 [                   ; choose the atomicrmw ordering from %order
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]                                               ; anything else -> seq_cst (%6)
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever atomicrmw ran
+  ret i64 %.0                                     ; atomicrmw yields the value held at %object before the operation
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z27pocl_atomic_fetch_or__localPVU3AS2U7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomically ORs %operand into the i64 at %object; %scope is never read
+  switch i32 %order, label %6 [                   ; choose the atomicrmw ordering from %order
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]                                               ; anything else -> seq_cst (%6)
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever atomicrmw ran
+  ret i64 %.0                                     ; atomicrmw yields the value held at %object before the operation
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_xor__localPVU3AS2U7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomically XORs %operand into the i64 at %object; %scope is never read
+  switch i32 %order, label %6 [                   ; choose the atomicrmw ordering from %order
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]                                               ; anything else -> seq_cst (%6)
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever atomicrmw ran
+  ret i64 %.0                                     ; atomicrmw yields the value held at %object before the operation
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_and__localPVU3AS2U7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomically ANDs %operand into the i64 at %object; %scope is never read
+  switch i32 %order, label %6 [                   ; choose the atomicrmw ordering from %order
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]                                               ; anything else -> seq_cst (%6)
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever atomicrmw ran
+  ret i64 %.0                                     ; atomicrmw yields the value held at %object before the operation
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_min__localPVU3AS2U7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* nocapture %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomic unsigned min of %operand with the i64 at %object; %object is dereferenced, so it must not be readnone; %scope is never read
+  switch i32 %order, label %6 [                   ; choose the atomicrmw ordering from %order
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]                                               ; anything else -> seq_cst (%6)
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever atomicrmw ran
+  ret i64 %.0                                     ; atomicrmw yields the value held at %object before the operation
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_max__localPVU3AS2U7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* nocapture %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomic unsigned max of %operand with the i64 at %object; %object is dereferenced, so it must not be readnone; %scope is never read
+  switch i32 %order, label %6 [                   ; choose the atomicrmw ordering from %order
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]                                               ; anything else -> seq_cst (%6)
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever atomicrmw ran
+  ret i64 %.0                                     ; atomicrmw yields the value held at %object before the operation
+
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z24pocl_atomic_store__localPVU3AS2U7_Atomicdd12memory_order12memory_scope(double addrspace(2)* nocapture %object, double %desired, i32 %order, i32 %scope) #0 { ; atomically stores the bit pattern of %desired as an i64 at %object; %scope is never read
+  switch i32 %order, label %5 [                   ; choose the store ordering from %order
+    i32 0, label %.thread                         ; 0 -> monotonic (via %9)
+    i32 1, label %.thread                         ; 1 -> monotonic (via %9)
+    i32 2, label %.thread1                        ; 2 -> release
+  ]                                               ; anything else is examined in %5
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast double %desired to i64
+  %2 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  store atomic volatile i64 %1, i64 addrspace(2)* %2 release, align 8
+  br label %13
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast double %desired to i64
+  %4 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br label %9
+
+; <label>:5                                       ; preds = %0
+  %6 = icmp eq i32 %order, 3                      ; 3 -> monotonic (%9), every remaining value -> seq_cst (%12)
+  %7 = bitcast double %desired to i64
+  %8 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br i1 %6, label %9, label %12
+
+; <label>:9                                       ; preds = %5, %.thread
+  %10 = phi i64 addrspace(2)* [ %4, %.thread ], [ %8, %5 ]
+  %11 = phi i64 [ %3, %.thread ], [ %7, %5 ]
+  store atomic volatile i64 %11, i64 addrspace(2)* %10 monotonic, align 8
+  br label %13
+
+; <label>:12                                      ; preds = %5
+  store atomic volatile i64 %7, i64 addrspace(2)* %8 seq_cst, align 8
+  br label %13
+
+; <label>:13                                      ; preds = %12, %.thread1, %9
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define double @_Z23pocl_atomic_load__localPVU3AS2U7_Atomicd12memory_order12memory_scope(double addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 { ; atomically loads the i64 at %object and returns its bits as a double; %scope is never read
+  switch i32 %order, label %4 [                   ; choose the load ordering from %order
+    i32 0, label %.thread                         ; 0 -> monotonic (via %7)
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread                         ; 2 -> monotonic (via %7)
+  ]                                               ; anything else is examined in %4
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %2 = load atomic volatile i64, i64 addrspace(2)* %1 acquire, align 8
+  br label %12
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br label %7
+
+; <label>:4                                       ; preds = %0
+  %5 = icmp eq i32 %order, 3                      ; 3 -> monotonic (%7), every remaining value -> seq_cst (%10)
+  %6 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br i1 %5, label %7, label %10
+
+; <label>:7                                       ; preds = %4, %.thread
+  %8 = phi i64 addrspace(2)* [ %3, %.thread ], [ %6, %4 ]
+  %9 = load atomic volatile i64, i64 addrspace(2)* %8 monotonic, align 8
+  br label %12
+
+; <label>:10                                      ; preds = %4
+  %11 = load atomic volatile i64, i64 addrspace(2)* %6 seq_cst, align 8
+  br label %12
+
+; <label>:12                                      ; preds = %10, %.thread1, %7
+  %.sroa.0.0 = phi i64 [ %9, %7 ], [ %11, %10 ], [ %2, %.thread1 ]
+  %13 = bitcast i64 %.sroa.0.0 to double          ; reinterpret the loaded bits, no numeric conversion
+  ret double %13
+}
+
+; Function Attrs: nounwind uwtable
+define double @_Z27pocl_atomic_exchange__localPVU3AS2U7_Atomicdd12memory_order12memory_scope(double addrspace(2)* %object, double %desired, i32 %order, i32 %scope) #0 { ; atomically swaps the bits of %desired into the i64 at %object and returns the result of the xchg as a double; %scope is never read
+  switch i32 %order, label %10 [                  ; choose the xchg ordering from %order
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread2                        ; 1 -> acquire
+    i32 2, label %.thread3                        ; 2 -> release
+  ]                                               ; anything else is examined in %10
+
+.thread:                                          ; preds = %0
+  %1 = bitcast double %desired to i64
+  %2 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %3 = atomicrmw volatile xchg i64 addrspace(2)* %2, i64 %1 monotonic
+  br label %18
+
+.thread2:                                         ; preds = %0
+  %4 = bitcast double %desired to i64
+  %5 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %6 = atomicrmw volatile xchg i64 addrspace(2)* %5, i64 %4 acquire
+  br label %18
+
+.thread3:                                         ; preds = %0
+  %7 = bitcast double %desired to i64
+  %8 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %9 = atomicrmw volatile xchg i64 addrspace(2)* %8, i64 %7 release
+  br label %18
+
+; <label>:10                                      ; preds = %0
+  %11 = icmp eq i32 %order, 3                     ; 3 -> acq_rel (%14), every remaining value -> seq_cst (%16)
+  %12 = bitcast double %desired to i64
+  %13 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br i1 %11, label %14, label %16
+
+; <label>:14                                      ; preds = %10
+  %15 = atomicrmw volatile xchg i64 addrspace(2)* %13, i64 %12 acq_rel
+  br label %18
+
+; <label>:16                                      ; preds = %10
+  %17 = atomicrmw volatile xchg i64 addrspace(2)* %13, i64 %12 seq_cst
+  br label %18
+
+; <label>:18                                      ; preds = %16, %14, %.thread3, %.thread2, %.thread
+  %.sroa.0.0 = phi i64 [ %3, %.thread ], [ %17, %16 ], [ %15, %14 ], [ %9, %.thread3 ], [ %6, %.thread2 ]
+  %19 = bitcast i64 %.sroa.0.0 to double          ; reinterpret the exchanged-out bits, no numeric conversion
+  ret double %19
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU3AS2U7_AtomicdPdd12memory_orderS3_12memory_scope(double addrspace(2)* %object, double* nocapture %expected, double %desired, i32 %success, i32 %failure, i32 %scope) #0 { ; strong cmpxchg on the i64 bit patterns; when it fails the observed value is written back to %expected; %scope is never read
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; %success 0/1/2/3/other remapped to 0/2/3/4/5
+  %11 = bitcast double %desired to i64
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ] ; %failure 0/1/2/3/other remapped to 0/2/3/4/5
+  %22 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %23 = bitcast double* %expected to i64*
+  switch i32 %10, label %31 [                     ; success ordering; default (remapped 0) -> monotonic
+    i32 1, label %24                              ; acquire
+    i32 2, label %24                              ; acquire
+    i32 3, label %53                              ; release
+    i32 4, label %26                              ; acq_rel
+    i32 5, label %28                              ; seq_cst
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2                 ; %21 in {1,2} -> acquire failure ordering, else monotonic
+  %25 = load i64, i64* %23, align 8
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2               ; %21 in {1,2} -> acquire failure ordering, else monotonic
+  %27 = load i64, i64* %23, align 8
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [                     ; failure ordering under seq_cst success; default -> monotonic
+    i32 1, label %79                              ; acquire
+    i32 2, label %79                              ; acquire
+    i32 5, label %83                              ; seq_cst
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30                                      ; true iff the cmpxchg that ran reported success
+
+; <label>:31                                      ; preds = %20
+  %32 = load i64, i64* %23, align 8
+  %33 = cmpxchg volatile i64 addrspace(2)* %22, i64 %32, i64 %11 monotonic monotonic
+  %34 = extractvalue { i64, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i64, i1 } %33, 0
+  store i64 %36, i64* %23, align 8
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg volatile i64 addrspace(2)* %22, i64 %25, i64 %11 acquire monotonic
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg volatile i64 addrspace(2)* %22, i64 %25, i64 %11 acquire acquire
+  %44 = extractvalue { i64, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i64, i1 } %40, 0
+  store i64 %46, i64* %23, align 8
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i64, i1 } %43, 0
+  store i64 %50, i64* %23, align 8
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i64, i64* %23, align 8
+  %55 = cmpxchg volatile i64 addrspace(2)* %22, i64 %54, i64 %11 release monotonic
+  %56 = extractvalue { i64, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i64, i1 } %55, 0
+  store i64 %58, i64* %23, align 8
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg volatile i64 addrspace(2)* %22, i64 %27, i64 %11 acq_rel monotonic
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg volatile i64 addrspace(2)* %22, i64 %27, i64 %11 acq_rel acquire
+  %66 = extractvalue { i64, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i64, i1 } %62, 0
+  store i64 %68, i64* %23, align 8
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i64, i1 } %65, 0
+  store i64 %72, i64* %23, align 8
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i64, i64* %23, align 8
+  %77 = cmpxchg volatile i64 addrspace(2)* %22, i64 %76, i64 %11 seq_cst monotonic
+  %78 = extractvalue { i64, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i64, i64* %23, align 8
+  %81 = cmpxchg volatile i64 addrspace(2)* %22, i64 %80, i64 %11 seq_cst acquire
+  %82 = extractvalue { i64, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i64, i64* %23, align 8
+  %85 = cmpxchg volatile i64 addrspace(2)* %22, i64 %84, i64 %11 seq_cst seq_cst
+  %86 = extractvalue { i64, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i64, i1 } %77, 0
+  store i64 %88, i64* %23, align 8
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i64, i1 } %81, 0
+  store i64 %92, i64* %23, align 8
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i64, i1 } %85, 0
+  store i64 %96, i64* %23, align 8
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU3AS2U7_AtomicdPdd12memory_orderS3_12memory_scope(double addrspace(2)* %object, double* nocapture %expected, double %desired, i32 %success, i32 %failure, i32 %scope) #0 { ; weak cmpxchg on the i64 bit patterns; when it fails the observed value is written back to %expected; %scope is never read
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; %success 0/1/2/3/other remapped to 0/2/3/4/5
+  %11 = bitcast double %desired to i64
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ] ; %failure 0/1/2/3/other remapped to 0/2/3/4/5
+  %22 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %23 = bitcast double* %expected to i64*
+  switch i32 %10, label %31 [                     ; success ordering; default (remapped 0) -> monotonic
+    i32 1, label %24                              ; acquire
+    i32 2, label %24                              ; acquire
+    i32 3, label %53                              ; release
+    i32 4, label %26                              ; acq_rel
+    i32 5, label %28                              ; seq_cst
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2                 ; %21 in {1,2} -> acquire failure ordering, else monotonic
+  %25 = load i64, i64* %23, align 8
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2               ; %21 in {1,2} -> acquire failure ordering, else monotonic
+  %27 = load i64, i64* %23, align 8
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [                     ; failure ordering under seq_cst success; default -> monotonic
+    i32 1, label %79                              ; acquire
+    i32 2, label %79                              ; acquire
+    i32 5, label %83                              ; seq_cst
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30                                      ; true iff the cmpxchg that ran reported success
+
+; <label>:31                                      ; preds = %20
+  %32 = load i64, i64* %23, align 8
+  %33 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %32, i64 %11 monotonic monotonic
+  %34 = extractvalue { i64, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i64, i1 } %33, 0
+  store i64 %36, i64* %23, align 8
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %25, i64 %11 acquire monotonic
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %25, i64 %11 acquire acquire
+  %44 = extractvalue { i64, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i64, i1 } %40, 0
+  store i64 %46, i64* %23, align 8
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i64, i1 } %43, 0
+  store i64 %50, i64* %23, align 8
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i64, i64* %23, align 8
+  %55 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %54, i64 %11 release monotonic
+  %56 = extractvalue { i64, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i64, i1 } %55, 0
+  store i64 %58, i64* %23, align 8
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %27, i64 %11 acq_rel monotonic
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %27, i64 %11 acq_rel acquire
+  %66 = extractvalue { i64, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i64, i1 } %62, 0
+  store i64 %68, i64* %23, align 8
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i64, i1 } %65, 0
+  store i64 %72, i64* %23, align 8
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i64, i64* %23, align 8
+  %77 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %76, i64 %11 seq_cst monotonic
+  %78 = extractvalue { i64, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i64, i64* %23, align 8
+  %81 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %80, i64 %11 seq_cst acquire
+  %82 = extractvalue { i64, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i64, i64* %23, align 8
+  %85 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %84, i64 %11 seq_cst seq_cst
+  %86 = extractvalue { i64, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i64, i1 } %77, 0
+  store i64 %88, i64* %23, align 8
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i64, i1 } %81, 0
+  store i64 %92, i64* %23, align 8
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i64, i1 } %85, 0
+  store i64 %96, i64* %23, align 8
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"Franz clang version 3.7.0 (https://github.com/llvm-mirror/clang.git 40b68b4c02b9d9e1e4138815747adf5589496240) (https://github.com/HSAFoundation/HLC-HSAIL-Development-LLVM.git 183de0bdbefa5664920942fa418d1efefb87faeb) (based on LLVM 3.7.0svn)"}
diff --git a/lib/kernel/hsail64/tgamma.cl b/lib/kernel/hsail64/tgamma.cl
new file mode 100644
index 0000000..930ba45
--- /dev/null
+++ b/lib/kernel/hsail64/tgamma.cl
@@ -0,0 +1,94 @@
+/* OpenCL built-in library: tgamma()
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "hsail_templates.h"
+
+#define M_SQRT_PI 1.7724538509055159
+#define M_SQRT_PI_F 1.7724538509055159f
+
+double _cl_builtin_tgamma(double g)
+{
+  /* Lanczos approximation of Gamma(x).  For g < 0.5 the series is evaluated
+     at x = 1 - g and the result is mapped back with the reflection formula
+     Gamma(g) = pi / (sin(pi * g) * Gamma(1 - g)).  */
+  double x = (g < 0.5) ? (1.0 - g) : g;
+  double a = 0.99999999999980993;
+  a += 676.5203681218851 / x;
+  x += 1.0;
+  a += -1259.1392167224028 / x;
+  x += 1.0;
+  a += 771.32342877765313 / x;
+  x += 1.0;
+  a += -176.61502916214059 / x;
+  x += 1.0;
+  a += 12.507343278686905 / x;
+  x += 1.0;
+  a += -0.13857109526572012 / x;
+  x += 1.0;
+  a += 9.9843695780195716e-6 / x;
+  x += 1.0;
+  a += 1.5056327351493116e-7 / x;
+  /* x is now 7 above its starting value, hence the 0.5 and 7.5 offsets.  */
+  double t = x - 0.5;
+  double res = M_SQRT_PI * M_SQRT2 * pow(t, (x - 7.5)) * exp(-t) * a;
+  /* Reflect with g itself: x is 8 - g here, so sin(pi * x) == -sin(pi * g).  */
+  if (g < 0.5)
+    return (M_PI / (sin(M_PI * g) * res));
+  else
+    return ((fabs(g) > 40.0f) ? INFINITY : res);  // TODO proper range
+}
+
+float _cl_builtin_tgammaf(float g)
+{
+  /* Single-precision twin of _cl_builtin_tgamma: Lanczos approximation,
+     evaluated at x = 1 - g for g < 0.5 and mapped back with the reflection
+     formula Gamma(g) = pi / (sin(pi * g) * Gamma(1 - g)).  Every coefficient
+     carries an 'f' suffix so the series is not promoted to double.  */
+  float x = (g < 0.5f) ? (1.0f - g) : g;
+  float a = 0.99999999999980993f;
+  a += 676.5203681218851f / x;
+  x += 1.0f;
+  a += -1259.1392167224028f / x;
+  x += 1.0f;
+  a += 771.32342877765313f / x;
+  x += 1.0f;
+  a += -176.61502916214059f / x;
+  x += 1.0f;
+  a += 12.507343278686905f / x;
+  x += 1.0f;
+  a += -0.13857109526572012f / x;
+  x += 1.0f;
+  a += 9.9843695780195716e-6f / x;
+  x += 1.0f;
+  a += 1.5056327351493116e-7f / x;
+  /* x is now 7 above its starting value, hence the 0.5 and 7.5 offsets.  */
+  float t = x - 0.5f;
+  float  res = M_SQRT_PI_F * M_SQRT2_F * pow(t, (x - 7.5f)) * exp(-t) * a;
+  /* Reflect with g itself: x is 8 - g here, so sin(pi * x) == -sin(pi * g).  */
+  if (g < 0.5f)
+    return (M_PI_F / (sin(M_PI_F * g) * res));
+  else
+    return ((fabs(g) > 25.0f) ? INFINITY : res);
+}
+
+IMPLEMENT_EXPR_ALL(tgamma, V_V, _cl_builtin_tgammaf(a), _cl_builtin_tgamma(a))  /* presumably emits the tgamma() overloads from the float and double helpers -- macro is defined outside this file, confirm in hsail_templates.h */
diff --git a/lib/kernel/printf.c b/lib/kernel/printf.c
index 710add5..4d99e48 100644
--- a/lib/kernel/printf.c
+++ b/lib/kernel/printf.c
@@ -22,17 +22,15 @@
    THE SOFTWARE.
 */
 
-// Make the C99 printf visible again
-#undef printf
-
 #include <limits.h>
 #include <stdarg.h>
 #include <stdbool.h>
 
 // We implement the OpenCL printf by calling the C99 printf. This is
 // not very efficient, but is easy to implement.
-int printf(const char* restrict fmt, ...);
-int snprintf(char* restrict str, size_t size, const char* restrict fmt, ...);
+#define OCL_C_AS __attribute__((address_space(0)))
+int printf(OCL_C_AS const char* restrict fmt, ...);
+int snprintf(OCL_C_AS char* restrict str, size_t size, OCL_C_AS const char* restrict fmt, ...);
 
 // For debugging
 // Use as: DEBUG_PRINTF((fmt, args...)) -- note double parentheses!
@@ -59,7 +57,7 @@ typedef struct {
 
 #define DEFINE_PRINT_INTS(WIDTH)                                        \
   void _cl_print_ints_##WIDTH(flags_t flags, int field_width, int precision, \
-                              char conv, const void* vals, int n)       \
+                              char conv, OCL_C_AS const void* vals, int n)       \
   {                                                                     \
     DEBUG_PRINTF(("[printf:ints:n=%df]\n", n));                         \
     char outfmt[1000];                                                  \
@@ -78,7 +76,7 @@ typedef struct {
     for (int d=0; d<n; ++d) {                                           \
       DEBUG_PRINTF(("[printf:ints:d=%d]\n", d));                        \
       if (d != 0) printf(",");                                          \
-      printf(outfmt, ((const WIDTH*)vals)[d]);                          \
+      printf(outfmt, ((OCL_C_AS const WIDTH*)vals)[d]);                          \
     }                                                                   \
     DEBUG_PRINTF(("[printf:ints:done]\n"));                             \
   }
@@ -110,7 +108,7 @@ float __attribute__((overloadable)) vload_half(size_t offset, const half *p);
 
 #define DEFINE_PRINT_FLOATS(WIDTH)                                      \
   void _cl_print_floats_##WIDTH(flags_t flags, int field_width, int precision, \
-                                char conv, const void* vals, int n)     \
+                                char conv, OCL_C_AS const void* vals, int n)     \
   {                                                                     \
     DEBUG_PRINTF(("[printf:floats:n=%dd]\n", n));                       \
     char outfmt[1000];                                                  \
@@ -129,7 +127,7 @@ float __attribute__((overloadable)) vload_half(size_t offset, const half *p);
     for (int d=0; d<n; ++d) {                                           \
       DEBUG_PRINTF(("[printf:floats:d=%d]\n", d));                      \
       if (d != 0) printf(",");                                          \
-      printf(outfmt, FLOAT_GET_##WIDTH((const WIDTH*)vals+d));          \
+      printf(outfmt, FLOAT_GET_##WIDTH((OCL_C_AS const WIDTH*)vals+d));          \
     }                                                                   \
     DEBUG_PRINTF(("[printf:floats:done]\n"));                           \
   }
@@ -161,7 +159,7 @@ void _cl_print_char(flags_t flags, int field_width, int val)
   DEBUG_PRINTF(("[printf:char:done]\n"));
 }
 
-void _cl_print_string(flags_t flags, int field_width, const char* val)
+void _cl_print_string(flags_t flags, int field_width, OCL_C_AS const char* val)
 {
   DEBUG_PRINTF(("[printf:char]\n"));
   char outfmt[1000];
@@ -174,7 +172,7 @@ void _cl_print_string(flags_t flags, int field_width, const char* val)
   DEBUG_PRINTF(("[printf:char:done]\n"));
 }
 
-void _cl_print_pointer(flags_t flags, int field_width, const void* val)
+void _cl_print_pointer(flags_t flags, int field_width, OCL_C_AS const void* val)
 {
   DEBUG_PRINTF(("[printf:char]\n"));
   char outfmt[1000];
@@ -417,7 +415,7 @@ int _cl_printf(const OCL_CONSTANT_AS char* restrict format, ...)
           if (precision != -1) goto error;
           if (vector_length != 1) goto error;
           if (length != 0) goto error;
-          const char* val = va_arg(ap, const char*);
+          OCL_C_AS const char* val = va_arg(ap, OCL_C_AS const char*);
           _cl_print_string(flags, field_width, val);
           break;
         }
@@ -428,7 +426,7 @@ int _cl_printf(const OCL_CONSTANT_AS char* restrict format, ...)
           if (precision != -1) goto error;
           if (vector_length != 1) goto error;
           if (length != 0) goto error;
-          const void* val = va_arg(ap, const void*);
+          OCL_C_AS const void* val = va_arg(ap, OCL_C_AS const void*);
           _cl_print_pointer(flags, field_width, val);
           break;
         }
diff --git a/lib/kernel/printf_constant.c b/lib/kernel/printf_constant.c
index 44e666c..bbb2201 100644
--- a/lib/kernel/printf_constant.c
+++ b/lib/kernel/printf_constant.c
@@ -28,7 +28,7 @@
  * the private space (0) and calls a system vprintf.
  */
 
-
+#include <stddef.h>
 #include <stdarg.h>
 
 #ifdef __TCE_V1__
@@ -45,11 +45,16 @@
 
 #endif
 
-#include <stdio.h>
 
 #define OCL_CONSTANT_AS __attribute__((address_space(3)))
-int vprintf(const char *, __builtin_va_list);
-int fflush(FILE *stream);
+
+/* AS 0 is required for the prototypes, otherwise they get assigned
+ * the generic AS (#4) */
+#define OCL_C_AS __attribute__((address_space(0)))
+int vprintf(OCL_C_AS const char *, __builtin_va_list);
+int fflush(OCL_C_AS void *stream);
+
+
 
 #undef printf
 #define MAX_FORMAT_STR_SIZE 2048
diff --git a/lib/kernel/rules.mk b/lib/kernel/rules.mk
index c4ffc00..27bc9d4 100644
--- a/lib/kernel/rules.mk
+++ b/lib/kernel/rules.mk
@@ -29,6 +29,7 @@
 # CLANG_FLAGS
 # LLC_FLAGS
 # LD_FLAGS
+# DEVICE_CL_FLAGS
 
 KERNEL_BC=kernel-${KERNEL_TARGET}.bc
 
@@ -41,14 +42,14 @@ all: ${KERNEL_BC}
 # and LKERNEL_SRCS_EXTRA, which adds extra files to the source list.
 LKERNEL_SRCS =								\
 	$(filter-out ${LKERNEL_SRCS_EXCLUDE}, ${LKERNEL_SRCS_DEFAULT})	\
-	${LKERNEL_SRCS_EXTRA}
+	${LKERNEL_SRCS_EXTRA} ${LKERNEL_SRCS_EXTRA2}
 
 OBJ = $(LKERNEL_SRCS:%=%.bc)
 
-vpath %.c  @top_srcdir@/lib/kernel
-vpath %.cc @top_srcdir@/lib/kernel
-vpath %.cl @top_srcdir@/lib/kernel
-vpath %.ll @top_srcdir@/lib/kernel
+vpath %.c @srcdir@ @top_srcdir@/lib/kernel
+vpath %.cc @srcdir@ @top_srcdir@/lib/kernel
+vpath %.cl @srcdir@ @top_srcdir@/lib/kernel
+vpath %.ll @srcdir@ @top_srcdir@/lib/kernel
 
 
 
@@ -68,15 +69,15 @@ _kernel.h.pch: @top_builddir@/include/${TARGET_DIR}/types.h @top_srcdir@/include
 
 # Rules to compile the different kernel library source file types into
 # LLVM bitcode
-%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
+%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} ${CLFLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $< 
-%.cc.bc: %.cc ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
+	@CLANG@ ${CLANG_FLAGS} ${CLFLAGS} ${DEVICE_CL_FLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $<
+%.cc.bc: %.cc  ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANGXX@ ${CLANG_FLAGS} ${CLANGXX_FLAGS} -c -o $@ $< -include ${abs_top_srcdir}/include/pocl_features.h
-%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
+	@CLANGXX@ ${CLANG_FLAGS} ${CLANGXX_FLAGS} ${DEVICE_CL_FLAGS} -c -o $@ $<
+%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} -x cl ${CLFLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
+	@CLANG@ ${CLANG_FLAGS} -x cl ${CLFLAGS} ${DEVICE_CL_FLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
 %.ll.bc: %.ll
 	mkdir -p ${dir $@}
 	@LLVM_AS@ -o $@ $<
diff --git a/lib/kernel/sources-vml.mk b/lib/kernel/sources-vml.mk
index d92fcea..7865540 100644
--- a/lib/kernel/sources-vml.mk
+++ b/lib/kernel/sources-vml.mk
@@ -100,6 +100,7 @@ LKERNEL_SRCS_EXCLUDE =				\
         native_recip.cl                         \
         native_rsqrt.cl                         \
         native_sin.cl                           \
+        native_sqrt.cl                          \
         native_tan.cl                           \
 	normalize.cl				\
 	pow.cl					\
@@ -221,6 +222,7 @@ LKERNEL_SRCS_EXTRA = $(addprefix vecmathlib-pocl/,	\
 	native_recip.cl					\
 	native_rsqrt.cl					\
 	native_sin.cl					\
+	native_sqrt.cl  				\
 	native_tan.cl					\
 	normalize.cl					\
 	pow.cc						\
diff --git a/lib/kernel/sources.mk b/lib/kernel/sources.mk
index a1b73b4..a042c93 100644
--- a/lib/kernel/sources.mk
+++ b/lib/kernel/sources.mk
@@ -147,6 +147,7 @@ LKERNEL_SRCS_DEFAULT =				\
 	pow.cl					\
 	pown.cl					\
 	powr.cl					\
+	printf.c                                \
 	radians.cl				\
 	read_image.cl				\
 	recip.cl				\
@@ -182,8 +183,4 @@ LKERNEL_SRCS_DEFAULT =				\
 	wait_group_events.cl			\
 	write_image.cl
 
-if NEW_PRINTF_WORKS
-LKERNEL_SRCS_DEFAULT += printf.c
-endif
-
 # vim: set noexpandtab ts=8:
diff --git a/lib/kernel/svm_atomics.cl b/lib/kernel/svm_atomics.cl
new file mode 100644
index 0000000..4480920
--- /dev/null
+++ b/lib/kernel/svm_atomics.cl
@@ -0,0 +1,424 @@
+/* OpenCL built-in library: OpenCL 2.0 Atomics (C11 subset)
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   These implementations merely lower the call to a device-specific
+   call (prefixed with "pocl_atomic_")
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+
+
+#ifndef _SVM_ATOMICS_H
+#include "svm_atomics.h"
+#define _SVM_ATOMICS_H
+#endif
+
+/* This file includes itself. While Q is undefined it re-includes itself once per
+   address space; the guard above keeps nested passes from re-including svm_atomics.h. */
+#if !defined(Q)
+/* QUAL(f) pastes the address-space keyword onto f (f__global / f__local). */
+#  define Q __global
+#  define QUAL(f) f ## __global
+#  include "svm_atomics.cl"
+#  undef Q
+#  undef QUAL
+
+#  define Q __local
+#  define QUAL(f) f ## __local
+#  include "svm_atomics.cl"
+#  undef Q
+#  undef QUAL
+
+#elif !defined(ATOMIC_TYPE)
+
+bool _CL_OVERLOADABLE atomic_flag_test_and_set ( volatile Q atomic_flag  *object )
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_flag_test_and_set_explicit(object, memory_order_seq_cst);
+}
+
+bool _CL_OVERLOADABLE atomic_flag_test_and_set_explicit ( volatile Q atomic_flag  *object ,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_flag_test_and_set_explicit(object, order, memory_scope_device);
+}
+
+bool _CL_OVERLOADABLE atomic_flag_test_and_set_explicit ( volatile Q atomic_flag  *object ,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_flag_test_and_set for address space Q. */
+  return QUAL(pocl_atomic_flag_test_and_set)(object, order, scope);
+}
+
+
+
+void _CL_OVERLOADABLE atomic_flag_clear ( volatile Q atomic_flag  *object )
+{  /* Default order: forwards with memory_order_seq_cst. */
+  atomic_flag_clear_explicit(object, memory_order_seq_cst);
+}
+
+void _CL_OVERLOADABLE atomic_flag_clear_explicit ( volatile Q atomic_flag  *object ,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  atomic_flag_clear_explicit(object, order, memory_scope_device);
+}
+
+void _CL_OVERLOADABLE atomic_flag_clear_explicit ( volatile Q atomic_flag  *object ,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_flag_clear for address space Q. */
+  QUAL(pocl_atomic_flag_clear)(object, order, scope);  /* void call: plain statement, no `return` */
+}
+
+
+
+#  define ATOMIC_TYPE atomic_int
+#  define NONATOMIC_TYPE int
+#  include "svm_atomics.cl"
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#  define ATOMIC_TYPE atomic_uint
+#  define NONATOMIC_TYPE uint
+#  include "svm_atomics.cl"
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#  define ATOMIC_TYPE atomic_float
+#  define NONATOMIC_TYPE float
+#  define NON_INTEGER
+#  include "svm_atomics.cl"
+#  undef NON_INTEGER
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+
+#  define ATOMIC_TYPE atomic_long
+#  define NONATOMIC_TYPE long
+#  include "svm_atomics.cl"
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#  define ATOMIC_TYPE atomic_ulong
+#  define NONATOMIC_TYPE ulong
+#  include "svm_atomics.cl"
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#endif
+
+#ifdef cl_khr_fp64
+
+#  define ATOMIC_TYPE atomic_double
+#  define NONATOMIC_TYPE double
+#  define NON_INTEGER
+#  include "svm_atomics.cl"
+#  undef NON_INTEGER
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#endif
+
+#else
+
+void _CL_OVERLOADABLE atomic_store ( volatile Q ATOMIC_TYPE  *object,
+                    NONATOMIC_TYPE  desired)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  atomic_store_explicit(object, desired, memory_order_seq_cst);
+}
+
+void _CL_OVERLOADABLE atomic_store_explicit (  volatile Q ATOMIC_TYPE  *object,
+                              NONATOMIC_TYPE  desired,
+                              memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  atomic_store_explicit(object, desired, order, memory_scope_device);
+}
+
+void _CL_OVERLOADABLE atomic_store_explicit (  volatile Q ATOMIC_TYPE  *object,
+                              NONATOMIC_TYPE  desired,
+                              memory_order order,
+                              memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_store for address space Q. */
+  QUAL(pocl_atomic_store)(object, desired, order, scope);
+}
+
+void _CL_OVERLOADABLE atomic_init (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE value)
+{  /* Implemented as a store: forwards to atomic_store_explicit with memory_order_seq_cst. */
+  atomic_store_explicit(object, value, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_load (  volatile Q ATOMIC_TYPE  *object)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_load_explicit(object, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_load_explicit ( volatile Q ATOMIC_TYPE  *object,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_load_explicit(object, order, memory_scope_device);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_load_explicit ( volatile Q ATOMIC_TYPE  *object,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_load for address space Q. */
+  return QUAL(pocl_atomic_load)(object, order, scope);
+}
+
+
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_exchange ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  desired)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_exchange_explicit(object, desired, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_exchange_explicit (volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  desired,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_exchange_explicit(object, desired, order, memory_scope_device);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_exchange_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  desired,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_exchange for address space Q. */
+  return QUAL(pocl_atomic_exchange)(object, desired, order, scope);
+}
+
+
+bool _CL_OVERLOADABLE atomic_compare_exchange_strong ( volatile Q ATOMIC_TYPE  *object,
+  private NONATOMIC_TYPE  *expected,
+  NONATOMIC_TYPE  desired)
+{  /* Default orders: forwards with memory_order_seq_cst for both success and failure. */
+  return atomic_compare_exchange_strong_explicit(
+        object, expected, desired, memory_order_seq_cst, memory_order_seq_cst);
+}
+
+bool _CL_OVERLOADABLE atomic_compare_exchange_strong_explicit ( volatile Q ATOMIC_TYPE  *object,
+  private NONATOMIC_TYPE  *expected,
+  NONATOMIC_TYPE  desired,
+  memory_order success,
+  memory_order failure)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_compare_exchange_strong_explicit(
+        object, expected, desired, success, failure, memory_scope_device);
+}
+
+bool _CL_OVERLOADABLE atomic_compare_exchange_strong_explicit ( volatile Q ATOMIC_TYPE  *object,
+  private NONATOMIC_TYPE  *expected,
+  NONATOMIC_TYPE  desired,
+  memory_order success,
+  memory_order failure,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_compare_exchange_strong for address space Q. */
+  return QUAL(pocl_atomic_compare_exchange_strong)(object, expected, desired, success, failure, scope);
+}
+
+bool _CL_OVERLOADABLE atomic_compare_exchange_weak ( volatile Q ATOMIC_TYPE  *object,
+  private NONATOMIC_TYPE  *expected,
+  NONATOMIC_TYPE  desired)
+{  /* Default orders: forwards with memory_order_seq_cst for both success and failure. */
+  return atomic_compare_exchange_weak_explicit(
+        object, expected, desired, memory_order_seq_cst, memory_order_seq_cst);
+}
+
+bool _CL_OVERLOADABLE atomic_compare_exchange_weak_explicit ( volatile Q ATOMIC_TYPE  *object,
+  private NONATOMIC_TYPE  *expected,
+  NONATOMIC_TYPE  desired,
+  memory_order success,
+  memory_order failure)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_compare_exchange_weak_explicit(
+        object, expected, desired, success, failure, memory_scope_device);
+}
+
+bool _CL_OVERLOADABLE atomic_compare_exchange_weak_explicit ( volatile Q ATOMIC_TYPE  *object,
+  private NONATOMIC_TYPE  *expected,
+  NONATOMIC_TYPE  desired,
+  memory_order success,
+  memory_order failure,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_compare_exchange_weak for address space Q. */
+  return QUAL(pocl_atomic_compare_exchange_weak)(object, expected, desired, success, failure, scope);
+}
+
+
+#ifndef NON_INTEGER
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_add ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_fetch_add_explicit(object, operand, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_add_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_fetch_add_explicit(object, operand, order, memory_scope_device);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_add_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_fetch_add for address space Q. */
+  return QUAL(pocl_atomic_fetch_add)(object, operand, order, scope);
+}
+
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_sub ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_fetch_sub_explicit(object, operand, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_sub_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_fetch_sub_explicit(object, operand, order, memory_scope_device);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_sub_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_fetch_sub for address space Q. */
+  return QUAL(pocl_atomic_fetch_sub)(object, operand, order, scope);
+}
+
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_or ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_fetch_or_explicit(object, operand, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_or_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_fetch_or_explicit(object, operand, order, memory_scope_device);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_or_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_fetch_or for address space Q. */
+  return QUAL(pocl_atomic_fetch_or)(object, operand, order, scope);
+}
+
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_xor ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_fetch_xor_explicit(object, operand, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_xor_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_fetch_xor_explicit(object, operand, order, memory_scope_device);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_xor_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_fetch_xor for address space Q. */
+  return QUAL(pocl_atomic_fetch_xor)(object, operand, order, scope);
+}
+
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_and ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_fetch_and_explicit(object, operand, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_and_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_fetch_and_explicit(object, operand, order, memory_scope_device);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_and_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_fetch_and for address space Q. */
+  return QUAL(pocl_atomic_fetch_and)(object, operand, order, scope);
+}
+
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_min ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_fetch_min_explicit(object, operand, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_min_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_fetch_min_explicit(object, operand, order, memory_scope_device);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_min_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_fetch_min for address space Q. */
+  return QUAL(pocl_atomic_fetch_min)(object, operand, order, scope);
+}
+
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_max ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand)
+{  /* Default order: forwards with memory_order_seq_cst. */
+  return atomic_fetch_max_explicit(object, operand, memory_order_seq_cst);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_max_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order)
+{  /* Default scope: forwards with memory_scope_device. */
+  return atomic_fetch_max_explicit(object, operand, order, memory_scope_device);
+}
+
+NONATOMIC_TYPE _CL_OVERLOADABLE atomic_fetch_max_explicit ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope)
+{  /* Lowers to the device-specific pocl_atomic_fetch_max for address space Q. */
+  return QUAL(pocl_atomic_fetch_max)(object, operand, order, scope);
+}
+
+#endif
+
+#endif
diff --git a/lib/kernel/svm_atomics.h b/lib/kernel/svm_atomics.h
new file mode 100644
index 0000000..e820988
--- /dev/null
+++ b/lib/kernel/svm_atomics.h
@@ -0,0 +1,169 @@
+/* OpenCL built-in library: OpenCL 2.0 Atomics (C11 subset) prototypes
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+
+#if !defined(Q)
+
+#  define Q __global
+#  define QUAL(f) f ## __global
+#  include "svm_atomics.h"
+#  undef Q
+#  undef QUAL
+
+#  define Q __local
+#  define QUAL(f) f ## __local
+#  include "svm_atomics.h"
+#  undef Q
+#  undef QUAL
+
+#elif !defined(ATOMIC_TYPE)
+
+bool _CL_OVERLOADABLE QUAL(pocl_atomic_flag_test_and_set) ( volatile Q atomic_flag  *object ,
+  memory_order order,
+  memory_scope scope);
+
+void _CL_OVERLOADABLE QUAL(pocl_atomic_flag_clear) ( volatile Q atomic_flag  *object ,
+  memory_order order,
+  memory_scope scope);
+
+#  define ATOMIC_TYPE atomic_int
+#  define NONATOMIC_TYPE int
+#  include "svm_atomics.h"
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#  define ATOMIC_TYPE atomic_uint
+#  define NONATOMIC_TYPE uint
+#  include "svm_atomics.h"
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#  define ATOMIC_TYPE atomic_float
+#  define NONATOMIC_TYPE float
+#  define NON_INTEGER
+#  include "svm_atomics.h"
+#  undef NON_INTEGER
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+#  define ATOMIC_TYPE atomic_long
+#  define NONATOMIC_TYPE long
+#  include "svm_atomics.h"
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#  define ATOMIC_TYPE atomic_ulong
+#  define NONATOMIC_TYPE ulong
+#  include "svm_atomics.h"
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#endif
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+#  define ATOMIC_TYPE atomic_double
+#  define NONATOMIC_TYPE double
+#  define NON_INTEGER
+#  include "svm_atomics.h"
+#  undef NON_INTEGER
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#endif
+
+#else
+
+_CL_OVERLOADABLE void QUAL(pocl_atomic_store)( volatile Q ATOMIC_TYPE  *object,
+                              NONATOMIC_TYPE  desired,
+                              memory_order order,
+                              memory_scope scope);
+
+_CL_OVERLOADABLE NONATOMIC_TYPE QUAL(pocl_atomic_load) ( volatile Q ATOMIC_TYPE  *object,
+                                        memory_order order,
+                                        memory_scope scope);
+
+_CL_OVERLOADABLE NONATOMIC_TYPE QUAL(pocl_atomic_exchange) ( volatile Q ATOMIC_TYPE  *object,
+                                            NONATOMIC_TYPE  desired,
+                                            memory_order order,
+                                            memory_scope scope);
+
+bool _CL_OVERLOADABLE QUAL(pocl_atomic_compare_exchange_strong) ( volatile Q ATOMIC_TYPE  *object,
+  private NONATOMIC_TYPE  *expected,
+  NONATOMIC_TYPE  desired,
+  memory_order success,
+  memory_order failure,
+  memory_scope scope);
+
+bool _CL_OVERLOADABLE QUAL(pocl_atomic_compare_exchange_weak) ( volatile Q ATOMIC_TYPE  *object,
+  private NONATOMIC_TYPE  *expected,
+  NONATOMIC_TYPE  desired,
+  memory_order success,
+  memory_order failure,
+  memory_scope scope);
+
+#ifndef NON_INTEGER
+
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_add) ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope);
+
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_sub) ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope);
+
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_or) ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope);
+
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_xor) ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope);
+
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_and) ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope);
+
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_min) ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope);
+
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_max) ( volatile Q ATOMIC_TYPE  *object,
+  NONATOMIC_TYPE  operand,
+  memory_order order,
+  memory_scope scope);
+
+#endif
+
+#endif
diff --git a/lib/kernel/svm_atomics_host.cl b/lib/kernel/svm_atomics_host.cl
new file mode 100644
index 0000000..5c9f2ce
--- /dev/null
+++ b/lib/kernel/svm_atomics_host.cl
@@ -0,0 +1,253 @@
+/* OpenCL built-in library: OpenCL 2.0 Atomics (C11 subset) implementation for host device
+
+   Copyright (c) 2015 Michal Babej / Tampere University of Technology
+
+   This relies on Clang's C11 atomic builtins.
+
+   Note: for some architectures, the host-specific llvm bitcode is used instead
+   of this file (since Clang doesn't have proper builtins for 64bit min/max atomics,
+   yet LLVM's atomicrmw can do them; using this file gives only limited min/max
+   atomics).
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#ifndef _SVM_ATOMICS_H
+#include "svm_atomics.h"
+
+/* Translate an OpenCL memory_order value into the matching Clang __ATOMIC_*
+   constant; anything other than relaxed/acquire/release/acq_rel maps to
+   __ATOMIC_SEQ_CST.  The argument is parenthesized so expressions expand safely. */
+#define CONV_ORDER(mo) (((mo)==memory_order_relaxed) ? __ATOMIC_RELAXED : \
+                       (((mo)==memory_order_acquire) ? __ATOMIC_ACQUIRE : \
+                       (((mo)==memory_order_release) ? __ATOMIC_RELEASE : \
+                       (((mo)==memory_order_acq_rel) ? __ATOMIC_ACQ_REL : \
+                                                       __ATOMIC_SEQ_CST ))))
+#define _SVM_ATOMICS_H
+#endif
+
+
+#if !defined(Q)  /* outermost inclusion: no address-space qualifier has been chosen yet */
+
+#  define Q __global
+#  define QUAL(f) f ## __global  /* pastes the address-space suffix onto each function name */
+#  include "svm_atomics_host.cl"  /* re-include with Q set, so the #elif / #else branches below are taken */
+#  undef Q
+#  undef QUAL
+
+#  define Q __local
+#  define QUAL(f) f ## __local
+#  include "svm_atomics_host.cl"  /* same expansion again, this time for __local */
+#  undef Q
+#  undef QUAL
+
+#elif !defined(ATOMIC_TYPE)
+
+/* Exchanges 1 into the flag word with the ordering translated from `order` and returns the previous value as a bool; `scope` is unused. */
+bool _CL_OVERLOADABLE QUAL(pocl_atomic_flag_test_and_set) (volatile Q atomic_int *object, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  return __c11_atomic_exchange(object, 1, c11_order);
+}
+
+/* Stores 0 into the flag word with the ordering translated from `order`; `scope` is unused. */
+void _CL_OVERLOADABLE QUAL(pocl_atomic_flag_clear) (volatile Q atomic_int *object, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  __c11_atomic_store(object, 0, c11_order);
+}
+
+#  define ATOMIC_TYPE atomic_int
+#  define NONATOMIC_TYPE int
+#  define IS_INT  /* makes fetch_min/fetch_max pick __sync_fetch_and_min/__sync_fetch_and_max */
+#  include "svm_atomics_host.cl"  /* re-include: with Q and ATOMIC_TYPE set, the final #else branch emits the typed functions */
+#  undef IS_INT
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#  define ATOMIC_TYPE atomic_uint
+#  define NONATOMIC_TYPE uint
+#  define IS_UINT  /* makes fetch_min/fetch_max pick __sync_fetch_and_umin/__sync_fetch_and_umax */
+#  include "svm_atomics_host.cl"
+#  undef IS_UINT
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#  define ATOMIC_TYPE atomic_float
+#  define NONATOMIC_TYPE float
+#  define NON_INTEGER  /* excludes the fetch_* read-modify-write functions for this type */
+#  include "svm_atomics_host.cl"
+#  undef NON_INTEGER
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)  /* 64-bit integer variants need both extension macros */
+
+#  define ATOMIC_TYPE atomic_long
+#  define NONATOMIC_TYPE long
+#  include "svm_atomics_host.cl"  /* neither IS_INT nor IS_UINT is set: fetch_min/fetch_max use their generic #else branch */
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#  define ATOMIC_TYPE atomic_ulong
+#  define NONATOMIC_TYPE ulong
+#  include "svm_atomics_host.cl"  /* likewise: generic #else branch for fetch_min/fetch_max */
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#endif  /* 64-bit integer atomics */
+
+#ifdef cl_khr_fp64
+
+#  define ATOMIC_TYPE atomic_double
+#  define NONATOMIC_TYPE double
+#  define NON_INTEGER  /* excludes the fetch_* read-modify-write functions for this type */
+#  include "svm_atomics_host.cl"
+#  undef NON_INTEGER
+#  undef ATOMIC_TYPE
+#  undef NONATOMIC_TYPE
+
+#endif  /* cl_khr_fp64 */
+
+#else
+
+/************************************************************************/
+
+/* Atomically writes `desired` into *object through Clang's __c11_atomic_store,
+   using the ordering translated from `order`; `scope` is unused. */
+_CL_OVERLOADABLE void QUAL(pocl_atomic_store) (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE desired, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  __c11_atomic_store(object, desired, c11_order);
+}
+
+/* Atomically reads *object through Clang's __c11_atomic_load, using the ordering translated from `order`; `scope` is unused. */
+_CL_OVERLOADABLE NONATOMIC_TYPE QUAL(pocl_atomic_load) (volatile Q ATOMIC_TYPE *object, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  return __c11_atomic_load(object, c11_order);
+}
+
+
+/* Atomically replaces *object with `desired` and returns what __c11_atomic_exchange
+   reports as the prior value, using the ordering translated from `order`; `scope` is unused. */
+_CL_OVERLOADABLE NONATOMIC_TYPE QUAL(pocl_atomic_exchange) (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE desired, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  return __c11_atomic_exchange(object, desired, c11_order);
+}
+
+/* Strong compare-and-exchange: forwards to __c11_atomic_compare_exchange_strong with both
+   orderings translated by CONV_ORDER and returns the builtin's result; `scope` is unused. */
+bool _CL_OVERLOADABLE QUAL(pocl_atomic_compare_exchange_strong) (volatile Q ATOMIC_TYPE *object,
+  private NONATOMIC_TYPE *expected, NONATOMIC_TYPE desired, memory_order success, memory_order failure, memory_scope scope)
+{
+  const int on_success = CONV_ORDER(success);
+  const int on_failure = CONV_ORDER(failure);
+  return __c11_atomic_compare_exchange_strong(object, expected, desired, on_success, on_failure);
+}
+
+/* Weak compare-and-exchange: forwards to __c11_atomic_compare_exchange_weak with both
+   orderings translated by CONV_ORDER and returns the builtin's result; `scope` is unused. */
+bool _CL_OVERLOADABLE QUAL(pocl_atomic_compare_exchange_weak) (volatile Q ATOMIC_TYPE *object,
+  private NONATOMIC_TYPE *expected, NONATOMIC_TYPE desired, memory_order success, memory_order failure, memory_scope scope)
+{
+  const int on_success = CONV_ORDER(success);
+  const int on_failure = CONV_ORDER(failure);
+  return __c11_atomic_compare_exchange_weak(object, expected, desired, on_success, on_failure);
+}
+
+#ifndef NON_INTEGER
+
+/* Atomic add via __c11_atomic_fetch_add; returns the builtin's result (the value
+   fetched from *object).  Ordering comes from `order`; `scope` is unused. */
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_add) (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE operand, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  return __c11_atomic_fetch_add(object, operand, c11_order);
+}
+
+/* Atomic subtract via __c11_atomic_fetch_sub; returns the builtin's result (the value
+   fetched from *object).  Ordering comes from `order`; `scope` is unused. */
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_sub) (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE operand, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  return __c11_atomic_fetch_sub(object, operand, c11_order);
+}
+
+/* Atomic bitwise OR via __c11_atomic_fetch_or; returns the builtin's result (the value
+   fetched from *object).  Ordering comes from `order`; `scope` is unused. */
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_or) (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE operand, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  return __c11_atomic_fetch_or(object, operand, c11_order);
+}
+
+/* Atomic bitwise XOR via __c11_atomic_fetch_xor; returns the builtin's result (the value
+   fetched from *object).  Ordering comes from `order`; `scope` is unused. */
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_xor) (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE operand, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  return __c11_atomic_fetch_xor(object, operand, c11_order);
+}
+
+/* Atomic bitwise AND via __c11_atomic_fetch_and; returns the builtin's result (the value
+   fetched from *object).  Ordering comes from `order`; `scope` is unused. */
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_and) (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE operand, memory_order order, memory_scope scope)
+{
+  const int c11_order = CONV_ORDER(order);
+  return __c11_atomic_fetch_and(object, operand, c11_order);
+}
+
+/* Atomic fetch-and-min returning the previous value.  int/uint use Clang's __sync builtins (called without an ordering
+   argument, so `order` is not consulted there); other integer types use a compare-exchange loop instead of trapping.  `scope` is unused. */
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_min) (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE operand, memory_order order, memory_scope scope)
+{
+#if defined(IS_INT)
+  return __sync_fetch_and_min((volatile Q NONATOMIC_TYPE *)object, operand);
+#elif defined(IS_UINT)
+  return __sync_fetch_and_umin((volatile Q NONATOMIC_TYPE *)object, operand);
+#else
+  NONATOMIC_TYPE seen = __c11_atomic_load(object, __ATOMIC_RELAXED);  /* a failed exchange refreshes `seen`, so the minimum is recomputed each retry */
+  while (!__c11_atomic_compare_exchange_weak(object, &seen, (operand < seen) ? operand : seen, CONV_ORDER(order), __ATOMIC_RELAXED)) {}
+  return seen;  /* value held immediately before the successful exchange */
+#endif
+}
+
+/* Atomic fetch-and-max returning the previous value.  int/uint use Clang's __sync builtins (called without an ordering
+   argument, so `order` is not consulted there); other integer types use a compare-exchange loop instead of trapping.  `scope` is unused. */
+NONATOMIC_TYPE _CL_OVERLOADABLE QUAL(pocl_atomic_fetch_max) (volatile Q ATOMIC_TYPE *object, NONATOMIC_TYPE operand, memory_order order, memory_scope scope)
+{
+#if defined(IS_INT)
+  return __sync_fetch_and_max((volatile Q NONATOMIC_TYPE *)object, operand);
+#elif defined(IS_UINT)
+  return __sync_fetch_and_umax((volatile Q NONATOMIC_TYPE *)object, operand);
+#else
+  NONATOMIC_TYPE seen = __c11_atomic_load(object, __ATOMIC_RELAXED);  /* a failed exchange refreshes `seen`, so the maximum is recomputed each retry */
+  while (!__c11_atomic_compare_exchange_weak(object, &seen, (operand > seen) ? operand : seen, CONV_ORDER(order), __ATOMIC_RELAXED)) {}
+  return seen;  /* value held immediately before the successful exchange */
+#endif
+}
+
+#endif
+
+/************************************************************************/
+
+
+#endif
diff --git a/lib/kernel/svm_atomics_x86_64.ll b/lib/kernel/svm_atomics_x86_64.ll
new file mode 100644
index 0000000..4412db3
--- /dev/null
+++ b/lib/kernel/svm_atomics_x86_64.ll
@@ -0,0 +1,8075 @@
+; ModuleID = 'svm_atomics_host.cl.bc'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_flag_test_and_set, __global flag: swaps in 1 and returns whether the old value was non-zero; %scope is unused.
+define zeroext i1 @_Z37pocl_atomic_flag_test_and_set__globalPVU8CLglobalU7_Atomici12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [                   ; dispatch on %order; any value not listed -> seq_cst
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 1 seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; previous flag value from whichever path ran
+  %9 = icmp ne i32 %.0, 0                         ; non-zero previous value -> true
+  ret i1 %9
+}
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_flag_clear, __global flag: atomically stores 0 to *%object; %scope is unused.
+define void @_Z30pocl_atomic_flag_clear__globalPVU8CLglobalU7_Atomici12memory_order12memory_scope(i32 addrspace(1)* nocapture %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [                   ; dispatch on %order; any value not listed -> seq_cst
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread                         ; 1 -> monotonic store
+    i32 2, label %.thread1                        ; 2 -> release
+    i32 3, label %.thread                         ; 3 -> monotonic store
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 0, i32 addrspace(1)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 0, i32 addrspace(1)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 0, i32 addrspace(1)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_store, __global i32: atomically stores %desired to *%object; %scope is unused.
+define void @_Z25pocl_atomic_store__globalPVU8CLglobalU7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* nocapture %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [                   ; dispatch on %order; any value not listed -> seq_cst
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread                         ; 1 -> monotonic store
+    i32 2, label %.thread1                        ; 2 -> release
+    i32 3, label %.thread                         ; 3 -> monotonic store
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_load, __global i32: atomically loads and returns *%object; %scope is unused.
+define i32 @_Z24pocl_atomic_load__globalPVU8CLglobalU7_Atomici12memory_order12memory_scope(i32 addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [                   ; dispatch on %order; any value not listed -> seq_cst
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread                         ; 2 -> monotonic load
+    i32 3, label %.thread                         ; 3 -> monotonic load
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i32, i32 addrspace(1)* %object monotonic, align 4
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i32, i32 addrspace(1)* %object acquire, align 4
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i32, i32 addrspace(1)* %object seq_cst, align 4
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ] ; loaded value from whichever path ran
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_exchange, __global i32: atomically swaps %desired into *%object and returns the old value; %scope is unused.
+define i32 @_Z28pocl_atomic_exchange__globalPVU8CLglobalU7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [                   ; dispatch on %order; any value not listed -> seq_cst
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; previous value from whichever path ran
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_compare_exchange_strong, __global i32: cmpxchg of *%object against *%expected; on failure the observed value is written back to *%expected; returns the success bit; %scope is unused.
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU8CLglobalU7_AtomiciPii12memory_orderS3_12memory_scope(i32 addrspace(1)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; converted success order: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ] ; converted failure order, same mapping as %10
+  switch i32 %10, label %28 [                     ; pick the success ordering; default -> monotonic
+    i32 1, label %21                              ; acquire success ordering
+    i32 2, label %21                              ; acquire success ordering
+    i32 3, label %50                              ; release success ordering
+    i32 4, label %23                              ; acq_rel success ordering
+    i32 5, label %25                              ; seq_cst success ordering
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2                 ; true when %20 is 1 or 2 -> acquire failure ordering, otherwise monotonic
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2               ; true when %20 is 1 or 2 -> acquire failure ordering, otherwise monotonic
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [                     ; failure ordering for seq_cst success; default -> monotonic
+    i32 1, label %76                              ; acquire failure ordering
+    i32 2, label %76                              ; acquire failure ordering
+    i32 5, label %80                              ; seq_cst failure ordering
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ] ; common exit: success bit from the cmpxchg that ran
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg volatile i32 addrspace(1)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4          ; failure path: report the observed value through %expected (same pattern in every variant below)
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg volatile i32 addrspace(1)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg volatile i32 addrspace(1)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg volatile i32 addrspace(1)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg volatile i32 addrspace(1)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_compare_exchange_weak, __global i32: weak cmpxchg of *%object against *%expected; on failure the observed value is written back to *%expected; returns the success bit; %scope is unused.
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU8CLglobalU7_AtomiciPii12memory_orderS3_12memory_scope(i32 addrspace(1)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; converted success order: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ] ; converted failure order, same mapping as %10
+  switch i32 %10, label %28 [                     ; pick the success ordering; default -> monotonic
+    i32 1, label %21                              ; acquire success ordering
+    i32 2, label %21                              ; acquire success ordering
+    i32 3, label %50                              ; release success ordering
+    i32 4, label %23                              ; acq_rel success ordering
+    i32 5, label %25                              ; seq_cst success ordering
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2                 ; true when %20 is 1 or 2 -> acquire failure ordering, otherwise monotonic
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2               ; true when %20 is 1 or 2 -> acquire failure ordering, otherwise monotonic
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [                     ; failure ordering for seq_cst success; default -> monotonic
+    i32 1, label %76                              ; acquire failure ordering
+    i32 2, label %76                              ; acquire failure ordering
+    i32 5, label %80                              ; seq_cst failure ordering
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ] ; common exit: success bit from the cmpxchg that ran
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4          ; failure path: report the observed value through %expected (same pattern in every variant below)
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_fetch_add, __global i32: atomicrmw add of %operand to *%object, returns the atomicrmw result (old value); %scope is unused.
+define i32 @_Z29pocl_atomic_fetch_add__globalPVU8CLglobalU7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [                   ; dispatch on %order; any value not listed -> seq_cst
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; atomicrmw result from whichever path ran
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_fetch_sub, __global i32: atomicrmw sub of %operand from *%object, returns the atomicrmw result (old value); %scope is unused.
+define i32 @_Z29pocl_atomic_fetch_sub__globalPVU8CLglobalU7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [                   ; dispatch on %order; any value not listed -> seq_cst
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; atomicrmw result from whichever path ran
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable -- pocl_atomic_fetch_or, __global i32: atomicrmw or of %operand into *%object, returns the atomicrmw result (old value); %scope is unused.
+define i32 @_Z28pocl_atomic_fetch_or__globalPVU8CLglobalU7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [                   ; dispatch on %order; any value not listed -> seq_cst
+    i32 0, label %.thread                         ; 0 -> monotonic
+    i32 1, label %.thread1                        ; 1 -> acquire
+    i32 2, label %.thread2                        ; 2 -> release
+    i32 3, label %4                               ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; atomicrmw result from whichever path ran
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_xor on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `int` (`i`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw xor` of %operand on %object and returns the
+; atomicrmw result (the value held at %object before the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_xor__globalPVU8CLglobalU7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_and on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `int` (`i`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw and` of %operand on %object and returns the
+; atomicrmw result (the value held at %object before the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_and__globalPVU8CLglobalU7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_min on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `int` (`i`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw min` (LLVM's signed minimum) of %operand on
+; %object and returns the atomicrmw result (the value held at %object before
+; the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_min__globalPVU8CLglobalU7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile min i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_max on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `int` (`i`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw max` (LLVM's signed maximum) of %operand on
+; %object and returns the atomicrmw result (the value held at %object before
+; the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_max__globalPVU8CLglobalU7_Atomicii12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile max i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_store on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `unsigned int` (`j`) for the atomic object and the stored value.
+; Writes %desired to %object with a volatile atomic store (align 4) and
+; returns nothing.
+; Ordering is picked from %order: 2 -> release; 0, 1 and 3 -> monotonic;
+; any other value -> seq_cst. %scope is never read.
+; NOTE(review): 1 and 3 presumably are acquire/acq_rel, which LLVM does not
+; accept on a store, so the monotonic fallback looks deliberate -- confirm
+; against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define void @_Z25pocl_atomic_store__globalPVU8CLglobalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* nocapture %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(1)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; pocl_atomic_load on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `unsigned int` (`j`) for the atomic object.
+; Reads %object with a volatile atomic load (align 4) and returns the loaded
+; value.
+; Ordering is picked from %order: 1 -> acquire; 0, 2 and 3 -> monotonic;
+; any other value -> seq_cst. %scope is never read.
+; NOTE(review): 2 and 3 presumably are release/acq_rel, which LLVM does not
+; accept on a load, so the monotonic fallback looks deliberate -- confirm
+; against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z24pocl_atomic_load__globalPVU8CLglobalU7_Atomicj12memory_order12memory_scope(i32 addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i32, i32 addrspace(1)* %object monotonic, align 4
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i32, i32 addrspace(1)* %object acquire, align 4
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i32, i32 addrspace(1)* %object seq_cst, align 4
+  br label %5
+
+; Merge point: %.0 is the value read by whichever load above was executed.
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_exchange on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `unsigned int` (`j`) for the atomic object and the new value.
+; Performs a volatile `atomicrmw xchg` that stores %desired into %object and
+; returns the atomicrmw result (the value held at %object before the swap).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_exchange__globalPVU8CLglobalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(1)* %object, i32 %desired seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_compare_exchange_strong on an i32 in addrspace(1); the
+; Itanium-mangled name encodes `unsigned int` (`j`) for the atomic object,
+; *%expected and %desired.
+; Loads the comparand from *%expected, runs a volatile (strong) cmpxchg of
+; %object against it with %desired, and returns the cmpxchg success flag.
+; When the exchange fails, the value observed at %object is written back to
+; *%expected.
+; %success and %failure are first remapped to internal codes %10 and %20:
+;   0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5.
+; Success ordering from %10: 0 -> monotonic, 2 -> acquire, 3 -> release,
+; 4 -> acq_rel, 5 -> seq_cst.
+; Failure ordering from %20: acquire when %20 == 2 and the success ordering
+; is acquire, acq_rel or seq_cst; seq_cst when %20 == 5 and the success
+; ordering is seq_cst; monotonic in every other case.
+; The `i32 1` switch cases (and the lower half of the `ult 2` range tests)
+; can never match, because neither phi can produce the value 1.
+; %scope is never read.
+; NOTE(review): the incoming 0..3 presumably are memory_order_relaxed/
+; acquire/release/acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU8CLglobalU7_AtomicjPjj12memory_orderS3_12memory_scope(i32 addrspace(1)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Remap %success into the internal code %10.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; %10 now holds the remapped %success; remap %failure the same way into %20.
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; Dispatch on the success code; the default edge handles %10 == 0.
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; Success ordering acquire: pick acquire (%39) or monotonic (%36) on failure.
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; Success ordering acq_rel: pick acquire (%61) or monotonic (%58) on failure.
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; Success ordering seq_cst: pick acquire (%76), seq_cst (%80) or monotonic
+; (%72) on failure.
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; Common exit: return the success flag of whichever cmpxchg was executed.
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; Success ordering monotonic; failure ordering is always monotonic here.
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg volatile i32 addrspace(1)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; Failed exchange: publish the observed value through %expected.
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; Success ordering release; failure ordering is always monotonic here.
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg volatile i32 addrspace(1)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg volatile i32 addrspace(1)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg volatile i32 addrspace(1)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg volatile i32 addrspace(1)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_compare_exchange_weak on an i32 in addrspace(1); the
+; Itanium-mangled name encodes `unsigned int` (`j`) for the atomic object,
+; *%expected and %desired.
+; Loads the comparand from *%expected, runs a volatile `cmpxchg weak` of
+; %object against it with %desired, and returns the cmpxchg success flag.
+; When the exchange fails, the value observed at %object is written back to
+; *%expected. Every cmpxchg in this function carries the `weak` marker.
+; %success and %failure are first remapped to internal codes %10 and %20:
+;   0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5.
+; Success ordering from %10: 0 -> monotonic, 2 -> acquire, 3 -> release,
+; 4 -> acq_rel, 5 -> seq_cst.
+; Failure ordering from %20: acquire when %20 == 2 and the success ordering
+; is acquire, acq_rel or seq_cst; seq_cst when %20 == 5 and the success
+; ordering is seq_cst; monotonic in every other case.
+; The `i32 1` switch cases (and the lower half of the `ult 2` range tests)
+; can never match, because neither phi can produce the value 1.
+; %scope is never read.
+; NOTE(review): the incoming 0..3 presumably are memory_order_relaxed/
+; acquire/release/acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU8CLglobalU7_AtomicjPjj12memory_orderS3_12memory_scope(i32 addrspace(1)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Remap %success into the internal code %10.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; %10 now holds the remapped %success; remap %failure the same way into %20.
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; Dispatch on the success code; the default edge handles %10 == 0.
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; Success ordering acquire: pick acquire (%39) or monotonic (%36) on failure.
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; Success ordering acq_rel: pick acquire (%61) or monotonic (%58) on failure.
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; Success ordering seq_cst: pick acquire (%76), seq_cst (%80) or monotonic
+; (%72) on failure.
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; Common exit: return the success flag of whichever cmpxchg was executed.
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; Success ordering monotonic; failure ordering is always monotonic here.
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; Failed exchange: publish the observed value through %expected.
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; Success ordering release; failure ordering is always monotonic here.
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg weak volatile i32 addrspace(1)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_fetch_add on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `unsigned int` (`j`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw add` of %operand on %object and returns the
+; atomicrmw result (the value held at %object before the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_add__globalPVU8CLglobalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_sub on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `unsigned int` (`j`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw sub` of %operand on %object and returns the
+; atomicrmw result (the value held at %object before the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_sub__globalPVU8CLglobalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_or on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `unsigned int` (`j`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw or` of %operand on %object and returns the
+; atomicrmw result (the value held at %object before the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_or__globalPVU8CLglobalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_xor on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `unsigned int` (`j`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw xor` of %operand on %object and returns the
+; atomicrmw result (the value held at %object before the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_xor__globalPVU8CLglobalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_and on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `unsigned int` (`j`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw and` of %operand on %object and returns the
+; atomicrmw result (the value held at %object before the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_and__globalPVU8CLglobalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_min on an i32 in addrspace(1); the Itanium-mangled name
+; encodes `unsigned int` (`j`) for the atomic object and the operand.
+; Performs a volatile `atomicrmw umin` (LLVM's unsigned minimum) of %operand
+; on %object and returns the atomicrmw result (the value held at %object
+; before the operation).
+; Ordering is picked from %order: 0 -> monotonic, 1 -> acquire, 2 -> release,
+; 3 -> acq_rel, any other value -> seq_cst. %scope is never read.
+; NOTE(review): 0..3 presumably are memory_order_relaxed/acquire/release/
+; acq_rel -- confirm against pocl's memory_order definition.
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_min__globalPVU8CLglobalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umin i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; Merge point: %.0 is the result of whichever atomicrmw above was executed.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z29pocl_atomic_fetch_max__globalPVU8CLglobalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(1)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umax i32 addrspace(1)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z25pocl_atomic_store__globalPVU8CLglobalU7_Atomicff12memory_order12memory_scope(float addrspace(1)* nocapture %object, float %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %5 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast float %desired to i32
+  %2 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  store atomic volatile i32 %1, i32 addrspace(1)* %2 release, align 4
+  br label %13
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast float %desired to i32
+  %4 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br label %9
+
+; <label>:5                                       ; preds = %0
+  %6 = icmp eq i32 %order, 3
+  %7 = bitcast float %desired to i32
+  %8 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br i1 %6, label %9, label %12
+
+; <label>:9                                       ; preds = %5, %.thread
+  %10 = phi i32 addrspace(1)* [ %4, %.thread ], [ %8, %5 ]
+  %11 = phi i32 [ %3, %.thread ], [ %7, %5 ]
+  store atomic volatile i32 %11, i32 addrspace(1)* %10 monotonic, align 4
+  br label %13
+
+; <label>:12                                      ; preds = %5
+  store atomic volatile i32 %7, i32 addrspace(1)* %8 seq_cst, align 4
+  br label %13
+
+; <label>:13                                      ; preds = %12, %.thread1, %9
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define float @_Z24pocl_atomic_load__globalPVU8CLglobalU7_Atomicf12memory_order12memory_scope(float addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %4 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %2 = load atomic volatile i32, i32 addrspace(1)* %1 acquire, align 4
+  br label %12
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br label %7
+
+; <label>:4                                       ; preds = %0
+  %5 = icmp eq i32 %order, 3
+  %6 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br i1 %5, label %7, label %10
+
+; <label>:7                                       ; preds = %4, %.thread
+  %8 = phi i32 addrspace(1)* [ %3, %.thread ], [ %6, %4 ]
+  %9 = load atomic volatile i32, i32 addrspace(1)* %8 monotonic, align 4
+  br label %12
+
+; <label>:10                                      ; preds = %4
+  %11 = load atomic volatile i32, i32 addrspace(1)* %6 seq_cst, align 4
+  br label %12
+
+; <label>:12                                      ; preds = %10, %.thread1, %7
+  %.sroa.0.0 = phi i32 [ %9, %7 ], [ %11, %10 ], [ %2, %.thread1 ]
+  %13 = bitcast i32 %.sroa.0.0 to float
+  ret float %13
+}
+
+; Function Attrs: nounwind uwtable
+define float @_Z28pocl_atomic_exchange__globalPVU8CLglobalU7_Atomicff12memory_order12memory_scope(float addrspace(1)* %object, float %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %10 [
+    i32 0, label %.thread
+    i32 1, label %.thread2
+    i32 2, label %.thread3
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = bitcast float %desired to i32
+  %2 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %3 = atomicrmw volatile xchg i32 addrspace(1)* %2, i32 %1 monotonic
+  br label %18
+
+.thread2:                                         ; preds = %0
+  %4 = bitcast float %desired to i32
+  %5 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %6 = atomicrmw volatile xchg i32 addrspace(1)* %5, i32 %4 acquire
+  br label %18
+
+.thread3:                                         ; preds = %0
+  %7 = bitcast float %desired to i32
+  %8 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %9 = atomicrmw volatile xchg i32 addrspace(1)* %8, i32 %7 release
+  br label %18
+
+; <label>:10                                      ; preds = %0
+  %11 = icmp eq i32 %order, 3
+  %12 = bitcast float %desired to i32
+  %13 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  br i1 %11, label %14, label %16
+
+; <label>:14                                      ; preds = %10
+  %15 = atomicrmw volatile xchg i32 addrspace(1)* %13, i32 %12 acq_rel
+  br label %18
+
+; <label>:16                                      ; preds = %10
+  %17 = atomicrmw volatile xchg i32 addrspace(1)* %13, i32 %12 seq_cst
+  br label %18
+
+; <label>:18                                      ; preds = %16, %14, %.thread3, %.thread2, %.thread
+  %.sroa.0.0 = phi i32 [ %3, %.thread ], [ %17, %16 ], [ %15, %14 ], [ %9, %.thread3 ], [ %6, %.thread2 ]
+  %19 = bitcast i32 %.sroa.0.0 to float
+  ret float %19
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU8CLglobalU7_AtomicfPff12memory_orderS3_12memory_scope(float addrspace(1)* %object, float* nocapture %expected, float %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast float %desired to i32
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %23 = bitcast float* %expected to i32*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i32, i32* %23, align 4
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i32, i32* %23, align 4
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i32, i32* %23, align 4
+  %33 = cmpxchg volatile i32 addrspace(1)* %22, i32 %32, i32 %11 monotonic monotonic
+  %34 = extractvalue { i32, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i32, i1 } %33, 0
+  store i32 %36, i32* %23, align 4
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg volatile i32 addrspace(1)* %22, i32 %25, i32 %11 acquire monotonic
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg volatile i32 addrspace(1)* %22, i32 %25, i32 %11 acquire acquire
+  %44 = extractvalue { i32, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i32, i1 } %40, 0
+  store i32 %46, i32* %23, align 4
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i32, i1 } %43, 0
+  store i32 %50, i32* %23, align 4
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i32, i32* %23, align 4
+  %55 = cmpxchg volatile i32 addrspace(1)* %22, i32 %54, i32 %11 release monotonic
+  %56 = extractvalue { i32, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i32, i1 } %55, 0
+  store i32 %58, i32* %23, align 4
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg volatile i32 addrspace(1)* %22, i32 %27, i32 %11 acq_rel monotonic
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg volatile i32 addrspace(1)* %22, i32 %27, i32 %11 acq_rel acquire
+  %66 = extractvalue { i32, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i32, i1 } %62, 0
+  store i32 %68, i32* %23, align 4
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i32, i1 } %65, 0
+  store i32 %72, i32* %23, align 4
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i32, i32* %23, align 4
+  %77 = cmpxchg volatile i32 addrspace(1)* %22, i32 %76, i32 %11 seq_cst monotonic
+  %78 = extractvalue { i32, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i32, i32* %23, align 4
+  %81 = cmpxchg volatile i32 addrspace(1)* %22, i32 %80, i32 %11 seq_cst acquire
+  %82 = extractvalue { i32, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i32, i32* %23, align 4
+  %85 = cmpxchg volatile i32 addrspace(1)* %22, i32 %84, i32 %11 seq_cst seq_cst
+  %86 = extractvalue { i32, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i32, i1 } %77, 0
+  store i32 %88, i32* %23, align 4
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i32, i1 } %81, 0
+  store i32 %92, i32* %23, align 4
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i32, i1 } %85, 0
+  store i32 %96, i32* %23, align 4
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU8CLglobalU7_AtomicfPff12memory_orderS3_12memory_scope(float addrspace(1)* %object, float* nocapture %expected, float %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast float %desired to i32
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast float addrspace(1)* %object to i32 addrspace(1)*
+  %23 = bitcast float* %expected to i32*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i32, i32* %23, align 4
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i32, i32* %23, align 4
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i32, i32* %23, align 4
+  %33 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %32, i32 %11 monotonic monotonic
+  %34 = extractvalue { i32, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i32, i1 } %33, 0
+  store i32 %36, i32* %23, align 4
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %25, i32 %11 acquire monotonic
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %25, i32 %11 acquire acquire
+  %44 = extractvalue { i32, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i32, i1 } %40, 0
+  store i32 %46, i32* %23, align 4
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i32, i1 } %43, 0
+  store i32 %50, i32* %23, align 4
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i32, i32* %23, align 4
+  %55 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %54, i32 %11 release monotonic
+  %56 = extractvalue { i32, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i32, i1 } %55, 0
+  store i32 %58, i32* %23, align 4
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %27, i32 %11 acq_rel monotonic
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %27, i32 %11 acq_rel acquire
+  %66 = extractvalue { i32, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i32, i1 } %62, 0
+  store i32 %68, i32* %23, align 4
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i32, i1 } %65, 0
+  store i32 %72, i32* %23, align 4
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i32, i32* %23, align 4
+  %77 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %76, i32 %11 seq_cst monotonic
+  %78 = extractvalue { i32, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i32, i32* %23, align 4
+  %81 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %80, i32 %11 seq_cst acquire
+  %82 = extractvalue { i32, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i32, i32* %23, align 4
+  %85 = cmpxchg weak volatile i32 addrspace(1)* %22, i32 %84, i32 %11 seq_cst seq_cst
+  %86 = extractvalue { i32, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i32, i1 } %77, 0
+  store i32 %88, i32* %23, align 4
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i32, i1 } %81, 0
+  store i32 %92, i32* %23, align 4
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i32, i1 } %85, 0
+  store i32 %96, i32* %23, align 4
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z25pocl_atomic_store__globalPVU8CLglobalU7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* nocapture %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object monotonic, align 8
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object release, align 8
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object seq_cst, align 8
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z24pocl_atomic_load__globalPVU8CLglobalU7_Atomicl12memory_order12memory_scope(i64 addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i64, i64 addrspace(1)* %object monotonic, align 8
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i64, i64 addrspace(1)* %object acquire, align 8
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i64, i64 addrspace(1)* %object seq_cst, align 8
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_exchange__globalPVU8CLglobalU7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU8CLglobalU7_AtomiclPll12memory_orderS3_12memory_scope(i64 addrspace(1)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg volatile i64 addrspace(1)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg volatile i64 addrspace(1)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg volatile i64 addrspace(1)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg volatile i64 addrspace(1)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg volatile i64 addrspace(1)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU8CLglobalU7_AtomiclPll12memory_orderS3_12memory_scope(i64 addrspace(1)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 { ; weak 64-bit compare-exchange on %object (addrspace 1): returns the cmpxchg success bit and, on failure, writes the observed value to *%expected. %scope is never read.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; %success remapped: 0->0, 1->2, 2->3, 3->4, any other value->5
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ] ; %failure remapped with the same table as %10
+  switch i32 %10, label %28 [ ; dispatch on the success ordering; the default edge (monotonic) is reached only for %10 == 0, since the phi yields 0, 2, 3, 4 or 5
+    i32 1, label %21
+    i32 2, label %21 ; acquire success paths
+    i32 3, label %50 ; release success path
+    i32 4, label %23 ; acq_rel success paths
+    i32 5, label %25 ; seq_cst success paths
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2 ; true iff %20 is 1 or 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36 ; failure ordering: acquire (%39) or monotonic (%36)
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2 ; true iff %20 is 1 or 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58 ; failure ordering: acquire (%61) or monotonic (%58)
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [ ; failure ordering: acquire (%76), seq_cst (%80), anything else monotonic (%72)
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ] ; success flag (widened to i8) from whichever cmpxchg path ran
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1 ; field 1 = success bit
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0 ; field 0 = value found in *%object
+  store i64 %33, i64* %expected, align 8 ; failure only: hand the observed value back through %expected (same pattern on every path below)
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %51, i64 %desired release monotonic ; %20 is not consulted on this path: failure ordering is always monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_add__globalPVU8CLglobalU7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomic 64-bit add of %operand into *%object (addrspace 1); returns the atomicrmw result, i.e. the value held before the add. %scope is never read.
+  switch i32 %order, label %6 [ ; runtime %order picks the LLVM ordering; any value other than 0-3 takes the seq_cst default
+    i32 0, label %.thread ; monotonic
+    i32 1, label %.thread1 ; acquire
+    i32 2, label %.thread2 ; release
+    i32 3, label %4 ; acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever ordering variant executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_sub__globalPVU8CLglobalU7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomic 64-bit subtract of %operand from *%object (addrspace 1); returns the atomicrmw result, i.e. the value held before the subtract. %scope is never read.
+  switch i32 %order, label %6 [ ; runtime %order picks the LLVM ordering; any value other than 0-3 takes the seq_cst default
+    i32 0, label %.thread ; monotonic
+    i32 1, label %.thread1 ; acquire
+    i32 2, label %.thread2 ; release
+    i32 3, label %4 ; acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever ordering variant executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_or__globalPVU8CLglobalU7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomic 64-bit bitwise OR of %operand into *%object (addrspace 1); returns the atomicrmw result, i.e. the value held before the OR. %scope is never read.
+  switch i32 %order, label %6 [ ; runtime %order picks the LLVM ordering; any value other than 0-3 takes the seq_cst default
+    i32 0, label %.thread ; monotonic
+    i32 1, label %.thread1 ; acquire
+    i32 2, label %.thread2 ; release
+    i32 3, label %4 ; acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever ordering variant executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_xor__globalPVU8CLglobalU7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomic 64-bit bitwise XOR of %operand into *%object (addrspace 1); returns the atomicrmw result, i.e. the value held before the XOR. %scope is never read.
+  switch i32 %order, label %6 [ ; runtime %order picks the LLVM ordering; any value other than 0-3 takes the seq_cst default
+    i32 0, label %.thread ; monotonic
+    i32 1, label %.thread1 ; acquire
+    i32 2, label %.thread2 ; release
+    i32 3, label %4 ; acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever ordering variant executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_and__globalPVU8CLglobalU7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomic 64-bit bitwise AND of %operand into *%object (addrspace 1); returns the atomicrmw result, i.e. the value held before the AND. %scope is never read.
+  switch i32 %order, label %6 [ ; runtime %order picks the LLVM ordering; any value other than 0-3 takes the seq_cst default
+    i32 0, label %.thread ; monotonic
+    i32 1, label %.thread1 ; acquire
+    i32 2, label %.thread2 ; release
+    i32 3, label %4 ; acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever ordering variant executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_min__globalPVU8CLglobalU7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomic signed 64-bit minimum of *%object and %operand (addrspace 1); returns the atomicrmw result, i.e. the value held before the update. %scope is never read. Not noreturn and %object not readnone: the body returns and accesses memory through %object.
+  switch i32 %order, label %6 [ ; runtime %order picks the LLVM ordering; any value other than 0-3 takes the seq_cst default
+    i32 0, label %.thread ; monotonic
+    i32 1, label %.thread1 ; acquire
+    i32 2, label %.thread2 ; release
+    i32 3, label %4 ; acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile min i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever ordering variant executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z29pocl_atomic_fetch_max__globalPVU8CLglobalU7_Atomicll12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 { ; atomic signed 64-bit maximum of *%object and %operand (addrspace 1); returns the atomicrmw result, i.e. the value held before the update. %scope is never read. Not noreturn and %object not readnone: the body returns and accesses memory through %object.
+  switch i32 %order, label %6 [ ; runtime %order picks the LLVM ordering; any value other than 0-3 takes the seq_cst default
+    i32 0, label %.thread ; monotonic
+    i32 1, label %.thread1 ; acquire
+    i32 2, label %.thread2 ; release
+    i32 3, label %4 ; acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile max i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever ordering variant executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z25pocl_atomic_store__globalPVU8CLglobalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* nocapture %object, i64 %desired, i32 %order, i32 %scope) #0 { ; atomic 64-bit store of %desired to *%object (addrspace 1). %scope is never read.
+  switch i32 %order, label %1 [ ; %order 0, 1 and 3 share the monotonic store, 2 is release, any other value is seq_cst
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object monotonic, align 8
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object release, align 8
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(1)* %object seq_cst, align 8
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z24pocl_atomic_load__globalPVU8CLglobalU7_Atomicm12memory_order12memory_scope(i64 addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 { ; atomic 64-bit load from *%object (addrspace 1); returns the loaded value. %scope is never read.
+  switch i32 %order, label %3 [ ; %order 0, 2 and 3 share the monotonic load, 1 is acquire, any other value is seq_cst
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i64, i64 addrspace(1)* %object monotonic, align 8
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i64, i64 addrspace(1)* %object acquire, align 8
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i64, i64 addrspace(1)* %object seq_cst, align 8
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ] ; value from whichever ordering variant executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_exchange__globalPVU8CLglobalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %desired, i32 %order, i32 %scope) #0 { ; atomic 64-bit exchange: writes %desired to *%object (addrspace 1) and returns the atomicrmw result, i.e. the value held before the swap. %scope is never read.
+  switch i32 %order, label %6 [ ; runtime %order picks the LLVM ordering; any value other than 0-3 takes the seq_cst default
+    i32 0, label %.thread ; monotonic
+    i32 1, label %.thread1 ; acquire
+    i32 2, label %.thread2 ; release
+    i32 3, label %4 ; acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i64 addrspace(1)* %object, i64 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; result of whichever ordering variant executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU8CLglobalU7_AtomicmPmm12memory_orderS3_12memory_scope(i64 addrspace(1)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 { ; strong (non-weak) 64-bit compare-exchange on %object (addrspace 1): returns the cmpxchg success bit and, on failure, writes the observed value to *%expected. %scope is never read.
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; %success remapped: 0->0, 1->2, 2->3, 3->4, any other value->5
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ] ; %failure remapped with the same table as %10
+  switch i32 %10, label %28 [ ; dispatch on the success ordering; the default edge (monotonic) is reached only for %10 == 0, since the phi yields 0, 2, 3, 4 or 5
+    i32 1, label %21
+    i32 2, label %21 ; acquire success paths
+    i32 3, label %50 ; release success path
+    i32 4, label %23 ; acq_rel success paths
+    i32 5, label %25 ; seq_cst success paths
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2 ; true iff %20 is 1 or 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36 ; failure ordering: acquire (%39) or monotonic (%36)
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2 ; true iff %20 is 1 or 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58 ; failure ordering: acquire (%61) or monotonic (%58)
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [ ; failure ordering: acquire (%76), seq_cst (%80), anything else monotonic (%72)
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ] ; success flag (widened to i8) from whichever cmpxchg path ran
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg volatile i64 addrspace(1)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1 ; field 1 = success bit
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0 ; field 0 = value found in *%object
+  store i64 %33, i64* %expected, align 8 ; failure only: hand the observed value back through %expected (same pattern on every path below)
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg volatile i64 addrspace(1)* %object, i64 %51, i64 %desired release monotonic ; %20 is not consulted on this path: failure ordering is always monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg volatile i64 addrspace(1)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg volatile i64 addrspace(1)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg volatile i64 addrspace(1)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU8CLglobalU7_AtomicmPmm12memory_orderS3_12memory_scope(i64 addrspace(1)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg weak volatile i64 addrspace(1)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable -- atomic add of %operand into the i64 at %object (addrspace 1); returns the atomicrmw result
+define i64 @_Z29pocl_atomic_fetch_add__globalPVU8CLglobalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [  ; pick the atomicrmw ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel; any other value -> seq_cst (default %6)
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever atomicrmw ran
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable -- atomic subtract of %operand from the i64 at %object (addrspace 1); returns the atomicrmw result
+define i64 @_Z29pocl_atomic_fetch_sub__globalPVU8CLglobalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [  ; pick the atomicrmw ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel; any other value -> seq_cst (default %6)
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever atomicrmw ran
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable -- atomic bitwise OR of %operand into the i64 at %object (addrspace 1); returns the atomicrmw result
+define i64 @_Z28pocl_atomic_fetch_or__globalPVU8CLglobalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [  ; pick the atomicrmw ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel; any other value -> seq_cst (default %6)
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever atomicrmw ran
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable -- atomic bitwise XOR of %operand into the i64 at %object (addrspace 1); returns the atomicrmw result
+define i64 @_Z29pocl_atomic_fetch_xor__globalPVU8CLglobalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [  ; pick the atomicrmw ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel; any other value -> seq_cst (default %6)
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever atomicrmw ran
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable -- atomic bitwise AND of %operand into the i64 at %object (addrspace 1); returns the atomicrmw result
+define i64 @_Z29pocl_atomic_fetch_and__globalPVU8CLglobalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [  ; pick the atomicrmw ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel; any other value -> seq_cst (default %6)
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever atomicrmw ran
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable -- atomic unsigned minimum of %operand and the i64 at %object (addrspace 1); returns the atomicrmw result
+define i64 @_Z29pocl_atomic_fetch_min__globalPVU8CLglobalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [  ; pick the atomicrmw ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel; any other value -> seq_cst (default %6)
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand monotonic  ; reads and writes *%object, so %object must not be readnone
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umin i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever atomicrmw ran
+  ret i64 %.0  ; every path returns here, so the function must not carry noreturn
+}
+
+; Function Attrs: nounwind uwtable -- atomic unsigned maximum of %operand and the i64 at %object (addrspace 1); returns the atomicrmw result
+define i64 @_Z29pocl_atomic_fetch_max__globalPVU8CLglobalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(1)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [  ; pick the atomicrmw ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread2  ; 2 -> release
+    i32 3, label %4  ; 3 -> acq_rel; any other value -> seq_cst (default %6)
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand monotonic  ; reads and writes *%object, so %object must not be readnone
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umax i64 addrspace(1)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]  ; result of whichever atomicrmw ran
+  ret i64 %.0  ; every path returns here, so the function must not carry noreturn
+}
+
+; Function Attrs: nounwind uwtable -- atomically stores the bit pattern of double %desired to %object (addrspace 1) as an i64
+define void @_Z25pocl_atomic_store__globalPVU8CLglobalU7_Atomicdd12memory_order12memory_scope(double addrspace(1)* nocapture %object, double %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %5 [  ; pick the store ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread  ; 1 -> monotonic as well
+    i32 2, label %.thread1  ; 2 -> release
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast double %desired to i64  ; the atomic store is done on the raw 64-bit pattern
+  %2 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  store atomic volatile i64 %1, i64 addrspace(1)* %2 release, align 8
+  br label %13
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast double %desired to i64
+  %4 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  br label %9
+
+; <label>:5                                       ; preds = %0
+  %6 = icmp eq i32 %order, 3  ; 3 -> monotonic (shares %9); any remaining value -> seq_cst (%12)
+  %7 = bitcast double %desired to i64
+  %8 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  br i1 %6, label %9, label %12
+
+; <label>:9                                       ; preds = %5, %.thread
+  %10 = phi i64 addrspace(1)* [ %4, %.thread ], [ %8, %5 ]
+  %11 = phi i64 [ %3, %.thread ], [ %7, %5 ]
+  store atomic volatile i64 %11, i64 addrspace(1)* %10 monotonic, align 8  ; reached for %order 0, 1 and 3
+  br label %13
+
+; <label>:12                                      ; preds = %5
+  store atomic volatile i64 %7, i64 addrspace(1)* %8 seq_cst, align 8
+  br label %13
+
+; <label>:13                                      ; preds = %12, %.thread1, %9
+  ret void
+}
+
+; Function Attrs: nounwind uwtable -- atomically loads the i64 at %object (addrspace 1) and returns it reinterpreted as a double
+define double @_Z24pocl_atomic_load__globalPVU8CLglobalU7_Atomicd12memory_order12memory_scope(double addrspace(1)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %4 [  ; pick the load ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread1  ; 1 -> acquire
+    i32 2, label %.thread  ; 2 -> monotonic as well
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast double addrspace(1)* %object to i64 addrspace(1)*  ; the atomic load is done on the raw 64-bit pattern
+  %2 = load atomic volatile i64, i64 addrspace(1)* %1 acquire, align 8
+  br label %12
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  br label %7
+
+; <label>:4                                       ; preds = %0
+  %5 = icmp eq i32 %order, 3  ; 3 -> monotonic (shares %7); any remaining value -> seq_cst (%10)
+  %6 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  br i1 %5, label %7, label %10
+
+; <label>:7                                       ; preds = %4, %.thread
+  %8 = phi i64 addrspace(1)* [ %3, %.thread ], [ %6, %4 ]
+  %9 = load atomic volatile i64, i64 addrspace(1)* %8 monotonic, align 8  ; reached for %order 0, 2 and 3
+  br label %12
+
+; <label>:10                                      ; preds = %4
+  %11 = load atomic volatile i64, i64 addrspace(1)* %6 seq_cst, align 8
+  br label %12
+
+; <label>:12                                      ; preds = %10, %.thread1, %7
+  %.sroa.0.0 = phi i64 [ %9, %7 ], [ %11, %10 ], [ %2, %.thread1 ]  ; bits read by whichever load ran
+  %13 = bitcast i64 %.sroa.0.0 to double
+  ret double %13
+}
+
+; Function Attrs: nounwind uwtable -- atomically swaps the bit pattern of double %desired into %object (addrspace 1); returns the xchg result as a double
+define double @_Z28pocl_atomic_exchange__globalPVU8CLglobalU7_Atomicdd12memory_order12memory_scope(double addrspace(1)* %object, double %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %10 [  ; pick the xchg ordering from %order; %scope is never read
+    i32 0, label %.thread  ; 0 -> monotonic
+    i32 1, label %.thread2  ; 1 -> acquire
+    i32 2, label %.thread3  ; 2 -> release
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = bitcast double %desired to i64  ; the exchange is done on the raw 64-bit pattern
+  %2 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  %3 = atomicrmw volatile xchg i64 addrspace(1)* %2, i64 %1 monotonic
+  br label %18
+
+.thread2:                                         ; preds = %0
+  %4 = bitcast double %desired to i64
+  %5 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  %6 = atomicrmw volatile xchg i64 addrspace(1)* %5, i64 %4 acquire
+  br label %18
+
+.thread3:                                         ; preds = %0
+  %7 = bitcast double %desired to i64
+  %8 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  %9 = atomicrmw volatile xchg i64 addrspace(1)* %8, i64 %7 release
+  br label %18
+
+; <label>:10                                      ; preds = %0
+  %11 = icmp eq i32 %order, 3  ; 3 -> acq_rel (%14); any remaining value -> seq_cst (%16)
+  %12 = bitcast double %desired to i64
+  %13 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  br i1 %11, label %14, label %16
+
+; <label>:14                                      ; preds = %10
+  %15 = atomicrmw volatile xchg i64 addrspace(1)* %13, i64 %12 acq_rel
+  br label %18
+
+; <label>:16                                      ; preds = %10
+  %17 = atomicrmw volatile xchg i64 addrspace(1)* %13, i64 %12 seq_cst
+  br label %18
+
+; <label>:18                                      ; preds = %16, %14, %.thread3, %.thread2, %.thread
+  %.sroa.0.0 = phi i64 [ %3, %.thread ], [ %17, %16 ], [ %15, %14 ], [ %9, %.thread3 ], [ %6, %.thread2 ]  ; result of whichever xchg ran
+  %19 = bitcast i64 %.sroa.0.0 to double
+  ret double %19
+}
+
+; Function Attrs: nounwind uwtable -- strong compare-exchange on the 64-bit pattern of a double at %object (addrspace 1); returns the cmpxchg success bit
+define zeroext i1 @_Z43pocl_atomic_compare_exchange_strong__globalPVU8CLglobalU7_AtomicdPdd12memory_orderS3_12memory_scope(double addrspace(1)* %object, double* nocapture %expected, double %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0  ; blocks %0..%6 remap %success into the code held in %10; %scope is never read
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]  ; remapped %success: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %11 = bitcast double %desired to i64  ; cmpxchg operates on the raw 64-bit pattern
+  %12 = icmp eq i32 %failure, 0  ; blocks %9..%17 apply the same remapping to %failure
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]  ; remapped %failure: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %22 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  %23 = bitcast double* %expected to i64*
+  switch i32 %10, label %31 [  ; outer dispatch on the success code; default (%10 == 0) -> monotonic
+    i32 1, label %24  ; 1 is never produced by the %10 phi above
+    i32 2, label %24  ; 2 -> acquire on success
+    i32 3, label %53  ; 3 -> release on success
+    i32 4, label %26  ; 4 -> acq_rel on success
+    i32 5, label %28  ; 5 -> seq_cst on success
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2  ; true when %21 is 1 or 2 -> acquire on failure, otherwise monotonic
+  %25 = load i64, i64* %23, align 8
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2  ; true when %21 is 1 or 2 -> acquire on failure, otherwise monotonic
+  %27 = load i64, i64* %23, align 8
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [  ; seq_cst success: choose the failure ordering; default -> monotonic
+    i32 1, label %79  ; 1 or 2 -> acquire on failure
+    i32 2, label %79
+    i32 5, label %83  ; 5 -> seq_cst on failure
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]  ; success flag from whichever cmpxchg variant ran
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i64, i64* %23, align 8  ; comparand is the current contents of *%expected
+  %33 = cmpxchg volatile i64 addrspace(1)* %22, i64 %32, i64 %11 monotonic monotonic
+  %34 = extractvalue { i64, i1 } %33, 1  ; field 1 of the cmpxchg result is the success bit
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i64, i1 } %33, 0  ; failure path: write field 0 of the cmpxchg result back to *%expected
+  store i64 %36, i64* %23, align 8
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg volatile i64 addrspace(1)* %22, i64 %25, i64 %11 acquire monotonic
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg volatile i64 addrspace(1)* %22, i64 %25, i64 %11 acquire acquire
+  %44 = extractvalue { i64, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i64, i1 } %40, 0
+  store i64 %46, i64* %23, align 8
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i64, i1 } %43, 0
+  store i64 %50, i64* %23, align 8
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i64, i64* %23, align 8
+  %55 = cmpxchg volatile i64 addrspace(1)* %22, i64 %54, i64 %11 release monotonic
+  %56 = extractvalue { i64, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i64, i1 } %55, 0
+  store i64 %58, i64* %23, align 8
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg volatile i64 addrspace(1)* %22, i64 %27, i64 %11 acq_rel monotonic
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg volatile i64 addrspace(1)* %22, i64 %27, i64 %11 acq_rel acquire
+  %66 = extractvalue { i64, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i64, i1 } %62, 0
+  store i64 %68, i64* %23, align 8
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i64, i1 } %65, 0
+  store i64 %72, i64* %23, align 8
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i64, i64* %23, align 8
+  %77 = cmpxchg volatile i64 addrspace(1)* %22, i64 %76, i64 %11 seq_cst monotonic
+  %78 = extractvalue { i64, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i64, i64* %23, align 8
+  %81 = cmpxchg volatile i64 addrspace(1)* %22, i64 %80, i64 %11 seq_cst acquire
+  %82 = extractvalue { i64, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i64, i64* %23, align 8
+  %85 = cmpxchg volatile i64 addrspace(1)* %22, i64 %84, i64 %11 seq_cst seq_cst
+  %86 = extractvalue { i64, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i64, i1 } %77, 0
+  store i64 %88, i64* %23, align 8
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i64, i1 } %81, 0
+  store i64 %92, i64* %23, align 8
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i64, i1 } %85, 0
+  store i64 %96, i64* %23, align 8
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable -- weak compare-exchange on the 64-bit pattern of a double at %object (addrspace 1); returns the cmpxchg success bit
+define zeroext i1 @_Z41pocl_atomic_compare_exchange_weak__globalPVU8CLglobalU7_AtomicdPdd12memory_orderS3_12memory_scope(double addrspace(1)* %object, double* nocapture %expected, double %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0  ; blocks %0..%6 remap %success into the code held in %10; %scope is never read
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]  ; remapped %success: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %11 = bitcast double %desired to i64  ; cmpxchg operates on the raw 64-bit pattern
+  %12 = icmp eq i32 %failure, 0  ; blocks %9..%17 apply the same remapping to %failure
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]  ; remapped %failure: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %22 = bitcast double addrspace(1)* %object to i64 addrspace(1)*
+  %23 = bitcast double* %expected to i64*
+  switch i32 %10, label %31 [  ; outer dispatch on the success code; default (%10 == 0) -> monotonic
+    i32 1, label %24  ; 1 is never produced by the %10 phi above
+    i32 2, label %24  ; 2 -> acquire on success
+    i32 3, label %53  ; 3 -> release on success
+    i32 4, label %26  ; 4 -> acq_rel on success
+    i32 5, label %28  ; 5 -> seq_cst on success
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2  ; true when %21 is 1 or 2 -> acquire on failure, otherwise monotonic
+  %25 = load i64, i64* %23, align 8
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2  ; true when %21 is 1 or 2 -> acquire on failure, otherwise monotonic
+  %27 = load i64, i64* %23, align 8
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [  ; seq_cst success: choose the failure ordering; default -> monotonic
+    i32 1, label %79  ; 1 or 2 -> acquire on failure
+    i32 2, label %79
+    i32 5, label %83  ; 5 -> seq_cst on failure
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]  ; success flag from whichever cmpxchg variant ran
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i64, i64* %23, align 8  ; comparand is the current contents of *%expected
+  %33 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %32, i64 %11 monotonic monotonic
+  %34 = extractvalue { i64, i1 } %33, 1  ; field 1 of the cmpxchg result is the success bit
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i64, i1 } %33, 0  ; failure path: write field 0 of the cmpxchg result back to *%expected
+  store i64 %36, i64* %23, align 8
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %25, i64 %11 acquire monotonic
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %25, i64 %11 acquire acquire
+  %44 = extractvalue { i64, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i64, i1 } %40, 0
+  store i64 %46, i64* %23, align 8
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i64, i1 } %43, 0
+  store i64 %50, i64* %23, align 8
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i64, i64* %23, align 8
+  %55 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %54, i64 %11 release monotonic
+  %56 = extractvalue { i64, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i64, i1 } %55, 0
+  store i64 %58, i64* %23, align 8
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %27, i64 %11 acq_rel monotonic
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %27, i64 %11 acq_rel acquire
+  %66 = extractvalue { i64, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i64, i1 } %62, 0
+  store i64 %68, i64* %23, align 8
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i64, i1 } %65, 0
+  store i64 %72, i64* %23, align 8
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i64, i64* %23, align 8
+  %77 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %76, i64 %11 seq_cst monotonic
+  %78 = extractvalue { i64, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i64, i64* %23, align 8
+  %81 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %80, i64 %11 seq_cst acquire
+  %82 = extractvalue { i64, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i64, i64* %23, align 8
+  %85 = cmpxchg weak volatile i64 addrspace(1)* %22, i64 %84, i64 %11 seq_cst seq_cst
+  %86 = extractvalue { i64, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i64, i1 } %77, 0
+  store i64 %88, i64* %23, align 8
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i64, i1 } %81, 0
+  store i64 %92, i64* %23, align 8
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i64, i1 } %85, 0
+  store i64 %96, i64* %23, align 8
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; pocl_atomic_flag_test_and_set (the overload whose mangled name carries "__local").
+; Atomically exchanges the i32 at %object (addrspace(2)) with the constant 1 and
+; returns true when the value produced by the exchange is non-zero.
+; %order selects the LLVM ordering of the exchange:
+;   0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, anything else -> seq_cst.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z36pocl_atomic_flag_test_and_set__localPVU7CLlocalU7_Atomici12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 1 seq_cst
+  br label %8
+
+; Common exit: merge the exchange result from whichever ordering ran and
+; convert it to a boolean.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  %9 = icmp ne i32 %.0, 0
+  ret i1 %9
+}
+
+; pocl_atomic_flag_clear (the overload whose mangled name carries "__local").
+; Atomically stores 0 to the i32 at %object (addrspace(2)).
+; %order selects the LLVM ordering of the store:
+;   2 -> release; 0, 1 and 3 -> monotonic; anything else -> seq_cst.
+; NOTE(review): 1 and 3 select acquire/acq_rel in the sibling exchange functions;
+; they presumably fall back to monotonic here because LLVM does not allow those
+; orderings on a store -- confirm against the LLVM LangRef.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define void @_Z29pocl_atomic_flag_clear__localPVU7CLlocalU7_Atomici12memory_order12memory_scope(i32 addrspace(2)* nocapture %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 0, i32 addrspace(2)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 0, i32 addrspace(2)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 0, i32 addrspace(2)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; pocl_atomic_store, i32 overload (mangled parameter types "ii"), "__local" variant.
+; Atomically stores %desired to the i32 at %object (addrspace(2)).
+; %order selects the LLVM ordering of the store:
+;   2 -> release; 0, 1 and 3 -> monotonic; anything else -> seq_cst.
+; NOTE(review): 1 and 3 presumably fall back to monotonic because LLVM does not
+; allow acquire/acq_rel on a store -- confirm against the LLVM LangRef.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define void @_Z24pocl_atomic_store__localPVU7CLlocalU7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* nocapture %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; pocl_atomic_load, i32 overload (mangled parameter type "i"), "__local" variant.
+; Atomically loads and returns the i32 at %object (addrspace(2)).
+; %order selects the LLVM ordering of the load:
+;   1 -> acquire; 0, 2 and 3 -> monotonic; anything else -> seq_cst.
+; NOTE(review): 2 and 3 presumably fall back to monotonic because LLVM does not
+; allow release/acq_rel on a load -- confirm against the LLVM LangRef.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z23pocl_atomic_load__localPVU7CLlocalU7_Atomici12memory_order12memory_scope(i32 addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i32, i32 addrspace(2)* %object monotonic, align 4
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i32, i32 addrspace(2)* %object acquire, align 4
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i32, i32 addrspace(2)* %object seq_cst, align 4
+  br label %5
+
+; Common exit: return the value loaded by whichever ordering ran.
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_exchange, i32 overload (mangled parameter types "ii"), "__local" variant.
+; Atomically exchanges the i32 at %object (addrspace(2)) with %desired and
+; returns the value produced by the atomicrmw xchg.
+; %order selects the LLVM ordering of the exchange:
+;   0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, anything else -> seq_cst.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z27pocl_atomic_exchange__localPVU7CLlocalU7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired seq_cst
+  br label %8
+
+; Common exit: return the exchange result from whichever ordering ran.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_compare_exchange_strong, i32 overload, "__local" variant.
+; Performs a (non-weak) cmpxchg on the i32 at %object (addrspace(2)): the
+; comparand is loaded from *%expected and the replacement is %desired.
+; When the cmpxchg reports failure, the value it observed is written back to
+; *%expected. The function returns the cmpxchg success flag.
+;
+; %success and %failure are first remapped (into %10 and %20) as
+;   0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, anything else -> 5
+; NOTE(review): the remapped numbers look like the C11/__ATOMIC_* numbering
+; (0 relaxed, 2 acquire, 3 release, 4 acq_rel, 5 seq_cst) -- confirm.
+;
+; Resulting (success, failure) ordering pairs, by remapped values:
+;   %10 = 1 or 2 : acquire  / acquire if %20 is 1 or 2, else monotonic
+;   %10 = 3      : release  / monotonic
+;   %10 = 4      : acq_rel  / acquire if %20 is 1 or 2, else monotonic
+;   %10 = 5      : seq_cst  / acquire if %20 is 1 or 2, seq_cst if %20 is 5,
+;                             else monotonic
+;   otherwise    : monotonic / monotonic
+; (The remapping phi never yields 1, so only the "2" half of each "1 or 2"
+; case is reachable.)
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU7CLlocalU7_AtomiciPii12memory_orderS3_12memory_scope(i32 addrspace(2)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; %10 is the remapped %success; the same compare chain is then repeated for %failure.
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; %20 is the remapped %failure; dispatch on the remapped success ordering.
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; Success ordering acquire: (%20 - 1) <u 2 tests whether %20 is 1 or 2.
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; Success ordering acq_rel: same range test on %20.
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; Success ordering seq_cst: pick the failure ordering from %20.
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; Common exit: every cmpxchg variant funnels its zero-extended success flag here.
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; Each variant below follows the same shape: cmpxchg, branch on the success
+; flag, store the observed value to *%expected on failure, then zext the flag.
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg volatile i32 addrspace(2)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg volatile i32 addrspace(2)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg volatile i32 addrspace(2)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg volatile i32 addrspace(2)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg volatile i32 addrspace(2)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_compare_exchange_weak, i32 overload, "__local" variant.
+; Performs a `cmpxchg weak` on the i32 at %object (addrspace(2)): the comparand
+; is loaded from *%expected and the replacement is %desired.
+; When the cmpxchg reports failure, the value it observed is written back to
+; *%expected. The function returns the cmpxchg success flag.
+;
+; %success and %failure are first remapped (into %10 and %20) as
+;   0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, anything else -> 5
+; NOTE(review): the remapped numbers look like the C11/__ATOMIC_* numbering
+; (0 relaxed, 2 acquire, 3 release, 4 acq_rel, 5 seq_cst) -- confirm.
+;
+; Resulting (success, failure) ordering pairs, by remapped values:
+;   %10 = 1 or 2 : acquire  / acquire if %20 is 1 or 2, else monotonic
+;   %10 = 3      : release  / monotonic
+;   %10 = 4      : acq_rel  / acquire if %20 is 1 or 2, else monotonic
+;   %10 = 5      : seq_cst  / acquire if %20 is 1 or 2, seq_cst if %20 is 5,
+;                             else monotonic
+;   otherwise    : monotonic / monotonic
+; (The remapping phi never yields 1, so only the "2" half of each "1 or 2"
+; case is reachable.)
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU7CLlocalU7_AtomiciPii12memory_orderS3_12memory_scope(i32 addrspace(2)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; %10 is the remapped %success; the same compare chain is then repeated for %failure.
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; %20 is the remapped %failure; dispatch on the remapped success ordering.
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; Success ordering acquire: (%20 - 1) <u 2 tests whether %20 is 1 or 2.
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; Success ordering acq_rel: same range test on %20.
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; Success ordering seq_cst: pick the failure ordering from %20.
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; Common exit: every cmpxchg variant funnels its zero-extended success flag here.
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; Each variant below follows the same shape: cmpxchg weak, branch on the success
+; flag, store the observed value to *%expected on failure, then zext the flag.
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_fetch_add, i32 overload (mangled parameter types "ii"), "__local" variant.
+; Atomically adds %operand to the i32 at %object (addrspace(2)) and returns the
+; value produced by the atomicrmw add.
+; %order selects the LLVM ordering:
+;   0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, anything else -> seq_cst.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_add__localPVU7CLlocalU7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; Common exit: return the atomicrmw result from whichever ordering ran.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_sub, i32 overload (mangled parameter types "ii"), "__local" variant.
+; Atomically subtracts %operand from the i32 at %object (addrspace(2)) and
+; returns the value produced by the atomicrmw sub.
+; %order selects the LLVM ordering:
+;   0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, anything else -> seq_cst.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_sub__localPVU7CLlocalU7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; Common exit: return the atomicrmw result from whichever ordering ran.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_or, i32 overload (mangled parameter types "ii"), "__local" variant.
+; Atomically ORs %operand into the i32 at %object (addrspace(2)) and returns the
+; value produced by the atomicrmw or.
+; %order selects the LLVM ordering:
+;   0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, anything else -> seq_cst.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z27pocl_atomic_fetch_or__localPVU7CLlocalU7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; Common exit: return the atomicrmw result from whichever ordering ran.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_xor, i32 overload (mangled parameter types "ii"), "__local" variant.
+; Atomically XORs %operand into the i32 at %object (addrspace(2)) and returns
+; the value produced by the atomicrmw xor.
+; %order selects the LLVM ordering:
+;   0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, anything else -> seq_cst.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_xor__localPVU7CLlocalU7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; Common exit: return the atomicrmw result from whichever ordering ran.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_and, i32 overload (mangled parameter types "ii"), "__local" variant.
+; Atomically ANDs %operand into the i32 at %object (addrspace(2)) and returns
+; the value produced by the atomicrmw and.
+; %order selects the LLVM ordering:
+;   0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, anything else -> seq_cst.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_and__localPVU7CLlocalU7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; Common exit: return the atomicrmw result from whichever ordering ran.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_min, i32 overload (mangled parameter types "ii"), "__local" variant.
+; Applies `atomicrmw min` (not `umin`) with %operand to the i32 at %object
+; (addrspace(2)) and returns the value produced by the atomicrmw.
+; %order selects the LLVM ordering:
+;   0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, anything else -> seq_cst.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_min__localPVU7CLlocalU7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile min i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; Common exit: return the atomicrmw result from whichever ordering ran.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_max, i32 overload (mangled parameter types "ii"), "__local" variant.
+; Applies `atomicrmw max` (not `umax`) with %operand to the i32 at %object
+; (addrspace(2)) and returns the value produced by the atomicrmw.
+; %order selects the LLVM ordering:
+;   0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel, anything else -> seq_cst.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_max__localPVU7CLlocalU7_Atomicii12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile max i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; Common exit: return the atomicrmw result from whichever ordering ran.
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_store, overload with mangled parameter types "jj" (unsigned int
+; in Itanium mangling), "__local" variant. The body matches the "ii" overload.
+; Atomically stores %desired to the i32 at %object (addrspace(2)).
+; %order selects the LLVM ordering of the store:
+;   2 -> release; 0, 1 and 3 -> monotonic; anything else -> seq_cst.
+; NOTE(review): 1 and 3 presumably fall back to monotonic because LLVM does not
+; allow acquire/acq_rel on a store -- confirm against the LLVM LangRef.
+; %scope is never read in the body.
+; Function Attrs: nounwind uwtable
+define void @_Z24pocl_atomic_store__localPVU7CLlocalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* nocapture %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object monotonic, align 4
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object release, align 4
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i32 %desired, i32 addrspace(2)* %object seq_cst, align 4
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; pocl_atomic_load__local(volatile __local _Atomic uint *, memory_order,
+; memory_scope), per the Itanium-mangled name below.
+; Atomically (and volatile) loads an i32 from %object with 4-byte alignment
+; and returns it. The ordering is selected at run time:
+;   %order 0, 2 or 3 -> monotonic, 1 -> acquire, any other value -> seq_cst.
+; NOTE(review): 2 and 3 map to release/acq_rel in the sibling read-modify-write
+; functions; they presumably fall back to monotonic here because those
+; orderings are not valid for a load -- confirm against the C source.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z23pocl_atomic_load__localPVU7CLlocalU7_Atomicj12memory_order12memory_scope(i32 addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i32, i32 addrspace(2)* %object monotonic, align 4
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i32, i32 addrspace(2)* %object acquire, align 4
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  ; Default edge: every %order value outside 0..3.
+  %4 = load atomic volatile i32, i32 addrspace(2)* %object seq_cst, align 4
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  ; Merge the value produced by whichever ordering variant executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_exchange__local(volatile __local _Atomic uint *, uint,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Performs a volatile `atomicrmw xchg` that writes %desired to %object and
+; returns the instruction's result (the value read from %object before the
+; update, per LLVM `atomicrmw` semantics). The ordering is selected at run time:
+;   %order 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;   any other value -> seq_cst.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z27pocl_atomic_exchange__localPVU7CLlocalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %desired, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the run-time memory order; the default edge (%6) handles
+  ; every value outside 0..3.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i32 addrspace(2)* %object, i32 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering variant actually executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_compare_exchange_strong__local(volatile __local _Atomic uint *,
+; uint *expected, uint desired, memory_order success, memory_order failure,
+; memory_scope), per the Itanium-mangled name below.
+;
+; Step 1: %success and %failure are each remapped to an internal code
+;   0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5
+; (phis %10 and %20). Code 1 is never produced by those phis, so the
+; `i32 1` switch cases below cannot be taken.
+;
+; Step 2: the (success code, failure code) pair selects one strong, volatile
+; `cmpxchg` with constant success / failure orderings:
+;   success 0 -> monotonic / monotonic
+;   success 2 -> acquire / acquire if failure code is 1 or 2,
+;                else acquire / monotonic
+;   success 3 -> release / monotonic
+;   success 4 -> acq_rel / acquire if failure code is 1 or 2,
+;                else acq_rel / monotonic
+;   success 5 -> seq_cst / acquire if failure code is 1 or 2,
+;                seq_cst / seq_cst if failure code is 5,
+;                else seq_cst / monotonic
+;
+; Step 3: when the exchange fails, the value returned by `cmpxchg` (the
+; contents observed at %object) is stored back to *%expected. The success
+; flag is returned as i1.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU7CLlocalU7_AtomicjPjj12memory_orderS3_12memory_scope(i32 addrspace(2)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Remap %success into the internal ordering code (result in %10).
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  ; Remap %failure the same way (result in %20).
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Outer dispatch on the success code; the default edge (%28) is the
+  ; monotonic case (code 0).
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Success = acquire. (%20 - 1) <u 2 is true exactly when %20 is 1 or 2.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Success = acq_rel. Same failure-code test as in block %21.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Success = seq_cst. Inner dispatch on the failure code.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: merge the i8 success flag from every variant and return it
+  ; as i1.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg volatile i32 addrspace(2)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Exchange failed: write the observed value back to *%expected. The same
+  ; pattern follows every cmpxchg variant below.
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  ; Success = release: the failure ordering is always monotonic here.
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg volatile i32 addrspace(2)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg volatile i32 addrspace(2)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg volatile i32 addrspace(2)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg volatile i32 addrspace(2)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_compare_exchange_weak__local(volatile __local _Atomic uint *,
+; uint *expected, uint desired, memory_order success, memory_order failure,
+; memory_scope), per the Itanium-mangled name below.
+;
+; Step 1: %success and %failure are each remapped to an internal code
+;   0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5
+; (phis %10 and %20). Code 1 is never produced by those phis, so the
+; `i32 1` switch cases below cannot be taken.
+;
+; Step 2: the (success code, failure code) pair selects one `cmpxchg weak
+; volatile` with constant success / failure orderings:
+;   success 0 -> monotonic / monotonic
+;   success 2 -> acquire / acquire if failure code is 1 or 2,
+;                else acquire / monotonic
+;   success 3 -> release / monotonic
+;   success 4 -> acq_rel / acquire if failure code is 1 or 2,
+;                else acq_rel / monotonic
+;   success 5 -> seq_cst / acquire if failure code is 1 or 2,
+;                seq_cst / seq_cst if failure code is 5,
+;                else seq_cst / monotonic
+;
+; Step 3: when the exchange fails, the value returned by `cmpxchg` (the
+; contents observed at %object) is stored back to *%expected. The success
+; flag is returned as i1.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU7CLlocalU7_AtomicjPjj12memory_orderS3_12memory_scope(i32 addrspace(2)* %object, i32* nocapture %expected, i32 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  ; Remap %success into the internal ordering code (result in %10).
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  ; Remap %failure the same way (result in %20).
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Outer dispatch on the success code; the default edge (%28) is the
+  ; monotonic case (code 0).
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; Success = acquire. (%20 - 1) <u 2 is true exactly when %20 is 1 or 2.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i32, i32* %expected, align 4
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; Success = acq_rel. Same failure-code test as in block %21.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i32, i32* %expected, align 4
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  ; Success = seq_cst. Inner dispatch on the failure code.
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: merge the i8 success flag from every variant and return it
+  ; as i1.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i32, i32* %expected, align 4
+  %30 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %29, i32 %desired monotonic monotonic
+  %31 = extractvalue { i32, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Exchange failed: write the observed value back to *%expected. The same
+  ; pattern follows every cmpxchg variant below.
+  %33 = extractvalue { i32, i1 } %30, 0
+  store i32 %33, i32* %expected, align 4
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire monotonic
+  %38 = extractvalue { i32, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %22, i32 %desired acquire acquire
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i32, i1 } %37, 0
+  store i32 %43, i32* %expected, align 4
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i32, i1 } %40, 0
+  store i32 %47, i32* %expected, align 4
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  ; Success = release: the failure ordering is always monotonic here.
+  %51 = load i32, i32* %expected, align 4
+  %52 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %51, i32 %desired release monotonic
+  %53 = extractvalue { i32, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i32, i1 } %52, 0
+  store i32 %55, i32* %expected, align 4
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel monotonic
+  %60 = extractvalue { i32, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %24, i32 %desired acq_rel acquire
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i32, i1 } %59, 0
+  store i32 %65, i32* %expected, align 4
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i32, i1 } %62, 0
+  store i32 %69, i32* %expected, align 4
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i32, i32* %expected, align 4
+  %74 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %73, i32 %desired seq_cst monotonic
+  %75 = extractvalue { i32, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i32, i32* %expected, align 4
+  %78 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %77, i32 %desired seq_cst acquire
+  %79 = extractvalue { i32, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i32, i32* %expected, align 4
+  %82 = cmpxchg weak volatile i32 addrspace(2)* %object, i32 %81, i32 %desired seq_cst seq_cst
+  %83 = extractvalue { i32, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i32, i1 } %74, 0
+  store i32 %85, i32* %expected, align 4
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i32, i1 } %78, 0
+  store i32 %89, i32* %expected, align 4
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i32, i1 } %82, 0
+  store i32 %93, i32* %expected, align 4
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_fetch_add__local(volatile __local _Atomic uint *, uint,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Performs a volatile `atomicrmw add` of %operand on %object and returns the
+; instruction's result (the value read from %object before the update, per
+; LLVM `atomicrmw` semantics). The ordering is selected at run time:
+;   %order 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;   any other value -> seq_cst.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_add__localPVU7CLlocalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the run-time memory order; the default edge (%6) handles
+  ; every value outside 0..3.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering variant actually executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_sub__local(volatile __local _Atomic uint *, uint,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Performs a volatile `atomicrmw sub` of %operand on %object and returns the
+; instruction's result (the value read from %object before the update, per
+; LLVM `atomicrmw` semantics). The ordering is selected at run time:
+;   %order 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;   any other value -> seq_cst.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_sub__localPVU7CLlocalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the run-time memory order; the default edge (%6) handles
+  ; every value outside 0..3.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering variant actually executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_or__local(volatile __local _Atomic uint *, uint,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Performs a volatile `atomicrmw or` of %operand on %object and returns the
+; instruction's result (the value read from %object before the update, per
+; LLVM `atomicrmw` semantics). The ordering is selected at run time:
+;   %order 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;   any other value -> seq_cst.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z27pocl_atomic_fetch_or__localPVU7CLlocalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the run-time memory order; the default edge (%6) handles
+  ; every value outside 0..3.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering variant actually executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_xor__local(volatile __local _Atomic uint *, uint,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Performs a volatile `atomicrmw xor` of %operand on %object and returns the
+; instruction's result (the value read from %object before the update, per
+; LLVM `atomicrmw` semantics). The ordering is selected at run time:
+;   %order 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;   any other value -> seq_cst.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_xor__localPVU7CLlocalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the run-time memory order; the default edge (%6) handles
+  ; every value outside 0..3.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering variant actually executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_and__local(volatile __local _Atomic uint *, uint,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Performs a volatile `atomicrmw and` of %operand on %object and returns the
+; instruction's result (the value read from %object before the update, per
+; LLVM `atomicrmw` semantics). The ordering is selected at run time:
+;   %order 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;   any other value -> seq_cst.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_and__localPVU7CLlocalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the run-time memory order; the default edge (%6) handles
+  ; every value outside 0..3.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering variant actually executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_min__local(volatile __local _Atomic uint *, uint,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Performs a volatile unsigned-minimum `atomicrmw umin` of %operand on
+; %object and returns the instruction's result (the value read from %object
+; before the update, per LLVM `atomicrmw` semantics). The ordering is
+; selected at run time:
+;   %order 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;   any other value -> seq_cst.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_min__localPVU7CLlocalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the run-time memory order; the default edge (%6) handles
+  ; every value outside 0..3.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umin i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering variant actually executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_fetch_max__local(volatile __local _Atomic uint *, uint,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Performs a volatile unsigned-maximum `atomicrmw umax` of %operand on
+; %object and returns the instruction's result (the value read from %object
+; before the update, per LLVM `atomicrmw` semantics). The ordering is
+; selected at run time:
+;   %order 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;   any other value -> seq_cst.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define i32 @_Z28pocl_atomic_fetch_max__localPVU7CLlocalU7_Atomicjj12memory_order12memory_scope(i32 addrspace(2)* %object, i32 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the run-time memory order; the default edge (%6) handles
+  ; every value outside 0..3.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umax i32 addrspace(2)* %object, i32 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering variant actually executed.
+  %.0 = phi i32 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i32 %.0
+}
+
+; pocl_atomic_store__local(volatile __local _Atomic float *, float,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Stores %desired to %object as an atomic, volatile i32 store: the float is
+; bitcast to i32 and the pointer to an i32 pointer, so the bit pattern is
+; written unchanged. The ordering is selected at run time:
+;   %order 0, 1 or 3 -> monotonic, 2 -> release, any other value -> seq_cst.
+; NOTE(review): 1 and 3 presumably fall back to monotonic because acquire and
+; acq_rel are not valid store orderings -- confirm against the C source.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define void @_Z24pocl_atomic_store__localPVU7CLlocalU7_Atomicff12memory_order12memory_scope(float addrspace(2)* nocapture %object, float %desired, i32 %order, i32 %scope) #0 {
+  ; Orders 0..2 are dispatched here; 3 and everything else go to block %5.
+  switch i32 %order, label %5 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast float %desired to i32
+  %2 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  store atomic volatile i32 %1, i32 addrspace(2)* %2 release, align 4
+  br label %13
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast float %desired to i32
+  %4 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br label %9
+
+; <label>:5                                       ; preds = %0
+  ; %order == 3 joins the monotonic store in %9; any other value takes the
+  ; seq_cst store in %12.
+  %6 = icmp eq i32 %order, 3
+  %7 = bitcast float %desired to i32
+  %8 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br i1 %6, label %9, label %12
+
+; <label>:9                                       ; preds = %5, %.thread
+  ; Shared monotonic store; the phis pick up the bitcasts made by the
+  ; predecessor that was actually taken.
+  %10 = phi i32 addrspace(2)* [ %4, %.thread ], [ %8, %5 ]
+  %11 = phi i32 [ %3, %.thread ], [ %7, %5 ]
+  store atomic volatile i32 %11, i32 addrspace(2)* %10 monotonic, align 4
+  br label %13
+
+; <label>:12                                      ; preds = %5
+  store atomic volatile i32 %7, i32 addrspace(2)* %8 seq_cst, align 4
+  br label %13
+
+; <label>:13                                      ; preds = %12, %.thread1, %9
+  ret void
+}
+
+; pocl_atomic_load__local(volatile __local _Atomic float *, memory_order,
+; memory_scope), per the Itanium-mangled name below.
+; Loads the value at %object as an atomic, volatile i32 (through a pointer
+; bitcast) and returns that bit pattern bitcast to float. The ordering is
+; selected at run time:
+;   %order 0, 2 or 3 -> monotonic, 1 -> acquire, any other value -> seq_cst.
+; NOTE(review): 2 and 3 presumably fall back to monotonic because release and
+; acq_rel are not valid load orderings -- confirm against the C source.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define float @_Z23pocl_atomic_load__localPVU7CLlocalU7_Atomicf12memory_order12memory_scope(float addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  ; Orders 0..2 are dispatched here; 3 and everything else go to block %4.
+  switch i32 %order, label %4 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %2 = load atomic volatile i32, i32 addrspace(2)* %1 acquire, align 4
+  br label %12
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br label %7
+
+; <label>:4                                       ; preds = %0
+  ; %order == 3 joins the monotonic load in %7; any other value takes the
+  ; seq_cst load in %10.
+  %5 = icmp eq i32 %order, 3
+  %6 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br i1 %5, label %7, label %10
+
+; <label>:7                                       ; preds = %4, %.thread
+  %8 = phi i32 addrspace(2)* [ %3, %.thread ], [ %6, %4 ]
+  %9 = load atomic volatile i32, i32 addrspace(2)* %8 monotonic, align 4
+  br label %12
+
+; <label>:10                                      ; preds = %4
+  %11 = load atomic volatile i32, i32 addrspace(2)* %6 seq_cst, align 4
+  br label %12
+
+; <label>:12                                      ; preds = %10, %.thread1, %7
+  ; Merge the loaded bits and reinterpret them as float for the return value.
+  %.sroa.0.0 = phi i32 [ %9, %7 ], [ %11, %10 ], [ %2, %.thread1 ]
+  %13 = bitcast i32 %.sroa.0.0 to float
+  ret float %13
+}
+
+; pocl_atomic_exchange__local(volatile __local _Atomic float *, float,
+; memory_order, memory_scope), per the Itanium-mangled name below.
+; Exchanges the value at %object with %desired using a volatile
+; `atomicrmw xchg` on the i32 bit pattern (value and pointer are bitcast),
+; and returns the instruction's result bitcast back to float (the value read
+; from %object before the update, per LLVM `atomicrmw` semantics).
+; The ordering is selected at run time:
+;   %order 0 -> monotonic, 1 -> acquire, 2 -> release, 3 -> acq_rel,
+;   any other value -> seq_cst.
+; %scope is accepted but never referenced in the body.
+; Function Attrs: nounwind uwtable
+define float @_Z27pocl_atomic_exchange__localPVU7CLlocalU7_Atomicff12memory_order12memory_scope(float addrspace(2)* %object, float %desired, i32 %order, i32 %scope) #0 {
+  ; Orders 0..2 are dispatched here; 3 and everything else go to block %10.
+  switch i32 %order, label %10 [
+    i32 0, label %.thread
+    i32 1, label %.thread2
+    i32 2, label %.thread3
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = bitcast float %desired to i32
+  %2 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %3 = atomicrmw volatile xchg i32 addrspace(2)* %2, i32 %1 monotonic
+  br label %18
+
+.thread2:                                         ; preds = %0
+  %4 = bitcast float %desired to i32
+  %5 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %6 = atomicrmw volatile xchg i32 addrspace(2)* %5, i32 %4 acquire
+  br label %18
+
+.thread3:                                         ; preds = %0
+  %7 = bitcast float %desired to i32
+  %8 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %9 = atomicrmw volatile xchg i32 addrspace(2)* %8, i32 %7 release
+  br label %18
+
+; <label>:10                                      ; preds = %0
+  ; %order == 3 selects acq_rel (%14); any other value selects seq_cst (%16).
+  %11 = icmp eq i32 %order, 3
+  %12 = bitcast float %desired to i32
+  %13 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  br i1 %11, label %14, label %16
+
+; <label>:14                                      ; preds = %10
+  %15 = atomicrmw volatile xchg i32 addrspace(2)* %13, i32 %12 acq_rel
+  br label %18
+
+; <label>:16                                      ; preds = %10
+  %17 = atomicrmw volatile xchg i32 addrspace(2)* %13, i32 %12 seq_cst
+  br label %18
+
+; <label>:18                                      ; preds = %16, %14, %.thread3, %.thread2, %.thread
+  ; Merge the exchanged bits and reinterpret them as float for the return value.
+  %.sroa.0.0 = phi i32 [ %3, %.thread ], [ %17, %16 ], [ %15, %14 ], [ %9, %.thread3 ], [ %6, %.thread2 ]
+  %19 = bitcast i32 %.sroa.0.0 to float
+  ret float %19
+}
+
+; Function Attrs: nounwind uwtable -- strong compare-exchange on a float in addrspace(2), performed on its i32 bit pattern; on failure the observed value is stored to *%expected; returns the cmpxchg success flag; %scope is not read.
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU7CLlocalU7_AtomicfPff12memory_orderS3_12memory_scope(float addrspace(2)* %object, float* nocapture %expected, float %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; remapped %success: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %11 = bitcast float %desired to i32 ; float bits reinterpreted as i32 for the integer cmpxchg
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ] ; remapped %failure, same mapping as %10
+  %22 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %23 = bitcast float* %expected to i32*
+  switch i32 %10, label %31 [ ; choose the success ordering; the default target %31 uses monotonic
+    i32 1, label %24 ; 1 or 2 -> acquire
+    i32 2, label %24
+    i32 3, label %53 ; 3 -> release
+    i32 4, label %26 ; 4 -> acq_rel
+    i32 5, label %28 ; 5 -> seq_cst
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2 ; true when remapped failure order is 1 or 2
+  %25 = load i32, i32* %23, align 4
+  br i1 %switch, label %42, label %39 ; failure ordering: acquire (%42) if true, else monotonic (%39)
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2 ; true when remapped failure order is 1 or 2
+  %27 = load i32, i32* %23, align 4
+  br i1 %switch2, label %64, label %61 ; failure ordering: acquire (%64) if true, else monotonic (%61)
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [ ; seq_cst success: pick failure ordering; default %75 is monotonic
+    i32 1, label %79 ; 1 or 2 -> acquire
+    i32 2, label %79
+    i32 5, label %83 ; 5 -> seq_cst
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ] ; common exit: zero-extended success flag from whichever cmpxchg ran
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i32, i32* %23, align 4 ; current *%expected is the comparison value
+  %33 = cmpxchg volatile i32 addrspace(2)* %22, i32 %32, i32 %11 monotonic monotonic
+  %34 = extractvalue { i32, i1 } %33, 1 ; field 1 of the result is the success bit
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i32, i1 } %33, 0 ; field 0 is the value that was in memory
+  store i32 %36, i32* %23, align 4 ; failure path: write the observed value back to *%expected
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg volatile i32 addrspace(2)* %22, i32 %25, i32 %11 acquire monotonic
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg volatile i32 addrspace(2)* %22, i32 %25, i32 %11 acquire acquire
+  %44 = extractvalue { i32, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i32, i1 } %40, 0
+  store i32 %46, i32* %23, align 4
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i32, i1 } %43, 0
+  store i32 %50, i32* %23, align 4
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i32, i32* %23, align 4
+  %55 = cmpxchg volatile i32 addrspace(2)* %22, i32 %54, i32 %11 release monotonic ; release success always pairs with monotonic failure here
+  %56 = extractvalue { i32, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i32, i1 } %55, 0
+  store i32 %58, i32* %23, align 4
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg volatile i32 addrspace(2)* %22, i32 %27, i32 %11 acq_rel monotonic
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg volatile i32 addrspace(2)* %22, i32 %27, i32 %11 acq_rel acquire
+  %66 = extractvalue { i32, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i32, i1 } %62, 0
+  store i32 %68, i32* %23, align 4
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i32, i1 } %65, 0
+  store i32 %72, i32* %23, align 4
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i32, i32* %23, align 4
+  %77 = cmpxchg volatile i32 addrspace(2)* %22, i32 %76, i32 %11 seq_cst monotonic
+  %78 = extractvalue { i32, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i32, i32* %23, align 4
+  %81 = cmpxchg volatile i32 addrspace(2)* %22, i32 %80, i32 %11 seq_cst acquire
+  %82 = extractvalue { i32, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i32, i32* %23, align 4
+  %85 = cmpxchg volatile i32 addrspace(2)* %22, i32 %84, i32 %11 seq_cst seq_cst
+  %86 = extractvalue { i32, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i32, i1 } %77, 0
+  store i32 %88, i32* %23, align 4
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i32, i1 } %81, 0
+  store i32 %92, i32* %23, align 4
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i32, i1 } %85, 0
+  store i32 %96, i32* %23, align 4
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable -- weak compare-exchange on a float in addrspace(2), performed on its i32 bit pattern with `cmpxchg weak`; on failure the observed value is stored to *%expected; returns the cmpxchg success flag; %scope is not read.
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU7CLlocalU7_AtomicfPff12memory_orderS3_12memory_scope(float addrspace(2)* %object, float* nocapture %expected, float %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; remapped %success: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %11 = bitcast float %desired to i32 ; float bits reinterpreted as i32 for the integer cmpxchg
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ] ; remapped %failure, same mapping as %10
+  %22 = bitcast float addrspace(2)* %object to i32 addrspace(2)*
+  %23 = bitcast float* %expected to i32*
+  switch i32 %10, label %31 [ ; choose the success ordering; the default target %31 uses monotonic
+    i32 1, label %24 ; 1 or 2 -> acquire
+    i32 2, label %24
+    i32 3, label %53 ; 3 -> release
+    i32 4, label %26 ; 4 -> acq_rel
+    i32 5, label %28 ; 5 -> seq_cst
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2 ; true when remapped failure order is 1 or 2
+  %25 = load i32, i32* %23, align 4
+  br i1 %switch, label %42, label %39 ; failure ordering: acquire (%42) if true, else monotonic (%39)
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2 ; true when remapped failure order is 1 or 2
+  %27 = load i32, i32* %23, align 4
+  br i1 %switch2, label %64, label %61 ; failure ordering: acquire (%64) if true, else monotonic (%61)
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [ ; seq_cst success: pick failure ordering; default %75 is monotonic
+    i32 1, label %79 ; 1 or 2 -> acquire
+    i32 2, label %79
+    i32 5, label %83 ; 5 -> seq_cst
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ] ; common exit: zero-extended success flag from whichever cmpxchg ran
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i32, i32* %23, align 4 ; current *%expected is the comparison value
+  %33 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %32, i32 %11 monotonic monotonic ; `weak`: no retry loop here, a failed attempt is simply reported to the caller
+  %34 = extractvalue { i32, i1 } %33, 1 ; field 1 of the result is the success bit
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i32, i1 } %33, 0 ; field 0 is the value that was in memory
+  store i32 %36, i32* %23, align 4 ; failure path: write the observed value back to *%expected
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %25, i32 %11 acquire monotonic
+  %41 = extractvalue { i32, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %25, i32 %11 acquire acquire
+  %44 = extractvalue { i32, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i32, i1 } %40, 0
+  store i32 %46, i32* %23, align 4
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i32, i1 } %43, 0
+  store i32 %50, i32* %23, align 4
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i32, i32* %23, align 4
+  %55 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %54, i32 %11 release monotonic ; release success always pairs with monotonic failure here
+  %56 = extractvalue { i32, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i32, i1 } %55, 0
+  store i32 %58, i32* %23, align 4
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %27, i32 %11 acq_rel monotonic
+  %63 = extractvalue { i32, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %27, i32 %11 acq_rel acquire
+  %66 = extractvalue { i32, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i32, i1 } %62, 0
+  store i32 %68, i32* %23, align 4
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i32, i1 } %65, 0
+  store i32 %72, i32* %23, align 4
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i32, i32* %23, align 4
+  %77 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %76, i32 %11 seq_cst monotonic
+  %78 = extractvalue { i32, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i32, i32* %23, align 4
+  %81 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %80, i32 %11 seq_cst acquire
+  %82 = extractvalue { i32, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i32, i32* %23, align 4
+  %85 = cmpxchg weak volatile i32 addrspace(2)* %22, i32 %84, i32 %11 seq_cst seq_cst
+  %86 = extractvalue { i32, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i32, i1 } %77, 0
+  store i32 %88, i32* %23, align 4
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i32, i1 } %81, 0
+  store i32 %92, i32* %23, align 4
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i32, i1 } %85, 0
+  store i32 %96, i32* %23, align 4
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable -- atomic volatile store of %desired (i64) to %object in addrspace(2); %order picks the ordering, %scope is not read.
+define void @_Z24pocl_atomic_store__localPVU7CLlocalU7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* nocapture %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %1 [ ; any value not listed below falls through to the seq_cst store in %1
+    i32 0, label %.thread ; 0, 1 and 3 all use the monotonic store
+    i32 1, label %.thread
+    i32 2, label %.thread1 ; 2 -> release
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object monotonic, align 8
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object release, align 8
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object seq_cst, align 8
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; Function Attrs: nounwind uwtable -- atomic volatile load of an i64 from %object in addrspace(2); %order picks the ordering, %scope is not read; returns the loaded value.
+define i64 @_Z23pocl_atomic_load__localPVU7CLlocalU7_Atomicl12memory_order12memory_scope(i64 addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %3 [ ; any value not listed below falls through to the seq_cst load in %3
+    i32 0, label %.thread ; 0, 2 and 3 all use the monotonic load
+    i32 1, label %.thread1 ; 1 -> acquire
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i64, i64 addrspace(2)* %object monotonic, align 8
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i64, i64 addrspace(2)* %object acquire, align 8
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i64, i64 addrspace(2)* %object seq_cst, align 8
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ] ; value loaded on whichever path executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable -- atomic exchange on an i64 in addrspace(2): swaps in %desired and returns the previous value; %order picks the ordering, %scope is not read.
+define i64 @_Z27pocl_atomic_exchange__localPVU7CLlocalU7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [ ; any value not listed below falls through to the seq_cst xchg in %6
+    i32 0, label %.thread ; 0 -> monotonic
+    i32 1, label %.thread1 ; 1 -> acquire
+    i32 2, label %.thread2 ; 2 -> release
+    i32 3, label %4 ; 3 -> acq_rel
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ] ; previous value from whichever path executed
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable -- strong compare-exchange on an i64 in addrspace(2); on failure the observed value is stored to *%expected; returns the cmpxchg success flag; %scope is not read.
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU7CLlocalU7_AtomiclPll12memory_orderS3_12memory_scope(i64 addrspace(2)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; remapped %success: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ] ; remapped %failure, same mapping as %10
+  switch i32 %10, label %28 [ ; choose the success ordering; the default target %28 uses monotonic
+    i32 1, label %21 ; 1 or 2 -> acquire
+    i32 2, label %21
+    i32 3, label %50 ; 3 -> release
+    i32 4, label %23 ; 4 -> acq_rel
+    i32 5, label %25 ; 5 -> seq_cst
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2 ; true when remapped failure order is 1 or 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36 ; failure ordering: acquire (%39) if true, else monotonic (%36)
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2 ; true when remapped failure order is 1 or 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58 ; failure ordering: acquire (%61) if true, else monotonic (%58)
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [ ; seq_cst success: pick failure ordering; default %72 is monotonic
+    i32 1, label %76 ; 1 or 2 -> acquire
+    i32 2, label %76
+    i32 5, label %80 ; 5 -> seq_cst
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ] ; common exit: zero-extended success flag from whichever cmpxchg ran
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8 ; current *%expected is the comparison value
+  %30 = cmpxchg volatile i64 addrspace(2)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1 ; field 1 of the result is the success bit
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0 ; field 0 is the value that was in memory
+  store i64 %33, i64* %expected, align 8 ; failure path: write the observed value back to *%expected
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg volatile i64 addrspace(2)* %object, i64 %51, i64 %desired release monotonic ; release success always pairs with monotonic failure here
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg volatile i64 addrspace(2)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg volatile i64 addrspace(2)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg volatile i64 addrspace(2)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; Function Attrs: nounwind uwtable -- weak compare-exchange on an i64 in addrspace(2) using `cmpxchg weak`; on failure the observed value is stored to *%expected; returns the cmpxchg success flag; %scope is not read.
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU7CLlocalU7_AtomiclPll12memory_orderS3_12memory_scope(i64 addrspace(2)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ] ; remapped %success: 0->0, 1->2, 2->3, 3->4, anything else->5
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ] ; remapped %failure, same mapping as %10
+  switch i32 %10, label %28 [ ; choose the success ordering; the default target %28 uses monotonic
+    i32 1, label %21 ; 1 or 2 -> acquire
+    i32 2, label %21
+    i32 3, label %50 ; 3 -> release
+    i32 4, label %23 ; 4 -> acq_rel
+    i32 5, label %25 ; 5 -> seq_cst
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2 ; true when remapped failure order is 1 or 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36 ; failure ordering: acquire (%39) if true, else monotonic (%36)
+
+; <label>:23                                      ; preds = %19
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2 ; true when remapped failure order is 1 or 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58 ; failure ordering: acquire (%61) if true, else monotonic (%58)
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [ ; seq_cst success: pick failure ordering; default %72 is monotonic
+    i32 1, label %76 ; 1 or 2 -> acquire
+    i32 2, label %76
+    i32 5, label %80 ; 5 -> seq_cst
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ] ; common exit: zero-extended success flag from whichever cmpxchg ran
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8 ; current *%expected is the comparison value
+  %30 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %29, i64 %desired monotonic monotonic ; `weak`: no retry loop here, a failed attempt is simply reported to the caller
+  %31 = extractvalue { i64, i1 } %30, 1 ; field 1 of the result is the success bit
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  %33 = extractvalue { i64, i1 } %30, 0 ; field 0 is the value that was in memory
+  store i64 %33, i64* %expected, align 8 ; failure path: write the observed value back to *%expected
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %51, i64 %desired release monotonic ; release success always pairs with monotonic failure here
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_fetch_add, __local overload (mangled element type `l`).
+; Performs a volatile 64-bit `atomicrmw add` of %operand on *%object and
+; returns the value produced by that instruction (the prior contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_add__localPVU7CLlocalU7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_sub, __local overload (mangled element type `l`).
+; Performs a volatile 64-bit `atomicrmw sub` of %operand on *%object and
+; returns the value produced by that instruction (the prior contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_sub__localPVU7CLlocalU7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_or, __local overload (mangled element type `l`).
+; Performs a volatile 64-bit `atomicrmw or` of %operand on *%object and
+; returns the value produced by that instruction (the prior contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z27pocl_atomic_fetch_or__localPVU7CLlocalU7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_xor, __local overload (mangled element type `l`).
+; Performs a volatile 64-bit `atomicrmw xor` of %operand on *%object and
+; returns the value produced by that instruction (the prior contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_xor__localPVU7CLlocalU7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_and, __local overload (mangled element type `l`).
+; Performs a volatile 64-bit `atomicrmw and` of %operand on *%object and
+; returns the value produced by that instruction (the prior contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_and__localPVU7CLlocalU7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_min, __local overload (mangled element type `l`).
+; Performs a volatile 64-bit `atomicrmw min` (signed minimum) of %operand on
+; *%object and returns the value produced by that instruction (the prior
+; contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+;
+; The attributes must describe the body below: it reads and writes memory
+; through %object and returns normally, so the function must not be
+; `noreturn` and %object must not be `readnone`. It therefore uses the same
+; attribute group (#0) and plain %object parameter as the other fetch_*
+; functions in this file.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_min__localPVU7CLlocalU7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile min i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_max, __local overload (mangled element type `l`).
+; Performs a volatile 64-bit `atomicrmw max` (signed maximum) of %operand on
+; *%object and returns the value produced by that instruction (the prior
+; contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+;
+; The attributes must describe the body below: it reads and writes memory
+; through %object and returns normally, so the function must not be
+; `noreturn` and %object must not be `readnone`. It therefore uses the same
+; attribute group (#0) and plain %object parameter as the other fetch_*
+; functions in this file.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_max__localPVU7CLlocalU7_Atomicll12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile max i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_store, __local overload (mangled element type `m`).
+; Performs a volatile, 8-byte-aligned atomic store of %desired to *%object.
+; %order selects the LLVM ordering: 2 -> release; 0, 1 and 3 -> monotonic;
+; any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define void @_Z24pocl_atomic_store__localPVU7CLlocalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* nocapture %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  ; Orders 0, 1 and 3 share the monotonic store block.
+  switch i32 %order, label %1 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object monotonic, align 8
+  br label %2
+
+.thread1:                                         ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object release, align 8
+  br label %2
+
+; <label>:1                                       ; preds = %0
+  store atomic volatile i64 %desired, i64 addrspace(2)* %object seq_cst, align 8
+  br label %2
+
+; <label>:2                                       ; preds = %1, %.thread1, %.thread
+  ret void
+}
+
+; pocl_atomic_load, __local overload (mangled element type `m`).
+; Performs a volatile, 8-byte-aligned atomic load from *%object and returns
+; the loaded value.
+; %order selects the LLVM ordering: 1 -> acquire; 0, 2 and 3 -> monotonic;
+; any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z23pocl_atomic_load__localPVU7CLlocalU7_Atomicm12memory_order12memory_scope(i64 addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  ; Orders 0, 2 and 3 share the monotonic load block.
+  switch i32 %order, label %3 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+    i32 3, label %.thread
+  ]
+
+.thread:                                          ; preds = %0, %0, %0
+  %1 = load atomic volatile i64, i64 addrspace(2)* %object monotonic, align 8
+  br label %5
+
+.thread1:                                         ; preds = %0
+  %2 = load atomic volatile i64, i64 addrspace(2)* %object acquire, align 8
+  br label %5
+
+; <label>:3                                       ; preds = %0
+  %4 = load atomic volatile i64, i64 addrspace(2)* %object seq_cst, align 8
+  br label %5
+
+; <label>:5                                       ; preds = %3, %.thread1, %.thread
+  ; Merge the value from whichever ordering-specific load executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %4, %3 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_exchange, __local overload (mangled element type `m`).
+; Performs a volatile 64-bit `atomicrmw xchg` that writes %desired to
+; *%object and returns the value produced by that instruction (the prior
+; contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z27pocl_atomic_exchange__localPVU7CLlocalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %desired, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xchg i64 addrspace(2)* %object, i64 %desired seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_compare_exchange_strong, __local overload (mangled element
+; type `m`).
+; Issues a volatile strong `cmpxchg` on *%object, comparing against the value
+; loaded from *%expected and installing %desired on a match. When the
+; exchange fails, the value observed in *%object is stored to *%expected.
+; Returns the cmpxchg success flag as i1. %scope is never read by the body.
+;
+; %success and %failure are each remapped first
+; (0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5); the remapped pair
+; then selects the success/failure orderings of the cmpxchg:
+;   success 2 : acquire/acquire if remapped failure is 1 or 2,
+;               otherwise acquire/monotonic
+;   success 3 : release/monotonic
+;   success 4 : acq_rel/acquire if remapped failure is 1 or 2,
+;               otherwise acq_rel/monotonic
+;   success 5 : seq_cst/acquire if remapped failure is 1 or 2,
+;               seq_cst/seq_cst if it is 5, otherwise seq_cst/monotonic
+;   otherwise : monotonic/monotonic
+; NOTE(review): the remapped numbering looks like the C11 __ATOMIC_*
+; constants (relaxed=0, consume=1, acquire=2, ...) - confirm against the
+; source this IR was generated from.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU7CLlocalU7_AtomicmPmm12memory_orderS3_12memory_scope(i64 addrspace(2)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  ; %10 is the remapped %success value (0, 2, 3, 4 or 5).
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  ; %20 is the remapped %failure value (0, 2, 3, 4 or 5).
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Select the success ordering; each target then refines on %20.
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; (%20 - 1) u< 2 is true exactly when %20 is 1 or 2.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; (%20 - 1) u< 2 is true exactly when %20 is 1 or 2.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: convert the i8 success flag from the taken path back to i1.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg volatile i64 addrspace(2)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Failure path: publish the observed value through %expected.
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg volatile i64 addrspace(2)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg volatile i64 addrspace(2)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg volatile i64 addrspace(2)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg volatile i64 addrspace(2)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_compare_exchange_weak, __local overload (mangled element
+; type `m`).
+; Issues a volatile `cmpxchg weak` on *%object, comparing against the value
+; loaded from *%expected and installing %desired on a match. When the
+; exchange fails, the value observed in *%object is stored to *%expected.
+; Returns the cmpxchg success flag as i1. %scope is never read by the body.
+;
+; %success and %failure are each remapped first
+; (0 -> 0, 1 -> 2, 2 -> 3, 3 -> 4, any other value -> 5); the remapped pair
+; then selects the success/failure orderings of the cmpxchg:
+;   success 2 : acquire/acquire if remapped failure is 1 or 2,
+;               otherwise acquire/monotonic
+;   success 3 : release/monotonic
+;   success 4 : acq_rel/acquire if remapped failure is 1 or 2,
+;               otherwise acq_rel/monotonic
+;   success 5 : seq_cst/acquire if remapped failure is 1 or 2,
+;               seq_cst/seq_cst if it is 5, otherwise seq_cst/monotonic
+;   otherwise : monotonic/monotonic
+; NOTE(review): the remapped numbering looks like the C11 __ATOMIC_*
+; constants (relaxed=0, consume=1, acquire=2, ...) - confirm against the
+; source this IR was generated from.
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU7CLlocalU7_AtomicmPmm12memory_orderS3_12memory_scope(i64 addrspace(2)* %object, i64* nocapture %expected, i64 %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  ; %10 is the remapped %success value (0, 2, 3, 4 or 5).
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = icmp eq i32 %failure, 0
+  br i1 %11, label %19, label %12
+
+; <label>:12                                      ; preds = %9
+  %13 = icmp eq i32 %failure, 1
+  br i1 %13, label %19, label %14
+
+; <label>:14                                      ; preds = %12
+  %15 = icmp eq i32 %failure, 2
+  br i1 %15, label %19, label %16
+
+; <label>:16                                      ; preds = %14
+  %17 = icmp eq i32 %failure, 3
+  %18 = select i1 %17, i32 4, i32 5
+  br label %19
+
+; <label>:19                                      ; preds = %12, %14, %16, %9
+  ; %20 is the remapped %failure value (0, 2, 3, 4 or 5).
+  %20 = phi i32 [ 0, %9 ], [ 2, %12 ], [ %18, %16 ], [ 3, %14 ]
+  ; Select the success ordering; each target then refines on %20.
+  switch i32 %10, label %28 [
+    i32 1, label %21
+    i32 2, label %21
+    i32 3, label %50
+    i32 4, label %23
+    i32 5, label %25
+  ]
+
+; <label>:21                                      ; preds = %19, %19
+  ; (%20 - 1) u< 2 is true exactly when %20 is 1 or 2.
+  %.off = add nsw i32 %20, -1
+  %switch = icmp ult i32 %.off, 2
+  %22 = load i64, i64* %expected, align 8
+  br i1 %switch, label %39, label %36
+
+; <label>:23                                      ; preds = %19
+  ; (%20 - 1) u< 2 is true exactly when %20 is 1 or 2.
+  %.off1 = add nsw i32 %20, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %24 = load i64, i64* %expected, align 8
+  br i1 %switch2, label %61, label %58
+
+; <label>:25                                      ; preds = %19
+  switch i32 %20, label %72 [
+    i32 1, label %76
+    i32 2, label %76
+    i32 5, label %80
+  ]
+
+; <label>:26                                      ; preds = %86, %90, %94, %66, %70, %44, %48, %56, %34
+  ; Common exit: convert the i8 success flag from the taken path back to i1.
+  %.0 = phi i8 [ %35, %34 ], [ %87, %86 ], [ %95, %94 ], [ %91, %90 ], [ %71, %70 ], [ %67, %66 ], [ %57, %56 ], [ %49, %48 ], [ %45, %44 ]
+  %27 = icmp ne i8 %.0, 0
+  ret i1 %27
+
+; <label>:28                                      ; preds = %19
+  %29 = load i64, i64* %expected, align 8
+  %30 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %29, i64 %desired monotonic monotonic
+  %31 = extractvalue { i64, i1 } %30, 1
+  br i1 %31, label %34, label %32
+
+; <label>:32                                      ; preds = %28
+  ; Failure path: publish the observed value through %expected.
+  %33 = extractvalue { i64, i1 } %30, 0
+  store i64 %33, i64* %expected, align 8
+  br label %34
+
+; <label>:34                                      ; preds = %32, %28
+  %35 = zext i1 %31 to i8
+  br label %26
+
+; <label>:36                                      ; preds = %21
+  %37 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire monotonic
+  %38 = extractvalue { i64, i1 } %37, 1
+  br i1 %38, label %44, label %42
+
+; <label>:39                                      ; preds = %21
+  %40 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %22, i64 %desired acquire acquire
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %48, label %46
+
+; <label>:42                                      ; preds = %36
+  %43 = extractvalue { i64, i1 } %37, 0
+  store i64 %43, i64* %expected, align 8
+  br label %44
+
+; <label>:44                                      ; preds = %42, %36
+  %45 = zext i1 %38 to i8
+  br label %26
+
+; <label>:46                                      ; preds = %39
+  %47 = extractvalue { i64, i1 } %40, 0
+  store i64 %47, i64* %expected, align 8
+  br label %48
+
+; <label>:48                                      ; preds = %46, %39
+  %49 = zext i1 %41 to i8
+  br label %26
+
+; <label>:50                                      ; preds = %19
+  %51 = load i64, i64* %expected, align 8
+  %52 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %51, i64 %desired release monotonic
+  %53 = extractvalue { i64, i1 } %52, 1
+  br i1 %53, label %56, label %54
+
+; <label>:54                                      ; preds = %50
+  %55 = extractvalue { i64, i1 } %52, 0
+  store i64 %55, i64* %expected, align 8
+  br label %56
+
+; <label>:56                                      ; preds = %54, %50
+  %57 = zext i1 %53 to i8
+  br label %26
+
+; <label>:58                                      ; preds = %23
+  %59 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel monotonic
+  %60 = extractvalue { i64, i1 } %59, 1
+  br i1 %60, label %66, label %64
+
+; <label>:61                                      ; preds = %23
+  %62 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %24, i64 %desired acq_rel acquire
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %70, label %68
+
+; <label>:64                                      ; preds = %58
+  %65 = extractvalue { i64, i1 } %59, 0
+  store i64 %65, i64* %expected, align 8
+  br label %66
+
+; <label>:66                                      ; preds = %64, %58
+  %67 = zext i1 %60 to i8
+  br label %26
+
+; <label>:68                                      ; preds = %61
+  %69 = extractvalue { i64, i1 } %62, 0
+  store i64 %69, i64* %expected, align 8
+  br label %70
+
+; <label>:70                                      ; preds = %68, %61
+  %71 = zext i1 %63 to i8
+  br label %26
+
+; <label>:72                                      ; preds = %25
+  %73 = load i64, i64* %expected, align 8
+  %74 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %73, i64 %desired seq_cst monotonic
+  %75 = extractvalue { i64, i1 } %74, 1
+  br i1 %75, label %86, label %84
+
+; <label>:76                                      ; preds = %25, %25
+  %77 = load i64, i64* %expected, align 8
+  %78 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %77, i64 %desired seq_cst acquire
+  %79 = extractvalue { i64, i1 } %78, 1
+  br i1 %79, label %90, label %88
+
+; <label>:80                                      ; preds = %25
+  %81 = load i64, i64* %expected, align 8
+  %82 = cmpxchg weak volatile i64 addrspace(2)* %object, i64 %81, i64 %desired seq_cst seq_cst
+  %83 = extractvalue { i64, i1 } %82, 1
+  br i1 %83, label %94, label %92
+
+; <label>:84                                      ; preds = %72
+  %85 = extractvalue { i64, i1 } %74, 0
+  store i64 %85, i64* %expected, align 8
+  br label %86
+
+; <label>:86                                      ; preds = %84, %72
+  %87 = zext i1 %75 to i8
+  br label %26
+
+; <label>:88                                      ; preds = %76
+  %89 = extractvalue { i64, i1 } %78, 0
+  store i64 %89, i64* %expected, align 8
+  br label %90
+
+; <label>:90                                      ; preds = %88, %76
+  %91 = zext i1 %79 to i8
+  br label %26
+
+; <label>:92                                      ; preds = %80
+  %93 = extractvalue { i64, i1 } %82, 0
+  store i64 %93, i64* %expected, align 8
+  br label %94
+
+; <label>:94                                      ; preds = %92, %80
+  %95 = zext i1 %83 to i8
+  br label %26
+}
+
+; pocl_atomic_fetch_add, __local overload (mangled element type `m`).
+; Performs a volatile 64-bit `atomicrmw add` of %operand on *%object and
+; returns the value produced by that instruction (the prior contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_add__localPVU7CLlocalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile add i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_sub, __local overload (mangled element type `m`).
+; Performs a volatile 64-bit `atomicrmw sub` of %operand on *%object and
+; returns the value produced by that instruction (the prior contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_sub__localPVU7CLlocalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile sub i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; pocl_atomic_fetch_or, __local overload (mangled element type `m`).
+; Performs a volatile 64-bit `atomicrmw or` of %operand on *%object and
+; returns the value produced by that instruction (the prior contents).
+; %order selects the LLVM ordering: 0 -> monotonic, 1 -> acquire,
+; 2 -> release, 3 -> acq_rel, any other value -> seq_cst.
+; %scope is accepted but never read by the body.
+; Function Attrs: nounwind uwtable
+define i64 @_Z27pocl_atomic_fetch_or__localPVU7CLlocalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  ; Dispatch on the runtime ordering; the default label handles seq_cst.
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile or i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  ; Merge the result of whichever ordering-specific block executed.
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_xor__localPVU7CLlocalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile xor i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_and__localPVU7CLlocalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile and i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_min__localPVU7CLlocalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umin i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define i64 @_Z28pocl_atomic_fetch_max__localPVU7CLlocalU7_Atomicmm12memory_order12memory_scope(i64 addrspace(2)* %object, i64 %operand, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %6 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread2
+    i32 3, label %4
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand monotonic
+  br label %8
+
+.thread1:                                         ; preds = %0
+  %2 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand acquire
+  br label %8
+
+.thread2:                                         ; preds = %0
+  %3 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand release
+  br label %8
+
+; <label>:4                                       ; preds = %0
+  %5 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand acq_rel
+  br label %8
+
+; <label>:6                                       ; preds = %0
+  %7 = atomicrmw volatile umax i64 addrspace(2)* %object, i64 %operand seq_cst
+  br label %8
+
+; <label>:8                                       ; preds = %6, %4, %.thread2, %.thread1, %.thread
+  %.0 = phi i64 [ %1, %.thread ], [ %7, %6 ], [ %5, %4 ], [ %3, %.thread2 ], [ %2, %.thread1 ]
+  ret i64 %.0
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z24pocl_atomic_store__localPVU7CLlocalU7_Atomicdd12memory_order12memory_scope(double addrspace(2)* nocapture %object, double %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %5 [
+    i32 0, label %.thread
+    i32 1, label %.thread
+    i32 2, label %.thread1
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast double %desired to i64
+  %2 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  store atomic volatile i64 %1, i64 addrspace(2)* %2 release, align 8
+  br label %13
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast double %desired to i64
+  %4 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br label %9
+
+; <label>:5                                       ; preds = %0
+  %6 = icmp eq i32 %order, 3
+  %7 = bitcast double %desired to i64
+  %8 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br i1 %6, label %9, label %12
+
+; <label>:9                                       ; preds = %5, %.thread
+  %10 = phi i64 addrspace(2)* [ %4, %.thread ], [ %8, %5 ]
+  %11 = phi i64 [ %3, %.thread ], [ %7, %5 ]
+  store atomic volatile i64 %11, i64 addrspace(2)* %10 monotonic, align 8
+  br label %13
+
+; <label>:12                                      ; preds = %5
+  store atomic volatile i64 %7, i64 addrspace(2)* %8 seq_cst, align 8
+  br label %13
+
+; <label>:13                                      ; preds = %12, %.thread1, %9
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define double @_Z23pocl_atomic_load__localPVU7CLlocalU7_Atomicd12memory_order12memory_scope(double addrspace(2)* nocapture readonly %object, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %4 [
+    i32 0, label %.thread
+    i32 1, label %.thread1
+    i32 2, label %.thread
+  ]
+
+.thread1:                                         ; preds = %0
+  %1 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %2 = load atomic volatile i64, i64 addrspace(2)* %1 acquire, align 8
+  br label %12
+
+.thread:                                          ; preds = %0, %0
+  %3 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br label %7
+
+; <label>:4                                       ; preds = %0
+  %5 = icmp eq i32 %order, 3
+  %6 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br i1 %5, label %7, label %10
+
+; <label>:7                                       ; preds = %4, %.thread
+  %8 = phi i64 addrspace(2)* [ %3, %.thread ], [ %6, %4 ]
+  %9 = load atomic volatile i64, i64 addrspace(2)* %8 monotonic, align 8
+  br label %12
+
+; <label>:10                                      ; preds = %4
+  %11 = load atomic volatile i64, i64 addrspace(2)* %6 seq_cst, align 8
+  br label %12
+
+; <label>:12                                      ; preds = %10, %.thread1, %7
+  %.sroa.0.0 = phi i64 [ %9, %7 ], [ %11, %10 ], [ %2, %.thread1 ]
+  %13 = bitcast i64 %.sroa.0.0 to double
+  ret double %13
+}
+
+; Function Attrs: nounwind uwtable
+define double @_Z27pocl_atomic_exchange__localPVU7CLlocalU7_Atomicdd12memory_order12memory_scope(double addrspace(2)* %object, double %desired, i32 %order, i32 %scope) #0 {
+  switch i32 %order, label %10 [
+    i32 0, label %.thread
+    i32 1, label %.thread2
+    i32 2, label %.thread3
+  ]
+
+.thread:                                          ; preds = %0
+  %1 = bitcast double %desired to i64
+  %2 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %3 = atomicrmw volatile xchg i64 addrspace(2)* %2, i64 %1 monotonic
+  br label %18
+
+.thread2:                                         ; preds = %0
+  %4 = bitcast double %desired to i64
+  %5 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %6 = atomicrmw volatile xchg i64 addrspace(2)* %5, i64 %4 acquire
+  br label %18
+
+.thread3:                                         ; preds = %0
+  %7 = bitcast double %desired to i64
+  %8 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %9 = atomicrmw volatile xchg i64 addrspace(2)* %8, i64 %7 release
+  br label %18
+
+; <label>:10                                      ; preds = %0
+  %11 = icmp eq i32 %order, 3
+  %12 = bitcast double %desired to i64
+  %13 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  br i1 %11, label %14, label %16
+
+; <label>:14                                      ; preds = %10
+  %15 = atomicrmw volatile xchg i64 addrspace(2)* %13, i64 %12 acq_rel
+  br label %18
+
+; <label>:16                                      ; preds = %10
+  %17 = atomicrmw volatile xchg i64 addrspace(2)* %13, i64 %12 seq_cst
+  br label %18
+
+; <label>:18                                      ; preds = %16, %14, %.thread3, %.thread2, %.thread
+  %.sroa.0.0 = phi i64 [ %3, %.thread ], [ %17, %16 ], [ %15, %14 ], [ %9, %.thread3 ], [ %6, %.thread2 ]
+  %19 = bitcast i64 %.sroa.0.0 to double
+  ret double %19
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z42pocl_atomic_compare_exchange_strong__localPVU7CLlocalU7_AtomicdPdd12memory_orderS3_12memory_scope(double addrspace(2)* %object, double* nocapture %expected, double %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast double %desired to i64
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %23 = bitcast double* %expected to i64*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i64, i64* %23, align 8
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i64, i64* %23, align 8
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i64, i64* %23, align 8
+  %33 = cmpxchg volatile i64 addrspace(2)* %22, i64 %32, i64 %11 monotonic monotonic
+  %34 = extractvalue { i64, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i64, i1 } %33, 0
+  store i64 %36, i64* %23, align 8
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg volatile i64 addrspace(2)* %22, i64 %25, i64 %11 acquire monotonic
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg volatile i64 addrspace(2)* %22, i64 %25, i64 %11 acquire acquire
+  %44 = extractvalue { i64, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i64, i1 } %40, 0
+  store i64 %46, i64* %23, align 8
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i64, i1 } %43, 0
+  store i64 %50, i64* %23, align 8
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i64, i64* %23, align 8
+  %55 = cmpxchg volatile i64 addrspace(2)* %22, i64 %54, i64 %11 release monotonic
+  %56 = extractvalue { i64, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i64, i1 } %55, 0
+  store i64 %58, i64* %23, align 8
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg volatile i64 addrspace(2)* %22, i64 %27, i64 %11 acq_rel monotonic
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg volatile i64 addrspace(2)* %22, i64 %27, i64 %11 acq_rel acquire
+  %66 = extractvalue { i64, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i64, i1 } %62, 0
+  store i64 %68, i64* %23, align 8
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i64, i1 } %65, 0
+  store i64 %72, i64* %23, align 8
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i64, i64* %23, align 8
+  %77 = cmpxchg volatile i64 addrspace(2)* %22, i64 %76, i64 %11 seq_cst monotonic
+  %78 = extractvalue { i64, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i64, i64* %23, align 8
+  %81 = cmpxchg volatile i64 addrspace(2)* %22, i64 %80, i64 %11 seq_cst acquire
+  %82 = extractvalue { i64, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i64, i64* %23, align 8
+  %85 = cmpxchg volatile i64 addrspace(2)* %22, i64 %84, i64 %11 seq_cst seq_cst
+  %86 = extractvalue { i64, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i64, i1 } %77, 0
+  store i64 %88, i64* %23, align 8
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i64, i1 } %81, 0
+  store i64 %92, i64* %23, align 8
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i64, i1 } %85, 0
+  store i64 %96, i64* %23, align 8
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+; Function Attrs: nounwind uwtable
+define zeroext i1 @_Z40pocl_atomic_compare_exchange_weak__localPVU7CLlocalU7_AtomicdPdd12memory_orderS3_12memory_scope(double addrspace(2)* %object, double* nocapture %expected, double %desired, i32 %success, i32 %failure, i32 %scope) #0 {
+  %1 = icmp eq i32 %success, 0
+  br i1 %1, label %9, label %2
+
+; <label>:2                                       ; preds = %0
+  %3 = icmp eq i32 %success, 1
+  br i1 %3, label %9, label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = icmp eq i32 %success, 2
+  br i1 %5, label %9, label %6
+
+; <label>:6                                       ; preds = %4
+  %7 = icmp eq i32 %success, 3
+  %8 = select i1 %7, i32 4, i32 5
+  br label %9
+
+; <label>:9                                       ; preds = %2, %4, %6, %0
+  %10 = phi i32 [ 0, %0 ], [ 2, %2 ], [ %8, %6 ], [ 3, %4 ]
+  %11 = bitcast double %desired to i64
+  %12 = icmp eq i32 %failure, 0
+  br i1 %12, label %20, label %13
+
+; <label>:13                                      ; preds = %9
+  %14 = icmp eq i32 %failure, 1
+  br i1 %14, label %20, label %15
+
+; <label>:15                                      ; preds = %13
+  %16 = icmp eq i32 %failure, 2
+  br i1 %16, label %20, label %17
+
+; <label>:17                                      ; preds = %15
+  %18 = icmp eq i32 %failure, 3
+  %19 = select i1 %18, i32 4, i32 5
+  br label %20
+
+; <label>:20                                      ; preds = %13, %15, %17, %9
+  %21 = phi i32 [ 0, %9 ], [ 2, %13 ], [ %19, %17 ], [ 3, %15 ]
+  %22 = bitcast double addrspace(2)* %object to i64 addrspace(2)*
+  %23 = bitcast double* %expected to i64*
+  switch i32 %10, label %31 [
+    i32 1, label %24
+    i32 2, label %24
+    i32 3, label %53
+    i32 4, label %26
+    i32 5, label %28
+  ]
+
+; <label>:24                                      ; preds = %20, %20
+  %.off = add nsw i32 %21, -1
+  %switch = icmp ult i32 %.off, 2
+  %25 = load i64, i64* %23, align 8
+  br i1 %switch, label %42, label %39
+
+; <label>:26                                      ; preds = %20
+  %.off1 = add nsw i32 %21, -1
+  %switch2 = icmp ult i32 %.off1, 2
+  %27 = load i64, i64* %23, align 8
+  br i1 %switch2, label %64, label %61
+
+; <label>:28                                      ; preds = %20
+  switch i32 %21, label %75 [
+    i32 1, label %79
+    i32 2, label %79
+    i32 5, label %83
+  ]
+
+; <label>:29                                      ; preds = %89, %93, %97, %69, %73, %47, %51, %59, %37
+  %.0 = phi i8 [ %38, %37 ], [ %90, %89 ], [ %98, %97 ], [ %94, %93 ], [ %74, %73 ], [ %70, %69 ], [ %60, %59 ], [ %52, %51 ], [ %48, %47 ]
+  %30 = icmp ne i8 %.0, 0
+  ret i1 %30
+
+; <label>:31                                      ; preds = %20
+  %32 = load i64, i64* %23, align 8
+  %33 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %32, i64 %11 monotonic monotonic
+  %34 = extractvalue { i64, i1 } %33, 1
+  br i1 %34, label %37, label %35
+
+; <label>:35                                      ; preds = %31
+  %36 = extractvalue { i64, i1 } %33, 0
+  store i64 %36, i64* %23, align 8
+  br label %37
+
+; <label>:37                                      ; preds = %35, %31
+  %38 = zext i1 %34 to i8
+  br label %29
+
+; <label>:39                                      ; preds = %24
+  %40 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %25, i64 %11 acquire monotonic
+  %41 = extractvalue { i64, i1 } %40, 1
+  br i1 %41, label %47, label %45
+
+; <label>:42                                      ; preds = %24
+  %43 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %25, i64 %11 acquire acquire
+  %44 = extractvalue { i64, i1 } %43, 1
+  br i1 %44, label %51, label %49
+
+; <label>:45                                      ; preds = %39
+  %46 = extractvalue { i64, i1 } %40, 0
+  store i64 %46, i64* %23, align 8
+  br label %47
+
+; <label>:47                                      ; preds = %45, %39
+  %48 = zext i1 %41 to i8
+  br label %29
+
+; <label>:49                                      ; preds = %42
+  %50 = extractvalue { i64, i1 } %43, 0
+  store i64 %50, i64* %23, align 8
+  br label %51
+
+; <label>:51                                      ; preds = %49, %42
+  %52 = zext i1 %44 to i8
+  br label %29
+
+; <label>:53                                      ; preds = %20
+  %54 = load i64, i64* %23, align 8
+  %55 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %54, i64 %11 release monotonic
+  %56 = extractvalue { i64, i1 } %55, 1
+  br i1 %56, label %59, label %57
+
+; <label>:57                                      ; preds = %53
+  %58 = extractvalue { i64, i1 } %55, 0
+  store i64 %58, i64* %23, align 8
+  br label %59
+
+; <label>:59                                      ; preds = %57, %53
+  %60 = zext i1 %56 to i8
+  br label %29
+
+; <label>:61                                      ; preds = %26
+  %62 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %27, i64 %11 acq_rel monotonic
+  %63 = extractvalue { i64, i1 } %62, 1
+  br i1 %63, label %69, label %67
+
+; <label>:64                                      ; preds = %26
+  %65 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %27, i64 %11 acq_rel acquire
+  %66 = extractvalue { i64, i1 } %65, 1
+  br i1 %66, label %73, label %71
+
+; <label>:67                                      ; preds = %61
+  %68 = extractvalue { i64, i1 } %62, 0
+  store i64 %68, i64* %23, align 8
+  br label %69
+
+; <label>:69                                      ; preds = %67, %61
+  %70 = zext i1 %63 to i8
+  br label %29
+
+; <label>:71                                      ; preds = %64
+  %72 = extractvalue { i64, i1 } %65, 0
+  store i64 %72, i64* %23, align 8
+  br label %73
+
+; <label>:73                                      ; preds = %71, %64
+  %74 = zext i1 %66 to i8
+  br label %29
+
+; <label>:75                                      ; preds = %28
+  %76 = load i64, i64* %23, align 8
+  %77 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %76, i64 %11 seq_cst monotonic
+  %78 = extractvalue { i64, i1 } %77, 1
+  br i1 %78, label %89, label %87
+
+; <label>:79                                      ; preds = %28, %28
+  %80 = load i64, i64* %23, align 8
+  %81 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %80, i64 %11 seq_cst acquire
+  %82 = extractvalue { i64, i1 } %81, 1
+  br i1 %82, label %93, label %91
+
+; <label>:83                                      ; preds = %28
+  %84 = load i64, i64* %23, align 8
+  %85 = cmpxchg weak volatile i64 addrspace(2)* %22, i64 %84, i64 %11 seq_cst seq_cst
+  %86 = extractvalue { i64, i1 } %85, 1
+  br i1 %86, label %97, label %95
+
+; <label>:87                                      ; preds = %75
+  %88 = extractvalue { i64, i1 } %77, 0
+  store i64 %88, i64* %23, align 8
+  br label %89
+
+; <label>:89                                      ; preds = %87, %75
+  %90 = zext i1 %78 to i8
+  br label %29
+
+; <label>:91                                      ; preds = %79
+  %92 = extractvalue { i64, i1 } %81, 0
+  store i64 %92, i64* %23, align 8
+  br label %93
+
+; <label>:93                                      ; preds = %91, %79
+  %94 = zext i1 %82 to i8
+  br label %29
+
+; <label>:95                                      ; preds = %83
+  %96 = extractvalue { i64, i1 } %85, 0
+  store i64 %96, i64* %23, align 8
+  br label %97
+
+; <label>:97                                      ; preds = %95, %83
+  %98 = zext i1 %86 to i8
+  br label %29
+}
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noreturn nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noreturn nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"Franz clang version 3.7.0 (https://github.com/llvm-mirror/clang.git 40b68b4c02b9d9e1e4138815747adf5589496240) (https://github.com/HSAFoundation/HLC-HSAIL-Development-LLVM.git 183de0bdbefa5664920942fa418d1efefb87faeb) (based on LLVM 3.7.0svn)"}
diff --git a/lib/kernel/tce/CMakeLists.txt b/lib/kernel/tce/CMakeLists.txt
index 09e3737..94c3de0 100644
--- a/lib/kernel/tce/CMakeLists.txt
+++ b/lib/kernel/tce/CMakeLists.txt
@@ -29,12 +29,11 @@ include("bitcode_rules")
 # fails in TCE code gen:
 # SplitVectorResult #0: 0x24c5ae0: v8i16,ch = vaarg 0x20628e8, 0x2423ed0, 0x24c59e0, 0x24c4fe0 [ORD=223] [ID=0]
 # LLVM ERROR: Do not know how to split the result of this operator!
-if(NEW_PRINTF_WORKS)
-  list(REMOVE_ITEM SOURCES_WITHOUT_VML "printf.c")
-  list(APPEND SOURCES_WITHOUT_VML "printf_constant.c")
-endif()
-
+list(REMOVE_ITEM SOURCES_WITHOUT_VML "printf.c" "atomics.cl")
+list(APPEND SOURCES_WITHOUT_VML "printf_constant.c")
 
+set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${TCE_DEVICE_CL_VERSION} ${TCE_DEVICE_EXTENSION_DEFINES}")
+separate_arguments(DEVICE_CL_FLAGS)
 
 # Use TARGET flags:
 # with some additional CLANG_FLAGS
@@ -50,13 +49,19 @@ set(LLC_FLAGS ${TCE_TARGET_LLC_FLAGS})
 #LD_FLAGS    = @TARGET_LD_FLAGS@
 
 #KERNEL_TARGET = tce (WRONG)
-make_kernel_bc(KERNEL_BC "tce-tut-llvm" ${SOURCES_WITHOUT_VML})
+make_kernel_bc(KERNEL_BC "tce-tut-llvm" "tta" ${SOURCES_WITHOUT_VML})
 
 # just debug
 message(STATUS "TCE Kernel BC: ${KERNEL_BC}")
 
+list(APPEND KERNEL_BC_LIST "${KERNEL_BC}")
+set(KERNEL_BC_LIST "${KERNEL_BC_LIST}" PARENT_SCOPE)
+
 # a target is needed...
-add_custom_target("kernel_tce" ALL DEPENDS ${KERNEL_BC})
+add_custom_target("kernel_tce" DEPENDS ${KERNEL_BC})
+
+list(APPEND KERNEL_TARGET_LIST "kernel_tce")
+set(KERNEL_TARGET_LIST "${KERNEL_TARGET_LIST}" PARENT_SCOPE)
 
 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${KERNEL_BC}"
         DESTINATION "${POCL_INSTALL_PRIVATE_DATADIR}")
diff --git a/lib/kernel/tce/Makefile b/lib/kernel/tce/Makefile
deleted file mode 100644
index 86ab8f2..0000000
--- a/lib/kernel/tce/Makefile
+++ /dev/null
@@ -1,758 +0,0 @@
-# Makefile.in generated by automake 1.15 from Makefile.am.
-# lib/kernel/tce/Makefile.  Generated from Makefile.in by configure.
-
-# Copyright (C) 1994-2014 Free Software Foundation, Inc.
-
-# This Makefile.in is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
-# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE.
-
-
-
-# Process this file with automake to produce Makefile.in
-# 
-# Copyright (c) 2011 Universidad Rey Juan Carlos
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-# rules.mk - the make rules for building the kernel library
-# 
-# Copyright (c) 2013 Erik Schnetter
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-# The caller (the Makefile which includes this file) needs to set the
-# following variables:
-# 
-# KERNEL_TARGET
-# CLANG_FLAGS
-# LLC_FLAGS
-# LD_FLAGS
-
-# sources.mk - a list of all kernel source files
-# 
-# Copyright (c) 2011-2013 Universidad Rey Juan Carlos
-#                         Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-
-am__is_gnu_make = { \
-  if test -z '$(MAKELEVEL)'; then \
-    false; \
-  elif test -n '$(MAKE_HOST)'; then \
-    true; \
-  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
-    true; \
-  else \
-    false; \
-  fi; \
-}
-am__make_running_with_option = \
-  case $${target_option-} in \
-      ?) ;; \
-      *) echo "am__make_running_with_option: internal error: invalid" \
-              "target option '$${target_option-}' specified" >&2; \
-         exit 1;; \
-  esac; \
-  has_opt=no; \
-  sane_makeflags=$$MAKEFLAGS; \
-  if $(am__is_gnu_make); then \
-    sane_makeflags=$$MFLAGS; \
-  else \
-    case $$MAKEFLAGS in \
-      *\\[\ \	]*) \
-        bs=\\; \
-        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
-          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
-    esac; \
-  fi; \
-  skip_next=no; \
-  strip_trailopt () \
-  { \
-    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
-  }; \
-  for flg in $$sane_makeflags; do \
-    test $$skip_next = yes && { skip_next=no; continue; }; \
-    case $$flg in \
-      *=*|--*) continue;; \
-        -*I) strip_trailopt 'I'; skip_next=yes;; \
-      -*I?*) strip_trailopt 'I';; \
-        -*O) strip_trailopt 'O'; skip_next=yes;; \
-      -*O?*) strip_trailopt 'O';; \
-        -*l) strip_trailopt 'l'; skip_next=yes;; \
-      -*l?*) strip_trailopt 'l';; \
-      -[dEDm]) skip_next=yes;; \
-      -[JT]) skip_next=yes;; \
-    esac; \
-    case $$flg in \
-      *$$target_option*) has_opt=yes; break;; \
-    esac; \
-  done; \
-  test $$has_opt = yes
-am__make_dryrun = (target_option=n; $(am__make_running_with_option))
-am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
-pkgdatadir = $(datadir)/pocl
-pkgincludedir = $(includedir)/pocl
-pkglibdir = $(libdir)/pocl
-pkglibexecdir = $(libexecdir)/pocl
-am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
-install_sh_DATA = $(install_sh) -c -m 644
-install_sh_PROGRAM = $(install_sh) -c
-install_sh_SCRIPT = $(install_sh) -c
-INSTALL_HEADER = $(INSTALL_DATA)
-transform = $(program_transform_name)
-NORMAL_INSTALL = :
-PRE_INSTALL = :
-POST_INSTALL = :
-NORMAL_UNINSTALL = :
-PRE_UNINSTALL = :
-POST_UNINSTALL = :
-build_triplet = x86_64-unknown-linux-gnu
-host_triplet = x86_64-unknown-linux-gnu
-target_triplet = x86_64-unknown-linux-gnu
-am__append_1 = printf.c
-subdir = lib/kernel/tce
-ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
-	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
-	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
-	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \
-	$(top_srcdir)/configure.ac
-am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
-	$(ACLOCAL_M4)
-DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
-mkinstalldirs = $(install_sh) -d
-CONFIG_HEADER = $(top_builddir)/config.h
-CONFIG_CLEAN_FILES =
-CONFIG_CLEAN_VPATH_FILES =
-AM_V_P = $(am__v_P_$(V))
-am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY))
-am__v_P_0 = false
-am__v_P_1 = :
-AM_V_GEN = $(am__v_GEN_$(V))
-am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY))
-am__v_GEN_0 = @echo "  GEN     " $@;
-am__v_GEN_1 = 
-AM_V_at = $(am__v_at_$(V))
-am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY))
-am__v_at_0 = @
-am__v_at_1 = 
-SOURCES =
-DIST_SOURCES =
-am__can_run_installinfo = \
-  case $$AM_UPDATE_INFO_DIR in \
-    n|no|NO) false;; \
-    *) (install-info --version) >/dev/null 2>&1;; \
-  esac
-am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
-am__vpath_adj = case $$p in \
-    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
-    *) f=$$p;; \
-  esac;
-am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
-am__install_max = 40
-am__nobase_strip_setup = \
-  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
-am__nobase_strip = \
-  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
-am__nobase_list = $(am__nobase_strip_setup); \
-  for p in $$list; do echo "$$p $$p"; done | \
-  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
-  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
-    if (++n[$$2] == $(am__install_max)) \
-      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
-    END { for (dir in files) print dir, files[dir] }'
-am__base_list = \
-  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
-  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
-am__uninstall_files_from_dir = { \
-  test -z "$$files" \
-    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
-    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
-         $(am__cd) "$$dir" && rm -f $$files; }; \
-  }
-am__installdirs = "$(DESTDIR)$(pkgdatadir)"
-DATA = $(nodist_pkgdata_DATA)
-am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
-am__DIST_COMMON = $(srcdir)/../rules.mk $(srcdir)/../sources.mk \
-	$(srcdir)/Makefile.in
-DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
-ACLOCAL = ${SHELL} /tmp/pocl/config/missing aclocal-1.15
-AMTAR = $${TAR-tar}
-AM_DEFAULT_VERBOSITY = 1
-AR = ar
-AUTOCONF = ${SHELL} /tmp/pocl/config/missing autoconf
-AUTOHEADER = ${SHELL} /tmp/pocl/config/missing autoheader
-AUTOMAKE = ${SHELL} /tmp/pocl/config/missing automake-1.15
-AWK = gawk
-BOOST_CPPFLAGS = 
-BOOST_LDFLAGS = 
-BUILD_TIMESTAMP = 201510261514380399935
-CC = gcc
-CCDEPMODE = depmode=gcc3
-CFLAGS = -g -O2
-CLANG = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang
-CLANGXX = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang++
-CLANGXX_FLAGS = --target=x86_64-unknown-linux-gnu  -DVML_NO_IOSTREAM
-CLFLAGS =  -D__OPENCL_VERSION__=120
-CPP = gcc -E
-CPPFLAGS = 
-CXX = g++
-CXXCPP = g++ -E
-CXXDEPMODE = depmode=gcc3
-CXXFLAGS = -g -O2
-CYGPATH_W = echo
-DEFS = -DHAVE_CONFIG_H
-DEPDIR = .deps
-DLLTOOL = false
-DSYMUTIL = 
-DUMPBIN = 
-ECHO_C = 
-ECHO_N = -n
-ECHO_T = 
-EGREP = /usr/bin/grep -E
-EXEEXT = 
-FGREP = /usr/bin/grep -F
-FORCED_CLFLAGS = -Xclang -ffake-address-space-map -fno-math-errno -fblocks -fno-builtin -fasm -Wno-format
-GLEW_CFLAGS = -I/usr/include/libdrm 
-GLEW_LIBS = -lGLEW -lGLU -lGL 
-GREP = /usr/bin/grep
-HOST = x86_64-unknown-linux-gnu
-HOST_AS_FLAGS = 
-HOST_CLANG_FLAGS =  --target=x86_64-unknown-linux-gnu -march=bdver3 -D_CL_DISABLE_HALF
-HOST_CPU = x86_64
-HOST_LD_FLAGS = -shared -lm
-HOST_LLC_FLAGS = -relocation-model=pic -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver3
-HOST_SIZEOF_DOUBLE = 8
-HOST_SIZEOF_HALF = 2
-HOST_SIZEOF_LONG = 8
-HOST_SIZEOF_VOID_P = 8
-HSAILASM = /opt/HSA/bin/HSAILasm
-HSA_INCLUDES = -I/opt/HSA/include
-HSA_LIBS = -lhsa-runtime64 
-HWLOC_CFLAGS = -I/usr/include/libxml2 
-HWLOC_LIBS = -lhwloc 
-ICD_LD_FLAGS = -Wl,-Bsymbolic
-INSTALL = /usr/bin/install -c
-INSTALL_DATA = ${INSTALL} -m 644
-INSTALL_PROGRAM = ${INSTALL}
-INSTALL_SCRIPT = ${INSTALL}
-INSTALL_STRIP_PROGRAM = $(install_sh) -c -s
-KERNEL_COMPILER_LIB_VERSION = 6:0:0
-LD = /usr/bin/ld -m elf_x86_64
-LDFLAGS =  -L/opt/HSA/lib
-LD_FLAGS_BIN =  
-LIBOBJS = 
-LIBRARY_SUFFIX = .so
-LIBS = 
-LIBSPE_CFLAGS = 
-LIBSPE_LIBS = 
-LIBTOOL = $(SHELL) $(top_builddir)/libtool
-LIB_AGE_VERSION = 5
-LIB_CURRENT_VERSION = 6
-LIB_FIRST_VERSION = 1
-LIB_REVISION_VERSION = 0
-LIB_VERSION = 6:0:5
-LIPO = 
-LLC = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llc
-LLVM_AS = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-as
-LLVM_CONFIG = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-config
-LLVM_CXX_FLAGS = -I/home/LLVM_370_HSAIL_rwdi_NA_rtti/include  -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wnon-virtual-dtor -Wno-comment -std=c++11 -ffunction-sections -fdata-sections -O2 -g -DNDEBUG  -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
-LLVM_LDFLAGS = -L/home/LLVM_370_HSAIL_rwdi_NA_rtti/lib  -lrt -ldl -lcurses -lpthread -lz -lm
-LLVM_LIBS = -lLLVMLTO -lLLVMObjCARCOpts -lLLVMLinker -lLLVMBitWriter -lLLVMTableGen -lLLVMMIRParser -lLLVMDebugInfoPDB -lLLVMOrcJIT -lLLVMIRReader -lLLVMAsmParser -lLLVMHSAILCodeGen -lLLVMHSAILDesc -lLLVMHSAILInfo -lLLVMHSAILAsmPrinter -lLLVMAMDGPUCodeGen -lLLVMAMDGPUAsmParser -lLLVMAMDGPUUtils -lLLVMAMDGPUDesc -lLLVMAMDGPUInfo -lLLVMAMDGPUAsmPrinter -lLLVMX86Disassembler -lLLVMX86AsmParser -lLLVMX86CodeGen -lLLVMSelectionDAG -lLLVMAsmPrinter -lLLVMX86Desc -lLLVMMCDisassembler -lLLVMX86I [...]
-LLVM_LINK = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-link
-LLVM_OPT = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/opt
-LLVM_VERSION = 3.7.0svn
-LN_S = ln -s
-LTDL_LIBS = -lltdl 
-LTLIBOBJS = 
-LT_SYS_LIBRARY_PATH = 
-MAKEINFO = ${SHELL} /tmp/pocl/config/missing makeinfo
-MANIFEST_TOOL = :
-MKDIR_P = /usr/bin/mkdir -p
-NM = /usr/bin/nm -B
-NMEDIT = 
-OBJDUMP = objdump
-OBJEXT = o
-OCL_ICD_CFLAGS = 
-OCL_ICD_LIBS = 
-OCL_KERNEL_ARCH = 
-OCL_KERNEL_TARGET = x86_64-unknown-linux-gnu
-OCL_KERNEL_TARGET_CPU = bdver3
-OCL_TARGETS = host hsail64
-OPENCL_CFLAGS = 
-OPENCL_CMAKE = 
-OPENCL_EXTLIBS = -lOpenCL 
-OPENCL_LIBS = -lOpenCL 
-OPT = /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/opt
-OTOOL = 
-OTOOL64 = 
-PACKAGE = pocl
-PACKAGE_BUGREPORT = pocl-devel at lists.sourceforge.net
-PACKAGE_NAME = pocl
-PACKAGE_STRING = pocl 0.12
-PACKAGE_TARNAME = pocl
-PACKAGE_URL = 
-PACKAGE_VERSION = 0.12
-PATH_SEPARATOR = :
-PKG_CONFIG = /usr/bin/pkg-config
-PKG_CONFIG_LIBDIR = 
-PKG_CONFIG_PATH = 
-POAT_TESTSUITES =  hsa
-POCL_DEVICE_ADDRESS_BITS = 64
-PTHREAD_CC = gcc
-PTHREAD_CFLAGS = -pthread
-PTHREAD_LIBS = 
-RANLIB = ranlib
-SDL_CFLAGS = 
-SDL_LIBS = 
-SED = /usr/bin/sed
-SET_MAKE = 
-SHELL = /bin/sh
-STRIP = strip
-TARGET = x86_64-unknown-linux-gnu
-TARGET_CLANG_FLAGS = 
-TARGET_CPU = x86_64
-TARGET_LLC_FLAGS = 
-TARGET_SIZEOF_DOUBLE = 8
-TARGET_SIZEOF_HALF = 2
-TARGET_SIZEOF_LONG = 8
-TARGET_SIZEOF_VOID_P = 8
-TCECC = 
-TCEMC_AVAILABLE = 
-TCE_AVAILABLE = 
-TCE_CONFIG = 
-VERSION = 0.12
-abs_builddir = /tmp/pocl/lib/kernel/tce
-abs_srcdir = /tmp/pocl/lib/kernel/tce
-abs_top_builddir = /tmp/pocl
-abs_top_srcdir = /tmp/pocl
-ac_ct_AR = ar
-ac_ct_CC = gcc
-ac_ct_CXX = g++
-ac_ct_DUMPBIN = 
-acx_pthread_config = 
-am__include = include
-am__leading_dot = .
-am__quote = 
-am__tar = $${TAR-tar} chof - "$$tardir"
-am__untar = $${TAR-tar} xf -
-bindir = ${exec_prefix}/bin
-build = x86_64-unknown-linux-gnu
-build_alias = 
-build_cpu = x86_64
-build_os = linux-gnu
-build_vendor = unknown
-builddir = .
-datadir = ${datarootdir}
-datarootdir = ${prefix}/share
-docdir = ${datarootdir}/doc/${PACKAGE_TARNAME}
-dvidir = ${docdir}
-exec_prefix = ${prefix}
-host = x86_64-unknown-linux-gnu
-host_alias = 
-host_cpu = x86_64
-host_os = linux-gnu
-host_vendor = unknown
-htmldir = ${docdir}
-includedir = ${prefix}/include
-infodir = ${datarootdir}/info
-install_sh = ${SHELL} /tmp/pocl/config/install-sh
-libdir = ${exec_prefix}/lib
-libexecdir = ${exec_prefix}/libexec
-localedir = ${datarootdir}/locale
-localstatedir = ${prefix}/var
-mandir = ${datarootdir}/man
-mkdir_p = $(MKDIR_P)
-oldincludedir = /usr/include
-pdfdir = ${docdir}
-prefix = /usr/local
-program_transform_name = s,x,x,
-psdir = ${docdir}
-sbindir = ${exec_prefix}/sbin
-sharedstatedir = ${prefix}/com
-srcdir = .
-sysconfdir = /etc
-target = x86_64-unknown-linux-gnu
-target_alias = 
-target_cpu = x86_64
-target_os = linux-gnu
-target_vendor = unknown
-top_build_prefix = ../../../
-top_builddir = ../../..
-top_srcdir = ../../..
-KERNEL_TARGET = tce-tut-llvm
-
-# Use TARGET flags:
-CLANG_FLAGS =  -Xclang -ffake-address-space-map -Xclang -menable-no-nans -emit-llvm -ffp-contract=off -target tce-tut-llvm  -isystem `tce-config --prefix`/tce-llvm/include
-LLC_FLAGS = 
-LD_FLAGS = @TARGET_LD_FLAGS@
-KERNEL_BC = kernel-${KERNEL_TARGET}.bc
-nodist_pkgdata_DATA = ${KERNEL_BC}
-
-# The standard list of kernel sources can be modified with
-# LKERNEL_SRCS_EXCLUDE, which removes files from the standard list,
-# and LKERNEL_SRCS_EXTRA, which adds extra files to the source list.
-LKERNEL_SRCS = \
-	$(filter-out ${LKERNEL_SRCS_EXCLUDE}, ${LKERNEL_SRCS_DEFAULT})	\
-	${LKERNEL_SRCS_EXTRA}
-
-OBJ = $(LKERNEL_SRCS:%=%.bc)
-CLEANFILES = kernel-${KERNEL_TARGET}.bc ${OBJ}
-LKERNEL_HDRS = image.h pocl_image_rw_utils.h templates.h
-LKERNEL_SRCS_DEFAULT = abs.cl abs_diff.cl acos.cl acosh.cl acospi.cl \
-	add_sat.cl all.cl any.cl as_type.cl asin.cl asinh.cl asinpi.cl \
-	async_work_group_copy.cl atan.cl atan2.cl atan2pi.cl atanh.cl \
-	atanpi.cl atomics.cl barrier.ll bitselect.cl cbrt.cl ceil.cl \
-	clamp.cl clamp_int.cl clz.cl convert_type.cl copysign.cl \
-	cos.cl cosh.cl cospi.cl cross.cl degrees.cl distance.cl \
-	divide.cl dot.cl erf.cl erfc.cl exp.cl exp10.cl exp2.cl \
-	expm1.cl fabs.cl fast_distance.cl fast_length.cl \
-	fast_normalize.cl fdim.cl floor.cl fma.cl fmax.cl fmin.cl \
-	fmod.cl fract.cl get_global_id.c get_global_offset.c \
-	get_global_size.c get_group_id.c get_image_depth.cl \
-	get_image_height.cl get_image_width.cl get_image_dim.cl \
-	get_local_id.c get_local_size.c get_num_groups.c \
-	get_work_dim.c hadd.cl hypot.cl ilogb.cl isequal.cl \
-	isfinite.cl isgreater.cl isgreaterequal.cl isinf.cl isless.cl \
-	islessequal.cl islessgreater.cl isnan.cl isnormal.cl \
-	isnotequal.cl isordered.cl isunordered.cl ldexp.cl length.cl \
-	lgamma.cl log.cl log10.cl log1p.cl log2.cl logb.cl mad.cl \
-	mad24.cl mad_hi.cl mad_sat.cl max.cl max_i.cl maxmag.cl min.cl \
-	min_i.cl minmag.cl mix.cl mul24.cl mul_hi.cl nan.cl \
-	native_cos.cl native_exp.cl native_exp10.cl native_exp2.cl \
-	native_log.cl native_log10.cl native_log2.cl native_powr.cl \
-	native_recip.cl native_rsqrt.cl native_sin.cl native_sqrt.cl \
-	native_tan.cl nextafter.cl normalize.cl popcount.cl pow.cl \
-	pown.cl powr.cl radians.cl read_image.cl recip.cl remainder.cl \
-	rhadd.cl rint.cl rootn.cl rotate.cl round.cl rsqrt.cl \
-	select.cl shuffle.cl sign.cl signbit.cl sin.cl sincos.cl \
-	sinh.cl sinpi.cl smoothstep.cl sqrt.cl step.cl sub_sat.cl \
-	tan.cl tanh.cl tanpi.cl tgamma.cl trunc.cl upsample.cl \
-	vload.cl vload_half.cl vstore.cl vstore_half.cl \
-	wait_group_events.cl write_image.cl $(am__append_1)
-
-# Use the libc (newlib) printf() for now because the pocl's
-# fails in TCE code gen:
-# SplitVectorResult #0: 0x24c5ae0: v8i16,ch = vaarg 0x20628e8, 0x2423ed0, 0x24c59e0, 0x24c4fe0 [ORD=223] [ID=0]
-# LLVM ERROR: Do not know how to split the result of this operator!
-LKERNEL_SRCS_EXCLUDE = printf.c 
-LKERNEL_SRCS_EXTRA = printf_constant.c
-EXTRA_DIST = CMakeLists.txt
-all: all-am
-
-.SUFFIXES:
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(srcdir)/../rules.mk $(srcdir)/../sources.mk $(am__configure_deps)
-	@for dep in $?; do \
-	  case '$(am__configure_deps)' in \
-	    *$$dep*) \
-	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
-	        && { if test -f $@; then exit 0; else break; fi; }; \
-	      exit 1;; \
-	  esac; \
-	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign lib/kernel/tce/Makefile'; \
-	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign lib/kernel/tce/Makefile
-Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
-	@case '$?' in \
-	  *config.status*) \
-	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
-	  *) \
-	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
-	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
-	esac;
-$(srcdir)/../rules.mk $(srcdir)/../sources.mk $(am__empty):
-
-$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-
-$(top_srcdir)/configure:  $(am__configure_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(am__aclocal_m4_deps):
-
-mostlyclean-libtool:
-	-rm -f *.lo
-
-clean-libtool:
-	-rm -rf .libs _libs
-install-nodist_pkgdataDATA: $(nodist_pkgdata_DATA)
-	@$(NORMAL_INSTALL)
-	@list='$(nodist_pkgdata_DATA)'; test -n "$(pkgdatadir)" || list=; \
-	if test -n "$$list"; then \
-	  echo " $(MKDIR_P) '$(DESTDIR)$(pkgdatadir)'"; \
-	  $(MKDIR_P) "$(DESTDIR)$(pkgdatadir)" || exit 1; \
-	fi; \
-	for p in $$list; do \
-	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
-	  echo "$$d$$p"; \
-	done | $(am__base_list) | \
-	while read files; do \
-	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(pkgdatadir)'"; \
-	  $(INSTALL_DATA) $$files "$(DESTDIR)$(pkgdatadir)" || exit $$?; \
-	done
-
-uninstall-nodist_pkgdataDATA:
-	@$(NORMAL_UNINSTALL)
-	@list='$(nodist_pkgdata_DATA)'; test -n "$(pkgdatadir)" || list=; \
-	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
-	dir='$(DESTDIR)$(pkgdatadir)'; $(am__uninstall_files_from_dir)
-tags TAGS:
-
-ctags CTAGS:
-
-cscope cscopelist:
-
-
-distdir: $(DISTFILES)
-	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	list='$(DISTFILES)'; \
-	  dist_files=`for file in $$list; do echo $$file; done | \
-	  sed -e "s|^$$srcdirstrip/||;t" \
-	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
-	case $$dist_files in \
-	  */*) $(MKDIR_P) `echo "$$dist_files" | \
-			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
-			   sort -u` ;; \
-	esac; \
-	for file in $$dist_files; do \
-	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
-	  if test -d $$d/$$file; then \
-	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
-	    if test -d "$(distdir)/$$file"; then \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
-	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
-	  else \
-	    test -f "$(distdir)/$$file" \
-	    || cp -p $$d/$$file "$(distdir)/$$file" \
-	    || exit 1; \
-	  fi; \
-	done
-check-am: all-am
-check: check-am
-all-am: Makefile $(DATA)
-installdirs:
-	for dir in "$(DESTDIR)$(pkgdatadir)"; do \
-	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
-	done
-install: install-am
-install-exec: install-exec-am
-install-data: install-data-am
-uninstall: uninstall-am
-
-install-am: all-am
-	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
-
-installcheck: installcheck-am
-install-strip:
-	if test -z '$(STRIP)'; then \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	      install; \
-	else \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
-	fi
-mostlyclean-generic:
-
-clean-generic:
-	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
-
-distclean-generic:
-	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
-
-maintainer-clean-generic:
-	@echo "This command is intended for maintainers to use"
-	@echo "it deletes files that may require special tools to rebuild."
-clean: clean-am
-
-clean-am: clean-generic clean-libtool mostlyclean-am
-
-distclean: distclean-am
-	-rm -f Makefile
-distclean-am: clean-am distclean-generic
-
-dvi: dvi-am
-
-dvi-am:
-
-html: html-am
-
-html-am:
-
-info: info-am
-
-info-am:
-
-install-data-am: install-nodist_pkgdataDATA
-
-install-dvi: install-dvi-am
-
-install-dvi-am:
-
-install-exec-am:
-
-install-html: install-html-am
-
-install-html-am:
-
-install-info: install-info-am
-
-install-info-am:
-
-install-man:
-
-install-pdf: install-pdf-am
-
-install-pdf-am:
-
-install-ps: install-ps-am
-
-install-ps-am:
-
-installcheck-am:
-
-maintainer-clean: maintainer-clean-am
-	-rm -f Makefile
-maintainer-clean-am: distclean-am maintainer-clean-generic
-
-mostlyclean: mostlyclean-am
-
-mostlyclean-am: mostlyclean-generic mostlyclean-libtool
-
-pdf: pdf-am
-
-pdf-am:
-
-ps: ps-am
-
-ps-am:
-
-uninstall-am: uninstall-nodist_pkgdataDATA
-
-.MAKE: install-am install-strip
-
-.PHONY: all all-am check check-am clean clean-generic clean-libtool \
-	cscopelist-am ctags-am distclean distclean-generic \
-	distclean-libtool distdir dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am install-dvi \
-	install-dvi-am install-exec install-exec-am install-html \
-	install-html-am install-info install-info-am install-man \
-	install-nodist_pkgdataDATA install-pdf install-pdf-am \
-	install-ps install-ps-am install-strip installcheck \
-	installcheck-am installdirs maintainer-clean \
-	maintainer-clean-generic mostlyclean mostlyclean-generic \
-	mostlyclean-libtool pdf pdf-am ps ps-am tags-am uninstall \
-	uninstall-am uninstall-nodist_pkgdataDATA
-
-.PRECIOUS: Makefile
-
-
-all: ${KERNEL_BC}
-
-vpath %.c  ../../../lib/kernel
-vpath %.cc ../../../lib/kernel
-vpath %.cl ../../../lib/kernel
-vpath %.ll ../../../lib/kernel
-
-# Generate a precompiled header for the built-in function
-# declarations, in case supported by the target.
-
-# Note: the precompiled header must be compiled with the same features
-# as the kernels will be. That is, use exactly the same frontend
-# feature switches. Otherwise it will fail when compiling the kernel
-# against the precompiled header.
-_kernel.h.pch: ../../../include/${TARGET_DIR}/types.h ../../../include/_kernel.h
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang -Xclang -ffake-address-space-map -fno-math-errno -fblocks -fno-builtin -fasm -Wno-format  -D__OPENCL_VERSION__=120 -Xclang -ffake-address-space-map -c -target ${KERNEL_TARGET} -x cl \
-	-include ../../../include/${TARGET_DIR}/types.h \
-	-Xclang -emit-pch ../../../include/_kernel.h -o _kernel.h.pch 
-
-# Rules to compile the different kernel library source file types into
-# LLVM bitcode
-%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
-	mkdir -p ${dir $@}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang ${CLANG_FLAGS} ${CLFLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $< 
-%.cc.bc: %.cc ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
-	mkdir -p ${dir $@}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang++ ${CLANG_FLAGS} ${CLANGXX_FLAGS} -c -o $@ $< -include ${abs_top_srcdir}/include/pocl_features.h
-%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
-	mkdir -p ${dir $@}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/clang ${CLANG_FLAGS} -x cl ${CLFLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
-%.ll.bc: %.ll
-	mkdir -p ${dir $@}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-as -o $@ $<
-
-# Optimize the bitcode library to speed up optimization times for the
-# OpenCL kernels
-${KERNEL_BC}: ${OBJ}
-	/home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/llvm-link $^ -o - | /home/LLVM_370_HSAIL_rwdi_NA_rtti/bin/opt ${LLC_FLAGS} ${KERNEL_LIB_OPT_FLAGS} -O3 -fp-contract=off -o $@
-
-# vim: set noexpandtab ts=8:
-
-# Tell versions [3.59,3.63) of GNU make to not export all variables.
-# Otherwise a system limit (for SysV at least) may be exceeded.
-.NOEXPORT:
diff --git a/lib/kernel/tce/Makefile.am b/lib/kernel/tce/Makefile.am
index 0fe940e..b459844 100644
--- a/lib/kernel/tce/Makefile.am
+++ b/lib/kernel/tce/Makefile.am
@@ -28,6 +28,7 @@ KERNEL_TARGET = tce-tut-llvm
 CLANG_FLAGS = @TARGET_CLANG_FLAGS@ -Xclang -ffake-address-space-map -Xclang -menable-no-nans -emit-llvm -ffp-contract=off -target tce-tut-llvm  -isystem `tce-config --prefix`/tce-llvm/include
 LLC_FLAGS   = @TARGET_LLC_FLAGS@
 LD_FLAGS    = @TARGET_LD_FLAGS@
+DEVICE_CL_FLAGS = -D__OPENCL_VERSION__=200 @TCE_DEVICE_EXTENSION_DEFINES@
 
 include ../rules.mk
 include ../sources.mk
@@ -36,11 +37,8 @@ include ../sources.mk
 # fails in TCE code gen:
 # SplitVectorResult #0: 0x24c5ae0: v8i16,ch = vaarg 0x20628e8, 0x2423ed0, 0x24c59e0, 0x24c4fe0 [ORD=223] [ID=0]
 # LLVM ERROR: Do not know how to split the result of this operator!
-
-if NEW_PRINTF_WORKS
-LKERNEL_SRCS_EXCLUDE = printf.c 
+LKERNEL_SRCS_EXCLUDE = printf.c svm_atomics_host.cl atomics.cl
 LKERNEL_SRCS_EXTRA = printf_constant.c
-endif
 
 EXTRA_DIST = CMakeLists.txt
 
diff --git a/lib/kernel/tce/Makefile.in b/lib/kernel/tce/Makefile.in
index f9035e9..0057614 100644
--- a/lib/kernel/tce/Makefile.in
+++ b/lib/kernel/tce/Makefile.in
@@ -65,6 +65,7 @@
 # CLANG_FLAGS
 # LLC_FLAGS
 # LD_FLAGS
+# DEVICE_CL_FLAGS
 
 # sources.mk - a list of all kernel source files
 # 
@@ -164,7 +165,6 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
- at NEW_PRINTF_WORKS_TRUE@am__append_1 = printf.c
 subdir = lib/kernel/tce
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
@@ -275,6 +275,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -282,6 +283,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -299,8 +301,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -376,6 +376,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -441,6 +442,7 @@ KERNEL_TARGET = tce-tut-llvm
 CLANG_FLAGS = @TARGET_CLANG_FLAGS@ -Xclang -ffake-address-space-map -Xclang -menable-no-nans -emit-llvm -ffp-contract=off -target tce-tut-llvm  -isystem `tce-config --prefix`/tce-llvm/include
 LLC_FLAGS = @TARGET_LLC_FLAGS@
 LD_FLAGS = @TARGET_LD_FLAGS@
+DEVICE_CL_FLAGS = -D__OPENCL_VERSION__=200 @TCE_DEVICE_EXTENSION_DEFINES@
 KERNEL_BC = kernel-${KERNEL_TARGET}.bc
 nodist_pkgdata_DATA = ${KERNEL_BC}
 
@@ -449,49 +451,177 @@ nodist_pkgdata_DATA = ${KERNEL_BC}
 # and LKERNEL_SRCS_EXTRA, which adds extra files to the source list.
 LKERNEL_SRCS = \
 	$(filter-out ${LKERNEL_SRCS_EXCLUDE}, ${LKERNEL_SRCS_DEFAULT})	\
-	${LKERNEL_SRCS_EXTRA}
+	${LKERNEL_SRCS_EXTRA} ${LKERNEL_SRCS_EXTRA2}
 
 OBJ = $(LKERNEL_SRCS:%=%.bc)
 CLEANFILES = kernel-${KERNEL_TARGET}.bc ${OBJ}
 LKERNEL_HDRS = image.h pocl_image_rw_utils.h templates.h
-LKERNEL_SRCS_DEFAULT = abs.cl abs_diff.cl acos.cl acosh.cl acospi.cl \
-	add_sat.cl all.cl any.cl as_type.cl asin.cl asinh.cl asinpi.cl \
-	async_work_group_copy.cl atan.cl atan2.cl atan2pi.cl atanh.cl \
-	atanpi.cl atomics.cl barrier.ll bitselect.cl cbrt.cl ceil.cl \
-	clamp.cl clamp_int.cl clz.cl convert_type.cl copysign.cl \
-	cos.cl cosh.cl cospi.cl cross.cl degrees.cl distance.cl \
-	divide.cl dot.cl erf.cl erfc.cl exp.cl exp10.cl exp2.cl \
-	expm1.cl fabs.cl fast_distance.cl fast_length.cl \
-	fast_normalize.cl fdim.cl floor.cl fma.cl fmax.cl fmin.cl \
-	fmod.cl fract.cl get_global_id.c get_global_offset.c \
-	get_global_size.c get_group_id.c get_image_depth.cl \
-	get_image_height.cl get_image_width.cl get_image_dim.cl \
-	get_local_id.c get_local_size.c get_num_groups.c \
-	get_work_dim.c hadd.cl hypot.cl ilogb.cl isequal.cl \
-	isfinite.cl isgreater.cl isgreaterequal.cl isinf.cl isless.cl \
-	islessequal.cl islessgreater.cl isnan.cl isnormal.cl \
-	isnotequal.cl isordered.cl isunordered.cl ldexp.cl length.cl \
-	lgamma.cl log.cl log10.cl log1p.cl log2.cl logb.cl mad.cl \
-	mad24.cl mad_hi.cl mad_sat.cl max.cl max_i.cl maxmag.cl min.cl \
-	min_i.cl minmag.cl mix.cl mul24.cl mul_hi.cl nan.cl \
-	native_cos.cl native_exp.cl native_exp10.cl native_exp2.cl \
-	native_log.cl native_log10.cl native_log2.cl native_powr.cl \
-	native_recip.cl native_rsqrt.cl native_sin.cl native_sqrt.cl \
-	native_tan.cl nextafter.cl normalize.cl popcount.cl pow.cl \
-	pown.cl powr.cl radians.cl read_image.cl recip.cl remainder.cl \
-	rhadd.cl rint.cl rootn.cl rotate.cl round.cl rsqrt.cl \
-	select.cl shuffle.cl sign.cl signbit.cl sin.cl sincos.cl \
-	sinh.cl sinpi.cl smoothstep.cl sqrt.cl step.cl sub_sat.cl \
-	tan.cl tanh.cl tanpi.cl tgamma.cl trunc.cl upsample.cl \
-	vload.cl vload_half.cl vstore.cl vstore_half.cl \
-	wait_group_events.cl write_image.cl $(am__append_1)
+LKERNEL_SRCS_DEFAULT = \
+	abs.cl					\
+	abs_diff.cl				\
+	acos.cl					\
+	acosh.cl				\
+	acospi.cl				\
+	add_sat.cl				\
+	all.cl					\
+	any.cl					\
+	as_type.cl				\
+	asin.cl					\
+	asinh.cl				\
+	asinpi.cl				\
+	async_work_group_copy.cl		\
+	atan.cl					\
+	atan2.cl				\
+	atan2pi.cl				\
+	atanh.cl				\
+	atanpi.cl				\
+	atomics.cl				\
+	barrier.ll				\
+	bitselect.cl				\
+	cbrt.cl					\
+	ceil.cl					\
+	clamp.cl				\
+	clamp_int.cl				\
+	clz.cl					\
+	convert_type.cl				\
+	copysign.cl				\
+	cos.cl					\
+	cosh.cl					\
+	cospi.cl				\
+	cross.cl				\
+	degrees.cl				\
+	distance.cl				\
+	divide.cl				\
+	dot.cl					\
+	erf.cl					\
+	erfc.cl					\
+	exp.cl					\
+	exp10.cl				\
+	exp2.cl					\
+	expm1.cl				\
+	fabs.cl					\
+	fast_distance.cl			\
+	fast_length.cl				\
+	fast_normalize.cl			\
+	fdim.cl					\
+	floor.cl				\
+	fma.cl					\
+	fmax.cl					\
+	fmin.cl					\
+	fmod.cl					\
+	fract.cl				\
+	get_global_id.c				\
+	get_global_offset.c			\
+	get_global_size.c			\
+	get_group_id.c				\
+	get_image_depth.cl			\
+	get_image_height.cl			\
+	get_image_width.cl			\
+	get_image_dim.cl			\
+	get_local_id.c				\
+	get_local_size.c			\
+	get_num_groups.c			\
+	get_work_dim.c				\
+	hadd.cl					\
+	hypot.cl				\
+	ilogb.cl				\
+	isequal.cl				\
+	isfinite.cl				\
+	isgreater.cl				\
+	isgreaterequal.cl			\
+	isinf.cl				\
+	isless.cl				\
+	islessequal.cl				\
+	islessgreater.cl			\
+	isnan.cl				\
+	isnormal.cl				\
+	isnotequal.cl				\
+	isordered.cl				\
+	isunordered.cl				\
+	ldexp.cl				\
+	length.cl				\
+	lgamma.cl				\
+	log.cl					\
+	log10.cl				\
+	log1p.cl				\
+	log2.cl					\
+	logb.cl					\
+	mad.cl					\
+	mad24.cl				\
+	mad_hi.cl				\
+	mad_sat.cl				\
+	max.cl					\
+	max_i.cl				\
+	maxmag.cl				\
+	min.cl					\
+	min_i.cl				\
+	minmag.cl				\
+	mix.cl					\
+	mul24.cl				\
+	mul_hi.cl				\
+	nan.cl					\
+	native_cos.cl				\
+	native_exp.cl				\
+	native_exp10.cl				\
+	native_exp2.cl				\
+	native_log.cl				\
+	native_log10.cl				\
+	native_log2.cl				\
+	native_powr.cl				\
+	native_recip.cl				\
+	native_rsqrt.cl				\
+	native_sin.cl				\
+	native_sqrt.cl				\
+	native_tan.cl				\
+	nextafter.cl				\
+	normalize.cl				\
+	popcount.cl				\
+	pow.cl					\
+	pown.cl					\
+	powr.cl					\
+	printf.c                                \
+	radians.cl				\
+	read_image.cl				\
+	recip.cl				\
+	remainder.cl				\
+	rhadd.cl				\
+	rint.cl					\
+	rootn.cl				\
+	rotate.cl				\
+	round.cl				\
+	rsqrt.cl				\
+	select.cl				\
+	shuffle.cl				\
+	sign.cl					\
+	signbit.cl				\
+	sin.cl					\
+	sincos.cl				\
+	sinh.cl					\
+	sinpi.cl				\
+	smoothstep.cl				\
+	sqrt.cl					\
+	step.cl					\
+	sub_sat.cl				\
+	tan.cl					\
+	tanh.cl					\
+	tanpi.cl				\
+	tgamma.cl				\
+	trunc.cl				\
+	upsample.cl				\
+	vload.cl				\
+	vload_half.cl				\
+	vstore.cl				\
+	vstore_half.cl				\
+	wait_group_events.cl			\
+	write_image.cl
+
 
 # Use the libc (newlib) printf() for now because the pocl's
 # fails in TCE code gen:
 # SplitVectorResult #0: 0x24c5ae0: v8i16,ch = vaarg 0x20628e8, 0x2423ed0, 0x24c59e0, 0x24c4fe0 [ORD=223] [ID=0]
 # LLVM ERROR: Do not know how to split the result of this operator!
-@NEW_PRINTF_WORKS_TRUE@LKERNEL_SRCS_EXCLUDE = printf.c 
-@NEW_PRINTF_WORKS_TRUE@LKERNEL_SRCS_EXTRA = printf_constant.c
+LKERNEL_SRCS_EXCLUDE = printf.c svm_atomics_host.cl atomics.cl
+LKERNEL_SRCS_EXTRA = printf_constant.c
 EXTRA_DIST = CMakeLists.txt
 all: all-am
 
@@ -714,10 +844,10 @@ uninstall-am: uninstall-nodist_pkgdataDATA
 
 all: ${KERNEL_BC}
 
-vpath %.c  @top_srcdir@/lib/kernel
-vpath %.cc @top_srcdir@/lib/kernel
-vpath %.cl @top_srcdir@/lib/kernel
-vpath %.ll @top_srcdir@/lib/kernel
+vpath %.c @srcdir@ @top_srcdir@/lib/kernel
+vpath %.cc @srcdir@ @top_srcdir@/lib/kernel
+vpath %.cl @srcdir@ @top_srcdir@/lib/kernel
+vpath %.ll @srcdir@ @top_srcdir@/lib/kernel
 
 # Generate a precompiled header for the built-in function
 # declarations, in case supported by the target.
@@ -733,15 +863,15 @@ _kernel.h.pch: @top_builddir@/include/${TARGET_DIR}/types.h @top_srcdir@/include
 
 # Rules to compile the different kernel library source file types into
 # LLVM bitcode
-%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
+%.c.bc: %.c ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/_kernel_c.h ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} ${CLFLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $< 
-%.cc.bc: %.cc ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
+	@CLANG@ ${CLANG_FLAGS} ${CLFLAGS} ${DEVICE_CL_FLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $<
+%.cc.bc: %.cc  ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANGXX@ ${CLANG_FLAGS} ${CLANGXX_FLAGS} -c -o $@ $< -include ${abs_top_srcdir}/include/pocl_features.h
-%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${abs_top_srcdir}/include/pocl_features.h ${LKERNEL_HDRS_EXTRA}
+	@CLANGXX@ ${CLANG_FLAGS} ${CLANGXX_FLAGS} ${DEVICE_CL_FLAGS} -c -o $@ $<
+%.cl.bc: %.cl ${abs_top_srcdir}/include/_kernel.h ${abs_top_srcdir}/include/_kernel_c.h ${abs_top_srcdir}/include/pocl_types.h ${LKERNEL_HDRS_EXTRA}
 	mkdir -p ${dir $@}
-	@CLANG@ ${CLANG_FLAGS} -x cl ${CLFLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
+	@CLANG@ ${CLANG_FLAGS} -x cl ${CLFLAGS} ${DEVICE_CL_FLAGS} -fsigned-char -c -o $@ $< -include ${abs_top_srcdir}/include/_kernel.h
 %.ll.bc: %.ll
 	mkdir -p ${dir $@}
 	@LLVM_AS@ -o $@ $<
diff --git a/lib/kernel/vecmathlib/floatbuiltins.h b/lib/kernel/vecmathlib/floatbuiltins.h
index ee076a2..a7dd6f1 100644
--- a/lib/kernel/vecmathlib/floatbuiltins.h
+++ b/lib/kernel/vecmathlib/floatbuiltins.h
@@ -6,323 +6,383 @@
 #if defined __clang__
 
 namespace vecmathlib {
-  
-  inline char builtin_abs(char x) { return __builtin_abs(x); }
-  inline short builtin_abs(short x) { return __builtin_abs(x); }
-  inline int builtin_abs(int x) { return __builtin_abs(x); }
-  inline long builtin_abs(long x) { return __builtin_labs(x); }
+
+inline char builtin_abs(char x) { return __builtin_abs(x); }
+inline short builtin_abs(short x) { return __builtin_abs(x); }
+inline int builtin_abs(int x) { return __builtin_abs(x); }
+inline long builtin_abs(long x) { return __builtin_labs(x); }
 #if __SIZEOF_LONG_LONG__
-  inline long long builtin_abs(long long x) { return __builtin_llabs(x); }
+inline long long builtin_abs(long long x) { return __builtin_llabs(x); }
 #endif
-  
-  inline unsigned char builtin_clz(unsigned char x) { return __builtin_clzs(x) - CHAR_BIT * (sizeof(unsigned short) - sizeof(unsigned char)); }
-  inline unsigned short builtin_clz(unsigned short x) { return __builtin_clzs(x); }
-  inline unsigned int builtin_clz(unsigned int x) { return __builtin_clz(x); }
-  inline unsigned long builtin_clz(unsigned long x) { return __builtin_clzl(x); }
+
+inline unsigned char builtin_clz(unsigned char x) {
+  return __builtin_clzs(x) -
+         CHAR_BIT * (sizeof(unsigned short) - sizeof(unsigned char));
+}
+inline unsigned short builtin_clz(unsigned short x) {
+  return __builtin_clzs(x);
+}
+inline unsigned int builtin_clz(unsigned int x) { return __builtin_clz(x); }
+inline unsigned long builtin_clz(unsigned long x) { return __builtin_clzl(x); }
 #if __SIZEOF_LONG_LONG__
-  inline unsigned long long builtin_clz(unsigned long long x) { return __builtin_clzll(x); }
+inline unsigned long long builtin_clz(unsigned long long x) {
+  return __builtin_clzll(x);
+}
 #endif
-  
-  inline unsigned char builtin_popcount(unsigned char x) { return __builtin_popcount(x); }
-  inline unsigned short builtin_popcount(unsigned short x) { return __builtin_popcount(x); }
-  inline unsigned int builtin_popcount(unsigned int x) { return __builtin_popcount(x); }
-  inline unsigned long builtin_popcount(unsigned long x) { return __builtin_popcountl(x); }
+
+inline unsigned char builtin_popcount(unsigned char x) {
+  return __builtin_popcount(x);
+}
+inline unsigned short builtin_popcount(unsigned short x) {
+  return __builtin_popcount(x);
+}
+inline unsigned int builtin_popcount(unsigned int x) {
+  return __builtin_popcount(x);
+}
+inline unsigned long builtin_popcount(unsigned long x) {
+  return __builtin_popcountl(x);
+}
 #if __SIZEOF_LONG_LONG__
-  inline unsigned long long builtin_popcount(unsigned long long x) { return __builtin_popcountll(x); }
+inline unsigned long long builtin_popcount(unsigned long long x) {
+  return __builtin_popcountll(x);
+}
 #endif
-  
-  
-  
-  inline float builtin_acos(float x) { return __builtin_acosf(x); }
-  inline double builtin_acos(double x) { return __builtin_acos(x); }
+
+inline float builtin_acos(float x) { return __builtin_acosf(x); }
+inline double builtin_acos(double x) { return __builtin_acos(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_acos(long double x) { return __builtin_acosl(x); }
+inline long double builtin_acos(long double x) { return __builtin_acosl(x); }
 #endif
-  
-  inline float builtin_acosh(float x) { return __builtin_acoshf(x); }
-  inline double builtin_acosh(double x) { return __builtin_acosh(x); }
+
+inline float builtin_acosh(float x) { return __builtin_acoshf(x); }
+inline double builtin_acosh(double x) { return __builtin_acosh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_acosh(long double x) { return __builtin_acoshl(x); }
+inline long double builtin_acosh(long double x) { return __builtin_acoshl(x); }
 #endif
-  
-  inline float builtin_asin(float x) { return __builtin_asinf(x); }
-  inline double builtin_asin(double x) { return __builtin_asin(x); }
+
+inline float builtin_asin(float x) { return __builtin_asinf(x); }
+inline double builtin_asin(double x) { return __builtin_asin(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_asin(long double x) { return __builtin_asinl(x); }
+inline long double builtin_asin(long double x) { return __builtin_asinl(x); }
 #endif
-  
-  inline float builtin_asinh(float x) { return __builtin_asinhf(x); }
-  inline double builtin_asinh(double x) { return __builtin_asinh(x); }
+
+inline float builtin_asinh(float x) { return __builtin_asinhf(x); }
+inline double builtin_asinh(double x) { return __builtin_asinh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_asinh(long double x) { return __builtin_asinhl(x); }
+inline long double builtin_asinh(long double x) { return __builtin_asinhl(x); }
 #endif
-  
-  inline float builtin_atan(float x) { return __builtin_atanf(x); }
-  inline double builtin_atan(double x) { return __builtin_atan(x); }
+
+inline float builtin_atan(float x) { return __builtin_atanf(x); }
+inline double builtin_atan(double x) { return __builtin_atan(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_atan(long double x) { return __builtin_atanl(x); }
+inline long double builtin_atan(long double x) { return __builtin_atanl(x); }
 #endif
-  
-  inline float builtin_atan2(float x, float y) { return __builtin_atan2f(x, y); }
-  inline double builtin_atan2(double x, double y) { return __builtin_atan2(x, y); }
+
+inline float builtin_atan2(float x, float y) { return __builtin_atan2f(x, y); }
+inline double builtin_atan2(double x, double y) {
+  return __builtin_atan2(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_atan2(long double x, long double y) { return __builtin_atan2l(x, y); }
+inline long double builtin_atan2(long double x, long double y) {
+  return __builtin_atan2l(x, y);
+}
 #endif
-  
-  inline float builtin_atanh(float x) { return __builtin_atanhf(x); }
-  inline double builtin_atanh(double x) { return __builtin_atanh(x); }
+
+inline float builtin_atanh(float x) { return __builtin_atanhf(x); }
+inline double builtin_atanh(double x) { return __builtin_atanh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_atanh(long double x) { return __builtin_atanhl(x); }
+inline long double builtin_atanh(long double x) { return __builtin_atanhl(x); }
 #endif
-  
-  inline float builtin_cbrt(float x) { return __builtin_cbrtf(x); }
-  inline double builtin_cbrt(double x) { return __builtin_cbrt(x); }
+
+inline float builtin_cbrt(float x) { return __builtin_cbrtf(x); }
+inline double builtin_cbrt(double x) { return __builtin_cbrt(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_cbrt(long double x) { return __builtin_cbrtl(x); }
+inline long double builtin_cbrt(long double x) { return __builtin_cbrtl(x); }
 #endif
-  
-  inline float builtin_ceil(float x) { return __builtin_ceilf(x); }
-  inline double builtin_ceil(double x) { return __builtin_ceil(x); }
+
+inline float builtin_ceil(float x) { return __builtin_ceilf(x); }
+inline double builtin_ceil(double x) { return __builtin_ceil(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_ceil(long double x) { return __builtin_ceill(x); }
+inline long double builtin_ceil(long double x) { return __builtin_ceill(x); }
 #endif
-    
-  inline float builtin_copysign(float x, float y) { return __builtin_copysignf(x, y); }
-  inline double builtin_copysign(double x, double y) { return __builtin_copysign(x, y); }
+
+inline float builtin_copysign(float x, float y) {
+  return __builtin_copysignf(x, y);
+}
+inline double builtin_copysign(double x, double y) {
+  return __builtin_copysign(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_copysign(long double x, long double y) { return __builtin_copysignl(x, y); }
+inline long double builtin_copysign(long double x, long double y) {
+  return __builtin_copysignl(x, y);
+}
 #endif
 
-  inline float builtin_cos(float x) { return __builtin_cosf(x); }
-  inline double builtin_cos(double x) { return __builtin_cos(x); }
+inline float builtin_cos(float x) { return __builtin_cosf(x); }
+inline double builtin_cos(double x) { return __builtin_cos(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_cos(long double x) { return __builtin_cosl(x); }
+inline long double builtin_cos(long double x) { return __builtin_cosl(x); }
 #endif
-  
-  inline float builtin_cosh(float x) { return __builtin_coshf(x); }
-  inline double builtin_cosh(double x) { return __builtin_cosh(x); }
+
+inline float builtin_cosh(float x) { return __builtin_coshf(x); }
+inline double builtin_cosh(double x) { return __builtin_cosh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_cosh(long double x) { return __builtin_coshl(x); }
+inline long double builtin_cosh(long double x) { return __builtin_coshl(x); }
 #endif
 
-  inline float builtin_exp(float x) { return __builtin_expf(x); }
-  inline double builtin_exp(double x) { return __builtin_exp(x); }
+inline float builtin_exp(float x) { return __builtin_expf(x); }
+inline double builtin_exp(double x) { return __builtin_exp(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_exp(long double x) { return __builtin_expl(x); }
+inline long double builtin_exp(long double x) { return __builtin_expl(x); }
 #endif
-  
-  inline float builtin_exp2(float x) { return __builtin_exp2f(x); }
-  inline double builtin_exp2(double x) { return __builtin_exp2(x); }
+
+inline float builtin_exp2(float x) { return __builtin_exp2f(x); }
+inline double builtin_exp2(double x) { return __builtin_exp2(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_exp2(long double x) { return __builtin_exp2l(x); }
+inline long double builtin_exp2(long double x) { return __builtin_exp2l(x); }
 #endif
 
-  inline float builtin_expm1(float x) { return __builtin_expm1f(x); }
-  inline double builtin_expm1(double x) { return __builtin_expm1(x); }
+inline float builtin_expm1(float x) { return __builtin_expm1f(x); }
+inline double builtin_expm1(double x) { return __builtin_expm1(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_expm1(long double x) { return __builtin_expm1l(x); }
+inline long double builtin_expm1(long double x) { return __builtin_expm1l(x); }
 #endif
 
-  inline float builtin_fabs(float x) { return __builtin_fabsf(x); }
-  inline double builtin_fabs(double x) { return __builtin_fabs(x); }
+inline float builtin_fabs(float x) { return __builtin_fabsf(x); }
+inline double builtin_fabs(double x) { return __builtin_fabs(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fabs(long double x) { return __builtin_fabsl(x); }
+inline long double builtin_fabs(long double x) { return __builtin_fabsl(x); }
 #endif
-  
-  inline float builtin_fdim(float x, float y) { return __builtin_fdimf(x, y); }
-  inline double builtin_fdim(double x, double y) { return __builtin_fdim(x, y); }
+
+inline float builtin_fdim(float x, float y) { return __builtin_fdimf(x, y); }
+inline double builtin_fdim(double x, double y) { return __builtin_fdim(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fdim(long double x, long double y) { return __builtin_fdiml(x, y); }
+inline long double builtin_fdim(long double x, long double y) {
+  return __builtin_fdiml(x, y);
+}
 #endif
-  
-  inline float builtin_floor(float x) { return __builtin_floorf(x); }
-  inline double builtin_floor(double x) { return __builtin_floor(x); }
+
+inline float builtin_floor(float x) { return __builtin_floorf(x); }
+inline double builtin_floor(double x) { return __builtin_floor(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_floor(long double x) { return __builtin_floorl(x); }
+inline long double builtin_floor(long double x) { return __builtin_floorl(x); }
 #endif
-  
-  inline float builtin_fma(float x, float y, float z) { return __builtin_fmaf(x, y, z); }
-  inline double builtin_fma(double x, double y, double z) { return __builtin_fma(x, y, z); }
+
+inline float builtin_fma(float x, float y, float z) {
+  return __builtin_fmaf(x, y, z);
+}
+inline double builtin_fma(double x, double y, double z) {
+  return __builtin_fma(x, y, z);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fma(long double x, long double y, long double z) { return __builtin_fmal(x, y, z); }
+inline long double builtin_fma(long double x, long double y, long double z) {
+  return __builtin_fmal(x, y, z);
+}
 #endif
-  
-  inline float builtin_fmax(float x, float y) { return __builtin_fmaxf(x, y); }
-  inline double builtin_fmax(double x, double y) { return __builtin_fmax(x, y); }
+
+inline float builtin_fmax(float x, float y) { return __builtin_fmaxf(x, y); }
+inline double builtin_fmax(double x, double y) { return __builtin_fmax(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fmax(long double x, long double y) { return __builtin_fmaxl(x, y); }
+inline long double builtin_fmax(long double x, long double y) {
+  return __builtin_fmaxl(x, y);
+}
 #endif
-  
-  inline float builtin_fmin(float x, float y) { return __builtin_fminf(x, y); }
-  inline double builtin_fmin(double x, double y) { return __builtin_fmin(x, y); }
+
+inline float builtin_fmin(float x, float y) { return __builtin_fminf(x, y); }
+inline double builtin_fmin(double x, double y) { return __builtin_fmin(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fmin(long double x, long double y) { return __builtin_fminl(x, y); }
+inline long double builtin_fmin(long double x, long double y) {
+  return __builtin_fminl(x, y);
+}
 #endif
-  
-  inline float builtin_fmod(float x, float y) { return __builtin_fmodf(x, y); }
-  inline double builtin_fmod(double x, double y) { return __builtin_fmod(x, y); }
+
+inline float builtin_fmod(float x, float y) { return __builtin_fmodf(x, y); }
+inline double builtin_fmod(double x, double y) { return __builtin_fmod(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fmod(long double x, long double y) { return __builtin_fmodl(x, y); }
+inline long double builtin_fmod(long double x, long double y) {
+  return __builtin_fmodl(x, y);
+}
 #endif
-  
-  inline float builtin_frexp(float x, int* r) { return __builtin_frexpf(x, r); }
-  inline double builtin_frexp(double x, int* r) { return __builtin_frexp(x, r); }
+
+inline float builtin_frexp(float x, int *r) { return __builtin_frexpf(x, r); }
+inline double builtin_frexp(double x, int *r) { return __builtin_frexp(x, r); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_frexp(long double x, int* r) { return __builtin_frexpl(x, r); }
+inline long double builtin_frexp(long double x, int *r) {
+  return __builtin_frexpl(x, r);
+}
 #endif
-  
-  inline float builtin_hypot(float x, float y) { return __builtin_hypotf(x, y); }
-  inline double builtin_hypot(double x, double y) { return __builtin_hypot(x, y); }
+
+inline float builtin_hypot(float x, float y) { return __builtin_hypotf(x, y); }
+inline double builtin_hypot(double x, double y) {
+  return __builtin_hypot(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_hypot(long double x, long double y) { return __builtin_hypotl(x, y); }
+inline long double builtin_hypot(long double x, long double y) {
+  return __builtin_hypotl(x, y);
+}
 #endif
-  
-  inline int builtin_ilogb(float x) { return __builtin_ilogbf(x); }
-  inline int builtin_ilogb(double x) { return __builtin_ilogb(x); }
+
+inline int builtin_ilogb(float x) { return __builtin_ilogbf(x); }
+inline int builtin_ilogb(double x) { return __builtin_ilogb(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_ilogb(long double x) { return __builtin_ilogbl(x); }
+inline int builtin_ilogb(long double x) { return __builtin_ilogbl(x); }
 #endif
-  
-  inline int builtin_isfinite(float x) { return __builtin_isfinite(x); }
-  inline int builtin_isfinite(double x) { return __builtin_isfinite(x); }
+
+inline int builtin_isfinite(float x) { return __builtin_isfinite(x); }
+inline int builtin_isfinite(double x) { return __builtin_isfinite(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_isfinite(long double x) { return __builtin_isfinite(x); }
+inline int builtin_isfinite(long double x) { return __builtin_isfinite(x); }
 #endif
-  
-  inline int builtin_isinf(float x) { return __builtin_isinf(x); }
-  inline int builtin_isinf(double x) { return __builtin_isinf(x); }
+
+inline int builtin_isinf(float x) { return __builtin_isinf(x); }
+inline int builtin_isinf(double x) { return __builtin_isinf(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_isinf(long double x) { return __builtin_isinf(x); }
+inline int builtin_isinf(long double x) { return __builtin_isinf(x); }
 #endif
-  
-  inline int builtin_isnan(float x) { return __builtin_isnan(x); }
-  inline int builtin_isnan(double x) { return __builtin_isnan(x); }
+
+inline int builtin_isnan(float x) { return __builtin_isnan(x); }
+inline int builtin_isnan(double x) { return __builtin_isnan(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_isnan(long double x) { return __builtin_isnan(x); }
+inline int builtin_isnan(long double x) { return __builtin_isnan(x); }
 #endif
-  
-  inline int builtin_isnormal(float x) { return __builtin_isnormal(x); }
-  inline int builtin_isnormal(double x) { return __builtin_isnormal(x); }
+
+inline int builtin_isnormal(float x) { return __builtin_isnormal(x); }
+inline int builtin_isnormal(double x) { return __builtin_isnormal(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_isnormal(long double x) { return __builtin_isnormal(x); }
+inline int builtin_isnormal(long double x) { return __builtin_isnormal(x); }
 #endif
-  
-  inline float builtin_ldexp(float x, int y) { return __builtin_ldexpf(x, y); }
-  inline double builtin_ldexp(double x, int y) { return __builtin_ldexp(x, y); }
+
+inline float builtin_ldexp(float x, int y) { return __builtin_ldexpf(x, y); }
+inline double builtin_ldexp(double x, int y) { return __builtin_ldexp(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_ldexp(long double x, int y) { return __builtin_ldexpl(x, y); }
+inline long double builtin_ldexp(long double x, int y) {
+  return __builtin_ldexpl(x, y);
+}
 #endif
-  
-  inline long long builtin_llrint(float x) { return __builtin_llrintf(x); }
-  inline long long builtin_llrint(double x) { return __builtin_llrint(x); }
+
+inline long long builtin_llrint(float x) { return __builtin_llrintf(x); }
+inline long long builtin_llrint(double x) { return __builtin_llrint(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long long builtin_llrint(long double x) { return __builtin_llrintl(x); }
+inline long long builtin_llrint(long double x) { return __builtin_llrintl(x); }
 #endif
 
-  inline float builtin_log(float x) { return __builtin_logf(x); }
-  inline double builtin_log(double x) { return __builtin_log(x); }
+inline float builtin_log(float x) { return __builtin_logf(x); }
+inline double builtin_log(double x) { return __builtin_log(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_log(long double x) { return __builtin_logl(x); }
+inline long double builtin_log(long double x) { return __builtin_logl(x); }
 #endif
 
-  inline float builtin_log10(float x) { return __builtin_log10f(x); }
-  inline double builtin_log10(double x) { return __builtin_log10(x); }
+inline float builtin_log10(float x) { return __builtin_log10f(x); }
+inline double builtin_log10(double x) { return __builtin_log10(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_log10(long double x) { return __builtin_log10l(x); }
+inline long double builtin_log10(long double x) { return __builtin_log10l(x); }
 #endif
 
-  inline float builtin_log1p(float x) { return __builtin_log1pf(x); }
-  inline double builtin_log1p(double x) { return __builtin_log1p(x); }
+inline float builtin_log1p(float x) { return __builtin_log1pf(x); }
+inline double builtin_log1p(double x) { return __builtin_log1p(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_log1p(long double x) { return __builtin_log1pl(x); }
+inline long double builtin_log1p(long double x) { return __builtin_log1pl(x); }
 #endif
 
-  inline float builtin_log2(float x) { return __builtin_log2f(x); }
-  inline double builtin_log2(double x) { return __builtin_log2(x); }
+inline float builtin_log2(float x) { return __builtin_log2f(x); }
+inline double builtin_log2(double x) { return __builtin_log2(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_log2(long double x) { return __builtin_log2l(x); }
+inline long double builtin_log2(long double x) { return __builtin_log2l(x); }
 #endif
-  
-  inline long builtin_lrint(float x) { return __builtin_lrintf(x); }
-  inline long builtin_lrint(double x) { return __builtin_lrint(x); }
+
+inline long builtin_lrint(float x) { return __builtin_lrintf(x); }
+inline long builtin_lrint(double x) { return __builtin_lrint(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long builtin_lrint(long double x) { return __builtin_lrintl(x); }
+inline long builtin_lrint(long double x) { return __builtin_lrintl(x); }
 #endif
-  
-  inline float builtin_nextafter(float x, float y) { return __builtin_nextafterf(x, y); }
-  inline double builtin_nextafter(double x, double y) { return __builtin_nextafter(x, y); }
+
+inline float builtin_nextafter(float x, float y) {
+  return __builtin_nextafterf(x, y);
+}
+inline double builtin_nextafter(double x, double y) {
+  return __builtin_nextafter(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_nextafter(long double x, long double y) { return __builtin_nextafterl(x, y); }
+inline long double builtin_nextafter(long double x, long double y) {
+  return __builtin_nextafterl(x, y);
+}
 #endif
-  
-  inline float builtin_pow(float x, float y) { return __builtin_powf(x, y); }
-  inline double builtin_pow(double x, double y) { return __builtin_pow(x, y); }
+
+inline float builtin_pow(float x, float y) { return __builtin_powf(x, y); }
+inline double builtin_pow(double x, double y) { return __builtin_pow(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_pow(long double x, long double y) { return __builtin_powl(x, y); }
+inline long double builtin_pow(long double x, long double y) {
+  return __builtin_powl(x, y);
+}
 #endif
-  
-  inline float builtin_remainder(float x, float y) { return __builtin_remainderf(x, y); }
-  inline double builtin_remainder(double x, double y) { return __builtin_remainder(x, y); }
+
+inline float builtin_remainder(float x, float y) {
+  return __builtin_remainderf(x, y);
+}
+inline double builtin_remainder(double x, double y) {
+  return __builtin_remainder(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_remainder(long double x, long double y) { return __builtin_remainderl(x, y); }
+inline long double builtin_remainder(long double x, long double y) {
+  return __builtin_remainderl(x, y);
+}
 #endif
 
-  inline float builtin_rint(float x) { return __builtin_rintf(x); }
-  inline double builtin_rint(double x) { return __builtin_rint(x); }
+inline float builtin_rint(float x) { return __builtin_rintf(x); }
+inline double builtin_rint(double x) { return __builtin_rint(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_rint(long double x) { return __builtin_rintl(x); }
+inline long double builtin_rint(long double x) { return __builtin_rintl(x); }
 #endif
 
-  inline float builtin_round(float x) { return __builtin_roundf(x); }
-  inline double builtin_round(double x) { return __builtin_round(x); }
+inline float builtin_round(float x) { return __builtin_roundf(x); }
+inline double builtin_round(double x) { return __builtin_round(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_round(long double x) { return __builtin_roundl(x); }
+inline long double builtin_round(long double x) { return __builtin_roundl(x); }
 #endif
-  
-  inline int builtin_signbit(float x) { return __builtin_signbitf(x); }
-  inline int builtin_signbit(double x) { return __builtin_signbit(x); }
+
+inline int builtin_signbit(float x) { return __builtin_signbitf(x); }
+inline int builtin_signbit(double x) { return __builtin_signbit(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_signbit(long double x) { return __builtin_signbitl(x); }
+inline int builtin_signbit(long double x) { return __builtin_signbitl(x); }
 #endif
 
-  inline float builtin_sin(float x) { return __builtin_sinf(x); }
-  inline double builtin_sin(double x) { return __builtin_sin(x); }
+inline float builtin_sin(float x) { return __builtin_sinf(x); }
+inline double builtin_sin(double x) { return __builtin_sin(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_sin(long double x) { return __builtin_sinl(x); }
+inline long double builtin_sin(long double x) { return __builtin_sinl(x); }
 #endif
-  
-  inline float builtin_sinh(float x) { return __builtin_sinhf(x); }
-  inline double builtin_sinh(double x) { return __builtin_sinh(x); }
+
+inline float builtin_sinh(float x) { return __builtin_sinhf(x); }
+inline double builtin_sinh(double x) { return __builtin_sinh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_sinh(long double x) { return __builtin_sinhl(x); }
+inline long double builtin_sinh(long double x) { return __builtin_sinhl(x); }
 #endif
-  
-  inline float builtin_sqrt(float x) { return __builtin_sqrtf(x); }
-  inline double builtin_sqrt(double x) { return __builtin_sqrt(x); }
+
+inline float builtin_sqrt(float x) { return __builtin_sqrtf(x); }
+inline double builtin_sqrt(double x) { return __builtin_sqrt(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); }
+inline long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); }
 #endif
 
-  inline float builtin_tan(float x) { return __builtin_tanf(x); }
-  inline double builtin_tan(double x) { return __builtin_tan(x); }
+inline float builtin_tan(float x) { return __builtin_tanf(x); }
+inline double builtin_tan(double x) { return __builtin_tan(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_tan(long double x) { return __builtin_tanl(x); }
+inline long double builtin_tan(long double x) { return __builtin_tanl(x); }
 #endif
-  
-  inline float builtin_tanh(float x) { return __builtin_tanhf(x); }
-  inline double builtin_tanh(double x) { return __builtin_tanh(x); }
+
+inline float builtin_tanh(float x) { return __builtin_tanhf(x); }
+inline double builtin_tanh(double x) { return __builtin_tanh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_tanh(long double x) { return __builtin_tanhl(x); }
+inline long double builtin_tanh(long double x) { return __builtin_tanhl(x); }
 #endif
-  
-  inline float builtin_trunc(float x) { return __builtin_truncf(x); }
-  inline double builtin_trunc(double x) { return __builtin_trunc(x); }
+
+inline float builtin_trunc(float x) { return __builtin_truncf(x); }
+inline double builtin_trunc(double x) { return __builtin_trunc(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_trunc(long double x) { return __builtin_truncl(x); }
+inline long double builtin_trunc(long double x) { return __builtin_truncl(x); }
 #endif
-  
 }
 
 #endif
 
-#endif  // #ifndef FLOATBUILTINS_H
+#endif // #ifndef FLOATBUILTINS_H
diff --git a/lib/kernel/vecmathlib/floatprops.h b/lib/kernel/vecmathlib/floatprops.h
index f1c39a2..c7a3b7f 100644
--- a/lib/kernel/vecmathlib/floatprops.h
+++ b/lib/kernel/vecmathlib/floatprops.h
@@ -10,310 +10,279 @@
 #include <cstring>
 #include <limits>
 
+namespace vecmathlib {
 
+// A structure describing various properties of a floating point
+// type. Most properties are already described in numeric_limits, so
+// the float and double specializations inherit from it.
+template <typename real_t> struct floatprops {
+  // Some interesting properties are:
+  //    min
+  //    max
+  //    digits
+  //    epsilon
+  //    min_exponent
+  //    max_exponent
+  //    infinity
+  //    quiet_NaN
+};
 
-namespace vecmathlib {
-  
-  // A structure describing various properties of a floating point
-  // type. Most properties are already described in numeric_limits, so
-  // we inherit it.
-  template<typename real_t>
-  struct floatprops {
-    // Some interesting properties are:
-    //    min
-    //    max
-    //    digits
-    //    epsilon
-    //    min_exponent
-    //    max_exponent
-    //    infinity
-    //    quiet_NaN
-  };
-  
-  
-  
-  // Properties of fp8
-  template<>
-  struct floatprops<fp8> {
-    typedef fp8 real_t;
-    typedef vml_std::int8_t int_t;
-    typedef vml_std::uint8_t uint_t;
-    
-    static char const* name() { return "fp8"; }
-    
-    // Definitions that might come from numeric_limits<> instead:
-    static real_t min() { __builtin_unreachable(); }
-    static real_t max() { __builtin_unreachable(); }
-    static int const digits = 4;
-    static real_t epsilon() { __builtin_unreachable(); }
-    static int const min_exponent = -6;
-    static int const max_exponent = 7;
-    static real_t infinity() { __builtin_unreachable(); }
-    static real_t quiet_NaN() { __builtin_unreachable(); }
-    
-    // Ensure the sizes match
-    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
-    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-    
-    // Number of bits in internal representation
-    static int const bits = 8 * sizeof(real_t);
-    static int const mantissa_bits = digits - 1;
-    static int const signbit_bits = 1;
-    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
-    static int const exponent_offset = 2 - min_exponent;
-    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
-                  "error in bit counts");
-    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
-    static uint_t const exponent_mask =
-      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
-    static uint_t const signbit_mask = uint_t(1) << (bits-1);
-    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
-                  "error in masks");
-    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
-                  uint_t(~uint_t(0)),
-                  "error in masks");
-    
-    // Re-interpret bit patterns
-    static real_t as_float(int_t x)
-    {
-      real_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t as_int(real_t x)
-    {
-      int_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t replicate_byte(unsigned char byte)
-    {
-      int_t res;
-      std::memset(&res, byte, sizeof res);
-      return res;
-    }
-    
-    // Convert values (truncate)
-    static real_t convert_float(int_t x) { __builtin_unreachable(); }
-    static int_t convert_int(real_t x) { __builtin_unreachable(); }
-  };
-  
-  
-  
-  // Properties of fp16
-  template<>
-  struct floatprops<fp16> {
-    typedef fp16 real_t;
-    typedef vml_std::int16_t int_t;
-    typedef vml_std::uint16_t uint_t;
-    
-    static char const* name() { return "fp16"; }
-    
-    // Definitions that might come from numeric_limits<> instead:
-    static real_t min() { __builtin_unreachable(); }
-    static real_t max() { __builtin_unreachable(); }
-    static int const digits = 11;
-    static real_t epsilon() { __builtin_unreachable(); }
-    static int const min_exponent = -14;
-    static int const max_exponent = 15;
-    static real_t infinity() { __builtin_unreachable(); }
-    static real_t quiet_NaN() { __builtin_unreachable(); }
-    
-    // Ensure the sizes match
-    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
-    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-    
-    // Number of bits in internal representation
-    static int const bits = 8 * sizeof(real_t);
-    static int const mantissa_bits = digits - 1;
-    static int const signbit_bits = 1;
-    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
-    static int const exponent_offset = 2 - min_exponent;
-    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
-                  "error in bit counts");
-    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
-    static uint_t const exponent_mask =
-      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
-    static uint_t const signbit_mask = uint_t(1) << (bits-1);
-    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
-                  "error in masks");
-    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
-                  uint_t(~uint_t(0)),
-                  "error in masks");
-    
-    // Re-interpret bit patterns
-    static real_t as_float(int_t x)
-    {
-      real_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t as_int(real_t x)
-    {
-      int_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t replicate_byte(unsigned char byte)
-    {
-      int_t res;
-      std::memset(&res, byte, sizeof res);
-      return res;
-    }
-    
-    // Convert values (truncate)
-    static real_t convert_float(int_t x) { __builtin_unreachable(); }
-    static int_t convert_int(real_t x) { __builtin_unreachable(); }
-  };
-  
-  
-  
-  // Properties of float
-  template<>
-  struct floatprops<float>: std::numeric_limits<float> {
-    typedef float real_t;
-    typedef vml_std::int32_t int_t;
-    typedef vml_std::uint32_t uint_t;
-    
-    static char const* name() { return "float"; }
-    
-    // Ensure the internal representation is what we expect
-    static_assert(is_signed, "real_t is not signed");
-    static_assert(radix==2, "real_t is not binary");
-    
-    // Ensure the sizes match
-    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
-    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-    
-    // Number of bits in internal representation
-    static int const bits = 8 * sizeof(real_t);
-    static int const mantissa_bits = digits - 1;
-    static int const signbit_bits = 1;
-    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
-    static int const exponent_offset = 2 - min_exponent;
-    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
-                  "error in bit counts");
-    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
-    static uint_t const exponent_mask =
-      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
-    static uint_t const signbit_mask = uint_t(1) << (bits-1);
-    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
-                  "error in masks");
-    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
-                  uint_t(~uint_t(0)),
-                  "error in masks");
-    
-    // Re-interpret bit patterns
-    static real_t as_float(int_t x)
-    {
-      real_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t as_int(real_t x)
-    {
-      int_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t replicate_byte(unsigned char byte)
-    {
-      int_t res;
-      std::memset(&res, byte, sizeof res);
-      return res;
-    }
-    
-    // Convert values (truncate)
-    static real_t convert_float(int_t x) { return real_t(x); }
-    static int_t convert_int(real_t x) { return int_t(x); }
-  };
-  
-  
-  
-  // Properties of double
-  template<>
-  struct floatprops<double>: std::numeric_limits<double> {
-    typedef double real_t;
-    typedef vml_std::int64_t int_t;
-    typedef vml_std::uint64_t uint_t;
-    
-    static char const* name() { return "double"; }
-    
-    // Ensure the internal representation is what we expect
-    static_assert(is_signed, "real_t is not signed");
-    static_assert(radix==2, "real_t is not binary");
-    
-    // Ensure the sizes match
-    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
-    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-    
-    // Number of bits in internal representation
-    static int const bits = 8 * sizeof(real_t);
-    static int const mantissa_bits = digits - 1;
-    static int const signbit_bits = 1;
-    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
-    static int const exponent_offset = 2 - min_exponent;
-    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
-                  "error in bit counts");
-    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
-    static uint_t const exponent_mask =
-      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
-    static uint_t const signbit_mask = uint_t(1) << (bits-1);
-    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
-                  "error in masks");
-    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
-                  uint_t(~uint_t(0)),
-                  "error in masks");
-    
-    // Re-interpret bit patterns
-    static real_t as_float(int_t x)
-    {
-      real_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t as_int(real_t x)
-    {
-      int_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t replicate_byte(unsigned char byte)
-    {
-      int_t res;
-      std::memset(&res, byte, sizeof res);
-      return res;
-    }
-    
-    // Convert values (truncate)
-    static real_t convert_float(int_t x) { return real_t(x); }
-    static int_t convert_int(real_t x) { return int_t(x); }
-  };
-  
-  
-  
-  // We are adding the (unused) type RV here to avoid name mangling
-  // problems. On some systems, the vector size does not enter into
-  // the mangled name (!), leading to duplicate function definitions.
-  template<typename RV, typename V, typename E>
-  E get_elt(const V& v, const int n)
-  {
-    const size_t s = sizeof(E);
-    E e;
-    // assert(n>=0 and s*n<sizeof(V));
-    std::memcpy(&e, &((const char*)&v)[s*n], s);
-    return e;
+// Properties of fp8 (stored in 8 bits; digits = 4 gives 3 mantissa bits)
+template <> struct floatprops<fp8> {
+  typedef fp8 real_t;
+  typedef vml_std::int8_t int_t;
+  typedef vml_std::uint8_t uint_t;
+
+  static char const *name() { return "fp8"; }
+
+  // Stand-ins for numeric_limits<>; the value queries are unreachable stubs:
+  static real_t min() { __builtin_unreachable(); }
+  static real_t max() { __builtin_unreachable(); }
+  static int const digits = 4;
+  static real_t epsilon() { __builtin_unreachable(); }
+  static int const min_exponent = -6;
+  static int const max_exponent = 7;
+  static real_t infinity() { __builtin_unreachable(); }
+  static real_t quiet_NaN() { __builtin_unreachable(); }
+
+  // Ensure the integer types have the same size as real_t
+  static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+  static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+  // Number of bits in internal representation (sign | exponent | mantissa)
+  static int const bits = 8 * sizeof(real_t);
+  static int const mantissa_bits = digits - 1;
+  static int const signbit_bits = 1;
+  static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+  static int const exponent_offset = 2 - min_exponent; // = 8 here; NOTE(review): an IEEE-style bias for 4 exponent bits would be 7 -- confirm
+  static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                "error in bit counts");
+  static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+  static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+                                      << mantissa_bits;
+  static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+  static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                "error in masks");
+  static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                    uint_t(~uint_t(0)),
+                "error in masks");
+
+  // Re-interpret bit patterns (memcpy copies the bits, no value conversion)
+  static real_t as_float(int_t x) {
+    real_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t as_int(real_t x) {
+    int_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t replicate_byte(unsigned char byte) {
+    int_t res;
+    std::memset(&res, byte, sizeof res);
+    return res;
+  }
+
+  // Convert values (truncate); for fp8 these are stubs that must not be called
+  static real_t convert_float(int_t x) { __builtin_unreachable(); }
+  static int_t convert_int(real_t x) { __builtin_unreachable(); }
+};
+
+// Properties of fp16 (stored in 16 bits; digits = 11 gives 10 mantissa bits)
+template <> struct floatprops<fp16> {
+  typedef fp16 real_t;
+  typedef vml_std::int16_t int_t;
+  typedef vml_std::uint16_t uint_t;
+
+  static char const *name() { return "fp16"; }
+
+  // Stand-ins for numeric_limits<>; the value queries are unreachable stubs:
+  static real_t min() { __builtin_unreachable(); }
+  static real_t max() { __builtin_unreachable(); }
+  static int const digits = 11;
+  static real_t epsilon() { __builtin_unreachable(); }
+  static int const min_exponent = -14;
+  static int const max_exponent = 15;
+  static real_t infinity() { __builtin_unreachable(); }
+  static real_t quiet_NaN() { __builtin_unreachable(); }
+
+  // Ensure the integer types have the same size as real_t
+  static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+  static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+  // Number of bits in internal representation (sign | exponent | mantissa)
+  static int const bits = 8 * sizeof(real_t);
+  static int const mantissa_bits = digits - 1;
+  static int const signbit_bits = 1;
+  static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+  static int const exponent_offset = 2 - min_exponent; // = 16 here; NOTE(review): the IEEE binary16 exponent bias is 15 -- confirm
+  static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                "error in bit counts");
+  static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+  static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+                                      << mantissa_bits;
+  static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+  static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                "error in masks");
+  static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                    uint_t(~uint_t(0)),
+                "error in masks");
+
+  // Re-interpret bit patterns (memcpy copies the bits, no value conversion)
+  static real_t as_float(int_t x) {
+    real_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t as_int(real_t x) {
+    int_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t replicate_byte(unsigned char byte) {
+    int_t res;
+    std::memset(&res, byte, sizeof res);
+    return res;
+  }
+
+  // Convert values (truncate); for fp16 these are stubs that must not be called
+  static real_t convert_float(int_t x) { __builtin_unreachable(); }
+  static int_t convert_int(real_t x) { __builtin_unreachable(); }
+};
+
+// Properties of float (digits, min_exponent, ... come from numeric_limits)
+template <> struct floatprops<float> : std::numeric_limits<float> {
+  typedef float real_t;
+  typedef vml_std::int32_t int_t;
+  typedef vml_std::uint32_t uint_t;
+
+  static char const *name() { return "float"; }
+
+  // Ensure the internal representation is what we expect
+  static_assert(is_signed, "real_t is not signed");
+  static_assert(radix == 2, "real_t is not binary");
+
+  // Ensure the integer types have the same size as real_t
+  static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+  static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+  // Number of bits in internal representation (sign | exponent | mantissa)
+  static int const bits = 8 * sizeof(real_t);
+  static int const mantissa_bits = digits - 1;
+  static int const signbit_bits = 1;
+  static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+  static int const exponent_offset = 2 - min_exponent; // 127, the exponent bias, when float is IEEE binary32
+  static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                "error in bit counts");
+  static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+  static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+                                      << mantissa_bits;
+  static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+  static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                "error in masks");
+  static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                    uint_t(~uint_t(0)),
+                "error in masks");
+
+  // Re-interpret bit patterns (memcpy copies the bits, no value conversion)
+  static real_t as_float(int_t x) {
+    real_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
   }
-  
-  template<typename RV, typename V, typename E>
-  V& set_elt(V& v, const int n, const E e)
-  {
-    const size_t s = sizeof(E);
-    // assert(n>=0 and s*n<sizeof(V));
-    std::memcpy(&((char*)&v)[s*n], &e, s);
-    return v;
+  static int_t as_int(real_t x) {
+    int_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
   }
-  
+  static int_t replicate_byte(unsigned char byte) {
+    int_t res;
+    std::memset(&res, byte, sizeof res);
+    return res;
+  }
+
+  // Convert values (real_t -> int_t truncates toward zero)
+  static real_t convert_float(int_t x) { return real_t(x); }
+  static int_t convert_int(real_t x) { return int_t(x); }
+};
+
+// Properties of double (digits, min_exponent, ... come from numeric_limits)
+template <> struct floatprops<double> : std::numeric_limits<double> {
+  typedef double real_t;
+  typedef vml_std::int64_t int_t;
+  typedef vml_std::uint64_t uint_t;
+
+  static char const *name() { return "double"; }
+
+  // Ensure the internal representation is what we expect
+  static_assert(is_signed, "real_t is not signed");
+  static_assert(radix == 2, "real_t is not binary");
+
+  // Ensure the integer types have the same size as real_t
+  static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+  static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+  // Number of bits in internal representation (sign | exponent | mantissa)
+  static int const bits = 8 * sizeof(real_t);
+  static int const mantissa_bits = digits - 1;
+  static int const signbit_bits = 1;
+  static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+  static int const exponent_offset = 2 - min_exponent; // 1023, the exponent bias, when double is IEEE binary64
+  static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                "error in bit counts");
+  static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+  static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+                                      << mantissa_bits;
+  static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+  static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                "error in masks");
+  static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                    uint_t(~uint_t(0)),
+                "error in masks");
+
+  // Re-interpret bit patterns (memcpy copies the bits, no value conversion)
+  static real_t as_float(int_t x) {
+    real_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t as_int(real_t x) {
+    int_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t replicate_byte(unsigned char byte) {
+    int_t res;
+    std::memset(&res, byte, sizeof res);
+    return res;
+  }
+
+  // Convert values (real_t -> int_t truncates toward zero)
+  static real_t convert_float(int_t x) { return real_t(x); }
+  static int_t convert_int(real_t x) { return int_t(x); }
+};
+
+// Return element n of v, read byte-wise as an E. The (unused) type RV
+// avoids name mangling problems: on some systems the vector size does not
+// enter the mangled name (!), leading to duplicate function definitions.
+template <typename RV, typename V, typename E>
+E get_elt(const V &v, const int n) {
+  const size_t s = sizeof(E);
+  E e;
+  // Bounds are not checked: assert(n>=0 and s*n<sizeof(V));
+  std::memcpy(&e, &((const char *)&v)[s * n], s);
+  return e;
+}
+
+template <typename RV, typename V, typename E> // RV is unused in the body
+V &set_elt(V &v, const int n, const E e) {
+  const size_t s = sizeof(E);
+  // Overwrite element n of v with e. Bounds are not checked:
+  // assert(n>=0 and s*n<sizeof(V));
+  std::memcpy(&((char *)&v)[s * n], &e, s); return v; // returns the same v
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef FLOATPROPS_H
+#endif // #ifndef FLOATPROPS_H
diff --git a/lib/kernel/vecmathlib/floattypes.h b/lib/kernel/vecmathlib/floattypes.h
index 2c22a10..fa4cc44 100644
--- a/lib/kernel/vecmathlib/floattypes.h
+++ b/lib/kernel/vecmathlib/floattypes.h
@@ -3,20 +3,14 @@
 #ifndef FLOATTYPES_H
 #define FLOATTYPES_H
 
-
-
 #include <cassert>
 #include <cstdlib>
 
-
-
-#if ! (defined __clang__ || defined __gcc__)
-#  define __builtin_unreachable() (assert(0))
-#  define __builtin_expect(expr, val) (expr)
+#if !(defined __clang__ || defined __GNUC__) // GCC predefines __GNUC__; no compiler defines __gcc__
+#define __builtin_unreachable() (assert(0)) // fallback: aborts unless NDEBUG is set
+#define __builtin_expect(expr, val) (expr) // fallback: drops the branch hint
 #endif
 
-
-
 // We expect either 199711L or 201103L
 #if __cplusplus >= 201103L
 // C++11 is supported, use it
@@ -25,11 +19,9 @@
 #include <cstdint>
 
 namespace vml_std {
-  using namespace std;
+using namespace std;
 }
 
-
-
 #else
 // C++11 is not supported, work around the missing pieces
 
@@ -40,34 +32,36 @@ namespace vml_std {
 #include <stdint.h>
 
 #ifndef static_assert
-#  define static_assert(cond, msg)
+#define VML_CONCAT2(x, y) x##y // pastes two tokens together
+#define VML_CONCAT(x, y) VML_CONCAT2(x, y) // extra level so __LINE__ expands before ##
+#define static_assert(cond, msg)                                               \
+  typedef int VML_CONCAT(vml_static_assert_, __LINE__)[(cond) ? 1 : -1]        \
+      __attribute__((__unused__)) // array size -1 fails to compile when cond is false; msg is ignored
 #endif
 
-
-
 // Capture libc macros, then undefine them
 #ifndef isfinite
-#  error "isfinite is not a macro"
+#error "isfinite is not a macro"
 #endif
 #ifndef isinf
-#  error "isinf is not a macro"
+#error "isinf is not a macro"
 #endif
 #ifndef isnan
-#  error "isnan is not a macro"
+#error "isnan is not a macro"
 #endif
 #ifndef isnormal
-#  error "isnormal is not a macro"
+#error "isnormal is not a macro"
 #endif
 #ifndef signbit
-#  error "signbit is not a macro"
+#error "signbit is not a macro"
 #endif
 
 namespace {
-  template<typename T> inline int libc_isfinite(T x) { return isfinite(x); }
-  template<typename T> inline int libc_isinf(T x) { return isinf(x); }
-  template<typename T> inline int libc_isnan(T x) { return isnan(x); }
-  template<typename T> inline int libc_isnormal(T x) { return isnormal(x); }
-  template<typename T> inline int libc_signbit(T x) { return signbit(x); }
+template <typename T> inline int libc_isfinite(T x) { return isfinite(x); }
+template <typename T> inline int libc_isinf(T x) { return isinf(x); }
+template <typename T> inline int libc_isnan(T x) { return isnan(x); }
+template <typename T> inline int libc_isnormal(T x) { return isnormal(x); }
+template <typename T> inline int libc_signbit(T x) { return signbit(x); }
 }
 
 // Include this before undefining the macros below
@@ -79,153 +73,146 @@ namespace {
 #undef isnormal
 #undef signbit
 
-
-
 namespace vml_std {
-  
-  // Make some type definitions from stdint.h available in std
-  typedef ::uint8_t uint8_t;
-  typedef ::int8_t int8_t;
-  typedef ::uint16_t uint16_t;
-  typedef ::int16_t int16_t;
-  typedef ::uint32_t uint32_t;
-  typedef ::int32_t int32_t;
+
+// Make some type definitions from stdint.h available in vml_std
+typedef ::uint8_t uint8_t;
+typedef ::int8_t int8_t;
+typedef ::uint16_t uint16_t;
+typedef ::int16_t int16_t;
+typedef ::uint32_t uint32_t;
+typedef ::int32_t int32_t;
 #if __SIZEOF_LONG__ == 8
-  // Even if both "long" and "long long" have the same size, they are
-  // still different types. In many cases, it is then preferable to
-  // use "long" instead of "long long".
-  typedef unsigned long uint64_t;
-  typedef long int64_t;
+// Even if both "long" and "long long" have the same size, they are
+// still different types. In many cases, it is then preferable to
+// use "long" instead of "long long".
+typedef unsigned long uint64_t;
+typedef long int64_t;
 #else
-  typedef ::uint64_t uint64_t;
-  typedef ::int64_t int64_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t int64_t;
 #endif
-  
-  
-  
-  // Make math functions from math.h available in vml_std
-  // (We could instead take some of them -- but not all -- from std.)
-  
-  inline float acos(float x) { return ::acosf(x); }
-  inline float acosh(float x) { return ::acoshf(x); }
-  inline float asin(float x) { return ::asinf(x); }
-  inline float asinh(float x) { return ::asinhf(x); }
-  inline float atan(float x) { return ::atanf(x); }
-  inline float atan2(float x, float y) { return ::atan2f(x, y); }
-  inline float atanh(float x) { return ::atanhf(x); }
-  inline float cbrt(float x) { return ::cbrtf(x); }
-  inline float ceil(float x) { return ::ceilf(x); }
-  inline float cos(float x) { return ::cosf(x); }
-  inline float cosh(float x) { return ::coshf(x); }
-  inline float copysign(float x, float y) { return ::copysignf(x, y); }
-  inline float exp(float x) { return ::expf(x); }
-  inline float exp2(float x) { return ::exp2f(x); }
-  inline float expm1(float x) { return ::expm1f(x); }
-  inline float fabs(float x) { return ::fabsf(x); }
-  inline float fdim(float x, float y) { return ::fdimf(x, y); }
-  inline float floor(float x) { return ::floorf(x); }
-  inline float fma(float x, float y, float z) { return ::fmaf(x, y, z); }
-  inline float fmax(float x, float y) { return ::fmaxf(x, y); }
-  inline float fmin(float x, float y) { return ::fminf(x, y); }
-  inline float fmod(float x, float y) { return ::fmodf(x, y); }
-  inline float frexp(float x, int* r) { return ::frexpf(x, r); }
-  inline float hypot(float x, float y) { return ::hypotf(x, y); }
-  inline int ilogb(float x) { return ::ilogbf(x); }
-  inline bool isfinite(float x) { return libc_isfinite(x); }
-  inline bool isinf(float x) { return libc_isinf(x); }
-  inline bool isnan(float x) { return libc_isnan(x); }
-  inline bool isnormal(float x) { return libc_isnormal(x); }
-  inline float ldexp(float x, int n) { return ::ldexpf(x, n); }
-  inline long long llrint(float x) { return ::llrintf(x); }
-  inline float log(float x) { return ::logf(x); }
-  inline float log10(float x) { return ::log10f(x); }
-  inline float log1p(float x) { return ::log1pf(x); }
-  inline float log2(float x) { return ::log2f(x); }
-  inline long lrint(float x) { return ::lrintf(x); }
-  inline float nextafter(float x, float y) { return ::nextafterf(x, y); }
-  inline float pow(float x, float y) { return ::powf(x, y); }
-  inline float remainder(float x, float y) { return ::remainderf(x, y); }
-  inline float rint(float x) { return ::rintf(x); }
-  inline float round(float x) { return ::roundf(x); }
-  inline bool signbit(float x) { return libc_signbit(x); }
-  inline float sin(float x) { return ::sinf(x); }
-  inline float sinh(float x) { return ::sinhf(x); }
-  inline float sqrt(float x) { return ::sqrtf(x); }
-  inline float tan(float x) { return ::tanf(x); }
-  inline float tanh(float x) { return ::tanhf(x); }
-  inline float trunc(float x) { return ::truncf(x); }
-  
-  inline double acos(double x) { return ::acos(x); }
-  inline double acosh(double x) { return ::acosh(x); }
-  inline double asin(double x) { return ::asin(x); }
-  inline double asinh(double x) { return ::asinh(x); }
-  inline double atan(double x) { return ::atan(x); }
-  inline double atan2(double x, double y) { return ::atan2(x, y); }
-  inline double atanh(double x) { return ::atanh(x); }
-  inline double cbrt(double x) { return ::cbrt(x); }
-  inline double ceil(double x) { return ::ceil(x); }
-  inline double cos(double x) { return ::cos(x); }
-  inline double cosh(double x) { return ::cosh(x); }
-  inline double copysign(double x, double y) { return ::copysign(x, y); }
-  inline double exp(double x) { return ::exp(x); }
-  inline double exp2(double x) { return ::exp2(x); }
-  inline double expm1(double x) { return ::expm1(x); }
-  inline double fabs(double x) { return ::fabs(x); }
-  inline double fdim(double x, double y) { return ::fdim(x, y); }
-  inline double floor(double x) { return ::floor(x); }
-  inline double fma(double x, double y, double z) { return ::fma(x, y, z); }
-  inline double fmax(double x, double y) { return ::fmax(x, y); }
-  inline double fmin(double x, double y) { return ::fmin(x, y); }
-  inline double fmod(double x, double y) { return ::fmod(x, y); }
-  inline double frexp(double x, int* r) { return ::frexp(x, r); }
-  inline double hypot(double x, double y) { return ::hypot(x, y); }
-  inline int ilogb(double x) { return ::ilogb(x); }
-  inline bool isfinite(double x) { return libc_isfinite(x); }
-  inline bool isinf(double x) { return libc_isinf(x); }
-  inline bool isnan(double x) { return libc_isnan(x); }
-  inline bool isnormal(double x) { return libc_isnormal(x); }
-  inline double ldexp(double x, int n) { return ::ldexp(x, n); }
-  inline long long llrint(double x) { return ::llrint(x); }
-  inline double log(double x) { return ::log(x); }
-  inline double log10(double x) { return ::log10(x); }
-  inline double log1p(double x) { return ::log1p(x); }
-  inline double log2(double x) { return ::log2(x); }
-  inline long lrint(double x) { return ::lrint(x); }
-  inline double nextafter(double x, double y) { return ::nextafter(x, y); }
-  inline double pow(double x, double y) { return ::pow(x, y); }
-  inline double remainder(double x, double y) { return ::remainder(x, y); }
-  inline double rint(double x) { return ::rint(x); }
-  inline double round(double x) { return ::round(x); }
-  inline bool signbit(double x) { return libc_signbit(x); }
-  inline double sin(double x) { return ::sin(x); }
-  inline double sinh(double x) { return ::sinh(x); }
-  inline double sqrt(double x) { return ::sqrt(x); }
-  inline double tan(double x) { return ::tan(x); }
-  inline double tanh(double x) { return ::tanh(x); }
-  inline double trunc(double x) { return ::trunc(x); }
-  
+
+// Make math functions from math.h available in vml_std
+// (We could instead take some of them -- but not all -- from std.)
+
+inline float acos(float x) { return ::acosf(x); }
+inline float acosh(float x) { return ::acoshf(x); }
+inline float asin(float x) { return ::asinf(x); }
+inline float asinh(float x) { return ::asinhf(x); }
+inline float atan(float x) { return ::atanf(x); }
+inline float atan2(float x, float y) { return ::atan2f(x, y); }
+inline float atanh(float x) { return ::atanhf(x); }
+inline float cbrt(float x) { return ::cbrtf(x); }
+inline float ceil(float x) { return ::ceilf(x); }
+inline float cos(float x) { return ::cosf(x); }
+inline float cosh(float x) { return ::coshf(x); }
+inline float copysign(float x, float y) { return ::copysignf(x, y); }
+inline float exp(float x) { return ::expf(x); }
+inline float exp2(float x) { return ::exp2f(x); }
+inline float expm1(float x) { return ::expm1f(x); }
+inline float fabs(float x) { return ::fabsf(x); }
+inline float fdim(float x, float y) { return ::fdimf(x, y); }
+inline float floor(float x) { return ::floorf(x); }
+inline float fma(float x, float y, float z) { return ::fmaf(x, y, z); }
+inline float fmax(float x, float y) { return ::fmaxf(x, y); }
+inline float fmin(float x, float y) { return ::fminf(x, y); }
+inline float fmod(float x, float y) { return ::fmodf(x, y); }
+inline float frexp(float x, int *r) { return ::frexpf(x, r); }
+inline float hypot(float x, float y) { return ::hypotf(x, y); }
+inline int ilogb(float x) { return ::ilogbf(x); }
+inline bool isfinite(float x) { return libc_isfinite(x); }
+inline bool isinf(float x) { return libc_isinf(x); }
+inline bool isnan(float x) { return libc_isnan(x); }
+inline bool isnormal(float x) { return libc_isnormal(x); }
+inline float ldexp(float x, int n) { return ::ldexpf(x, n); }
+inline long long llrint(float x) { return ::llrintf(x); }
+inline float log(float x) { return ::logf(x); }
+inline float log10(float x) { return ::log10f(x); }
+inline float log1p(float x) { return ::log1pf(x); }
+inline float log2(float x) { return ::log2f(x); }
+inline long lrint(float x) { return ::lrintf(x); }
+inline float nextafter(float x, float y) { return ::nextafterf(x, y); }
+inline float pow(float x, float y) { return ::powf(x, y); }
+inline float remainder(float x, float y) { return ::remainderf(x, y); }
+inline float rint(float x) { return ::rintf(x); }
+inline float round(float x) { return ::roundf(x); }
+inline bool signbit(float x) { return libc_signbit(x); }
+inline float sin(float x) { return ::sinf(x); }
+inline float sinh(float x) { return ::sinhf(x); }
+inline float sqrt(float x) { return ::sqrtf(x); }
+inline float tan(float x) { return ::tanf(x); }
+inline float tanh(float x) { return ::tanhf(x); }
+inline float trunc(float x) { return ::truncf(x); }
+
+inline double acos(double x) { return ::acos(x); }
+inline double acosh(double x) { return ::acosh(x); }
+inline double asin(double x) { return ::asin(x); }
+inline double asinh(double x) { return ::asinh(x); }
+inline double atan(double x) { return ::atan(x); }
+inline double atan2(double x, double y) { return ::atan2(x, y); }
+inline double atanh(double x) { return ::atanh(x); }
+inline double cbrt(double x) { return ::cbrt(x); }
+inline double ceil(double x) { return ::ceil(x); }
+inline double cos(double x) { return ::cos(x); }
+inline double cosh(double x) { return ::cosh(x); }
+inline double copysign(double x, double y) { return ::copysign(x, y); }
+inline double exp(double x) { return ::exp(x); }
+inline double exp2(double x) { return ::exp2(x); }
+inline double expm1(double x) { return ::expm1(x); }
+inline double fabs(double x) { return ::fabs(x); }
+inline double fdim(double x, double y) { return ::fdim(x, y); }
+inline double floor(double x) { return ::floor(x); }
+inline double fma(double x, double y, double z) { return ::fma(x, y, z); }
+inline double fmax(double x, double y) { return ::fmax(x, y); }
+inline double fmin(double x, double y) { return ::fmin(x, y); }
+inline double fmod(double x, double y) { return ::fmod(x, y); }
+inline double frexp(double x, int *r) { return ::frexp(x, r); }
+inline double hypot(double x, double y) { return ::hypot(x, y); }
+inline int ilogb(double x) { return ::ilogb(x); }
+inline bool isfinite(double x) { return libc_isfinite(x); }
+inline bool isinf(double x) { return libc_isinf(x); }
+inline bool isnan(double x) { return libc_isnan(x); }
+inline bool isnormal(double x) { return libc_isnormal(x); }
+inline double ldexp(double x, int n) { return ::ldexp(x, n); }
+inline long long llrint(double x) { return ::llrint(x); }
+inline double log(double x) { return ::log(x); }
+inline double log10(double x) { return ::log10(x); }
+inline double log1p(double x) { return ::log1p(x); }
+inline double log2(double x) { return ::log2(x); }
+inline long lrint(double x) { return ::lrint(x); }
+inline double nextafter(double x, double y) { return ::nextafter(x, y); }
+inline double pow(double x, double y) { return ::pow(x, y); }
+inline double remainder(double x, double y) { return ::remainder(x, y); }
+inline double rint(double x) { return ::rint(x); }
+inline double round(double x) { return ::round(x); }
+inline bool signbit(double x) { return libc_signbit(x); }
+inline double sin(double x) { return ::sin(x); }
+inline double sinh(double x) { return ::sinh(x); }
+inline double sqrt(double x) { return ::sqrt(x); }
+inline double tan(double x) { return ::tan(x); }
+inline double tanh(double x) { return ::tanh(x); }
+inline double trunc(double x) { return ::trunc(x); }
 }
 
 #endif
 
+namespace vecmathlib {
+
+struct fp8 { // 8-bit floating-point stand-in: only the raw storage is defined here
+  // 1 bit sign, 4 bits exponent, 3 bits mantissa, exponent offset 7 (?)
+  vml_std::uint8_t val; // raw bit pattern
+  fp8() {} // leaves val uninitialized
+  fp8(double x) { __builtin_unreachable(); } // conversion from double is not implemented
+};
 
+struct fp16 { // 16-bit floating-point stand-in: only the raw storage is defined here
+  // 1 bit sign, 5 bits exponent, 10 bits mantissa, exponent offset 15 (?)
+  vml_std::uint16_t val; // raw bit pattern
+  fp16() {} // leaves val uninitialized
+  fp16(double x) { __builtin_unreachable(); } // conversion from double is not implemented
+};
 
-namespace vecmathlib {
-  
-  struct fp8 {
-    // 1 bit sign, 4 bits exponent, 3 bits mantissa
-    vml_std::uint8_t val;
-    fp8() {}
-    fp8(double x) { __builtin_unreachable(); }
-  };
-  
-  struct fp16 {
-    // 1 bit sign, 5 bits exponent, 10 bits mantissa
-    vml_std::uint16_t val;
-    fp16() {}
-    fp16(double x) { __builtin_unreachable(); }
-  };
-  
 } // namespace vecmathlib
 
-#endif  // #ifndef FLOATTYPES_H
+#endif // #ifndef FLOATTYPES_H
diff --git a/lib/kernel/vecmathlib/loop.cc b/lib/kernel/vecmathlib/loop.cc
new file mode 100644
index 0000000..8b42970
--- /dev/null
+++ b/lib/kernel/vecmathlib/loop.cc
@@ -0,0 +1,290 @@
+// -*-C++-*-
+
+#define restrict __restrict__
+
+#include "vecmathlib.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+#include <sys/time.h>
+
+using namespace std;
+using namespace vecmathlib;
+
+////////////////////////////////////////////////////////////////////////////////
+// Helpers
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0 // Compatibility with non-clang compilers
+#endif
+
+// Round i up to the nearest multiple of size (size must be nonzero)
+static size_t align_up(size_t i, size_t size) {
+  return (i + size - 1) / size * size; // integer division truncates, hence rounds up
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// High-resolution timer
+////////////////////////////////////////////////////////////////////////////////
+
+typedef unsigned long long ticks; // raw timer reading; its unit depends on the branch below
+inline ticks getticks() {
+#if __has_builtin(__builtin_readcyclecounter)
+  return __builtin_readcyclecounter(); // compiler-provided cycle counter
+#elif defined __x86_64__
+  ticks a, d;
+  asm volatile("rdtsc" : "=a"(a), "=d"(d)); // a receives the low half, d the high half
+  return a | (d << 32);
+#elif defined __powerpc__
+  unsigned int tbl, tbu, tbu1;
+  do { // re-read until the upper half is unchanged, so tbu and tbl belong together
+    asm volatile("mftbu %0" : "=r"(tbu));
+    asm volatile("mftb %0" : "=r"(tbl));
+    asm volatile("mftbu %0" : "=r"(tbu1));
+  } while (tbu != tbu1);
+  return ((unsigned long long)tbu << 32) | tbl;
+#else
+  timeval tv;
+  gettimeofday(&tv, NULL);
+  return 1000000ULL * tv.tv_sec + tv.tv_usec; // fallback: wall-clock microseconds
+// timespec ts;
+// clock_gettime(CLOCK_REALTIME, &ts);
+// return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
+#endif
+}
+inline double elapsed(ticks t1, ticks t0) { return t1 - t0; } // tick difference as a double
+
+double get_sys_time() { // wall-clock time in seconds, read via gettimeofday
+  timeval tp;
+  gettimeofday(&tp, NULL);
+  return tp.tv_sec + 1.0e-6 * tp.tv_usec; // tv_usec counts microseconds
+}
+
+double measure_tick() { // returns wall-clock seconds per getticks() tick
+  ticks const rstart = getticks();
+  double const wstart = get_sys_time();
+  while (get_sys_time() - wstart < 0.1) {
+    // busy-wait until about 0.1 s of wall-clock time has passed
+  }
+  ticks const rend = getticks();
+  double const wend = get_sys_time();
+  assert(wend - wstart >= 0.09); // sanity check on the length of the wait
+  return (wend - wstart) / elapsed(rend, rstart);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Initialize the grid
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename realvec_t>
+void init(typename realvec_t::real_t *restrict xptr, ptrdiff_t m, ptrdiff_t ldm,
+          ptrdiff_t n) { // writes n rows of m entries; consecutive rows are ldm apart
+  for (ptrdiff_t j = 0; j < n; ++j) {
+    for (ptrdiff_t i = 0; i < m; ++i) {
+      const ptrdiff_t ij = ldm * j + i;
+      xptr[ij] = (i + j) % 2; // checkerboard pattern of 0 and 1
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Evolution loop: Simple stencil example (Gaussian smoothing)
+////////////////////////////////////////////////////////////////////////////////
+
+// Hook for adding artificial per-point work, so that cache access is not so important
+template <typename T> static T delay(const T x) {
+  return x; // currently the identity: no extra work is added
+  // return log(exp(x));
+}
+
+// Original version, unvectorized: handles one grid point per inner iteration
+template <typename realvec_t>
+void smooth_scalar(typename realvec_t::real_t const *restrict xptr,
+                   typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+                   ptrdiff_t ldm, ptrdiff_t n) {
+  typedef typename realvec_t::real_t real_t;
+  for (ptrdiff_t j = 1; j < n - 1; ++j) { // interior rows only
+    for (ptrdiff_t i = 1; i < m - 1; ++i) { // interior columns only
+      const ptrdiff_t ij = ldm * j + i; // ldm is the distance between rows
+      const real_t x = xptr[ij];
+      const real_t xil = xptr[ij - 1]; // neighbour at i - 1
+      const real_t xir = xptr[ij + 1]; // neighbour at i + 1
+      const real_t xjl = xptr[ij - ldm]; // neighbour at j - 1
+      const real_t xjr = xptr[ij + ldm]; // neighbour at j + 1
+      const real_t y = // weights: 0.5 for the centre, 0.125 for each neighbour
+          real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr);
+      yptr[ij] = delay(y);
+    }
+  }
+}
+
+// Vectorized version assuming no particular alignment (loadu/storeu throughout)
+template <typename realvec_t>
+void smooth_unaligned(typename realvec_t::real_t const *restrict xptr,
+                      typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+                      ptrdiff_t ldm, ptrdiff_t n) {
+  typedef typename realvec_t::real_t real_t;
+  typedef typename realvec_t::mask_t mask_t;
+  for (ptrdiff_t j = 1; j < n - 1; ++j) { // interior rows only
+    // Desired loop bounds (same column range as smooth_scalar: 1 <= i < m - 1)
+    const ptrdiff_t imin = 1;
+    const ptrdiff_t imax = m - 1;
+    // Align actual loop iterations with vector size
+    const ptrdiff_t ioff = ldm * j; // offset of the first element of row j
+    for (mask_t mask(imin, imax, ioff); mask; ++mask) {
+      const ptrdiff_t i = mask.index(); // NOTE(review): presumably the first column of this vector -- confirm in mask_t
+      const ptrdiff_t ij = ioff + i;
+      const realvec_t x = realvec_t::loadu(xptr + ij);
+      const realvec_t xil = realvec_t::loadu(xptr + ij, -1); // presumably the vector shifted by -1 element -- confirm
+      const realvec_t xir = realvec_t::loadu(xptr + ij, +1); // presumably the vector shifted by +1 element -- confirm
+      const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm); // row j - 1
+      const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm); // row j + 1
+      const realvec_t y = realvec_t(real_t(0.5)) * x +
+                          realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+      storeu(delay(y), yptr + ij, mask); // masked store; presumably skips lanes outside the desired bounds
+    }
+  }
+}
+
+// Assuming that xptr and yptr are aligned, but ldm can be arbitrary
+template <typename realvec_t>
+void smooth_aligned(typename realvec_t::real_t const *restrict xptr,
+                    typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+                    ptrdiff_t ldm, ptrdiff_t n) {
+  typedef typename realvec_t::real_t real_t;
+  typedef typename realvec_t::mask_t mask_t;
+  for (ptrdiff_t j = 1; j < n - 1; ++j) { // interior rows only
+    // Desired loop bounds (same column range as smooth_scalar: 1 <= i < m - 1)
+    const ptrdiff_t imin = 1;
+    const ptrdiff_t imax = m - 1;
+    // Align actual loop iterations with vector size
+    const ptrdiff_t ioff = ldm * j; // offset of the first element of row j
+    for (mask_t mask(imin, imax, ioff); mask; ++mask) {
+      const ptrdiff_t i = mask.index(); // NOTE(review): presumably chosen so that ioff + i is vector-aligned -- confirm
+      const ptrdiff_t ij = ioff + i;
+      const realvec_t x = realvec_t::loada(xptr + ij); // aligned load of the centre
+      const realvec_t xil = realvec_t::loadu(xptr + ij, -1); // presumably the vector shifted by -1 element -- confirm
+      const realvec_t xir = realvec_t::loadu(xptr + ij, +1); // presumably the vector shifted by +1 element -- confirm
+      const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm); // unaligned: ldm is arbitrary
+      const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm); // unaligned: ldm is arbitrary
+      const realvec_t y = realvec_t(real_t(0.5)) * x +
+                          realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+      storea(delay(y), yptr + ij, mask); // aligned masked store
+    }
+  }
+}
+
+// Assuming that xptr and yptr are aligned, and ldm is a multiple of
+// the vector size, so the rows above and below can use aligned loads too
+template <typename realvec_t>
+void smooth_padded(typename realvec_t::real_t const *restrict xptr,
+                   typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+                   ptrdiff_t ldm, ptrdiff_t n) {
+  typedef typename realvec_t::real_t real_t;
+  typedef typename realvec_t::mask_t mask_t;
+  assert(ldm % realvec_t::size == 0); // the padding precondition stated above
+  for (ptrdiff_t j = 1; j < n - 1; ++j) { // interior rows only
+    // Desired loop bounds (same column range as smooth_scalar: 1 <= i < m - 1)
+    const ptrdiff_t imin = 1;
+    const ptrdiff_t imax = m - 1;
+    // Align actual loop iterations with vector size
+    const ptrdiff_t ioff = ldm * j; // offset of the first element of row j
+    for (mask_t mask(imin, imax, ioff); mask; ++mask) {
+      const ptrdiff_t i = mask.index(); // NOTE(review): presumably chosen so that ioff + i is vector-aligned -- confirm
+      const ptrdiff_t ij = ioff + i;
+      const realvec_t x = realvec_t::loada(xptr + ij); // aligned load of the centre
+      const realvec_t xil = realvec_t::loadu(xptr + ij, -1); // presumably the vector shifted by -1 element -- confirm
+      const realvec_t xir = realvec_t::loadu(xptr + ij, +1); // presumably the vector shifted by +1 element -- confirm
+      const realvec_t xjl = realvec_t::loada(xptr + ij - ldm); // aligned: ldm is a multiple of the vector size
+      const realvec_t xjr = realvec_t::loada(xptr + ij + ldm); // aligned: ldm is a multiple of the vector size
+      const realvec_t y = realvec_t(real_t(0.5)) * x +
+                          realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+      storea(delay(y), yptr + ij, mask); // aligned masked store
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Main routine
+////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char **argv) { // benchmark driver; argc and argv are unused
+  // Number of iterations
+  const int niters = 100;
+
+  // Grid size
+  const ptrdiff_t m = 100;
+  const ptrdiff_t n = 100;
+
+// Choose a vector size
+#if defined VECMATHLIB_HAVE_VEC_DOUBLE_4
+  typedef realvec<double, 4> realvec_t;
+#elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2
+  typedef realvec<double, 2> realvec_t;
+#else
+  typedef realpseudovec<double, 1> realvec_t;
+#endif
+
+  // Pad the row length to a multiple of the vector size
+  const ptrdiff_t ldm = align_up(m, realvec_t::size);
+  typedef realvec_t::real_t real_t;
+  vector<real_t> x0(ldm * n + realvec_t::size - 1), // spare elements leave room to align
+      y0(ldm * n + realvec_t::size - 1);
+  real_t *restrict const x = // first vector-aligned address inside x0
+      (real_t *)align_up(intptr_t(&x0[0]), sizeof(realvec_t));
+  real_t *restrict const y = // first vector-aligned address inside y0
+      (real_t *)align_up(intptr_t(&y0[0]), sizeof(realvec_t));
+  for (ptrdiff_t i = 0; i < ldm * n; ++i)
+    y[i] = 0.0;
+
+  // Initialize
+  init<realvec_t>(&x[0], m, ldm, n);
+
+  // Timers
+  ticks t0, t1;
+  double const cycles_per_tick = 1.0; // measure_tick();
+  double cycles;
+
+  // Run the different evolution loop versions; each sweep updates the
+  t0 = getticks(); // (n - 2) * (m - 2) interior points, which is the divisor below
+  for (int iter = 0; iter < niters; ++iter) {
+    smooth_scalar<realvec_t>(&x[0], &y[0], m, ldm, n);
+  }
+  t1 = getticks();
+  cycles =
+      cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 2) * (m - 2) * niters);
+  cout << "smooth_scalar:    " << cycles << " cycles/point\n";
+
+  t0 = getticks();
+  for (int iter = 0; iter < niters; ++iter) {
+    smooth_unaligned<realvec_t>(&x[0], &y[0], m, ldm, n);
+  }
+  t1 = getticks();
+  cycles =
+      cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 2) * (m - 2) * niters);
+  cout << "smooth_unaligned: " << cycles << " cycles/point\n";
+
+  t0 = getticks();
+  for (int iter = 0; iter < niters; ++iter) {
+    smooth_aligned<realvec_t>(&x[0], &y[0], m, ldm, n);
+  }
+  t1 = getticks();
+  cycles =
+      cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 2) * (m - 2) * niters);
+  cout << "smooth_aligned:   " << cycles << " cycles/point\n";
+
+  t0 = getticks();
+  for (int iter = 0; iter < niters; ++iter) {
+    smooth_padded<realvec_t>(&x[0], &y[0], m, ldm, n);
+  }
+  t1 = getticks();
+  cycles =
+      cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 2) * (m - 2) * niters);
+  cout << "smooth_padded:    " << cycles << " cycles/point\n";
+
+  return 0;
+}
diff --git a/lib/kernel/vecmathlib/mathfuncs.h b/lib/kernel/vecmathlib/mathfuncs.h
index 8d90f9a..9f042d1 100644
--- a/lib/kernel/vecmathlib/mathfuncs.h
+++ b/lib/kernel/vecmathlib/mathfuncs.h
@@ -19,4 +19,4 @@
 #include "mathfuncs_sinh.h"
 #include "mathfuncs_sqrt.h"
 
-#endif  // #ifndef MATHFUNCS_H
+#endif // #ifndef MATHFUNCS_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_asin.h b/lib/kernel/vecmathlib/mathfuncs_asin.h
index 3dd9c75..cd174a2 100644
--- a/lib/kernel/vecmathlib/mathfuncs_asin.h
+++ b/lib/kernel/vecmathlib/mathfuncs_asin.h
@@ -7,206 +7,181 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+namespace {
 
-namespace vecmathlib {
-  
-  
-  
-  namespace {
-    
-    template<typename realvec_t>
-    realvec_t mulsign(realvec_t x, realvec_t y)
-    {
-      typedef typename realvec_t::real_t real_t;
-      typedef typename realvec_t::intvec_t intvec_t;
-      typedef intvec_t IV;
-      typedef floatprops<real_t> FP;
-      
-      intvec_t value = as_int(x);
-      intvec_t sign = as_int(y) & IV(FP::signbit_mask);
-      return as_float(value ^ sign);
-    }
-    
-    // Note: the order of arguments is y, x, as is convention for atan2
-    template<typename realvec_t>
-    realvec_t atan2k(realvec_t y, realvec_t x)
-    {
-      // Algorithm taken from SLEEF 2.80
-      
-      typedef typename realvec_t::real_t real_t;
-      typedef typename realvec_t::boolvec_t boolvec_t;
-      typedef realvec_t RV;
-      
-      realvec_t q = RV(0.0);
-      
-      q = ifthen(signbit(x), RV(-2.0), q);
-      x = fabs(x);
-      
-      boolvec_t cond = y > x;
-      realvec_t x0 = x;
-      realvec_t y0 = y;
-      x = ifthen(cond,  y0, x0);
-      y = ifthen(cond, -x0, y0);
-      q += ifthen(cond, RV(1.0), RV(0.0));
-      
-      realvec_t s = y / x;
-      realvec_t t = s * s;
-      
-      realvec_t u;
-      switch (sizeof(real_t)) {
-      default: __builtin_unreachable();
-      case sizeof(float):
-        u = RV(0.00282363896258175373077393f);
-        u = mad(u, t, RV(-0.0159569028764963150024414f));
-        u = mad(u, t, RV(0.0425049886107444763183594f));
-        u = mad(u, t, RV(-0.0748900920152664184570312f));
-        u = mad(u, t, RV(0.106347933411598205566406f));
-        u = mad(u, t, RV(-0.142027363181114196777344f));
-        u = mad(u, t, RV(0.199926957488059997558594f));
-        u = mad(u, t, RV(-0.333331018686294555664062f));
-        break;
-      case sizeof(double):
-        u = RV(-1.88796008463073496563746e-05);
-        u = mad(u, t, RV(0.000209850076645816976906797));
-        u = mad(u, t, RV(-0.00110611831486672482563471));
-        u = mad(u, t, RV(0.00370026744188713119232403));
-        u = mad(u, t, RV(-0.00889896195887655491740809));
-        u = mad(u, t, RV(0.016599329773529201970117));
-        u = mad(u, t, RV(-0.0254517624932312641616861));
-        u = mad(u, t, RV(0.0337852580001353069993897));
-        u = mad(u, t, RV(-0.0407629191276836500001934));
-        u = mad(u, t, RV(0.0466667150077840625632675));
-        u = mad(u, t, RV(-0.0523674852303482457616113));
-        u = mad(u, t, RV(0.0587666392926673580854313));
-        u = mad(u, t, RV(-0.0666573579361080525984562));
-        u = mad(u, t, RV(0.0769219538311769618355029));
-        u = mad(u, t, RV(-0.090908995008245008229153));
-        u = mad(u, t, RV(0.111111105648261418443745));
-        u = mad(u, t, RV(-0.14285714266771329383765));
-        u = mad(u, t, RV(0.199999999996591265594148));
-        u = mad(u, t, RV(-0.333333333333311110369124));
-        break;
-      }
-      
-      t = mad(u, t * s, s);
-      t = mad(q, RV(M_PI_2), t);
-      
-      return t;
-    }
-    
-  }
-  
+template <typename realvec_t> realvec_t mulsign(realvec_t x, realvec_t y) {
+  typedef typename realvec_t::real_t real_t;
+  typedef typename realvec_t::intvec_t intvec_t;
+  typedef intvec_t IV;
+  typedef floatprops<real_t> FP;
 
+  intvec_t value = as_int(x); // presumably the raw bits of x -- confirm as_int
+  intvec_t sign = as_int(y) & IV(FP::signbit_mask); // keep only the sign bit of y
+  return as_float(value ^ sign); // x with its sign flipped wherever y's sign bit is set
+}
 
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    return mulsign(atan2k(fabs(d), sqrt((RV(1.0)+d)*(RV(1.0)-d))), d);
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    return (mulsign(atan2k(sqrt((RV(1.0)+d)*(RV(1.0)-d)), fabs(d)), d) +
-            ifthen(d < RV(0.0), RV(M_PI), RV(0.0)));
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t s)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    realvec_t q1 = s;
-    s = fabs(s);
-    
-    boolvec_t q0 = s > RV(1.0);
-    s = ifthen(q0, rcp(s), s);
-    
-    realvec_t t = s * s;
-    
-    realvec_t u;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      u = RV(0.00282363896258175373077393f);
-      u = mad(u, t, RV(-0.0159569028764963150024414f));
-      u = mad(u, t, RV(0.0425049886107444763183594f));
-      u = mad(u, t, RV(-0.0748900920152664184570312f));
-      u = mad(u, t, RV(0.106347933411598205566406f));
-      u = mad(u, t, RV(-0.142027363181114196777344f));
-      u = mad(u, t, RV(0.199926957488059997558594f));
-      u = mad(u, t, RV(-0.333331018686294555664062f));
-      break;
-    case sizeof(double):
-      u = RV(-1.88796008463073496563746e-05);
-      u = mad(u, t, RV(0.000209850076645816976906797));
-      u = mad(u, t, RV(-0.00110611831486672482563471));
-      u = mad(u, t, RV(0.00370026744188713119232403));
-      u = mad(u, t, RV(-0.00889896195887655491740809));
-      u = mad(u, t, RV(0.016599329773529201970117));
-      u = mad(u, t, RV(-0.0254517624932312641616861));
-      u = mad(u, t, RV(0.0337852580001353069993897));
-      u = mad(u, t, RV(-0.0407629191276836500001934));
-      u = mad(u, t, RV(0.0466667150077840625632675));
-      u = mad(u, t, RV(-0.0523674852303482457616113));
-      u = mad(u, t, RV(0.0587666392926673580854313));
-      u = mad(u, t, RV(-0.0666573579361080525984562));
-      u = mad(u, t, RV(0.0769219538311769618355029));
-      u = mad(u, t, RV(-0.090908995008245008229153));
-      u = mad(u, t, RV(0.111111105648261418443745));
-      u = mad(u, t, RV(-0.14285714266771329383765));
-      u = mad(u, t, RV(0.199999999996591265594148));
-      u = mad(u, t, RV(-0.333333333333311110369124));
-      break;
-    }
-    
-    t = s + s * (t * u);
-    
-    t = ifthen(q0, RV(M_PI_2) - t, t);
-    t = copysign(t, q1);
-    
-    return t;
+// Note: the order of arguments is y, x, as is convention for atan2
+template <typename realvec_t> realvec_t atan2k(realvec_t y, realvec_t x) {
+  // Algorithm taken from SLEEF 2.80
+
+  typedef typename realvec_t::real_t real_t;
+  typedef typename realvec_t::boolvec_t boolvec_t;
+  typedef realvec_t RV;
+
+  realvec_t q = RV(0.0);
+
+  q = ifthen(signbit(x), RV(-2.0), q);
+  x = fabs(x);
+
+  boolvec_t cond = y > x;
+  realvec_t x0 = x;
+  realvec_t y0 = y;
+  x = ifthen(cond, y0, x0);
+  y = ifthen(cond, -x0, y0);
+  q += ifthen(cond, RV(1.0), RV(0.0));
+
+  realvec_t s = y / x;
+  realvec_t t = s * s;
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(0.00282363896258175373077393f);
+    u = mad(u, t, RV(-0.0159569028764963150024414f));
+    u = mad(u, t, RV(0.0425049886107444763183594f));
+    u = mad(u, t, RV(-0.0748900920152664184570312f));
+    u = mad(u, t, RV(0.106347933411598205566406f));
+    u = mad(u, t, RV(-0.142027363181114196777344f));
+    u = mad(u, t, RV(0.199926957488059997558594f));
+    u = mad(u, t, RV(-0.333331018686294555664062f));
+    break;
+  case sizeof(double):
+    u = RV(-1.88796008463073496563746e-05);
+    u = mad(u, t, RV(0.000209850076645816976906797));
+    u = mad(u, t, RV(-0.00110611831486672482563471));
+    u = mad(u, t, RV(0.00370026744188713119232403));
+    u = mad(u, t, RV(-0.00889896195887655491740809));
+    u = mad(u, t, RV(0.016599329773529201970117));
+    u = mad(u, t, RV(-0.0254517624932312641616861));
+    u = mad(u, t, RV(0.0337852580001353069993897));
+    u = mad(u, t, RV(-0.0407629191276836500001934));
+    u = mad(u, t, RV(0.0466667150077840625632675));
+    u = mad(u, t, RV(-0.0523674852303482457616113));
+    u = mad(u, t, RV(0.0587666392926673580854313));
+    u = mad(u, t, RV(-0.0666573579361080525984562));
+    u = mad(u, t, RV(0.0769219538311769618355029));
+    u = mad(u, t, RV(-0.090908995008245008229153));
+    u = mad(u, t, RV(0.111111105648261418443745));
+    u = mad(u, t, RV(-0.14285714266771329383765));
+    u = mad(u, t, RV(0.199999999996591265594148));
+    u = mad(u, t, RV(-0.333333333333311110369124));
+    break;
   }
 
-  
-  
-  // Note: the order of arguments is y, x, as is convention for atan2
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_atan2(realvec_t y, realvec_t x)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    realvec_t r = atan2k(fabs(y), x);
-    
-    r = mulsign(r, x);
-    
-    r = ifthen(isinf(x) || x == RV(0.0),
-               ifthen(isinf(x),
-                      RV(M_PI_2) - copysign(RV(M_PI_2), x),
-                      RV(M_PI_2)),
-               r);
-    
-    r = ifthen(isinf(y),
-               ifthen(isinf(x),
-                      RV(M_PI_2) - copysign(RV(M_PI_4), x),
-                      RV(M_PI_2)),
-               r);
-    
-    r = ifthen(y == RV(0.0),
-               ifthen(signbit(x), RV(M_PI), RV(0.0)),
-               r);
-    
-    const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
-    return ifthen(isnan(x) || isnan(y), RV(nan), mulsign(r, y));
+  t = mad(u, t * s, s);
+  t = mad(q, RV(M_PI_2), t);
+
+  return t;
+}
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80: atan2k(|d|, sqrt(1 - d^2)), then the sign of d
+  return mulsign(atan2k(fabs(d), sqrt((RV(1.0) + d) * (RV(1.0) - d))), d);
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80: atan2k(sqrt(1 - d^2), |d|); for d < 0 it is negated and pi is added
+  return (mulsign(atan2k(sqrt((RV(1.0) + d) * (RV(1.0) - d)), fabs(d)), d) +
+          ifthen(d < RV(0.0), RV(M_PI), RV(0.0)));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t s) {
+  // Algorithm taken from SLEEF 2.80
+
+  realvec_t q1 = s;
+  s = fabs(s);
+
+  boolvec_t q0 = s > RV(1.0);
+  s = ifthen(q0, rcp(s), s);
+
+  realvec_t t = s * s;
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(0.00282363896258175373077393f);
+    u = mad(u, t, RV(-0.0159569028764963150024414f));
+    u = mad(u, t, RV(0.0425049886107444763183594f));
+    u = mad(u, t, RV(-0.0748900920152664184570312f));
+    u = mad(u, t, RV(0.106347933411598205566406f));
+    u = mad(u, t, RV(-0.142027363181114196777344f));
+    u = mad(u, t, RV(0.199926957488059997558594f));
+    u = mad(u, t, RV(-0.333331018686294555664062f));
+    break;
+  case sizeof(double):
+    u = RV(-1.88796008463073496563746e-05);
+    u = mad(u, t, RV(0.000209850076645816976906797));
+    u = mad(u, t, RV(-0.00110611831486672482563471));
+    u = mad(u, t, RV(0.00370026744188713119232403));
+    u = mad(u, t, RV(-0.00889896195887655491740809));
+    u = mad(u, t, RV(0.016599329773529201970117));
+    u = mad(u, t, RV(-0.0254517624932312641616861));
+    u = mad(u, t, RV(0.0337852580001353069993897));
+    u = mad(u, t, RV(-0.0407629191276836500001934));
+    u = mad(u, t, RV(0.0466667150077840625632675));
+    u = mad(u, t, RV(-0.0523674852303482457616113));
+    u = mad(u, t, RV(0.0587666392926673580854313));
+    u = mad(u, t, RV(-0.0666573579361080525984562));
+    u = mad(u, t, RV(0.0769219538311769618355029));
+    u = mad(u, t, RV(-0.090908995008245008229153));
+    u = mad(u, t, RV(0.111111105648261418443745));
+    u = mad(u, t, RV(-0.14285714266771329383765));
+    u = mad(u, t, RV(0.199999999996591265594148));
+    u = mad(u, t, RV(-0.333333333333311110369124));
+    break;
   }
-  
+
+  t = s + s * (t * u);
+
+  t = ifthen(q0, RV(M_PI_2) - t, t);
+  t = copysign(t, q1);
+
+  return t;
+}
+
+// Note: the order of arguments is y, x, as is convention for atan2
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_atan2(realvec_t y, realvec_t x) {
+  // Algorithm taken from SLEEF 2.80
+  // General case: evaluate with |y|; the sign of y is applied in the return
+  realvec_t r = atan2k(fabs(y), x);
+
+  r = mulsign(r, x);
+  // x infinite or zero: 0 for x = +inf, pi for x = -inf, pi/2 for x = 0
+  r = ifthen(isinf(x) || x == RV(0.0),
+             ifthen(isinf(x), RV(M_PI_2) - copysign(RV(M_PI_2), x), RV(M_PI_2)),
+             r);
+  // y infinite: pi/4 for x = +inf, 3*pi/4 for x = -inf, pi/2 otherwise
+  r = ifthen(isinf(y),
+             ifthen(isinf(x), RV(M_PI_2) - copysign(RV(M_PI_4), x), RV(M_PI_2)),
+             r);
+  // y zero: pi if the sign bit of x is set, 0 otherwise
+  r = ifthen(y == RV(0.0), ifthen(signbit(x), RV(M_PI), RV(0.0)), r);
+  // NaN in either argument gives NaN; otherwise apply the sign of y
+  const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+  return ifthen(isnan(x) || isnan(y), RV(nan), mulsign(r, y));
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_ASIN_H
+#endif // #ifndef MATHFUNCS_ASIN_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_asinh.h b/lib/kernel/vecmathlib/mathfuncs_asinh.h
index c7be8eb..1197261 100644
--- a/lib/kernel/vecmathlib/mathfuncs_asinh.h
+++ b/lib/kernel/vecmathlib/mathfuncs_asinh.h
@@ -7,36 +7,31 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_asinh(realvec_t x) {
+  // Evaluate on |x|, then copy the sign of x onto the result
+  realvec_t r = fabs(x);
+  r = log(r + sqrt(r * r + RV(1.0))); // log(r + sqrt(r^2 + 1))
+  r = copysign(r, x);
+  return r;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_acosh(realvec_t x) {
+  return log(x + sqrt(x * x - RV(1.0))); // log(x + sqrt(x^2 - 1)), no special-casing
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_atanh(realvec_t x) {
+  // Evaluate on |x|, then copy the sign of x onto the result
+  realvec_t r = fabs(x);
+  r = RV(0.5) * log((RV(1.0) + r) / (RV(1.0) - r)); // 0.5 * log((1 + r) / (1 - r))
+  r = copysign(r, x);
+  return r;
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_asinh(realvec_t x)
-  {
-    // Reduce range
-    realvec_t r = fabs(x);
-    r = log(r + sqrt(r*r + RV(1.0)));
-    r = copysign(r, x);
-    return r;
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_acosh(realvec_t x)
-  {
-    return log(x + sqrt(x*x - RV(1.0)));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_atanh(realvec_t x)
-  {
-    // Reduce range
-    realvec_t r = fabs(x);
-    r = RV(0.5) * log((RV(1.0) + r) / (RV(1.0) - r));
-    r = copysign(r, x);
-    return r;
-  }
-  
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_ASINH_H
+#endif // #ifndef MATHFUNCS_ASINH_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_base.h b/lib/kernel/vecmathlib/mathfuncs_base.h
index c685542..8545003 100644
--- a/lib/kernel/vecmathlib/mathfuncs_base.h
+++ b/lib/kernel/vecmathlib/mathfuncs_base.h
@@ -5,130 +5,127 @@
 
 #include "floatprops.h"
 
+namespace vecmathlib {
 
+template <typename realvec_t> struct mathfuncs {
+  typedef floatprops<typename realvec_t::real_t> FP;
+
+  typedef typename FP::real_t real_t;
+  typedef typename FP::int_t int_t;
+  typedef typename FP::uint_t uint_t;
+
+  static int const size = realvec_t::size;
+
+  // typedef realvec<real_t, size> realvec_t;
+  typedef typename realvec_t::intvec_t intvec_t;
+  typedef typename realvec_t::boolvec_t boolvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  // static real_t R(double a) { return real_t(a); }
+  // static int_t I(int a) { return int_t(a); }
+  // static uint_t U(int a) { return uint_t(a); }
+  // static realvec_t RV(real_t a) { return realvec_t(a); }
+  // static intvec_t IV(int_t a) { return intvec_t(a); }
+  // static boolvec_t BV(bool a) { return boolvec_t(a); }
+
+  // int
+  static intvec_t vml_abs(intvec_t x);
+  static intvec_t vml_bitifthen(intvec_t x, intvec_t y, intvec_t z);
+  static intvec_t vml_clz(intvec_t x);
+  static boolvec_t vml_isignbit(intvec_t x);
+  static intvec_t vml_max(intvec_t x, intvec_t y);
+  static intvec_t vml_min(intvec_t x, intvec_t y);
+  static intvec_t vml_popcount(intvec_t x);
+  static intvec_t vml_rotate(intvec_t x, int_t n);
+  static intvec_t vml_rotate(intvec_t x, intvec_t n);
+
+  // asin
+  static realvec_t vml_acos(realvec_t x);
+  static realvec_t vml_asin(realvec_t x);
+  static realvec_t vml_atan(realvec_t x);
+  static realvec_t vml_atan2(realvec_t y, realvec_t x);
+
+  // asinh
+  static realvec_t vml_acosh(realvec_t x);
+  static realvec_t vml_asinh(realvec_t x);
+  static realvec_t vml_atanh(realvec_t x);
+
+  // convert
+  static realvec_t vml_antitrunc(realvec_t x);
+  static realvec_t vml_ceil(realvec_t x);
+  static realvec_t vml_convert_float(intvec_t x);
+  static intvec_t vml_convert_int(realvec_t x);
+  static realvec_t vml_floor(realvec_t x);
+  static intvec_t vml_lrint(realvec_t x);
+  static realvec_t vml_rint(realvec_t x);
+  static realvec_t vml_round(realvec_t x);
+  static realvec_t vml_nextafter(realvec_t x, realvec_t y);
+  static realvec_t vml_trunc(realvec_t x);
+
+  // fabs
+  static realvec_t vml_copysign(realvec_t x, realvec_t y);
+  static realvec_t vml_fabs(realvec_t x);
+  static realvec_t vml_fdim(realvec_t x, realvec_t y);
+  static realvec_t vml_fma(realvec_t x, realvec_t y, realvec_t z);
+  static realvec_t vml_fmax(realvec_t x, realvec_t y);
+  static realvec_t vml_fmin(realvec_t x, realvec_t y);
+  static realvec_t vml_frexp(realvec_t x, intvec_t *r);
+  static intvec_t vml_ilogb(realvec_t x);
+  static boolvec_t vml_ieee_isfinite(realvec_t x);
+  static boolvec_t vml_ieee_isinf(realvec_t x);
+  static boolvec_t vml_ieee_isnan(realvec_t x);
+  static boolvec_t vml_ieee_isnormal(realvec_t x);
+  static boolvec_t vml_isfinite(realvec_t x);
+  static boolvec_t vml_isinf(realvec_t x);
+  static boolvec_t vml_isnan(realvec_t x);
+  static boolvec_t vml_isnormal(realvec_t x);
+  static realvec_t vml_ldexp(realvec_t x, intvec_t n);
+  static realvec_t vml_mad(realvec_t x, realvec_t y, realvec_t z);
+  static boolvec_t vml_signbit(realvec_t x);
+
+  // exp
+  static realvec_t vml_exp(realvec_t x);
+  static realvec_t vml_exp10(realvec_t x);
+  static realvec_t vml_exp2(realvec_t x);
+  static realvec_t vml_expm1(realvec_t x);
+
+  // log
+  static realvec_t vml_log(realvec_t x);
+  static realvec_t vml_log10(realvec_t x);
+  static realvec_t vml_log1p(realvec_t x);
+  static realvec_t vml_log2(realvec_t x);
+
+  // pow
+  static realvec_t vml_pow(realvec_t x, realvec_t y);
+
+  // rcp
+  static realvec_t vml_fmod(realvec_t x, realvec_t y);
+  static realvec_t vml_rcp(realvec_t x);
+  static realvec_t vml_remainder(realvec_t x, realvec_t y);
+
+  // sin
+  static realvec_t vml_cos(realvec_t x);
+  static realvec_t vml_sin(realvec_t x);
+  static realvec_t vml_tan(realvec_t x);
+
+  // sinh
+  static realvec_t vml_cosh(realvec_t x);
+  static realvec_t vml_sinh(realvec_t x);
+  static realvec_t vml_tanh(realvec_t x);
+
+  // sqrt
+  static realvec_t vml_cbrt(realvec_t x);
+  static realvec_t vml_hypot(realvec_t x, realvec_t y);
+  static realvec_t vml_rsqrt(realvec_t x);
+  static realvec_t vml_sqrt(realvec_t x);
+};
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  struct mathfuncs {
-    typedef floatprops<typename realvec_t::real_t> FP;
-    
-    typedef typename FP::real_t real_t;
-    typedef typename FP::int_t int_t;
-    typedef typename FP::uint_t uint_t;
-    
-    static int const size = realvec_t::size;
-    
-    // typedef realvec<real_t, size> realvec_t;
-    typedef typename realvec_t::intvec_t intvec_t;
-    typedef typename realvec_t::boolvec_t boolvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    // static real_t R(double a) { return real_t(a); }
-    // static int_t I(int a) { return int_t(a); }
-    // static uint_t U(int a) { return uint_t(a); }
-    // static realvec_t RV(real_t a) { return realvec_t(a); }
-    // static intvec_t IV(int_t a) { return intvec_t(a); }
-    // static boolvec_t BV(bool a) { return boolvec_t(a); }
-    
-    // int
-    static intvec_t vml_abs(intvec_t x);
-    static intvec_t vml_bitifthen(intvec_t x, intvec_t y, intvec_t z);
-    static intvec_t vml_clz(intvec_t x);
-    static boolvec_t vml_isignbit(intvec_t x);
-    static intvec_t vml_max(intvec_t x, intvec_t y);
-    static intvec_t vml_min(intvec_t x, intvec_t y);
-    static intvec_t vml_popcount(intvec_t x);
-    static intvec_t vml_rotate(intvec_t x, int_t n);
-    static intvec_t vml_rotate(intvec_t x, intvec_t n);
-    
-    // asin
-    static realvec_t vml_acos(realvec_t x);
-    static realvec_t vml_asin(realvec_t x);
-    static realvec_t vml_atan(realvec_t x);
-    static realvec_t vml_atan2(realvec_t y, realvec_t x);
-    
-    // asinh
-    static realvec_t vml_acosh(realvec_t x);
-    static realvec_t vml_asinh(realvec_t x);
-    static realvec_t vml_atanh(realvec_t x);
-    
-    // convert
-    static realvec_t vml_antitrunc(realvec_t x);
-    static realvec_t vml_ceil(realvec_t x);
-    static realvec_t vml_convert_float(intvec_t x);
-    static intvec_t vml_convert_int(realvec_t x);
-    static realvec_t vml_floor(realvec_t x);
-    static intvec_t vml_lrint(realvec_t x);
-    static realvec_t vml_rint(realvec_t x);
-    static realvec_t vml_round(realvec_t x);
-    static realvec_t vml_nextafter(realvec_t x, realvec_t y);
-    static realvec_t vml_trunc(realvec_t x);
-    
-    // fabs
-    static realvec_t vml_copysign(realvec_t x, realvec_t y);
-    static realvec_t vml_fabs(realvec_t x);
-    static realvec_t vml_fdim(realvec_t x, realvec_t y);
-    static realvec_t vml_fma(realvec_t x, realvec_t y, realvec_t z);
-    static realvec_t vml_fmax(realvec_t x, realvec_t y);
-    static realvec_t vml_fmin(realvec_t x, realvec_t y);
-    static realvec_t vml_frexp(realvec_t x, intvec_t* r);
-    static intvec_t vml_ilogb(realvec_t x);
-    static boolvec_t vml_ieee_isfinite(realvec_t x);
-    static boolvec_t vml_ieee_isinf(realvec_t x);
-    static boolvec_t vml_ieee_isnan(realvec_t x);
-    static boolvec_t vml_ieee_isnormal(realvec_t x);
-    static boolvec_t vml_isfinite(realvec_t x);
-    static boolvec_t vml_isinf(realvec_t x);
-    static boolvec_t vml_isnan(realvec_t x);
-    static boolvec_t vml_isnormal(realvec_t x);
-    static realvec_t vml_ldexp(realvec_t x, intvec_t n);
-    static realvec_t vml_mad(realvec_t x, realvec_t y, realvec_t z);
-    static boolvec_t vml_signbit(realvec_t x);
-    
-    // exp
-    static realvec_t vml_exp(realvec_t x);
-    static realvec_t vml_exp10(realvec_t x);
-    static realvec_t vml_exp2(realvec_t x);
-    static realvec_t vml_expm1(realvec_t x);
-    
-    // log
-    static realvec_t vml_log(realvec_t x);
-    static realvec_t vml_log10(realvec_t x);
-    static realvec_t vml_log1p(realvec_t x);
-    static realvec_t vml_log2(realvec_t x);
-    
-    // pow
-    static realvec_t vml_pow(realvec_t x, realvec_t y);
-    
-    // rcp
-    static realvec_t vml_fmod(realvec_t x, realvec_t y);
-    static realvec_t vml_rcp(realvec_t x);
-    static realvec_t vml_remainder(realvec_t x, realvec_t y);
-    
-    // sin
-    static realvec_t vml_cos(realvec_t x);
-    static realvec_t vml_sin(realvec_t x);
-    static realvec_t vml_tan(realvec_t x);
-    
-    // sinh
-    static realvec_t vml_cosh(realvec_t x);
-    static realvec_t vml_sinh(realvec_t x);
-    static realvec_t vml_tanh(realvec_t x);
-    
-    // sqrt
-    static realvec_t vml_cbrt(realvec_t x);
-    static realvec_t vml_hypot(realvec_t x, realvec_t y);
-    static realvec_t vml_rsqrt(realvec_t x);
-    static realvec_t vml_sqrt(realvec_t x);
-  };
-  
 } // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_BASE_H
+#endif // #ifndef MATHFUNCS_BASE_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_convert.h b/lib/kernel/vecmathlib/mathfuncs_convert.h
index 79befbc..9cb1add 100644
--- a/lib/kernel/vecmathlib/mathfuncs_convert.h
+++ b/lib/kernel/vecmathlib/mathfuncs_convert.h
@@ -7,197 +7,179 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x) {
+  // Convert in two passes. Convert as much as possible during the
+  // first pass (lobits), so that the second pass (hibits) may be
+  // omitted if the high bits are known to be zero.
+  int_t lobits = FP::mantissa_bits;
+  // int_t hibits = FP::bits - lobits;
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x)
-  {
-    // Convert in two passes. Convert as much as possible during the
-    // first pass (lobits), so that the second pass (hibits) may be
-    // omitted if the high bits are known to be zero.
-    int_t lobits = FP::mantissa_bits;
-    // int_t hibits = FP::bits - lobits;
-    
-    // Convert lower bits
-    intvec_t xlo = x & IV((U(1) << lobits) - 1);
-    // exponent for the equivalent floating point number
-    int_t exponent_lo = (FP::exponent_offset + lobits) << FP::mantissa_bits;
-    xlo |= exponent_lo;
-    // subtract hidden mantissa bit
-    realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo));
-    
-    // Convert upper bits
-    // make unsigned by subtracting largest negative number
-    // (only do this for the high bits, since they have sufficient
-    // precision to handle the overflow)
-    x ^= FP::signbit_mask;
-    intvec_t xhi = lsr(x, lobits);
-    // exponent for the equivalent floating point number
-    int_t exponent_hi = (FP::exponent_offset + 2*lobits) << FP::mantissa_bits;
-    xhi |= exponent_hi;
-    // subtract hidden mantissa bit
-    realvec_t fhi = as_float(xhi) - RV(FP::as_float(exponent_hi));
-    // add largest negative number again
-    fhi -= RV(R(FP::signbit_mask));
-    // Ensure that the converted low and high bits are calculated
-    // separately, since a real_t doesn't have enough precision to
-    // hold all the bits of an int_t
-    fhi.barrier();
-    
-    // Combine results
-    return flo + fhi;
-  }
-  
-  
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t
-  mathfuncs<realvec_t>::vml_convert_int(realvec_t x)
-  {
-    // Handle overflow
-    // int_t min_int = FP::signbit_mask;
-    // int_t max_int = ~FP::signbit_mask;
-    // boolvec_t is_overflow = x < RV(R(min_int)) || x > RV(R(max_int));
-    // Handle negative numbers
-    boolvec_t is_negative = signbit(x);
-    x = fabs(x);
-    // Handle small numbers
-    boolvec_t issmall = x < RV(1.0);
-    
-    intvec_t shift = ilogb(x) - IV(FP::mantissa_bits);
-    boolvec_t shift_left = x > RV(std::ldexp(R(1.0), FP::mantissa_bits)); 
-    intvec_t ix = as_int(x) & IV(FP::mantissa_mask);
-    // add hidden mantissa bit
-    ix |= U(1) << FP::mantissa_bits;
-    // shift according to exponent (which may truncate)
-    ix = ifthen(shift_left, ix << shift, ix >> -shift);
-    
-    // Handle small numbers
-    ix = ifthen(issmall, IV(I(0)), ix);
-    // Handle negative numbers
-    ix = ifthen(is_negative, -ix, ix);
-    // Handle overflow
-    // ix = ifthen(is_overflow, IV(min_int), ix);
-    
-    return ix;
-  }
-  
-  
-  
-  // Round to nearest integer, breaking ties using prevailing rounding
-  // mode (default: round to even)
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_rint(realvec_t x)
-  {
-    realvec_t r = x;
-    // Round by adding a large number, destroying all excess precision
-    realvec_t offset = copysign(RV(std::ldexp(R(1.0), FP::mantissa_bits)), x);
-    r += offset;
-    // Ensure the rounding is not optimised away
-    r.barrier();
-    r -= offset;
-    return r;
-  }
-  
-  // Round to next integer above
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_ceil(realvec_t x)
-  {
-    // boolvec_t iszero = x == RV(0.0);
-    // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
-    // return ifthen(iszero, x, rint(x + offset));
-    return ifthen(x<RV(0.0), trunc(x), vml_antitrunc(x));
-  }
-  
-  // Round to next integer below
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_floor(realvec_t x)
-  {
-    // boolvec_t iszero = x == RV(0.0);
-    // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
-    // return ifthen(iszero, x, rint(x - offset));
-    return ifthen(x<RV(0.0), vml_antitrunc(x), trunc(x));
-  }
-  
-  // Round to nearest integer, breaking ties using prevailing rounding
-  // mode (default: round to even), returning an integer
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_lrint(realvec_t x)
-  {
-    return convert_int(rint(x));
-  }
-  
-  // Round to nearest integer, breaking ties away from zero
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_round(realvec_t x)
-  {
-    // return copysign(floor(fabs(x)+RV(0.5)), x);
-    return trunc(x + copysign(RV(0.5), x));
-  }
-  
-  // Round to next integer towards zero
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_trunc(realvec_t x)
-  {
-    realvec_t x0 = x;
-    x = fabs(x);
-    boolvec_t istoosmall = x < RV(1.0);
-    boolvec_t istoolarge = x >= RV(std::ldexp(R(1.0), FP::mantissa_bits));
-    // Number of mantissa bits to keep
-    intvec_t nbits = ilogb(x);
-    // This is probably faster than a shift operation
-    realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
-    intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
-    realvec_t y = as_float(as_int(x) & imask);
-    realvec_t r =
+  // Convert lower bits
+  intvec_t xlo = x & IV((U(1) << lobits) - 1);
+  // exponent for the equivalent floating point number
+  int_t exponent_lo = (FP::exponent_offset + lobits) << FP::mantissa_bits;
+  xlo |= exponent_lo;
+  // subtract hidden mantissa bit
+  realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo));
+
+  // Convert upper bits
+  // make unsigned by subtracting largest negative number
+  // (only do this for the high bits, since they have sufficient
+  // precision to handle the overflow)
+  x ^= FP::signbit_mask;
+  intvec_t xhi = lsr(x, lobits);
+  // exponent for the equivalent floating point number
+  int_t exponent_hi = (FP::exponent_offset + 2 * lobits) << FP::mantissa_bits;
+  xhi |= exponent_hi;
+  // subtract hidden mantissa bit
+  realvec_t fhi = as_float(xhi) - RV(FP::as_float(exponent_hi));
+  // add largest negative number again
+  fhi -= RV(R(FP::signbit_mask));
+  // Ensure that the converted low and high bits are calculated
+  // separately, since a real_t doesn't have enough precision to
+  // hold all the bits of an int_t
+  fhi.barrier();
+
+  // Combine results
+  return flo + fhi;
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t
+mathfuncs<realvec_t>::vml_convert_int(realvec_t x) {
+  // Handle overflow
+  // int_t min_int = FP::signbit_mask;
+  // int_t max_int = ~FP::signbit_mask;
+  // boolvec_t is_overflow = x < RV(R(min_int)) || x > RV(R(max_int));
+  // Handle negative numbers
+  boolvec_t is_negative = signbit(x);
+  x = fabs(x);
+  // Handle small numbers
+  boolvec_t issmall = x < RV(1.0);
+
+  intvec_t shift = ilogb(x) - IV(FP::mantissa_bits);
+  boolvec_t shift_left = x > RV(std::ldexp(R(1.0), FP::mantissa_bits));
+  intvec_t ix = as_int(x) & IV(FP::mantissa_mask);
+  // add hidden mantissa bit
+  ix |= U(1) << FP::mantissa_bits;
+  // shift according to exponent (which may truncate)
+  ix = ifthen(shift_left, ix << shift, ix >> -shift);
+
+  // Handle small numbers
+  ix = ifthen(issmall, IV(I(0)), ix);
+  // Handle negative numbers
+  ix = ifthen(is_negative, -ix, ix);
+  // Handle overflow
+  // ix = ifthen(is_overflow, IV(min_int), ix);
+
+  return ix;
+}
+
+// Round to nearest integer, breaking ties using prevailing rounding
+// mode (default: round to even)
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_rint(realvec_t x) {
+  realvec_t r = x;
+  // Round by adding a large number, destroying all excess precision
+  realvec_t offset = copysign(RV(std::ldexp(R(1.0), FP::mantissa_bits)), x);
+  r += offset;
+  // Ensure the rounding is not optimised away
+  r.barrier();
+  r -= offset;
+  return r;
+}
+
+// Round to next integer above
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_ceil(realvec_t x) {
+  // boolvec_t iszero = x == RV(0.0);
+  // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
+  // return ifthen(iszero, x, rint(x + offset));
+  return ifthen(x < RV(0.0), trunc(x), vml_antitrunc(x));
+}
+
+// Round to next integer below
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_floor(realvec_t x) {
+  // boolvec_t iszero = x == RV(0.0);
+  // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
+  // return ifthen(iszero, x, rint(x - offset));
+  return ifthen(x < RV(0.0), vml_antitrunc(x), trunc(x));
+}
+
+// Round to nearest integer, breaking ties using prevailing rounding
+// mode (default: round to even), returning an integer
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_lrint(realvec_t x) {
+  return convert_int(rint(x));
+}
+
+// Round to nearest integer, breaking ties away from zero
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_round(realvec_t x) {
+  // return copysign(floor(fabs(x)+RV(0.5)), x);
+  return trunc(x + copysign(RV(0.5), x));
+}
+
+// Round to next integer towards zero
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_trunc(realvec_t x) {
+  realvec_t x0 = x;
+  x = fabs(x);
+  boolvec_t istoosmall = x < RV(1.0);
+  boolvec_t istoolarge = x >= RV(std::ldexp(R(1.0), FP::mantissa_bits));
+  // Number of mantissa bits to keep
+  intvec_t nbits = ilogb(x);
+  // This is probably faster than a shift operation
+  realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
+  intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
+  realvec_t y = as_float(as_int(x) & imask);
+  realvec_t r =
       copysign(ifthen(istoosmall, RV(0.0), ifthen(istoolarge, x, y)), x0);
-    return r;
-  }
-  
-  // Round to next integer away from zero
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_antitrunc(realvec_t x)
-  {
-    realvec_t x0 = x;
-    x = fabs(x);
-    boolvec_t iszero = x == RV(0.0);
-    boolvec_t issmall = x <= RV(1.0);
-    boolvec_t istoolarge =
-      x > RV(std::ldexp(R(1.0), FP::mantissa_bits) - R(1.0));
-    // Number of mantissa bits to keep
-    intvec_t nbits = ilogb(x);
-    // This is probably faster than a shift operation
-    realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
-    intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
-    realvec_t offset = RV(1.0) - ldexp(RV(1.0), nbits - IV(FP::mantissa_bits));
-    offset.barrier();
-    realvec_t y = as_float(as_int(x + offset) & imask);
-    realvec_t r =
+  return r;
+}
+
+// Round to next integer away from zero
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_antitrunc(realvec_t x) {
+  realvec_t x0 = x;
+  x = fabs(x);
+  boolvec_t iszero = x == RV(0.0);
+  boolvec_t issmall = x <= RV(1.0);
+  boolvec_t istoolarge = x > RV(std::ldexp(R(1.0), FP::mantissa_bits) - R(1.0));
+  // Number of mantissa bits to keep
+  intvec_t nbits = ilogb(x);
+  // This is probably faster than a shift operation
+  realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
+  intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
+  realvec_t offset = RV(1.0) - ldexp(RV(1.0), nbits - IV(FP::mantissa_bits));
+  offset.barrier();
+  realvec_t y = as_float(as_int(x + offset) & imask);
+  realvec_t r =
       copysign(ifthen(iszero, RV(0.0),
-                      ifthen(issmall, RV(1.0),
-                             ifthen(istoolarge, x, y))), x0);
-    return r;
-  }
-  
-  // Next machine representable number from x in direction y
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_nextafter(realvec_t x, realvec_t y)
-  {
-    realvec_t dir = y - x;
-    realvec_t offset = ldexp(RV(FP::epsilon()), ilogb(x));
-    offset = copysign(offset, dir);
-    offset = ifthen(convert_bool(as_int(x) & IV(FP::mantissa_mask)) ||
-                    signbit(x) == signbit(offset),
-                    offset,
-                    offset * RV(0.5));
-    realvec_t r = x + offset;
-    real_t smallest_pos = std::ldexp(FP::min(), -FP::mantissa_bits);
-    return ifthen(dir==RV(0.0), y,
-                  ifthen(x==RV(0.0), copysign(RV(smallest_pos), dir), r));
-  }
-  
+                      ifthen(issmall, RV(1.0), ifthen(istoolarge, x, y))),
+               x0);
+  return r;
+}
+
+// Next machine representable number from x in direction y
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_nextafter(realvec_t x, realvec_t y) {
+  realvec_t dir = y - x;
+  realvec_t offset = ldexp(RV(FP::epsilon()), ilogb(x));
+  offset = copysign(offset, dir);
+  offset = ifthen(convert_bool(as_int(x) & IV(FP::mantissa_mask)) ||
+                      signbit(x) == signbit(offset),
+                  offset, offset * RV(0.5));
+  realvec_t r = x + offset;
+  real_t smallest_pos = std::ldexp(FP::min(), -FP::mantissa_bits);
+  return ifthen(dir == RV(0.0), y,
+                ifthen(x == RV(0.0), copysign(RV(smallest_pos), dir), r));
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_CONVERT_H
+#endif // #ifndef MATHFUNCS_CONVERT_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_exp.h b/lib/kernel/vecmathlib/mathfuncs_exp.h
index d357a21..e35fb1b 100644
--- a/lib/kernel/vecmathlib/mathfuncs_exp.h
+++ b/lib/kernel/vecmathlib/mathfuncs_exp.h
@@ -7,156 +7,145 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_exp2(realvec_t x) {
+  // TODO: Check SLEEF 2.80 algorithm
+  // (in particular the improved-precision truncation)
+
+  // Rescale
+  realvec_t x0 = x;
+
+// realvec_t round_x = rint(x);
+// intvec_t iround_x = convert_int(round_x);
+// r = ldexp(r, iround_x);
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_exp2(realvec_t x)
-  {
-    // TODO: Check SLEEF 2.80 algorithm
-    // (in particular the improved-precision truncation)
-    
-    // Rescale
-    realvec_t x0 = x;
-    
-    // realvec_t round_x = rint(x);
-    // intvec_t iround_x = convert_int(round_x);
-    // r = ldexp(r, iround_x);
-    
 #if 0
     // Straightforward implementation
     realvec_t round_x = rint(x);
     x -= round_x;
 #elif 1
-    // Round by adding, then subtracting again a large number
-    // Add a large number to move the mantissa bits to the right
-    int_t large = (U(1) << FP::mantissa_bits) + FP::exponent_offset;
-    realvec_t tmp = x + RV(R(large));
-    tmp.barrier();
-    
-    realvec_t round_x = tmp - RV(R(large));
-    x -= round_x;
+  // Round by adding, then subtracting again a large number
+  // Add a large number to move the mantissa bits to the right
+  int_t large = (U(1) << FP::mantissa_bits) + FP::exponent_offset;
+  realvec_t tmp = x + RV(R(large));
+  tmp.barrier();
+
+  realvec_t round_x = tmp - RV(R(large));
+  x -= round_x;
 #else
-    // Straightforward implementation, using round instead of rint,
-    // since round is faster for QPX
-    realvec_t round_x = round(x);
-    x -= round_x;
+  // Straightforward implementation, using round instead of rint,
+  // since round is faster for QPX
+  realvec_t round_x = round(x);
+  x -= round_x;
 #endif
-    VML_ASSERT(all(x >= RV(-0.5) && x <= RV(0.5)));
-    
-    // Polynomial expansion
-    realvec_t r;
-    switch (sizeof(real_t)) {
-    case 4:
+  VML_ASSERT(all(x >= RV(-0.5) && x <= RV(0.5)));
+
+  // Polynomial expansion
+  realvec_t r;
+  switch (sizeof(real_t)) {
+  case 4:
 #ifdef VML_HAVE_FP_CONTRACT
-      // float, error=4.55549108005200277750378992345e-9
-      r = RV(0.000154653240842602623787395880898);
-      r = mad(r, x, RV(0.00133952915439234389712105060319));
-      r = mad(r, x, RV(0.0096180399118156827664944870552));
-      r = mad(r, x, RV(0.055503406540531310853149866446));
-      r = mad(r, x, RV(0.240226511015459465468737123346));
-      r = mad(r, x, RV(0.69314720007380208630542805293));
-      r = mad(r, x, RV(0.99999999997182023878745628977));
+    // float, error=4.55549108005200277750378992345e-9
+    r = RV(0.000154653240842602623787395880898);
+    r = mad(r, x, RV(0.00133952915439234389712105060319));
+    r = mad(r, x, RV(0.0096180399118156827664944870552));
+    r = mad(r, x, RV(0.055503406540531310853149866446));
+    r = mad(r, x, RV(0.240226511015459465468737123346));
+    r = mad(r, x, RV(0.69314720007380208630542805293));
+    r = mad(r, x, RV(0.99999999997182023878745628977));
 #else
-      // float, error=1.62772721960621336664735896836e-7
-      r = RV(0.00133952915439234389712105060319);
-      r = mad(r, x, RV(0.009670773148229417605024318985));
-      r = mad(r, x, RV(0.055503406540531310853149866446));
-      r = mad(r, x, RV(0.240222115700585316818177639177));
-      r = mad(r, x, RV(0.69314720007380208630542805293));
-      r = mad(r, x, RV(1.00000005230745711373079206024));
+    // float, error=1.62772721960621336664735896836e-7
+    r = RV(0.00133952915439234389712105060319);
+    r = mad(r, x, RV(0.009670773148229417605024318985));
+    r = mad(r, x, RV(0.055503406540531310853149866446));
+    r = mad(r, x, RV(0.240222115700585316818177639177));
+    r = mad(r, x, RV(0.69314720007380208630542805293));
+    r = mad(r, x, RV(1.00000005230745711373079206024));
 #endif
-      break;
-    case 8:
+    break;
+  case 8:
 #ifdef VML_HAVE_FP_CONTRACT
-      // double, error=9.32016781355638010975628074746e-18
-      r = RV(4.45623165388261696886670014471e-10);
-      r = mad(r, x, RV(7.0733589360775271430968224806e-9));
-      r = mad(r, x, RV(1.01780540270960163558119510246e-7));
-      r = mad(r, x, RV(1.3215437348041505269462510712e-6));
-      r = mad(r, x, RV(0.000015252733849766201174247690629));
-      r = mad(r, x, RV(0.000154035304541242555115696403795));
-      r = mad(r, x, RV(0.00133335581463968601407096905671));
-      r = mad(r, x, RV(0.0096181291075949686712855561931));
-      r = mad(r, x, RV(0.055504108664821672870565883052));
-      r = mad(r, x, RV(0.240226506959101382690753994082));
-      r = mad(r, x, RV(0.69314718055994530864272481773));
-      r = mad(r, x, RV(0.9999999999999999978508676375));
+    // double, error=9.32016781355638010975628074746e-18
+    r = RV(4.45623165388261696886670014471e-10);
+    r = mad(r, x, RV(7.0733589360775271430968224806e-9));
+    r = mad(r, x, RV(1.01780540270960163558119510246e-7));
+    r = mad(r, x, RV(1.3215437348041505269462510712e-6));
+    r = mad(r, x, RV(0.000015252733849766201174247690629));
+    r = mad(r, x, RV(0.000154035304541242555115696403795));
+    r = mad(r, x, RV(0.00133335581463968601407096905671));
+    r = mad(r, x, RV(0.0096181291075949686712855561931));
+    r = mad(r, x, RV(0.055504108664821672870565883052));
+    r = mad(r, x, RV(0.240226506959101382690753994082));
+    r = mad(r, x, RV(0.69314718055994530864272481773));
+    r = mad(r, x, RV(0.9999999999999999978508676375));
 #else
-      // double, error=3.74939899823302048807873981077e-14
-      r = RV(1.02072375599725694063203809188e-7);
-      r = mad(r, x, RV(1.32573274434801314145133004073e-6));
-      r = mad(r, x, RV(0.0000152526647170731944840736190013));
-      r = mad(r, x, RV(0.000154034441925859828261898614555));
-      r = mad(r, x, RV(0.00133335582175770747495287552557));
-      r = mad(r, x, RV(0.0096181291794939392517233403183));
-      r = mad(r, x, RV(0.055504108664525029438908798685));
-      r = mad(r, x, RV(0.240226506957026959772247598695));
-      r = mad(r, x, RV(0.6931471805599487321347668143));
-      r = mad(r, x, RV(1.00000000000000942892870993489));
+    // double, error=3.74939899823302048807873981077e-14
+    r = RV(1.02072375599725694063203809188e-7);
+    r = mad(r, x, RV(1.32573274434801314145133004073e-6));
+    r = mad(r, x, RV(0.0000152526647170731944840736190013));
+    r = mad(r, x, RV(0.000154034441925859828261898614555));
+    r = mad(r, x, RV(0.00133335582175770747495287552557));
+    r = mad(r, x, RV(0.0096181291794939392517233403183));
+    r = mad(r, x, RV(0.055504108664525029438908798685));
+    r = mad(r, x, RV(0.240226506957026959772247598695));
+    r = mad(r, x, RV(0.6931471805599487321347668143));
+    r = mad(r, x, RV(1.00000000000000942892870993489));
 #endif
-      break;
-    default:
-      __builtin_unreachable();
-    }
-    
-    // Undo rescaling
+    break;
+  default:
+    __builtin_unreachable();
+  }
+
+// Undo rescaling
 #if 0
     // Straightforward implementation
     r = ldexp(r, convert_int(round_x));
 #elif 1
-    // Use direct integer manipulation
-    // Extract integer as lowest mantissa bits (highest bits still
-    // contain offset, exponent, and sign)
-    intvec_t itmp = as_int(tmp);
-    // Construct scale factor by setting exponent (this shifts out the
-    // highest bits)
-    realvec_t scale = as_float(itmp << I(FP::mantissa_bits));
-    r *= scale;
+  // Use direct integer manipulation
+  // Extract integer as lowest mantissa bits (highest bits still
+  // contain offset, exponent, and sign)
+  intvec_t itmp = as_int(tmp);
+  // Construct scale factor by setting exponent (this shifts out the
+  // highest bits)
+  realvec_t scale = as_float(itmp << I(FP::mantissa_bits));
+  r *= scale;
 #else
-    // Use floating point operations instead of integer operations,
-    // since these are faster for QPX
-    real_t exponent_factor = R(I(1) << I(FP::mantissa_bits));
-    real_t exponent_offset = R(I(FP::exponent_offset) << I(FP::mantissa_bits));
-    realvec_t exponent = mad(round_x, RV(exponent_factor), RV(exponent_offset));
-    realvec_t scale = as_float(convert_int(exponent));
-    r *= scale;
+  // Use floating point operations instead of integer operations,
+  // since these are faster for QPX
+  real_t exponent_factor = R(I(1) << I(FP::mantissa_bits));
+  real_t exponent_offset = R(I(FP::exponent_offset) << I(FP::mantissa_bits));
+  realvec_t exponent = mad(round_x, RV(exponent_factor), RV(exponent_offset));
+  realvec_t scale = as_float(convert_int(exponent));
+  r *= scale;
 #endif
-    
-    r = ifthen(x0 < RV(R(FP::min_exponent)), RV(0.0), r);
-    
-    return r;
-  }
-  
-  
-  
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_exp(realvec_t x)
-  {
-    return exp2(RV(M_LOG2E) * x);
-  }
 
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_exp10(realvec_t x)
-  {
-    return exp2(RV(M_LOG2E * M_LN10) * x);
-  }
+  r = ifthen(x0 < RV(R(FP::min_exponent)), RV(0.0), r);
+
+  return r;
+}
 
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_expm1(realvec_t x)
-  {
-    // TODO: improve this
-    return exp(x) - RV(1.0);
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_exp(realvec_t x) {
+  return exp2(RV(M_LOG2E) * x);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_exp10(realvec_t x) {
+  return exp2(RV(M_LOG2E * M_LN10) * x);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_expm1(realvec_t x) {
+  // TODO: improve this
+  return exp(x) - RV(1.0);
 #if 0
     r = exp(x) - RV(1.0);
     return ifthen(r == RV(0.0), x, r);
 #endif
-  }
-  
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_EXP_H
+#endif // #ifndef MATHFUNCS_EXP_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_fabs.h b/lib/kernel/vecmathlib/mathfuncs_fabs.h
index 4f31dec..c3f7356 100644
--- a/lib/kernel/vecmathlib/mathfuncs_fabs.h
+++ b/lib/kernel/vecmathlib/mathfuncs_fabs.h
@@ -7,201 +7,176 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y) {
+  intvec_t value = as_int(x) & IV(U(~FP::signbit_mask));
+  intvec_t sign = as_int(y) & IV(FP::signbit_mask);
+  return as_float(sign | value);
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y)
-  {
-    intvec_t value = as_int(x) & IV(U(~FP::signbit_mask));
-    intvec_t sign = as_int(y) & IV(FP::signbit_mask);
-    return as_float(sign | value);
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x)
-  {
-    return as_float(as_int(x) & IV(U(~FP::signbit_mask)));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fdim(realvec_t x, realvec_t y)
-  {
-    // return ifthen(x > y, x - y, RV(0.0));
-    return fmax(x - y, RV(0.0));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fma(realvec_t x, realvec_t y, realvec_t z)
-  {
-    return x * y + z;
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fmax(realvec_t x, realvec_t y)
-  {
-    return ifthen(x < y, y, x);
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fmin(realvec_t x, realvec_t y)
-  {
-    return ifthen(y < x, y, x);
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_frexp(realvec_t x,
-                                            typename realvec_t::intvec_t* irp)
-  {
-    intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
-    intvec_t ir = e - IV(FP::exponent_offset - 1);
-    ir = ifthen(convert_bool(e), ir, IV(std::numeric_limits<int_t>::min()));
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x) {
+  return as_float(as_int(x) & IV(U(~FP::signbit_mask)));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fdim(realvec_t x, realvec_t y) {
+  // return ifthen(x > y, x - y, RV(0.0));
+  return fmax(x - y, RV(0.0));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fma(realvec_t x, realvec_t y, realvec_t z) {
+  return x * y + z;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fmax(realvec_t x, realvec_t y) {
+  return ifthen(x < y, y, x);
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fmin(realvec_t x, realvec_t y) {
+  return ifthen(y < x, y, x);
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_frexp(realvec_t x,
+                                          typename realvec_t::intvec_t *irp) {
+  intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
+  intvec_t ir = e - IV(FP::exponent_offset - 1);
+  ir = ifthen(convert_bool(e), ir, IV(std::numeric_limits<int_t>::min()));
 #if defined VML_HAVE_INF
-    ir = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), ir);
+  ir = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), ir);
 #endif
 #if defined VML_HAVE_NAN
-    ir = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), ir);
+  ir = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), ir);
 #endif
-    realvec_t r =
+  realvec_t r =
       as_float((as_int(x) & IV(FP::signbit_mask | FP::mantissa_mask)) |
                IV(FP::as_int(R(0.5)) & FP::exponent_mask));
-    boolvec_t iszero = x == RV(0.0);
-    ir = ifthen(iszero, IV(I(0)), ir);
-    r = ifthen(iszero, copysign(RV(R(0.0)), r), r);
-    *irp = ir;
-    return r;
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_ilogb(realvec_t x)
-  {
-    // TODO: Check SLEEF 2.80 algorithm
-    intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
-    intvec_t r = e - IV(FP::exponent_offset);
-    r = ifthen(convert_bool(e), r, IV(std::numeric_limits<int_t>::min()));
+  boolvec_t iszero = x == RV(0.0);
+  ir = ifthen(iszero, IV(I(0)), ir);
+  r = ifthen(iszero, copysign(RV(R(0.0)), r), r);
+  *irp = ir;
+  return r;
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_ilogb(realvec_t x) {
+  // TODO: Check SLEEF 2.80 algorithm
+  intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
+  intvec_t r = e - IV(FP::exponent_offset);
+  r = ifthen(convert_bool(e), r, IV(std::numeric_limits<int_t>::min()));
 #if defined VML_HAVE_INF
-    r = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), r);
+  r = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), r);
 #endif
 #if defined VML_HAVE_NAN
-    r = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), r);
+  r = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), r);
 #endif
-    return r;
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_ieee_isfinite(realvec_t x)
-  {
-    return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_ieee_isinf(realvec_t x)
-  {
-    return (as_int(x) & IV(I(~FP::signbit_mask))) == IV(FP::exponent_mask);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_ieee_isnan(realvec_t x)
-  {
-    return
-      (as_int(x) & IV(FP::exponent_mask)) == IV(FP::exponent_mask) &&
-      (as_int(x) & IV(FP::mantissa_mask)) != IV(I(0));
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_ieee_isnormal(realvec_t x)
-  {
-    return
-      (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask) &&
-      (as_int(x) & IV(FP::exponent_mask)) != IV(I(0));
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_isfinite(realvec_t x)
-  {
+  return r;
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isfinite(realvec_t x) {
+  return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask);
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isinf(realvec_t x) {
+  return (as_int(x) & IV(I(~FP::signbit_mask))) == IV(FP::exponent_mask);
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isnan(realvec_t x) {
+  return (as_int(x) & IV(FP::exponent_mask)) == IV(FP::exponent_mask) &&
+         (as_int(x) & IV(FP::mantissa_mask)) != IV(I(0));
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isnormal(realvec_t x) {
+  return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask) &&
+         (as_int(x) & IV(FP::exponent_mask)) != IV(I(0));
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isfinite(realvec_t x) {
 #if defined VML_HAVE_INF || defined VML_HAVE_NAN
-    return vml_ieee_isfinite(x);
+  return vml_ieee_isfinite(x);
 #else
-    return BV(true);
+  return BV(true);
 #endif
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isinf(realvec_t x)
-  {
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isinf(realvec_t x) {
 #if defined VML_HAVE_INF
-    return vml_ieee_isinf(x);
+  return vml_ieee_isinf(x);
 #else
-    return BV(false);
+  return BV(false);
 #endif
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnan(realvec_t x)
-  {
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnan(realvec_t x) {
 #if defined VML_HAVE_NAN
-    return vml_ieee_isnan(x);
+  return vml_ieee_isnan(x);
 #else
-    return BV(false);
+  return BV(false);
 #endif
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnormal(realvec_t x)
-  {
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnormal(realvec_t x) {
 #if defined VML_HAVE_DENORMALS || defined VML_HAVE_INF || defined VML_HAVE_NAN
-    return vml_ieee_isnormal(x);
+  return vml_ieee_isnormal(x);
 #else
-    return BV(true);
+  return BV(true);
 #endif
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_ldexp(realvec_t x, intvec_t n)
-  {
-    // TODO: Check SLEEF 2.80 algorithm
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_ldexp(realvec_t x, intvec_t n) {
+// TODO: Check SLEEF 2.80 algorithm
 #if 0
     realvec_t r = as_float(as_int(x) + (n << I(FP::mantissa_bits)));
     r = ifthen((as_int(x) & IV(FP::exponent_mask)) == IV(I(0)), x, r);
     return r;
 #endif
-    realvec_t r = as_float(as_int(x) + (n << U(FP::mantissa_bits)));
-    int max_n = FP::max_exponent - FP::min_exponent;
-    boolvec_t underflow = n < IV(I(-max_n));
-    boolvec_t overflow = n > IV(I(max_n));
-    intvec_t old_exp =
-      lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
-    intvec_t new_exp = old_exp + n;
-    // TODO: check bit patterns instead
-    underflow =
+  realvec_t r = as_float(as_int(x) + (n << U(FP::mantissa_bits)));
+  int max_n = FP::max_exponent - FP::min_exponent;
+  boolvec_t underflow = n < IV(I(-max_n));
+  boolvec_t overflow = n > IV(I(max_n));
+  intvec_t old_exp = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
+  intvec_t new_exp = old_exp + n;
+  // TODO: check bit patterns instead
+  underflow =
       underflow || new_exp < IV(I(FP::min_exponent + FP::exponent_offset));
-    overflow =
+  overflow =
       overflow || new_exp > IV(I(FP::max_exponent + FP::exponent_offset));
-    r = ifthen(underflow, copysign(RV(R(0.0)), x), r);
-    r = ifthen(overflow, copysign(RV(FP::infinity()), x), r);
-    boolvec_t dont_change = x == RV(R(0.0)) || isinf(x) || isnan(x);
-    r = ifthen(dont_change, x, r);
-    return r;
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_mad(realvec_t x, realvec_t y, realvec_t z)
-  {
-    return x * y + z;
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_signbit(realvec_t x)
-  {
-    return convert_bool(as_int(x) & IV(FP::signbit_mask));
-  }
-  
+  r = ifthen(underflow, copysign(RV(R(0.0)), x), r);
+  r = ifthen(overflow, copysign(RV(FP::infinity()), x), r);
+  boolvec_t dont_change = x == RV(R(0.0)) || isinf(x) || isnan(x);
+  r = ifthen(dont_change, x, r);
+  return r;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_mad(realvec_t x, realvec_t y, realvec_t z) {
+  return x * y + z;
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_signbit(realvec_t x) {
+  return convert_bool(as_int(x) & IV(FP::signbit_mask));
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_FABS_H
+#endif // #ifndef MATHFUNCS_FABS_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_int.h b/lib/kernel/vecmathlib/mathfuncs_int.h
index 862189d..fff65ff 100644
--- a/lib/kernel/vecmathlib/mathfuncs_int.h
+++ b/lib/kernel/vecmathlib/mathfuncs_int.h
@@ -7,129 +7,128 @@
 
 #include <climits>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_abs(intvec_t x) {
+  return ifthen(isignbit(x), -x, x);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t
+mathfuncs<realvec_t>::vml_bitifthen(intvec_t x, intvec_t y, intvec_t z) {
+  return (x & y) | (~x & z);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_clz(intvec_t x) {
+  // These implementations return 8*sizeof(TYPE) when the input is 0
+
+  // These explicit implementations are taken from
+  // <http://aggregate.org/MAGIC/>:
+  //
+  // @techreport{magicalgorithms,
+  //   author={Henry Gordon Dietz},
+  //   title={{The Aggregate Magic Algorithms}},
+  //   institution={University of Kentucky},
+  //   howpublished={Aggregate.Org online technical report},
+  //   date={2013-03-25},
+  //   URL={http://aggregate.org/MAGIC/}
+  // }
+
+  int_t bits = CHAR_BIT * sizeof(int_t);
+  if (bits > 1)
+    x |= lsr(x, 1);
+  if (bits > 2)
+    x |= lsr(x, 2);
+  if (bits > 4)
+    x |= lsr(x, 4);
+  if (bits > 8)
+    x |= lsr(x, 8);
+  if (bits > 16)
+    x |= lsr(x, 16);
+  if (bits > 32)
+    x |= lsr(x, 32);
+  if (bits > 64)
+    x |= lsr(x, 64);
+  assert(bits <= 128);
+  return IV(I(bits)) - popcount(x);
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isignbit(intvec_t x) {
+  return x < IV(I(0));
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_max(intvec_t x,
+                                                           intvec_t y) {
+  return ifthen(x >= y, x, y);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_min(intvec_t x,
+                                                           intvec_t y) {
+  return ifthen(x < y, x, y);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_popcount(intvec_t x) {
+  // These explicit implementations are taken from
+  // <http://aggregate.org/MAGIC/>:
+  //
+  // @techreport{magicalgorithms,
+  //   author={Henry Gordon Dietz},
+  //   title={{The Aggregate Magic Algorithms}},
+  //   institution={University of Kentucky},
+  //   howpublished={Aggregate.Org online technical report},
+  //   date={2013-03-25},
+  //   URL={http://aggregate.org/MAGIC/}
+  // }
+
+  int_t bits = CHAR_BIT * sizeof(int_t);
+
+  // intvec_t x55 = IV(FP::replicate_byte(0x55));
+  // intvec_t x33 = IV(FP::replicate_byte(0x33));
+  // intvec_t x0f = IV(FP::replicate_byte(0x0f));
+  intvec_t x55 = I(~U(0) / U(3));  // 0b01010101... (0x5555...)
+  intvec_t x33 = I(~U(0) / U(5));  // 0b00110011... (0x3333...)
+  intvec_t x0f = I(~U(0) / U(17)); // 0b0000111100001111...
+
+  x -= lsr(x, I(1)) & x55;
+  x = (x & x33) + (lsr(x, I(2)) & x33);
+  x += lsr(x, I(4));
+  x &= x0f;
+  if (bits > 8)
+    x += lsr(x, I(8));
+  if (bits > 16)
+    x += lsr(x, I(16));
+  if (bits > 32)
+    x += lsr(x, I(32));
+  if (bits > 64)
+    x += lsr(x, I(64));
+  assert(bits <= 128);
+  return x & IV(I(0xff));
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
+                                                              int_t n) {
+  int_t mask = CHAR_BIT * sizeof(int_t) - 1;
+  intvec_t left = x << (n & mask);
+  intvec_t right = lsr(x, -n & mask);
+  return left | right;
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
+                                                              intvec_t n) {
+  intvec_t mask = IV(I(CHAR_BIT * sizeof(int_t) - 1));
+  intvec_t left = x << (n & mask);
+  intvec_t right = lsr(x, -n & mask);
+  return left | right;
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_abs(intvec_t x)
-  {
-    return ifthen(isignbit(x), -x, x);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_bitifthen(intvec_t x,
-                                                                   intvec_t y,
-                                                                   intvec_t z)
-  {
-    return (x & y) | (~x & z);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_clz(intvec_t x)
-  {
-    // These implementations return 8*sizeof(TYPE) when the input is 0
-    
-    // These explicit implementations are taken from
-    // <http://aggregate.org/MAGIC/>:
-    // 
-    // @techreport{magicalgorithms,
-    //   author={Henry Gordon Dietz},
-    //   title={{The Aggregate Magic Algorithms}},
-    //   institution={University of Kentucky},
-    //   howpublished={Aggregate.Org online technical report},
-    //   date={2013-03-25},
-    //   URL={http://aggregate.org/MAGIC/}
-    // }
-    
-    int_t bits = CHAR_BIT * sizeof(int_t);
-    if (bits >  1) x |= lsr(x,  1);
-    if (bits >  2) x |= lsr(x,  2);
-    if (bits >  4) x |= lsr(x,  4);
-    if (bits >  8) x |= lsr(x,  8);
-    if (bits > 16) x |= lsr(x, 16);
-    if (bits > 32) x |= lsr(x, 32);
-    if (bits > 64) x |= lsr(x, 64);
-    assert(bits<=128);
-    return IV(I(bits)) - popcount(x);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isignbit(intvec_t x)
-  {
-    return x < IV(I(0));
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_max(intvec_t x,
-                                                             intvec_t y)
-  {
-    return ifthen(x>=y, x, y);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_min(intvec_t x,
-                                                             intvec_t y)
-  {
-    return ifthen(x<y, x, y);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_popcount(intvec_t x)
-  {
-    // These explicit implementations are taken from
-    // <http://aggregate.org/MAGIC/>:
-    // 
-    // @techreport{magicalgorithms,
-    //   author={Henry Gordon Dietz},
-    //   title={{The Aggregate Magic Algorithms}},
-    //   institution={University of Kentucky},
-    //   howpublished={Aggregate.Org online technical report},
-    //   date={2013-03-25},
-    //   URL={http://aggregate.org/MAGIC/}
-    // }
-    
-    int_t bits = CHAR_BIT * sizeof(int_t);
-    
-    // intvec_t x55 = IV(FP::replicate_byte(0x55));
-    // intvec_t x33 = IV(FP::replicate_byte(0x33));
-    // intvec_t x0f = IV(FP::replicate_byte(0x0f));
-    intvec_t x55 = I(~U(0) /  U(3)); // 0x0101...
-    intvec_t x33 = I(~U(0) /  U(5)); // 0x00110011...
-    intvec_t x0f = I(~U(0) / U(17)); // 0b0000111100001111...
-    
-    x -= lsr(x, I(1)) & x55;
-    x = (x & x33) + (lsr(x, I(2)) & x33);
-    x += lsr(x, I(4));
-    x &= x0f;
-    if (bits >  8) x += lsr(x,  I(8));
-    if (bits > 16) x += lsr(x, I(16));
-    if (bits > 32) x += lsr(x, I(32));
-    if (bits > 64) x += lsr(x, I(64));
-    assert(bits<=128);
-    return x & IV(I(0xff));
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
-                                                                int_t n)
-  {
-    int_t mask = CHAR_BIT * sizeof(int_t) - 1;
-    intvec_t left = x << (n & mask);
-    intvec_t right = lsr(x, -n & mask);
-    return left | right;
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
-                                                                intvec_t n)
-  {
-    intvec_t mask = IV(I(CHAR_BIT * sizeof(int_t) - 1));
-    intvec_t left = x << (n & mask);
-    intvec_t right = lsr(x, -n & mask);
-    return left | right;
-  }
-  
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_ASIN_H
+#endif // #ifndef MATHFUNCS_ASIN_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_log.h b/lib/kernel/vecmathlib/mathfuncs_log.h
index cd71eb3..fa517ba 100644
--- a/lib/kernel/vecmathlib/mathfuncs_log.h
+++ b/lib/kernel/vecmathlib/mathfuncs_log.h
@@ -7,93 +7,82 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x) {
+  // Algorithm inspired by SLEEF 2.80
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x)
-  {
-    // Algorithm inspired by SLEEF 2.80
-    
-    // Rescale
-    intvec_t ilogb_x = ilogb(x * RV(M_SQRT2));
-    x = ldexp(x, -ilogb_x);
-    VML_ASSERT(all(x >= RV(M_SQRT1_2) && x <= RV(M_SQRT2)));
-    
-    realvec_t y = (x - RV(1.0)) / (x + RV(1.0));
-    realvec_t y2 = y*y;
-    
-    realvec_t r;
-    switch (sizeof(real_t)) {
-    case 4:
-      // float, error=7.09807175879142775648452461821e-8
-      r = RV(0.59723611417135718739797302426);
-      r = mad(r, y2, RV(0.961524413175528426101613434));
-      r = mad(r, y2, RV(2.88539097665498228703236701));
-      break;
-    case 8:
+  // Rescale
+  intvec_t ilogb_x = ilogb(x * RV(M_SQRT2));
+  x = ldexp(x, -ilogb_x);
+  VML_ASSERT(all(x >= RV(M_SQRT1_2) && x <= RV(M_SQRT2)));
+
+  realvec_t y = (x - RV(1.0)) / (x + RV(1.0));
+  realvec_t y2 = y * y;
+
+  realvec_t r;
+  switch (sizeof(real_t)) {
+  case 4:
+    // float, error=7.09807175879142775648452461821e-8
+    r = RV(0.59723611417135718739797302426);
+    r = mad(r, y2, RV(0.961524413175528426101613434));
+    r = mad(r, y2, RV(2.88539097665498228703236701));
+    break;
+  case 8:
 #ifdef VML_HAVE_FP_CONTRACT
-      // double, error=1.48294180185938512675770096324e-16
-      r = RV(0.243683403415639178527756320773);
-      r = mad(r, y2, RV(0.26136626803870009948502658));
-      r = mad(r, y2, RV(0.320619429891299265439389));
-      r = mad(r, y2, RV(0.4121983452028499242926));
-      r = mad(r, y2, RV(0.577078017761894161436));
-      r = mad(r, y2, RV(0.96179669392233355927));
-      r = mad(r, y2, RV(2.8853900817779295236));
+    // double, error=1.48294180185938512675770096324e-16
+    r = RV(0.243683403415639178527756320773);
+    r = mad(r, y2, RV(0.26136626803870009948502658));
+    r = mad(r, y2, RV(0.320619429891299265439389));
+    r = mad(r, y2, RV(0.4121983452028499242926));
+    r = mad(r, y2, RV(0.577078017761894161436));
+    r = mad(r, y2, RV(0.96179669392233355927));
+    r = mad(r, y2, RV(2.8853900817779295236));
 #else
-      // double, error=2.1410114030383689267772704676e-14
-      r = RV(0.283751646449323373643963474845);
-      r = mad(r, y2, RV(0.31983138095551191299118812));
-      r = mad(r, y2, RV(0.412211603844146279666022));
-      r = mad(r, y2, RV(0.5770779098948940070516));
-      r = mad(r, y2, RV(0.961796694295973716912));
-      r = mad(r, y2, RV(2.885390081777562819196));
+    // double, error=2.1410114030383689267772704676e-14
+    r = RV(0.283751646449323373643963474845);
+    r = mad(r, y2, RV(0.31983138095551191299118812));
+    r = mad(r, y2, RV(0.412211603844146279666022));
+    r = mad(r, y2, RV(0.5770779098948940070516));
+    r = mad(r, y2, RV(0.961796694295973716912));
+    r = mad(r, y2, RV(2.885390081777562819196));
 #endif
-      break;
-    default:
-      __builtin_unreachable();
-    }
-    r *= y;
-    
-    // Undo rescaling
-    r += convert_float(ilogb_x);
-    
-    return r;
-  }
-  
-  
-  
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x)
-  {
-    return log2(x) * RV(M_LN2);
+    break;
+  default:
+    __builtin_unreachable();
   }
+  r *= y;
 
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x)
-  {
-    return log(x) * RV(M_LOG10E);
-  }
+  // Undo rescaling
+  r += convert_float(ilogb_x);
 
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x)
-  {
-    // TODO: Check SLEEF 2.80 algorithm
-    
-    return log(RV(1.0) + x);
+  return r;
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x) {
+  return log2(x) * RV(M_LN2);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x) {
+  return log(x) * RV(M_LOG10E);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x) {
+  // TODO: Check SLEEF 2.80 algorithm
+
+  return log(RV(1.0) + x);
 #if 0
     // Goldberg, theorem 4
     realvec_t x1 = RV(1.0) + x;
     x1.barrier();
     return ifthen(x1 == x, x, x * log(x1) / (x1 - RV(1.0)));
 #endif
-  }
-  
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_LOG_H
+#endif // #ifndef MATHFUNCS_LOG_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_pow.h b/lib/kernel/vecmathlib/mathfuncs_pow.h
index b863570..70bcc80 100644
--- a/lib/kernel/vecmathlib/mathfuncs_pow.h
+++ b/lib/kernel/vecmathlib/mathfuncs_pow.h
@@ -7,30 +7,27 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_pow(realvec_t x, realvec_t y) {
+  // Handle zero
+  boolvec_t is_zero = x == RV(0.0);
+  x = ifthen(is_zero, RV(1.0), x);
+
+  realvec_t r = exp(log(fabs(x)) * y);
+
+  // The result is negative if x<0 and if y is integer and odd
+  realvec_t mod_y = fabs(y) - RV(2.0) * floor(RV(0.5) * fabs(y));
+  realvec_t sign = copysign(mod_y, x) + RV(0.5);
+  r = copysign(r, sign);
+
+  // Handle zero
+  r = ifthen(is_zero, RV(0.0), r);
+
+  return r;
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_pow(realvec_t x, realvec_t y)
-  {
-    // Handle zero
-    boolvec_t is_zero = x == RV(0.0);
-    x = ifthen(is_zero, RV(1.0), x);
-    
-    realvec_t r = exp(log(fabs(x)) * y);
-    
-    // The result is negative if x<0 and if y is integer and odd
-    realvec_t mod_y = fabs(y) - RV(2.0) * floor(RV(0.5) * fabs(y));
-    realvec_t sign = copysign(mod_y, x) + RV(0.5);
-    r = copysign(r, sign);
-    
-    // Handle zero
-    r = ifthen(is_zero, RV(0.0), r);
-    
-    return r;
-  }
-  
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_POW_H
+#endif // #ifndef MATHFUNCS_POW_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_rcp.h b/lib/kernel/vecmathlib/mathfuncs_rcp.h
index 6e12b27..f703454 100644
--- a/lib/kernel/vecmathlib/mathfuncs_rcp.h
+++ b/lib/kernel/vecmathlib/mathfuncs_rcp.h
@@ -7,10 +7,8 @@
 
 #include <cmath>
 
-
-
 namespace vecmathlib {
-  
+
 #if 0
   // This routine works, but may be slower than the one below
   template<typename realvec_t>
@@ -50,66 +48,61 @@ namespace vecmathlib {
     return r;
   }
 #endif
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x)
-  {
-    // Handle negative values
-    realvec_t x0 = x;
-    x = fabs(x);
-    
-    // <https://en.wikipedia.org/wiki/Division_algorithm> [2013-06-28]
-    
-    // Initial guess
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x) {
+  // Handle negative values
+  realvec_t x0 = x;
+  x = fabs(x);
+
+  // <https://en.wikipedia.org/wiki/Division_algorithm> [2013-06-28]
+
+  // Initial guess
+  VML_ASSERT(all(x > RV(0.0)));
+  intvec_t x_exp;
+  x = frexp(x, &x_exp);
+  VML_ASSERT(all(x >= RV(0.5) && x < RV(1.0)));
+  realvec_t r = RV(R(48.0) / R(17.0)) - RV(R(32.0) / R(17.0)) * x;
+
+  // Iterate
+  int const nmax = sizeof(real_t) == 4 ? 3 : 4;
+  for (int n = 0; n < nmax; ++n) {
+    // Step
     VML_ASSERT(all(x > RV(0.0)));
-    intvec_t x_exp;
-    x = frexp(x, &x_exp);
-    VML_ASSERT(all(x >= RV(0.5) && x < RV(1.0)));
-    realvec_t r = RV(R(48.0)/R(17.0)) - RV(R(32.0)/R(17.0)) * x;
-    
-    // Iterate
-    int const nmax = sizeof(real_t)==4 ? 3 : 4;
-    for (int n=0; n<nmax; ++n) {
-      // Step
-      VML_ASSERT(all(x > RV(0.0)));
-      // Newton method:
-      // Solve   f(r) = 0   for   f(r) = x - 1/r
-      //    r <- r - f(r) / f'(r)
-      //    r <- 2 r - r^2 x
-      //    r <- r + r (1 - r x)
-      
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors
-      r += r * (RV(1.0) - x*r);
-      
-      // NEON: r = r * (RV(2.0) - x*r);
-    }
-    r = ldexp(r, -x_exp);
-    
-    // Handle negative values
-    r = copysign(r, x0);
-    
-    return r;
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_remainder(realvec_t x, realvec_t y)
-  {
-    return x - rint(x / y) * y;
-    // realvec_t r = x / y;
-    // return y * (r - rint(r));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fmod(realvec_t x, realvec_t y)
-  {
-    return x - y * trunc(x / y);
-    // realvec_t r = x / y;
-    // return y * (r - trunc(r));
+    // Newton method:
+    // Solve   f(r) = 0   for   f(r) = x - 1/r
+    //    r <- r - f(r) / f'(r)
+    //    r <- 2 r - r^2 x
+    //    r <- r + r (1 - r x)
+
+    // Note: don't rewrite this expression, this may introduce
+    // cancellation errors
+    r += r * (RV(1.0) - x * r);
+
+    // NEON: r = r * (RV(2.0) - x*r);
   }
-  
+  r = ldexp(r, -x_exp);
+
+  // Handle negative values
+  r = copysign(r, x0);
+
+  return r;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_remainder(realvec_t x, realvec_t y) {
+  return x - rint(x / y) * y;
+  // realvec_t r = x / y;
+  // return y * (r - rint(r));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fmod(realvec_t x, realvec_t y) {
+  return x - y * trunc(x / y);
+  // realvec_t r = x / y;
+  // return y * (r - trunc(r));
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_RCP_H
+#endif // #ifndef MATHFUNCS_RCP_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_sin.h b/lib/kernel/vecmathlib/mathfuncs_sin.h
index 8e2afd9..72ffb6f 100644
--- a/lib/kernel/vecmathlib/mathfuncs_sin.h
+++ b/lib/kernel/vecmathlib/mathfuncs_sin.h
@@ -7,230 +7,227 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_sin(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80
+
+  real_t PI4_A, PI4_B, PI4_C, PI4_D;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    PI4_A = 0.78515625f;
+    PI4_B = 0.00024187564849853515625f;
+    PI4_C = 3.7747668102383613586e-08f;
+    PI4_D = 1.2816720341285448015e-12f;
+    break;
+  case sizeof(double):
+    PI4_A = 0.78539816290140151978;
+    PI4_B = 4.9604678871439933374e-10;
+    PI4_C = 1.1258708853173288931e-18;
+    PI4_D = 1.7607799325916000908e-27;
+    break;
+  }
+
+  realvec_t q = rint(d * RV(M_1_PI));
+  intvec_t iq = convert_int(q);
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_sin(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    real_t PI4_A, PI4_B, PI4_C, PI4_D;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      PI4_A = 0.78515625f;
-      PI4_B = 0.00024187564849853515625f;
-      PI4_C = 3.7747668102383613586e-08f;
-      PI4_D = 1.2816720341285448015e-12f;
-      break;
-    case sizeof(double):
-      PI4_A = 0.78539816290140151978;
-      PI4_B = 4.9604678871439933374e-10;
-      PI4_C = 1.1258708853173288931e-18;
-      PI4_D = 1.7607799325916000908e-27;
-      break;
-    }
-    
-    realvec_t q = rint(d * RV(M_1_PI));
-    intvec_t iq = convert_int(q);
-    
 #ifdef VML_HAVE_FP_CONTRACT
-    d = mad(q, RV(-PI4_A*4), d);
-    d = mad(q, RV(-PI4_B*4), d);
-    d = mad(q, RV(-PI4_C*4), d);
-    d = mad(q, RV(-PI4_D*4), d);
+  d = mad(q, RV(-PI4_A * 4), d);
+  d = mad(q, RV(-PI4_B * 4), d);
+  d = mad(q, RV(-PI4_C * 4), d);
+  d = mad(q, RV(-PI4_D * 4), d);
 #else
-    d = mad(q, RV(-M_PI), d);
+  d = mad(q, RV(-M_PI), d);
 #endif
-    
-    realvec_t s = d * d;
-    
-    d = ifthen(convert_bool(iq & IV(I(1))), -d, d);
-    
-    realvec_t u;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      u = RV(2.6083159809786593541503e-06f);
-      u = mad(u, s, RV(-0.0001981069071916863322258f));
-      u = mad(u, s, RV(0.00833307858556509017944336f));
-      u = mad(u, s, RV(-0.166666597127914428710938f));
-      break;
-    case sizeof(double):
-      u = RV(-7.97255955009037868891952e-18);
-      u = mad(u, s, RV(2.81009972710863200091251e-15));
-      u = mad(u, s, RV(-7.64712219118158833288484e-13));
-      u = mad(u, s, RV(1.60590430605664501629054e-10));
-      u = mad(u, s, RV(-2.50521083763502045810755e-08));
-      u = mad(u, s, RV(2.75573192239198747630416e-06));
-      u = mad(u, s, RV(-0.000198412698412696162806809));
-      u = mad(u, s, RV(0.00833333333333332974823815));
-      u = mad(u, s, RV(-0.166666666666666657414808));
-      break;
-    }
-    
-    u = mad(s, u * d, d);
-    
-    const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
-    u = ifthen(isinf(d), RV(nan), u);
-    
-    return u;
+
+  realvec_t s = d * d;
+
+  d = ifthen(convert_bool(iq & IV(I(1))), -d, d);
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(2.6083159809786593541503e-06f);
+    u = mad(u, s, RV(-0.0001981069071916863322258f));
+    u = mad(u, s, RV(0.00833307858556509017944336f));
+    u = mad(u, s, RV(-0.166666597127914428710938f));
+    break;
+  case sizeof(double):
+    u = RV(-7.97255955009037868891952e-18);
+    u = mad(u, s, RV(2.81009972710863200091251e-15));
+    u = mad(u, s, RV(-7.64712219118158833288484e-13));
+    u = mad(u, s, RV(1.60590430605664501629054e-10));
+    u = mad(u, s, RV(-2.50521083763502045810755e-08));
+    u = mad(u, s, RV(2.75573192239198747630416e-06));
+    u = mad(u, s, RV(-0.000198412698412696162806809));
+    u = mad(u, s, RV(0.00833333333333332974823815));
+    u = mad(u, s, RV(-0.166666666666666657414808));
+    break;
+  }
+
+  u = mad(s, u * d, d);
+
+  const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+  u = ifthen(isinf(d), RV(nan), u);
+
+  return u;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_cos(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80
+
+  real_t PI4_A, PI4_B, PI4_C, PI4_D;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    PI4_A = 0.78515625f;
+    PI4_B = 0.00024187564849853515625f;
+    PI4_C = 3.7747668102383613586e-08f;
+    PI4_D = 1.2816720341285448015e-12f;
+    break;
+  case sizeof(double):
+    PI4_A = 0.78539816290140151978;
+    PI4_B = 4.9604678871439933374e-10;
+    PI4_C = 1.1258708853173288931e-18;
+    PI4_D = 1.7607799325916000908e-27;
+    break;
   }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_cos(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    real_t PI4_A, PI4_B, PI4_C, PI4_D;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      PI4_A = 0.78515625f;
-      PI4_B = 0.00024187564849853515625f;
-      PI4_C = 3.7747668102383613586e-08f;
-      PI4_D = 1.2816720341285448015e-12f;
-      break;
-    case sizeof(double):
-      PI4_A = 0.78539816290140151978;
-      PI4_B = 4.9604678871439933374e-10;
-      PI4_C = 1.1258708853173288931e-18;
-      PI4_D = 1.7607799325916000908e-27;
-      break;
-    }
-    
-    realvec_t q = mad(RV(2.0), rint(mad(d, RV(M_1_PI), RV(-0.5))), RV(1.0));
-    intvec_t iq = convert_int(q);
-    
+
+  realvec_t q = mad(RV(2.0), rint(mad(d, RV(M_1_PI), RV(-0.5))), RV(1.0));
+  intvec_t iq = convert_int(q);
+
 #ifdef VML_HAVE_FP_CONTRACT
-    d = mad(q, RV(-PI4_A*2), d);
-    d = mad(q, RV(-PI4_B*2), d);
-    d = mad(q, RV(-PI4_C*2), d);
-    d = mad(q, RV(-PI4_D*2), d);
+  d = mad(q, RV(-PI4_A * 2), d);
+  d = mad(q, RV(-PI4_B * 2), d);
+  d = mad(q, RV(-PI4_C * 2), d);
+  d = mad(q, RV(-PI4_D * 2), d);
 #else
-    d = mad(q, RV(-M_PI_2), d);
+  d = mad(q, RV(-M_PI_2), d);
 #endif
-    
-    realvec_t s = d * d;
-    
-    d = ifthen(convert_bool(iq & IV(I(2))), d, -d);
-    
-    realvec_t u;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      u = RV(2.6083159809786593541503e-06f);
-      u = mad(u, s, RV(-0.0001981069071916863322258f));
-      u = mad(u, s, RV(0.00833307858556509017944336f));
-      u = mad(u, s, RV(-0.166666597127914428710938f));
-      break;
-    case sizeof(double):
-      u = RV(-7.97255955009037868891952e-18);
-      u = mad(u, s, RV(2.81009972710863200091251e-15));
-      u = mad(u, s, RV(-7.64712219118158833288484e-13));
-      u = mad(u, s, RV(1.60590430605664501629054e-10));
-      u = mad(u, s, RV(-2.50521083763502045810755e-08));
-      u = mad(u, s, RV(2.75573192239198747630416e-06));
-      u = mad(u, s, RV(-0.000198412698412696162806809));
-      u = mad(u, s, RV(0.00833333333333332974823815));
-      u = mad(u, s, RV(-0.166666666666666657414808));
-      break;
-    }
-    
-    u = mad(s, u * d, d);
-    
-    const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
-    u = ifthen(isinf(d), RV(nan), u);
-    
-    return u;
+
+  realvec_t s = d * d;
+
+  d = ifthen(convert_bool(iq & IV(I(2))), d, -d);
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(2.6083159809786593541503e-06f);
+    u = mad(u, s, RV(-0.0001981069071916863322258f));
+    u = mad(u, s, RV(0.00833307858556509017944336f));
+    u = mad(u, s, RV(-0.166666597127914428710938f));
+    break;
+  case sizeof(double):
+    u = RV(-7.97255955009037868891952e-18);
+    u = mad(u, s, RV(2.81009972710863200091251e-15));
+    u = mad(u, s, RV(-7.64712219118158833288484e-13));
+    u = mad(u, s, RV(1.60590430605664501629054e-10));
+    u = mad(u, s, RV(-2.50521083763502045810755e-08));
+    u = mad(u, s, RV(2.75573192239198747630416e-06));
+    u = mad(u, s, RV(-0.000198412698412696162806809));
+    u = mad(u, s, RV(0.00833333333333332974823815));
+    u = mad(u, s, RV(-0.166666666666666657414808));
+    break;
   }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_tan(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    real_t PI4_A, PI4_B, PI4_C, PI4_D;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      PI4_A = 0.78515625f;
-      PI4_B = 0.00024187564849853515625f;
-      PI4_C = 3.7747668102383613586e-08f;
-      PI4_D = 1.2816720341285448015e-12f;
-      break;
-    case sizeof(double):
-      PI4_A = 0.78539816290140151978;
-      PI4_B = 4.9604678871439933374e-10;
-      PI4_C = 1.1258708853173288931e-18;
-      PI4_D = 1.7607799325916000908e-27;
-      break;
-    }
-    
-    realvec_t q = rint(d * RV(2 * M_1_PI));
-    intvec_t iq = convert_int(q);
-    
-    realvec_t x = d;
-    
+
+  u = mad(s, u * d, d);
+
+  const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+  u = ifthen(isinf(d), RV(nan), u);
+
+  return u;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_tan(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80
+
+  real_t PI4_A, PI4_B, PI4_C, PI4_D;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    PI4_A = 0.78515625f;
+    PI4_B = 0.00024187564849853515625f;
+    PI4_C = 3.7747668102383613586e-08f;
+    PI4_D = 1.2816720341285448015e-12f;
+    break;
+  case sizeof(double):
+    PI4_A = 0.78539816290140151978;
+    PI4_B = 4.9604678871439933374e-10;
+    PI4_C = 1.1258708853173288931e-18;
+    PI4_D = 1.7607799325916000908e-27;
+    break;
+  }
+
+  realvec_t q = rint(d * RV(2 * M_1_PI));
+  intvec_t iq = convert_int(q);
+
+  realvec_t x = d;
+
 #ifdef VML_HAVE_FP_CONTRACT
-    x = mad(q, RV(-PI4_A*2), x);
-    x = mad(q, RV(-PI4_B*2), x);
-    x = mad(q, RV(-PI4_C*2), x);
-    x = mad(q, RV(-PI4_D*2), x);
+  x = mad(q, RV(-PI4_A * 2), x);
+  x = mad(q, RV(-PI4_B * 2), x);
+  x = mad(q, RV(-PI4_C * 2), x);
+  x = mad(q, RV(-PI4_D * 2), x);
 #else
-    x = mad(q, RV(-M_PI_2), x);
+  x = mad(q, RV(-M_PI_2), x);
 #endif
-    
-    realvec_t s = x * x;
-    
-    x = ifthen(convert_bool(iq & IV(I(1))), -x, x);
-    
-    realvec_t u;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      u = RV(0.00927245803177356719970703f);
-      u = mad(u, s, RV(0.00331984995864331722259521f));
-      u = mad(u, s, RV(0.0242998078465461730957031f));
-      u = mad(u, s, RV(0.0534495301544666290283203f));
-      u = mad(u, s, RV(0.133383005857467651367188f));
-      u = mad(u, s, RV(0.333331853151321411132812f));
-      break;
-    case sizeof(double):
-      u = RV(1.01419718511083373224408e-05);
-      u = mad(u, s, RV(-2.59519791585924697698614e-05));
-      u = mad(u, s, RV(5.23388081915899855325186e-05));
-      u = mad(u, s, RV(-3.05033014433946488225616e-05));
-      u = mad(u, s, RV(7.14707504084242744267497e-05));
-      u = mad(u, s, RV(8.09674518280159187045078e-05));
-      u = mad(u, s, RV(0.000244884931879331847054404));
-      u = mad(u, s, RV(0.000588505168743587154904506));
-      u = mad(u, s, RV(0.00145612788922812427978848));
-      u = mad(u, s, RV(0.00359208743836906619142924));
-      u = mad(u, s, RV(0.00886323944362401618113356));
-      u = mad(u, s, RV(0.0218694882853846389592078));
-      u = mad(u, s, RV(0.0539682539781298417636002));
-      u = mad(u, s, RV(0.133333333333125941821962));
-      u = mad(u, s, RV(0.333333333333334980164153));
-      break;
-    }
-    
-    u = mad(s, u * x, x);
-    
-    u = ifthen(convert_bool(iq & IV(I(1))), rcp(u), u);
-    
-    const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
-    u = ifthen(isinf(d), RV(nan), u);
-    
-    return u;
+
+  realvec_t s = x * x;
+
+  x = ifthen(convert_bool(iq & IV(I(1))), -x, x);
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(0.00927245803177356719970703f);
+    u = mad(u, s, RV(0.00331984995864331722259521f));
+    u = mad(u, s, RV(0.0242998078465461730957031f));
+    u = mad(u, s, RV(0.0534495301544666290283203f));
+    u = mad(u, s, RV(0.133383005857467651367188f));
+    u = mad(u, s, RV(0.333331853151321411132812f));
+    break;
+  case sizeof(double):
+    u = RV(1.01419718511083373224408e-05);
+    u = mad(u, s, RV(-2.59519791585924697698614e-05));
+    u = mad(u, s, RV(5.23388081915899855325186e-05));
+    u = mad(u, s, RV(-3.05033014433946488225616e-05));
+    u = mad(u, s, RV(7.14707504084242744267497e-05));
+    u = mad(u, s, RV(8.09674518280159187045078e-05));
+    u = mad(u, s, RV(0.000244884931879331847054404));
+    u = mad(u, s, RV(0.000588505168743587154904506));
+    u = mad(u, s, RV(0.00145612788922812427978848));
+    u = mad(u, s, RV(0.00359208743836906619142924));
+    u = mad(u, s, RV(0.00886323944362401618113356));
+    u = mad(u, s, RV(0.0218694882853846389592078));
+    u = mad(u, s, RV(0.0539682539781298417636002));
+    u = mad(u, s, RV(0.133333333333125941821962));
+    u = mad(u, s, RV(0.333333333333334980164153));
+    break;
   }
-  
+
+  u = mad(s, u * x, x);
+
+  u = ifthen(convert_bool(iq & IV(I(1))), rcp(u), u);
+
+  const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+  u = ifthen(isinf(d), RV(nan), u);
+
+  return u;
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_SIN_H
+#endif // #ifndef MATHFUNCS_SIN_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_sinh.h b/lib/kernel/vecmathlib/mathfuncs_sinh.h
index 04aa446..a8c2ee3 100644
--- a/lib/kernel/vecmathlib/mathfuncs_sinh.h
+++ b/lib/kernel/vecmathlib/mathfuncs_sinh.h
@@ -7,28 +7,23 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_cosh(realvec_t x) {
+  return RV(0.5) * (exp(x) + exp(-x));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_sinh(realvec_t x) {
+  return RV(0.5) * (exp(x) - exp(-x));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_tanh(realvec_t x) {
+  return sinh(x) / cosh(x);
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_cosh(realvec_t x)
-  {
-    return RV(0.5) * (exp(x) + exp(-x));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_sinh(realvec_t x)
-  {
-    return RV(0.5) * (exp(x) - exp(-x));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_tanh(realvec_t x)
-  {
-    return sinh(x) / cosh(x);
-  }
-  
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_SINH_H
+#endif // #ifndef MATHFUNCS_SINH_H
diff --git a/lib/kernel/vecmathlib/mathfuncs_sqrt.h b/lib/kernel/vecmathlib/mathfuncs_sqrt.h
index dea5fd6..7a362f9 100644
--- a/lib/kernel/vecmathlib/mathfuncs_sqrt.h
+++ b/lib/kernel/vecmathlib/mathfuncs_sqrt.h
@@ -7,13 +7,10 @@
 
 #include <cmath>
 
-
-
 namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x)
-  {
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x) {
 #if 0
     // Handle special case: zero
     boolvec_t is_zero = x <= RV(0.0);
@@ -49,29 +46,23 @@ namespace vecmathlib {
     // Handle special case: zero
     r = ifthen(is_zero, RV(0.0), r);
 #endif
-    
-    realvec_t r = x * rsqrt(x);
-    // Handle special case: zero
-    r = ifthen(x == RV(0.0), RV(0.0), r);
-    
-    return r;
-  }
-  
-  
-  
-  // TODO: Use "Halley's method with cubic convergence":
-  // <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf>
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x)
-  {
-    return pow(x, RV(1.0/3.0));
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x)
-  {
+
+  realvec_t r = x * rsqrt(x);
+  // Handle special case: zero
+  r = ifthen(x == RV(0.0), RV(0.0), r);
+
+  return r;
+}
+
+// TODO: Use "Halley's method with cubic convergence":
+// <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf>
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x) {
+  return pow(x, RV(1.0 / 3.0));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x) {
 #if 0
     // See <http://en.wikipedia.org/wiki/Fast_inverse_square_root>
     realvec_t x_2 = RV(0.5) * x;
@@ -85,46 +76,43 @@ namespace vecmathlib {
     r += r * (RV(0.5) - (x_2 * r * r));
     return r;
 #else
-    // Initial guess
-    // VML_ASSERT(all(x > RV(0.0)));
-    intvec_t ilogb_x = ilogb(x);
-    realvec_t s =
+  // Initial guess
+  // VML_ASSERT(all(x > RV(0.0)));
+  intvec_t ilogb_x = ilogb(x);
+  realvec_t s =
       ifthen(convert_bool(ilogb_x & IV(I(1))), RV(R(0.583)), RV(R(0.824)));
-    realvec_t r = ldexp(s, -(ilogb_x >> I(1)));
-    
-    realvec_t x_2 = RV(0.5) * x;
-    
-    // Iterate
-    // nmax iterations give an accuracy of 2^nmax binary digits. 5
-    // iterations suffice for double precision with its 53 digits.
-    int const nmax = sizeof(real_t)==4 ? 4 : 5;
-    for (int n=0; n<nmax; ++n) {
-      // Step
-      VML_ASSERT(all(r > RV(0.0)));
-      // Newton method:
-      // Solve   f(r) = 0   for   f(r) = x - 1/r^2
-      //    r <- r - f(r) / f'(r)
-      //    r <- (3 r - r^3 x) / 2
-      //    r <- r (3/2 - r^2 x/2)
-      
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors (says who?)
-      // r *= RV(1.5) - x_2 * r*r;
-      r += r * (RV(0.5) - x_2 * r*r);
-    }
-    
-    return r;
-#endif
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y)
-  {
-    return sqrt(x*x + y*y);
+  realvec_t r = ldexp(s, -(ilogb_x >> I(1)));
+
+  realvec_t x_2 = RV(0.5) * x;
+
+  // Iterate
+  // nmax iterations give an accuracy of 2^nmax binary digits. 5
+  // iterations suffice for double precision with its 53 digits.
+  int const nmax = sizeof(real_t) == 4 ? 4 : 5;
+  for (int n = 0; n < nmax; ++n) {
+    // Step
+    VML_ASSERT(all(r > RV(0.0)));
+    // Newton method:
+    // Solve   f(r) = 0   for   f(r) = x - 1/r^2
+    //    r <- r - f(r) / f'(r)
+    //    r <- (3 r - r^3 x) / 2
+    //    r <- r (3/2 - r^2 x/2)
+
+    // Note: don't rewrite this expression, this may introduce
+    // cancellation errors (says who?)
+    // r *= RV(1.5) - x_2 * r*r;
+    r += r * (RV(0.5) - x_2 * r * r);
   }
-  
+
+  return r;
+#endif
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y) {
+  return sqrt(x * x + y * y);
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_SQRT_H
+#endif // #ifndef MATHFUNCS_SQRT_H
diff --git a/lib/kernel/vecmathlib/selftest.cc b/lib/kernel/vecmathlib/selftest.cc
new file mode 100644
index 0000000..5e1a943
--- /dev/null
+++ b/lib/kernel/vecmathlib/selftest.cc
@@ -0,0 +1,1724 @@
+// -*-C++-*-
+
+#include "vecmathlib.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <sstream>
+
+using namespace std;
+
+int num_errors = 0;
+
+template <typename realvec_t> struct vecmathlib_test {
+
+  typedef typename realvec_t::boolvec_t boolvec_t;
+  typedef typename realvec_t::intvec_t intvec_t;
+
+  typedef typename realvec_t::int_t int_t;
+  typedef typename realvec_t::uint_t uint_t;
+  typedef typename realvec_t::real_t real_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+
+  typedef vecmathlib::floatprops<real_t> FP;
+  typedef vecmathlib::mathfuncs<realvec_t> MF;
+
+  // Test each function with this many random values
+  static const int imax = 10000;
+  static real_t accuracy(real_t ulp = R(0.5)) {
+#ifdef VML_HAVE_FP_CONTRACT
+    // Require that 100% of the digits are correct
+    // real_t digit_fraction = 1.0;
+    // We can't do that yet -- require fewer digits
+    real_t digit_fraction = 0.9;
+#else
+    // Require that 80% of the digits are correct
+    real_t digit_fraction = 0.8;
+#endif
+    digit_fraction *= 0.95; // some lenience for testing (why?)
+    return pow(ulp * realvec_t::epsilon(), digit_fraction);
+  }
+
+  static realvec_t random(const real_t xmin, const real_t xmax) {
+    realvec_t x;
+    for (int i = 0; i < realvec_t::size; ++i) {
+      const real_t r = (xmax - xmin) * FP::convert_float(rand()) /
+                       FP::convert_float(RAND_MAX);
+      x.set_elt(i, xmin + r);
+    }
+    return x;
+  }
+
+  static intvec_t random(const int_t nmin, const int_t nmax) {
+    intvec_t n;
+    for (int i = 0; i < intvec_t::size; ++i) {
+      const real_t r = R(nmax - nmin + 1) * R(rand()) / (R(RAND_MAX) + R(1.0));
+      n.set_elt(i, nmin + FP::convert_int(floor(r)));
+    }
+    return n;
+  }
+
+  static bool is_big_endian() {
+    const int i = 1;
+    unsigned char cs[sizeof i];
+    memcpy(cs, &i, sizeof i);
+    return cs[0] == 0;
+  }
+
+  template <typename T> static string hex(const T x) {
+    unsigned char cs[sizeof x];
+    memcpy(cs, &x, sizeof x);
+    ostringstream buf;
+    buf << "0x";
+    const char *const hexdigits = "0123456789abcdef";
+    const int n0 = is_big_endian() ? 0 : sizeof x - 1;
+    const int dn = is_big_endian() ? +1 : -1;
+    const int n1 = n0 + sizeof x * dn;
+    for (int n = n0; n != n1; n += dn) {
+      buf << hexdigits[cs[n] >> 4] << hexdigits[cs[n] & 15];
+    }
+    return buf.str();
+  }
+
+  static boolvec_t supported(realvec_t x) {
+    return x == RV(0.0) || MF::vml_ieee_isnormal(x)
+#ifdef VML_HAVE_DENORMALS
+           || MF::vml_ieee_isfinite(x)
+#endif
+#ifdef VML_HAVE_INF
+           || MF::vml_ieee_isinf(x)
+#endif
+#ifdef VML_HAVE_NAN
+           || MF::vml_ieee_isnan(x)
+#endif
+        ;
+  }
+
+  static boolvec_t supported(intvec_t x) { return true; }
+
+  static boolvec_t supported(boolvec_t x) { return true; }
+
+  // Check load memory access
+  static void check_mem(const char *const func, const realvec_t x,
+                        const real_t *const p, const realvec_t xold,
+                        const int mval) {
+    realvec_t xwant;
+    for (int i = 0; i < realvec_t::size; ++i) {
+      xwant.set_elt(i, mval & (1 << i) ? p[i] : xold[i]);
+    }
+    const boolvec_t isbad = x != xwant;
+    if (any(isbad)) {
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
+           << "   found=" << x << " [" << hex(x) << "]\n"
+           << "   expected=" << xwant << " [" << hex(xwant) << "]\n"
+           << "   mval=" << mval << " [" << hex(mval) << "]\n"
+           << "   isbad=" << isbad << "\n" << flush;
+    }
+  }
+
+  // Check store memory access
+  static void check_mem(const char *const func, const real_t *const p,
+                        const realvec_t x, const real_t *const pold,
+                        const int mval) {
+    realvec_t pv, pvwant;
+    for (int i = 0; i < realvec_t::size; ++i) {
+      pv.set_elt(i, p[i]);
+      pvwant.set_elt(i, mval & (1 << i) ? x[i] : pold[i]);
+    }
+    const boolvec_t isbad = pv != pvwant;
+    if (any(isbad)) {
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
+           << "   found=" << pv << " [" << hex(pv) << "]\n"
+           << "   expected=" << pvwant << " [" << hex(pvwant) << "]\n"
+           << "   isbad=" << isbad << "\n" << flush;
+    }
+  }
+
+  static void check_bool(const char *const func, const bool rstd,
+                         const bool rvml) {
+    const bool dr = rstd ^ rvml;
+    const bool isbad = dr;
+    if (isbad) {
+      ++num_errors;
+      cout << "Error in " << func << ":\n"
+           << "   fstd()=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml()=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   isbad()=" << isbad << "\n" << flush;
+    }
+  }
+
+  template <typename A>
+  static void check_bool(const char *const func, const bool rstd,
+                         const bool rvml, const A x) {
+    const bool dr = rstd ^ rvml;
+    const bool isbad = dr;
+    if (isbad) {
+      ++num_errors;
+      cout << "Error in " << func << ":\n"
+           << "   x=" << x << " [" << hex(x) << "]\n"
+           << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   isbad(x)=" << isbad << "\n" << flush;
+    }
+  }
+
+  template <typename A>
+  static void check_bool(const char *const func, const boolvec_t rstd,
+                         const boolvec_t rvml, const A x) {
+    boolvec_t dr;
+    bool isbad = false;
+    for (int i = 0; i < realvec_t::size; ++i) {
+      dr.set_elt(i, rstd[i] ^ rvml[i]);
+      isbad |= dr[i];
+    }
+    if (isbad) {
+      ++num_errors;
+      cout << "Error in " << func << ":\n"
+           << "   x=" << x << " [" << hex(x) << "]\n"
+           << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   error(x)=" << dr << " [" << hex(rvml) << "]\n"
+           << "   isbad(x)=" << isbad << "\n" << flush;
+    }
+  }
+
+  template <typename A, typename B>
+  static void check_bool(const char *const func, const boolvec_t rstd,
+                         const boolvec_t rvml, const A x, const B y) {
+    boolvec_t dr;
+    bool isbad = false;
+    for (int i = 0; i < realvec_t::size; ++i) {
+      dr.set_elt(i, rstd[i] ^ rvml[i]);
+      isbad |= dr[i];
+    }
+    if (isbad) {
+      ++num_errors;
+      cout << "Error in " << func << ":\n"
+           << "   x=" << x << " [" << hex(x) << "]\n"
+           << "   y=" << y << " [" << hex(y) << "]\n"
+           << "   fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   error(x,y)=" << dr << " [" << hex(rvml) << "]\n"
+           << "   isbad(x,y)=" << isbad << "\n" << flush;
+    }
+  }
+
+  template <typename A>
+  static void check_bool(const char *const func,
+                         bool fstd(typename A::scalar_t x), boolvec_t fvml(A x),
+                         const A x) {
+    boolvec_t rstd;
+    for (int i = 0; i < boolvec_t::size; ++i) {
+      rstd.set_elt(i, fstd(x[i]));
+    }
+    const boolvec_t rvml = fvml(x);
+    const boolvec_t dr = rstd != rvml;
+    const boolvec_t isbad = supported(x) && supported(rstd) && dr;
+    if (any(isbad)) {
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
+           << "   x=" << x << " [" << hex(x) << "]\n"
+           << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   error(x)=" << dr << " [" << hex(dr) << "]\n"
+           << "   isbad(x)=" << isbad << "\n" << flush;
+    }
+  }
+
+  template <typename A, typename B>
+  static void check_bool(const char *const func,
+                         bool fstd(typename A::scalar_t x,
+                                   typename B::scalar_t y),
+                         boolvec_t fvml(A x, B y), const A x, const B y) {
+    boolvec_t rstd;
+    for (int i = 0; i < boolvec_t::size; ++i) {
+      rstd.set_elt(i, fstd(x[i], y[i]));
+    }
+    const boolvec_t rvml = fvml(x, y);
+    const boolvec_t dr = rstd != rvml;
+    const boolvec_t isbad = supported(x) && supported(rstd) && dr;
+    if (any(isbad)) {
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
+           << "   x=" << x << " [" << hex(x) << "]\n"
+           << "   y=" << y << " [" << hex(y) << "]\n"
+           << "   fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   error(x,y)=" << dr << " [" << hex(dr) << "]\n"
+           << "   isbad(x,y)=" << isbad << "\n" << flush;
+    }
+  }
+
+  template <typename A, typename B, typename C>
+  static void
+  check_bool(const char *const func,
+             bool fstd(typename A::scalar_t x, typename B::scalar_t y,
+                       typename C::scalar_t z),
+             boolvec_t fvml(A x, B y, C z), const A x, const B y, const C z) {
+    boolvec_t rstd;
+    for (int i = 0; i < boolvec_t::size; ++i) {
+      rstd.set_elt(i, fstd(x[i], y[i], z[i]));
+    }
+    const boolvec_t rvml = fvml(x, y, z);
+    const boolvec_t dr = rstd != rvml;
+    const boolvec_t isbad = supported(x) && supported(rstd) && dr;
+    if (any(isbad)) {
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
+           << "   x=" << x << " [" << hex(x) << "]\n"
+           << "   y=" << y << " [" << hex(y) << "]\n"
+           << "   z=" << z << " [" << hex(z) << "]\n"
+           << "   fstd(x,y,z)=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml(x,y,z)=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   error(x,y,z)=" << dr << " [" << hex(dr) << "]\n"
+           << "   isbad(x,y,z)=" << isbad << "\n" << flush;
+    }
+  }
+
+  static void check_int(const char *const func, const int_t rstd,
+                        const int_t rvml) {
+    const int_t dr = rstd - rvml;
+    const bool isbad = dr;
+    if (isbad) {
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
+           << "   fstd()=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml()=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   error()=" << dr << " [" << hex(dr) << "]\n"
+           << "   isbad()=" << isbad << "\n" << flush;
+    }
+  }
+
+  template <typename A>
+  static void check_int(const char *const func,
+                        int_t fstd(typename A::scalar_t x), intvec_t fvml(A x),
+                        const A x) {
+    intvec_t rstd;
+    for (int i = 0; i < intvec_t::size; ++i) {
+      rstd.set_elt(i, fstd(x[i]));
+    }
+    const intvec_t rvml = fvml(x);
+    const intvec_t dr = rstd - rvml;
+    const boolvec_t isbad = supported(x) && supported(rstd) && convert_bool(dr);
+    if (any(isbad)) {
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
+           << "   x=" << x << " [" << hex(x) << "]\n"
+           << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   error(x)=" << dr << " [" << hex(dr) << "]\n"
+           << "   isbad(x)=" << isbad << "\n" << flush;
+    }
+  }
+
+  // Check a binary integer-valued function whose second argument is passed
+  // through unchanged: the same y is handed to fstd for every lane of x and to
+  // fvml as a whole. Lanes where the results differ, and where x and the
+  // reference pass supported(), count as one error and are reported on cout.
+  template <typename A, typename B>
+  static void check_int(const char *const func,
+                        int_t fstd(typename A::scalar_t x, B y),
+                        intvec_t fvml(A x, B y), const A x, const B y) {
+    // Reference result, built lane by lane
+    intvec_t expected;
+    for (int lane = 0; lane < intvec_t::size; ++lane) {
+      expected.set_elt(lane, fstd(x[lane], y));
+    }
+    // Vector result and lane-wise difference
+    const intvec_t actual = fvml(x, y);
+    const intvec_t delta = expected - actual;
+    const boolvec_t mismatch =
+        supported(x) && supported(expected) && convert_bool(delta);
+    if (!any(mismatch)) {
+      return;
+    }
+    ++num_errors;
+    cout << setprecision(realvec_t::digits10 + 2);
+    cout << "Error in " << func << ":\n";
+    cout << "   x=" << x << " [" << hex(x) << "]\n";
+    cout << "   y=" << y << " [" << hex(y) << "]\n";
+    cout << "   fstd(x,y)=" << expected << " [" << hex(expected) << "]\n";
+    cout << "   fvml(x,y)=" << actual << " [" << hex(actual) << "]\n";
+    cout << "   error(x,y)=" << delta << " [" << hex(delta) << "]\n";
+    cout << "   isbad(x,y)=" << mismatch << "\n" << flush;
+  }
+
+  // Check a binary integer-valued function lane by lane: fstd receives the
+  // matching lanes of x and y, fvml receives both vectors. Lanes where the
+  // results differ, and where x, y and the reference all pass supported(),
+  // count as one error in num_errors and are reported on cout.
+  template <typename A, typename B>
+  static void check_int(const char *const func,
+                        int_t fstd(typename A::scalar_t x,
+                                   typename B::scalar_t y),
+                        intvec_t fvml(A x, B y), const A x, const B y) {
+    // Reference result, built lane by lane
+    intvec_t expected;
+    for (int lane = 0; lane < intvec_t::size; ++lane) {
+      expected.set_elt(lane, fstd(x[lane], y[lane]));
+    }
+    // Vector result and lane-wise difference
+    const intvec_t actual = fvml(x, y);
+    const intvec_t delta = expected - actual;
+    const boolvec_t mismatch = supported(x) && supported(y) &&
+                               supported(expected) && convert_bool(delta);
+    if (!any(mismatch)) {
+      return;
+    }
+    ++num_errors;
+    cout << setprecision(realvec_t::digits10 + 2);
+    cout << "Error in " << func << ":\n";
+    cout << "   x=" << x << " [" << hex(x) << "]\n";
+    cout << "   y=" << y << " [" << hex(y) << "]\n";
+    cout << "   fstd(x,y)=" << expected << " [" << hex(expected) << "]\n";
+    cout << "   fvml(x,y)=" << actual << " [" << hex(actual) << "]\n";
+    cout << "   error(x,y)=" << delta << " [" << hex(delta) << "]\n";
+    cout << "   isbad(x,y)=" << mismatch << "\n" << flush;
+  }
+
+  // Check a ternary integer-valued function lane by lane: fstd receives the
+  // matching lanes of x, y and z, fvml receives the three vectors. Lanes where
+  // the results differ, and where all inputs and the reference pass
+  // supported(), count as one error in num_errors and are reported on cout.
+  template <typename A, typename B, typename C>
+  static void
+  check_int(const char *const func,
+            int_t fstd(typename A::scalar_t x, typename B::scalar_t y,
+                       typename C::scalar_t z),
+            intvec_t fvml(A x, B y, C z), const A x, const B y, const C z) {
+    // Reference result, built lane by lane
+    intvec_t expected;
+    for (int lane = 0; lane < intvec_t::size; ++lane) {
+      expected.set_elt(lane, fstd(x[lane], y[lane], z[lane]));
+    }
+    // Vector result and lane-wise difference
+    const intvec_t actual = fvml(x, y, z);
+    const intvec_t delta = expected - actual;
+    const boolvec_t mismatch = supported(x) && supported(y) && supported(z) &&
+                               supported(expected) && convert_bool(delta);
+    if (!any(mismatch)) {
+      return;
+    }
+    ++num_errors;
+    cout << setprecision(realvec_t::digits10 + 2);
+    cout << "Error in " << func << ":\n";
+    cout << "   x=" << x << " [" << hex(x) << "]\n";
+    cout << "   y=" << y << " [" << hex(y) << "]\n";
+    cout << "   z=" << z << " [" << hex(z) << "]\n";
+    cout << "   fstd(x,y,z)=" << expected << " [" << hex(expected) << "]\n";
+    cout << "   fvml(x,y,z)=" << actual << " [" << hex(actual) << "]\n";
+    cout << "   error(x,y,z)=" << delta << " [" << hex(delta) << "]\n";
+    cout << "   isbad(x,y,z)=" << mismatch << "\n" << flush;
+  }
+
+  // Compare two scalar real values exactly. Any non-zero difference (which
+  // includes a NaN difference, since NaN != 0) counts as one error in
+  // num_errors and is reported on cout.
+  static void check_real(const char *const func, const real_t rstd,
+                         const real_t rvml) {
+    const real_t delta = rstd - rvml;
+    const bool mismatch = delta != R(0.0);
+    if (!mismatch) {
+      return;
+    }
+    ++num_errors;
+    cout << setprecision(realvec_t::digits10 + 2);
+    cout << "Error in " << func << "():\n";
+    cout << "   fstd()=" << rstd << " [" << hex(rstd) << "]\n";
+    cout << "   fvml()=" << rvml << " [" << hex(rvml) << "]\n";
+    cout << "   error()=" << delta << "\n";
+    cout << "   isbad()=" << mismatch << "\n" << flush;
+  }
+
+  // Compare two scalar results that were computed from the vector x (e.g. a
+  // reduction) against a tolerance. The allowed error is accuracy times a
+  // scale built from |rstd|, |rvml|, the largest |x[i]| and 1. A failure
+  // counts as one error in num_errors and is reported on cout.
+  template <typename A>
+  static void check_real(const char *const func, const real_t rstd,
+                         const real_t rvml, const A x, const real_t accuracy) {
+    const real_t delta = rstd - rvml;
+    // Largest magnitude among the input lanes
+    real_t largest = 0.0;
+    for (int lane = 0; lane < realvec_t::size; ++lane) {
+      largest = vml_std::fmax(largest, vml_std::fabs(x[lane]));
+    }
+    const real_t scale = fabs(rstd) + fabs(rvml) + fabs(largest) + R(1.0);
+    const bool mismatch = fabs(delta) > accuracy * scale;
+    if (!mismatch) {
+      return;
+    }
+    ++num_errors;
+    cout << setprecision(realvec_t::digits10 + 2);
+    cout << "Error in " << func << "():\n";
+    cout << "   x=" << x << " [" << hex(x) << "]\n";
+    cout << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n";
+    cout << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n";
+    cout << "   error(x)=" << delta << "\n";
+    cout << "   isbad(x)=" << mismatch << "\n" << flush;
+  }
+
+  // Check a unary real-valued function against a tolerance. fstd is applied to
+  // every lane of x to build the reference; fvml is applied to the whole
+  // vector. A lane is bad when |reference - result| exceeds
+  // accuracy * (|reference| + |result| + 1) and both x and the reference pass
+  // supported(). Any bad lane counts as one error and is reported on cout.
+  template <typename A>
+  static void
+  check_real(const char *const func, real_t fstd(typename A::scalar_t x),
+             realvec_t fvml(A x), const A x, const real_t accuracy) {
+    // Reference result, built lane by lane
+    realvec_t expected;
+    for (int lane = 0; lane < realvec_t::size; ++lane) {
+      expected.set_elt(lane, fstd(x[lane]));
+    }
+    // Vector result, lane-wise difference and error scale
+    const realvec_t actual = fvml(x);
+    const realvec_t delta = expected - actual;
+    const realvec_t scale = fabs(expected) + fabs(actual) + realvec_t(1.0);
+    const boolvec_t mismatch = supported(x) && supported(expected) &&
+                               fabs(delta) > realvec_t(accuracy) * scale;
+    if (!any(mismatch)) {
+      return;
+    }
+    ++num_errors;
+    cout << setprecision(realvec_t::digits10 + 2);
+    cout << "Error in " << func << ":\n";
+    cout << "   x=" << x << " [" << hex(x) << "]\n";
+    cout << "   fstd(x)=" << expected << " [" << hex(expected) << "]\n";
+    cout << "   fvml(x)=" << actual << " [" << hex(actual) << "]\n";
+    cout << "   abs-error(x)=" << fabs(delta) << "\n";
+    cout << "   rel-error(x)=" << fabs(delta) / scale << "\n";
+    cout << "   isbad(x)=" << mismatch << "\n";
+    cout << "   accuracy=" << accuracy << "\n" << flush;
+  }
+
+  // Check a binary real-valued function whose second argument is passed
+  // through unchanged: the same y is handed to fstd for every lane of x and to
+  // fvml as a whole. A lane is bad when |reference - result| exceeds
+  // accuracy * (|reference| + |result| + 1) and both x and the reference pass
+  // supported(). Any bad lane counts as one error and is reported on cout.
+  template <typename A, typename B>
+  static void check_real(const char *const func,
+                         real_t fstd(typename A::scalar_t x, B y),
+                         realvec_t fvml(A x, B y), const A x, const B y,
+                         const real_t accuracy) {
+    // Reference result, built lane by lane
+    realvec_t expected;
+    for (int lane = 0; lane < realvec_t::size; ++lane) {
+      expected.set_elt(lane, fstd(x[lane], y));
+    }
+    // Vector result, lane-wise difference and error scale
+    const realvec_t actual = fvml(x, y);
+    const realvec_t delta = expected - actual;
+    const realvec_t scale = fabs(expected) + fabs(actual) + realvec_t(1.0);
+    const boolvec_t mismatch = supported(x) && supported(expected) &&
+                               fabs(delta) > realvec_t(accuracy) * scale;
+    if (!any(mismatch)) {
+      return;
+    }
+    ++num_errors;
+    cout << setprecision(realvec_t::digits10 + 2);
+    cout << "Error in " << func << ":\n";
+    cout << "   x=" << x << " [" << hex(x) << "]\n";
+    cout << "   y=" << y << " [" << hex(y) << "]\n";
+    cout << "   fstd(x,y)=" << expected << " [" << hex(expected) << "]\n";
+    cout << "   fvml(x,y)=" << actual << " [" << hex(actual) << "]\n";
+    cout << "   abs-error(x,y)=" << fabs(delta) << "\n";
+    cout << "   rel-error(x,y)=" << fabs(delta) / scale << "\n";
+    cout << "   isbad(x,y)=" << mismatch << "\n";
+    cout << "   accuracy=" << accuracy << "\n" << flush;
+  }
+
+  // Check a binary real-valued function lane by lane against a tolerance.
+  // fstd receives the matching lanes of x and y; fvml receives both vectors.
+  // The optional offset (default 0) lets the vector result differ from the
+  // reference by one offset: where the two are more than |offset|/2 apart,
+  // offset is added to the vector result with the sign of the difference
+  // before the comparison is made.
+  // A lane is bad when the remaining |difference| exceeds
+  // accuracy * (|reference| + |result| + 1) and x, y and the reference all
+  // pass supported(). Any bad lane counts as one error in num_errors and is
+  // reported on cout.
+  template <typename A, typename B>
+  static void
+  check_real(const char *const func,
+             real_t fstd(typename A::scalar_t x, typename B::scalar_t y),
+             realvec_t fvml(A x, B y), const A x, const B y,
+             const real_t accuracy, const realvec_t offset = RV(0.0)) {
+    // Reference result, built lane by lane
+    realvec_t rstd;
+    for (int i = 0; i < realvec_t::size; ++i) {
+      rstd.set_elt(i, fstd(x[i], y[i]));
+    }
+    realvec_t rvml = fvml(x, y);
+    // Fix up rvml by adding/subtracting the offset
+    rvml = ifthen(fabs(rstd - rvml) > fabs(offset / RV(2.0)),
+                  rvml + copysign(offset, rstd - rvml), rvml);
+    // Difference and scale are computed from the fixed-up result
+    const realvec_t dr = rstd - rvml;
+    const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0);
+    const boolvec_t isbad = supported(x) && supported(y) && supported(rstd) &&
+                            fabs(dr) > realvec_t(accuracy) * scale;
+    if (any(isbad)) {
+      ++num_errors;
+      // Note: the fvml value printed below is the fixed-up one
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
+           << "   x=" << x << " [" << hex(x) << "]\n"
+           << "   y=" << y << " [" << hex(y) << "]\n"
+           << "   fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
+           << "   fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
+           << "   abs-error(x,y)=" << fabs(dr) << "\n"
+           << "   rel-error(x,y)=" << fabs(dr) / scale << "\n"
+           << "   isbad(x,y)=" << isbad << "\n"
+           << "   accuracy=" << accuracy << "\n" << flush;
+    }
+  }
+
+  // Check a ternary real-valued function lane by lane against a tolerance.
+  // fstd receives the matching lanes of x, y and z; fvml receives the three
+  // vectors. A lane is bad when |reference - result| exceeds
+  // accuracy * (|reference| + |result| + 1) and all inputs and the reference
+  // pass supported(). Any bad lane counts as one error and is reported on cout.
+  template <typename A, typename B, typename C>
+  static void check_real(const char *const func,
+                         real_t fstd(typename A::scalar_t x,
+                                     typename B::scalar_t y,
+                                     typename C::scalar_t z),
+                         realvec_t fvml(A x, B y, C z), const A x, const B y,
+                         C const z, const real_t accuracy) {
+    // Reference result, built lane by lane
+    realvec_t expected;
+    for (int lane = 0; lane < realvec_t::size; ++lane) {
+      expected.set_elt(lane, fstd(x[lane], y[lane], z[lane]));
+    }
+    // Vector result, lane-wise difference and error scale
+    const realvec_t actual = fvml(x, y, z);
+    const realvec_t delta = expected - actual;
+    const realvec_t scale = fabs(expected) + fabs(actual) + realvec_t(1.0);
+    const boolvec_t mismatch = supported(x) && supported(y) && supported(z) &&
+                               supported(expected) &&
+                               fabs(delta) > realvec_t(accuracy) * scale;
+    if (!any(mismatch)) {
+      return;
+    }
+    ++num_errors;
+    cout << setprecision(realvec_t::digits10 + 2);
+    cout << "Error in " << func << ":\n";
+    cout << "   x=" << x << " [" << hex(x) << "]\n";
+    cout << "   y=" << y << " [" << hex(y) << "]\n";
+    cout << "   z=" << z << " [" << hex(z) << "]\n";
+    cout << "   fstd(x,y,z)=" << expected << " [" << hex(expected) << "]\n";
+    cout << "   fvml(x,y,z)=" << actual << " [" << hex(actual) << "]\n";
+    cout << "   abs-error(x,y,z)=" << fabs(delta) << "\n";
+    cout << "   rel-error(x,y,z)=" << fabs(delta) / scale << "\n";
+    cout << "   isbad(x,y,z)=" << mismatch << "\n";
+    cout << "   accuracy=" << accuracy << "\n" << flush;
+  }
+
+  // Round p up to the next address that is a multiple of sizeof(realvec_t).
+  // The mask trick relies on sizeof(realvec_t) being a power of two; the
+  // assert verifies the result.
+  static real_t *align_mem(real_t *p) {
+    const ptrdiff_t alignment = sizeof(realvec_t);
+    const intptr_t bumped = intptr_t(p) + alignment - 1;
+    real_t *const aligned = (real_t *)(bumped & -alignment);
+    assert(intptr_t(aligned) % alignment == 0);
+    return aligned;
+  }
+  // Build the label "<str>.<i>", used to tell apart the iterations of a test.
+  static string add_suffix(const char *str, int i) {
+    ostringstream label;
+    label << str;
+    label << ".";
+    label << i;
+    return label.str();
+  }
+  // Exercise the aligned/unaligned load and store entry points, first without
+  // a mask and then with every possible lane mask. Two scratch buffers are
+  // used: x holds random reference data, xnew is reset from x before each
+  // store. All accesses start at element sz so that the offsets tested here
+  // stay inside the n*sz initialised elements. Results are handed to
+  // check_mem for verification.
+  static void test_mem() {
+    cout << "   testing loada loadu storea storeu (errors may lead to "
+            "segfaults)...\n"
+         << flush;
+    const int n = 4;
+    const int sz = realvec_t::size;
+    const int nbytes = n * sz * sizeof(real_t);
+    // Allocate one extra vector's worth of elements so that rounding the
+    // start address up to a vector boundary still leaves n*sz usable
+    // elements. Keep the raw pointers: align_mem may move the address, and
+    // only the original pointer may be passed to delete[].
+    real_t *const xraw = new real_t[(n + 1) * sz];
+    real_t *const xnewraw = new real_t[(n + 1) * sz];
+    real_t *const x = align_mem(xraw);
+    real_t *const xnew = align_mem(xnewraw);
+    for (int i = 0; i < n; ++i) {
+      realvec_t xv = random(R(-10.0), R(+10.0));
+      memcpy(&x[i * sz], &xv, sizeof xv);
+    }
+    const realvec_t z = random(R(-10.0), R(+10.0));
+
+    // loada
+    {
+      const real_t *p = &x[sz];
+      realvec_t y = realvec_t::loada(p);
+      check_mem("loada", y, p, z, ~0);
+    }
+
+    // loadu
+    for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
+      const real_t *p = &x[sz];
+      realvec_t y = realvec_t::loadu(p + i);
+      check_mem(add_suffix("loadu", i).c_str(), y, p + i, z, ~0);
+    }
+
+    // loadu(ioff)
+    for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
+      const real_t *p = &x[sz];
+      realvec_t y = realvec_t::loadu(p, ioff);
+      check_mem(add_suffix("loadu(ioff)", ioff).c_str(), y, p + ioff, z, ~0);
+    }
+
+    // storea
+    {
+      memcpy(xnew, x, nbytes);
+      real_t *p = &xnew[sz];
+      storea(z, p);
+      check_mem("storea", p, z, &x[sz], ~0);
+    }
+
+    // storeu
+    for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
+      memcpy(xnew, x, nbytes);
+      real_t *p = &xnew[sz];
+      storeu(z, p + i);
+      check_mem(add_suffix("storeu", i).c_str(), p + i, z, &x[sz + i], ~0);
+    }
+
+    // storeu(ioff)
+    for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
+      memcpy(xnew, x, nbytes);
+      real_t *p = &xnew[sz];
+      storeu(z, p, ioff);
+      check_mem(add_suffix("storeu(ioff)", ioff).c_str(), p + ioff, z,
+                &x[sz + ioff], ~0);
+    }
+
+    // Repeat with every lane mask: bit i of mval enables lane i
+    for (int mval = 0; mval < (1 << realvec_t::size); ++mval) {
+      boolvec_t mbool;
+      for (int i = 0; i < realvec_t::size; ++i)
+        mbool.set_elt(i, mval & (1 << i));
+      typename realvec_t::mask_t mask(mbool);
+
+      // loada(mask)
+      {
+        const real_t *p = &x[sz];
+        realvec_t y = loada(p, z, mask);
+        check_mem("loada(mask)", y, p, z, mval);
+      }
+
+      // loadu(mask)
+      for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
+        const real_t *p = &x[sz];
+        realvec_t y = loadu(p + i, z, mask);
+        check_mem("loadu(mask)", y, p + i, z, mval);
+      }
+
+      // loadu(ioff, mask)
+      for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
+        const real_t *p = &x[sz];
+        realvec_t y = loadu(p, ioff, z, mask);
+        check_mem("loadu(ioff,mask)", y, p + ioff, z, mval);
+      }
+
+      // storea(mask)
+      {
+        memcpy(xnew, x, nbytes);
+        real_t *p = &xnew[sz];
+        storea(z, p, mask);
+        check_mem("storea(mask)", p, z, &x[sz], mval);
+      }
+
+      // storeu(mask)
+      for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
+        memcpy(xnew, x, nbytes);
+        real_t *p = &xnew[sz];
+        storeu(z, p + i, mask);
+        check_mem("storeu(mask)", p + i, z, &x[sz + i], mval);
+      }
+
+      // storeu(ioff, mask)
+      for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
+        memcpy(xnew, x, nbytes);
+        real_t *p = &xnew[sz];
+        storeu(z, p, ioff, mask);
+        check_mem("storeu(ioff,mask)", p + ioff, z, &x[sz + ioff], mval);
+      }
+
+    } // for mval
+
+    // Release the scratch buffers through the pointers returned by new[]
+    delete[] xraw;
+    delete[] xnewraw;
+  }
+
+  // Scalar reference for ifthen: x when b is true, otherwise y.
+  template <typename T> static T local_ifthen(bool b, T x, T y) {
+    if (b) {
+      return x;
+    }
+    return y;
+  }
+  // Test the boolean vector type: broadcast construction, all/any, set_elt,
+  // element access, negation, ifthen with bool/int/real operands, and the
+  // binary operators && || == !=. The single-operand tests run over every
+  // possible lane pattern; the binary tests run over every pair of patterns.
+  // Results are verified through the check_bool/check_int/check_real helpers.
+  static void test_bool() {
+    cout << "   testing boolean operations...\n" << flush;
+
+    // Broadcast constants
+    const boolvec_t bf = boolvec_t(false);
+    const boolvec_t bt = boolvec_t(true);
+    for (int i = 0; i < realvec_t::size; ++i) {
+      check_bool("false", false, bf[i]);
+      check_bool("true", true, bt[i]);
+    }
+    check_bool("all", false, all(bf), false);
+    check_bool("all", true, all(bt), true);
+    check_bool("any", false, any(bf), false);
+    check_bool("any", true, any(bt), true);
+
+    // set_elt: flip one more lane per outer iteration, then verify that
+    // lanes 0..n hold the new value and the remaining lanes the old one
+    boolvec_t b0 = bt;
+    boolvec_t b1 = bf;
+    for (int n = 0; n < realvec_t::size; ++n) {
+      b0.set_elt(n, false);
+      b1.set_elt(n, true);
+      for (int i = 0; i < realvec_t::size; ++i) {
+        check_bool("set_elt", i <= n ? false : true, b0[i], false);
+        check_bool("set_elt", i <= n ? true : false, b1[i], true);
+      }
+    }
+
+    // Single-operand tests: bit i of n gives the value of lane i
+    for (int n = 0; n < (1 << realvec_t::size); ++n) {
+      boolvec_t x;
+      for (int i = 0; i < realvec_t::size; ++i) {
+        x.set_elt(i, n & (1 << i));
+      }
+      // Element access
+      for (int i = 0; i < realvec_t::size; ++i) {
+        bool rstd = n & (1 << i);
+        bool rvml = x[i];
+        check_bool("[]", rstd, rvml, x);
+      }
+
+      // Lane-wise negation
+      {
+        boolvec_t rstd;
+        for (int i = 0; i < realvec_t::size; ++i) {
+          rstd.set_elt(i, !x[i]);
+        }
+        boolvec_t rvml = !x;
+        check_bool("!", rstd, rvml, x);
+      }
+      // all: AND-reduction over the lanes
+      {
+        bool rstd = x[0];
+        for (int i = 1; i < realvec_t::size; ++i) {
+          rstd &= x[i];
+        }
+        bool rvml = all(x);
+        check_bool("all", rstd, rvml, x);
+      }
+      // any: OR-reduction over the lanes
+      {
+        bool rstd = x[0];
+        for (int i = 1; i < realvec_t::size; ++i) {
+          rstd |= x[i];
+        }
+        bool rvml = any(x);
+        check_bool("any", rstd, rvml, x);
+      }
+      // ifthen with x as the condition; the casts select the overload of
+      // vecmathlib::ifthen for the given operand type
+      check_bool(
+          "ifthen(bool)", local_ifthen<bool>,
+          (boolvec_t (*)(boolvec_t, boolvec_t, boolvec_t))vecmathlib::ifthen, x,
+          BV(false), BV(true));
+      check_int("ifthen(int)", local_ifthen<int_t>,
+                (intvec_t (*)(boolvec_t, intvec_t, intvec_t))vecmathlib::ifthen,
+                x, IV(I(1)), IV(I(2)));
+      check_real(
+          "ifthen(real)", local_ifthen<real_t>,
+          ((realvec_t (*)(boolvec_t, realvec_t, realvec_t))vecmathlib::ifthen),
+          x, RV(1.0), RV(2.0), R(0.0));
+    }
+
+    // Binary tests: every pair of lane patterns (n for x, m for y)
+    for (int n = 0; n < (1 << realvec_t::size); ++n) {
+      for (int m = 0; m < (1 << realvec_t::size); ++m) {
+        boolvec_t x, y;
+        for (int i = 0; i < realvec_t::size; ++i) {
+          x.set_elt(i, n & (1 << i));
+          y.set_elt(i, m & (1 << i));
+        }
+
+        {
+          boolvec_t rstd;
+          for (int i = 0; i < realvec_t::size; ++i) {
+            rstd.set_elt(i, x[i] && y[i]);
+          }
+          boolvec_t rvml = x && y;
+          check_bool("&&", rstd, rvml, x, y);
+        }
+        {
+          boolvec_t rstd;
+          for (int i = 0; i < realvec_t::size; ++i) {
+            rstd.set_elt(i, x[i] || y[i]);
+          }
+          boolvec_t rvml = x || y;
+          check_bool("||", rstd, rvml, x, y);
+        }
+        {
+          boolvec_t rstd;
+          for (int i = 0; i < realvec_t::size; ++i) {
+            rstd.set_elt(i, x[i] == y[i]);
+          }
+          boolvec_t rvml = x == y;
+          check_bool("==", rstd, rvml, x, y);
+        }
+        {
+          boolvec_t rstd;
+          for (int i = 0; i < realvec_t::size; ++i) {
+            rstd.set_elt(i, x[i] != y[i]);
+          }
+          boolvec_t rvml = x != y;
+          check_bool("!=", rstd, rvml, x, y);
+        }
+      }
+    }
+  }
+
+  // Thin wrappers that turn conversions and built-in operators into named
+  // functions, so they can be passed as function arguments to the check_*
+  // helpers. Each template works for whatever type T supports the operator.
+
+  // Conversions between int_t and bool (implicit conversion rules)
+  static bool local_convert_bool(int_t x) { return x; }
+  static int_t local_convert_int(bool x) { return x; }
+  // Unary operators
+  template <typename T> static T local_pos(T x) { return +x; }
+  template <typename T> static T local_neg(T x) { return -x; }
+  template <typename T> static T local_not(T x) { return ~x; }
+  // Binary arithmetic operators
+  template <typename T> static T local_add(T x, T y) { return x + y; }
+  template <typename T> static T local_sub(T x, T y) { return x - y; }
+  template <typename T> static T local_mul(T x, T y) { return x * y; }
+  template <typename T> static T local_div(T x, T y) { return x / y; }
+  template <typename T> static T local_mod(T x, T y) { return x % y; }
+  // Binary bitwise operators
+  template <typename T> static T local_and(T x, T y) { return x & y; }
+  template <typename T> static T local_or(T x, T y) { return x | y; }
+  template <typename T> static T local_xor(T x, T y) { return x ^ y; }
+  // Shift wrappers, for use as function arguments to the check_* helpers.
+
+  // Logical shift right: shifts the unsigned representation of x
+  static int_t local_lsr(int_t x, int_t y) { return uint_t(x) >> uint_t(y); }
+  // Shift of a vector-like T by a single scalar count
+  template <typename T> static T local_srs(T x, typename T::scalar_t y) {
+    return x >> y;
+  }
+  template <typename T> static T local_sls(T x, typename T::scalar_t y) {
+    return x << y;
+  }
+  // Shift where value and count have the same type
+  template <typename T> static T local_sr(T x, T y) { return x >> y; }
+  template <typename T> static T local_sl(T x, T y) { return x << y; }
+
+  // Comparison wrappers, for use as function arguments to the check_* helpers.
+  // The local_* versions return a scalar bool; the local_v* versions return a
+  // boolvec_t and are meant for vector operands.
+
+  // True when x is negative
+  template <typename T> static bool local_isignbit(T x) { return x < 0; }
+  template <typename T> static bool local_eq(T x, T y) { return x == y; }
+  template <typename T> static bool local_ne(T x, T y) { return x != y; }
+  template <typename T> static bool local_lt(T x, T y) { return x < y; }
+  template <typename T> static bool local_le(T x, T y) { return x <= y; }
+  template <typename T> static bool local_gt(T x, T y) { return x > y; }
+  template <typename T> static bool local_ge(T x, T y) { return x >= y; }
+  template <typename T> static boolvec_t local_veq(T x, T y) { return x == y; }
+  template <typename T> static boolvec_t local_vne(T x, T y) { return x != y; }
+  template <typename T> static boolvec_t local_vlt(T x, T y) { return x < y; }
+  template <typename T> static boolvec_t local_vle(T x, T y) { return x <= y; }
+  template <typename T> static boolvec_t local_vgt(T x, T y) { return x > y; }
+  template <typename T> static boolvec_t local_vge(T x, T y) { return x >= y; }
+  // Test the integer vector type: broadcast construction, iota, set_elt,
+  // conversions to and from bool, unary and binary arithmetic/bitwise
+  // operators, shifts (scalar and per-lane counts) and comparisons. Operands
+  // are first every pair from a table of boundary values, then imax random
+  // draws from a small range, then imax random draws from a wide range.
+  static void test_int() {
+    cout << "   testing integer operations...\n" << flush;
+
+    // Broadcast constants and iota
+    intvec_t i0 = intvec_t(I(0));
+    intvec_t i1 = intvec_t(I(1));
+    intvec_t iiota = intvec_t::iota();
+    for (int i = 0; i < realvec_t::size; ++i) {
+      check_int("0", 0, i0[i]);
+      check_int("1", 1, i1[i]);
+      check_int("iota", i, iiota[i]);
+    }
+
+    // set_elt: overwrite one more lane per outer iteration, then verify that
+    // lanes 0..n hold the new value and the remaining lanes the old one.
+    // The lanes are compared as integers (check_int) rather than through
+    // check_bool, which would accept any non-zero lane as a 1.
+    i0 = intvec_t(I(1));
+    i1 = intvec_t(I(0));
+    for (int n = 0; n < realvec_t::size; ++n) {
+      i0.set_elt(n, 0);
+      i1.set_elt(n, 1);
+      for (int i = 0; i < realvec_t::size; ++i) {
+        check_int("set_elt", i <= n ? 0 : 1, i0[i]);
+        check_int("set_elt", i <= n ? 1 : 0, i1[i]);
+      }
+    }
+
+    // Boundary values: small numbers and the extremes of int_t
+    const int_t int_min = std::numeric_limits<int_t>::min();
+    const int_t int_max = std::numeric_limits<int_t>::max();
+    const int_t values[] = {
+        0,           1,       2,           3,           -1,
+        -2,          -3,      int_min,     int_min + 1, int_min + 2,
+        int_min + 3, int_max, int_max - 1, int_max - 2, int_max - 3,
+    };
+    const int nvalues = sizeof values / sizeof *values;
+
+    for (int i = 0; i < nvalues * nvalues + 2 * imax; ++i) {
+      intvec_t x, y;
+      if (i < nvalues * nvalues) {
+        // Every ordered pair of boundary values, broadcast to all lanes
+        x = values[i % nvalues];
+        y = values[i / nvalues];
+      } else if (i < nvalues * nvalues + imax) {
+        // Small random values
+        x = random(I(-100), I(+100));
+        y = random(I(-100), I(+100));
+      } else {
+        // Random values drawn from half the int_t range
+        x = random(int_min / 2, int_max / 2);
+        y = random(int_min / 2, int_max / 2);
+      }
+      boolvec_t b = convert_bool(random(I(0), I(1)));
+
+      check_bool<IV>("convert_bool(int)", local_convert_bool,
+                     vecmathlib::convert_bool, x);
+      check_int<BV>("convert_int(bool)", local_convert_int,
+                    vecmathlib::convert_int, b);
+
+      check_int<IV>("+", local_pos, local_pos, x);
+      check_int<IV>("-", local_neg, local_neg, x);
+      check_int<IV>("~", local_not, local_not, x);
+
+      check_int<IV, IV>("+", local_add, local_add, x, y);
+      check_int<IV, IV>("-", local_sub, local_sub, x, y);
+      check_int<IV, IV>("&", local_and, local_and, x, y);
+      check_int<IV, IV>("|", local_or, local_or, x, y);
+      check_int<IV, IV>("^", local_xor, local_xor, x, y);
+
+      // Shifts: counts are masked to 0..bits-1 so they stay in range
+      const int_t bits = 8 * sizeof(int_t);
+      check_int<IV, I>("lsr", local_lsr, vecmathlib::lsr, x, y[0] & (bits - 1));
+      check_int<IV, I>(">>", local_sr, local_srs, x, y[0] & (bits - 1));
+      check_int<IV, I>("<<", local_sl, local_sls, x, y[0] & (bits - 1));
+      check_int<IV, IV>("lsr", local_lsr, vecmathlib::lsr, x, y & IV(bits - 1));
+      check_int<IV, IV>(">>", local_sr, local_sr, x, y & IV(bits - 1));
+      check_int<IV, IV>("<<", local_sl, local_sl, x, y & IV(bits - 1));
+
+      check_bool<IV>("isignbit", local_isignbit, vecmathlib::isignbit, x);
+      check_bool<IV, IV>("==", local_eq, local_veq, x, y);
+      check_bool<IV, IV>("!=", local_ne, local_vne, x, y);
+      check_bool<IV, IV>("<", local_lt, local_vlt, x, y);
+      check_bool<IV, IV>("<=", local_le, local_vle, x, y);
+      check_bool<IV, IV>(">", local_gt, local_vgt, x, y);
+      check_bool<IV, IV>(">=", local_ge, local_vge, x, y);
+    }
+  }
+
+  // Test basic properties of the real vector type: broadcast construction,
+  // set_elt, the optimisation barrier, and the rounding of ties in addition.
+  static void test_real() {
+    cout << "   testing real operations...\n" << flush;
+
+    // Broadcast constants
+    realvec_t r0 = realvec_t(0.0);
+    realvec_t r1 = realvec_t(1.0);
+    for (int i = 0; i < realvec_t::size; ++i) {
+      check_real("0.0", R(0.0), r0[i]);
+      check_real("1.0", R(1.0), r1[i]);
+    }
+
+    // set_elt: overwrite one more lane per outer iteration, then verify that
+    // lanes 0..n hold the new value and the remaining lanes the old one.
+    // The lanes are compared exactly as reals (check_real) rather than
+    // through check_bool, which would accept any non-zero lane as a 1.0.
+    r0 = realvec_t(1.0);
+    r1 = realvec_t(0.0);
+    for (int n = 0; n < realvec_t::size; ++n) {
+      r0.set_elt(n, R(0.0));
+      r1.set_elt(n, R(1.0));
+      for (int i = 0; i < realvec_t::size; ++i) {
+        check_real("set_elt", i <= n ? R(0.0) : R(1.0), r0[i]);
+        check_real("set_elt", i <= n ? R(1.0) : R(0.0), r1[i]);
+      }
+    }
+
+    // barrier: r1 is all ones at this point. Adding and then subtracting a
+    // huge value must lose the 1.0; the expected result 0.0 is only obtained
+    // if the two operations are not folded together.
+    realvec_t rcancel = r1;
+    rcancel += RV(R(FP::max() / 2));
+    rcancel.barrier();
+    rcancel -= RV(R(FP::max() / 2));
+    check_real("barrier", R(0.0), rcancel[0]);
+
+    // rounding (break ties to even, or break ties away from zero?)
+    // The expected values below are the ties-to-even results.
+    realvec_t rbase = RV(R(1.0));
+    rbase += RV(FP::epsilon() / 2);
+    check_real("flt_rounds", R(1.0), rbase[0]);
+    rbase = RV(R(1.0) + FP::epsilon());
+    rbase += RV(FP::epsilon() / 2);
+    check_real("flt_rounds", R(1.0) + 2 * FP::epsilon(), rbase[0]);
+  }
+
+  // Scalar reference for bitifthen: each result bit comes from y where the
+  // corresponding bit of x is set, and from z where it is clear.
+  static int_t local_bitifthen(int_t x, int_t y, int_t z) {
+    const int_t from_y = x & y;
+    const int_t from_z = ~x & z;
+    return from_y | from_z;
+  }
+  // Scalar reference for clz: the number of leading zero bits of x, i.e. the
+  // number of bits above the highest set bit. Returns the full bit width when
+  // x is zero.
+  static int_t local_clz(int_t x) {
+    const int bits = CHAR_BIT * sizeof(x);
+    // Probe the bits in the unsigned type: building the mask for the top bit
+    // would otherwise shift a 1 into the sign bit of a signed integer.
+    const uint_t ux = U(x);
+    int res = 0;
+    // Walk down from the most significant bit until a set bit is found
+    for (; res < bits; ++res) {
+      if (ux & (U(1) << (bits - res - 1)))
+        break;
+    }
+    return res;
+  }
+  // Scalar references for integer max and min. Non-template wrappers around
+  // std::max / std::min so that they can be passed as plain function
+  // arguments to check_int.
+  static int_t local_max(int_t x, int_t y) { return std::max(x, y); }
+  static int_t local_min(int_t x, int_t y) { return std::min(x, y); }
+  // Scalar reference for popcount: the number of set bits in x.
+  static int_t local_popcount(int_t x) {
+    const int bits = CHAR_BIT * sizeof(x);
+    // Probe the bits in the unsigned type: building the mask for the top bit
+    // would otherwise shift a 1 into the sign bit of a signed integer.
+    const uint_t ux = U(x);
+    int res = 0;
+    for (int d = 0; d < bits; ++d) {
+      if (ux & (U(1) << d))
+        ++res;
+    }
+    return res;
+  }
+  // Scalar reference for rotate: rotate the bits of x left by n positions.
+  // Only the low bits of n are used (n modulo the bit width), so negative
+  // counts rotate right.
+  static int_t local_rotate(int_t x, int_t n) {
+    const int_t mask = CHAR_BIT * sizeof(int_t) - 1;
+    // Left-rotation count in 0..mask
+    const int_t lshift = n & mask;
+    // Complementary right-shift count, equal to (-n & mask) but computed
+    // without negating n, which would overflow for the minimum value
+    const int_t rshift = (mask + 1 - lshift) & mask;
+    // Shift the unsigned representation: left-shifting a negative signed
+    // value is not well defined
+    const uint_t ux = U(x);
+    const uint_t left = ux << U(lshift);
+    const uint_t right = ux >> U(rshift);
+    return I(left | right);
+  }
+  // Test the integer functions abs, bitifthen, clz, max, min, popcount and
+  // rotate on imax sets of random operands, comparing each vector function
+  // with its scalar reference through check_int.
+  static void test_abs() {
+    cout << "   testing abs bitifthen clz isignbit max min popcount rotate...\n"
+         << flush;
+
+    for (int i = 0; i < imax; ++i) {
+      const intvec_t x = random(I(-1000000), I(+1000000));
+      const intvec_t y = random(I(-1000000), I(+1000000));
+      const intvec_t z = random(I(-1000000), I(+1000000));
+
+      check_int<IV>("abs", std::abs, vecmathlib::abs, x);
+      check_int<IV, IV, IV>("bitifthen", local_bitifthen, vecmathlib::bitifthen,
+                            x, y, z);
+      check_int<IV>("clz", local_clz, vecmathlib::clz, x);
+      check_int<IV, IV>("max", local_max, vecmathlib::max, x, y);
+      check_int<IV, IV>("min", local_min, vecmathlib::min, x, y);
+      check_int<IV>("popcount", local_popcount, vecmathlib::popcount, x);
+      // NOTE(review): this call passes the scalar y[0] but names <IV, IV>,
+      // unlike the scalar-count shift tests in test_int which use <IV, I>.
+      // Presumably y[0] is converted to a vector here, so the scalar-count
+      // rotate may not be what is exercised -- confirm.
+      check_int<IV, IV>("rotate", local_rotate, vecmathlib::rotate, x, y[0]);
+      check_int<IV, IV>("rotate", local_rotate, vecmathlib::rotate, x, y);
+    }
+  }
+
+  // Change signature: "int" -> "int_t"
+  // First half of frexp: the fraction returned by the scalar frexp; the
+  // exponent is discarded.
+  static real_t local_frexp0(real_t x) {
+    int exponent;
+    const real_t fraction = vml_std::frexp(x, &exponent);
+    return fraction;
+  }
+  // Second half of frexp: the exponent produced by the scalar frexp, as
+  // int_t. Infinities map to the largest int_t and NaNs to the smallest
+  // int_t; frexp is not consulted for them.
+  static int_t local_frexp1(real_t x) {
+    typedef std::numeric_limits<int_t> limits;
+    if (vml_std::isinf(x)) {
+      return limits::max();
+    }
+    if (vml_std::isnan(x)) {
+      return limits::min();
+    }
+    int exponent;
+    vml_std::frexp(x, &exponent);
+    return exponent;
+  }
+  // Vector counterpart of local_frexp0: the value returned by
+  // vecmathlib::frexp; the exponent vector is discarded.
+  static realvec_t local_vfrexp0(realvec_t x) {
+    intvec_t exponents;
+    const realvec_t fractions = vecmathlib::frexp(x, &exponents);
+    return fractions;
+  }
+  // Vector counterpart of local_frexp1: the exponent vector written by
+  // vecmathlib::frexp; the returned value is discarded.
+  static intvec_t local_vfrexp1(realvec_t x) {
+    intvec_t exponents;
+    vecmathlib::frexp(x, &exponents);
+    return exponents;
+  }
+  // Scalar reference for ilogb with fixed results for the special cases:
+  // zero and NaN give the smallest int_t, infinities give the largest int_t;
+  // every other value is passed to the scalar ilogb.
+  static int_t local_ilogb(real_t x) {
+    typedef std::numeric_limits<int_t> limits;
+    if (x == R(0.0) || vml_std::isnan(x)) {
+      return limits::min();
+    }
+    if (vml_std::isinf(x)) {
+      return limits::max();
+    }
+    return vml_std::ilogb(x);
+  }
+  // Scalar reference for ldexp with the exponent given as int_t
+  static real_t local_ldexp(real_t x, int_t n) { return ldexp(x, n); }
+  // Scalar reference for mad, written as a separate multiply and add
+  static real_t local_mad(real_t x, real_t y, real_t z) { return x * y + z; }
+  // Test the elementary real operations: unary and binary arithmetic, the
+  // reductions sum/prod/maxval/minval, comparisons, and the functions named in
+  // the banner below. Operands are taken from a table of special values
+  // (combined with random values) and then from imax purely random draws.
+  // Each vector result is compared with a scalar reference through the
+  // check_real / check_int / check_bool helpers.
+  static void test_fabs() {
+    cout << "   testing + - + - * == != < <= > >= copysign fabs fdim fma fmax "
+            "fmin frexp ilogb isfinite isinf isnan isnormal ldexp mad "
+            "nextafter signbit...\n"
+         << flush;
+
+    const real_t eps = FP::epsilon();
+    // Integer limits converted to real_t, used as special values below
+    const real_t int_min = R(std::numeric_limits<int_t>::min());
+    const real_t int_max = R(std::numeric_limits<int_t>::max());
+    const real_t uint_min = R(std::numeric_limits<uint_t>::min());
+    const real_t uint_max = R(std::numeric_limits<uint_t>::max());
+    // Special values: numbers around 0 and +-1 (exact, +eps, -eps), values
+    // near FP::min() and FP::max(), infinities and NaN where the
+    // corresponding VML_HAVE_* macro is defined, and values at and next to
+    // the integer limits
+    const real_t values[] = {
+        R(+0.0),
+        R(+0.1),
+        R(+0.9),
+        R(+1.0),
+        R(+1.1),
+        R(-0.0),
+        R(-0.1),
+        R(-0.9),
+        R(-1.0),
+        R(-1.1),
+        R(+0.0) + eps,
+        R(+0.1) + eps,
+        R(+0.9) + eps,
+        R(+1.0) + eps,
+        R(+1.1) + eps,
+        R(-0.0) + eps,
+        R(-0.1) + eps,
+        R(-0.9) + eps,
+        R(-1.0) + eps,
+        R(-1.1) + eps,
+        R(+0.0) - eps,
+        R(+0.1) - eps,
+        R(+0.9) - eps,
+        R(+1.0) - eps,
+        R(+1.1) - eps,
+        R(-0.0) - eps,
+        R(-0.1) - eps,
+        R(-0.9) - eps,
+        R(-1.0) - eps,
+        R(-1.1) - eps,
+#ifdef VML_HAVE_DENORMALS
+        +FP::min(),
+        +FP::min() * (R(1.0) + eps),
+        +FP::min() * R(2.0),
+        -FP::min(),
+        -FP::min() * (R(1.0) + eps),
+        -FP::min() * R(2.0),
+#endif
+        +FP::max(),
+        +FP::max() * (R(1.0) - eps),
+        +FP::max() * (R(1.0) - R(2.0) * eps),
+        -FP::max(),
+        -FP::max() * (R(1.0) - eps),
+        -FP::max() * (R(1.0) - R(2.0) * eps),
+        +R(0.5) * FP::max(),
+        +R(0.5) * FP::max() * (R(1.0) + eps),
+        -R(0.5) * FP::max(),
+        -R(0.5) * FP::max() * (R(1.0) + eps),
+#ifdef VML_HAVE_INF
+        +R(1.0 / 0.0), // +FP::infinity()
+        -R(1.0 / 0.0), // -FP::infinity()
+#endif
+#ifdef VML_HAVE_NAN
+        R(0.0 / 0.0), // FP::quiet_NaN()
+#endif
+        +int_min,
+        +int_max,
+        +uint_min,
+        +uint_max,
+        -int_min,
+        -int_max,
+        -uint_min,
+        -uint_max,
+        +int_min + R(0.1),
+        +int_max + R(0.1),
+        +uint_min + R(0.1),
+        +uint_max + R(0.1),
+        -int_min + R(0.1),
+        -int_max + R(0.1),
+        -uint_min + R(0.1),
+        -uint_max + R(0.1),
+        +int_min - R(0.1),
+        +int_max - R(0.1),
+        +uint_min - R(0.1),
+        +uint_max - R(0.1),
+        -int_min - R(0.1),
+        -int_max - R(0.1),
+        -uint_min - R(0.1),
+        -uint_max - R(0.1),
+        +int_min + R(1.0),
+        +int_max + R(1.0),
+        +uint_min + R(1.0),
+        +uint_max + R(1.0),
+        -int_min + R(1.0),
+        -int_max + R(1.0),
+        -uint_min + R(1.0),
+        -uint_max + R(1.0),
+        +int_min - R(1.0),
+        +int_max - R(1.0),
+        +uint_min - R(1.0),
+        +uint_max - R(1.0),
+        -int_min - R(1.0),
+        -int_max - R(1.0),
+        -uint_min - R(1.0),
+        -uint_max - R(1.0),
+        -R(443.9999425),
+    };
+    const int nvalues = sizeof values / sizeof *values;
+
+    // The first 8 * nvalues iterations visit each special value eight times:
+    // bits 0, 1 and 2 of i decide whether x, y and z respectively are the
+    // special value values[i / 8] broadcast to all lanes, or a random vector.
+    // The remaining imax iterations use random vectors only.
+    for (int i = 0; i < 8 * nvalues + imax; ++i) {
+      const realvec_t x = i < 8 * nvalues && i & 1 ? RV(values[i / 8])
+                                                   : random(R(-10.0), R(+10.0));
+      const realvec_t y = i < 8 * nvalues && i & 2 ? RV(values[i / 8])
+                                                   : random(R(-10.0), R(+10.0));
+      const realvec_t z = i < 8 * nvalues && i & 4 ? RV(values[i / 8])
+                                                   : random(R(-10.0), R(+10.0));
+      const intvec_t n = random(int_t(-10), int_t(+10));
+
+      // Arithmetic operators are required to match exactly (accuracy 0)
+      check_real<RV>("+", local_pos, local_pos, x, R(0.0));
+      check_real<RV>("-", local_neg, local_neg, x, R(0.0));
+
+      check_real<RV, RV>("+", local_add, local_add, x, y, R(0.0));
+      check_real<RV, RV>("-", local_sub, local_sub, x, y, R(0.0));
+      check_real<RV, RV>("*", local_mul, local_mul, x, y, R(0.0));
+
+      // Reductions: the reference folds the lanes of x in index order. The
+      // inner loop variable i shadows the outer one inside each block.
+      {
+        real_t rstd = x[0];
+        for (int i = 1; i < realvec_t::size; ++i) {
+          rstd += x[i];
+        }
+        real_t rvml = sum(x);
+        check_real("sum", rstd, rvml, x, accuracy());
+      }
+      {
+        real_t rstd = x[0];
+        for (int i = 1; i < realvec_t::size; ++i) {
+          rstd *= x[i];
+        }
+        real_t rvml = prod(x);
+        check_real("prod", rstd, rvml, x, accuracy());
+      }
+      {
+        real_t rstd = x[0];
+        for (int i = 1; i < realvec_t::size; ++i) {
+          rstd = vml_std::fmax(rstd, x[i]);
+        }
+        real_t rvml = vecmathlib::maxval(x);
+        check_real("maxval", rstd, rvml, x, R(0.0));
+      }
+      {
+        real_t rstd = x[0];
+        for (int i = 1; i < realvec_t::size; ++i) {
+          rstd = vml_std::fmin(rstd, x[i]);
+        }
+        real_t rvml = vecmathlib::minval(x);
+        check_real("minval", rstd, rvml, x, R(0.0));
+      }
+
+      // Comparisons
+      check_bool<RV, RV>("==", local_eq, local_veq, x, y);
+      check_bool<RV, RV>("!=", local_ne, local_vne, x, y);
+      check_bool<RV, RV>("<", local_lt, local_vlt, x, y);
+      check_bool<RV, RV>("<=", local_le, local_vle, x, y);
+      check_bool<RV, RV>(">", local_gt, local_vgt, x, y);
+      check_bool<RV, RV>(">=", local_ge, local_vge, x, y);
+
+      // Functions; an accuracy of 0.0 demands an exact match
+      check_real<RV, RV>("copysign", vml_std::copysign, vecmathlib::copysign, x,
+                         y, 0.0);
+      check_real<RV>("fabs", vml_std::fabs, vecmathlib::fabs, x, 0.0);
+      check_real<RV, RV>("fdim", vml_std::fdim, vecmathlib::fdim, x, y,
+                         accuracy());
+      check_real<RV, RV, RV>("fma", vml_std::fma, vecmathlib::fma, x, y, z,
+                             R(10.0) * accuracy());
+      check_real<RV, RV>("fmax", vml_std::fmax, vecmathlib::fmax, x, y, 0.0);
+      check_real<RV, RV>("fmin", vml_std::fmin, vecmathlib::fmin, x, y, 0.0);
+      // frexp is checked in two halves: returned value and exponent
+      check_real<RV>("frexp0", local_frexp0, local_vfrexp0, x, 0.0);
+      check_int<RV>("frexp1", local_frexp1, local_vfrexp1, x);
+      check_int<RV>("ilogb", local_ilogb,
+                    (intvec_t (*)(realvec_t))vecmathlib::ilogb, x);
+      // Classification functions are only checked when the corresponding
+      // VML_HAVE_* macro is defined
+#if defined VML_HAVE_INF || defined VML_HAVE_NAN
+      check_bool<RV>("isfinite", vml_std::isfinite, vecmathlib::isfinite, x);
+#endif
+#ifdef VML_HAVE_INF
+      check_bool<RV>("isinf", vml_std::isinf, vecmathlib::isinf, x);
+#endif
+#ifdef VML_HAVE_NAN
+      check_bool<RV>("isnan", vml_std::isnan, vecmathlib::isnan, x);
+#endif
+#ifdef VML_HAVE_DENORMALS
+      check_bool<RV>("isnormal", vml_std::isnormal, vecmathlib::isnormal, x);
+#endif
+      // ldexp with a scalar exponent (lane 0 of n) and with per-lane exponents
+      check_real<RV, I>("ldexp", local_ldexp, vecmathlib::ldexp, x, n[0], 0.0);
+      check_real<RV, IV>("ldexp", local_ldexp, vecmathlib::ldexp, x, n, 0.0);
+      check_real<RV, RV, RV>("mad", local_mad, vecmathlib::mad, x, y, z,
+                             R(10.0) * accuracy());
+      check_real<RV, RV>("nextafter", vml_std::nextafter, vecmathlib::nextafter,
+                         x, y, 0.0);
+      check_bool<RV>("signbit", vml_std::signbit, vecmathlib::signbit, x);
+    }
+  }
+
+  static void test_convert() { // conversion and rounding function checks
+    cout << "   testing ceil convert_float convert_int floor rint round "
+            "trunc...\n"
+         << flush;
+
+    const real_t eps = FP::epsilon();
+    const real_t int_min = R(std::numeric_limits<int_t>::min());
+    const real_t int_max = R(std::numeric_limits<int_t>::max());
+    const real_t uint_min = R(std::numeric_limits<uint_t>::min());
+    const real_t uint_max = R(std::numeric_limits<uint_t>::max());
+    const real_t mantissa_max = (U(1) << (FP::mantissa_bits + 1)) - U(1); // 2^(mantissa_bits+1) - 1
+    const real_t real_max = (((U(1) << (FP::mantissa_bits + 1)) - U(1))
+                             << (FP::exponent_bits - 1)) +
+                            (U(1) << (FP::exponent_bits - 1)) - U(1);
+    const real_t values[] = { // hand-picked inputs used before the random ones
+        R(+0.0),
+        R(+0.1),
+        R(+0.9),
+        R(+1.0),
+        R(+1.1),
+        R(-0.0),
+        R(-0.1),
+        R(-0.9),
+        R(-1.0),
+        R(-1.1),
+        R(+0.0) + eps, // the same ten values shifted up by eps
+        R(+0.1) + eps,
+        R(+0.9) + eps,
+        R(+1.0) + eps,
+        R(+1.1) + eps,
+        R(-0.0) + eps,
+        R(-0.1) + eps,
+        R(-0.9) + eps,
+        R(-1.0) + eps,
+        R(-1.1) + eps,
+        R(+0.0) - eps, // the same ten values shifted down by eps
+        R(+0.1) - eps,
+        R(+0.9) - eps,
+        R(+1.0) - eps,
+        R(+1.1) - eps,
+        R(-0.0) - eps,
+        R(-0.1) - eps,
+        R(-0.9) - eps,
+        R(-1.0) - eps,
+        R(-1.1) - eps,
+#ifdef VML_HAVE_DENORMALS
+        +FP::min(),
+        +FP::min() * (R(1.0) + eps),
+        +FP::min() * R(2.0),
+        -FP::min(),
+        -FP::min() * (R(1.0) + eps),
+        -FP::min() * R(2.0),
+#endif
+        +FP::max(),
+        +FP::max() * (R(1.0) - eps),
+        +FP::max() * (R(1.0) - R(2.0) * eps),
+        -FP::max(),
+        -FP::max() * (R(1.0) - eps),
+        -FP::max() * (R(1.0) - R(2.0) * eps),
+        +R(0.5) * FP::max(),
+        +R(0.5) * FP::max() * (R(1.0) + eps),
+        -R(0.5) * FP::max(),
+        -R(0.5) * FP::max() * (R(1.0) + eps),
+#ifdef VML_HAVE_INF
+        +R(1.0 / 0.0), // +FP::infinity()
+        -R(1.0 / 0.0), // -FP::infinity()
+#endif
+#ifdef VML_HAVE_NAN
+        R(0.0 / 0.0), // FP::quiet_NaN()
+#endif
+        +int_min, // integer-type limits, both signs
+        +int_max,
+        +uint_min,
+        +uint_max,
+        -int_min,
+        -int_max,
+        -uint_min,
+        -uint_max,
+        +int_min + R(0.1), // limits offset by +0.1
+        +int_max + R(0.1),
+        +uint_min + R(0.1),
+        +uint_max + R(0.1),
+        -int_min + R(0.1),
+        -int_max + R(0.1),
+        -uint_min + R(0.1),
+        -uint_max + R(0.1),
+        +int_min - R(0.1), // limits offset by -0.1
+        +int_max - R(0.1),
+        +uint_min - R(0.1),
+        +uint_max - R(0.1),
+        -int_min - R(0.1),
+        -int_max - R(0.1),
+        -uint_min - R(0.1),
+        -uint_max - R(0.1),
+        +int_min + R(1.0), // limits offset by +1
+        +int_max + R(1.0),
+        +uint_min + R(1.0),
+        +uint_max + R(1.0),
+        -int_min + R(1.0),
+        -int_max + R(1.0),
+        -uint_min + R(1.0),
+        -uint_max + R(1.0),
+        +int_min - R(1.0), // limits offset by -1
+        +int_max - R(1.0),
+        +uint_min - R(1.0),
+        +uint_max - R(1.0),
+        -int_min - R(1.0),
+        -int_max - R(1.0),
+        -uint_min - R(1.0),
+        -uint_max - R(1.0),
+        +mantissa_max, // mantissa_max and its neighbours at distance 1
+        +mantissa_max - R(1.0),
+        +mantissa_max + R(1.0),
+        -mantissa_max,
+        -mantissa_max - R(1.0),
+        -mantissa_max + R(1.0),
+        +real_max, // real_max and its neighbours at distance 1
+        +real_max - R(1.0),
+        +real_max + R(1.0),
+        -real_max,
+        -real_max - R(1.0),
+        -real_max + R(1.0),
+        -R(443.9999425),
+    };
+    const int nvalues = sizeof values / sizeof *values; // entry count of values[]
+
+    for (int i = 0; i < nvalues + imax; ++i) { // table entries first, then imax random rounds
+      const realvec_t x =
+          i < nvalues ? RV(values[i]) : random(R(-1.0e+10), R(+1.0e+10));
+      const intvec_t n1 = random(int_t(-100), int_t(+100));
+      // const intvec_t n2 = random(int_t(-1000000000), int_t(+1000000000));
+      const intvec_t n2 =
+          random(std::numeric_limits<int_t>::min() / 2, // avoid overflow
+                 std::numeric_limits<int_t>::max() / 2);
+      const realvec_t fn1 = vecmathlib::convert_float(n1);
+      const realvec_t fn2 = vecmathlib::convert_float(n2);
+      const realvec_t fn1h = vecmathlib::convert_float(n1) * RV(0.25); // n1 scaled by 0.25
+      const realvec_t fn2h = vecmathlib::convert_float(n2) * RV(0.25); // n2 scaled by 0.25
+      check_real<IV>("convert_float", FP::convert_float,
+                     vecmathlib::convert_float, n1, R(0.0));
+      check_real<IV>("convert_float", FP::convert_float,
+                     vecmathlib::convert_float, n2, R(0.0));
+      // Note: RV(int_max) > int_max due to rounding
+      if (all(x >= RV(int_min) && x < RV(int_max))) { // convert_int only checked when every lane is in this range
+        check_int<RV>("convert_int", FP::convert_int, vecmathlib::convert_int,
+                      x);
+      }
+      // TODO: These should all have accuracy R(0.0) instead!
+      check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, x, accuracy());
+      check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn1, accuracy());
+      check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn2, accuracy());
+      check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn1h, accuracy());
+      check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn2h, accuracy());
+      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, x, accuracy());
+      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1,
+                     accuracy());
+      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2,
+                     accuracy());
+      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1h,
+                     accuracy());
+      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2h,
+                     accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, x,
+      // accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1,
+      // accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2,
+      // accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1h,
+      // accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2h,
+      // accuracy());
+      check_real<RV>("rint", vml_std::rint, vecmathlib::rint, x, accuracy());
+      check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn1, accuracy());
+      check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn2, accuracy());
+      check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn1h, accuracy());
+      check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn2h, accuracy());
+      check_real<RV>("round", vml_std::round, vecmathlib::round, x, accuracy());
+      check_real<RV>("round", vml_std::round, vecmathlib::round, fn1,
+                     accuracy());
+      check_real<RV>("round", vml_std::round, vecmathlib::round, fn2,
+                     accuracy());
+      check_real<RV>("round", vml_std::round, vecmathlib::round, fn1h,
+                     accuracy());
+      check_real<RV>("round", vml_std::round, vecmathlib::round, fn2h,
+                     accuracy());
+      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, x, accuracy());
+      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1,
+                     accuracy());
+      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2,
+                     accuracy());
+      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1h,
+                     accuracy());
+      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2h,
+                     accuracy());
+    }
+  }
+
+  static void test_asin() { // asin/acos on [-1, 1]; atan/atan2 on [-100, 100]
+    cout << "   testing asin acos atan atan2...\n" << flush;
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(-1.0), R(+1.0));
+      check_real<RV>("asin", vml_std::asin, vecmathlib::asin, a, accuracy(4));
+      check_real<RV>("acos", vml_std::acos, vecmathlib::acos, a, accuracy(4));
+    }
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(-100.0), R(+100.0));
+      const realvec_t b = random(R(-100.0), R(+100.0));
+      check_real<RV>("atan", vml_std::atan, vecmathlib::atan, a, accuracy(5));
+      check_real<RV, RV>("atan2", vml_std::atan2, vecmathlib::atan2, a, b,
+                         accuracy(6));
+    }
+  }
+
+  static void test_asinh() { // each function gets its own random input range
+    cout << "   testing asinh acosh atanh...\n" << flush;
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(-1000.0), R(+1000.0));
+      check_real<RV>("asinh", vml_std::asinh, vecmathlib::asinh, a,
+                     accuracy(4));
+    }
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(1.0), R(1000.0));
+      check_real<RV>("acosh", vml_std::acosh, vecmathlib::acosh, a,
+                     accuracy(4));
+    }
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(-1.0), R(+1.0));
+      check_real<RV>("atanh", vml_std::atanh, vecmathlib::atanh, a,
+                     accuracy(5));
+    }
+  }
+
+  static real_t local_exp10(real_t x) { return pow(R(10.0), x); } // 10 raised to x; the scalar function handed to check_real for "exp10"
+  static void test_exp() { // exponential family on random inputs in [-100, 100]
+    cout << "   testing exp exp10 exp2 expm1...\n" << flush;
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(-100.0), R(+100.0));
+      check_real<RV>("exp", vml_std::exp, vecmathlib::exp, a, accuracy(3));
+      check_real<RV>("exp10", local_exp10, vecmathlib::exp10, a, accuracy(3));
+      check_real<RV>("exp2", vml_std::exp2, vecmathlib::exp2, a, accuracy(3));
+      check_real<RV>("expm1", vml_std::expm1, vecmathlib::expm1, a,
+                     accuracy(3));
+    }
+  }
+
+  static void test_log() { // logarithm family on random inputs in [1e-10, 1e+10]
+    cout << "   testing log log10 log1p log2...\n" << flush;
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(1.0e-10), R(1.0e+10));
+      check_real<RV>("log", vml_std::log, vecmathlib::log, a, accuracy(3));
+      check_real<RV>("log10", vml_std::log10, vecmathlib::log10, a,
+                     accuracy(3));
+      check_real<RV>("log1p", vml_std::log1p, vecmathlib::log1p, a,
+                     accuracy(2));
+      check_real<RV>("log2", vml_std::log2, vecmathlib::log2, a, accuracy(3));
+    }
+  }
+
+  static void test_pow() { // pow with zero base, zero exponent, real and integer exponents
+    cout << "   testing pow...\n" << flush;
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t b = random(R(0.001), R(1000.0));
+      const realvec_t e = random(R(-10.0), R(+10.0));
+      const realvec_t ea = fabs(e);
+      const intvec_t k = random(I(-10), I(+10));
+      const realvec_t fk = vecmathlib::convert_float(k);
+      check_real<RV, RV>("pow(0,y)", vml_std::pow, vecmathlib::pow, RV(0.0), ea,
+                         accuracy(16));
+      check_real<RV, RV>("pow(x,0)", vml_std::pow, vecmathlib::pow, b, RV(0.0),
+                         accuracy(16));
+      // log is checked again here on the same base values
+      check_real<RV>("log(x)", vml_std::log, vecmathlib::log, b, accuracy(3));
+      check_real<RV, RV>("pow(x,y)", vml_std::pow, vecmathlib::pow, b, e,
+                         accuracy(16));
+      check_real<RV, RV>("pow(-x,n)", vml_std::pow, vecmathlib::pow, -b, fk,
+                         accuracy(16));
+    }
+  }
+
+  static real_t local_rcp(real_t x) { return R(1.0) / x; } // 1/x; the scalar function handed to check_real for "rcp"
+  static void test_rcp() { // division, reciprocal, fmod and remainder checks
+    cout << "   testing / fmod rcp remainder...\n" << flush;
+    for (int i = 0; i < imax; ++i) {
+      const realvec_t x = random(R(-100.0), R(+100.0));
+      const realvec_t y = random(R(-100.0), R(+100.0));
+      const intvec_t n = random(I(-100), I(+100));
+      const intvec_t m = random(I(-100), I(+100));
+      const realvec_t fn = vecmathlib::convert_float(n); // integer-valued dividend
+      const realvec_t fm = vecmathlib::convert_float(    // integer-valued divisor
+          m + vecmathlib::convert_int(m == intvec_t(I(0)))); // lanes equal to 0 get the converted mask added; presumably keeps the divisor nonzero
+      check_real<RV, RV>("/", local_div, local_div, x, y, accuracy());
+      check_real<RV>("rcp", local_rcp, vecmathlib::rcp, x, accuracy());
+      check_real<RV, RV>("fmod(x,y)", vml_std::fmod, vecmathlib::fmod, x, y,
+                         R(2.0) * accuracy(), y); // R(2.0): same literal form as the remainder checks below
+      check_real<RV, RV>("fmod(x,m)", vml_std::fmod, vecmathlib::fmod, x, fm,
+                         R(2.0) * accuracy(), fm);
+      check_real<RV, RV>("fmod(n,y)", vml_std::fmod, vecmathlib::fmod, fn, y,
+                         R(2.0) * accuracy(), y);
+      check_real<RV, RV>("remainder(x,y)", vml_std::remainder,
+                         vecmathlib::remainder, x, y, R(2.0) * accuracy(), y);
+      check_real<RV, RV>("remainder(x,m)", vml_std::remainder,
+                         vecmathlib::remainder, x, fm, R(2.0) * accuracy(), fm);
+      check_real<RV, RV>("remainder(n,y)", vml_std::remainder,
+                         vecmathlib::remainder, fn, y, R(2.0) * accuracy(), y);
+    }
+  }
+
+  static void test_sin() { // sin/cos on [-10, 10]; tan on offsets from multiples of pi
+    cout << "   testing cos sin tan...\n" << flush;
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(-10.0), R(+10.0));
+      check_real<RV>("sin", vml_std::sin, vecmathlib::sin, a, accuracy(4));
+      check_real<RV>("cos", vml_std::cos, vecmathlib::cos, a, accuracy(4));
+    }
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t off = random(R(-1.55), R(+1.55));
+      const intvec_t k = random(I(-10), I(+10));
+      const realvec_t a = off + vecmathlib::convert_float(k) * RV(M_PI);
+      // accuracy of tan degrades close to pi/2, hence the wider tolerance
+      // (inherent to the function rather than to the implementation?)
+      check_real<RV>("tan", vml_std::tan, vecmathlib::tan, a,
+                     R(20.0) * accuracy(5));
+    }
+  }
+
+  static void test_sinh() { // hyperbolic functions on random inputs in [-10, 10]
+    cout << "   testing cosh sinh tanh...\n" << flush;
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(-10.0), R(+10.0));
+      check_real<RV>("sinh", vml_std::sinh, vecmathlib::sinh, a, accuracy(4));
+      check_real<RV>("cosh", vml_std::cosh, vecmathlib::cosh, a, accuracy(4));
+      check_real<RV>("tanh", vml_std::tanh, vecmathlib::tanh, a, accuracy(5));
+    }
+  }
+
+  static real_t local_rsqrt(real_t x) { return R(1.0) / sqrt(x); } // 1/sqrt(x); the scalar function handed to check_real for "rsqrt"
+  static void test_sqrt() { // roots on [1e-3, 1e+3]; hypot on signed inputs
+    cout << "   testing cbrt hypot rsqrt sqrt...\n" << flush;
+    for (int remaining = imax; remaining > 0; --remaining) {
+      const realvec_t a = random(R(1.0e-3), R(1.0e+3));
+      const realvec_t b = random(-R(1.0e+3), R(1.0e+3));
+      const realvec_t c = random(-R(1.0e+3), R(1.0e+3));
+      check_real<RV>("cbrt", vml_std::cbrt, vecmathlib::cbrt, a, accuracy());
+      check_real<RV, RV>("hypot", vml_std::hypot, vecmathlib::hypot, b, c,
+                         accuracy());
+      check_real<RV>("rsqrt", local_rsqrt, vecmathlib::rsqrt, a, accuracy());
+      check_real<RV>("sqrt", vml_std::sqrt, vecmathlib::sqrt, a, accuracy());
+    }
+  }
+
+  static void test() { // runs every test group for this vector type, in a fixed order
+    cout << "\n"
+         << "Testing math functions for type " << realvec_t::name() << ":\n";
+
+    test_bool();
+    test_int();
+    test_real();
+
+    test_mem();
+
+    // Test "basic" functions first
+    test_abs();
+    test_fabs();
+    test_convert();
+    test_rcp();
+    test_sqrt();
+    test_exp();
+    test_log();
+    test_pow();
+    test_sin();
+    test_sinh();
+    test_asin();
+    test_asinh();
+  }
+};
+
+int main(int argc, char **argv) { // runs the suite for each enabled vector type; argc/argv are not used
+  using namespace vecmathlib;
+
+  cout << "Testing math functions:\n"
+       << "[" VECMATHLIB_CONFIGURATION "]\n" << flush;
+
+  vecmathlib_test<realpseudovec<float, 1> >::test(); // float, sizes 1, 2 and 4: pseudovec and testvec always run
+#ifdef __clang__
+  vecmathlib_test<realbuiltinvec<float, 1> >::test(); // builtinvec variants are clang-only throughout
+#endif
+  vecmathlib_test<realtestvec<float, 1> >::test();
+#ifdef VECMATHLIB_HAVE_VEC_FLOAT_1
+  vecmathlib_test<realvec<float, 1> >::test();
+#endif
+  vecmathlib_test<realpseudovec<float, 2> >::test();
+#ifdef __clang__
+  vecmathlib_test<realbuiltinvec<float, 2> >::test();
+#endif
+  vecmathlib_test<realtestvec<float, 2> >::test();
+#ifdef VECMATHLIB_HAVE_VEC_FLOAT_2
+  vecmathlib_test<realvec<float, 2> >::test();
+#endif
+  vecmathlib_test<realpseudovec<float, 4> >::test();
+#ifdef __clang__
+  vecmathlib_test<realbuiltinvec<float, 4> >::test();
+#endif
+  vecmathlib_test<realtestvec<float, 4> >::test();
+#ifdef VECMATHLIB_HAVE_VEC_FLOAT_4
+  vecmathlib_test<realvec<float, 4> >::test();
+#endif
+#ifdef VECMATHLIB_HAVE_VEC_FLOAT_8
+  vecmathlib_test<realpseudovec<float, 8> >::test(); // sizes 8 and 16: every variant is gated on the HAVE_VEC macro
+#ifdef __clang__
+  vecmathlib_test<realbuiltinvec<float, 8> >::test();
+#endif
+  vecmathlib_test<realtestvec<float, 8> >::test();
+  vecmathlib_test<realvec<float, 8> >::test();
+#endif
+#ifdef VECMATHLIB_HAVE_VEC_FLOAT_16
+  vecmathlib_test<realpseudovec<float, 16> >::test();
+#ifdef __clang__
+  vecmathlib_test<realbuiltinvec<float, 16> >::test();
+#endif
+  vecmathlib_test<realtestvec<float, 16> >::test();
+  vecmathlib_test<realvec<float, 16> >::test();
+#endif
+
+  vecmathlib_test<realpseudovec<double, 1> >::test(); // double, sizes 1 and 2: pseudovec and testvec always run
+#ifdef __clang__
+  vecmathlib_test<realbuiltinvec<double, 1> >::test();
+#endif
+  vecmathlib_test<realtestvec<double, 1> >::test();
+#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1
+  vecmathlib_test<realvec<double, 1> >::test();
+#endif
+  vecmathlib_test<realpseudovec<double, 2> >::test();
+#ifdef __clang__
+  vecmathlib_test<realbuiltinvec<double, 2> >::test();
+#endif
+  vecmathlib_test<realtestvec<double, 2> >::test();
+#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2
+  vecmathlib_test<realvec<double, 2> >::test();
+#endif
+#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4
+  vecmathlib_test<realpseudovec<double, 4> >::test(); // sizes 4 and 8: every variant is gated on the HAVE_VEC macro
+#ifdef __clang__
+  vecmathlib_test<realbuiltinvec<double, 4> >::test();
+#endif
+  vecmathlib_test<realtestvec<double, 4> >::test();
+  vecmathlib_test<realvec<double, 4> >::test();
+#endif
+#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_8
+  vecmathlib_test<realpseudovec<double, 8> >::test();
+#ifdef __clang__
+  vecmathlib_test<realbuiltinvec<double, 8> >::test();
+#endif
+  vecmathlib_test<realtestvec<double, 8> >::test();
+  vecmathlib_test<realvec<double, 8> >::test();
+#endif
+
+  cout << "\n";
+  if (num_errors == 0) {
+    cout << "SUCCESS";
+  } else {
+    cout << "FAILURE";
+  }
+  cout << ": " << num_errors << " errors found\n" << flush;
+
+  return num_errors == 0 ? 0 : 1; // exit status 0 only when num_errors is zero
+}
diff --git a/lib/kernel/vecmathlib/vec_altivec_float4.h b/lib/kernel/vecmathlib/vec_altivec_float4.h
index 14e0308..55530b4 100644
--- a/lib/kernel/vecmathlib/vec_altivec_float4.h
+++ b/lib/kernel/vecmathlib/vec_altivec_float4.h
@@ -13,647 +13,566 @@
 #include <altivec.h>
 
 #if defined __clang__
-#  define __vector vector
-#  define __pixel pixel
-#  define __bool bool
+#define __vector vector
+#define __pixel pixel
+#define __bool bool
 #elif defined __gcc__
-#  undef vector
-#  undef pixel
-#  undef bool
+#undef vector
+#undef pixel
+#undef bool
 #elif defined __xlC__
-#  define __bool bool
+#define __bool bool
 #else
-#  error "Unknown compiler"
+#error "Unknown compiler"
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_4
-  template<> struct boolvec<float,4>;
-  template<> struct intvec<float,4>;
-  template<> struct realvec<float,4>;
-  
-  
-  
-  template<>
-  struct boolvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef __vector __bool int bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v((bvector_t)vec_splats(from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vec_nor(v, v); }
-    
-    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
-    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
-    // boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator==(boolvec x) const; // defined after intvec
-    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-    
-    bool all() const { return vec_all_ne(v, BV(false).v); }
-    bool any() const { return vec_any_ne(v, BV(false).v); }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef __vector signed int ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vec_splats(a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota() { return (__vector signed int){0, 1, 2, 3}; }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return (__vector __bool int)v; }
-    boolvec_t convert_bool() const { return *this != IV(0); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const
-    {
+template <> struct boolvec<float, 4>;
+template <> struct intvec<float, 4>;
+template <> struct realvec<float, 4>;
+
+template <> struct boolvec<float, 4> : floatprops<float> { // 4-lane boolean vector stored as __vector __bool int
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef __vector __bool int bvector_t;
+  static int const alignment = sizeof(bvector_t); // alignment equals the vector's byte size
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values are -1, false values are 0
+  static uint_t from_bool(bool a) { return -int_t(a); }
+  static bool to_bool(uint_t a) { return a; } // any nonzero lane value reads back as true
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v; // the underlying vector value
+
+  boolvec() {} // leaves v uninitialized
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v((bvector_t)vec_splats(from_bool(a))) {} // same value in every lane
+  boolvec(bool const *as) { // reads size elements from as
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { // value of lane n
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) { // sets lane n and returns *this
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return vec_nor(v, v); } // NOR of v with itself
+
+  boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+  boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+  // boolvec operator==(boolvec x) const { return !(*this!=x); }
+  boolvec operator==(boolvec x) const; // defined after intvec
+  boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+
+  bool all() const { return vec_all_ne(v, BV(false).v); } // every lane differs from false
+  bool any() const { return vec_any_ne(v, BV(false).v); } // at least one lane differs from false
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 4> : floatprops<float> { // 4-lane integer vector stored as __vector signed int
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef __vector signed int ivector_t;
+  static int const alignment = sizeof(ivector_t); // alignment equals the vector's byte size
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v; // the underlying vector value
+
+  intvec() {} // leaves v uninitialized
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vec_splats(a)) {} // same value in every lane
+  intvec(int_t const *as) { // reads size elements from as
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() { return (__vector signed int){0, 1, 2, 3}; } // lanes hold 0, 1, 2, 3
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const { // value of lane n
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) { // sets lane n and returns *this
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  // Vector casts do not change the bit pattern
+  boolvec_t as_bool() const { return (__vector __bool int)v; }
+  boolvec_t convert_bool() const { return *this != IV(0); } // lanes that are nonzero become true
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { // NOTE(review): the guard below tests __xlC_ (one trailing underscore) while the top of this file tests __xlC__ - confirm which is intended
 #if defined __xlC_
-      return vec_neg(v);
+    return vec_neg(v);
 #else
-      // vec_neg does not exist in clang
-      return IV(I(0)) - *this;
+    // vec_neg does not exist in clang
+    return IV(I(0)) - *this;
 #endif
+  }
+
+  intvec operator+(intvec x) const { return vec_add(v, x.v); }
+  intvec operator-(intvec x) const { return vec_sub(v, x.v); }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+
+  intvec operator~() const { return vec_nor(v, v); } // NOR of v with itself
+
+  intvec operator&(intvec x) const { return vec_and(v, x.v); }
+  intvec operator|(intvec x) const { return vec_or(v, x.v); }
+  intvec operator^(intvec x) const { return vec_xor(v, x.v); }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const { return lsr(IV(n)); } // scalar count is broadcast, then the vector overload is used
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { return *this >> IV(n); }
+  intvec operator<<(int_t n) const { return *this << IV(n); }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const { // per-lane shift right via vec_sr
+    return vec_sr(v, (__vector unsigned int)n.v);
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const { // per-lane shift right via vec_sra
+    return vec_sra(v, (__vector unsigned int)n.v);
+  }
+  intvec operator<<(intvec n) const { // per-lane shift left via vec_sl
+    return vec_sl(v, (__vector unsigned int)n.v);
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec const &x) const { return vec_cmpeq(v, x.v); }
+  boolvec_t operator!=(intvec const &x) const { return !(*this == x); } // negation of ==
+  boolvec_t operator<(intvec const &x) const { return vec_cmplt(v, x.v); }
+  boolvec_t operator<=(intvec const &x) const { return !(*this > x); } // negation of >
+  boolvec_t operator>(intvec const &x) const { return vec_cmpgt(v, x.v); }
+  boolvec_t operator>=(intvec const &x) const { return !(*this < x); } // negation of <
+
+  intvec_t abs() const { return vec_abs(v); }
+  boolvec_t isignbit() const { return (*this >> (bits - 1)).as_bool(); } // shifts right by bits-1 with operator>> and reinterprets the result as a mask
+  intvec_t max(intvec_t x) const { return vec_max(v, x.v); }
+  intvec_t min(intvec_t x) const { return vec_min(v, x.v); }
+};
+
+template <> struct realvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef real_t scalar_t;
+  typedef __vector float vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<Altivec:4*float>"; }
+  void barrier() { __asm__("" : "+v"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(vec_splats(a)) {}
+  realvec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return vec_ld(0, p);
+  }
+  static realvec_t loadu(real_t const *p) {
+    realvec_t v0 = vec_ld(0, p);
+    realvec_t v1 = vec_ld(15, p);
+    return vec_perm(v0.v, v1.v, vec_lvsl(0, p));
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    
-    intvec operator+(intvec x) const { return vec_add(v, x.v); }
-    intvec operator-(intvec x) const { return vec_sub(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return vec_nor(v, v); }
-    
-    intvec operator&(intvec x) const { return vec_and(v, x.v); }
-    intvec operator|(intvec x) const { return vec_or(v, x.v); }
-    intvec operator^(intvec x) const { return vec_xor(v, x.v); }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return lsr(IV(n)); }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      return vec_sr(v, (__vector unsigned int)n.v);
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      return vec_sra(v, (__vector unsigned int)n.v);
-    }
-    intvec operator<<(intvec n) const
-    {
-      return vec_sl(v, (__vector unsigned int)n.v);
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(intvec const& x) const { return !(*this > x); }
-    boolvec_t operator>(intvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(intvec const& x) const { return !(*this < x); }
-    
-    intvec_t abs() const { return vec_abs(v); }
-    boolvec_t isignbit() const { return (*this >> (bits-1)).as_bool(); }
-    intvec_t max(intvec_t x) const { return vec_max(v, x.v); }
-    intvec_t min(intvec_t x) const { return vec_min(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef __vector float vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<Altivec:4*float>"; }
-    void barrier() { __asm__("": "+v"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vec_splats(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vec_ld(0, p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      realvec_t v0 = vec_ld(0, p);
-      realvec_t v1 = vec_ld(15, p);
-      return vec_perm(v0.v, v1.v, vec_lvsl(0, p));
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vec_st(v, 0, p);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-      p[2] = (*this)[2];
-      p[3] = (*this)[3];
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    vec_st(v, 0, p);
+  }
+  void storeu(real_t *p) const {
+    // Vector stores would require vector loads, which would need to
+    // be atomic
+    // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+    // for good ideas
+    p[0] = (*this)[0];
+    p[1] = (*this)[1];
+    p[2] = (*this)[2];
+    p[3] = (*this)[3];
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      // Use vec_ste?
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      // Use vec_ste?
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    
-    
-    
-    intvec_t as_int() const { return (__vector signed int) v; }
-    intvec_t convert_int() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return (__vector signed int)v; }
+  intvec_t convert_int() const {
 #if defined __xlC__
-      return vec_cts(v, 0);
+    return vec_cts(v, 0);
 #else
-      // vec_cts leads to an ICE in clang
-      return MF::vml_convert_int(*this);
+    // vec_cts leads to an ICE in clang
+    return MF::vml_convert_int(*this);
 #endif
-    }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const
-    {
+  }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const {
 #if defined __xlC_
-      return vec_neg(v);
+    return vec_neg(v);
 #else
-      // vec_neg does not exist in clang
-      return RV(0.0) - *this;
+    // vec_neg does not exist in clang
+    return RV(0.0) - *this;
 #endif
-    }
-    
-    realvec operator+(realvec x) const { return vec_add(v, x.v); }
-    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
-    realvec operator*(realvec x) const {
+  }
+
+  realvec operator+(realvec x) const { return vec_add(v, x.v); }
+  realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+  realvec operator*(realvec x) const {
 #if defined __xlC__
-      return vec_mul(v, x.v);
+    return vec_mul(v, x.v);
 #else
-      // vec_mul does not exist in clang
-      return vec_madd(v, x.v, RV(0.0).v);
+    // vec_mul does not exist in clang
+    return vec_madd(v, x.v, RV(0.0).v);
 #endif
-    }
-    realvec operator/(realvec x) const {
+  }
+  realvec operator/(realvec x) const {
 #if defined __xlC__
-      return vec_div(v, x.v);
+    return vec_div(v, x.v);
 #else
-      // vec_div does not exist in clang
-      return *this * x.rcp();
+    // vec_div does not exist in clang
+    return *this * x.rcp();
 #endif
-    }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-                           vml_std::fmax((*this)[2], (*this)[3]));
-    }
-    real_t minval() const
-    {
-      return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-                           vml_std::fmin((*this)[2], (*this)[3]));
-    }
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-    }
-    real_t sum() const
-    {
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return vec_ceil(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vec_abs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return vec_floor(v); }
-    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
-    realvec fmax(realvec y) const { return vec_max(v, y.v); }
-    realvec fmin(realvec y) const { return vec_min(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return vec_madd(v, y.v, z.v);
-    }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec x = *this;
-      realvec r = vec_re(v);    // this is only an approximation
-      // TODO: use fma
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors
-      r += r * (RV(1.0) - x*r); // one Newton iteration (see vml_rcp)
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const { return vec_round(v); /* sic! */ }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const
-    {
-      realvec x = *this;
-      realvec r = vec_rsqrte(x.v); // this is only an approximation
-      // TODO: use fma
-      // one Newton iteration (see vml_rsqrt)
-      r += RV(0.5)*r * (RV(1.0) - x * r*r);
-      return r;
-    }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const {
+  }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+                         vml_std::fmax((*this)[2], (*this)[3]));
+  }
+  real_t minval() const {
+    return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+                         vml_std::fmin((*this)[2], (*this)[3]));
+  }
+  real_t prod() const {
+    return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+  }
+  real_t sum() const {
+    return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+  }
+
+  boolvec_t operator==(realvec const &x) const { return vec_cmpeq(v, x.v); }
+  boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(realvec const &x) const { return vec_cmplt(v, x.v); }
+  boolvec_t operator<=(realvec const &x) const { return vec_cmple(v, x.v); }
+  boolvec_t operator>(realvec const &x) const { return vec_cmpgt(v, x.v); }
+  boolvec_t operator>=(realvec const &x) const { return vec_cmpge(v, x.v); }
+
+  realvec acos() const { return MF::vml_acos(*this); }
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const { return vec_ceil(v); }
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return vec_abs(v); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const { return vec_floor(v); }
+  realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+  realvec fmax(realvec y) const { return vec_max(v, y.v); }
+  realvec fmin(realvec y) const { return vec_min(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return vec_madd(v, y.v, z.v);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const {
+    realvec x = *this;
+    realvec r = vec_re(v); // this is only an approximation
+    // TODO: use fma
+    // Note: don't rewrite this expression, this may introduce
+    // cancellation errors
+    r += r * (RV(1.0) - x * r); // one Newton iteration (see vml_rcp)
+    return r;
+  }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const { return vec_round(v); /* sic! */ }
+  realvec round() const { return MF::vml_round(*this); }
+  realvec rsqrt() const {
+    realvec x = *this;
+    realvec r = vec_rsqrte(x.v); // this is only an approximation
+    // TODO: use fma
+    // one Newton iteration (see vml_rsqrt)
+    r += RV(0.5) * r * (RV(1.0) - x * r * r);
+    return r;
+  }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); }
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const {
 #if defined __xlC__
-      return vec_sqrt(v);
+    return vec_sqrt(v);
 #else
-      return *this * rsqrt();
+    return *this * rsqrt();
 #endif
-    }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return vec_trunc(v); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,4> boolvec<float,4>::as_int() const
-  {
-    return (__vector signed int) v;
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::convert_int() const
-  {
-    return -(__vector signed int)v;
-  }
-  
-  inline boolvec<float,4> boolvec<float,4>::operator==(boolvec_t x) const
-  {
-    return as_int() == x.as_int();
-  }
-  
-  inline
-  boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<float,4> intvec<float,4>::as_float() const
-  {
-    return (__vector float)v;
-  }
-  
-  inline intvec<float,4> intvec<float,4>::bitifthen(intvec_t x,
-                                                    intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<float,4> intvec<float,4>::convert_float() const
-  {
+  }
+  realvec tan() const { return MF::vml_tan(*this); }
+  realvec tanh() const { return MF::vml_tanh(*this); }
+  realvec trunc() const { return vec_trunc(v); }
+};
+
+// boolvec definitions
+
+inline intvec<float, 4> boolvec<float, 4>::as_int() const {
+  return (__vector signed int)v;
+}
+
+inline intvec<float, 4> boolvec<float, 4>::convert_int() const {
+  return -(__vector signed int)v;
+}
+
+inline boolvec<float, 4> boolvec<float, 4>::operator==(boolvec_t x) const {
+  return as_int() == x.as_int();
+}
+
+inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+// intvec definitions
+
+inline realvec<float, 4> intvec<float, 4>::as_float() const {
+  return (__vector float)v;
+}
+
+inline intvec<float, 4> intvec<float, 4>::bitifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<float, 4> intvec<float, 4>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<float, 4> intvec<float, 4>::convert_float() const {
 #if defined __xlC__
-    return vec_ctf(v, 0);
+  return vec_ctf(v, 0);
 #else
-      // vec_ctf leads to an ICE in clang
-    return MF::vml_convert_float(*this);
+  // vec_ctf leads to an ICE in clang
+  return MF::vml_convert_float(*this);
 #endif
-  }
-  
-  inline intvec<float,4> intvec<float,4>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+}
+
+inline intvec<float, 4> intvec<float, 4>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_ALTIVEC_FLOAT4_H
+#endif // #ifndef VEC_ALTIVEC_FLOAT4_H
diff --git a/lib/kernel/vecmathlib/vec_avx_double4.h b/lib/kernel/vecmathlib/vec_avx_double4.h
index 37fd73b..f01e74c 100644
--- a/lib/kernel/vecmathlib/vec_avx_double4.h
+++ b/lib/kernel/vecmathlib/vec_avx_double4.h
@@ -12,253 +12,244 @@
 // AVX intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_4
-  template<> struct boolvec<double,4>;
-  template<> struct intvec<double,4>;
-  template<> struct realvec<double,4>;
-  
-  
-  
-  template<>
-  struct boolvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef __m256d bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]),
-                                            from_bool(as[2]),
-                                            from_bool(as[1]),
-                                            from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec_t& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return _mm256_xor_pd(boolvec(true), v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return _mm256_and_pd(v, x.v); }
-    boolvec_t operator||(boolvec_t x) const { return _mm256_or_pd(v, x.v); }
-    boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
-    boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_pd(v, x.v); }
-    
-    bool all() const
-    {
-      // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
-      return ! (! *this).any();
-    }
-    bool any() const
-    {
-      // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
-      return ! bool(_mm256_testz_pd(v, v));
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi64x(a)) {}
-    intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
-    static intvec_t iota() { return _mm256_set_epi64x(3, 2, 1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm256_castsi256_pd(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec_t x = *this;
-      // We know that boolvec_t values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec_t operator+() const { return *this; }
-    intvec_t operator-() const { return IV(I(0)) - *this; }
-    
-    intvec_t operator+(intvec_t x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi64(vlo, xvlo);
-      vhi = _mm_add_epi64(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec_t operator-(intvec_t x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi64(vlo, xvlo);
-      vhi = _mm_sub_epi64(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v),
-                                               _mm256_castsi256_pd(x.v)));
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v),
-                                              _mm256_castsi256_pd(x.v)));
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v),
-                                               _mm256_castsi256_pd(x.v)));
-    }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srli_epi64(vlo, n);
-      vhi = _mm_srli_epi64(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      // There is no _mm_srai_epi64. To emulate it, add 0x80000000
-      // before shifting, and subtract the shifted 0x80000000 after
-      // shifting
+template <> struct boolvec<double, 4>;
+template <> struct intvec<double, 4>;
+template <> struct realvec<double, 4>;
+
+template <> struct boolvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef __m256d bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
+  boolvec(bool const *as)
+      : v(_mm256_castsi256_pd(
+            _mm256_set_epi64x(from_bool(as[3]), from_bool(as[2]),
+                              from_bool(as[1]), from_bool(as[0])))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec_t &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return _mm256_xor_pd(boolvec(true), v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return _mm256_and_pd(v, x.v); }
+  boolvec_t operator||(boolvec_t x) const { return _mm256_or_pd(v, x.v); }
+  boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+  boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_pd(v, x.v); }
+
+  bool all() const {
+    // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+    return !(!*this).any();
+  }
+  bool any() const {
+    // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+    return !bool(_mm256_testz_pd(v, v));
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef __m256i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm256_set1_epi64x(a)) {}
+  intvec(int_t const *as) : v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
+  static intvec_t iota() { return _mm256_set_epi64x(3, 2, 1, 0); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return _mm256_castsi256_pd(v); }
+  boolvec_t convert_bool() const {
+// Result: convert_bool(0)=false, convert_bool(else)=true
+#ifdef __AVX2__
+    return *this != IV(I(0));
+#else
+    // There is no intrinsic to compare to zero. Instead, we check
+    // whether x is negative or x-1 is non-negative (i.e. x != 0).
+    intvec_t x = *this;
+    // We know that boolvec_t values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+#endif
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec_t operator+() const { return *this; }
+  intvec_t operator-() const { return IV(I(0)) - *this; }
+
+  intvec_t operator+(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_add_epi64(v, x.v);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_add_epi64(vlo, xvlo);
+    vhi = _mm_add_epi64(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec_t operator-(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_sub_epi64(v, x.v);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_sub_epi64(vlo, xvlo);
+    vhi = _mm_sub_epi64(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+  intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec_t operator&(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_and_si256(v, x.v);
+#else
+    return _mm256_castpd_si256(
+        _mm256_and_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v)));
+#endif
+  }
+  intvec_t operator|(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_or_si256(v, x.v);
+#else
+    return _mm256_castpd_si256(
+        _mm256_or_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v)));
+#endif
+  }
+  intvec_t operator^(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_xor_si256(v, x.v);
+#else
+    return _mm256_castpd_si256(
+        _mm256_xor_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v)));
+#endif
+  }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const {
+#ifdef __AVX2__
+    return _mm256_srli_epi64(v, n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srli_epi64(vlo, n);
+    vhi = _mm_srli_epi64(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const {
+#ifdef __AVX2__
+    // There is no _mm256_srai_epi64. To emulate it, add the sign-bit
+    // offset 1 << (bits-1) before shifting, and subtract the shifted
+    // offset after shifting
+    intvec_t offset = U(1) << (bits - 1);
+    return (*this + offset).lsr(n) - offset.lsr(n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+// There is no _mm_srai_epi64. To emulate it, add the sign-bit
+// offset 1 << (bits-1) before shifting, and subtract the shifted
+// offset 1 << (bits-1-n) after shifting
 #if 0
       __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0),
                                          _mm_srli_epi64(vlo, 63));
@@ -271,500 +262,445 @@ namespace vecmathlib {
       vlo = _mm_xor_si128(signmask01, vlo);
       vhi = _mm_xor_si128(signmask23, vhi);
 #else
-      // Convert signed to unsiged
-      vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1)));
-      vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1)));
-      // Shift
-      vlo = _mm_srli_epi64(vlo, n);
-      vhi = _mm_srli_epi64(vhi, n);
-      // Undo conversion
-      vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1-n)));
-      vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1-n)));
+    // Convert signed to unsigned
+    vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits - 1)));
+    vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits - 1)));
+    // Shift
+    vlo = _mm_srli_epi64(vlo, n);
+    vhi = _mm_srli_epi64(vhi, n);
+    // Undo conversion
+    vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits - 1 - n)));
+    vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits - 1 - n)));
 #endif
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec_t operator<<(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_slli_epi64(vlo, n);
-      vhi = _mm_slli_epi64(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const;
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef __m256d vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<AVX:4*double>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_pd(a)) {}
-    realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_pd(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_pd(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec_t operator<<(int_t n) const {
+#ifdef __AVX2__
+    return _mm256_slli_epi64(v, n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_slli_epi64(vlo, n);
+    vhi = _mm_slli_epi64(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+#ifdef __AVX2__
+    return _mm256_srlv_epi64(v, n.v);
+#else
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_pd(p, v);
+    return r;
+#endif
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const {
+#ifdef __AVX2__
+    // See operator>> above
+    intvec_t offset = U(1) << (bits - 1);
+    return (*this + offset).lsr(n) - offset.lsr(n);
+#else
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_pd(p, v);
+    return r;
+#endif
+  }
+  intvec_t operator<<(intvec_t n) const {
+#ifdef __AVX2__
+    return _mm256_sllv_epi64(v, n.v);
+#else
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
+    return r;
+#endif
+  }
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec_t const &x) const {
+#ifdef __AVX2__
+    return _mm256_castsi256_pd(_mm256_cmpeq_epi64(v, x.v));
+#else
+    return !(*this != x);
+#endif
+  }
+  boolvec_t operator!=(intvec_t const &x) const {
+#ifdef __AVX2__
+    return !(*this == x);
+#else
+    return (*this ^ x).convert_bool();
+#endif
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+#ifdef __AVX2__
+    return _mm256_castsi256_pd(_mm256_cmpgt_epi64(x.v, v));
+#else
+    // return (*this - x).as_bool();
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        _mm256_maskstore_pd(p, m.m.as_int(), v);
-      }
+    return r;
+#endif
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef real_t scalar_t;
+  typedef __m256d vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() {
+#ifdef __AVX2__
+    return "<AVX2:4*double>";
+#else
+    return "<AVX:4*double>";
+#endif
+  }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm256_set1_pd(a)) {}
+  realvec(real_t const *as) : v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm256_load_pd(p);
+  }
+  static realvec_t loadu(real_t const *p) { return _mm256_loadu_pd(p); }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        for (int d=0; d<size; ++d) {
-          if (m.m[d]) p[d] = (*this)[d];
-        }
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm256_store_pd(p, v);
+  }
+  void storeu(real_t *p) const { return _mm256_storeu_pd(p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      _mm256_maskstore_pd(p, m.m.as_int(), v);
     }
-    
-    
-    
-    intvec_t as_int() const { return _mm256_castpd_si256(v); }
-    intvec_t convert_int() const
-    {
-      intvec_t r;
-      for (int d=0; d<size; ++d) {
-        r.set_elt(d, floatprops::convert_int((*this)[d]));
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      for (int d = 0; d < size; ++d) {
+        if (m.m[d])
+          p[d] = (*this)[d];
       }
-      return r;
-    }
-    
-    
-    
-    realvec_t operator+() const { return *this; }
-    realvec_t operator-() const { return RV(0.0) - *this; }
-    
-    realvec_t operator+(realvec_t x) const { return _mm256_add_pd(v, x.v); }
-    realvec_t operator-(realvec_t x) const { return _mm256_sub_pd(v, x.v); }
-    realvec_t operator*(realvec_t x) const { return _mm256_mul_pd(v, x.v); }
-    realvec_t operator/(realvec_t x) const { return _mm256_div_pd(v, x.v); }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-      //                      vml_std::fmax((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
-      realvec_t y0022 = x0123.fmax(x1032);
-      return vml_std::fmax(y0022[0], y0022[2]);
-    }
-    real_t minval() const
-    {
-      // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-      //                      vml_std::fmin((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
-      realvec_t y0022 = x0123.fmin(x1032);
-      return vml_std::fmin(y0022[0], y0022[2]);
-    }
-    real_t prod() const
-    {
-      // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
-      realvec_t y0022 = x0123 * x1032;
-      return y0022[0] * y0022[2];
-    }
-    real_t sum() const
-    {
-      // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-      // __m256d x = _mm256_hadd_pd(v, v);
-      // __m128d xlo = _mm256_extractf128_pd(x, 0);
-      // __m128d xhi = _mm256_extractf128_pd(x, 1);
-      realvec_t x = *this;
-      x = _mm256_hadd_pd(x.v, x.v);
-      return x[0] + x[2];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ);
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
     }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ);
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ);
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
-    }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const { return _mm256_ceil_pd(v); }
-    realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const { return _mm256_floor_pd(v); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return _mm256_castpd_si256(v); }
+  intvec_t convert_int() const {
+    intvec_t r;
+    for (int d = 0; d < size; ++d) {
+      r.set_elt(d, floatprops::convert_int((*this)[d]));
     }
-    realvec_t fmax(realvec_t y) const { return _mm256_max_pd(v, y.v); }
-    realvec_t fmin(realvec_t y) const { return _mm256_min_pd(v, y.v); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+    return r;
+  }
+
+  realvec_t operator+() const { return *this; }
+  realvec_t operator-() const { return RV(0.0) - *this; }
+
+  realvec_t operator+(realvec_t x) const { return _mm256_add_pd(v, x.v); }
+  realvec_t operator-(realvec_t x) const { return _mm256_sub_pd(v, x.v); }
+  realvec_t operator*(realvec_t x) const { return _mm256_mul_pd(v, x.v); }
+  realvec_t operator/(realvec_t x) const { return _mm256_div_pd(v, x.v); }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+    //                      vml_std::fmax((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
+    realvec_t y0022 = x0123.fmax(x1032);
+    return vml_std::fmax(y0022[0], y0022[2]);
+  }
+  real_t minval() const {
+    // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+    //                      vml_std::fmin((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
+    realvec_t y0022 = x0123.fmin(x1032);
+    return vml_std::fmin(y0022[0], y0022[2]);
+  }
+  real_t prod() const {
+    // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
+    realvec_t y0022 = x0123 * x1032;
+    return y0022[0] * y0022[2];
+  }
+  real_t sum() const {
+    // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+    // __m256d x = _mm256_hadd_pd(v, v);
+    // __m128d xlo = _mm256_extractf128_pd(x, 0);
+    // __m128d xhi = _mm256_extractf128_pd(x, 1);
+    realvec_t x = *this;
+    x = _mm256_hadd_pd(x.v, x.v);
+    return x[0] + x[2];
+  }
+
+  boolvec_t operator==(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ);
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
+  }
+  boolvec_t operator<(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ);
+  }
+  boolvec_t operator<=(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ);
+  }
+  boolvec_t operator>(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
+  }
+  boolvec_t operator>=(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
+  }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const { return _mm256_ceil_pd(v); }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const { return _mm256_floor_pd(v); }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return _mm256_max_pd(v, y.v); }
+  realvec_t fmin(realvec_t y) const { return _mm256_min_pd(v, y.v); }
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #ifdef VML_HAVE_NAN
-      return _mm256_cmp_pd(v, v, _CMP_UNORD_Q);
+    return _mm256_cmp_pd(v, v, _CMP_UNORD_Q);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); }
-    realvec_t remainder(realvec_t y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec_t rint() const
-    {
-      return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return v; }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return _mm256_sqrt_pd(v); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,4> boolvec<double,4>::as_int() const
-  {
-    return _mm256_castpd_si256(v);
-  }
-  
-  inline intvec<double,4> boolvec<double,4>::convert_int() const
-  {
-    //return ifthen(v, U(1), U(0));
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<double,4> boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline
-  intvec<double,4> boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<double,4> boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return _mm256_blendv_pd(y.v, x.v, v);
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline intvec<double,4> intvec<double,4>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline
-  intvec<double,4> intvec<double,4>::bitifthen(intvec_t x, intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::as_float() const
-  {
-    return _mm256_castsi256_pd(v);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::convert_float() const
-  {
-    realvec_t r;
-    for (int d=0; d<size; ++d) {
-      r.set_elt(d, floatprops::convert_float((*this)[d]));
-    }
-    return r;
   }
-  
-  inline boolvec<double,4> intvec<double,4>::isignbit() const
-  {
-    return MF::vml_isignbit(*this);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const {
+    return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return v; }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return _mm256_sqrt_pd(v); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 4> boolvec<double, 4>::as_int() const {
+  return _mm256_castpd_si256(v);
+}
+
+inline intvec<double, 4> boolvec<double, 4>::convert_int() const {
+  // return ifthen(v, U(1), U(0));
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return _mm256_blendv_pd(y.v, x.v, v);
+}
+
+// intvec definitions
+
+inline intvec<double, 4> intvec<double, 4>::abs() const {
+  return MF::vml_abs(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 4> intvec<double, 4>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<double, 4> intvec<double, 4>::as_float() const {
+  return _mm256_castsi256_pd(v);
+}
+
+inline realvec<double, 4> intvec<double, 4>::convert_float() const {
+  realvec_t r;
+  for (int d = 0; d < size; ++d) {
+    r.set_elt(d, floatprops::convert_float((*this)[d]));
+  }
+  return r;
+}
+
+inline intvec<double, 4> intvec<double, 4>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_AVX_DOUBLE4_H
+#endif // #ifndef VEC_AVX_DOUBLE4_H
diff --git a/lib/kernel/vecmathlib/vec_avx_float8.h b/lib/kernel/vecmathlib/vec_avx_float8.h
index bba77cb..f119aee 100644
--- a/lib/kernel/vecmathlib/vec_avx_float8.h
+++ b/lib/kernel/vecmathlib/vec_avx_float8.h
@@ -12,758 +12,697 @@
 // AVX intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_8
-  template<> struct boolvec<float,8>;
-  template<> struct intvec<float,8>;
-  template<> struct realvec<float,8>;
-  
-  
-  
-  template<>
-  struct boolvec<float,8>: floatprops<float>
-  {
-    static int const size = 8;
-    typedef bool scalar_t;
-    typedef __m256 bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm256_castsi256_ps(_mm256_set_epi32(from_bool(as[7]),
-                                           from_bool(as[6]),
-                                           from_bool(as[5]),
-                                           from_bool(as[4]),
-                                           from_bool(as[3]),
-                                           from_bool(as[2]),
-                                           from_bool(as[1]),
-                                           from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec_t& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return _mm256_xor_ps(boolvec(true), v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return _mm256_and_ps(v, x.v); }
-    boolvec_t operator||(boolvec_t x) const { return _mm256_or_ps(v, x.v); }
-    boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
-    boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_ps(v, x.v); }
-    
-    bool all() const
-    {
-      // return
-      //   (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] &&
-      //   (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7];
-      return ! (! *this).any();
-    }
-    bool any() const
-    {
-      // return
-      //   (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] ||
-      //   (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7];
-      return ! bool(_mm256_testz_ps(v, v));
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,8>: floatprops<float>
-  {
-    static int const size = 8;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi32(a)) {}
-    intvec(int_t const* as): v(_mm256_set_epi32(as[7], as[6], as[5], as[4],
-                                                as[3], as[2], as[1], as[0])) {}
-    static intvec_t iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm256_castsi256_ps(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec_t x = *this;
-      // We know that boolvec_t values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec_t operator+() const { return *this; }
-    intvec_t operator-() const { return IV(0) - *this; }
-    
-    intvec_t operator+(intvec_t x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi32(vlo, xvlo);
-      vhi = _mm_add_epi32(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec_t operator-(intvec_t x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi32(vlo, xvlo);
-      vhi = _mm_sub_epi32(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srli_epi32(vlo, n);
-      vhi = _mm_srli_epi32(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srai_epi32(vlo, n);
-      vhi = _mm_srai_epi32(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec_t operator<<(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_slli_epi32(vlo, n);
-      vhi = _mm_slli_epi32(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,8>: floatprops<float>
-  {
-    static int const size = 8;
-    typedef real_t scalar_t;
-    typedef __m256 vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<AVX:8*float>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_ps(a)) {}
-    realvec(real_t const* as): v(_mm256_set_ps(as[7], as[6], as[5], as[4],
-                                               as[3], as[2], as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_ps(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_ps(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_ps(p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_ps(p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        _mm256_maskstore_ps(p, m.m.as_int(), v);
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return _mm256_castps_si256(v); }
-    intvec_t convert_int() const { return _mm256_cvttps_epi32(v); }
-    
-    
-    
-    realvec_t operator+() const { return *this; }
-    realvec_t operator-() const { return RV(0.0) - *this; }
-    
-    realvec_t operator+(realvec_t x) const { return _mm256_add_ps(v, x.v); }
-    realvec_t operator-(realvec_t x) const { return _mm256_sub_ps(v, x.v); }
-    realvec_t operator*(realvec_t x) const { return _mm256_mul_ps(v, x.v); }
-    realvec_t operator/(realvec_t x) const { return _mm256_div_ps(v, x.v); }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      // return
-      //   vml_std::fmax(vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-      //                               vml_std::fmax((*this)[2], (*this)[3])),
-      //                 vml_std::fmax(vml_std::fmax((*this)[4], (*this)[5]),
-      //                               vml_std::fmax((*this)[6], (*this)[7])));
-      realvec_t x01234567 = *this;
-      realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
-      realvec_t y00224466 = x01234567.fmax(x10325476);
-      realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
-      realvec_t z00004444 = y00224466.fmax(y22006644);
-      return vml_std::fmax(z00004444[0], z00004444[4]);
-    }
-    real_t minval() const
-    {
-      // return
-      //   vml_std::fmin(vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-      //                               vml_std::fmin((*this)[2], (*this)[3])),
-      //                 vml_std::fmin(vml_std::fmin((*this)[4], (*this)[5]),
-      //                               vml_std::fmin((*this)[6], (*this)[7])));
-      realvec_t x01234567 = *this;
-      realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
-      realvec_t y00224466 = x01234567.fmin(x10325476);
-      realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
-      realvec_t z00004444 = y00224466.fmin(y22006644);
-      return vml_std::fmin(z00004444[0], z00004444[4]);
-    }
-    real_t prod() const
-    {
-      // return
-      //   (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
-      //   (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
-      realvec_t x01234567 = *this;
-      realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
-      realvec_t y00224466 = x01234567 * x10325476;
-      realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
-      realvec_t z00004444 = y00224466 * y22006644;
-      return z00004444[0] * z00004444[4];
-    }
-    real_t sum() const
-    {
-      // return
-      //   (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] +
-      //   (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7];
-      // _m256 x = vhaddps(v, v);
-      // x = vhaddps(x, x);
-      // __m128 xlo = _mm256_extractf128_ps(x, 0);
-      // __m128 xhi = _mm256_extractf128_ps(x, 1);
-      // return _mm_cvtsd_f64(xlo) + _mm_cvtsd_f64(xhi);
-      realvec_t x = *this;
-      x = _mm256_hadd_ps(x.v, x.v);
-      x = _mm256_hadd_ps(x.v, x.v);
-      return x[0] + x[4];
+template <> struct boolvec<float, 8>;
+template <> struct intvec<float, 8>;
+template <> struct realvec<float, 8>;
+
+template <> struct boolvec<float, 8> : floatprops<float> {
+  static int const size = 8;
+  typedef bool scalar_t;
+  typedef __m256 bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {}
+  boolvec(bool const *as)
+      : v(_mm256_castsi256_ps(_mm256_set_epi32(
+            from_bool(as[7]), from_bool(as[6]), from_bool(as[5]),
+            from_bool(as[4]), from_bool(as[3]), from_bool(as[2]),
+            from_bool(as[1]), from_bool(as[0])))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec_t &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return _mm256_xor_ps(boolvec(true), v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return _mm256_and_ps(v, x.v); }
+  boolvec_t operator||(boolvec_t x) const { return _mm256_or_ps(v, x.v); }
+  boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+  boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_ps(v, x.v); }
+
+  bool all() const {
+    // return
+    //   (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] &&
+    //   (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7];
+    return !(!*this).any();
+  }
+  bool any() const {
+    // return
+    //   (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] ||
+    //   (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7];
+    return !bool(_mm256_testz_ps(v, v));
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 8> : floatprops<float> {
+  static int const size = 8;
+  typedef int_t scalar_t;
+  typedef __m256i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm256_set1_epi32(a)) {}
+  intvec(int_t const *as)
+      : v(_mm256_set_epi32(as[7], as[6], as[5], as[4], as[3], as[2], as[1],
+                           as[0])) {}
+  static intvec_t iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return _mm256_castsi256_ps(v); }
+  boolvec_t convert_bool() const {
+// Result: convert_bool(0)=false, convert_bool(else)=true
+#ifdef __AVX2__
+    return *this != IV(I(0));
+#else
+    // There is no intrinsic to compare to zero. Instead, we check
+    // whether x is positive and x-1 is negative.
+    intvec_t x = *this;
+    // We know that boolvec_t values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+#endif
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec_t operator+() const { return *this; }
+  intvec_t operator-() const { return IV(0) - *this; }
+
+  intvec_t operator+(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_add_epi32(v, x.v);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_add_epi32(vlo, xvlo);
+    vhi = _mm_add_epi32(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec_t operator-(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_sub_epi32(v, x.v);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_sub_epi32(vlo, xvlo);
+    vhi = _mm_sub_epi32(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+  intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec_t operator&(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_and_si256(v, x.v);
+#else
+    return _mm256_castps_si256(
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+#endif
+  }
+  intvec_t operator|(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_or_si256(v, x.v);
+#else
+    return _mm256_castps_si256(
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+#endif
+  }
+  intvec_t operator^(intvec_t x) const {
+#ifdef __AVX2__
+    return _mm256_xor_si256(v, x.v);
+#else
+    return _mm256_castps_si256(
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+#endif
+  }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const {
+#ifdef __AVX2__
+    return _mm256_srli_epi32(v, n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srli_epi32(vlo, n);
+    vhi = _mm_srli_epi32(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const {
+#ifdef __AVX2__
+    return _mm256_srai_epi32(v, n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srai_epi32(vlo, n);
+    vhi = _mm_srai_epi32(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec_t operator<<(int_t n) const {
+#ifdef __AVX2__
+    return _mm256_slli_epi32(v, n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_slli_epi32(vlo, n);
+    vhi = _mm_slli_epi32(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+#ifdef __AVX2__
+    return _mm256_srlv_epi32(v, n.v);
+#else
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ);
+    return r;
+#endif
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const {
+#ifdef __AVX2__
+    return _mm256_srav_epi32(v, n.v);
+#else
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
+    return r;
+#endif
+  }
+  intvec_t operator<<(intvec_t n) const {
+#ifdef __AVX2__
+    return _mm256_sllv_epi32(v, n.v);
+#else
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ);
+    return r;
+#endif
+  }
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec_t const &x) const {
+#ifdef __AVX2__
+    return _mm256_castsi256_ps(_mm256_cmpeq_epi32(v, x.v));
+#else
+    return !(*this != x);
+#endif
+  }
+  boolvec_t operator!=(intvec_t const &x) const {
+#ifdef __AVX2__
+    return !(*this == x);
+#else
+    return (*this ^ x).convert_bool();
+#endif
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+#ifdef __AVX2__
+    return _mm256_castsi256_ps(_mm256_cmpgt_epi32(x.v, v));
+#else
+    // return (*this - x).as_bool();
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ);
+    return r;
+#endif
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<float, 8> : floatprops<float> {
+  static int const size = 8;
+  typedef real_t scalar_t;
+  typedef __m256 vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() {
+#ifdef __AVX2__
+    return "<AVX2:8*float>";
+#else
+    return "<AVX:8*float>";
+#endif
+  }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm256_set1_ps(a)) {}
+  realvec(real_t const *as)
+      : v(_mm256_set_ps(as[7], as[6], as[5], as[4], as[3], as[2], as[1],
+                        as[0])) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm256_load_ps(p);
+  }
+  static realvec_t loadu(real_t const *p) { return _mm256_loadu_ps(p); }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ);
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm256_store_ps(p, v);
+  }
+  void storeu(real_t *p) const { return _mm256_storeu_ps(p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      _mm256_maskstore_ps(p, m.m.as_int(), v);
     }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const { return _mm256_ceil_ps(v); }
-    realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const { return _mm256_floor_ps(v); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
     }
-    realvec_t fmax(realvec_t y) const { return _mm256_max_ps(v, y.v); }
-    realvec_t fmin(realvec_t y) const { return _mm256_min_ps(v, y.v); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return _mm256_castps_si256(v); }
+  intvec_t convert_int() const { return _mm256_cvttps_epi32(v); }
+
+  realvec_t operator+() const { return *this; }
+  realvec_t operator-() const { return RV(0.0) - *this; }
+
+  realvec_t operator+(realvec_t x) const { return _mm256_add_ps(v, x.v); }
+  realvec_t operator-(realvec_t x) const { return _mm256_sub_ps(v, x.v); }
+  realvec_t operator*(realvec_t x) const { return _mm256_mul_ps(v, x.v); }
+  realvec_t operator/(realvec_t x) const { return _mm256_div_ps(v, x.v); }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    // return
+    //   vml_std::fmax(vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+    //                               vml_std::fmax((*this)[2], (*this)[3])),
+    //                 vml_std::fmax(vml_std::fmax((*this)[4], (*this)[5]),
+    //                               vml_std::fmax((*this)[6], (*this)[7])));
+    realvec_t x01234567 = *this;
+    realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
+    realvec_t y00224466 = x01234567.fmax(x10325476);
+    realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
+    realvec_t z00004444 = y00224466.fmax(y22006644);
+    return vml_std::fmax(z00004444[0], z00004444[4]);
+  }
+  real_t minval() const {
+    // return
+    //   vml_std::fmin(vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+    //                               vml_std::fmin((*this)[2], (*this)[3])),
+    //                 vml_std::fmin(vml_std::fmin((*this)[4], (*this)[5]),
+    //                               vml_std::fmin((*this)[6], (*this)[7])));
+    realvec_t x01234567 = *this;
+    realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
+    realvec_t y00224466 = x01234567.fmin(x10325476);
+    realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
+    realvec_t z00004444 = y00224466.fmin(y22006644);
+    return vml_std::fmin(z00004444[0], z00004444[4]);
+  }
+  real_t prod() const {
+    // return
+    //   (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
+    //   (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
+    realvec_t x01234567 = *this;
+    realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
+    realvec_t y00224466 = x01234567 * x10325476;
+    realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
+    realvec_t z00004444 = y00224466 * y22006644;
+    return z00004444[0] * z00004444[4];
+  }
+  real_t sum() const {
+    // return
+    //   (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] +
+    //   (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7];
+    // _m256 x = vhaddps(v, v);
+    // x = vhaddps(x, x);
+    // __m128 xlo = _mm256_extractf128_ps(x, 0);
+    // __m128 xhi = _mm256_extractf128_ps(x, 1);
+    // return _mm_cvtsd_f64(xlo) + _mm_cvtsd_f64(xhi);
+    realvec_t x = *this;
+    x = _mm256_hadd_ps(x.v, x.v);
+    x = _mm256_hadd_ps(x.v, x.v);
+    return x[0] + x[4];
+  }
+
+  boolvec_t operator==(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ);
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
+  }
+  boolvec_t operator<(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ);
+  }
+  boolvec_t operator<=(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ);
+  }
+  boolvec_t operator>(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ);
+  }
+  boolvec_t operator>=(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ);
+  }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const { return _mm256_ceil_ps(v); }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const { return _mm256_floor_ps(v); }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return _mm256_max_ps(v, y.v); }
+  realvec_t fmin(realvec_t y) const { return _mm256_min_ps(v, y.v); }
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #ifdef VML_HAVE_NAN
-      return _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
+    return _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const
-    {
-      realvec_t x = *this;
-      realvec_t r = _mm256_rcp_ps(x); // this is only an approximation
-      r *= RV(2.0) - r*x;        // one Newton iteration (see vml_rcp)
-      return r;
-    }
-    realvec_t remainder(realvec_t y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec_t rint() const
-    {
-      return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const
-    {
-      realvec_t x = *this;
-      realvec_t r = _mm256_rsqrt_ps(x);    // this is only an approximation
-      r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
-      return r;
-    }
-    boolvec_t signbit() const { return v; }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return _mm256_sqrt_ps(v); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,8> boolvec<float,8>::as_int() const
-  {
-    return _mm256_castps_si256(v);
-  }
-  
-  inline intvec<float,8> boolvec<float,8>::convert_int() const
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<float,8> boolvec<float,8>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline intvec<float,8> boolvec<float,8>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<float,8> boolvec<float,8>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return _mm256_blendv_ps(y.v, x.v, v);
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline intvec<float,8> intvec<float,8>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline realvec<float,8> intvec<float,8>::as_float() const
-  {
-    return _mm256_castsi256_ps(v);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::bitifthen(intvec_t x,
-                                                    intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<float,8> intvec<float,8>::convert_float() const
-  {
-    return _mm256_cvtepi32_ps(v);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const {
+    realvec_t x = *this;
+    realvec_t r = _mm256_rcp_ps(x); // this is only an approximation
+    r *= RV(2.0) - r * x;           // one Newton iteration (see vml_rcp)
+    return r;
+  }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const {
+    return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const {
+    realvec_t x = *this;
+    realvec_t r = _mm256_rsqrt_ps(x);   // this is only an approximation
+    r *= RV(1.5) - RV(0.5) * x * r * r; // one Newton iteration (see vml_rsqrt)
+    return r;
+  }
+  boolvec_t signbit() const { return v; }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return _mm256_sqrt_ps(v); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); }
+};
+
+// boolvec definitions
+
+inline intvec<float, 8> boolvec<float, 8>::as_int() const {
+  return _mm256_castps_si256(v);
+}
+
+inline intvec<float, 8> boolvec<float, 8>::convert_int() const {
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<float, 8> boolvec<float, 8>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<float, 8> boolvec<float, 8>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<float, 8> boolvec<float, 8>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return _mm256_blendv_ps(y.v, x.v, v);
+}
+
+// intvec definitions
+
+inline intvec<float, 8> intvec<float, 8>::abs() const {
+#ifdef __AVX2__
+  return _mm256_abs_epi32(v);
+#else
+  return MF::vml_abs(*this);
+#endif
+}
+
+inline realvec<float, 8> intvec<float, 8>::as_float() const {
+  return _mm256_castsi256_ps(v);
+}
+
+inline intvec<float, 8> intvec<float, 8>::bitifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<float, 8> intvec<float, 8>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<float, 8> intvec<float, 8>::convert_float() const {
+  return _mm256_cvtepi32_ps(v);
+}
+
+inline intvec<float, 8> intvec<float, 8>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<float, 8> intvec<float, 8>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<float, 8> intvec<float, 8>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<float, 8> intvec<float, 8>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 8> intvec<float, 8>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_AVX_FLOAT8_H
+#endif // #ifndef VEC_AVX_FLOAT8_H
diff --git a/lib/kernel/vecmathlib/vec_avx_fp16_16.h b/lib/kernel/vecmathlib/vec_avx_fp16_16.h
index e461ce4..8dadf64 100644
--- a/lib/kernel/vecmathlib/vec_avx_fp16_16.h
+++ b/lib/kernel/vecmathlib/vec_avx_fp16_16.h
@@ -12,599 +12,584 @@
 // AVX intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FP16_16
-  template<> struct boolvec<fp16,16>;
-  template<> struct intvec<fp16,16>;
-  template<> struct realvec<fp16,16>;
-  
-  
-  
-  template<>
-  struct boolvec<fp16,16>: floatprops<fp16>
-  {
-    static int const size = 16;
-    typedef bool scalar_t;
-    typedef __m256i bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {}
-    boolvec(bool const* as):
-    v(_mm256_set_epi16(from_bool(as[15]),
-                       from_bool(as[14]),
-                       from_bool(as[13]),
-                       from_bool(as[12]),
-                       from_bool(as[11]),
-                       from_bool(as[10]),
-                       from_bool(as[ 9]),
-                       from_bool(as[ 8]),
-                       from_bool(as[ 7]),
-                       from_bool(as[ 6]),
-                       from_bool(as[ 5]),
-                       from_bool(as[ 4]),
-                       from_bool(as[ 3]),
-                       from_bool(as[ 2]),
-                       from_bool(as[ 1]),
-                       from_bool(as[ 0]))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return *this != boolvec(true); }
-    
-    boolvec operator&&(boolvec x) const 
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator||(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    bool all() const
-    {
-      bool r = (*this)[0];
-      for (int n=1; n<size; ++n) r = r && (*this)[n];
-      return r;
-    }
-    bool any() const
-    {
-      bool r = (*this)[0];;
-      for (int n=1; n<size; ++n) r = r || (*this)[n];
-      return r;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<fp16,16>: floatprops<fp16>
-  {
-    static int const size = 16;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi16(a)) {}
-    intvec(int_t const* as):
-    v(_mm256_set_epi16(as[15],
-                       as[14],
-                       as[13],
-                       as[12],
-                       as[11],
-                       as[10],
-                       as[ 9],
-                       as[ 8],
-                       as[ 7],
-                       as[ 6],
-                       as[ 5],
-                       as[ 4],
-                       as[ 3],
-                       as[ 2],
-                       as[ 1],
-                       as[ 0])) {}
-    static intvec iota()
-    {
-      return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
-                              7, 6, 5, 4, 3, 2, 1, 0);
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return v; }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec x = *this;
-      // We know that boolvec values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    
-    intvec operator+(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi16(vlo, xvlo);
-      vhi = _mm_add_epi16(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator-(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi16(vlo, xvlo);
-      vhi = _mm_sub_epi16(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec operator&(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    intvec operator|(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    intvec operator^(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srli_epi16(vlo, n);
-      vhi = _mm_srli_epi16(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator>>(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srai_epi16(vlo, n);
-      vhi = _mm_srai_epi16(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator<<(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_slli_epi16(vlo, n);
-      vhi = _mm_slli_epi16(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
+template <> struct boolvec<fp16, 16>;
+template <> struct intvec<fp16, 16>;
+template <> struct realvec<fp16, 16>;
+
+template <> struct boolvec<fp16, 16> : floatprops<fp16> {
+  static int const size = 16;
+  typedef bool scalar_t;
+  typedef __m256i bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm256_set1_epi16(from_bool(a))) {}
+  boolvec(bool const *as)
+      : v(_mm256_set_epi16(from_bool(as[15]), from_bool(as[14]),
+                           from_bool(as[13]), from_bool(as[12]),
+                           from_bool(as[11]), from_bool(as[10]),
+                           from_bool(as[9]), from_bool(as[8]), from_bool(as[7]),
+                           from_bool(as[6]), from_bool(as[5]), from_bool(as[4]),
+                           from_bool(as[3]), from_bool(as[2]), from_bool(as[1]),
+                           from_bool(as[0]))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return *this != boolvec(true); }
+
+  boolvec operator&&(boolvec x) const {
+    return _mm256_castps_si256(
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+  boolvec operator||(boolvec x) const {
+    return _mm256_castps_si256(
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+  boolvec operator==(boolvec x) const { return !(*this != x); }
+  boolvec operator!=(boolvec x) const {
+    return _mm256_castps_si256(
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+
+  bool all() const {
+    bool r = (*this)[0];
+    for (int n = 1; n < size; ++n)
+      r = r && (*this)[n];
+    return r;
+  }
+  bool any() const {
+    bool r = (*this)[0];
+    ;
+    for (int n = 1; n < size; ++n)
+      r = r || (*this)[n];
+    return r;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<fp16, 16> : floatprops<fp16> {
+  static int const size = 16;
+  typedef int_t scalar_t;
+  typedef __m256i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm256_set1_epi16(a)) {}
+  intvec(int_t const *as)
+      : v(_mm256_set_epi16(as[15], as[14], as[13], as[12], as[11], as[10],
+                           as[9], as[8], as[7], as[6], as[5], as[4], as[3],
+                           as[2], as[1], as[0])) {}
+  static intvec iota() {
+    return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+                            0);
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return v; }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    // There is no intrinsic to compare to zero. Instead, we check
+    // whether x is positive and x-1 is negative.
+    intvec x = *this;
+    // We know that boolvec values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return IV(I(0)) - *this; }
+
+  intvec operator+(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_add_epi16(v, x.v);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_add_epi16(vlo, xvlo);
+    vhi = _mm_add_epi16(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec operator-(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_sub_epi16(v, x.v);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_sub_epi16(vlo, xvlo);
+    vhi = _mm_sub_epi16(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+
+  intvec operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec operator&(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_and_si256(v, x.v);
+#else
+    return _mm256_castps_si256(
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+#endif
+  }
+  intvec operator|(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_or_si256(v, x.v);
+#else
+    return _mm256_castps_si256(
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+#endif
+  }
+  intvec operator^(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_xor_si256(v, x.v);
+#else
+    return _mm256_castps_si256(
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+#endif
+  }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec lsr(int_t n) const {
+#ifdef __AVX2__
+    return _mm256_srli_epi16(v, n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srli_epi16(vlo, n);
+    vhi = _mm_srli_epi16(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec operator>>(int_t n) const {
+#ifdef __AVX2__
+    return _mm256_srai_epi16(v, n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srai_epi16(vlo, n);
+    vhi = _mm_srai_epi16(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec operator<<(int_t n) const {
+#ifdef __AVX2__
+    return _mm256_slli_epi16(v, n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_slli_epi16(vlo, n);
+    vhi = _mm_slli_epi16(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec lsr(intvec n) const {
+#ifdef __AVX2__
+    // TODO: Use permute instead of shift/mask?
+    __m256i mlo = _mm256_set1_epi32(U(0x0000ffff));
+    __m256i vlo = _mm256_and_si256(mlo, v);
+    __m256i vhi = v;
+    __m256i clo = _mm256_and_si256(mlo, n);
+    __m256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16));
+    __m256i rlo = _mm256_srlv_epi32(vlo, clo);
+    __m256i rhi = _mm256_andnot_si256(mlo, _mm256_srlv_epi32(vhi, chi));
+    return _mm256_or_si256(rhi, rlo);
+#else
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
+    }
+    return r;
+#endif
+  }
+  intvec operator>>(intvec n) const {
+#ifdef __AVX2__
+    intvec_t offset = U(1) << (bits - 1);
+    return (*this + offset).lsr(n) - offset.lsr(n);
+#else
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
+    }
+    return r;
+#endif
+  }
+  intvec operator<<(intvec n) const {
+#ifdef __AVX2__
+    // TODO: Use permute instead of shift/mask?
+    __m256i mlo = _mm256_set1_epi32(U(0x0000ffff));
+    __m256i vlo = v;
+    __m256i vhi = _mm256_andnot_si256(mlo, v);
+    __m256i clo = _mm256_and_si256(mlo, n);
+    __m256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16));
+    __m256i rlo = _mm256_and_si256(mlo, _mm256_sllv_epi32(vlo, clo));
+    __m256i rhi = _mm256_sllv_epi32(vhi, chi);
+    return _mm256_or_si256(rhi, rlo);
+#else
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
+    }
+    return r;
+#endif
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  boolvec_t operator==(intvec const &x) const {
+#ifdef __AVX2__
+    return _mm256_cmpeq_epi16(v, x.v);
+#else
+    return !(*this != x);
+#endif
+  }
+  boolvec_t operator!=(intvec const &x) const {
+#ifdef __AVX2__
+    return !(*this == x);
+#else
+    return (*this ^ x).convert_bool();
+#endif
+  }
+  boolvec_t operator<(intvec const &x) const {
+#ifdef __AVX2__
+    return _mm256_cmpgt_epi16(x.v, v);
+#else
     // TODO: First compare sign; then if equal, compare sign of difference
     // TODO: Also look for intrinsics
-    boolvec_t operator<(intvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<=(intvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>(intvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>=(intvec const& x) const { __builtin_unreachable(); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<fp16,16>: floatprops<fp16>
-  {
-    static int const size = 16;
-    typedef real_t scalar_t;
-    typedef __m256i vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<AVX:16*fp16>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {}
-    realvec(real_t const* as):
-    v(_mm256_set_epi16(FP::as_int(as[15]),
-                       FP::as_int(as[14]),
-                       FP::as_int(as[13]),
-                       FP::as_int(as[12]),
-                       FP::as_int(as[11]),
-                       FP::as_int(as[10]),
-                       FP::as_int(as[ 9]),
-                       FP::as_int(as[ 8]),
-                       FP::as_int(as[ 7]),
-                       FP::as_int(as[ 6]),
-                       FP::as_int(as[ 5]),
-                       FP::as_int(as[ 4]),
-                       FP::as_int(as[ 3]),
-                       FP::as_int(as[ 2]),
-                       FP::as_int(as[ 1]),
-                       FP::as_int(as[ 0]))) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
+    return r;
+#endif
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<fp16, 16> : floatprops<fp16> {
+  static int const size = 16;
+  typedef real_t scalar_t;
+  typedef __m256i vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() {
+#ifdef __AVX2__
+    return "<AVX2:16*fp16>";
+#else
+    return "<AVX:16*fp16>";
+#endif
+  }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm256_set1_epi16(FP::as_int(a))) {}
+  realvec(real_t const *as)
+      : v(_mm256_set_epi16(
+            FP::as_int(as[15]), FP::as_int(as[14]), FP::as_int(as[13]),
+            FP::as_int(as[12]), FP::as_int(as[11]), FP::as_int(as[10]),
+            FP::as_int(as[9]), FP::as_int(as[8]), FP::as_int(as[7]),
+            FP::as_int(as[6]), FP::as_int(as[5]), FP::as_int(as[4]),
+            FP::as_int(as[3]), FP::as_int(as[2]), FP::as_int(as[1]),
+            FP::as_int(as[0]))) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm256_load_si256((__m256i const *)p);
+  }
+  static realvec_t loadu(real_t const *p) {
+    return _mm256_loadu_si256((__m256i const *)p);
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm256_store_si256((__m256i *)p, v);
+  }
+  void storeu(real_t *p) const { return _mm256_storeu_si256((__m256i *)p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
     }
-    
-    
-    
-    intvec_t as_int() const { return v; }
-    intvec_t convert_int() const { __builtin_unreachable(); }
-    
-    
-    
-    realvec operator+() const { __builtin_unreachable(); }
-    realvec operator-() const { __builtin_unreachable(); }
-    
-    realvec operator+(realvec x) const { __builtin_unreachable(); }
-    realvec operator-(realvec x) const { __builtin_unreachable(); }
-    realvec operator*(realvec x) const { __builtin_unreachable(); }
-    realvec operator/(realvec x) const { __builtin_unreachable(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const { __builtin_unreachable(); }
-    real_t minval() const { __builtin_unreachable(); }
-    real_t prod() const { __builtin_unreachable(); }
-    real_t sum() const { __builtin_unreachable(); }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
-    
-    
-    
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    boolvec_t signbit() const { return v; }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<fp16,16> boolvec<fp16,16>::as_int() const
-  {
-    return v;
-  }
-  
-  inline intvec<fp16,16> boolvec<fp16,16>::convert_int() const
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<fp16,16> boolvec<fp16,16>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline intvec<fp16,16> boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<fp16,16> boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline realvec<fp16,16> intvec<fp16,16>::as_float() const
-  {
-    return v;
-  }
-  
-  inline realvec<fp16,16> intvec<fp16,16>::convert_float() const
-  {
-    __builtin_unreachable();
-  }
-  
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return v; }
+  intvec_t convert_int() const { __builtin_unreachable(); }
+
+  realvec operator+() const { __builtin_unreachable(); }
+  realvec operator-() const { __builtin_unreachable(); }
+
+  realvec operator+(realvec x) const { __builtin_unreachable(); }
+  realvec operator-(realvec x) const { __builtin_unreachable(); }
+  realvec operator*(realvec x) const { __builtin_unreachable(); }
+  realvec operator/(realvec x) const { __builtin_unreachable(); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const { __builtin_unreachable(); }
+  real_t minval() const { __builtin_unreachable(); }
+  real_t prod() const { __builtin_unreachable(); }
+  real_t sum() const { __builtin_unreachable(); }
+
+  boolvec_t operator==(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator!=(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator<(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator<=(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator>(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator>=(realvec const &x) const { __builtin_unreachable(); }
+
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+  realvec fabs() const { return MF::vml_fabs(*this); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  boolvec_t signbit() const { return v; }
+};
+
+// boolvec definitions
+
+inline intvec<fp16, 16> boolvec<fp16, 16>::as_int() const { return v; }
+
+inline intvec<fp16, 16> boolvec<fp16, 16>::convert_int() const {
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<fp16, 16> boolvec<fp16, 16>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<fp16, 16> boolvec<fp16, 16>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return ((-convert_int() & x) | (~ - convert_int() & y));
+}
+
+inline realvec<fp16, 16> boolvec<fp16, 16>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_float();
+}
+
+// intvec definitions
+
+inline intvec<fp16, 16> intvec<fp16, 16>::abs() const {
+#ifdef __AVX2__
+  return _mm256_abs_epi16(v);
+#else
+  return MF::vml_abs(*this);
+#endif
+}
+
+inline realvec<fp16, 16> intvec<fp16, 16>::as_float() const { return v; }
+
+inline realvec<fp16, 16> intvec<fp16, 16>::convert_float() const {
+  __builtin_unreachable();
+}
+
+inline intvec<fp16, 16> intvec<fp16, 16>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<fp16, 16> intvec<fp16, 16>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_AVX_FP16_16_H
+#endif // #ifndef VEC_AVX_FP16_16_H
diff --git a/lib/kernel/vecmathlib/vec_avx_fp8_32.h b/lib/kernel/vecmathlib/vec_avx_fp8_32.h
index 5ed93e4..0ae79e7 100644
--- a/lib/kernel/vecmathlib/vec_avx_fp8_32.h
+++ b/lib/kernel/vecmathlib/vec_avx_fp8_32.h
@@ -12,665 +12,592 @@
 // AVX intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FP8_32
-  template<> struct boolvec<fp8,32>;
-  template<> struct intvec<fp8,32>;
-  template<> struct realvec<fp8,32>;
-  
-  
-  
-  template<>
-  struct boolvec<fp8,32>: floatprops<fp8>
-  {
-    static int const size = 32;
-    typedef bool scalar_t;
-    typedef __m256i bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {}
-    boolvec(bool const* as):
-    v(_mm256_set_epi8(from_bool(as[31]),
-                      from_bool(as[30]),
-                      from_bool(as[29]),
-                      from_bool(as[28]),
-                      from_bool(as[27]),
-                      from_bool(as[26]),
-                      from_bool(as[25]),
-                      from_bool(as[24]),
-                      from_bool(as[23]),
-                      from_bool(as[22]),
-                      from_bool(as[21]),
-                      from_bool(as[20]),
-                      from_bool(as[19]),
-                      from_bool(as[18]),
-                      from_bool(as[17]),
-                      from_bool(as[16]),
-                      from_bool(as[15]),
-                      from_bool(as[14]),
-                      from_bool(as[13]),
-                      from_bool(as[12]),
-                      from_bool(as[11]),
-                      from_bool(as[10]),
-                      from_bool(as[ 9]),
-                      from_bool(as[ 8]),
-                      from_bool(as[ 7]),
-                      from_bool(as[ 6]),
-                      from_bool(as[ 5]),
-                      from_bool(as[ 4]),
-                      from_bool(as[ 3]),
-                      from_bool(as[ 2]),
-                      from_bool(as[ 1]),
-                      from_bool(as[ 0]))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return *this != boolvec(true); }
-    
-    boolvec operator&&(boolvec x) const 
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator||(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    bool all() const
-    {
-      bool r = (*this)[0];
-      for (int n=1; n<size; ++n) r = r && (*this)[n];
-      return r;
-    }
-    bool any() const
-    {
-      bool r = (*this)[0];;
-      for (int n=1; n<size; ++n) r = r || (*this)[n];
-      return r;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<fp8,32>: floatprops<fp8>
-  {
-    static int const size = 32;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi8(a)) {}
-    intvec(int_t const* as):
-    v(_mm256_set_epi8(as[31],
-                      as[30],
-                      as[29],
-                      as[28],
-                      as[27],
-                      as[26],
-                      as[25],
-                      as[24],
-                      as[23],
-                      as[22],
-                      as[21],
-                      as[20],
-                      as[19],
-                      as[18],
-                      as[17],
-                      as[16],
-                      as[15],
-                      as[14],
-                      as[13],
-                      as[12],
-                      as[11],
-                      as[10],
-                      as[ 9],
-                      as[ 8],
-                      as[ 7],
-                      as[ 6],
-                      as[ 5],
-                      as[ 4],
-                      as[ 3],
-                      as[ 2],
-                      as[ 1],
-                      as[ 0])) {}
-    static intvec iota()
-    {
-      return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
-                             23, 22, 21, 20, 19, 18, 17, 16,
-                             15, 14, 13, 12, 11, 10, 9, 8,
-                             7, 6, 5, 4, 3, 2, 1, 0);
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return v; }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec x = *this;
-      // We know that boolvec values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    
-    intvec operator+(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi8(vlo, xvlo);
-      vhi = _mm_add_epi8(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator-(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi8(vlo, xvlo);
-      vhi = _mm_sub_epi8(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec operator&(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    intvec operator|(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    intvec operator^(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      uint_t masklo = U(0x00ffU) >> U(n);
-      uint_t maskhi = U(0xff00U);
-      __m128i mask = _mm_set1_epi16(masklo | maskhi);
-      vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
-      vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator>>(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      uint_t masklo = U(0x00ffU);
-      uint_t maskhi = U(0xff00U);
-      __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8),
-                                    _mm_set1_epi16(masklo));
-      __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n),
-                                    _mm_set1_epi16(maskhi));
-      vlo = _mm_or_si128(vlolo, vlohi);
-      __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8),
-                                    _mm_set1_epi16(masklo));
-      __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n),
-                                    _mm_set1_epi16(maskhi));
-      vhi = _mm_or_si128(vhilo, vhihi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator<<(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      uint_t masklo = U(0x00ffU);
-      uint_t maskhi = U(0xff00U) << U(n);
-      __m128i mask = _mm_set1_epi16(masklo | maskhi);
-      vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
-      vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
+template <> struct boolvec<fp8, 32>;
+template <> struct intvec<fp8, 32>;
+template <> struct realvec<fp8, 32>;
+
+template <> struct boolvec<fp8, 32> : floatprops<fp8> {
+  static int const size = 32;
+  typedef bool scalar_t;
+  typedef __m256i bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm256_set1_epi8(from_bool(a))) {}
+  boolvec(bool const *as)
+      : v(_mm256_set_epi8(
+            from_bool(as[31]), from_bool(as[30]), from_bool(as[29]),
+            from_bool(as[28]), from_bool(as[27]), from_bool(as[26]),
+            from_bool(as[25]), from_bool(as[24]), from_bool(as[23]),
+            from_bool(as[22]), from_bool(as[21]), from_bool(as[20]),
+            from_bool(as[19]), from_bool(as[18]), from_bool(as[17]),
+            from_bool(as[16]), from_bool(as[15]), from_bool(as[14]),
+            from_bool(as[13]), from_bool(as[12]), from_bool(as[11]),
+            from_bool(as[10]), from_bool(as[9]), from_bool(as[8]),
+            from_bool(as[7]), from_bool(as[6]), from_bool(as[5]),
+            from_bool(as[4]), from_bool(as[3]), from_bool(as[2]),
+            from_bool(as[1]), from_bool(as[0]))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return *this != boolvec(true); }
+
+  boolvec operator&&(boolvec x) const {
+    return _mm256_castps_si256(
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+  boolvec operator||(boolvec x) const {
+    return _mm256_castps_si256(
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+  boolvec operator==(boolvec x) const { return !(*this != x); }
+  boolvec operator!=(boolvec x) const {
+    return _mm256_castps_si256(
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+
+  bool all() const {
+    bool r = (*this)[0];
+    for (int n = 1; n < size; ++n)
+      r = r && (*this)[n];
+    return r;
+  }
+  bool any() const { // true if at least one of the `size` elements is true
+    bool r = (*this)[0];
+    // OR the remaining elements into r (stray empty statement removed here)
+    for (int n = 1; n < size; ++n)
+      r = r || (*this)[n];
+    return r;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<fp8, 32> : floatprops<fp8> {
+  static int const size = 32;
+  typedef int_t scalar_t;
+  typedef __m256i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm256_set1_epi8(a)) {}
+  intvec(int_t const *as)
+      : v(_mm256_set_epi8(as[31], as[30], as[29], as[28], as[27], as[26],
+                          as[25], as[24], as[23], as[22], as[21], as[20],
+                          as[19], as[18], as[17], as[16], as[15], as[14],
+                          as[13], as[12], as[11], as[10], as[9], as[8], as[7],
+                          as[6], as[5], as[4], as[3], as[2], as[1], as[0])) {}
+  static intvec iota() {
+    return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,
+                           18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
+                           3, 2, 1, 0);
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return v; }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    // There is no intrinsic to compare to zero. Instead, we OR the sign bit of
+    // x (x<0) with that of x+(signbit_mask-1) (assumed set iff x>0; confirm).
+    intvec x = *this;
+    // We know that boolvec values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return IV(I(0)) - *this; }
+
+  intvec operator+(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_add_epi8(v, x.v);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_add_epi8(vlo, xvlo);
+    vhi = _mm_add_epi8(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec operator-(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_sub_epi8(v, x.v);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_sub_epi8(vlo, xvlo);
+    vhi = _mm_sub_epi8(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+
+  intvec operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec operator&(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_and_si256(v, x.v);
+#else
+    return _mm256_castps_si256(
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+#endif
+  }
+  intvec operator|(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_or_si256(v, x.v);
+#else
+    return _mm256_castps_si256(
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+#endif
+  }
+  intvec operator^(intvec x) const {
+#ifdef __AVX2__
+    return _mm256_xor_si256(v, x.v);
+#else
+    return _mm256_castps_si256(
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+#endif
+  }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec lsr(int_t n) const { // logical (zero-filling) right shift by n bits
+#ifdef __AVX2__
+    uint_t masklo = U(0x00ffU) >> U(n); // drops bits shifted in from upper byte
+    uint_t maskhi = U(0xff00U); // NOTE(review): 0 if uint_t is 8-bit; confirm
+    intvec mask = masklo | maskhi;
+    return intvec(_mm256_srli_epi16(v, n)) & mask; // srli zero-fills, as #else
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    uint_t masklo = U(0x00ffU) >> U(n);
+    uint_t maskhi = U(0xff00U);
+    __m128i mask = _mm_set1_epi16(masklo | maskhi);
+    vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
+    vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec operator>>(int_t n) const {
+#ifdef __AVX2__
+    // There is no _mm256_srai_epi8. To emulate it, add 0x80 before
+    // shifting, and subtract the shifted 0x80 after shifting
+    intvec_t offset = U(1) << (bits - 1);
+    return (*this + offset).lsr(n) - offset.lsr(n);
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    uint_t masklo = U(0x00ffU);
+    uint_t maskhi = U(0xff00U);
+    __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n + 8),
+                                  _mm_set1_epi16(masklo));
+    __m128i vlohi =
+        _mm_and_si128(_mm_srai_epi16(vlo, n), _mm_set1_epi16(maskhi));
+    vlo = _mm_or_si128(vlolo, vlohi);
+    __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n + 8),
+                                  _mm_set1_epi16(masklo));
+    __m128i vhihi =
+        _mm_and_si128(_mm_srai_epi16(vhi, n), _mm_set1_epi16(maskhi));
+    vhi = _mm_or_si128(vhilo, vhihi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec operator<<(int_t n) const {
+#ifdef __AVX2__
+    uint_t masklo = U(0x00ffU);
+    uint_t maskhi = U(0xff00U) << U(n);
+    intvec mask = masklo | maskhi;
+    return intvec(_mm256_slli_epi16(v, n)) & mask;
+#else
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    uint_t masklo = U(0x00ffU);
+    uint_t maskhi = U(0xff00U) << U(n);
+    __m128i mask = _mm_set1_epi16(masklo | maskhi);
+    vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
+    vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
+  }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec lsr(intvec n) const {
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      return ! (*this != x);
+    return r;
+  }
+  intvec operator>>(intvec n) const {
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    boolvec_t operator!=(intvec const& x) const
-    {
-      return (*this ^ x).convert_bool();
+    return r;
+  }
+  intvec operator<<(intvec n) const {
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
+    return r;
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  boolvec_t operator==(intvec const &x) const {
+#ifdef __AVX2__
+    return _mm256_cmpeq_epi8(v, x.v);
+#else
+    return !(*this != x);
+#endif
+  }
+  boolvec_t operator!=(intvec const &x) const {
+#ifdef __AVX2__
+    return !(*this == x);
+#else
+    return (*this ^ x).convert_bool();
+#endif
+  }
+  boolvec_t operator<(intvec const &x) const {
+#ifdef __AVX2__
+    return _mm256_cmpgt_epi8(x.v, v);
+#else
     // TODO: First compare sign; then if equal, compare sign of difference
     // TODO: Also look for intrinsics
-    boolvec_t operator<(intvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<=(intvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>(intvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>=(intvec const& x) const { __builtin_unreachable(); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<fp8,32>: floatprops<fp8>
-  {
-    static int const size = 32;
-    typedef real_t scalar_t;
-    typedef __m256i vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<AVX:32*fp8>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {}
-    realvec(real_t const* as):
-    v(_mm256_set_epi8(FP::as_int(as[31]),
-                      FP::as_int(as[30]),
-                      FP::as_int(as[29]),
-                      FP::as_int(as[28]),
-                      FP::as_int(as[27]),
-                      FP::as_int(as[26]),
-                      FP::as_int(as[25]),
-                      FP::as_int(as[24]),
-                      FP::as_int(as[23]),
-                      FP::as_int(as[22]),
-                      FP::as_int(as[21]),
-                      FP::as_int(as[20]),
-                      FP::as_int(as[19]),
-                      FP::as_int(as[18]),
-                      FP::as_int(as[17]),
-                      FP::as_int(as[16]),
-                      FP::as_int(as[15]),
-                      FP::as_int(as[14]),
-                      FP::as_int(as[13]),
-                      FP::as_int(as[12]),
-                      FP::as_int(as[11]),
-                      FP::as_int(as[10]),
-                      FP::as_int(as[ 9]),
-                      FP::as_int(as[ 8]),
-                      FP::as_int(as[ 7]),
-                      FP::as_int(as[ 6]),
-                      FP::as_int(as[ 5]),
-                      FP::as_int(as[ 4]),
-                      FP::as_int(as[ 3]),
-                      FP::as_int(as[ 2]),
-                      FP::as_int(as[ 1]),
-                      FP::as_int(as[ 0]))) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
+    return r;
+#endif
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<fp8, 32> : floatprops<fp8> {
+  static int const size = 32;
+  typedef real_t scalar_t;
+  typedef __m256i vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() {
+#ifdef __AVX2__
+    return "<AVX2:32*fp8>";
+#else
+    return "<AVX:32*fp8>";
+#endif
+  }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm256_set1_epi8(FP::as_int(a))) {}
+  realvec(real_t const *as)
+      : v(_mm256_set_epi8(
+            FP::as_int(as[31]), FP::as_int(as[30]), FP::as_int(as[29]),
+            FP::as_int(as[28]), FP::as_int(as[27]), FP::as_int(as[26]),
+            FP::as_int(as[25]), FP::as_int(as[24]), FP::as_int(as[23]),
+            FP::as_int(as[22]), FP::as_int(as[21]), FP::as_int(as[20]),
+            FP::as_int(as[19]), FP::as_int(as[18]), FP::as_int(as[17]),
+            FP::as_int(as[16]), FP::as_int(as[15]), FP::as_int(as[14]),
+            FP::as_int(as[13]), FP::as_int(as[12]), FP::as_int(as[11]),
+            FP::as_int(as[10]), FP::as_int(as[9]), FP::as_int(as[8]),
+            FP::as_int(as[7]), FP::as_int(as[6]), FP::as_int(as[5]),
+            FP::as_int(as[4]), FP::as_int(as[3]), FP::as_int(as[2]),
+            FP::as_int(as[1]), FP::as_int(as[0]))) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm256_load_si256((__m256i const *)p);
+  }
+  static realvec_t loadu(real_t const *p) {
+    return _mm256_loadu_si256((__m256i const *)p);
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm256_store_si256((__m256i *)p, v);
+  }
+  void storeu(real_t *p) const { return _mm256_storeu_si256((__m256i *)p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
     }
-    
-    
-    
-    intvec_t as_int() const { return v; }
-    intvec_t convert_int() const { __builtin_unreachable(); }
-    
-    
-    
-    realvec operator+() const { __builtin_unreachable(); }
-    realvec operator-() const { __builtin_unreachable(); }
-    
-    realvec operator+(realvec x) const { __builtin_unreachable(); }
-    realvec operator-(realvec x) const { __builtin_unreachable(); }
-    realvec operator*(realvec x) const { __builtin_unreachable(); }
-    realvec operator/(realvec x) const { __builtin_unreachable(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const { __builtin_unreachable(); }
-    real_t minval() const { __builtin_unreachable(); }
-    real_t prod() const { __builtin_unreachable(); }
-    real_t sum() const { __builtin_unreachable(); }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
-    
-    
-    
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    boolvec_t signbit() const { return v; }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<fp8,32> boolvec<fp8,32>::as_int() const
-  {
-    return v;
-  }
-  
-  inline intvec<fp8,32> boolvec<fp8,32>::convert_int() const
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<fp8,32> boolvec<fp8,32>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline intvec<fp8,32> boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<fp8,32> boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline realvec<fp8,32> intvec<fp8,32>::as_float() const
-  {
-    return v;
-  }
-  
-  inline realvec<fp8,32> intvec<fp8,32>::convert_float() const
-  {
-    __builtin_unreachable();
-  }
-  
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return v; }
+  intvec_t convert_int() const { __builtin_unreachable(); }
+
+  realvec operator+() const { __builtin_unreachable(); }
+  realvec operator-() const { __builtin_unreachable(); }
+
+  realvec operator+(realvec x) const { __builtin_unreachable(); }
+  realvec operator-(realvec x) const { __builtin_unreachable(); }
+  realvec operator*(realvec x) const { __builtin_unreachable(); }
+  realvec operator/(realvec x) const { __builtin_unreachable(); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const { __builtin_unreachable(); }
+  real_t minval() const { __builtin_unreachable(); }
+  real_t prod() const { __builtin_unreachable(); }
+  real_t sum() const { __builtin_unreachable(); }
+
+  boolvec_t operator==(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator!=(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator<(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator<=(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator>(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator>=(realvec const &x) const { __builtin_unreachable(); }
+
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+  realvec fabs() const { return MF::vml_fabs(*this); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  boolvec_t signbit() const { return v; }
+};
+
+// boolvec definitions
+
+inline intvec<fp8, 32> boolvec<fp8, 32>::as_int() const { return v; }
+
+inline intvec<fp8, 32> boolvec<fp8, 32>::convert_int() const {
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<fp8, 32> boolvec<fp8, 32>::ifthen(boolvec_t x,
+                                                 boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<fp8, 32> boolvec<fp8, 32>::ifthen(intvec_t x, intvec_t y) const {
+  return ((-convert_int() & x) | (~ - convert_int() & y));
+}
+
+inline realvec<fp8, 32> boolvec<fp8, 32>::ifthen(realvec_t x,
+                                                 realvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_float();
+}
+
+// intvec definitions
+
+inline intvec<fp8, 32> intvec<fp8, 32>::abs() const {
+#ifdef __AVX2__
+  return _mm256_abs_epi8(v);
+#else
+  return MF::vml_abs(*this);
+#endif
+}
+
+inline realvec<fp8, 32> intvec<fp8, 32>::as_float() const { return v; }
+
+inline realvec<fp8, 32> intvec<fp8, 32>::convert_float() const {
+  __builtin_unreachable();
+}
+
+inline intvec<fp8, 32> intvec<fp8, 32>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<fp8, 32> intvec<fp8, 32>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_AVX_FP8_32_H
+#endif // #ifndef VEC_AVX_FP8_32_H
diff --git a/lib/kernel/vecmathlib/vec_base.h b/lib/kernel/vecmathlib/vec_base.h
index 737a1e0..81c698d 100644
--- a/lib/kernel/vecmathlib/vec_base.h
+++ b/lib/kernel/vecmathlib/vec_base.h
@@ -4,663 +4,544 @@
 #define VEC_BASE_H
 
 #ifndef VML_NO_IOSTREAM
-#  include <iostream>
+#include <iostream>
 #endif
 
 #include "vec_mask.h"
 
+namespace vecmathlib {
 
+template <typename real_t, int size> struct boolvec {};
 
-namespace vecmathlib {
-  
-  template<typename real_t, int size>
-  struct boolvec {
-  };
-  
-  template<typename real_t, int size>
-  struct intvec {
-  };
-  
-  template<typename real_t, int size>
-  struct realvec {
-  };
-  
-
-  
-  // boolvec wrappers
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> as_int(boolvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> convert_int(boolvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline bool all(boolvec<real_t, size> x) { return x.all(); }
-  
-  template<typename real_t, int size>
-  inline bool any(boolvec<real_t, size> x) { return x.any(); }
-  
-  template<typename real_t, int size>
-  inline
-  boolvec<real_t, size> ifthen(boolvec<real_t, size> c,
-                               boolvec<real_t, size> x,
-                               boolvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intvec<real_t, size> ifthen(boolvec<real_t, size> c,
-                              intvec<real_t, size> x,
-                              intvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realvec<real_t, size> ifthen(boolvec<real_t, size> c,
-                               realvec<real_t, size> x,
-                               realvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  
-  
-  // intvec wrappers
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> as_bool(intvec<real_t, size> x)
-  {
-    return x.as_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x)
-  {
-    return x.convert_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> as_float(intvec<real_t, size> x)
-  {
-    return x.as_float();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> convert_float(intvec<real_t, size> x)
-  {
-    return x.convert_float();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> abs(intvec<real_t, size> x)
-  {
-    return x.abs();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> bitifthen(intvec<real_t, size> x,
-                                        intvec<real_t, size> y,
-                                        intvec<real_t, size> z)
-  {
-    return x.bitifthen(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> clz(intvec<real_t, size> x)
-  {
-    return x.clz();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isignbit(intvec<real_t, size> x)
-  {
-    return x.isignbit();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> lsr(intvec<real_t, size> x,
-                                  typename intvec<real_t, size>::int_t n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> lsr(intvec<real_t, size> x,
-                                  intvec<real_t, size> n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> max(intvec<real_t, size> x,
-                                  intvec<real_t, size> y)
-  {
-    return x.max(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> min(intvec<real_t, size> x,
-                                  intvec<real_t, size> y)
-  {
-    return x.min(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> popcount(intvec<real_t, size> x)
-  {
-    return x.popcount();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> rotate(intvec<real_t, size> x,
-                                     typename intvec<real_t, size>::int_t n)
-  {
-    return x.rotate(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> rotate(intvec<real_t, size> x,
-                                     intvec<real_t, size> n)
-  {
-    return x.rotate(n);
-  }
-  
-  
-  
-  // realvec wrappers
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size>
-  loada(real_t const* p,
-        realvec<real_t, size> x,
-        typename realvec<real_t, size>::mask_t const& m)
-  {
-    return x.loada(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size>
-  loadu(real_t const* p,
-        realvec<real_t, size> x,
-        typename realvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size>
-  loadu(real_t const* p, size_t ioff,
-        realvec<real_t, size> x,
-        typename realvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, ioff, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realvec<real_t, size> x, real_t* p)
-  {
-    x.storea(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realvec<real_t, size> x, real_t* p)
-  {
-    x.storeu(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff)
-  {
-    x.storeu(p, ioff);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realvec<real_t, size> x, real_t* p,
-                     typename realvec<real_t, size>::mask_t const& m)
-  {
-    x.storea(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realvec<real_t, size> x, real_t* p,
-                     typename realvec<real_t, size>::mask_t const& m)
-  {
-    x.storeu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff,
-                     typename realvec<real_t, size>::mask_t const &m)
-  {
-    x.storeu(p, ioff, m);
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> as_int(realvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> convert_int(realvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  typename realvec<real_t, size>::real_t maxval(realvec<real_t, size> x)
-  {
-    return x.maxval();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  typename realvec<real_t, size>::real_t minval(realvec<real_t, size> x)
-  {
-    return x.minval();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x)
-  {
-    return x.prod();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x)
-  {
-    return x.sum();
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> acos(realvec<real_t, size> x)
-  {
-    return x.acos();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> acosh(realvec<real_t, size> x)
-  {
-    return x.acosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> asin(realvec<real_t, size> x)
-  {
-    return x.asin();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> asinh(realvec<real_t, size> x)
-  {
-    return x.asinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> atan(realvec<real_t, size> x)
-  {
-    return x.atan();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> atan2(realvec<real_t, size> x,
-                                     realvec<real_t, size> y)
-  {
-    return x.atan2(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> atanh(realvec<real_t, size> x)
-  {
-    return x.atanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> cbrt(realvec<real_t, size> x)
-  {
-    return x.cbrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> ceil(realvec<real_t, size> x)
-  {
-    return x.ceil();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> copysign(realvec<real_t, size> x,
-                                        realvec<real_t, size> y)
-  {
-    return x.copysign(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> cos(realvec<real_t, size> x)
-  {
-    return x.cos();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> cosh(realvec<real_t, size> x)
-  {
-    return x.cosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> exp(realvec<real_t, size> x)
-  {
-    return x.exp();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> exp10(realvec<real_t, size> x)
-  {
-    return x.exp10();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> exp2(realvec<real_t, size> x)
-  {
-    return x.exp2();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> expm1(realvec<real_t, size> x)
-  {
-    return x.expm1();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fabs(realvec<real_t, size> x)
-  {
-    return x.fabs();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> floor(realvec<real_t, size> x)
-  {
-    return x.floor();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fdim(realvec<real_t, size> x,
-                                    realvec<real_t, size> y)
-  {
-    return x.fdim(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fma(realvec<real_t, size> x,
-                                   realvec<real_t, size> y,
-                                   realvec<real_t, size> z)
-  {
-    return x.fma(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fmax(realvec<real_t, size> x,
-                                    realvec<real_t, size> y)
-  {
-    return x.fmax(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fmin(realvec<real_t, size> x,
-                                    realvec<real_t, size> y)
-  {
-    return x.fmin(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fmod(realvec<real_t, size> x,
-                                    realvec<real_t, size> y)
-  {
-    return x.fmod(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> frexp(realvec<real_t, size> x,
-                                     intvec<real_t, size>* r)
-  {
-    return x.frexp(r);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> hypot(realvec<real_t, size> x,
-                                     realvec<real_t, size> y)
-  {
-    return x.hypot(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> ilogb(realvec<real_t, size> x)
-  {
-    return x.ilogb();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isfinite(realvec<real_t, size> x)
-  {
-    return x.isfinite();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isinf(realvec<real_t, size> x)
-  {
-    return x.isinf();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isnan(realvec<real_t, size> x)
-  {
-    return x.isnan();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isnormal(realvec<real_t, size> x)
-  {
-    return x.isnormal();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realvec<real_t, size> ldexp(realvec<real_t, size> x,
-                              typename intvec<real_t, size>::int_t n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realvec<real_t, size> ldexp(realvec<real_t, size> x,
-                               intvec<real_t, size> n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> log(realvec<real_t, size> x)
-  {
-    return x.log();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> log10(realvec<real_t, size> x)
-  {
-    return x.log10();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> log1p(realvec<real_t, size> x)
-  {
-    return x.log1p();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> log2(realvec<real_t, size> x)
-  {
-    return x.log2();
-  }
-    
-  template<typename real_t, int size>
-  inline intvec<real_t, size> lrint(realvec<real_t, size> x)
-  {
-    return x.lrint();
-  }
+template <typename real_t, int size> struct intvec {};
+
+template <typename real_t, int size> struct realvec {};
+
+// boolvec wrappers
+
+template <typename real_t, int size>
+inline intvec<real_t, size> as_int(boolvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> convert_int(boolvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size> inline bool all(boolvec<real_t, size> x) {
+  return x.all();
+}
+
+template <typename real_t, int size> inline bool any(boolvec<real_t, size> x) {
+  return x.any();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> ifthen(boolvec<real_t, size> c,
+                                    boolvec<real_t, size> x,
+                                    boolvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> ifthen(boolvec<real_t, size> c,
+                                   intvec<real_t, size> x,
+                                   intvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ifthen(boolvec<real_t, size> c,
+                                    realvec<real_t, size> x,
+                                    realvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+// intvec wrappers
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> as_bool(intvec<real_t, size> x) {
+  return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x) {
+  return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> as_float(intvec<real_t, size> x) {
+  return x.as_float();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> convert_float(intvec<real_t, size> x) {
+  return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> abs(intvec<real_t, size> x) {
+  return x.abs();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> bitifthen(intvec<real_t, size> x,
+                                      intvec<real_t, size> y,
+                                      intvec<real_t, size> z) {
+  return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> clz(intvec<real_t, size> x) {
+  return x.clz();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isignbit(intvec<real_t, size> x) {
+  return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+                                typename intvec<real_t, size>::int_t n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+                                intvec<real_t, size> n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> max(intvec<real_t, size> x,
+                                intvec<real_t, size> y) {
+  return x.max(y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> min(intvec<real_t, size> x,
+                                intvec<real_t, size> y) {
+  return x.min(y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> popcount(intvec<real_t, size> x) {
+  return x.popcount();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> rotate(intvec<real_t, size> x,
+                                   typename intvec<real_t, size>::int_t n) {
+  return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> rotate(intvec<real_t, size> x,
+                                   intvec<real_t, size> n) {
+  return x.rotate(n);
+}
+
+// realvec wrappers
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+loada(real_t const *p, realvec<real_t, size> x,
+      typename realvec<real_t, size>::mask_t const &m) {
+  return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+loadu(real_t const *p, realvec<real_t, size> x,
+      typename realvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+loadu(real_t const *p, size_t ioff, realvec<real_t, size> x,
+      typename realvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realvec<real_t, size> x, real_t *p) {
+  x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p) {
+  x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p, size_t ioff) {
+  x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realvec<real_t, size> x, real_t *p,
+                   typename realvec<real_t, size>::mask_t const &m) {
+  x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p,
+                   typename realvec<real_t, size>::mask_t const &m) {
+  x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p, size_t ioff,
+                   typename realvec<real_t, size>::mask_t const &m) {
+  x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> as_int(realvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> convert_int(realvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t maxval(realvec<real_t, size> x) {
+  return x.maxval();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t minval(realvec<real_t, size> x) {
+  return x.minval();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x) {
+  return x.prod();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x) {
+  return x.sum();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> acos(realvec<real_t, size> x) {
+  return x.acos();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> acosh(realvec<real_t, size> x) {
+  return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> asin(realvec<real_t, size> x) {
+  return x.asin();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> asinh(realvec<real_t, size> x) {
+  return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> atan(realvec<real_t, size> x) {
+  return x.atan();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> atan2(realvec<real_t, size> x,
+                                   realvec<real_t, size> y) {
+  return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> atanh(realvec<real_t, size> x) {
+  return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> cbrt(realvec<real_t, size> x) {
+  return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ceil(realvec<real_t, size> x) {
+  return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> copysign(realvec<real_t, size> x,
+                                      realvec<real_t, size> y) {
+  return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> cos(realvec<real_t, size> x) {
+  return x.cos();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> cosh(realvec<real_t, size> x) {
+  return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> exp(realvec<real_t, size> x) {
+  return x.exp();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> exp10(realvec<real_t, size> x) {
+  return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> exp2(realvec<real_t, size> x) {
+  return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> expm1(realvec<real_t, size> x) {
+  return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fabs(realvec<real_t, size> x) {
+  return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> floor(realvec<real_t, size> x) {
+  return x.floor();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fdim(realvec<real_t, size> x,
+                                  realvec<real_t, size> y) {
+  return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+fma(realvec<real_t, size> x, realvec<real_t, size> y, realvec<real_t, size> z) {
+  return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fmax(realvec<real_t, size> x,
+                                  realvec<real_t, size> y) {
+  return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fmin(realvec<real_t, size> x,
+                                  realvec<real_t, size> y) {
+  return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fmod(realvec<real_t, size> x,
+                                  realvec<real_t, size> y) {
+  return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> frexp(realvec<real_t, size> x,
+                                   intvec<real_t, size> *r) {
+  return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> hypot(realvec<real_t, size> x,
+                                   realvec<real_t, size> y) {
+  return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> ilogb(realvec<real_t, size> x) {
+  return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isfinite(realvec<real_t, size> x) {
+  return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isinf(realvec<real_t, size> x) {
+  return x.isinf();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isnan(realvec<real_t, size> x) {
+  return x.isnan();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isnormal(realvec<real_t, size> x) {
+  return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ldexp(realvec<real_t, size> x,
+                                   typename intvec<real_t, size>::int_t n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ldexp(realvec<real_t, size> x,
+                                   intvec<real_t, size> n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log(realvec<real_t, size> x) {
+  return x.log();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log10(realvec<real_t, size> x) {
+  return x.log10();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log1p(realvec<real_t, size> x) {
+  return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log2(realvec<real_t, size> x) {
+  return x.log2();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> lrint(realvec<real_t, size> x) {
+  return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+mad(realvec<real_t, size> x, realvec<real_t, size> y, realvec<real_t, size> z) {
+  return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> nextafter(realvec<real_t, size> x,
+                                       realvec<real_t, size> y) {
+  return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> pow(realvec<real_t, size> x,
+                                 realvec<real_t, size> y) {
+  return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> rcp(realvec<real_t, size> x) {
+  return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> remainder(realvec<real_t, size> x,
+                                       realvec<real_t, size> y) {
+  return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> rint(realvec<real_t, size> x) {
+  return x.rint();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> round(realvec<real_t, size> x) {
+  return x.round();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> rsqrt(realvec<real_t, size> x) {
+  return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> signbit(realvec<real_t, size> x) {
+  return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> sin(realvec<real_t, size> x) {
+  return x.sin();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> sinh(realvec<real_t, size> x) {
+  return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> sqrt(realvec<real_t, size> x) {
+  return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> tan(realvec<real_t, size> x) {
+  return x.tan();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> tanh(realvec<real_t, size> x) {
+  return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> trunc(realvec<real_t, size> x) {
+  return x.trunc();
+}
 
-  template<typename real_t, int size>
-  inline realvec<real_t, size> mad(realvec<real_t, size> x,
-                                   realvec<real_t, size> y,
-                                   realvec<real_t, size> z)
-  {
-    return x.mad(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> nextafter(realvec<real_t, size> x,
-                                         realvec<real_t, size> y)
-  {
-    return x.nextafter(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> pow(realvec<real_t, size> x,
-                                   realvec<real_t, size> y)
-  {
-    return x.pow(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> rcp(realvec<real_t, size> x)
-  {
-    return x.rcp();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> remainder(realvec<real_t, size> x,
-                                         realvec<real_t, size> y)
-  {
-    return x.remainder(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> rint(realvec<real_t, size> x)
-  {
-    return x.rint();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> round(realvec<real_t, size> x)
-  {
-    return x.round();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> rsqrt(realvec<real_t, size> x)
-  {
-    return x.rsqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> signbit(realvec<real_t, size> x)
-  {
-    return x.signbit();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> sin(realvec<real_t, size> x)
-  {
-    return x.sin();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> sinh(realvec<real_t, size> x)
-  {
-    return x.sinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> sqrt(realvec<real_t, size> x)
-  {
-    return x.sqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> tan(realvec<real_t, size> x)
-  {
-    return x.tan();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> tanh(realvec<real_t, size> x)
-  {
-    return x.tanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> trunc(realvec<real_t, size> x)
-  {
-    return x.trunc();
-  }
-  
-  
-  
 #ifndef VML_NO_IOSTREAM
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os, boolvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os, intvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os, realvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, boolvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, intvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, realvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
 #endif
-  
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_BASE_H
+#endif // #ifndef VEC_BASE_H
diff --git a/lib/kernel/vecmathlib/vec_builtin.h b/lib/kernel/vecmathlib/vec_builtin.h
index d1afca7..2f1ff90 100644
--- a/lib/kernel/vecmathlib/vec_builtin.h
+++ b/lib/kernel/vecmathlib/vec_builtin.h
@@ -12,1450 +12,1253 @@
 #include <cmath>
 #include <cstring>
 #ifndef VML_NO_IOSTREAM
-#  include <sstream>
+#include <sstream>
 #endif
 #include <string>
 
+namespace vecmathlib {
 
+template <typename T, int N> struct boolbuiltinvec;
+template <typename T, int N> struct intbuiltinvec;
+template <typename T, int N> struct realbuiltinvec;
 
-namespace vecmathlib {
-  
-  template<typename T, int N> struct boolbuiltinvec;
-  template<typename T, int N> struct intbuiltinvec;
-  template<typename T, int N> struct realbuiltinvec;
-  
-  
-  
-  template<typename T, int N>
-  struct boolbuiltinvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static const int size = N;
-    typedef bool scalar_t;
-    typedef int_t bvector_t __attribute__((__ext_vector_type__(N)));
-    static const int alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true is -1, false is 0
-    static int_t from_bool(bool a) { return -uint_t(a); }
-    static bool to_bool(int_t a) { return a; }
-  public:
-    
-    typedef boolbuiltinvec boolvec_t;
-    typedef intbuiltinvec<real_t, size> intvec_t;
-    typedef realbuiltinvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolbuiltinvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolbuiltinvec(const boolbuiltinvec& x): v(x.v) {}
-    // boolbuiltinvec& operator=(const boolbuiltinvec& x) { return v=x.v, *this; }
-    // Can't have a constructor from bvector_t, since this would
-    // conflict with the constructor from bool
-    // boolbuiltinvec(bvector_t x): v(x) {}
-    static boolvec_t mkvec(bvector_t x) { boolvec_t res; res.v=x; return res; }
-    boolbuiltinvec(bool a): v(from_bool(a)) {}
-    boolbuiltinvec(const bool* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return to_bool(v[n]); }
-    boolvec_t& set_elt(int n, bool a) { return v[n]=from_bool(a), *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intbuiltinvec
-    intvec_t convert_int() const; // defined after intbuiltinvec
-    
-    
-    
-    boolvec_t operator!() const { return mkvec(!v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return mkvec(v && x.v); }
-    boolvec_t operator||(boolvec_t x) const { return mkvec(v || x.v); }
-    boolvec_t operator==(boolvec_t x) const { return mkvec(v == x.v); }
-    boolvec_t operator!=(boolvec_t x) const { return mkvec(v != x.v); }
-    
-    bool all() const
-    {
-      bool res = (*this)[0];
-      for (int d=1; d<size; ++d) res = res && (*this)[d];
-      return res;
-    }
-    bool any() const
-    {
-      bool res = (*this)[0];
-      for (int d=1; d<size; ++d) res = res || (*this)[d];
-      return res;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intbuiltinvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realbuiltinvec
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct intbuiltinvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static const int size = N;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t __attribute__((__ext_vector_type__(N)));
-    typedef uint_t uvector_t __attribute__((__ext_vector_type__(N)));
-    static const int alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    static_assert(size * sizeof(real_t) == sizeof(uvector_t),
-                  "vector size is wrong");
-    
-    typedef boolbuiltinvec<real_t, size> boolvec_t;
-    typedef intbuiltinvec intvec_t;
-    typedef realbuiltinvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intbuiltinvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intbuiltinvec(const intbuiltinvec& x): v(x.v) {}
-    // intbuiltinvec& operator=(const intbuiltinvec& x) { return v=x.v, *this; }
-    // Can't have a constructor from ivector_t, since this would
-    // conflict with the constructor from int_t
-    // intbuiltinvec(ivector_t x): v(x) {}
-    static intvec_t mkvec(ivector_t x) { intvec_t res; res.v=x; return res; }
-    intbuiltinvec(int_t a): v(a) {}
-    intbuiltinvec(const int_t* as) { std::memcpy(&v, as, sizeof v); }
-    static intvec_t iota()
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.set_elt(d, d);
-      return res;
-    }
-    
-    int_t operator[](int n) const { return v[n]; }
-    intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const
-    {
-      boolvec_t res;
-      std::memcpy(&res.v, &v, sizeof res.v);
-      return res;
-    }
-    boolvec_t convert_bool() const { return *this != IV(I(0)); }
-    realvec_t as_float() const;      // defined after realbuiltinvec
-    realvec_t convert_float() const; // defined after realbuiltinvec
-    
-    
-    
-    intvec_t operator+() const { return mkvec(+v); }
-    intvec_t operator-() const { return mkvec(-v); }
-    
-    intvec_t operator+(intvec_t x) const { return mkvec(v + x.v); }
-    intvec_t operator-(intvec_t x) const { return mkvec(v - x.v); }
-    intvec_t operator*(intvec_t x) const { return mkvec(v * x.v); }
-    intvec_t operator/(intvec_t x) const { return mkvec(v / x.v); }
-    intvec_t operator%(intvec_t x) const { return mkvec(v % x.v); }
-    
-    intvec_t& operator+=(const intvec_t& x) { return *this=*this+x; }
-    intvec_t& operator-=(const intvec_t& x) { return *this=*this-x; }
-    intvec_t& operator*=(const intvec_t& x) { return *this=*this*x; }
-    intvec_t& operator/=(const intvec_t& x) { return *this=*this/x; }
-    intvec_t& operator%=(const intvec_t& x) { return *this=*this%x; }
-    
-    
-    
-    intvec_t operator~() const { return mkvec(~v); }
-    
-    intvec_t operator&(intvec_t x) const { return mkvec(v & x.v); }
-    intvec_t operator|(intvec_t x) const { return mkvec(v | x.v); }
-    intvec_t operator^(intvec_t x) const { return mkvec(v ^ x.v); }
-    
-    intvec_t& operator&=(const intvec_t& x) { return *this=*this&x; }
-    intvec_t& operator|=(const intvec_t& x) { return *this=*this|x; }
-    intvec_t& operator^=(const intvec_t& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const
-    {
-      return MF::vml_bitifthen(*this, x, y);
-    }
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      return mkvec(ivector_t(uvector_t(v) >> U(n)));
-    }
-    intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
-    intvec_t operator>>(int_t n) const { return mkvec(v >> n); }
-    intvec_t operator<<(int_t n) const { return mkvec(v << n); }
-    
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      return mkvec(ivector_t(uvector_t(v)>>uvector_t(n.v)));
-    }
-    intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
-    intvec_t operator>>(intvec_t n) const { return mkvec(v >> n.v); }
-    intvec_t operator<<(intvec_t n) const { return mkvec(v << n.v); }
-    
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) {
-        int_t val = (*this)[d];
-        int_t cnt = val == 0 ? CHAR_BIT * sizeof val : builtin_clz(U(val));
-        res.set_elt(d, cnt);
-      }
-      return res;
-    }
-    intvec_t popcount() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_popcount(U((*this)[d])));
-      }
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v == x.v);
-    }
-    boolvec_t operator!=(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v != x.v);
-    }
-    boolvec_t operator<(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v < x.v);
-    }
-    boolvec_t operator<=(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v <= x.v);
-    }
-    boolvec_t operator>(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v > x.v);
-    }
-    boolvec_t operator>=(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v >= x.v);
+template <typename T, int N> struct boolbuiltinvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static const int size = N;
+  typedef bool scalar_t;
+  typedef int_t bvector_t __attribute__((__ext_vector_type__(N)));
+  static const int alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true is -1, false is 0
+  static int_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(int_t a) { return a; }
+
+public:
+  typedef boolbuiltinvec boolvec_t;
+  typedef intbuiltinvec<real_t, size> intvec_t;
+  typedef realbuiltinvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolbuiltinvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolbuiltinvec(const boolbuiltinvec& x): v(x.v) {}
+  // boolbuiltinvec& operator=(const boolbuiltinvec& x) { return v=x.v, *this; }
+  // Can't have a constructor from bvector_t, since this would
+  // conflict with the constructor from bool
+  // boolbuiltinvec(bvector_t x): v(x) {}
+  static boolvec_t mkvec(bvector_t x) {
+    boolvec_t res;
+    res.v = x;
+    return res;
+  }
+  boolbuiltinvec(bool a) : v(from_bool(a)) {}
+  boolbuiltinvec(const bool *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return to_bool(v[n]); }
+  boolvec_t &set_elt(int n, bool a) { return v[n] = from_bool(a), *this; }
+
+  intvec_t as_int() const;      // defined after intbuiltinvec
+  intvec_t convert_int() const; // defined after intbuiltinvec
+
+  boolvec_t operator!() const { return mkvec(!v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return mkvec(v && x.v); }
+  boolvec_t operator||(boolvec_t x) const { return mkvec(v || x.v); }
+  boolvec_t operator==(boolvec_t x) const { return mkvec(v == x.v); }
+  boolvec_t operator!=(boolvec_t x) const { return mkvec(v != x.v); }
+
+  bool all() const {
+    bool res = (*this)[0];
+    for (int d = 1; d < size; ++d)
+      res = res && (*this)[d];
+    return res;
+  }
+  bool any() const {
+    bool res = (*this)[0];
+    for (int d = 1; d < size; ++d)
+      res = res || (*this)[d];
+    return res;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intbuiltinvec
+  realvec_t ifthen(realvec_t x,
+                   realvec_t y) const; // defined after realbuiltinvec
+};
+
+template <typename T, int N> struct intbuiltinvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static const int size = N;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t __attribute__((__ext_vector_type__(N)));
+  typedef uint_t uvector_t __attribute__((__ext_vector_type__(N)));
+  static const int alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+  static_assert(size * sizeof(real_t) == sizeof(uvector_t),
+                "vector size is wrong");
+
+  typedef boolbuiltinvec<real_t, size> boolvec_t;
+  typedef intbuiltinvec intvec_t;
+  typedef realbuiltinvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intbuiltinvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intbuiltinvec(const intbuiltinvec& x): v(x.v) {}
+  // intbuiltinvec& operator=(const intbuiltinvec& x) { return v=x.v, *this; }
+  // Can't have a constructor from ivector_t, since this would
+  // conflict with the constructor from int_t
+  // intbuiltinvec(ivector_t x): v(x) {}
+  static intvec_t mkvec(ivector_t x) {
+    intvec_t res;
+    res.v = x;
+    return res;
+  }
+  intbuiltinvec(int_t a) : v(a) {}
+  intbuiltinvec(const int_t *as) { std::memcpy(&v, as, sizeof v); }
+  static intvec_t iota() {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.set_elt(d, d);
+    return res;
+  }
+
+  int_t operator[](int n) const { return v[n]; }
+  intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; }
+
+  boolvec_t as_bool() const {
+    boolvec_t res;
+    std::memcpy(&res.v, &v, sizeof res.v);
+    return res;
+  }
+  boolvec_t convert_bool() const { return *this != IV(I(0)); }
+  realvec_t as_float() const;      // defined after realbuiltinvec
+  realvec_t convert_float() const; // defined after realbuiltinvec
+
+  intvec_t operator+() const { return mkvec(+v); }
+  intvec_t operator-() const { return mkvec(-v); }
+
+  intvec_t operator+(intvec_t x) const { return mkvec(v + x.v); }
+  intvec_t operator-(intvec_t x) const { return mkvec(v - x.v); }
+  intvec_t operator*(intvec_t x) const { return mkvec(v * x.v); }
+  intvec_t operator/(intvec_t x) const { return mkvec(v / x.v); }
+  intvec_t operator%(intvec_t x) const { return mkvec(v % x.v); }
+
+  intvec_t &operator+=(const intvec_t &x) { return *this = *this + x; }
+  intvec_t &operator-=(const intvec_t &x) { return *this = *this - x; }
+  intvec_t &operator*=(const intvec_t &x) { return *this = *this * x; }
+  intvec_t &operator/=(const intvec_t &x) { return *this = *this / x; }
+  intvec_t &operator%=(const intvec_t &x) { return *this = *this % x; }
+
+  intvec_t operator~() const { return mkvec(~v); }
+
+  intvec_t operator&(intvec_t x) const { return mkvec(v & x.v); }
+  intvec_t operator|(intvec_t x) const { return mkvec(v | x.v); }
+  intvec_t operator^(intvec_t x) const { return mkvec(v ^ x.v); }
+
+  intvec_t &operator&=(const intvec_t &x) { return *this = *this & x; }
+  intvec_t &operator|=(const intvec_t &x) { return *this = *this | x; }
+  intvec_t &operator^=(const intvec_t &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const {
+    return MF::vml_bitifthen(*this, x, y);
+  }
+
+  intvec_t lsr(int_t n) const { return mkvec(ivector_t(uvector_t(v) >> U(n))); }
+  intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
+  intvec_t operator>>(int_t n) const { return mkvec(v >> n); }
+  intvec_t operator<<(int_t n) const { return mkvec(v << n); }
+
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    return mkvec(ivector_t(uvector_t(v) >> uvector_t(n.v)));
+  }
+  intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
+  intvec_t operator>>(intvec_t n) const { return mkvec(v >> n.v); }
+  intvec_t operator<<(intvec_t n) const { return mkvec(v << n.v); }
+
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d) {
+      int_t val = (*this)[d];
+      int_t cnt = val == 0 ? CHAR_BIT * sizeof val : builtin_clz(U(val));
+      res.set_elt(d, cnt);
     }
-    
-    intvec_t abs() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.set_elt(d, builtin_abs((*this)[d]));
-      return res;
+    return res;
+  }
+  intvec_t popcount() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_popcount(U((*this)[d])));
     }
-    
-    boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
-    
-    intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
-    intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct realbuiltinvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static const int size = N;
-    typedef real_t scalar_t;
-    typedef real_t vector_t __attribute__((__ext_vector_type__(N)));
-    static const int alignment = sizeof(vector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
+    return res;
+  }
+
+  boolvec_t operator==(const intvec_t &x) const {
+    return boolvec_t::mkvec(v == x.v);
+  }
+  boolvec_t operator!=(const intvec_t &x) const {
+    return boolvec_t::mkvec(v != x.v);
+  }
+  boolvec_t operator<(const intvec_t &x) const {
+    return boolvec_t::mkvec(v < x.v);
+  }
+  boolvec_t operator<=(const intvec_t &x) const {
+    return boolvec_t::mkvec(v <= x.v);
+  }
+  boolvec_t operator>(const intvec_t &x) const {
+    return boolvec_t::mkvec(v > x.v);
+  }
+  boolvec_t operator>=(const intvec_t &x) const {
+    return boolvec_t::mkvec(v >= x.v);
+  }
+
+  intvec_t abs() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.set_elt(d, builtin_abs((*this)[d]));
+    return res;
+  }
+
+  boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
+
+  intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
+  intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
+};
+
+template <typename T, int N> struct realbuiltinvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static const int size = N;
+  typedef real_t scalar_t;
+  typedef real_t vector_t __attribute__((__ext_vector_type__(N)));
+  static const int alignment = sizeof(vector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
 #ifndef VML_NO_IOSTREAM
-    static const char* name()
-    {
-      static std::string name_;
-      if (name_.empty()) {
-        std::stringstream buf;
-        buf << "<builtin:" << N << "*" << FP::name() << ">";
-        name_ = buf.str();
-      }
-      return name_.c_str();
+  static const char *name() {
+    static std::string name_;
+    if (name_.empty()) {
+      std::stringstream buf;
+      buf << "<builtin:" << N << "*" << FP::name() << ">";
+      name_ = buf.str();
     }
+    return name_.c_str();
+  }
 #endif
-    void barrier() { volatile vector_t x __attribute__((__unused__)) = v; }
-    
-    typedef boolbuiltinvec<real_t, size> boolvec_t;
-    typedef intbuiltinvec<real_t, size> intvec_t;
-    typedef realbuiltinvec realvec_t;
-    
-  private:
-    boolvec_t mapb(bool f(real_t)) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    intvec_t map(int_t f(real_t)) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t)) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, int_t), intvec_t x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, int_t*), intvec_t* x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) {
-        int_t ix;
-        res.v[d] = f(v[d], &ix);
-        x->set_elt(d, ix);
-      }
-      return res;
-    }
-    realvec_t map(real_t f(real_t, real_t), realvec_t x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, real_t, real_t),
-                  realvec_t x, realvec_t y) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d], y.v[d]);
-      return res;
+  void barrier() { volatile vector_t x __attribute__((__unused__)) = v; }
+
+  typedef boolbuiltinvec<real_t, size> boolvec_t;
+  typedef intbuiltinvec<real_t, size> intvec_t;
+  typedef realbuiltinvec realvec_t;
+
+private:
+  boolvec_t mapb(bool f(real_t)) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d]);
+    return res;
+  }
+  intvec_t map(int_t f(real_t)) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t)) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t, int_t), intvec_t x) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d], x.v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t, int_t *), intvec_t *x) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d) {
+      int_t ix;
+      res.v[d] = f(v[d], &ix);
+      x->set_elt(d, ix);
     }
-  public:
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realbuiltinvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realbuiltinvec(const realbuiltinvec& x): v(x.v) {}
-    // realbuiltinvec& operator=(const realbuiltinvec& x) { return v=x.v, *this; }
-    // Can't have a constructor from vector_t, since this would
-    // conflict with the constructor from real_t
-    // realbuiltinvec(vector_t x): v(x) {}
-    static realvec_t mkvec(vector_t x) { realvec_t res; res.v=x; return res; }
-    realbuiltinvec(real_t a): v(a) {}
-    realbuiltinvec(const real_t* as) { std::memcpy(&v, as, sizeof v); }
-    
-    real_t operator[](int n) const { return v[n]; }
-    realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(const real_t* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
+    return res;
+  }
+  realvec_t map(real_t f(real_t, real_t), realvec_t x) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d], x.v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t, real_t, real_t), realvec_t x,
+                realvec_t y) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d], x.v[d], y.v[d]);
+    return res;
+  }
+
+public:
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realbuiltinvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realbuiltinvec(const realbuiltinvec& x): v(x.v) {}
+  // realbuiltinvec& operator=(const realbuiltinvec& x) { return v=x.v, *this; }
+  // Can't have a constructor from vector_t, since this would
+  // conflict with the constructor from real_t
+  // realbuiltinvec(vector_t x): v(x) {}
+  static realvec_t mkvec(vector_t x) {
+    realvec_t res;
+    res.v = x;
+    return res;
+  }
+  realbuiltinvec(real_t a) : v(a) {}
+  realbuiltinvec(const real_t *as) { std::memcpy(&v, as, sizeof v); }
+
+  real_t operator[](int n) const { return v[n]; }
+  realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(const real_t *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
 #if __has_builtin(__builtin_assume_aligned)
-      p = (const real_t*)__builtin_assume_aligned(p, sizeof(realvec_t));
+    p = (const real_t *)__builtin_assume_aligned(p, sizeof(realvec_t));
 #endif
-      return mkvec(*(const vector_t*)p);
-    }
-    static realvec_t loadu(const real_t* p)
-    {
-      // return mkvec(*(const vector_t*)p);
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.set_elt(d, p[d]);
-      return res;
-      // realvec_t res;
-      // memcpy(&res.v, p, sizeof res.v);
-      // return res;
-    }
-    static realvec_t loadu(const real_t* p, size_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(const real_t* p, const mask_t& m) const
-    {
-      return m.m.ifthen(loada(p), *this);
-    }
-    realvec_t loadu(const real_t* p, const mask_t& m) const
-    {
-      return m.m.ifthen(loadu(p), *this);
-    }
-    realvec_t loadu(const real_t* p, size_t ioff, const mask_t& m) const
-    {
-      return m.m.ifthen(loadu(p, ioff), *this);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
+    return mkvec(*(const vector_t *)p);
+  }
+  static realvec_t loadu(const real_t *p) {
+    // return mkvec(*(const vector_t*)p);
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.set_elt(d, p[d]);
+    return res;
+    // realvec_t res;
+    // memcpy(&res.v, p, sizeof res.v);
+    // return res;
+  }
+  static realvec_t loadu(const real_t *p, size_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(const real_t *p, const mask_t &m) const {
+    return m.m.ifthen(loada(p), *this);
+  }
+  realvec_t loadu(const real_t *p, const mask_t &m) const {
+    return m.m.ifthen(loadu(p), *this);
+  }
+  realvec_t loadu(const real_t *p, size_t ioff, const mask_t &m) const {
+    return m.m.ifthen(loadu(p, ioff), *this);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
 #if __has_builtin(__builtin_assume_aligned)
-      p = __builtin_assume_aligned(p, sizeof(realvec_t));
+    p = (real_t *)__builtin_assume_aligned(p, sizeof(realvec_t));
 #endif
-      *(vector_t*)p = v;
-    }
-    void storeu(real_t* p) const
-    {
-      // *(vector_t*)p = v;
-      for (int d=0; d<size; ++d) p[d] = (*this)[d];
-      // memcpy(p, &v, sizeof res.v);
-    }
-    void storeu(real_t* p, size_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p, m);
-    }
-    void storeu(real_t* p, const mask_t& m) const
-    {
-      for (int d=0; d<size; ++d) if (m.m[d]) p[d] = (*this)[d];
-    }
-    void storeu(real_t* p, size_t ioff, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const
-    {
-      intvec_t res;
-      std::memcpy(&res.v, &v, sizeof res.v);
-      return res;
-    }
-    intvec_t convert_int() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.set_elt(d, int_t((*this)[d]));
-      return res;
-    }
-    
-    
-    
-    realvec_t operator+() const { return mkvec(+v); }
-    realvec_t operator-() const { return mkvec(-v); }
-    
-    realvec_t operator+(realvec_t x) const { return mkvec(v + x.v); }
-    realvec_t operator-(realvec_t x) const { return mkvec(v - x.v); }
-    realvec_t operator*(realvec_t x) const { return mkvec(v * x.v); }
-    realvec_t operator/(realvec_t x) const { return mkvec(v / x.v); }
-    
-    realvec_t& operator+=(const realvec_t& x) { return *this=*this+x; }
-    realvec_t& operator-=(const realvec_t& x) { return *this=*this-x; }
-    realvec_t& operator*=(const realvec_t& x) { return *this=*this*x; }
-    realvec_t& operator/=(const realvec_t& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) {
-        res = builtin_fmax(res, (*this)[d]);
-      }
-      return res;
-    }
-    real_t minval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) {
-        res = builtin_fmin(res, (*this)[d]);
-      }
-      return res;
-    }
-    real_t prod() const
-    {
-      real_t res = (*this)[0];
-      for (int d=1; d<size; ++d) res *= (*this)[d];
-      return res;
-    }
-    real_t sum() const
-    {
-      real_t res = (*this)[0];
-      for (int d=1; d<size; ++d) res += (*this)[d];
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v == x.v);
-    }
-    boolvec_t operator!=(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v != x.v);
-    }
-    boolvec_t operator<(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v < x.v);
-    }
-    boolvec_t operator<=(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v <= x.v);
-    }
-    boolvec_t operator>(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v > x.v);
-    }
-    boolvec_t operator>=(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v >= x.v);
-    }
-    
-    
-    
-    realvec_t acos() const { return map(builtin_acos); }
-    realvec_t acosh() const { return map(builtin_acosh); }
-    realvec_t asin() const { return map(builtin_asin); }
-    realvec_t asinh() const { return map(builtin_asinh); }
-    realvec_t atan() const { return map(builtin_atan); }
-    realvec_t atan2(realvec_t y) const { return map(builtin_atan2, y); }
-    realvec_t atanh() const { return map(builtin_atanh); }
-    realvec_t cbrt() const { return map(builtin_cbrt); }
-    realvec_t ceil() const { return map(builtin_ceil); }
-    realvec_t copysign(realvec_t y) const { return map(builtin_copysign, y); }
-    realvec_t cos() const { return map(builtin_cos); }
-    realvec_t cosh() const { return map(builtin_cosh); }
-    realvec_t exp() const { return map(builtin_exp); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return map(builtin_exp2); }
-    realvec_t expm1() const { return map(builtin_expm1); }
-    realvec_t fabs() const { return map(builtin_fabs); }
-    realvec_t fdim(realvec_t y) const { return map(builtin_fdim, y); }
-    realvec_t floor() const { return map(builtin_floor); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return map(builtin_fma, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return map(builtin_fmax, y); }
-    realvec_t fmin(realvec_t y) const { return map(builtin_fmin, y); }
-    realvec_t fmod(realvec_t y) const { return map(builtin_fmod, y); }
-    realvec_t frexp(intvec_t* r) const
-    {
-      realvec_t res;
-      intvec_t exp;
-      for (int d=0; d<size; ++d) {
-        real_t val = (*this)[d];
-        int iexp;
-        res.set_elt(d, __builtin_frexp(val, &iexp));
-        int_t jexp = int_t(iexp);
-        if (__builtin_isinf(val)) jexp = std::numeric_limits<int_t>::max();
-        if (__builtin_isnan(val)) jexp = std::numeric_limits<int_t>::min();
-        exp.set_elt(d, jexp);
-      }
-      *r = exp;
-      return res;
-    }
-    realvec_t hypot(realvec_t y) const { return map(builtin_hypot, y); }
-    intvec_t ilogb() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) {
-        real_t val = (*this)[d];
-        int iexp = __builtin_ilogb(val);
-        int_t jexp = int_t(iexp);
-        if (val == R(0.0)) jexp = std::numeric_limits<int_t>::min();
-        if (__builtin_isinf(val)) jexp = std::numeric_limits<int_t>::max();
-        if (__builtin_isnan(val)) jexp = std::numeric_limits<int_t>::min();
-        res.set_elt(d, jexp);
-      }
-      return res;
-    }
-    boolvec_t isfinite() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_isfinite((*this)[d]) != 0);
-      }
-      return res;
-    }
-    boolvec_t isinf() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_isinf((*this)[d]) != 0);
-      }
-      return res;
-    }
-    boolvec_t isnan() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_isnan((*this)[d]) != 0);
-      }
-      return res;
-    }
-    boolvec_t isnormal() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_isnormal((*this)[d]) != 0);
-      }
-      return res;
-    }
-    realvec_t ldexp(int_t n) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_ldexp((*this)[d], int(n)));
-      }
-      return res;
-    }
-    realvec_t ldexp(intvec_t n) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_ldexp((*this)[d], int(n[d])));
-      }
-      return res;
+    *(vector_t *)p = v;
+  }
+  void storeu(real_t *p) const {
+    // *(vector_t*)p = v;
+    for (int d = 0; d < size; ++d)
+      p[d] = (*this)[d];
+    // memcpy(p, &v, sizeof res.v);
+  }
+  void storeu(real_t *p, size_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p, m);
+  }
+  void storeu(real_t *p, const mask_t &m) const {
+    for (int d = 0; d < size; ++d)
+      if (m.m[d])
+        p[d] = (*this)[d];
+  }
+  void storeu(real_t *p, size_t ioff, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const {
+    intvec_t res;
+    std::memcpy(&res.v, &v, sizeof res.v);
+    return res;
+  }
+  intvec_t convert_int() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.set_elt(d, int_t((*this)[d]));
+    return res;
+  }
+
+  realvec_t operator+() const { return mkvec(+v); }
+  realvec_t operator-() const { return mkvec(-v); }
+
+  realvec_t operator+(realvec_t x) const { return mkvec(v + x.v); }
+  realvec_t operator-(realvec_t x) const { return mkvec(v - x.v); }
+  realvec_t operator*(realvec_t x) const { return mkvec(v * x.v); }
+  realvec_t operator/(realvec_t x) const { return mkvec(v / x.v); }
+
+  realvec_t &operator+=(const realvec_t &x) { return *this = *this + x; }
+  realvec_t &operator-=(const realvec_t &x) { return *this = *this - x; }
+  realvec_t &operator*=(const realvec_t &x) { return *this = *this * x; }
+  realvec_t &operator/=(const realvec_t &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d) {
+      res = builtin_fmax(res, (*this)[d]);
     }
-    realvec_t log() const { return map(builtin_log); }
-    realvec_t log10() const { return map(builtin_log10); }
-    realvec_t log1p() const { return map(builtin_log1p); }
-    realvec_t log2() const { return map(builtin_log2); }
-    intvec_t lrint() const
-    {
-      if (sizeof(int_t) <= sizeof(long)) {
-        return map(builtin_lrint);
-      } else if (sizeof(int_t) <= sizeof(long long)) {
-        return map(builtin_llrint);
-      }
-      __builtin_unreachable();
+    return res;
+  }
+  real_t minval() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d) {
+      res = builtin_fmin(res, (*this)[d]);
     }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
+    return res;
+  }
+  real_t prod() const {
+    real_t res = (*this)[0];
+    for (int d = 1; d < size; ++d)
+      res *= (*this)[d];
+    return res;
+  }
+  real_t sum() const {
+    real_t res = (*this)[0];
+    for (int d = 1; d < size; ++d)
+      res += (*this)[d];
+    return res;
+  }
+
+  boolvec_t operator==(const realvec_t &x) const {
+    return boolvec_t::mkvec(v == x.v);
+  }
+  boolvec_t operator!=(const realvec_t &x) const {
+    return boolvec_t::mkvec(v != x.v);
+  }
+  boolvec_t operator<(const realvec_t &x) const {
+    return boolvec_t::mkvec(v < x.v);
+  }
+  boolvec_t operator<=(const realvec_t &x) const {
+    return boolvec_t::mkvec(v <= x.v);
+  }
+  boolvec_t operator>(const realvec_t &x) const {
+    return boolvec_t::mkvec(v > x.v);
+  }
+  boolvec_t operator>=(const realvec_t &x) const {
+    return boolvec_t::mkvec(v >= x.v);
+  }
+
+  realvec_t acos() const { return map(builtin_acos); }
+  realvec_t acosh() const { return map(builtin_acosh); }
+  realvec_t asin() const { return map(builtin_asin); }
+  realvec_t asinh() const { return map(builtin_asinh); }
+  realvec_t atan() const { return map(builtin_atan); }
+  realvec_t atan2(realvec_t y) const { return map(builtin_atan2, y); }
+  realvec_t atanh() const { return map(builtin_atanh); }
+  realvec_t cbrt() const { return map(builtin_cbrt); }
+  realvec_t ceil() const { return map(builtin_ceil); }
+  realvec_t copysign(realvec_t y) const { return map(builtin_copysign, y); }
+  realvec_t cos() const { return map(builtin_cos); }
+  realvec_t cosh() const { return map(builtin_cosh); }
+  realvec_t exp() const { return map(builtin_exp); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return map(builtin_exp2); }
+  realvec_t expm1() const { return map(builtin_expm1); }
+  realvec_t fabs() const { return map(builtin_fabs); }
+  realvec_t fdim(realvec_t y) const { return map(builtin_fdim, y); }
+  realvec_t floor() const { return map(builtin_floor); }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return map(builtin_fma, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return map(builtin_fmax, y); }
+  realvec_t fmin(realvec_t y) const { return map(builtin_fmin, y); }
+  realvec_t fmod(realvec_t y) const { return map(builtin_fmod, y); }
+  realvec_t frexp(intvec_t *r) const {
+    realvec_t res;
+    intvec_t exp;
+    for (int d = 0; d < size; ++d) {
+      real_t val = (*this)[d];
+      int iexp;
+      res.set_elt(d, __builtin_frexp(val, &iexp));
+      int_t jexp = int_t(iexp);
+      if (__builtin_isinf(val))
+        jexp = std::numeric_limits<int_t>::max();
+      if (__builtin_isnan(val))
+        jexp = std::numeric_limits<int_t>::min();
+      exp.set_elt(d, jexp);
+    }
+    *r = exp;
+    return res;
+  }
+  realvec_t hypot(realvec_t y) const { return map(builtin_hypot, y); }
+  intvec_t ilogb() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d) {
+      real_t val = (*this)[d];
+      int iexp = __builtin_ilogb(val);
+      int_t jexp = int_t(iexp);
+      if (val == R(0.0))
+        jexp = std::numeric_limits<int_t>::min();
+      if (__builtin_isinf(val))
+        jexp = std::numeric_limits<int_t>::max();
+      if (__builtin_isnan(val))
+        jexp = std::numeric_limits<int_t>::min();
+      res.set_elt(d, jexp);
     }
-    realvec_t nextafter(realvec_t y) const { return map(builtin_nextafter, y); }
-    realvec_t pow(realvec_t y) const { return map(builtin_pow, y); }
-    realvec_t rcp() const { return RV(1.0) / *this; }
-    realvec_t remainder(realvec_t y) const { return map(builtin_remainder, y); }
-    realvec_t rint() const { return map(builtin_rint); }
-    realvec_t round() const { return map(builtin_round); }
-    realvec_t rsqrt() const { return RV(1.0) / sqrt(); }
-    boolvec_t signbit() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_signbit((*this)[d]) != 0);
-      }
-      return res;
+    return res;
+  }
+  boolvec_t isfinite() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_isfinite((*this)[d]) != 0);
     }
-    realvec_t sin() const { return map(builtin_sin); }
-    realvec_t sinh() const { return map(builtin_sinh); }
-    realvec_t sqrt() const { return map(builtin_sqrt); }
-    realvec_t tan() const { return map(builtin_tan); }
-    realvec_t tanh() const { return map(builtin_tanh); }
-    realvec_t trunc() const { return map(builtin_trunc); }
-  };
-  
-  
-  
-  // boolbuiltinvec definitions
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::intvec_t boolbuiltinvec<T,N>::as_int() const
-  {
-    intvec_t res;
-    std::memcpy(&res.v, &v, sizeof res.v);
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::intvec_t
-  boolbuiltinvec<T,N>::convert_int() const
-  {
-    return - as_int();
-  }
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::boolvec_t
-  boolbuiltinvec<T,N>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    // return v ? x.v : y.v;
+  boolvec_t isinf() const {
     boolvec_t res;
-    for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_isinf((*this)[d]) != 0);
+    }
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::intvec_t
-  boolbuiltinvec<T,N>::ifthen(intvec_t x, intvec_t y) const
-  {
-    // return v ? x.v : y.v;
-    intvec_t res;
-    for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  boolvec_t isnan() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_isnan((*this)[d]) != 0);
+    }
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::realvec_t
-  boolbuiltinvec<T,N>::ifthen(realvec_t x, realvec_t y) const
-  {
-    // return v ? x.v : y.v;
-    realvec_t res;
-    for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  boolvec_t isnormal() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_isnormal((*this)[d]) != 0);
+    }
     return res;
   }
-  
-  
-  
-  // intbuiltinvec definitions
-  
-  template<typename T, int N>
-  inline
-  typename intbuiltinvec<T,N>::realvec_t intbuiltinvec<T,N>::as_float() const
-  {
+  realvec_t ldexp(int_t n) const {
     realvec_t res;
-    std::memcpy(&res.v, &v, sizeof res.v);
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_ldexp((*this)[d], int(n)));
+    }
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename intbuiltinvec<T,N>::realvec_t
-  intbuiltinvec<T,N>::convert_float() const
-  {
+  realvec_t ldexp(intvec_t n) const {
     realvec_t res;
-    for (int d=0; d<size; ++d) res.set_elt(d, real_t((*this)[d]));
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_ldexp((*this)[d], int(n[d])));
+    }
     return res;
   }
-  
-  
-  
-  // Wrappers
-  
-  // boolbuiltinvec wrappers
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> as_int(boolbuiltinvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> convert_int(boolbuiltinvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline bool all(boolbuiltinvec<real_t, size> x) { return x.all(); }
-  
-  template<typename real_t, int size>
-  inline bool any(boolbuiltinvec<real_t, size> x) { return x.any(); }
-  
-  template<typename real_t, int size>
-  inline
-  boolbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
-                                      boolbuiltinvec<real_t, size> x,
-                                      boolbuiltinvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
-                                     intbuiltinvec<real_t, size> x,
-                                     intbuiltinvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
-                                      realbuiltinvec<real_t, size> x,
-                                      realbuiltinvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  
-  
-  // intbuiltinvec wrappers
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> abs(intbuiltinvec<real_t, size> x)
-  {
-    return x.abs();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> as_bool(intbuiltinvec<real_t, size> x)
-  {
-    return x.as_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> as_float(intbuiltinvec<real_t, size> x)
-  {
-    return x.as_float();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> bitifthen(intbuiltinvec<real_t, size> x,
-                                        intbuiltinvec<real_t, size> y,
-                                        intbuiltinvec<real_t, size> z)
-  {
-    return x.bitifthen(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> clz(intbuiltinvec<real_t, size> x)
-  {
-    return x.clz();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> convert_bool(intbuiltinvec<real_t, size> x)
-  {
-    return x.convert_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> convert_float(intbuiltinvec<real_t, size> x)
-  {
-    return x.convert_float();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isignbit(intbuiltinvec<real_t, size> x)
-  {
-    return x.isignbit();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x,
-                                  typename intbuiltinvec<real_t, size>::int_t n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x,
-                                  intbuiltinvec<real_t, size> n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> max(intbuiltinvec<real_t, size> x,
-                                  intbuiltinvec<real_t, size> y)
-  {
-    return x.max(y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> min(intbuiltinvec<real_t, size> x,
-                                  intbuiltinvec<real_t, size> y)
-  {
-    return x.min(y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> popcount(intbuiltinvec<real_t, size> x)
-  {
-    return x.popcount();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size>
-  rotate(intbuiltinvec<real_t, size> x,
-         typename intbuiltinvec<real_t, size>::int_t n)
-  {
-    return x.rotate(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> rotate(intbuiltinvec<real_t, size> x,
-                                     intbuiltinvec<real_t, size> n)
-  {
-    return x.rotate(n);
-  }
-  
-  
-  
-  // realbuiltinvec wrappers
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size>
-  loada(real_t const* p,
-        realbuiltinvec<real_t, size> x,
-        typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.loada(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size>
-  loadu(real_t const* p,
-        realbuiltinvec<real_t, size> x,
-        typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size>
-  loadu(real_t const* p, size_t ioff,
-        realbuiltinvec<real_t, size> x,
-        typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, ioff, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realbuiltinvec<real_t, size> x, real_t* p)
-  {
-    return x.storea(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realbuiltinvec<real_t, size> x, real_t* p)
-  {
-    return x.storeu(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realbuiltinvec<real_t, size> x, real_t* p, size_t ioff)
-  {
-    return x.storeu(p, ioff);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realbuiltinvec<real_t, size> x, real_t* p,
-                     typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.storea(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realbuiltinvec<real_t, size> x, real_t* p,
-                     typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realbuiltinvec<real_t, size> x, real_t* p, size_t ioff,
-                     typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, ioff, m);
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> as_int(realbuiltinvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> convert_int(realbuiltinvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t maxval(realbuiltinvec<real_t, size> x)
-  {
-    return x.maxval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t minval(realbuiltinvec<real_t, size> x)
-  {
-    return x.minval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t prod(realbuiltinvec<real_t, size> x)
-  {
-    return x.prod();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t sum(realbuiltinvec<real_t, size> x)
-  {
-    return x.sum();
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> acos(realbuiltinvec<real_t, size> x)
-  {
-    return x.acos();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> acosh(realbuiltinvec<real_t, size> x)
-  {
-    return x.acosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> asin(realbuiltinvec<real_t, size> x)
-  {
-    return x.asin();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> asinh(realbuiltinvec<real_t, size> x)
-  {
-    return x.asinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> atan(realbuiltinvec<real_t, size> x)
-  {
-    return x.atan();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> atan2(realbuiltinvec<real_t, size> x,
-                                            realbuiltinvec<real_t, size> y)
-  {
-    return x.atan2(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> atanh(realbuiltinvec<real_t, size> x)
-  {
-    return x.atanh();
-  }
-    
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> cbrt(realbuiltinvec<real_t, size> x)
-  {
-    return x.cbrt();
-  }
-    
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> ceil(realbuiltinvec<real_t, size> x)
-  {
-    return x.ceil();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> copysign(realbuiltinvec<real_t, size> x,
-                                               realbuiltinvec<real_t, size> y)
-  {
-    return x.copysign(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> cos(realbuiltinvec<real_t, size> x)
-  {
-    return x.cos();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> cosh(realbuiltinvec<real_t, size> x)
-  {
-    return x.cosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> exp(realbuiltinvec<real_t, size> x)
-  {
-    return x.exp();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> exp10(realbuiltinvec<real_t, size> x)
-  {
-    return x.exp10();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> exp2(realbuiltinvec<real_t, size> x)
-  {
-    return x.exp2();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> expm1(realbuiltinvec<real_t, size> x)
-  {
-    return x.expm1();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fabs(realbuiltinvec<real_t, size> x)
-  {
-    return x.fabs();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> floor(realbuiltinvec<real_t, size> x)
-  {
-    return x.floor();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fdim(realbuiltinvec<real_t, size> x,
-                                        realbuiltinvec<real_t, size> y)
-  {
-    return x.fdim(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fma(realbuiltinvec<real_t, size> x,
-                                          realbuiltinvec<real_t, size> y,
-                                          realbuiltinvec<real_t, size> z)
-  {
-    return x.fma(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fmax(realbuiltinvec<real_t, size> x,
-                                           realbuiltinvec<real_t, size> y)
-  {
-    return x.fmax(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fmin(realbuiltinvec<real_t, size> x,
-                                           realbuiltinvec<real_t, size> y)
-  {
-    return x.fmin(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fmod(realbuiltinvec<real_t, size> x,
-                                           realbuiltinvec<real_t, size> y)
-  {
-    return x.fmod(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> frexp(realbuiltinvec<real_t, size> x,
-                                            intbuiltinvec<real_t, size>* r)
-  {
-    return x.frexp(r);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> hypot(realbuiltinvec<real_t, size> x,
-                                            realbuiltinvec<real_t, size> y)
-  {
-    return x.hypot(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> ilogb(realbuiltinvec<real_t, size> x)
-  {
-    return x.ilogb();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isfinite(realbuiltinvec<real_t, size> x)
-  {
-    return x.isfinite();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isinf(realbuiltinvec<real_t, size> x)
-  {
-    return x.isinf();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isnan(realbuiltinvec<real_t, size> x)
-  {
-    return x.isnan();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isnormal(realbuiltinvec<real_t, size> x)
-  {
-    return x.isnormal();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size>
-  ldexp(realbuiltinvec<real_t, size> x,
-        typename intbuiltinvec<real_t, size>::int_t n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size> ldexp(realbuiltinvec<real_t, size> x,
-                                     intbuiltinvec<real_t, size> n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> log(realbuiltinvec<real_t, size> x)
-  {
-    return x.log();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> log10(realbuiltinvec<real_t, size> x)
-  {
-    return x.log10();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> log1p(realbuiltinvec<real_t, size> x)
-  {
-    return x.log1p();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> log2(realbuiltinvec<real_t, size> x)
-  {
-    return x.log2();
-  }
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> lrint(realbuiltinvec<real_t, size> x)
-  {
-    return x.lrint();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> mad(realbuiltinvec<real_t, size> x,
-                                          realbuiltinvec<real_t, size> y,
-                                          realbuiltinvec<real_t, size> z)
-  {
-    return x.mad(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> nextafter(realbuiltinvec<real_t, size> x,
-                                                realbuiltinvec<real_t, size> y)
-  {
-    return x.nextafter(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> pow(realbuiltinvec<real_t, size> x,
-                                          realbuiltinvec<real_t, size> y)
-  {
-    return x.pow(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> rcp(realbuiltinvec<real_t, size> x)
-  {
-    return x.rcp();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> remainder(realbuiltinvec<real_t, size> x,
-                                                realbuiltinvec<real_t, size> y)
-  {
-    return x.remainder(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> rint(realbuiltinvec<real_t, size> x)
-  {
-    return x.rint();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> round(realbuiltinvec<real_t, size> x)
-  {
-    return x.round();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> rsqrt(realbuiltinvec<real_t, size> x)
-  {
-    return x.rsqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> signbit(realbuiltinvec<real_t, size> x)
-  {
-    return x.signbit();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> sin(realbuiltinvec<real_t, size> x)
-  {
-    return x.sin();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> sinh(realbuiltinvec<real_t, size> x)
-  {
-    return x.sinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> sqrt(realbuiltinvec<real_t, size> x)
-  {
-    return x.sqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> tan(realbuiltinvec<real_t, size> x)
-  {
-    return x.tan();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> tanh(realbuiltinvec<real_t, size> x)
-  {
-    return x.tanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> trunc(realbuiltinvec<real_t, size> x)
-  {
-    return x.trunc();
-  }
-  
-  
-  
-#ifndef VML_NO_IOSTREAM
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           boolbuiltinvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           intbuiltinvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           realbuiltinvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
+  realvec_t log() const { return map(builtin_log); }
+  realvec_t log10() const { return map(builtin_log10); }
+  realvec_t log1p() const { return map(builtin_log1p); }
+  realvec_t log2() const { return map(builtin_log2); }
+  intvec_t lrint() const {
+    if (sizeof(int_t) <= sizeof(long)) {
+      return map(builtin_lrint);
+    } else if (sizeof(int_t) <= sizeof(long long)) {
+      return map(builtin_llrint);
+    }
+    __builtin_unreachable();
+  }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return map(builtin_nextafter, y); }
+  realvec_t pow(realvec_t y) const { return map(builtin_pow, y); }
+  realvec_t rcp() const { return RV(1.0) / *this; }
+  realvec_t remainder(realvec_t y) const { return map(builtin_remainder, y); }
+  realvec_t rint() const { return map(builtin_rint); }
+  realvec_t round() const { return map(builtin_round); }
+  realvec_t rsqrt() const { return RV(1.0) / sqrt(); }
+  boolvec_t signbit() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_signbit((*this)[d]) != 0);
     }
-    os << "]";
-    return os;
+    return res;
   }
+  realvec_t sin() const { return map(builtin_sin); }
+  realvec_t sinh() const { return map(builtin_sinh); }
+  realvec_t sqrt() const { return map(builtin_sqrt); }
+  realvec_t tan() const { return map(builtin_tan); }
+  realvec_t tanh() const { return map(builtin_tanh); }
+  realvec_t trunc() const { return map(builtin_trunc); }
+};
+
+// boolbuiltinvec definitions
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::intvec_t
+boolbuiltinvec<T, N>::as_int() const {
+  intvec_t res;
+  std::memcpy(&res.v, &v, sizeof res.v);
+  return res;
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::intvec_t
+boolbuiltinvec<T, N>::convert_int() const {
+  return -as_int();
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::boolvec_t
+boolbuiltinvec<T, N>::ifthen(boolvec_t x, boolvec_t y) const {
+  // return v ? x.v : y.v;
+  boolvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  return res;
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::intvec_t
+boolbuiltinvec<T, N>::ifthen(intvec_t x, intvec_t y) const {
+  // return v ? x.v : y.v;
+  intvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  return res;
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::realvec_t
+boolbuiltinvec<T, N>::ifthen(realvec_t x, realvec_t y) const {
+  // return v ? x.v : y.v;
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  return res;
+}
+
+// intbuiltinvec definitions
+
+template <typename T, int N>
+inline typename intbuiltinvec<T, N>::realvec_t
+intbuiltinvec<T, N>::as_float() const {
+  realvec_t res;
+  std::memcpy(&res.v, &v, sizeof res.v);
+  return res;
+}
+
+template <typename T, int N>
+inline typename intbuiltinvec<T, N>::realvec_t
+intbuiltinvec<T, N>::convert_float() const {
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.set_elt(d, real_t((*this)[d]));
+  return res;
+}
+
+// Wrappers
+
+// boolbuiltinvec wrappers
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> as_int(boolbuiltinvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> convert_int(boolbuiltinvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline bool all(boolbuiltinvec<real_t, size> x) {
+  return x.all();
+}
+
+template <typename real_t, int size>
+inline bool any(boolbuiltinvec<real_t, size> x) {
+  return x.any();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
+                                           boolbuiltinvec<real_t, size> x,
+                                           boolbuiltinvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
+                                          intbuiltinvec<real_t, size> x,
+                                          intbuiltinvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
+                                           realbuiltinvec<real_t, size> x,
+                                           realbuiltinvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+// intbuiltinvec wrappers
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> abs(intbuiltinvec<real_t, size> x) {
+  return x.abs();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> as_bool(intbuiltinvec<real_t, size> x) {
+  return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> as_float(intbuiltinvec<real_t, size> x) {
+  return x.as_float();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> bitifthen(intbuiltinvec<real_t, size> x,
+                                             intbuiltinvec<real_t, size> y,
+                                             intbuiltinvec<real_t, size> z) {
+  return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> clz(intbuiltinvec<real_t, size> x) {
+  return x.clz();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size>
+convert_bool(intbuiltinvec<real_t, size> x) {
+  return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+convert_float(intbuiltinvec<real_t, size> x) {
+  return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isignbit(intbuiltinvec<real_t, size> x) {
+  return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size>
+lsr(intbuiltinvec<real_t, size> x,
+    typename intbuiltinvec<real_t, size>::int_t n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x,
+                                       intbuiltinvec<real_t, size> n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> max(intbuiltinvec<real_t, size> x,
+                                       intbuiltinvec<real_t, size> y) {
+  return x.max(y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> min(intbuiltinvec<real_t, size> x,
+                                       intbuiltinvec<real_t, size> y) {
+  return x.min(y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> popcount(intbuiltinvec<real_t, size> x) {
+  return x.popcount();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size>
+rotate(intbuiltinvec<real_t, size> x,
+       typename intbuiltinvec<real_t, size>::int_t n) {
+  return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> rotate(intbuiltinvec<real_t, size> x,
+                                          intbuiltinvec<real_t, size> n) {
+  return x.rotate(n);
+}
+
+// realbuiltinvec wrappers
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+loada(real_t const *p, realbuiltinvec<real_t, size> x,
+      typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+loadu(real_t const *p, realbuiltinvec<real_t, size> x,
+      typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+loadu(real_t const *p, size_t ioff, realbuiltinvec<real_t, size> x,
+      typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realbuiltinvec<real_t, size> x, real_t *p) {
+  return x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p) {
+  return x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p, size_t ioff) {
+  return x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realbuiltinvec<real_t, size> x, real_t *p,
+                   typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p,
+                   typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p, size_t ioff,
+                   typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> as_int(realbuiltinvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> convert_int(realbuiltinvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline real_t maxval(realbuiltinvec<real_t, size> x) {
+  return x.maxval();
+}
+
+template <typename real_t, int size>
+inline real_t minval(realbuiltinvec<real_t, size> x) {
+  return x.minval();
+}
+
+template <typename real_t, int size>
+inline real_t prod(realbuiltinvec<real_t, size> x) {
+  return x.prod();
+}
+
+template <typename real_t, int size>
+inline real_t sum(realbuiltinvec<real_t, size> x) {
+  return x.sum();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> acos(realbuiltinvec<real_t, size> x) {
+  return x.acos();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> acosh(realbuiltinvec<real_t, size> x) {
+  return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> asin(realbuiltinvec<real_t, size> x) {
+  return x.asin();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> asinh(realbuiltinvec<real_t, size> x) {
+  return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> atan(realbuiltinvec<real_t, size> x) {
+  return x.atan();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> atan2(realbuiltinvec<real_t, size> x,
+                                          realbuiltinvec<real_t, size> y) {
+  return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> atanh(realbuiltinvec<real_t, size> x) {
+  return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> cbrt(realbuiltinvec<real_t, size> x) {
+  return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> ceil(realbuiltinvec<real_t, size> x) {
+  return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> copysign(realbuiltinvec<real_t, size> x,
+                                             realbuiltinvec<real_t, size> y) {
+  return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> cos(realbuiltinvec<real_t, size> x) {
+  return x.cos();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> cosh(realbuiltinvec<real_t, size> x) {
+  return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> exp(realbuiltinvec<real_t, size> x) {
+  return x.exp();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> exp10(realbuiltinvec<real_t, size> x) {
+  return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> exp2(realbuiltinvec<real_t, size> x) {
+  return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> expm1(realbuiltinvec<real_t, size> x) {
+  return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fabs(realbuiltinvec<real_t, size> x) {
+  return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> floor(realbuiltinvec<real_t, size> x) {
+  return x.floor();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fdim(realbuiltinvec<real_t, size> x,
+                                         realbuiltinvec<real_t, size> y) {
+  return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fma(realbuiltinvec<real_t, size> x,
+                                        realbuiltinvec<real_t, size> y,
+                                        realbuiltinvec<real_t, size> z) {
+  return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fmax(realbuiltinvec<real_t, size> x,
+                                         realbuiltinvec<real_t, size> y) {
+  return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fmin(realbuiltinvec<real_t, size> x,
+                                         realbuiltinvec<real_t, size> y) {
+  return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fmod(realbuiltinvec<real_t, size> x,
+                                         realbuiltinvec<real_t, size> y) {
+  return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> frexp(realbuiltinvec<real_t, size> x,
+                                          intbuiltinvec<real_t, size> *r) {
+  return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> hypot(realbuiltinvec<real_t, size> x,
+                                          realbuiltinvec<real_t, size> y) {
+  return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> ilogb(realbuiltinvec<real_t, size> x) {
+  return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isfinite(realbuiltinvec<real_t, size> x) {
+  return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isinf(realbuiltinvec<real_t, size> x) {
+  return x.isinf();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isnan(realbuiltinvec<real_t, size> x) {
+  return x.isnan();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isnormal(realbuiltinvec<real_t, size> x) {
+  return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+ldexp(realbuiltinvec<real_t, size> x,
+      typename intbuiltinvec<real_t, size>::int_t n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> ldexp(realbuiltinvec<real_t, size> x,
+                                          intbuiltinvec<real_t, size> n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log(realbuiltinvec<real_t, size> x) {
+  return x.log();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log10(realbuiltinvec<real_t, size> x) {
+  return x.log10();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log1p(realbuiltinvec<real_t, size> x) {
+  return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log2(realbuiltinvec<real_t, size> x) {
+  return x.log2();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> lrint(realbuiltinvec<real_t, size> x) {
+  return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> mad(realbuiltinvec<real_t, size> x,
+                                        realbuiltinvec<real_t, size> y,
+                                        realbuiltinvec<real_t, size> z) {
+  return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> nextafter(realbuiltinvec<real_t, size> x,
+                                              realbuiltinvec<real_t, size> y) {
+  return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> pow(realbuiltinvec<real_t, size> x,
+                                        realbuiltinvec<real_t, size> y) {
+  return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> rcp(realbuiltinvec<real_t, size> x) {
+  return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> remainder(realbuiltinvec<real_t, size> x,
+                                              realbuiltinvec<real_t, size> y) {
+  return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> rint(realbuiltinvec<real_t, size> x) {
+  return x.rint();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> round(realbuiltinvec<real_t, size> x) {
+  return x.round();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> rsqrt(realbuiltinvec<real_t, size> x) {
+  return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> signbit(realbuiltinvec<real_t, size> x) {
+  return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> sin(realbuiltinvec<real_t, size> x) {
+  return x.sin();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> sinh(realbuiltinvec<real_t, size> x) {
+  return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> sqrt(realbuiltinvec<real_t, size> x) {
+  return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> tan(realbuiltinvec<real_t, size> x) {
+  return x.tan();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> tanh(realbuiltinvec<real_t, size> x) {
+  return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> trunc(realbuiltinvec<real_t, size> x) {
+  return x.trunc();
+}
+
+#ifndef VML_NO_IOSTREAM
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         boolbuiltinvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         intbuiltinvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         realbuiltinvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
 #endif
-  
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_BUILTIN_H
+#endif // #ifndef VEC_BUILTIN_H
diff --git a/lib/kernel/vecmathlib/vec_mask.h b/lib/kernel/vecmathlib/vec_mask.h
index 6f8c996..053e43a 100644
--- a/lib/kernel/vecmathlib/vec_mask.h
+++ b/lib/kernel/vecmathlib/vec_mask.h
@@ -5,74 +5,67 @@
 
 #include <cstdlib>
 
+namespace vecmathlib {
 
+template <typename realvec_t> class mask_t {
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  class mask_t {
-    
-    typedef typename realvec_t::boolvec_t boolvec_t;
-    typedef typename realvec_t::intvec_t intvec_t;
-    static const int size = realvec_t::size;
-    
-  public:
-    std::ptrdiff_t imin, imax;
-    std::ptrdiff_t i;
-    boolvec_t m;
-    bool all_m;
-    
-  public:
-    
-    // Construct a mask from a boolvec
-    mask_t(boolvec_t m_): m(m_), all_m(all(m)) {}
-    
-    // Construct a mask for a particular location i
-    mask_t(std::ptrdiff_t i_,
-           std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff):
-      imin(imin_), imax(imax_), i(i_)
-    {
-      all_m = i-imin >= 0 && i+size-1-imax < 0;
-      if (__builtin_expect(all_m, true)) {
-        m = true;
-      } else {
-        m = (! isignbit(intvec_t(i          - imin) + intvec_t::iota()) &&
-               isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota()));
-      }
+  typedef typename realvec_t::boolvec_t boolvec_t;
+  typedef typename realvec_t::intvec_t intvec_t;
+  static const int size = realvec_t::size;
+
+public:
+  std::ptrdiff_t imin, imax;
+  std::ptrdiff_t i;
+  boolvec_t m;
+  bool all_m;
+
+public:
+  // Construct a mask from a boolvec; loop fields are zeroed, not left unset
+  mask_t(boolvec_t m_) : imin(0), imax(0), i(0), m(m_), all_m(all(m)) {}
+
+  // Construct a mask for a particular location i
+  mask_t(std::ptrdiff_t i_, std::ptrdiff_t imin_, std::ptrdiff_t imax_,
+         std::ptrdiff_t ioff)
+      : imin(imin_), imax(imax_), i(i_) {
+    all_m = i - imin >= 0 && i + size - 1 - imax < 0;
+    if (__builtin_expect(all_m, true)) {
+      m = true;
+    } else {
+      m = (!isignbit(intvec_t(i - imin) + intvec_t::iota()) &&
+           isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota()));
     }
-    
-    // Construct a mask for a loop starting at imin, aligned down
-    mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff):
-      imin(imin_), imax(imax_), i(imin_ - (ioff + imin_) % size)
-    {
-      all_m = i-imin >= 0 && i+size-1-imax < 0;
-      if (__builtin_expect(all_m, true)) {
-        m = true;
-      } else {
-        m = (! isignbit(intvec_t(i          - imin) + intvec_t::iota()) &&
-               isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota()));
-      }
+  }
+
+  // Construct a mask for a loop starting at imin, aligned down
+  mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff)
+      : imin(imin_), imax(imax_), i(imin_ - (ioff + imin_) % size) {
+    all_m = i - imin >= 0 && i + size - 1 - imax < 0;
+    if (__builtin_expect(all_m, true)) {
+      m = true;
+    } else {
+      m = (!isignbit(intvec_t(i - imin) + intvec_t::iota()) &&
+           isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota()));
     }
-    
-    // Get current index
-    std::ptrdiff_t index() const { return i; }
-    
-    // Looping condition
-    operator bool() const { return i<imax; }
-    
-    // Loop stepper
-    void operator++()
-    {
-      i += size;
-      all_m = i + size-1 - imax < 0;
-      if (__builtin_expect(all_m, true)) {
-        m = true;
-      } else {
-        m = isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota());
-      }
+  }
+
+  // Get current index (the position i this mask was built for or stepped to)
+  std::ptrdiff_t index() const { return i; }
+
+  // Looping condition: true while the current index is still below imax
+  operator bool() const { return i < imax; }
+
+  // Loop stepper
+  void operator++() {
+    i += size;
+    all_m = i + size - 1 - imax < 0;
+    if (__builtin_expect(all_m, true)) {
+      m = true;
+    } else {
+      m = isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota());
     }
-  };
-  
+  }
+};
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_MASK_H
+#endif // #ifndef VEC_MASK_H
diff --git a/lib/kernel/vecmathlib/vec_mic_double8.h b/lib/kernel/vecmathlib/vec_mic_double8.h
index 68dd5aa..ef22088 100644
--- a/lib/kernel/vecmathlib/vec_mic_double8.h
+++ b/lib/kernel/vecmathlib/vec_mic_double8.h
@@ -12,697 +12,585 @@
 // MIC intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_8
-  template<> struct boolvec<double,8>;
-  template<> struct intvec<double,8>;
-  template<> struct realvec<double,8>;
-  
-  
-  
-  template<>
-  struct boolvec<double,8>: floatprops<double>
-  {
-    static const int size = 8;
-    typedef bool scalar_t;
-    typedef __mask8 bvector_t;
-    static const int alignment = sizeof(bvector_t);
-    
-    // static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-    //               "vector size is wrong");
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(const boolvec& x): v(x.v) {}
-    // boolvec& operator=(const boolvec& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(- bvector_t(a)) {}
-    boolvec(const bool* as):
-      v((bvector_t(as[0]) << 0) |
-        (bvector_t(as[1]) << 1) |
-        (bvector_t(as[2]) << 2) |
-        (bvector_t(as[3]) << 3) |
-        (bvector_t(as[4]) << 4) |
-        (bvector_t(as[5]) << 5) |
-        (bvector_t(as[6]) << 6) |
-        (bvector_t(as[7]) << 7))
-    {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return (v >> n) & 1;
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      v &= ~ (bvector_t(1) << n);
-      v |= bvector_t(a) << n;
-      return *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return _mm512_knot(v); }
-    
-    boolvec operator&&(boolvec x) const { return _mm512_kand(v, x.v); }
-    boolvec operator||(boolvec x) const { return _mm512_kor(v, x.v); }
-    boolvec operator==(boolvec x) const { return _mm512_kxnor(v, x.v); }
-    boolvec operator!=(boolvec x) const { return _mm512_kxor(v, x.v); }
-    
-    bool all() const { return _mm512_kortestc(v, v); }
-    bool any() const { return ! bool(_mm512_kortestz(v, v)); }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,8>: floatprops<double>
-  {
-    static const int size = 8;
-    typedef int_t scalar_t;
-    typedef __m512i ivector_t;
-    static const int alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(const intvec& x): v(x.v) {}
-    // intvec& operator=(const intvec& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm512_set1_epi64(a)) {}
-    intvec(const int_t* as)
-    {
-      v = _mm512_undefined_epi32();
-      // v = _mm512_loadunpacklo_epi32(v, as);
-      // v = _mm512_loadunpackhi_epi32(v, as+8);
-      for (int n=0; n<size; ++n) set_elt(n, as[n]);
-    }
-    static intvec iota()
-    {
-      intvec r;
-      for (int n=0; n<size; ++n) r.set_elt(n, n);
-      return r;
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-  private:
-    static __mmask8 mask16tomask8(__mmask16 m16)
-    {
-      // combine 01
-      m16 = ((m16 >> 1) | m16) & 0b0011001100110011;
-      // combine 0123
-      m16 = ((m16 >> 2) | m16) & 0b0000111100001111;
-      // combine 01234567
-      m16 = ((m16 >> 4) | m16) & 0b0000000011111111;
-      return m16;
-    }
-  public:
-    boolvec_t as_bool() const { return convert_bool(); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      __mmask16 r16 = _mm512_test_epi32_mask(v, v);
-      return mask16tomask8(r16);
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    intvec operator+(intvec x) const { return _mm512_add_epi64(v, x.v); }
-    intvec operator-(intvec x) const { return _mm512_sub_epi64(v, x.v); }
-    
-    intvec& operator+=(const intvec& x) { return *this=*this+x; }
-    intvec& operator-=(const intvec& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    intvec operator&(intvec x) const { return _mm512_and_epi64(v, x.v); }
-    intvec operator|(intvec x) const { return _mm512_or_epi64(v, x.v); }
-    intvec operator^(intvec x) const { return _mm512_xor_epi64(v, x.v); }
-    
-    intvec& operator&=(const intvec& x) { return *this=*this&x; }
-    intvec& operator|=(const intvec& x) { return *this=*this|x; }
-    intvec& operator^=(const intvec& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
-      if (n < 32) {
-        __m512i vlo = _mm512_srli_epi32(v, n);
-        __m512i vhi = _mm512_slli_epi32(v, 32-n);
-        vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
-        return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo);
-      } else {
-        __m512i vlo = _mm512_srli_epi32(v, n-32);
-        __m512i vhi = _mm512_setzero_epi32();
-        return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo);
-      }
-    }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const
-    {
-      if (n < 32) {
-        __mm512i vlo = _mm512_srai_epi32(v, n);
-        __mm512i vlo0 = _mm512_srli_epi32(v, n);
-        __mm512i vhi = _mm512_slli_epi32(v, 32-n);
-        vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
-        return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo0);
-      } else {
-        __m512i vlo = _mm512_srai_epi32(v, n-32);
-        __m512i vhi = _mm512_srai_epi32(v, 31);
-        return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo);
-      }
-    }
-    intvec operator<<(int_t n) const
-    {
-      if (n < 32) {
-        __m512i vlo = _mm512_srli_epi32(v, n);
-        __m512i vhi = _mm512_slli_epi32(v, 32-n);
-        vlo = _mm512_swizzle_epi32(vlo, _MM_SWIZ_REG_CDAB);
-        return _mm512_mask_or_epi32(vhi, 0xb1010101010101010, vhi, vlo);
-      } else {
-        __m512i vlo = _mm512_setzero_epi32();
-        __m512i vhi = _mm512_slli_epi32(v, n-32);
-        return _mm512_mask_swizzle_epi32(vhi, 0xb1010101010101010, vlo);
-      }
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      // TODO: improve this
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      // TODO: improve this
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      // TODO: improve this
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const
-    {
-      // Return 8*sizeof(TYPE) when the input is 0
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        // __lzcnt64
-        r.set_elt(i, __builtin_clzll((*this)[i]));
-      }
-      return r;
-    }
-    intvec_t popcount() const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        // _mm_popcnt_u64
-        r.set_elt(i, __builtin_popcountll((*this)[i]));
-      }
-      return r;
-    }
-    
-    
-    
-    boolvec_t operator==(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_EQ));
-    }
-    boolvec_t operator!=(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_NE));
-    }
-    boolvec_t operator<(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LT));
-    }
-    boolvec_t operator<=(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LE));
-    }
-    boolvec_t operator>(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GT));
-    }
-    boolvec_t operator>=(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GE));
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const;
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,8>: floatprops<double>
-  {
-    static const int size = 8;
-    typedef real_t scalar_t;
-    typedef __m512d vector_t;
-    static const int alignment = sizeof(vector_t);
-    
-    static const char* name() { return "<MIC:8*double>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(const realvec& x): v(x.v) {}
-    // realvec& operator=(const realvec& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm512_set1_pd(a)) {}
-    realvec(const real_t* as)
-    {
-      v = _mm512_undefined_pd();
-      // v = _mm512_loadunpacklo_pd(v, as);
-      // v = _mm512_loadunpackhi_pd(v, as+8);
-      for (int n=0; n<size; ++n) set_elt(n, as[n]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(const real_t* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm512_load_pd(p);
-    }
-    static realvec_t loadu(const real_t* p)
-    {
-      realvec_t r(_mm512_undefined_pd());
-      r.v = _mm512_loadunpacklo_pd(r.v, p);
-      r.v = _mm512_loadunpackhi_pd(r.v, p+8);
-      return r.v;
-    }
-    static realvec_t loadu(const real_t* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(const real_t* p, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm512_mask_load_pd(v, m.m.v, p);
+template <> struct boolvec<double, 8>;
+template <> struct intvec<double, 8>;
+template <> struct realvec<double, 8>;
+
+template <> struct boolvec<double, 8> : floatprops<double> {
+  static const int size = 8;
+  typedef bool scalar_t;
+  typedef __mmask8 bvector_t; // one bit per lane; same type intvec uses
+  static const int alignment = sizeof(bvector_t);
+
+  // static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+  //               "vector size is wrong");
+
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(const boolvec& x): v(x.v) {}
+  // boolvec& operator=(const boolvec& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(-bvector_t(a)) {} // broadcast a to every lane
+  boolvec(const bool *as)               // pack as[0..7] into bits 0..7
+      : v((bvector_t(as[0]) << 0) | (bvector_t(as[1]) << 1) |
+          (bvector_t(as[2]) << 2) | (bvector_t(as[3]) << 3) |
+          (bvector_t(as[4]) << 4) | (bvector_t(as[5]) << 5) |
+          (bvector_t(as[6]) << 6) | (bvector_t(as[7]) << 7)) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return (v >> n) & 1; } // bit n is lane n
+  boolvec &set_elt(int n, bool a) {
+    v &= ~(bvector_t(1) << n); // clear lane n, then write the new value
+    v |= bvector_t(a) << n;
+    return *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return _mm512_knot(v); }
+
+  boolvec operator&&(boolvec x) const { return _mm512_kand(v, x.v); }
+  boolvec operator||(boolvec x) const { return _mm512_kor(v, x.v); }
+  boolvec operator==(boolvec x) const { return _mm512_kxnor(v, x.v); }
+  boolvec operator!=(boolvec x) const { return _mm512_kxor(v, x.v); }
+
+  bool all() const { return v == bvector_t(0xff); } // 8 lane bits, not 16
+  bool any() const { return !bool(_mm512_kortestz(v, v)); }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 8> : floatprops<double> {
+  static const int size = 8;
+  typedef int_t scalar_t;
+  typedef __m512i ivector_t;
+  static const int alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(const intvec& x): v(x.v) {}
+  // intvec& operator=(const intvec& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm512_set1_epi64(a)) {}
+  intvec(const int_t *as) {
+    v = _mm512_undefined_epi32();
+    // v = _mm512_loadunpacklo_epi32(v, as);
+    // v = _mm512_loadunpackhi_epi32(v, as+8);
+    for (int n = 0; n < size; ++n)
+      set_elt(n, as[n]);
+  }
+  static intvec iota() {
+    intvec r;
+    for (int n = 0; n < size; ++n)
+      r.set_elt(n, n);
+    return r;
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+private:
+  static __mmask8 mask16tomask8(__mmask16 m16) {
+    // combine 01
+    m16 = ((m16 >> 1) | m16) & 0b0011001100110011;
+    // combine 0123
+    m16 = ((m16 >> 2) | m16) & 0b0000111100001111;
+    // combine 01234567
+    m16 = ((m16 >> 4) | m16) & 0b0000000011111111;
+    return m16;
+  }
+
+public:
+  boolvec_t as_bool() const { return convert_bool(); }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    __mmask16 r16 = _mm512_test_epi32_mask(v, v);
+    return mask16tomask8(r16);
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return IV(I(0)) - *this; }
+  intvec operator+(intvec x) const { return _mm512_add_epi64(v, x.v); }
+  intvec operator-(intvec x) const { return _mm512_sub_epi64(v, x.v); }
+
+  intvec &operator+=(const intvec &x) { return *this = *this + x; }
+  intvec &operator-=(const intvec &x) { return *this = *this - x; }
+
+  intvec operator~() const { return IV(~U(0)) ^ *this; }
+  intvec operator&(intvec x) const { return _mm512_and_epi64(v, x.v); }
+  intvec operator|(intvec x) const { return _mm512_or_epi64(v, x.v); }
+  intvec operator^(intvec x) const { return _mm512_xor_epi64(v, x.v); }
+
+  intvec &operator&=(const intvec &x) { return *this = *this & x; }
+  intvec &operator|=(const intvec &x) { return *this = *this | x; }
+  intvec &operator^=(const intvec &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec lsr(int_t n) const { // logical (zero-fill) shift of 64-bit lanes
+    if (n < 32) { // 0x5555 selects the low 32-bit half of every lane
+      __m512i vlo = _mm512_srli_epi32(v, n);
+      __m512i vhi = _mm512_slli_epi32(v, 32 - n); // bits leaving the high half
+      vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB); // move them down
+      return _mm512_mask_or_epi32(vlo, 0x5555, vhi, vlo);
+    } else { // the low half comes only from the high half; high half is 0
+      __m512i vlo = _mm512_srli_epi32(v, n - 32);
+      __m512i vhi = _mm512_setzero_epi32();
+      return _mm512_mask_swizzle_epi32(vhi, 0x5555, vlo, _MM_SWIZ_REG_CDAB);
     }
-    realvec_t loadu(const real_t* p, const mask_t& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
+  }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { // arithmetic shift of 64-bit lanes
+    if (n < 32) { // 0x5555 selects the low 32-bit half of every lane
+      __m512i vlo = _mm512_srai_epi32(v, n);  // high half keeps its sign
+      __m512i vlo0 = _mm512_srli_epi32(v, n); // low half is zero-filled
+      __m512i vhi = _mm512_slli_epi32(v, 32 - n);
+      vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
+      return _mm512_mask_or_epi32(vlo, 0x5555, vhi, vlo0);
+    } else { // low half = shifted high half; high half = sign fill
+      __m512i vlo = _mm512_srai_epi32(v, n - 32);
+      __m512i vhi = _mm512_srai_epi32(v, 31);
+      return _mm512_mask_swizzle_epi32(vhi, 0x5555, vlo, _MM_SWIZ_REG_CDAB);
     }
-    realvec_t loadu(const real_t* p, std::ptrdiff_t ioff, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
+  }
+  intvec operator<<(int_t n) const { // left shift of each 64-bit lane
+    if (n < 32) { // 0xaaaa selects the high 32-bit half of every lane
+      __m512i vlo = _mm512_srli_epi32(v, 32 - n); // bits leaving the low half
+      __m512i vhi = _mm512_slli_epi32(v, n);
+      vlo = _mm512_swizzle_epi32(vlo, _MM_SWIZ_REG_CDAB); // move them up
+      return _mm512_mask_or_epi32(vhi, 0xaaaa, vhi, vlo);
+    } else { // high half = shifted low half; low half is 0
+      __m512i vlo = _mm512_setzero_epi32();
+      __m512i vhi = _mm512_slli_epi32(v, n - 32);
+      return _mm512_mask_swizzle_epi32(vlo, 0xaaaa, vhi, _MM_SWIZ_REG_CDAB);
     }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm512_store_pd(p, v);
+  }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec lsr(intvec n) const {
+    // TODO: improve this
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    void storeu(real_t* p) const
-    {
-      _mm512_packstorelo_pd(p, v);
-      _mm512_packstorehi_pd(p+8, v);
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const {
+    // TODO: improve this
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
+    return r;
+  }
+  intvec operator<<(intvec n) const {
+    // TODO: improve this
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    void storea(real_t* p, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm512_mask_store_pd(p, m.m.v, v);
+    return r;
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const {
+    // Return 8*sizeof(TYPE) when the input is 0
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      // __lzcnt64
+      r.set_elt(i, __builtin_clzll((*this)[i]));
     }
-    void storeu(real_t* p, const mask_t& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        for (int n=0; n<size; ++n) {
-          if (m.m[n]) p[n] = (*this)[n];
-        }
-      }
+    return r;
+  }
+  intvec_t popcount() const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      // _mm_popcnt_u64
+      r.set_elt(i, __builtin_popcountll((*this)[i]));
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+    return r;
+  }
+
+  boolvec_t operator==(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_EQ));
+  }
+  boolvec_t operator!=(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_NE));
+  }
+  boolvec_t operator<(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LT));
+  }
+  boolvec_t operator<=(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LE));
+  }
+  boolvec_t operator>(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GT));
+  }
+  boolvec_t operator>=(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GE));
+  }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const;
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 8> : floatprops<double> {
+  static const int size = 8;
+  typedef real_t scalar_t;
+  typedef __m512d vector_t;
+  static const int alignment = sizeof(vector_t);
+
+  static const char *name() { return "<MIC:8*double>"; }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(const realvec& x): v(x.v) {}
+  // realvec& operator=(const realvec& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm512_set1_pd(a)) {}
+  realvec(const real_t *as) {
+    v = _mm512_undefined_pd();
+    // v = _mm512_loadunpacklo_pd(v, as);
+    // v = _mm512_loadunpackhi_pd(v, as+8);
+    for (int n = 0; n < size; ++n)
+      set_elt(n, as[n]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(const real_t *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm512_load_pd(p);
+  }
+  static realvec_t loadu(const real_t *p) {
+    realvec_t r(_mm512_undefined_pd());
+    r.v = _mm512_loadunpacklo_pd(r.v, p);
+    r.v = _mm512_loadunpackhi_pd(r.v, p + 8);
+    return r.v;
+  }
+  static realvec_t loadu(const real_t *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(const real_t *p, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm512_mask_load_pd(v, m.m.v, p);
+  }
+  realvec_t loadu(const real_t *p, const mask_t &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    
-    
-    
-    intvec_t as_int() const { return _mm512_castpd_si512(v); }
-    intvec_t convert_int() const
-    {
-      intvec_t r(_mm512_undefined_epi32());
-      for (int n=0; n<size; ++n) {
-        r.set_elt(n, floatprops::convert_int((*this)[n]));
+  }
+  realvec_t loadu(const real_t *p, std::ptrdiff_t ioff, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm512_store_pd(p, v);
+  }
+  void storeu(real_t *p) const {
+    _mm512_packstorelo_pd(p, v);
+    _mm512_packstorehi_pd(p + 8, v);
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm512_mask_store_pd(p, m.m.v, v);
+  }
+  void storeu(real_t *p, const mask_t &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      for (int n = 0; n < size; ++n) {
+        if (m.m[n])
+          p[n] = (*this)[n];
       }
-      return r;
-    }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return RV(0.0) - *this; }
-    
-    realvec operator+(realvec x) const { return _mm512_add_pd(v, x.v); }
-    realvec operator-(realvec x) const { return _mm512_sub_pd(v, x.v); }
-    realvec operator*(realvec x) const { return _mm512_mul_pd(v, x.v); }
-    realvec operator/(realvec x) const { return _mm512_div_pd(v, x.v); }
-    
-    realvec& operator+=(const realvec& x) { return *this=*this+x; }
-    realvec& operator-=(const realvec& x) { return *this=*this-x; }
-    realvec& operator*=(const realvec& x) { return *this=*this*x; }
-    realvec& operator/=(const realvec& x) { return *this=*this/x; }
-    
-    real_t maxval() const { returm _mm512_reduce_gmax_pd(v); }
-    real_t minval() const { returm _mm512_reduce_gmin_pd(v); }
-    real_t prod() const { returm _mm512_reduce_mul_pd(v); }
-    real_t sum() const { returm _mm512_reduce_add_pd(v); }
-    
-    
-    
-    boolvec_t operator==(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_EQ_OQ);
-    }
-    boolvec_t operator!=(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
-    }
-    boolvec_t operator<(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_LT_OQ);
-    }
-    boolvec_t operator<=(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_LE_OQ);
-    }
-    boolvec_t operator>(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_GT_OQ);
     }
-    boolvec_t operator>=(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_GE_OQ);
-    }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return _mm512_ceil_pd(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return _mm512_floor_pd(v); }
-    realvec fma(realvec y, realvec z) const
-    {
-      return _mm512_fmadd_pd(v, x.v, y.v);
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return _mm512_castpd_si512(v); }
+  intvec_t convert_int() const {
+    intvec_t r(_mm512_undefined_epi32());
+    for (int n = 0; n < size; ++n) {
+      r.set_elt(n, floatprops::convert_int((*this)[n]));
     }
-    realvec fmax(realvec y) const { return _mm512_gmax_pd(v, y.v); }
-    realvec fmin(realvec y) const { return _mm512_gmin_pd(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+    return r;
+  }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return RV(0.0) - *this; }
+
+  realvec operator+(realvec x) const { return _mm512_add_pd(v, x.v); }
+  realvec operator-(realvec x) const { return _mm512_sub_pd(v, x.v); }
+  realvec operator*(realvec x) const { return _mm512_mul_pd(v, x.v); }
+  realvec operator/(realvec x) const { return _mm512_div_pd(v, x.v); }
+
+  realvec &operator+=(const realvec &x) { return *this = *this + x; }
+  realvec &operator-=(const realvec &x) { return *this = *this - x; }
+  realvec &operator*=(const realvec &x) { return *this = *this * x; }
+  realvec &operator/=(const realvec &x) { return *this = *this / x; }
+
+  real_t maxval() const { return _mm512_reduce_gmax_pd(v); } // maximum over all lanes of v
+  real_t minval() const { return _mm512_reduce_gmin_pd(v); } // minimum over all lanes of v
+  real_t prod() const { return _mm512_reduce_mul_pd(v); } // product of all lanes of v
+  real_t sum() const { return _mm512_reduce_add_pd(v); } // sum of all lanes of v
+
+  boolvec_t operator==(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_EQ_OQ);
+  }
+  boolvec_t operator!=(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
+  }
+  boolvec_t operator<(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_LT_OQ);
+  }
+  boolvec_t operator<=(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_LE_OQ);
+  }
+  boolvec_t operator>(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_GT_OQ);
+  }
+  boolvec_t operator>=(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_GE_OQ);
+  }
+
+  realvec acos() const { return MF::vml_acos(*this); }
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const { return _mm512_ceil_pd(v); }
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return MF::vml_fabs(*this); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const { return _mm512_floor_pd(v); }
+  realvec fma(realvec y, realvec z) const {
+    return _mm512_fmadd_pd(v, y.v, z.v); // fused (*this) * y + z, per lane
+  }
+  realvec fmax(realvec y) const { return _mm512_gmax_pd(v, y.v); }
+  realvec fmin(realvec y) const { return _mm512_gmin_pd(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #ifdef VML_HAVE_NAN
-      return _mm512_cmp_pd(v, v, _CMP_UNORD_Q);
+    return _mm512_cmp_pd(v, v, _CMP_UNORD_Q);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return _mm512_fmadd_pd(v, x.v, y.v);
-    }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const { return _mm512_div_pd(_mm512_set1_pd(1.0), v); }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-      return _mm512_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
-    }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return as_int().signbit(); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return _mm512_sqrt_pd(v); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return _mm512_round_pd(v, _MM_FROUND_TO_ZERO); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,4> boolvec<double,4>::as_int() const
-  {
-    return _mm512_castpd_si512(v);
-  }
-  
-  inline intvec<double,4> boolvec<double,4>::convert_int() const
-  {
-    return ifthen(v, IV(I(1)), IV(I(0)));
-  }
-  
-  inline
-  boolvec<double,4> boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return (v & x.v) | (~v & y.v);
-  }
-  
-  inline
-  intvec<double,4> boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return _mm512_blend_epi64(v, y.v, x.v)
-  }
-  
-  inline
-  realvec<double,4> boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return _mm512_blend_pd(v, y.v, x.v)
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<double,4> intvec<double,4>::as_float() const
-  {
-    return _mm512_castsi512_pd(v);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::convert_float() const
-  {
-    intvec_t r(_mm512_undefined_pd());
-    for (int n=0; n<size; ++n) {
-      r.set_elt(n, floatprops::convert_float((*this)[n]));
-    }
-    return r;
   }
-  
-  inline intvec<double,8> intvec<double,8>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::bitifthen(intvec_t x,
-                                                      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline boolvec<double,8> intvec<double,8>::isignbit() const
-  {
-    return MF::vml_isignbit(*this);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return _mm512_fmadd_pd(v, y.v, z.v); // (*this) * y + z, per lane
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const { return _mm512_div_pd(_mm512_set1_pd(1.0), v); }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const { return _mm512_round_pd(v, _MM_FROUND_TO_NEAREST_INT); }
+  realvec round() const { return MF::vml_round(*this); }
+  realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return as_int().signbit(); }
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const { return _mm512_sqrt_pd(v); }
+  realvec tan() const { return MF::vml_tan(*this); }
+  realvec tanh() const { return MF::vml_tanh(*this); }
+  realvec trunc() const { return _mm512_round_pd(v, _MM_FROUND_TO_ZERO); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 4> boolvec<double, 4>::as_int() const {
+  return _mm512_castpd_si512(v);
+}
+
+inline intvec<double, 4> boolvec<double, 4>::convert_int() const {
+  return ifthen(v, IV(I(1)), IV(I(0)));
+}
+
+inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return (v & x.v) | (~v & y.v);
+}
+
+inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return _mm512_blend_epi64(v, y.v, x.v); // v is the selector; presumably x where set, else y -- TODO confirm
+}
+
+inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return _mm512_blend_pd(v, y.v, x.v); // v is the selector; presumably x where set, else y -- TODO confirm
+}
+
+// intvec definitions
+
+inline realvec<double, 4> intvec<double, 4>::as_float() const {
+  return _mm512_castsi512_pd(v);
+}
+
+inline realvec<double, 4> intvec<double, 4>::convert_float() const {
+  realvec_t r(_mm512_undefined_pd()); // real-valued result; every lane is overwritten below
+  for (int n = 0; n < size; ++n) {
+    r.set_elt(n, floatprops::convert_float((*this)[n]));
+  }
+  return r;
+}
+
+inline intvec<double, 8> intvec<double, 8>::abs() const {
+  return MF::vml_abs(*this);
+}
+
+inline intvec<double, 8> intvec<double, 8>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline boolvec<double, 8> intvec<double, 8>::isignbit() const {
+  return MF::vml_isignbit(*this);
+}
+
+inline intvec<double, 8> intvec<double, 8>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 8> intvec<double, 8>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 8> intvec<double, 8>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 8> intvec<double, 8>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_MIC_DOUBLE8_H
+#endif // #ifndef VEC_MIC_DOUBLE8_H
diff --git a/lib/kernel/vecmathlib/vec_neon_float2.h b/lib/kernel/vecmathlib/vec_neon_float2.h
index 3a21a05..6df9969 100644
--- a/lib/kernel/vecmathlib/vec_neon_float2.h
+++ b/lib/kernel/vecmathlib/vec_neon_float2.h
@@ -14,608 +14,511 @@
 // Neon intrinsics
 #include <arm_neon.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_2
-  template<> struct boolvec<float,2>;
-  template<> struct intvec<float,2>;
-  template<> struct realvec<float,2>;
-  
-  
-  
-  template<>
-  struct boolvec<float,2>: floatprops<float>
-  {
-    static int const size = 2;
-    typedef bool scalar_t;
-    typedef uint32x2_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(vdup_n_u32(from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vmvn_u32(v); }
-    
-    boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); }
-    boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); }
-    boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); }
-    boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); }
-    
-    bool all() const
-    {
-      boolvec r = vpmin_u32(v, v);
-      return r[0];
-    }
-    bool any() const
-    {
-      boolvec r = vpmax_u32(v, v);
-      return r[0];
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,2>: floatprops<float>
-  {
-    static int const size = 2;
-    typedef int_t scalar_t;
-    typedef int32x2_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vdup_n_s32(a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota()
-    {
-      return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0));
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return vreinterpret_u32_s32(v); }
-    boolvec_t convert_bool() const { return *this != IV(0); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return vneg_s32(v); }
-    
-    intvec operator+(intvec x) const { return vadd_s32(v, x.v); }
-    intvec operator-(intvec x) const { return vsub_s32(v, x.v); }
-    intvec operator*(intvec x) const { return vmul_s32(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    intvec& operator*=(intvec const& x) { return *this=*this*x; }
-    
-    
-    
-    intvec operator~() const { return vmvn_s32(v); }
-    
-    intvec operator&(intvec x) const { return vand_s32(v, x.v); }
-    intvec operator|(intvec x) const { return vorr_s32(v, x.v); }
-    intvec operator^(intvec x) const { return veor_s32(v, x.v); }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const
-    {
-      return vbsl_s32(vreinterpret_u32_s32(v), x.v, y.v);
-    }
-    
-    
-    
-    intvec_t lsr(int_t n) const { return lsr(IV(n)); }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v));
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      return vshl_s32(v, (-n).v);
-    }
-    intvec operator<<(intvec n) const
-    {
-      return vshl_s32(v, n.v);
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const { return vclz_s32(v); }
-    intvec_t popcount() const
-    {
-      return vpaddl_s16(vpaddl_s8(vcnt_s8(vreinterpret_s8_s32(v))));
-    }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const { return vceq_s32(v, x.v); }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const { return vclt_s32(v, x.v); }
-    boolvec_t operator<=(intvec const& x) const { return vcle_s32(v, x.v); }
-    boolvec_t operator>(intvec const& x) const { return vcgt_s32(v, x.v); }
-    boolvec_t operator>=(intvec const& x) const { return vcge_s32(v, x.v); }
-    
-    intvec_t abs() const { return vabs_s32(v); }
-    boolvec_t isignbit() const
-    {
-      //return *this < IV(I(0));
-      return intvec(vshr_n_s32(v, FP::bits-1)).as_bool();
-    }
-    intvec_t max(intvec_t x) const { return vmax_s32(v, x.v); }
-    intvec_t min(intvec_t x) const { return vmin_s32(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,2>: floatprops<float>
-  {
-    static int const size = 2;
-    typedef real_t scalar_t;
-    typedef float32x2_t vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<NEON:2*float>"; }
-    void barrier() { __asm__("": "+w"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vdup_n_f32(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vld1_f32(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
+template <> struct boolvec<float, 2>;
+template <> struct intvec<float, 2>;
+template <> struct realvec<float, 2>;
+
+template <> struct boolvec<float, 2> : floatprops<float> {
+  static int const size = 2;
+  typedef bool scalar_t;
+  typedef uint32x2_t bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values are -1, false values are 0
+  static uint_t from_bool(bool a) { return -int_t(a); }
+  static bool to_bool(uint_t a) { return a; }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(vdup_n_u32(from_bool(a))) {}
+  boolvec(bool const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return vmvn_u32(v); }
+
+  boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); }
+  boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); }
+  boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); }
+  boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); }
+
+  bool all() const {
+    boolvec r = vpmin_u32(v, v);
+    return r[0];
+  }
+  bool any() const {
+    boolvec r = vpmax_u32(v, v);
+    return r[0];
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 2> : floatprops<float> {
+  static int const size = 2;
+  typedef int_t scalar_t;
+  typedef int32x2_t ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vdup_n_s32(a)) {}
+  intvec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() {
+    return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0));
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  // Vector casts do not change the bit pattern
+  boolvec_t as_bool() const { return vreinterpret_u32_s32(v); }
+  boolvec_t convert_bool() const { return *this != IV(0); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return vneg_s32(v); }
+
+  intvec operator+(intvec x) const { return vadd_s32(v, x.v); }
+  intvec operator-(intvec x) const { return vsub_s32(v, x.v); }
+  intvec operator*(intvec x) const { return vmul_s32(v, x.v); }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+  intvec &operator*=(intvec const &x) { return *this = *this * x; }
+
+  intvec operator~() const { return vmvn_s32(v); }
+
+  intvec operator&(intvec x) const { return vand_s32(v, x.v); }
+  intvec operator|(intvec x) const { return vorr_s32(v, x.v); }
+  intvec operator^(intvec x) const { return veor_s32(v, x.v); }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const {
+    return vbsl_s32(vreinterpret_u32_s32(v), x.v, y.v);
+  }
+
+  intvec_t lsr(int_t n) const { return lsr(IV(n)); }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { return *this >> IV(n); }
+  intvec operator<<(int_t n) const { return *this << IV(n); }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec lsr(intvec n) const {
+    return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v));
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const { return vshl_s32(v, (-n).v); }
+  intvec operator<<(intvec n) const { return vshl_s32(v, n.v); }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const { return vclz_s32(v); }
+  intvec_t popcount() const {
+    return vpaddl_s16(vpaddl_s8(vcnt_s8(vreinterpret_s8_s32(v))));
+  }
+
+  boolvec_t operator==(intvec const &x) const { return vceq_s32(v, x.v); }
+  boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(intvec const &x) const { return vclt_s32(v, x.v); }
+  boolvec_t operator<=(intvec const &x) const { return vcle_s32(v, x.v); }
+  boolvec_t operator>(intvec const &x) const { return vcgt_s32(v, x.v); }
+  boolvec_t operator>=(intvec const &x) const { return vcge_s32(v, x.v); }
+
+  intvec_t abs() const { return vabs_s32(v); }
+  boolvec_t isignbit() const {
+    // return *this < IV(I(0));
+    return intvec(vshr_n_s32(v, FP::bits - 1)).as_bool();
+  }
+  intvec_t max(intvec_t x) const { return vmax_s32(v, x.v); }
+  intvec_t min(intvec_t x) const { return vmin_s32(v, x.v); }
+};
+
+template <> struct realvec<float, 2> : floatprops<float> {
+  static int const size = 2;
+  typedef real_t scalar_t;
+  typedef float32x2_t vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<NEON:2*float>"; }
+  void barrier() { __asm__("" : "+w"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(vdup_n_f32(a)) {}
+  realvec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return vld1_f32(p);
+  }
+  static realvec_t loadu(real_t const *p) {
 #if defined __ARM_FEATURE_UNALIGNED
-      return vld1_f32(p);
+    return vld1_f32(p);
 #else
-      realvec_t r;
-      r.set_elt(0, p[0]);
-      r.set_elt(1, p[1]);
-      return r;
+    realvec_t r;
+    r.set_elt(0, p[0]);
+    r.set_elt(1, p[1]);
+    return r;
 #endif
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vst1_f32(p, v);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    vst1_f32(p, v);
+  }
+  void storeu(real_t *p) const {
+// Vector stores would require vector loads, which would need to
+// be atomic
 #if defined __ARM_FEATURE_UNALIGNED
-      vst1_f32(p, v);
+    vst1_f32(p, v);
 #else
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
+    p[0] = (*this)[0];
+    p[1] = (*this)[1];
 #endif
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return vreinterpret_s32_f32(v); }
-    intvec_t convert_int() const { return vcvt_s32_f32(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return vneg_f32(v); }
-    
-    realvec operator+(realvec x) const { return vadd_f32(v, x.v); }
-    realvec operator-(realvec x) const { return vsub_f32(v, x.v); }
-    realvec operator*(realvec x) const { return vmul_f32(v, x.v); }
-    realvec operator/(realvec x) const { return *this * x.rcp(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      realvec r = vpmax_f32(v, v);
-      return r[0];
-    }
-    real_t minval() const
-    {
-      realvec r = vpmin_f32(v, v);
-      return r[0];
-    }
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1];
-    }
-    real_t sum() const
-    {
-      realvec r = vpadd_f32(v, v);
-      return r[0];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vceq_f32(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vclt_f32(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vcle_f32(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vcgt_f32(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vcge_f32(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const
-    {
-      // return vrndp_f32(v);
-      return MF::vml_ceil(*this);
-    }
-    realvec copysign(realvec y) const
-    {
-      return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v);
-    }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vabs_f32(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const
-    {
-      // return vrndm_f32(v);
-      return MF::vml_floor(*this);
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return vfma_f32(z.v, v, y.v);
-    }
-    realvec fmax(realvec y) const { return vmax_f32(v, y.v); }
-    realvec fmin(realvec y) const { return vmin_f32(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      // TODO: vfma_f32
-      return vmla_f32(z.v, v, y.v);
-    }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec r = vrecpe_f32(v);
-      r *= vrecps_f32(v, r);
-      r *= vrecps_f32(v, r);
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-      // return vrndn_f32(v);
-      return MF::vml_rint(*this);
-    }
-    realvec round() const
-    {
-      // return vrnda_f32(v);
-      return MF::vml_round(*this);
-    }
-    realvec rsqrt() const
-    {
-      realvec r = vrsqrte_f32(v);
-      r *= vrsqrts_f32(v, r*r);
-      r *= vrsqrts_f32(v, r*r);
-      return r;
-    }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return *this * rsqrt(); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const
-    {
-      // return vrnd_f32(v);
-      return MF::vml_trunc(*this);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
     }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,2> boolvec<float,2>::as_int() const
-  {
-    return vreinterpret_s32_u32(v);
-  }
-  
-  inline intvec<float,2> boolvec<float,2>::convert_int() const
-  {
-    return - as_int();
-  }
-  
-  inline
-  boolvec<float,2> boolvec<float,2>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return vbsl_u32(v, x.v, y.v);
-  }
-  
-  inline intvec<float,2> boolvec<float,2>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return vbsl_s32(v, x.v, y.v);
-  }
-  
-  inline
-  realvec<float,2> boolvec<float,2>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vbsl_f32(v, x.v, y.v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<float,2> intvec<float,2>::as_float() const
-  {
-    return vreinterpret_f32_s32(v);
-  }
-  
-  inline realvec<float,2> intvec<float,2>::convert_float() const
-  {
-    return vcvt_f32_s32(v);
-  }
-  
-  inline intvec<float,2> intvec<float,2>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,2> intvec<float,2>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return vreinterpret_s32_f32(v); }
+  intvec_t convert_int() const { return vcvt_s32_f32(v); }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return vneg_f32(v); }
+
+  realvec operator+(realvec x) const { return vadd_f32(v, x.v); }
+  realvec operator-(realvec x) const { return vsub_f32(v, x.v); }
+  realvec operator*(realvec x) const { return vmul_f32(v, x.v); }
+  realvec operator/(realvec x) const { return *this * x.rcp(); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    realvec r = vpmax_f32(v, v);
+    return r[0];
+  }
+  real_t minval() const {
+    realvec r = vpmin_f32(v, v);
+    return r[0];
+  }
+  real_t prod() const { return (*this)[0] * (*this)[1]; }
+  real_t sum() const {
+    realvec r = vpadd_f32(v, v);
+    return r[0];
+  }
+
+  boolvec_t operator==(realvec const &x) const { return vceq_f32(v, x.v); }
+  boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(realvec const &x) const { return vclt_f32(v, x.v); }
+  boolvec_t operator<=(realvec const &x) const { return vcle_f32(v, x.v); }
+  boolvec_t operator>(realvec const &x) const { return vcgt_f32(v, x.v); }
+  boolvec_t operator>=(realvec const &x) const { return vcge_f32(v, x.v); }
+
+  realvec acos() const { return MF::vml_acos(*this); }
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const {
+    // return vrndp_f32(v);
+    return MF::vml_ceil(*this);
+  }
+  realvec copysign(realvec y) const {
+    return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v);
+  }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return vabs_f32(v); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const {
+    // return vrndm_f32(v);
+    return MF::vml_floor(*this);
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return vfma_f32(z.v, v, y.v);
+  }
+  realvec fmax(realvec y) const { return vmax_f32(v, y.v); }
+  realvec fmin(realvec y) const { return vmin_f32(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    // TODO: vfma_f32
+    return vmla_f32(z.v, v, y.v);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const {
+    realvec r = vrecpe_f32(v);
+    r *= vrecps_f32(v, r);
+    r *= vrecps_f32(v, r);
+    return r;
+  }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const {
+    // return vrndn_f32(v);
+    return MF::vml_rint(*this);
+  }
+  realvec round() const {
+    // return vrnda_f32(v);
+    return MF::vml_round(*this);
+  }
+  realvec rsqrt() const {
+    realvec r = vrsqrte_f32(v);
+    r *= vrsqrts_f32(v, r * r);
+    r *= vrsqrts_f32(v, r * r);
+    return r;
+  }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); }
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const { return *this * rsqrt(); }
+  realvec tan() const { return MF::vml_tan(*this); }
+  realvec tanh() const { return MF::vml_tanh(*this); }
+  realvec trunc() const {
+    // return vrnd_f32(v);
+    return MF::vml_trunc(*this);
+  }
+};
+
+// boolvec definitions
+
+inline intvec<float, 2> boolvec<float, 2>::as_int() const {
+  return vreinterpret_s32_u32(v);
+}
+
+inline intvec<float, 2> boolvec<float, 2>::convert_int() const {
+  return -as_int();
+}
+
+inline boolvec<float, 2> boolvec<float, 2>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return vbsl_u32(v, x.v, y.v);
+}
+
+inline intvec<float, 2> boolvec<float, 2>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return vbsl_s32(v, x.v, y.v);
+}
+
+inline realvec<float, 2> boolvec<float, 2>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return vbsl_f32(v, x.v, y.v);
+}
+
+// intvec definitions
+
+inline realvec<float, 2> intvec<float, 2>::as_float() const {
+  return vreinterpret_f32_s32(v);
+}
+
+inline realvec<float, 2> intvec<float, 2>::convert_float() const {
+  return vcvt_f32_s32(v);
+}
+
+inline intvec<float, 2> intvec<float, 2>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 2> intvec<float, 2>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_NEON_FLOAT2_H
+#endif // #ifndef VEC_NEON_FLOAT2_H
diff --git a/lib/kernel/vecmathlib/vec_neon_float4.h b/lib/kernel/vecmathlib/vec_neon_float4.h
index 2bd9dda..9ec1e79 100644
--- a/lib/kernel/vecmathlib/vec_neon_float4.h
+++ b/lib/kernel/vecmathlib/vec_neon_float4.h
@@ -14,628 +14,537 @@
 // Neon intrinsics
 #include <arm_neon.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_4
-  template<> struct boolvec<float,4>;
-  template<> struct intvec<float,4>;
-  template<> struct realvec<float,4>;
-  
-  
-  
-  template<>
-  struct boolvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef uint32x4_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(vdupq_n_u32(from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vmvnq_u32(v); }
-    
-    boolvec operator&&(boolvec x) const { return vandq_u32(v, x.v); }
-    boolvec operator||(boolvec x) const { return vorrq_u32(v, x.v); }
-    boolvec operator==(boolvec x) const { return vceqq_u32(v, x.v); }
-    boolvec operator!=(boolvec x) const { return veorq_u32(v, x.v); }
-    
-    bool all() const
-    {
-      uint32x2_t x = vpmin_u32(vget_low_u32(v), vget_high_u32(v));
-      uint32x2_t y = vpmin_u32(x, x);
-      uint32_t z = vget_lane_u32(y, 0);
-      return to_bool(z);
-    }
-    bool any() const
-    {
-      uint32x2_t x = vpmax_u32(vget_low_u32(v), vget_high_u32(v));
-      uint32x2_t y = vpmax_u32(x, x);
-      uint32_t z = vget_lane_u32(y, 0);
-      return to_bool(z);
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef int32x4_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vdupq_n_s32(a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota()
-    {
-      return
-        vcombine_s32(vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)),
-                     vcreate_s32((uint64_t(3) << uint64_t(32)) | uint64_t(2)));
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return vreinterpretq_u32_s32(v); }
-    boolvec_t convert_bool() const { return *this != IV(0); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return vnegq_s32(v); }
-    
-    intvec operator+(intvec x) const { return vaddq_s32(v, x.v); }
-    intvec operator-(intvec x) const { return vsubq_s32(v, x.v); }
-    intvec operator*(intvec x) const { return vmulq_s32(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    intvec& operator*=(intvec const& x) { return *this=*this*x; }
-    
-    
-    
-    intvec operator~() const { return vmvnq_s32(v); }
-    
-    intvec operator&(intvec x) const { return vandq_s32(v, x.v); }
-    intvec operator|(intvec x) const { return vorrq_s32(v, x.v); }
-    intvec operator^(intvec x) const { return veorq_s32(v, x.v); }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const
-    {
-      return vbslq_s32(vreinterpretq_u32_s32(v), x.v, y.v);
-    }
-    
-    
-    
-    intvec_t lsr(int_t n) const { return lsr(IV(n)); }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v), (-n).v));
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      return vshlq_s32(v, (-n).v);
-    }
-    intvec operator<<(intvec n) const
-    {
-      return vshlq_s32(v, n.v);
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const { return vclzq_s32(v); }
-    intvec_t popcount() const
-    {
-      return vpaddlq_s16(vpaddlq_s8(vcntq_s8(vreinterpretq_s8_s32(v))));
-    }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const { return vceqq_s32(v, x.v); }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const { return vcltq_s32(v, x.v); }
-    boolvec_t operator<=(intvec const& x) const { return vcleq_s32(v, x.v); }
-    boolvec_t operator>(intvec const& x) const { return vcgtq_s32(v, x.v); }
-    boolvec_t operator>=(intvec const& x) const { return vcgeq_s32(v, x.v); }
-    
-    intvec_t abs() const { return vabsq_s32(v); }
-    boolvec_t isignbit() const
-    {
-      //return *this < IV(I(0));
-      return intvec(vshrq_n_s32(v, FP::bits-1)).as_bool();
-    }
-    intvec_t max(intvec_t x) const { return vmaxq_s32(v, x.v); }
-    intvec_t min(intvec_t x) const { return vminq_s32(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef float32x4_t vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<NEON:4*float>"; }
-    void barrier() { __asm__("": "+w"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vdupq_n_f32(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vld1q_f32(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
+template <> struct boolvec<float, 4>;
+template <> struct intvec<float, 4>;
+template <> struct realvec<float, 4>;
+
+template <> struct boolvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef uint32x4_t bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values are -1, false values are 0
+  static uint_t from_bool(bool a) { return -int_t(a); }
+  static bool to_bool(uint_t a) { return a; }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(vdupq_n_u32(from_bool(a))) {}
+  boolvec(bool const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return vmvnq_u32(v); }
+
+  boolvec operator&&(boolvec x) const { return vandq_u32(v, x.v); }
+  boolvec operator||(boolvec x) const { return vorrq_u32(v, x.v); }
+  boolvec operator==(boolvec x) const { return vceqq_u32(v, x.v); }
+  boolvec operator!=(boolvec x) const { return veorq_u32(v, x.v); }
+
+  bool all() const {
+    uint32x2_t x = vpmin_u32(vget_low_u32(v), vget_high_u32(v));
+    uint32x2_t y = vpmin_u32(x, x);
+    uint32_t z = vget_lane_u32(y, 0);
+    return to_bool(z);
+  }
+  bool any() const {
+    uint32x2_t x = vpmax_u32(vget_low_u32(v), vget_high_u32(v));
+    uint32x2_t y = vpmax_u32(x, x);
+    uint32_t z = vget_lane_u32(y, 0);
+    return to_bool(z);
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef int32x4_t ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vdupq_n_s32(a)) {}
+  intvec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() {
+    return vcombine_s32(
+        vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)),
+        vcreate_s32((uint64_t(3) << uint64_t(32)) | uint64_t(2)));
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  // Vector casts do not change the bit pattern
+  boolvec_t as_bool() const { return vreinterpretq_u32_s32(v); }
+  boolvec_t convert_bool() const { return *this != IV(0); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return vnegq_s32(v); }
+
+  intvec operator+(intvec x) const { return vaddq_s32(v, x.v); }
+  intvec operator-(intvec x) const { return vsubq_s32(v, x.v); }
+  intvec operator*(intvec x) const { return vmulq_s32(v, x.v); }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+  intvec &operator*=(intvec const &x) { return *this = *this * x; }
+
+  intvec operator~() const { return vmvnq_s32(v); }
+
+  intvec operator&(intvec x) const { return vandq_s32(v, x.v); }
+  intvec operator|(intvec x) const { return vorrq_s32(v, x.v); }
+  intvec operator^(intvec x) const { return veorq_s32(v, x.v); }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const {
+    return vbslq_s32(vreinterpretq_u32_s32(v), x.v, y.v);
+  }
+
+  intvec_t lsr(int_t n) const { return lsr(IV(n)); }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { return *this >> IV(n); }
+  intvec operator<<(int_t n) const { return *this << IV(n); }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v), (-n).v));
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const { return vshlq_s32(v, (-n).v); }
+  intvec operator<<(intvec n) const { return vshlq_s32(v, n.v); }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const { return vclzq_s32(v); }
+  intvec_t popcount() const {
+    return vpaddlq_s16(vpaddlq_s8(vcntq_s8(vreinterpretq_s8_s32(v))));
+  }
+
+  boolvec_t operator==(intvec const &x) const { return vceqq_s32(v, x.v); }
+  boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(intvec const &x) const { return vcltq_s32(v, x.v); }
+  boolvec_t operator<=(intvec const &x) const { return vcleq_s32(v, x.v); }
+  boolvec_t operator>(intvec const &x) const { return vcgtq_s32(v, x.v); }
+  boolvec_t operator>=(intvec const &x) const { return vcgeq_s32(v, x.v); }
+
+  intvec_t abs() const { return vabsq_s32(v); }
+  boolvec_t isignbit() const {
+    // return *this < IV(I(0));
+    return intvec(vshrq_n_s32(v, FP::bits - 1)).as_bool();
+  }
+  intvec_t max(intvec_t x) const { return vmaxq_s32(v, x.v); }
+  intvec_t min(intvec_t x) const { return vminq_s32(v, x.v); }
+};
+
+template <> struct realvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef real_t scalar_t;
+  typedef float32x4_t vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<NEON:4*float>"; }
+  void barrier() { __asm__("" : "+w"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(vdupq_n_f32(a)) {}
+  realvec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return vld1q_f32(p);
+  }
+  static realvec_t loadu(real_t const *p) {
 #if defined __ARM_FEATURE_UNALIGNED
-      return vld1q_f32(p);
+    return vld1q_f32(p);
 #else
-      realvec_t r;
-      r.set_elt(0, p[0]);
-      r.set_elt(1, p[1]);
-      r.set_elt(2, p[2]);
-      r.set_elt(3, p[3]);
-      return r;
+    realvec_t r;
+    r.set_elt(0, p[0]);
+    r.set_elt(1, p[1]);
+    r.set_elt(2, p[2]);
+    r.set_elt(3, p[3]);
+    return r;
 #endif
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vst1q_f32(p, v);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    vst1q_f32(p, v);
+  }
+  void storeu(real_t *p) const {
+// Vector stores would require vector loads, which would need to
+// be atomic
 #if defined __ARM_FEATURE_UNALIGNED
-      vst1q_f32(p, v);
+    vst1q_f32(p, v);
 #else
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-      p[2] = (*this)[2];
-      p[3] = (*this)[3];
+    p[0] = (*this)[0];
+    p[1] = (*this)[1];
+    p[2] = (*this)[2];
+    p[3] = (*this)[3];
 #endif
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return vreinterpretq_s32_f32(v); }
-    intvec_t convert_int() const { return vcvtq_s32_f32(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return vnegq_f32(v); }
-    
-    realvec operator+(realvec x) const { return vaddq_f32(v, x.v); }
-    realvec operator-(realvec x) const { return vsubq_f32(v, x.v); }
-    realvec operator*(realvec x) const { return vmulq_f32(v, x.v); }
-    realvec operator/(realvec x) const { return *this * x.rcp(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      float32x2_t x = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
-      float32x2_t y = vpmax_f32(x, x);
-      float32_t z = vget_lane_f32(y, 0);
-      return z;
-    }
-    real_t minval() const
-    {
-      float32x2_t x = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
-      float32x2_t y = vpmin_f32(x, x);
-      float32_t z = vget_lane_f32(y, 0);
-      return z;
-    }
-    real_t prod() const
-    {
-      // TODO: multiply pairwise with 2-vectors
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-    }
-    real_t sum() const
-    {
-      float32x2_t x = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
-      float32x2_t y = vpadd_f32(x, x);
-      float32_t z = vget_lane_f32(y, 0);
-      return z;
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vceqq_f32(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vcltq_f32(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vcleq_f32(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vcgtq_f32(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vcgeq_f32(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const
-    {
-      // return vrndpq_f32(v);
-      return MF::vml_ceil(*this);
-    }
-    realvec copysign(realvec y) const
-    {
-      return vbslq_f32(vdupq_n_u32(FP::signbit_mask), y.v, v);
-    }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vabsq_f32(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const
-    {
-      // return vrndmq_f32(v);
-      return MF::vml_floor(*this);
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return vfmaq_f32(z.v, v, y.v);
-    }
-    realvec fmax(realvec y) const { return vmaxq_f32(v, y.v); }
-    realvec fmin(realvec y) const { return vminq_f32(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return vmlaq_f32(z.v, v, y.v);
-    }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec r = vrecpeq_f32(v);
-      r *= vrecpsq_f32(v, r);
-      r *= vrecpsq_f32(v, r);
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-      // return vrndnq_f32(v);
-      return MF::vml_rint(*this);
-    }
-    realvec round() const
-    {
-      // return vrndaq_f32(v);
-      return MF::vml_round(*this);
-    }
-    realvec rsqrt() const
-    {
-      realvec r = vrsqrteq_f32(v);
-      r *= vrsqrtsq_f32(v, r*r);
-      r *= vrsqrtsq_f32(v, r*r);
-      return r;
-    }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return *this * rsqrt(); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const
-    {
-      // return vrndq_f32(v);
-      return MF::vml_trunc(*this);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,4> boolvec<float,4>::as_int() const
-  {
-    return vreinterpretq_s32_u32(v);
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::convert_int() const
-  {
-    return - as_int();
-  }
-  
-  inline
-  boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return vbslq_u32(v, x.v, y.v);
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return vbslq_s32(v, x.v, y.v);
-  }
-  
-  inline
-  realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vbslq_f32(v, x.v, y.v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<float,4> intvec<float,4>::as_float() const
-  {
-    return vreinterpretq_f32_s32(v);
-  }
-  
-  inline realvec<float,4> intvec<float,4>::convert_float() const
-  {
-    return vcvtq_f32_s32(v);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return vreinterpretq_s32_f32(v); }
+  intvec_t convert_int() const { return vcvtq_s32_f32(v); }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return vnegq_f32(v); }
+
+  realvec operator+(realvec x) const { return vaddq_f32(v, x.v); }
+  realvec operator-(realvec x) const { return vsubq_f32(v, x.v); }
+  realvec operator*(realvec x) const { return vmulq_f32(v, x.v); }
+  realvec operator/(realvec x) const { return *this * x.rcp(); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    float32x2_t x = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
+    float32x2_t y = vpmax_f32(x, x);
+    float32_t z = vget_lane_f32(y, 0);
+    return z;
+  }
+  real_t minval() const {
+    float32x2_t x = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
+    float32x2_t y = vpmin_f32(x, x);
+    float32_t z = vget_lane_f32(y, 0);
+    return z;
+  }
+  real_t prod() const {
+    // TODO: multiply pairwise with 2-vectors
+    return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+  }
+  real_t sum() const {
+    float32x2_t x = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
+    float32x2_t y = vpadd_f32(x, x);
+    float32_t z = vget_lane_f32(y, 0);
+    return z;
+  }
+
+  boolvec_t operator==(realvec const &x) const { return vceqq_f32(v, x.v); }
+  boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(realvec const &x) const { return vcltq_f32(v, x.v); }
+  boolvec_t operator<=(realvec const &x) const { return vcleq_f32(v, x.v); }
+  boolvec_t operator>(realvec const &x) const { return vcgtq_f32(v, x.v); }
+  boolvec_t operator>=(realvec const &x) const { return vcgeq_f32(v, x.v); }
+
+  realvec acos() const { return MF::vml_acos(*this); }
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const {
+    // return vrndpq_f32(v);
+    return MF::vml_ceil(*this);
+  }
+  realvec copysign(realvec y) const {
+    return vbslq_f32(vdupq_n_u32(FP::signbit_mask), y.v, v);
+  }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return vabsq_f32(v); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const {
+    // return vrndmq_f32(v);
+    return MF::vml_floor(*this);
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return vfmaq_f32(z.v, v, y.v);
+  }
+  realvec fmax(realvec y) const { return vmaxq_f32(v, y.v); }
+  realvec fmin(realvec y) const { return vminq_f32(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return vmlaq_f32(z.v, v, y.v);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const {
+    realvec r = vrecpeq_f32(v);
+    r *= vrecpsq_f32(v, r);
+    r *= vrecpsq_f32(v, r);
+    return r;
+  }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const {
+    // return vrndnq_f32(v);
+    return MF::vml_rint(*this);
+  }
+  realvec round() const {
+    // return vrndaq_f32(v);
+    return MF::vml_round(*this);
+  }
+  realvec rsqrt() const {
+    realvec r = vrsqrteq_f32(v);
+    r *= vrsqrtsq_f32(v, r * r);
+    r *= vrsqrtsq_f32(v, r * r);
+    return r;
+  }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); }
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const { return *this * rsqrt(); }
+  realvec tan() const { return MF::vml_tan(*this); }
+  realvec tanh() const { return MF::vml_tanh(*this); }
+  realvec trunc() const {
+    // return vrndq_f32(v);
+    return MF::vml_trunc(*this);
+  }
+};
+
+// boolvec definitions
+
+inline intvec<float, 4> boolvec<float, 4>::as_int() const {
+  return vreinterpretq_s32_u32(v);
+}
+
+inline intvec<float, 4> boolvec<float, 4>::convert_int() const {
+  return -as_int();
+}
+
+inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return vbslq_u32(v, x.v, y.v);
+}
+
+inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return vbslq_s32(v, x.v, y.v);
+}
+
+inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return vbslq_f32(v, x.v, y.v);
+}
+
+// intvec definitions
+
+inline realvec<float, 4> intvec<float, 4>::as_float() const {
+  return vreinterpretq_f32_s32(v);
+}
+
+inline realvec<float, 4> intvec<float, 4>::convert_float() const {
+  return vcvtq_f32_s32(v);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_NEON_FLOAT4_H
+#endif // #ifndef VEC_NEON_FLOAT4_H
diff --git a/lib/kernel/vecmathlib/vec_pseudo.h b/lib/kernel/vecmathlib/vec_pseudo.h
index 2aafc23..c4cbbc1 100644
--- a/lib/kernel/vecmathlib/vec_pseudo.h
+++ b/lib/kernel/vecmathlib/vec_pseudo.h
@@ -12,1668 +12,1492 @@
 #include <climits>
 #include <cstdlib>
 #ifndef VML_NO_IOSTREAM
-#  include <sstream>
+#include <sstream>
 #endif
 #include <string>
 
+namespace vecmathlib {
 
+template <typename T, int N> struct boolpseudovec;
+template <typename T, int N> struct intpseudovec;
+template <typename T, int N> struct realpseudovec;
 
-namespace vecmathlib {
-  
-  template<typename T, int N> struct boolpseudovec;
-  template<typename T, int N> struct intpseudovec;
-  template<typename T, int N> struct realpseudovec;
-  
-  
-  
-  template<typename T, int N>
-  struct boolpseudovec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef bool scalar_t;
-    typedef bool bvector_t[size];
-    static int const alignment = sizeof(bool);
-    
-    typedef boolpseudovec boolvec_t;
-    typedef intpseudovec<real_t, size> intvec_t;
-    typedef realpseudovec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolpseudovec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolpseudovec(boolpseudovec const& x): v(x.v) {}
-    // boolpseudovec& operator=(boolpseudovec const& x) { return v=x.v, *this; }
-    boolpseudovec(bool a) { for (int d=0; d<size; ++d) v[d]=a; }
-    boolpseudovec(bool const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    
-    bool operator[](int n) const { return v[n]; }
-    boolvec_t& set_elt(int n, bool a) { return v[n]=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intpseudovec
-    intvec_t convert_int() const; // defined after intpseudovec
-    
-    
-    
-    boolvec_t operator!() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = !v[d];
-      return res;
-    }
-    
-    boolvec_t operator&&(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] && x.v[d];
-      return res;
-    }
-    boolvec_t operator||(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] || x.v[d];
-      return res;
-    }
-    boolvec_t operator==(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    
-    bool all() const
-    {
-      bool res = v[0];
-      for (int d=1; d<size; ++d) res = res && v[d];
-      return res;
-    }
-    bool any() const
-    {
-      bool res = v[0];
-      for (int d=1; d<size; ++d) res = res || v[d];
-      return res;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intpseudovec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realpseudovec
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct intpseudovec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t[size];
-    static int const alignment = sizeof(int_t);
-    
-    typedef boolpseudovec<real_t, size> boolvec_t;
-    typedef intpseudovec intvec_t;
-    typedef realpseudovec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intpseudovec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intpseudovec(intpseudovec const& x): v(x.v) {}
-    // intpseudovec& operator=(intpseudovec const& x) { return v=x.v, *this; }
-    intpseudovec(int_t a) { for (int d=0; d<size; ++d) v[d]=a; }
-    intpseudovec(int_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    static intvec_t iota()
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d]=d;
-      return res;
-    }
-    
-    int_t operator[](int n) const { return v[n]; }
-    intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d]=v[d];
-      return res;
-    }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d];
-      return res;
-    }
-    realvec_t as_float() const;      // defined after realpseudovec
-    realvec_t convert_float() const; // defined after realpseudovec
-    
-    
-    
-    intvec_t operator+() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = + v[d];
-      return res;
-    }
-    intvec_t operator-() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = - v[d];
-      return res;
-    }
-    
-    intvec_t& operator+=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] += x.v[d];
-      return *this;
-    }
-    intvec_t& operator-=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] -= x.v[d];
-      return *this;
-    }
-    intvec_t& operator*=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] *= x.v[d];
-      return *this;
-    }
-    intvec_t& operator/=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] /= x.v[d];
-      return *this;
-    }
-    intvec_t& operator%=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] %= x.v[d];
-      return *this;
-    }
-    
-    intvec_t operator+(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res += x;
-    }
-    intvec_t operator-(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res -= x;
-    }
-    intvec_t operator*(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res *= x;
-    }
-    intvec_t operator/(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res /= x;
-    }
-    intvec_t operator%(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res %= x;
-    }
-    
-    
-    
-    intvec_t operator~() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = ~ v[d];
-      return res;
-    }
-    
-    intvec_t& operator&=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] &= x.v[d];
-      return *this;
-    }
-    intvec_t& operator|=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] |= x.v[d];
-      return *this;
-    }
-    intvec_t& operator^=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] ^= x.v[d];
-      return *this;
-    }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res &= x;
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res |= x;
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res ^= x;
-    }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n));
-      return res;
-    }
-    intvec_t rotate(int_t n) const;
-    intvec_t& operator>>=(int_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] >>= n;
-      return *this;
-    }
-    intvec_t& operator<<=(int_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] <<= n;
-      return *this;
-    }
-    intvec_t operator>>(int_t n) const
-    {
-      intvec_t res = *this;
-      return res >>= n;
-    }
-    intvec_t operator<<(int_t n) const
-    {
-      intvec_t res = *this;
-      return res <<= n;
-    }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n.v[d]));
-      return res;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t& operator>>=(intvec_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] >>= n.v[d];
-      return *this;
-    }
-    intvec_t& operator<<=(intvec_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] <<= n.v[d];
-      return *this;
-    }
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t res = *this;
-      return res >>= n;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t res = *this;
-      return res <<= n;
-    }
-    
-    intvec_t clz() const
-    {
-      intvec_t res;
+template <typename T, int N> struct boolpseudovec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef bool scalar_t;
+  typedef bool bvector_t[size];
+  static int const alignment = sizeof(bool);
+
+  typedef boolpseudovec boolvec_t;
+  typedef intpseudovec<real_t, size> intvec_t;
+  typedef realpseudovec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolpseudovec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolpseudovec(boolpseudovec const& x): v(x.v) {}
+  // boolpseudovec& operator=(boolpseudovec const& x) { return v=x.v, *this; }
+  boolpseudovec(bool a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  boolpseudovec(bool const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+
+  bool operator[](int n) const { return v[n]; }
+  boolvec_t &set_elt(int n, bool a) { return v[n] = a, *this; }
+
+  intvec_t as_int() const;      // defined after intpseudovec
+  intvec_t convert_int() const; // defined after intpseudovec
+
+  boolvec_t operator!() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = !v[d];
+    return res;
+  }
+
+  boolvec_t operator&&(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] && x.v[d];
+    return res;
+  }
+  boolvec_t operator||(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] || x.v[d];
+    return res;
+  }
+  boolvec_t operator==(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+
+  bool all() const {
+    bool res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = res && v[d];
+    return res;
+  }
+  bool any() const {
+    bool res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = res || v[d];
+    return res;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intpseudovec
+  realvec_t ifthen(realvec_t x,
+                   realvec_t y) const; // defined after realpseudovec
+};
+
+template <typename T, int N> struct intpseudovec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t[size];
+  static int const alignment = sizeof(int_t);
+
+  typedef boolpseudovec<real_t, size> boolvec_t;
+  typedef intpseudovec intvec_t;
+  typedef realpseudovec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intpseudovec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intpseudovec(intpseudovec const& x): v(x.v) {}
+  // intpseudovec& operator=(intpseudovec const& x) { return v=x.v, *this; }
+  intpseudovec(int_t a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  intpseudovec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+  static intvec_t iota() {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = d;
+    return res;
+  }
+
+  int_t operator[](int n) const { return v[n]; }
+  intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; }
+
+  boolvec_t as_bool() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d];
+    return res;
+  }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d];
+    return res;
+  }
+  realvec_t as_float() const;      // defined after realpseudovec
+  realvec_t convert_float() const; // defined after realpseudovec
+
+  intvec_t operator+() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = +v[d];
+    return res;
+  }
+  intvec_t operator-() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = -v[d];
+    return res;
+  }
+
+  intvec_t &operator+=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] += x.v[d];
+    return *this;
+  }
+  intvec_t &operator-=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] -= x.v[d];
+    return *this;
+  }
+  intvec_t &operator*=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] *= x.v[d];
+    return *this;
+  }
+  intvec_t &operator/=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] /= x.v[d];
+    return *this;
+  }
+  intvec_t &operator%=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] %= x.v[d];
+    return *this;
+  }
+
+  intvec_t operator+(intvec_t x) const {
+    intvec_t res = *this;
+    return res += x;
+  }
+  intvec_t operator-(intvec_t x) const {
+    intvec_t res = *this;
+    return res -= x;
+  }
+  intvec_t operator*(intvec_t x) const {
+    intvec_t res = *this;
+    return res *= x;
+  }
+  intvec_t operator/(intvec_t x) const {
+    intvec_t res = *this;
+    return res /= x;
+  }
+  intvec_t operator%(intvec_t x) const {
+    intvec_t res = *this;
+    return res %= x;
+  }
+
+  intvec_t operator~() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = ~v[d];
+    return res;
+  }
+
+  intvec_t &operator&=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] &= x.v[d];
+    return *this;
+  }
+  intvec_t &operator|=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] |= x.v[d];
+    return *this;
+  }
+  intvec_t &operator^=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] ^= x.v[d];
+    return *this;
+  }
+
+  intvec_t operator&(intvec_t x) const {
+    intvec_t res = *this;
+    return res &= x;
+  }
+  intvec_t operator|(intvec_t x) const {
+    intvec_t res = *this;
+    return res |= x;
+  }
+  intvec_t operator^(intvec_t x) const {
+    intvec_t res = *this;
+    return res ^= x;
+  }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = I(U(v[d]) >> U(n));
+    return res;
+  }
+  intvec_t rotate(int_t n) const;
+  intvec_t &operator>>=(int_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] >>= n;
+    return *this;
+  }
+  intvec_t &operator<<=(int_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] <<= n;
+    return *this;
+  }
+  intvec_t operator>>(int_t n) const {
+    intvec_t res = *this;
+    return res >>= n;
+  }
+  intvec_t operator<<(int_t n) const {
+    intvec_t res = *this;
+    return res <<= n;
+  }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = I(U(v[d]) >> U(n.v[d]));
+    return res;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t &operator>>=(intvec_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] >>= n.v[d];
+    return *this;
+  }
+  intvec_t &operator<<=(intvec_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] <<= n.v[d];
+    return *this;
+  }
+  intvec_t operator>>(intvec_t n) const {
+    intvec_t res = *this;
+    return res >>= n;
+  }
+  intvec_t operator<<(intvec_t n) const {
+    intvec_t res = *this;
+    return res <<= n;
+  }
+
+  intvec_t clz() const {
+    intvec_t res;
 #if defined __clang__ || defined __gcc__
-      for (int d=0; d<size; ++d) {
-        if (v[d] == 0) {
-          res.v[d] = CHAR_BIT * sizeof v[d];
+    for (int d = 0; d < size; ++d) {
+      if (v[d] == 0) {
+        res.v[d] = CHAR_BIT * sizeof v[d];
+      } else {
+        if (sizeof v[d] == sizeof(long long)) {
+          res.v[d] = __builtin_clzll(v[d]);
+        } else if (sizeof v[d] == sizeof(long)) {
+          res.v[d] = __builtin_clzl(v[d]);
+        } else if (sizeof v[d] == sizeof(int)) {
+          res.v[d] = __builtin_clz(v[d]);
+        } else if (sizeof v[d] == sizeof(short)) {
+          res.v[d] = __builtin_clzs(v[d]);
+        } else if (sizeof v[d] == sizeof(char)) {
+          res.v[d] = __builtin_clzs((unsigned short)(unsigned char)v[d]) -
+                     CHAR_BIT * (sizeof(short) - sizeof(char));
         } else {
-          if (sizeof v[d] == sizeof(long long)) {
-            res.v[d] = __builtin_clzll(v[d]);
-          } else if (sizeof v[d] == sizeof(long)) {
-            res.v[d] = __builtin_clzl(v[d]);
-          } else if (sizeof v[d] == sizeof(int)) {
-            res.v[d] = __builtin_clz(v[d]);
-          } else if (sizeof v[d] == sizeof(short)) {
-            res.v[d] = __builtin_clzs(v[d]);
-          } else if (sizeof v[d] == sizeof(char)) {
-            res.v[d] =
-              __builtin_clzs((unsigned short)(unsigned char)v[d]) -
-              CHAR_BIT * (sizeof(short) - sizeof(char));
-          } else {
-            __builtin_unreachable();
-          }
+          __builtin_unreachable();
         }
       }
+    }
 #else
-      res = MF::vml_clz(*this);
+    res = MF::vml_clz(*this);
 #endif
-      return res;
-    }
-    intvec_t popcount() const
-    {
-      intvec_t res;
+    return res;
+  }
+  intvec_t popcount() const {
+    intvec_t res;
 #if defined __clang__ || defined __gcc__
-      if (sizeof(int_t) == sizeof(long long)) {
-        for (int d=0; d<size; ++d) res.v[d] = __builtin_popcountll(v[d]);
-      } else if (sizeof(int_t) == sizeof(long)) {
-        for (int d=0; d<size; ++d) res.v[d] = __builtin_popcountl(v[d]);
-      } else if (sizeof(int_t) <= sizeof(int)) {
-        for (int d=0; d<size; ++d) res.v[d] = __builtin_popcount(v[d]);
-      } else {
-        __builtin_unreachable();
-      }
+    if (sizeof(int_t) == sizeof(long long)) {
+      for (int d = 0; d < size; ++d)
+        res.v[d] = __builtin_popcountll(v[d]);
+    } else if (sizeof(int_t) == sizeof(long)) {
+      for (int d = 0; d < size; ++d)
+        res.v[d] = __builtin_popcountl(v[d]);
+    } else if (sizeof(int_t) <= sizeof(int)) {
+      for (int d = 0; d < size; ++d)
+        res.v[d] = __builtin_popcount(v[d]);
+    } else {
+      __builtin_unreachable();
+    }
 #else
-      res = MF::vml_popcount(*this);
+    res = MF::vml_popcount(*this);
 #endif
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
-      return res;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
-      return res;
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
-      return res;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
-      return res;
-    }
-    
-    intvec_t abs() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = std::abs(v[d]);
-      return res;
-    }
-    
-    boolvec_t isignbit() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < 0;
-      return res;
-    }
-    
-    intvec_t max(intvec_t x) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = std::max(v[d], x.v[d]);
-      return res;
-    }
-    
-    intvec_t min(intvec_t x) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = std::min(v[d], x.v[d]);
-      return res;
-    }
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct realpseudovec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef real_t scalar_t;
-    typedef real_t vector_t[size];
-    static int const alignment = sizeof(real_t);
-    
+    return res;
+  }
+
+  boolvec_t operator==(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] < x.v[d];
+    return res;
+  }
+  boolvec_t operator<=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] <= x.v[d];
+    return res;
+  }
+  boolvec_t operator>(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] > x.v[d];
+    return res;
+  }
+  boolvec_t operator>=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] >= x.v[d];
+    return res;
+  }
+
+  intvec_t abs() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = std::abs(v[d]);
+    return res;
+  }
+
+  boolvec_t isignbit() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] < 0;
+    return res;
+  }
+
+  intvec_t max(intvec_t x) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = std::max(v[d], x.v[d]);
+    return res;
+  }
+
+  intvec_t min(intvec_t x) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = std::min(v[d], x.v[d]);
+    return res;
+  }
+};
+
+template <typename T, int N> struct realpseudovec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef real_t scalar_t;
+  typedef real_t vector_t[size];
+  static int const alignment = sizeof(real_t);
+
 #ifndef VML_NO_IOSTREAM
-    static char const* name()
-    {
-      static std::string name_;
-      if (name_.empty()) {
-        std::stringstream buf;
-        buf << "<libm:" << N << "*" << FP::name() << ">";
-        name_ = buf.str();
-      }
-      return name_.c_str();
+  static char const *name() {
+    static std::string name_;
+    if (name_.empty()) {
+      std::stringstream buf;
+      buf << "<libm:" << N << "*" << FP::name() << ">";
+      name_ = buf.str();
     }
+    return name_.c_str();
+  }
 #endif
-    void barrier()
-    {
+  void barrier() {
 #if defined __GNUC__ && !defined __clang__ && !defined __ICC
-      // GCC crashes when +X is used as constraint
-#  if defined __SSE2__
-      for (int d=0; d<size; ++d) __asm__("": "+x"(v[d]));
-#  elif defined __PPC64__       // maybe also __PPC__
-      for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
-#  elif defined __arm__
-      for (int d=0; d<size; ++d) __asm__("": "+w"(v[d]));
-#  else
-#    error "Floating point barrier undefined on this architecture"
-#  endif
+// GCC crashes when +X is used as constraint
+#if defined __SSE2__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+x"(v[d]));
+#elif defined __PPC64__ // maybe also __PPC__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+f"(v[d]));
+#elif defined __arm__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+w"(v[d]));
+#else
+#error "Floating point barrier undefined on this architecture"
+#endif
 #elif defined __clang__
-      for (int d=0; d<size; ++d) __asm__("": "+X"(v[d]));
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+X"(v[d]));
 #elif defined __ICC
-      for (int d=0; d<size; ++d) {
-        real_t tmp = v[d];
-        __asm__("": "+X"(tmp));
-        v[d] = tmp;
-      }
+    for (int d = 0; d < size; ++d) {
+      real_t tmp = v[d];
+      __asm__("" : "+X"(tmp));
+      v[d] = tmp;
+    }
 #elif defined __IBMCPP__
-      for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+f"(v[d]));
 #else
-#  error "Floating point barrier undefined on this architecture"
+#error "Floating point barrier undefined on this architecture"
 #endif
-    }
-    
-    typedef boolpseudovec<real_t, size> boolvec_t;
-    typedef intpseudovec<real_t, size> intvec_t;
-    typedef realpseudovec realvec_t;
-    
-  private:
-    boolvec_t mapb(bool f(real_t)) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    intvec_t map(int_t f(real_t)) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t)) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, int_t), intvec_t x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, real_t), realvec_t x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, real_t, real_t),
-                  realvec_t x, realvec_t y) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d], y.v[d]);
-      return res;
-    }
-  public:
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realpseudovec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realpseudovec(realpseudovec const& x): v(x.v) {}
-    // realpseudovec& operator=(realpseudovec const& x) { return v=x.v, *this; }
-    realpseudovec(real_t a) { for (int d=0; d<size; ++d) v[d]=a; }
-    realpseudovec(real_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    
-    real_t operator[](int n) const { return v[n]; }
-    realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = p[d];
-      return res;
-    }
-    static realvec_t loadu(real_t const* p, size_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      return m.m.ifthen(loada(p), *this);
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      return m.m.ifthen(loadu(p), *this);
-    }
-    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
-    {
-      return m.m.ifthen(loadu(p, ioff), *this);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p);
-    }
-    void storeu(real_t* p) const
-    {
-      for (int d=0; d<size; ++d) p[d] = v[d];
-    }
-    void storeu(real_t* p, size_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p, m);
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d];
-    }
-    void storeu(real_t* p, size_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = FP::as_int(v[d]);
-      return res;
-    }
-    intvec_t convert_int() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = FP::convert_int(v[d]);
-      return res;
-    }
-    
-    
-    
-    realvec_t operator+() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = + v[d];
-      return res;
-    }
-    realvec_t operator-() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = - v[d];
-      return res;
-    }
-    
-    realvec_t& operator+=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] += x.v[d];
-      return *this;
-    }
-    realvec_t& operator-=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] -= x.v[d];
-      return *this;
-    }
-    realvec_t& operator*=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] *= x.v[d];
-      return *this;
-    }
-    realvec_t& operator/=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] /= x.v[d];
-      return *this;
-    }
-    
-    realvec_t operator+(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res += x;
-    }
-    realvec_t operator-(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res -= x;
-    }
-    realvec_t operator*(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res *= x;
-    }
-    realvec_t operator/(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res /= x;
-    }
-    
-    real_t maxval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res = vml_std::fmax(res, v[d]);
-      return res;
-    }
-    real_t minval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res = vml_std::fmin(res, v[d]);
-      return res;
-    }
-    real_t prod() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res *= v[d];
-      return res;
-    }
-    real_t sum() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res += v[d];
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
-      return res;
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
-      return res;
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
-      return res;
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
-      return res;
-    }
-    
-    
-    
-    realvec_t acos() const { return map(vml_std::acos); }
-    realvec_t acosh() const { return map(vml_std::acosh); }
-    realvec_t asin() const { return map(vml_std::asin); }
-    realvec_t asinh() const { return map(vml_std::asinh); }
-    realvec_t atan() const { return map(vml_std::atan); }
-    realvec_t atan2(realvec_t y) const
-    {
-      return MF::vml_atan2(*this, y);
-    }
-    realvec_t atanh() const { return map(vml_std::atanh); }
-    realvec_t cbrt() const { return map(vml_std::cbrt); }
-    realvec_t ceil() const { return map(vml_std::ceil); }
-    realvec_t copysign(realvec_t y) const
-    {
-      return map(vml_std::copysign, y);
-    }
-    realvec_t cos() const { return map(vml_std::cos); }
-    realvec_t cosh() const { return map(vml_std::cosh); }
-    realvec_t exp() const { return map(vml_std::exp); }
-    realvec_t exp10() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = vml_std::exp(R(M_LN10) * v[d]);
-      return res;
-    }
-    realvec_t exp2() const { return map(vml_std::exp2); }
-    realvec_t expm1() const { return map(vml_std::expm1); }
-    realvec_t fabs() const { return map(vml_std::fabs); }
-    realvec_t fdim(realvec_t y) const { return map(vml_std::fdim, y); }
-    realvec_t floor() const { return map(vml_std::floor); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return map(vml_std::fma, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return map(vml_std::fmax, y); }
-    realvec_t fmin(realvec_t y) const { return map(vml_std::fmin, y); }
-    realvec_t fmod(realvec_t y) const { return map(vml_std::fmod, y); }
-    realvec_t frexp(intvec_t* ires) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) {
-        int iri;
-        real_t r = vml_std::frexp(v[d], &iri);
-        int_t ir = iri;
+  }
+
+  typedef boolpseudovec<real_t, size> boolvec_t;
+  typedef intpseudovec<real_t, size> intvec_t;
+  typedef realpseudovec realvec_t;
+
+private:
+  boolvec_t mapb(bool f(real_t)) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d]);
+    return res;
+  }
+  intvec_t map(int_t f(real_t)) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t)) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t, int_t), intvec_t x) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d], x.v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t, real_t), realvec_t x) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d], x.v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t, real_t, real_t), realvec_t x,
+                realvec_t y) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d], x.v[d], y.v[d]);
+    return res;
+  }
+
+public:
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realpseudovec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realpseudovec(realpseudovec const& x): v(x.v) {}
+  // realpseudovec& operator=(realpseudovec const& x) { return v=x.v, *this; }
+  realpseudovec(real_t a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  realpseudovec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+
+  real_t operator[](int n) const { return v[n]; }
+  realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loadu(p);
+  }
+  static realvec_t loadu(real_t const *p) {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = p[d];
+    return res;
+  }
+  static realvec_t loadu(real_t const *p, size_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    return m.m.ifthen(loada(p), *this);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    return m.m.ifthen(loadu(p), *this);
+  }
+  realvec_t loadu(real_t const *p, size_t ioff, mask_t const &m) const {
+    return m.m.ifthen(loadu(p, ioff), *this);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p);
+  }
+  void storeu(real_t *p) const {
+    for (int d = 0; d < size; ++d)
+      p[d] = v[d];
+  }
+  void storeu(real_t *p, size_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p, m);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    for (int d = 0; d < size; ++d)
+      if (m.m[d])
+        p[d] = v[d];
+  }
+  void storeu(real_t *p, size_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = FP::as_int(v[d]);
+    return res;
+  }
+  // Applies FP::convert_int to every element and returns the integer vector.
+  intvec_t convert_int() const {
+    intvec_t ints;
+    for (int i = 0; i < size; ++i) ints.v[i] = FP::convert_int(v[i]);
+    return ints;
+  }
+
+  // Unary plus: returns a vector whose elements are +v[i].
+  realvec_t operator+() const {
+    realvec_t out;
+    for (int i = 0; i < size; ++i) out.v[i] = +v[i];
+    return out;
+  }
+  // Unary minus: returns a vector whose elements are -v[i].
+  realvec_t operator-() const {
+    realvec_t out;
+    for (int i = 0; i < size; ++i) out.v[i] = -v[i];
+    return out;
+  }
+
+  // In-place element-wise addition of x; returns *this.
+  realvec_t &operator+=(realvec_t const &x) {
+    for (int i = 0; i < size; ++i) v[i] += x.v[i];
+    return *this;
+  }
+  // In-place element-wise subtraction of x; returns *this.
+  realvec_t &operator-=(realvec_t const &x) {
+    for (int i = 0; i < size; ++i) v[i] -= x.v[i];
+    return *this;
+  }
+  // In-place element-wise multiplication by x; returns *this.
+  realvec_t &operator*=(realvec_t const &x) {
+    for (int i = 0; i < size; ++i) v[i] *= x.v[i];
+    return *this;
+  }
+  // In-place element-wise division by x; returns *this.
+  realvec_t &operator/=(realvec_t const &x) {
+    for (int i = 0; i < size; ++i) v[i] /= x.v[i];
+    return *this;
+  }
+
+  // Element-wise sum, computed by applying operator+= to a copy of *this.
+  realvec_t operator+(realvec_t x) const {
+    return realvec_t(*this) += x;
+  }
+  // Element-wise difference, computed by applying operator-= to a copy of *this.
+  realvec_t operator-(realvec_t x) const {
+    return realvec_t(*this) -= x;
+  }
+  // Element-wise product, computed by applying operator*= to a copy of *this.
+  realvec_t operator*(realvec_t x) const {
+    return realvec_t(*this) *= x;
+  }
+  // Element-wise quotient, computed by applying operator/= to a copy of *this.
+  realvec_t operator/(realvec_t x) const {
+    return realvec_t(*this) /= x;
+  }
+
+  // Reduces the elements with vml_std::fmax, left to right, starting at v[0].
+  real_t maxval() const {
+    real_t acc = v[0];
+    for (int i = 1; i < size; ++i) acc = vml_std::fmax(acc, v[i]);
+    return acc;
+  }
+  // Reduces the elements with vml_std::fmin, left to right, starting at v[0].
+  real_t minval() const {
+    real_t acc = v[0];
+    for (int i = 1; i < size; ++i) acc = vml_std::fmin(acc, v[i]);
+    return acc;
+  }
+  // Product of all elements, accumulated left to right starting at v[0].
+  real_t prod() const {
+    real_t acc = v[0];
+    for (int i = 1; i < size; ++i) acc *= v[i];
+    return acc;
+  }
+  // Sum of all elements, accumulated left to right starting at v[0].
+  real_t sum() const {
+    real_t acc = v[0];
+    for (int i = 1; i < size; ++i) acc += v[i];
+    return acc;
+  }
+
+  // Element-wise comparison: result element i holds v[i] == x.v[i].
+  boolvec_t operator==(realvec_t const &x) const {
+    boolvec_t flags;
+    for (int i = 0; i < size; ++i) flags.v[i] = v[i] == x.v[i];
+    return flags;
+  }
+  // Element-wise comparison: result element i holds v[i] != x.v[i].
+  boolvec_t operator!=(realvec_t const &x) const {
+    boolvec_t flags;
+    for (int i = 0; i < size; ++i) flags.v[i] = v[i] != x.v[i];
+    return flags;
+  }
+  // Element-wise comparison: result element i holds v[i] < x.v[i].
+  boolvec_t operator<(realvec_t const &x) const {
+    boolvec_t flags;
+    for (int i = 0; i < size; ++i) flags.v[i] = v[i] < x.v[i];
+    return flags;
+  }
+  // Element-wise comparison: result element i holds v[i] <= x.v[i].
+  boolvec_t operator<=(realvec_t const &x) const {
+    boolvec_t flags;
+    for (int i = 0; i < size; ++i) flags.v[i] = v[i] <= x.v[i];
+    return flags;
+  }
+  // Element-wise comparison: result element i holds v[i] > x.v[i].
+  boolvec_t operator>(realvec_t const &x) const {
+    boolvec_t flags;
+    for (int i = 0; i < size; ++i) flags.v[i] = v[i] > x.v[i];
+    return flags;
+  }
+  // Element-wise comparison: result element i holds v[i] >= x.v[i].
+  boolvec_t operator>=(realvec_t const &x) const {
+    boolvec_t flags;
+    for (int i = 0; i < size; ++i) flags.v[i] = v[i] >= x.v[i];
+    return flags;
+  }
+
+  realvec_t acos() const { return map(vml_std::acos); } // unary math functions hand the scalar vml_std routine to map() (helper defined outside this excerpt; presumably applied per element)
+  realvec_t acosh() const { return map(vml_std::acosh); }
+  realvec_t asin() const { return map(vml_std::asin); }
+  realvec_t asinh() const { return map(vml_std::asinh); }
+  realvec_t atan() const { return map(vml_std::atan); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } // unlike its neighbours, delegates to MF::vml_atan2 instead of a scalar routine
+  realvec_t atanh() const { return map(vml_std::atanh); }
+  realvec_t cbrt() const { return map(vml_std::cbrt); }
+  realvec_t ceil() const { return map(vml_std::ceil); }
+  realvec_t copysign(realvec_t y) const { return map(vml_std::copysign, y); } // binary form: y is handed to map() as the second operand
+  realvec_t cos() const { return map(vml_std::cos); }
+  realvec_t cosh() const { return map(vml_std::cosh); }
+  realvec_t exp() const { return map(vml_std::exp); }
+  // Base-10 exponential, evaluated per element as exp(M_LN10 * v[i]).
+  realvec_t exp10() const {
+    realvec_t out;
+    for (int i = 0; i < size; ++i) out.v[i] = vml_std::exp(R(M_LN10) * v[i]);
+    return out;
+  }
+  realvec_t exp2() const { return map(vml_std::exp2); } // each of these hands a scalar vml_std routine to map() (helper defined outside this excerpt)
+  realvec_t expm1() const { return map(vml_std::expm1); }
+  realvec_t fabs() const { return map(vml_std::fabs); }
+  realvec_t fdim(realvec_t y) const { return map(vml_std::fdim, y); } // binary form: y is the second operand
+  realvec_t floor() const { return map(vml_std::floor); }
+  realvec_t fma(realvec_t y, realvec_t z) const { // ternary form: *this, y and z are combined through vml_std::fma
+    return map(vml_std::fma, y, z); // map() is defined outside this excerpt; presumably applied per element
+  }
+  realvec_t fmax(realvec_t y) const { return map(vml_std::fmax, y); } // binary forms: y is handed to map() as the second operand
+  realvec_t fmin(realvec_t y) const { return map(vml_std::fmin, y); }
+  realvec_t fmod(realvec_t y) const { return map(vml_std::fmod, y); }
+  realvec_t frexp(intvec_t *ires) const { // returns the per-element results of vml_std::frexp; exponents are written to *ires
+    realvec_t res;
+    for (int d = 0; d < size; ++d) {
+      int iri; // vml_std::frexp reports the exponent through a plain int
+      real_t r = vml_std::frexp(v[d], &iri);
+      int_t ir = iri; // convert to the vector's integer element type
 #if defined VML_HAVE_INF
-        if (vml_std::isinf(v[d])) ir = std::numeric_limits<int_t>::max();
+      if (vml_std::isinf(v[d]))
+        ir = std::numeric_limits<int_t>::max(); // infinite input: exponent forced to the largest int_t
 #endif
 #if defined VML_HAVE_NAN
-        if (vml_std::isnan(v[d])) ir = std::numeric_limits<int_t>::min();
+      if (vml_std::isnan(v[d]))
+        ir = std::numeric_limits<int_t>::min(); // NaN input: exponent forced to the smallest int_t
 #endif
-        res.v[d] = r;
-        ires->v[d] = ir;
-      }
-      return res;
+      res.v[d] = r;
+      ires->v[d] = ir;
     }
-    realvec_t hypot(realvec_t y) const { return map(vml_std::hypot, y); }
-    intvec_t ilogb() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) {
-        int_t r = vml_std::ilogb(v[d]);
-        typedef std::numeric_limits<int_t> NL;
-        if (FP_ILOGB0 != NL::min() and v[d] == R(0.0)) {
-          r = NL::min();
+    return res;
+  }
+  realvec_t hypot(realvec_t y) const { return map(vml_std::hypot, y); } // binary form: y is handed to map() as the second operand
+  intvec_t ilogb() const { // per-element vml_std::ilogb with special-case results normalised to int_t limits
+    intvec_t res;
+    for (int d = 0; d < size; ++d) {
+      int_t r = vml_std::ilogb(v[d]);
+      typedef std::numeric_limits<int_t> NL;
+      if (FP_ILOGB0 != NL::min() and v[d] == R(0.0)) { // zero input: override only when the platform constant differs from NL::min()
+        r = NL::min();
 #if defined VML_HAVE_INF
-        } else if (INT_MAX != NL::max() and vml_std::isinf(v[d])) {
-          r = NL::max();
+      } else if (INT_MAX != NL::max() and vml_std::isinf(v[d])) { // infinite input: report NL::max() when int_t is not plain int
+        r = NL::max();
 #endif
 #if defined VML_HAVE_NAN
-        } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v[d])) {
-          r = NL::min();
+      } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v[d])) { // NaN input: override only when the platform constant differs from NL::min()
+        r = NL::min();
 #endif
-        }
-        res.v[d] = r;
       }
-      return res;
+      res.v[d] = r;
     }
-    boolvec_t isfinite() const { return mapb(vml_std::isfinite); }
-    boolvec_t isinf() const { return mapb(vml_std::isinf); }
-    boolvec_t isnan() const { return mapb(vml_std::isnan); }
-    boolvec_t isnormal() const { return mapb(vml_std::isnormal); }
-    realvec_t ldexp(int_t n) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = vml_std::ldexp(v[d], n);
-      return res;
-    }
-    realvec_t ldexp(intvec_t n) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = vml_std::ldexp(v[d], n.v[d]);
-      return res;
-    }
-    realvec_t log() const { return map(vml_std::log); }
-    realvec_t log10() const { return map(vml_std::log10); }
-    realvec_t log1p() const { return map(vml_std::log1p); }
-    realvec_t log2() const { return map(vml_std::log2); }
-    intvec_t lrint() const
-    {
-      realvec_t res;
-      if (sizeof(int_t) <= sizeof(long)) {
-        for (int d=0; d<size; ++d) res.v[d] = vml_std::lrint(v[d]);
-      } else if (sizeof(int_t) <= sizeof(long long)) {
-        for (int d=0; d<size; ++d) res.v[d] = vml_std::llrint(v[d]);
-      } else {
-        __builtin_unreachable();
-      }
-      return res;
-    }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return map(vml_std::nextafter, y);
-    }
-    realvec_t pow(realvec_t y) const { return map(vml_std::pow, y); }
-    realvec_t rcp() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = R(1.0) / v[d];
-      return res;
-    }
-    realvec_t remainder(realvec_t y) const
-    {
-      return map(vml_std::remainder, y);
-    }
-    realvec_t rint() const { return map(vml_std::rint); }
-    realvec_t round() const { return map(vml_std::round); }
-    realvec_t rsqrt() const { return sqrt().rcp(); }
-    boolvec_t signbit() const { return mapb(vml_std::signbit); }
-    realvec_t sin() const { return map(vml_std::sin); }
-    realvec_t sinh() const { return map(vml_std::sinh); }
-    realvec_t sqrt() const { return map(vml_std::sqrt); }
-    realvec_t tan() const { return map(vml_std::tan); }
-    realvec_t tanh() const { return map(vml_std::tanh); }
-    realvec_t trunc() const { return map(vml_std::trunc); }
-  };
-  
-  
-  
-  // boolpseudovec definitions
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::intvec_t boolpseudovec<T,N>::as_int() const
-  {
-    return convert_int();
-  }
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::intvec_t boolpseudovec<T,N>::convert_int() const
-  {
-    intvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d];
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::boolvec_t
-  boolpseudovec<T,N>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    boolvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
-    return res;
-  }
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::intvec_t
-  boolpseudovec<T,N>::ifthen(intvec_t x, intvec_t y) const
-  {
-    intvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+  boolvec_t isfinite() const { return mapb(vml_std::isfinite); } // classification predicates go through mapb() (helper defined outside this excerpt; presumably the boolean-result counterpart of map())
+  boolvec_t isinf() const { return mapb(vml_std::isinf); }
+  boolvec_t isnan() const { return mapb(vml_std::isnan); }
+  boolvec_t isnormal() const { return mapb(vml_std::isnormal); }
+  realvec_t ldexp(int_t n) const { // applies vml_std::ldexp with the same exponent n to every element
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = vml_std::ldexp(v[d], n);
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::realvec_t
-  boolpseudovec<T,N>::ifthen(realvec_t x, realvec_t y) const
-  {
+  realvec_t ldexp(intvec_t n) const { // applies vml_std::ldexp with a separate exponent n.v[d] for each element
     realvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+    for (int d = 0; d < size; ++d)
+      res.v[d] = vml_std::ldexp(v[d], n.v[d]);
     return res;
   }
-  
-  
-  
-  // intpseudovec definitions
-  
-  template<typename T, int N>
-  inline
-  typename intpseudovec<T,N>::realvec_t intpseudovec<T,N>::as_float() const
-  {
+  realvec_t log() const { return map(vml_std::log); } // logarithm family: each hands a scalar vml_std routine to map() (helper defined outside this excerpt)
+  realvec_t log10() const { return map(vml_std::log10); }
+  realvec_t log1p() const { return map(vml_std::log1p); }
+  realvec_t log2() const { return map(vml_std::log2); }
+  intvec_t lrint() const { // rounds every element to an integer via vml_std::lrint / vml_std::llrint
-    realvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = FP::as_float(v[d]);
+    intvec_t res; // accumulate in an integer vector so the value returned matches the declared intvec_t
+    if (sizeof(int_t) <= sizeof(long)) { // long can hold every int_t value
+      for (int d = 0; d < size; ++d)
+        res.v[d] = vml_std::lrint(v[d]);
+    } else if (sizeof(int_t) <= sizeof(long long)) { // otherwise use the long long variant
+      for (int d = 0; d < size; ++d)
+        res.v[d] = vml_std::llrint(v[d]);
+    } else {
+      __builtin_unreachable(); // int_t wider than long long is not supported
+    }
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  intpseudovec<T,N> intpseudovec<T,N>::bitifthen(intvec_t x, intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  template<typename T, int N>
-  inline
-  typename intpseudovec<T,N>::realvec_t intpseudovec<T,N>::convert_float() const
-  {
+  realvec_t mad(realvec_t y, realvec_t z) const { // multiply-add entry point
+    return MF::vml_mad(*this, y, z); // delegates to MF::vml_mad with (*this, y, z) rather than a scalar routine
+  }
+  realvec_t nextafter(realvec_t y) const { return map(vml_std::nextafter, y); } // binary forms: y is handed to map() as the second operand
+  realvec_t pow(realvec_t y) const { return map(vml_std::pow, y); }
+  realvec_t rcp() const { // element-wise reciprocal
     realvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = FP::convert_float(v[d]);
+    for (int d = 0; d < size; ++d)
+      res.v[d] = R(1.0) / v[d]; // plain division; zero elements get no special handling here
     return res;
   }
-  
-  template<typename T, int N>
-  inline intpseudovec<T,N> intpseudovec<T,N>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  template<typename T, int N>
-  inline intpseudovec<T,N> intpseudovec<T,N>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-
-
-  // Wrappers
-  
-  // boolpseudovec wrappers
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> as_int(boolpseudovec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> convert_int(boolpseudovec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline bool all(boolpseudovec<real_t, size> x) { return x.all(); }
-  
-  template<typename real_t, int size>
-  inline bool any(boolpseudovec<real_t, size> x) { return x.any(); }
-  
-  template<typename real_t, int size>
-  inline
-  boolpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
-                                     boolpseudovec<real_t, size> x,
-                                     boolpseudovec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
-                                    intpseudovec<real_t, size> x,
-                                    intpseudovec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
-                                     realpseudovec<real_t, size> x,
-                                     realpseudovec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  
-  
-  // intpseudovec wrappers
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> abs(intpseudovec<real_t, size> x)
-  {
-    return x.abs();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> as_bool(intpseudovec<real_t, size> x)
-  {
-    return x.as_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> as_float(intpseudovec<real_t, size> x)
-  {
-    return x.as_float();
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> bitifthen(intpseudovec<real_t, size> x,
-                                              intpseudovec<real_t, size> y,
-                                              intpseudovec<real_t, size> z)
-  {
-    return x.bitifthen(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> clz(intpseudovec<real_t, size> x)
-  {
-    return x.clz();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> convert_bool(intpseudovec<real_t, size> x)
-  {
-    return x.convert_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> convert_float(intpseudovec<real_t, size> x)
-  {
-    return x.convert_float();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isignbit(intpseudovec<real_t, size> x)
-  {
-    return x.isignbit();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x,
-                                 typename intpseudovec<real_t, size>::int_t n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x,
-                                        intpseudovec<real_t, size> n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> max(intpseudovec<real_t, size> x,
-                                        intpseudovec<real_t, size> y)
-  {
-    return x.max(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> min(intpseudovec<real_t, size> x,
-                                        intpseudovec<real_t, size> y)
-  {
-    return x.min(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> popcount(intpseudovec<real_t, size> x)
-  {
-    return x.popcount();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x,
-                                    typename
-                                    intpseudovec<real_t, size>::int_t n)
-  {
-    return x.rotate(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x,
-                                           intpseudovec<real_t, size> n)
-  {
-    return x.rotate(n);
-  }
-  
-  
-  
-  // realpseudovec wrappers
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size>
-  loada(real_t const* p,
-        realpseudovec<real_t, size> x,
-        typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.loada(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size>
-  loadu(real_t const* p,
-        realpseudovec<real_t, size> x,
-        typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size>
-  loadu(real_t const* p, size_t ioff,
-        realpseudovec<real_t, size> x,
-        typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, ioff, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realpseudovec<real_t, size> x, real_t* p)
-  {
-    return x.storea(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realpseudovec<real_t, size> x, real_t* p)
-  {
-    return x.storeu(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff)
-  {
-    return x.storeu(p, ioff);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realpseudovec<real_t, size> x, real_t* p,
-                     typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.storea(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realpseudovec<real_t, size> x, real_t* p,
-                     typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff,
-                     typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, ioff, m);
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> convert_int(realpseudovec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t maxval(realpseudovec<real_t, size> x)
-  {
-    return x.maxval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t minval(realpseudovec<real_t, size> x)
-  {
-    return x.minval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t prod(realpseudovec<real_t, size> x)
-  {
-    return x.prod();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t sum(realpseudovec<real_t, size> x)
-  {
-    return x.sum();
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> acos(realpseudovec<real_t, size> x)
-  {
-    return x.acos();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> acosh(realpseudovec<real_t, size> x)
-  {
-    return x.acosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> asin(realpseudovec<real_t, size> x)
-  {
-    return x.asin();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> asinh(realpseudovec<real_t, size> x)
-  {
-    return x.asinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> atan(realpseudovec<real_t, size> x)
-  {
-    return x.atan();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> atan2(realpseudovec<real_t, size> x,
-                                           realpseudovec<real_t, size> y)
-  {
-    return x.atan2(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> atanh(realpseudovec<real_t, size> x)
-  {
-    return x.atanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> cbrt(realpseudovec<real_t, size> x)
-  {
-    return x.cbrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> ceil(realpseudovec<real_t, size> x)
-  {
-    return x.ceil();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> copysign(realpseudovec<real_t, size> x,
-                                              realpseudovec<real_t, size> y)
-  {
-    return x.copysign(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> cos(realpseudovec<real_t, size> x)
-  {
-    return x.cos();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> cosh(realpseudovec<real_t, size> x)
-  {
-    return x.cosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> exp(realpseudovec<real_t, size> x)
-  {
-    return x.exp();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> exp10(realpseudovec<real_t, size> x)
-  {
-    return x.exp10();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> exp2(realpseudovec<real_t, size> x)
-  {
-    return x.exp2();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> expm1(realpseudovec<real_t, size> x)
-  {
-    return x.expm1();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fabs(realpseudovec<real_t, size> x)
-  {
-    return x.fabs();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> floor(realpseudovec<real_t, size> x)
-  {
-    return x.floor();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fdim(realpseudovec<real_t, size> x,
-                                          realpseudovec<real_t, size> y)
-  {
-    return x.fdim(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fma(realpseudovec<real_t, size> x,
-                                         realpseudovec<real_t, size> y,
-                                         realpseudovec<real_t, size> z)
-  {
-    return x.fma(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fmax(realpseudovec<real_t, size> x,
-                                          realpseudovec<real_t, size> y)
-  {
-    return x.fmax(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fmin(realpseudovec<real_t, size> x,
-                                          realpseudovec<real_t, size> y)
-  {
-    return x.fmin(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fmod(realpseudovec<real_t, size> x,
-                                          realpseudovec<real_t, size> y)
-  {
-    return x.fmod(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> frexp(realpseudovec<real_t, size> x,
-                                           intpseudovec<real_t, size>* r)
-  {
-    return x.frexp(r);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> hypot(realpseudovec<real_t, size> x,
-                                           realpseudovec<real_t, size> y)
-  {
-    return x.hypot(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> ilogb(realpseudovec<real_t, size> x)
-  {
-    return x.ilogb();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isfinite(realpseudovec<real_t, size> x)
-  {
-    return x.isfinite();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isinf(realpseudovec<real_t, size> x)
-  {
-    return x.isinf();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isnan(realpseudovec<real_t, size> x)
-  {
-    return x.isnan();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isnormal(realpseudovec<real_t, size> x)
-  {
-    return x.isnormal();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x,
-                                    typename intpseudovec<real_t, size>::int_t
-                                    n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x,
-                                    intpseudovec<real_t, size> n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> log(realpseudovec<real_t, size> x)
-  {
-    return x.log();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> log10(realpseudovec<real_t, size> x)
-  {
-    return x.log10();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> log1p(realpseudovec<real_t, size> x)
-  {
-    return x.log1p();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> log2(realpseudovec<real_t, size> x)
-  {
-    return x.log2();
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> lrint(realpseudovec<real_t, size> x)
-  {
-    return x.lrint();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> mad(realpseudovec<real_t, size> x,
-                                         realpseudovec<real_t, size> y,
-                                         realpseudovec<real_t, size> z)
-  {
-    return x.mad(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> nextafter(realpseudovec<real_t, size> x,
-                                               realpseudovec<real_t, size> y)
-  {
-    return x.nextafter(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> pow(realpseudovec<real_t, size> x,
-                                         realpseudovec<real_t, size> y)
-  {
-    return x.pow(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> rcp(realpseudovec<real_t, size> x)
-  {
-    return x.rcp();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> remainder(realpseudovec<real_t, size> x,
-                                               realpseudovec<real_t, size> y)
-  {
-    return x.remainder(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> rint(realpseudovec<real_t, size> x)
-  {
-    return x.rint();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> round(realpseudovec<real_t, size> x)
-  {
-    return x.round();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> rsqrt(realpseudovec<real_t, size> x)
-  {
-    return x.rsqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> signbit(realpseudovec<real_t, size> x)
-  {
-    return x.signbit();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> sin(realpseudovec<real_t, size> x)
-  {
-    return x.sin();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> sinh(realpseudovec<real_t, size> x)
-  {
-    return x.sinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> sqrt(realpseudovec<real_t, size> x)
-  {
-    return x.sqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> tan(realpseudovec<real_t, size> x)
-  {
-    return x.tan();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> tanh(realpseudovec<real_t, size> x)
-  {
-    return x.tanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> trunc(realpseudovec<real_t, size> x)
-  {
-    return x.trunc();
-  }
-  
-  
-  
+  realvec_t remainder(realvec_t y) const { return map(vml_std::remainder, y); } // binary form: y is handed to map() as the second operand
+  realvec_t rint() const { return map(vml_std::rint); }
+  realvec_t round() const { return map(vml_std::round); }
+  realvec_t rsqrt() const { return sqrt().rcp(); } // composed from sqrt() followed by rcp(), i.e. 1 / sqrt(v[d]) per element
+  boolvec_t signbit() const { return mapb(vml_std::signbit); } // boolean result, so it goes through mapb() instead of map()
+  realvec_t sin() const { return map(vml_std::sin); }
+  realvec_t sinh() const { return map(vml_std::sinh); }
+  realvec_t sqrt() const { return map(vml_std::sqrt); }
+  realvec_t tan() const { return map(vml_std::tan); }
+  realvec_t tanh() const { return map(vml_std::tanh); }
+  realvec_t trunc() const { return map(vml_std::trunc); }
+};
+
+// boolpseudovec definitions
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::intvec_t
+boolpseudovec<T, N>::as_int() const {
+  return convert_int(); // for boolean vectors as_int is implemented as convert_int
+}
+
+// Copies each boolean element into the matching element of an integer vector.
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::intvec_t
+boolpseudovec<T, N>::convert_int() const {
+  intvec_t ints;
+  for (int i = 0; i < size; ++i) ints.v[i] = v[i];
+  return ints;
+}
+
+// Element-wise select between boolean vectors: x.v[i] where v[i] is true, else y.v[i].
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::boolvec_t
+boolpseudovec<T, N>::ifthen(boolvec_t x, boolvec_t y) const {
+  boolvec_t picked;
+  for (int i = 0; i < size; ++i) picked.v[i] = v[i] ? x.v[i] : y.v[i];
+  return picked;
+}
+
+// Element-wise select between integer vectors: x.v[i] where v[i] is true, else y.v[i].
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::intvec_t
+boolpseudovec<T, N>::ifthen(intvec_t x, intvec_t y) const {
+  intvec_t picked;
+  for (int i = 0; i < size; ++i) picked.v[i] = v[i] ? x.v[i] : y.v[i];
+  return picked;
+}
+
+// Element-wise select between real vectors: x.v[i] where v[i] is true, else y.v[i].
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::realvec_t
+boolpseudovec<T, N>::ifthen(realvec_t x, realvec_t y) const {
+  realvec_t picked;
+  for (int i = 0; i < size; ++i) picked.v[i] = v[i] ? x.v[i] : y.v[i];
+  return picked;
+}
+
+// intpseudovec definitions
+
+// Applies FP::as_float to every integer element and returns the real vector.
+template <typename T, int N>
+inline typename intpseudovec<T, N>::realvec_t
+intpseudovec<T, N>::as_float() const {
+  realvec_t reals;
+  for (int i = 0; i < size; ++i) reals.v[i] = FP::as_float(v[i]);
+  return reals;
+}
+
+template <typename T, int N>
+inline intpseudovec<T, N> intpseudovec<T, N>::bitifthen(intvec_t x,
+                                                        intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y); // delegates to MF::vml_bitifthen with *this as the first argument
+}
+
+// Applies FP::convert_float to every integer element and returns the real vector.
+template <typename T, int N>
+inline typename intpseudovec<T, N>::realvec_t
+intpseudovec<T, N>::convert_float() const {
+  realvec_t reals;
+  for (int i = 0; i < size; ++i) reals.v[i] = FP::convert_float(v[i]);
+  return reals;
+}
+
+template <typename T, int N>
+inline intpseudovec<T, N> intpseudovec<T, N>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n); // scalar count n; delegates to MF::vml_rotate
+}
+
+template <typename T, int N>
+inline intpseudovec<T, N> intpseudovec<T, N>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n); // vector of counts n; delegates to MF::vml_rotate
+}
+
+// Wrappers
+
+// boolpseudovec wrappers
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> as_int(boolpseudovec<real_t, size> x) {
+  return x.as_int(); // free-function spelling of the member call
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> convert_int(boolpseudovec<real_t, size> x) {
+  return x.convert_int(); // free-function spelling of the member call
+}
+
+template <typename real_t, int size>
+inline bool all(boolpseudovec<real_t, size> x) {
+  return x.all(); // free-function spelling of the member call; yields a single bool
+}
+
+template <typename real_t, int size>
+inline bool any(boolpseudovec<real_t, size> x) {
+  return x.any(); // free-function spelling of the member call; yields a single bool
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
+                                          boolpseudovec<real_t, size> x,
+                                          boolpseudovec<real_t, size> y) {
+  return c.ifthen(x, y); // boolean-vector overload: c is the condition, x and y the two alternatives
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
+                                         intpseudovec<real_t, size> x,
+                                         intpseudovec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
+                                          realpseudovec<real_t, size> x,
+                                          realpseudovec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+// intpseudovec wrappers
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> abs(intpseudovec<real_t, size> x) {
+  return x.abs();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> as_bool(intpseudovec<real_t, size> x) {
+  return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> as_float(intpseudovec<real_t, size> x) {
+  return x.as_float();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> bitifthen(intpseudovec<real_t, size> x,
+                                            intpseudovec<real_t, size> y,
+                                            intpseudovec<real_t, size> z) {
+  return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> clz(intpseudovec<real_t, size> x) {
+  return x.clz();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> convert_bool(intpseudovec<real_t, size> x) {
+  return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> convert_float(intpseudovec<real_t, size> x) {
+  return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isignbit(intpseudovec<real_t, size> x) {
+  return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size>
+lsr(intpseudovec<real_t, size> x,
+    typename intpseudovec<real_t, size>::int_t n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x,
+                                      intpseudovec<real_t, size> n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> max(intpseudovec<real_t, size> x,
+                                      intpseudovec<real_t, size> y) {
+  return x.max(y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> min(intpseudovec<real_t, size> x,
+                                      intpseudovec<real_t, size> y) {
+  return x.min(y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> popcount(intpseudovec<real_t, size> x) {
+  return x.popcount();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size>
+rotate(intpseudovec<real_t, size> x,
+       typename intpseudovec<real_t, size>::int_t n) {
+  return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x,
+                                         intpseudovec<real_t, size> n) {
+  return x.rotate(n);
+}
+
+// realpseudovec wrappers
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+loada(real_t const *p, realpseudovec<real_t, size> x,
+      typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+loadu(real_t const *p, realpseudovec<real_t, size> x,
+      typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+loadu(real_t const *p, size_t ioff, realpseudovec<real_t, size> x,
+      typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realpseudovec<real_t, size> x, real_t *p) {
+  return x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p) {
+  return x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p, size_t ioff) {
+  return x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realpseudovec<real_t, size> x, real_t *p,
+                   typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p,
+                   typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p, size_t ioff,
+                   typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> convert_int(realpseudovec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline real_t maxval(realpseudovec<real_t, size> x) {
+  return x.maxval();
+}
+
+template <typename real_t, int size>
+inline real_t minval(realpseudovec<real_t, size> x) {
+  return x.minval();
+}
+
+template <typename real_t, int size>
+inline real_t prod(realpseudovec<real_t, size> x) {
+  return x.prod();
+}
+
+template <typename real_t, int size>
+inline real_t sum(realpseudovec<real_t, size> x) {
+  return x.sum();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> acos(realpseudovec<real_t, size> x) {
+  return x.acos();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> acosh(realpseudovec<real_t, size> x) {
+  return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> asin(realpseudovec<real_t, size> x) {
+  return x.asin();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> asinh(realpseudovec<real_t, size> x) {
+  return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> atan(realpseudovec<real_t, size> x) {
+  return x.atan();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> atan2(realpseudovec<real_t, size> x,
+                                         realpseudovec<real_t, size> y) {
+  return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> atanh(realpseudovec<real_t, size> x) {
+  return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> cbrt(realpseudovec<real_t, size> x) {
+  return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> ceil(realpseudovec<real_t, size> x) {
+  return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> copysign(realpseudovec<real_t, size> x,
+                                            realpseudovec<real_t, size> y) {
+  return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> cos(realpseudovec<real_t, size> x) {
+  return x.cos();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> cosh(realpseudovec<real_t, size> x) {
+  return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> exp(realpseudovec<real_t, size> x) {
+  return x.exp();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> exp10(realpseudovec<real_t, size> x) {
+  return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> exp2(realpseudovec<real_t, size> x) {
+  return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> expm1(realpseudovec<real_t, size> x) {
+  return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fabs(realpseudovec<real_t, size> x) {
+  return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> floor(realpseudovec<real_t, size> x) {
+  return x.floor();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fdim(realpseudovec<real_t, size> x,
+                                        realpseudovec<real_t, size> y) {
+  return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fma(realpseudovec<real_t, size> x,
+                                       realpseudovec<real_t, size> y,
+                                       realpseudovec<real_t, size> z) {
+  return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fmax(realpseudovec<real_t, size> x,
+                                        realpseudovec<real_t, size> y) {
+  return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fmin(realpseudovec<real_t, size> x,
+                                        realpseudovec<real_t, size> y) {
+  return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fmod(realpseudovec<real_t, size> x,
+                                        realpseudovec<real_t, size> y) {
+  return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> frexp(realpseudovec<real_t, size> x,
+                                         intpseudovec<real_t, size> *r) {
+  return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> hypot(realpseudovec<real_t, size> x,
+                                         realpseudovec<real_t, size> y) {
+  return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> ilogb(realpseudovec<real_t, size> x) {
+  return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isfinite(realpseudovec<real_t, size> x) {
+  return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isinf(realpseudovec<real_t, size> x) {
+  return x.isinf();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isnan(realpseudovec<real_t, size> x) {
+  return x.isnan();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isnormal(realpseudovec<real_t, size> x) {
+  return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+ldexp(realpseudovec<real_t, size> x,
+      typename intpseudovec<real_t, size>::int_t n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x,
+                                         intpseudovec<real_t, size> n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log(realpseudovec<real_t, size> x) {
+  return x.log();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log10(realpseudovec<real_t, size> x) {
+  return x.log10();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log1p(realpseudovec<real_t, size> x) {
+  return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log2(realpseudovec<real_t, size> x) {
+  return x.log2();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> lrint(realpseudovec<real_t, size> x) {
+  return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> mad(realpseudovec<real_t, size> x,
+                                       realpseudovec<real_t, size> y,
+                                       realpseudovec<real_t, size> z) {
+  return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> nextafter(realpseudovec<real_t, size> x,
+                                             realpseudovec<real_t, size> y) {
+  return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> pow(realpseudovec<real_t, size> x,
+                                       realpseudovec<real_t, size> y) {
+  return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> rcp(realpseudovec<real_t, size> x) {
+  return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> remainder(realpseudovec<real_t, size> x,
+                                             realpseudovec<real_t, size> y) {
+  return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> rint(realpseudovec<real_t, size> x) {
+  return x.rint();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> round(realpseudovec<real_t, size> x) {
+  return x.round();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> rsqrt(realpseudovec<real_t, size> x) {
+  return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> signbit(realpseudovec<real_t, size> x) {
+  return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> sin(realpseudovec<real_t, size> x) {
+  return x.sin();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> sinh(realpseudovec<real_t, size> x) {
+  return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> sqrt(realpseudovec<real_t, size> x) {
+  return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> tan(realpseudovec<real_t, size> x) {
+  return x.tan();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> tanh(realpseudovec<real_t, size> x) {
+  return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> trunc(realpseudovec<real_t, size> x) {
+  return x.trunc();
+}
+
 #ifndef VML_NO_IOSTREAM
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           boolpseudovec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           intpseudovec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           realpseudovec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         boolpseudovec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         intpseudovec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         realpseudovec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
 #endif
-  
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_PSEUDO_H
+#endif // #ifndef VEC_PSEUDO_H
diff --git a/lib/kernel/vecmathlib/vec_qpx_double4.h b/lib/kernel/vecmathlib/vec_qpx_double4.h
index 9fa6bd0..b88b0da 100644
--- a/lib/kernel/vecmathlib/vec_qpx_double4.h
+++ b/lib/kernel/vecmathlib/vec_qpx_double4.h
@@ -11,785 +11,662 @@
 
 // QPX intrinsics
 #ifdef __clang__
-#  include <qpxintrin.h>
+#include <qpxintrin.h>
 #else
-#  include <builtins.h>
+#include <builtins.h>
 #endif
 #include <mass_simd.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_4
-  template<> struct boolvec<double,4>;
-  template<> struct intvec<double,4>;
-  template<> struct realvec<double,4>;
-  
-  
-  
-  template<>
-  struct boolvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef vector4double bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // canonical true is +1.0, canonical false is -1.0
-    // >=0 is true, -0 is true, nan is false
-    static real_t from_bool(bool a) { return a ? +1.0 : -1.0; }
-    static bool to_bool(real_t a) { return a>=0.0; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(vec_splats(from_bool(a))) {}
-    boolvec(const bool* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(v[n]);
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return v[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vec_not(v); }
-    
-    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
-    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
-    boolvec operator==(boolvec x) const
-    {
-      return vec_logical(v, x.v, 0x9);
-    }
-    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-    
-    bool all() const
-    {
-      // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
-      boolvec x0123 = *this;
-      boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
-      boolvec y0022 = x0123 && x1032;
-      return y0022[0] && y0022[2];
-    }
-    bool any() const
-    {
-      // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
-      boolvec x0123 = *this;
-      boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
-      boolvec y0022 = x0123 || x1032;
-      return y0022[0] || y0022[2];
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef vector4double ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(const intvec& x): v(x.v) {}
-    // intvec& operator=(const intvec& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vec_splats(FP::as_float(a))) {}
-    intvec(const int_t* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota()
-    {
-      const int_t iota_[] = {0, 1, 2, 3};
-      return intvec(iota_);
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return FP::as_int(v[n]);
-    }
-    intvec& set_elt(int n, int_t a)
-    {
-      return v[n]=FP::as_float(a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return v; }
-    boolvec_t convert_bool() const { return *this != IV(I(0)); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, -(*this)[d]);
-      return r;
-    }
-    
-    intvec operator+(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] + x[d]);
-      return r;
-    }
-    intvec operator-(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] - x[d]);
-      return r;
-    }
-    
-    intvec& operator+=(intvec x) { return *this=*this+x; }
-    intvec& operator-=(intvec x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, ~(*this)[d]);
-      return r;
-    }
-    
-    intvec operator&(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] & x[d]);
-      return r;
-    }
-    intvec operator|(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] | x[d]);
-      return r;
-    }
-    intvec operator^(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] ^ x[d]);
-      return r;
-    }
-    
-    intvec& operator&=(intvec x) { return *this=*this&x; }
-    intvec& operator|=(intvec x) { return *this=*this|x; }
-    intvec& operator^=(intvec x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      intvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n));
-      return r;
-    }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n);
-      return r;
-    }
-    intvec operator<<(int_t n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n);
-      return r;
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n[d]));
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n[d]);
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n[d]);
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] == x[d]);
-      return r;
-    }
-    boolvec_t operator!=(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] != x[d]);
-      return r;
-    }
-    boolvec_t operator<(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] < x[d]);
-      return r;
-    }
-    boolvec_t operator<=(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] <= x[d]);
-      return r;
-    }
-    boolvec_t operator>(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] > x[d]);
-      return r;
-    }
-    boolvec_t operator>=(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >= x[d]);
-      return r;
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const;
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef vector4double vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static const char* name() { return "<QPX:4*double>"; }
-    void barrier() { __asm__("": "+v"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(const realvec& x): v(x.v) {}
-    // realvec& operator=(const realvec& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vec_splats(a)) {}
-    realvec(const real_t* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return v[n];
-    }
-    realvec& set_elt(int n, real_t a)
-    {
-      return v[n]=a, *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(const real_t* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vec_lda(0, (real_t*)p);
-    }
-    static realvec_t loadu(const real_t* p)
-    {
-      realvec_t v0 = vec_ld(0, (real_t*)p);
-      realvec_t v1 = vec_ld(31, (real_t*)p);
-      return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t*)p));
-    }
-    static realvec_t loadu(const real_t* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      // TODO: use load instruction with fixed offset
-      return loadu(p+ioff);
-    }
-    realvec_t loada(const real_t* p, mask_t m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(const real_t* p, mask_t m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(const real_t* p, std::ptrdiff_t ioff, mask_t m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      // TODO: use load instruction with fixed offset
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vec_sta(v, 0, p);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-      p[2] = (*this)[2];
-      p[3] = (*this)[3];
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, mask_t m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return v; }
-    intvec_t convert_int() const { return vec_ctidz(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return vec_neg(v); }
-    
-    realvec operator+(realvec x) const { return vec_add(v, x.v); }
-    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
-    realvec operator*(realvec x) const { return vec_mul(v, x.v); }
-    realvec operator/(realvec x) const
-    {
-      // return vec_swdiv_nochk(v, x.v);
-      return div_fastd4(v, x.v);
-    }
-    
-    realvec& operator+=(realvec x) { return *this=*this+x; }
-    realvec& operator-=(realvec x) { return *this=*this-x; }
-    realvec& operator*=(realvec x) { return *this=*this*x; }
-    realvec& operator/=(realvec x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-      //                      vml_std::fmax((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
-      realvec_t y0022 = x0123.fmax(x1032);
-      return vml_std::fmax(y0022[0], y0022[2]);
+template <> struct boolvec<double, 4>;
+template <> struct intvec<double, 4>;
+template <> struct realvec<double, 4>;
+
+template <> struct boolvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef vector4double bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // canonical true is +1.0, canonical false is -1.0
+  // >=0 is true, -0 is true, nan is false
+  static real_t from_bool(bool a) { return a ? +1.0 : -1.0; }
+  static bool to_bool(real_t a) { return a >= 0.0; }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(vec_splats(from_bool(a))) {}
+  boolvec(const bool *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return to_bool(v[n]); }
+  boolvec &set_elt(int n, bool a) { return v[n] = from_bool(a), *this; }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return vec_not(v); }
+
+  boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+  boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+  boolvec operator==(boolvec x) const { return vec_logical(v, x.v, 0x9); }
+  boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+
+  bool all() const {
+    // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+    boolvec x0123 = *this;
+    boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+    boolvec y0022 = x0123 && x1032;
+    return y0022[0] && y0022[2];
+  }
+  bool any() const {
+    // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+    boolvec x0123 = *this;
+    boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+    boolvec y0022 = x0123 || x1032;
+    return y0022[0] || y0022[2];
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef vector4double ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(const intvec& x): v(x.v) {}
+  // intvec& operator=(const intvec& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vec_splats(FP::as_float(a))) {}
+  intvec(const int_t *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() {
+    const int_t iota_[] = {0, 1, 2, 3};
+    return intvec(iota_);
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const { return FP::as_int(v[n]); }
+  intvec &set_elt(int n, int_t a) { return v[n] = FP::as_float(a), *this; }
+
+  // Vector casts do not change the bit battern
+  boolvec_t as_bool() const { return v; }
+  boolvec_t convert_bool() const { return *this != IV(I(0)); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, -(*this)[d]);
+    return r;
+  }
+
+  intvec operator+(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] + x[d]);
+    return r;
+  }
+  intvec operator-(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] - x[d]);
+    return r;
+  }
+
+  intvec &operator+=(intvec x) { return *this = *this + x; }
+  intvec &operator-=(intvec x) { return *this = *this - x; }
+
+  intvec operator~() const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, ~(*this)[d]);
+    return r;
+  }
+
+  intvec operator&(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] & x[d]);
+    return r;
+  }
+  intvec operator|(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] | x[d]);
+    return r;
+  }
+  intvec operator^(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] ^ x[d]);
+    return r;
+  }
+
+  intvec &operator&=(intvec x) { return *this = *this & x; }
+  intvec &operator|=(intvec x) { return *this = *this | x; }
+  intvec &operator^=(intvec x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const {
+    intvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, U((*this)[d]) >> U(n));
+    return r;
+  }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] >> n);
+    return r;
+  }
+  intvec operator<<(int_t n) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] << n);
+    return r;
+  }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, U((*this)[d]) >> U(n[d]));
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] >> n[d]);
+    return r;
+  }
+  intvec operator<<(intvec n) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] << n[d]);
+    return r;
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] == x[d]);
+    return r;
+  }
+  boolvec_t operator!=(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] != x[d]);
+    return r;
+  }
+  boolvec_t operator<(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] < x[d]);
+    return r;
+  }
+  boolvec_t operator<=(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] <= x[d]);
+    return r;
+  }
+  boolvec_t operator>(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] > x[d]);
+    return r;
+  }
+  boolvec_t operator>=(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] >= x[d]);
+    return r;
+  }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const;
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef real_t scalar_t;
+  typedef vector4double vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static const char *name() { return "<QPX:4*double>"; }
+  void barrier() { __asm__("" : "+v"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(const realvec& x): v(x.v) {}
+  // realvec& operator=(const realvec& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(vec_splats(a)) {}
+  realvec(const real_t *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const { return v[n]; }
+  realvec &set_elt(int n, real_t a) { return v[n] = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(const real_t *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return vec_lda(0, (real_t *)p);
+  }
+  static realvec_t loadu(const real_t *p) {
+    realvec_t v0 = vec_ld(0, (real_t *)p);
+    realvec_t v1 = vec_ld(31, (real_t *)p);
+    return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t *)p));
+  }
+  static realvec_t loadu(const real_t *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    // TODO: use load instruction with fixed offset
+    return loadu(p + ioff);
+  }
+  realvec_t loada(const real_t *p, mask_t m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    real_t minval() const
-    {
-      // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-      //                      vml_std::fmin((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
-      realvec_t y0022 = x0123.fmin(x1032);
-      return vml_std::fmin(y0022[0], y0022[2]);
+  }
+  realvec_t loadu(const real_t *p, mask_t m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    real_t prod() const
-    {
-      // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-      realvec_t x = vec_xmul(v, v);
-      return x[1] * x[3];
+  }
+  realvec_t loadu(const real_t *p, std::ptrdiff_t ioff, mask_t m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    // TODO: use load instruction with fixed offset
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    vec_sta(v, 0, p);
+  }
+  void storeu(real_t *p) const {
+    // Vector stores would require vector loads, which would need to
+    // be atomic
+    // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+    // for good ideas
+    p[0] = (*this)[0];
+    p[1] = (*this)[1];
+    p[2] = (*this)[2];
+    p[3] = (*this)[3];
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    real_t sum() const
-    {
-      // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-      realvec_t c1 = vec_logical(v, v, 0xf); // +1.0
-      realvec_t x = vec_xxmadd(v, c1, v);
-      return x[0] + x[2];
+  }
+  void storeu(real_t *p, mask_t m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    
-    
-    
-    boolvec_t operator==(realvec x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(realvec x) const { return ! (*this == x); }
-    boolvec_t operator<(realvec x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(realvec x) const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return v; }
+  intvec_t convert_int() const { return vec_ctidz(v); }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return vec_neg(v); }
+
+  realvec operator+(realvec x) const { return vec_add(v, x.v); }
+  realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+  realvec operator*(realvec x) const { return vec_mul(v, x.v); }
+  realvec operator/(realvec x) const {
+    // return vec_swdiv_nochk(v, x.v);
+    return div_fastd4(v, x.v);
+  }
+
+  realvec &operator+=(realvec x) { return *this = *this + x; }
+  realvec &operator-=(realvec x) { return *this = *this - x; }
+  realvec &operator*=(realvec x) { return *this = *this * x; }
+  realvec &operator/=(realvec x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+    //                      vml_std::fmax((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+    realvec_t y0022 = x0123.fmax(x1032);
+    return vml_std::fmax(y0022[0], y0022[2]);
+  }
+  real_t minval() const {
+    // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+    //                      vml_std::fmin((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+    realvec_t y0022 = x0123.fmin(x1032);
+    return vml_std::fmin(y0022[0], y0022[2]);
+  }
+  real_t prod() const {
+    // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    realvec_t x = vec_xmul(v, v);
+    return x[1] * x[3];
+  }
+  real_t sum() const {
+    // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+    realvec_t c1 = vec_logical(v, v, 0xf); // +1.0
+    realvec_t x = vec_xxmadd(v, c1, v);
+    return x[0] + x[2];
+  }
+
+  boolvec_t operator==(realvec x) const { return vec_cmpeq(v, x.v); }
+  boolvec_t operator!=(realvec x) const { return !(*this == x); }
+  boolvec_t operator<(realvec x) const { return vec_cmplt(v, x.v); }
+  boolvec_t operator<=(realvec x) const {
 #ifdef VML_HAVE_NAN
-      return *this < x || *this == x;
+    return *this < x || *this == x;
 #else
-      return ! (*this > x);
+    return !(*this > x);
 #endif
-    }
-    boolvec_t operator>(realvec x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(realvec x) const
-    {
+  }
+  boolvec_t operator>(realvec x) const { return vec_cmpgt(v, x.v); }
+  boolvec_t operator>=(realvec x) const {
 #ifdef VML_HAVE_NAN
-      return *this > x || *this == x;
+    return *this > x || *this == x;
 #else
-      return ! (*this < x);
+    return !(*this < x);
 #endif
-    }
-    
-    
-    
-    realvec acos() const { return acosd4(v); }
-    realvec acosh() const { return acoshd4(v); }
-    realvec asin() const { return asind4(v); }
-    realvec asinh() const { return asinhd4(v); }
-    realvec atan() const { return atand4(v); }
-    realvec atan2(realvec y) const { return atan2d4(v, y.v); }
-    realvec atanh() const { return atanhd4(v); }
-    realvec cbrt() const { return cbrtd4(v); }
-    realvec ceil() const { return vec_ceil(v); }
-    realvec copysign(realvec y) const { return vec_cpsgn(y.v, v); }
-    realvec cos() const { return cosd4(v); }
-    realvec cosh() const { return coshd4(v); }
-    realvec exp() const { return expd4(v); }
-    realvec exp10() const { return exp10d4(v); }
-    realvec exp2() const { return exp2d4(v); }
-    realvec expm1() const { return expm1d4(v); }
-    realvec fabs() const { return vec_abs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return vec_floor(v); }
-    realvec fma(realvec y, realvec z) const
-    {
-      return vec_madd(v, y.v, z.v);
-    }
-    realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); }
-    realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return hypotd4(v, y.v); }
-    intvec_t ilogb() const
-    {
-      // int_t ilogb_[] = {
-      //   ::ilogb((*this)[0]),
-      //   ::ilogb((*this)[1]),
-      //   ::ilogb((*this)[2]),
-      //   ::ilogb((*this)[3])
-      // };
-      // return intvec_t(ilogb_);
-      return MF::vml_ilogb(v);
-    }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+  }
+
+  realvec acos() const { return acosd4(v); }
+  realvec acosh() const { return acoshd4(v); }
+  realvec asin() const { return asind4(v); }
+  realvec asinh() const { return asinhd4(v); }
+  realvec atan() const { return atand4(v); }
+  realvec atan2(realvec y) const { return atan2d4(v, y.v); }
+  realvec atanh() const { return atanhd4(v); }
+  realvec cbrt() const { return cbrtd4(v); }
+  realvec ceil() const { return vec_ceil(v); }
+  realvec copysign(realvec y) const { return vec_cpsgn(y.v, v); }
+  realvec cos() const { return cosd4(v); }
+  realvec cosh() const { return coshd4(v); }
+  realvec exp() const { return expd4(v); }
+  realvec exp10() const { return exp10d4(v); }
+  realvec exp2() const { return exp2d4(v); }
+  realvec expm1() const { return expm1d4(v); }
+  realvec fabs() const { return vec_abs(v); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const { return vec_floor(v); }
+  realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+  realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); }
+  realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return hypotd4(v, y.v); }
+  intvec_t ilogb() const {
+    // int_t ilogb_[] = {
+    //   ::ilogb((*this)[0]),
+    //   ::ilogb((*this)[1]),
+    //   ::ilogb((*this)[2]),
+    //   ::ilogb((*this)[3])
+    // };
+    // return intvec_t(ilogb_);
+    return MF::vml_ilogb(v);
+  }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #ifdef VML_HAVE_NAN
-      return vec_tstnan(v, v);
+    return vec_tstnan(v, v);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); }
-    realvec ldexp(intvec_t n) const
-    {
-      real_t ldexp_[] = {
-        vml_std::ldexp((*this)[0], n[0]),
-        vml_std::ldexp((*this)[1], n[1]),
-        vml_std::ldexp((*this)[2], n[2]),
-        vml_std::ldexp((*this)[3], n[3])
-      };
-      return realvec_t(ldexp_);
-    }
-    realvec log() const { return logd4(v); }
-    realvec log10() const { return log10d4(v); }
-    realvec log1p() const { return log1pd4(v); }
-    realvec log2() const { return log2d4(v); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec nextafter(realvec y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec pow(realvec y) const { return powd4(v, y.v); }
-    realvec rcp() const { return recip_fastd4(v); }
-    realvec remainder(realvec y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec rint() const
-    {
-      return MF::vml_rint(*this);
-      // This is tempting, but seems too invasive
-      // #ifdef VML_HAVE_FP_CONTRACT
-      //       return MF::vml_rint(*this);
-      // #else
-      //       return vec_round(v);      // use round instead of rint
-      // #endif
-    }
-    realvec round() const { return vec_round(v); }
-    realvec rsqrt() const
-    {
-      realvec x = *this;
-      realvec r = vec_rsqrte(x.v); // this is only an approximation
-      // TODO: use fma
-      // two Newton iterations (see vml_rsqrt)
-      r += RV(0.5)*r * (RV(1.0) - x * r*r);
-      r += RV(0.5)*r * (RV(1.0) - x * r*r);
-      return r;
-    }
-    boolvec_t signbit() const
-    {
-      return !RV(1.0).copysign(*this).as_int().as_bool();
-    }
-    realvec sin() const { return sind4(v); }
-    realvec sinh() const { return sinhd4(v); }
-    realvec sqrt() const
-    {
-      // return vec_sqrtsw_nochk(v);
-      return *this * rsqrt();
-    }
-    realvec tan() const { return tand4(v); }
-    realvec tanh() const { return tanhd4(v); }
-    realvec trunc() const { return vec_trunc(v); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,4> boolvec<double,4>::as_int() const
-  {
-    return v;
-  }
-  
-  inline intvec<double,4> boolvec<double,4>::convert_int() const
-  {
-    return ifthen(IV(I(1)), IV(I(0)));
-  }
-  
-  inline
-  boolvec<double,4>
-  boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline
-  intvec<double,4>
-  boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<double,4>
-  boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline intvec<double,4> intvec<double,4>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::as_float() const
-  {
-    return v;
-  }
-  
-  inline intvec<double,4> intvec<double,4>::bitifthen(intvec_t x,
-                                                      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::convert_float() const
-  {
-    return vec_cfid(v);
-  }
-  
-  inline boolvec<double,4> intvec<double,4>::isignbit() const
-  {
-    return MF::vml_isignbit(*this);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); }
+  realvec ldexp(intvec_t n) const {
+    real_t ldexp_[] = {
+        vml_std::ldexp((*this)[0], n[0]), vml_std::ldexp((*this)[1], n[1]),
+        vml_std::ldexp((*this)[2], n[2]), vml_std::ldexp((*this)[3], n[3])};
+    return realvec_t(ldexp_);
+  }
+  realvec log() const { return logd4(v); }
+  realvec log10() const { return log10d4(v); }
+  realvec log1p() const { return log1pd4(v); }
+  realvec log2() const { return log2d4(v); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return powd4(v, y.v); }
+  realvec rcp() const { return recip_fastd4(v); }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const {
+    return MF::vml_rint(*this);
+    // This is tempting, but seems too invasive
+    // #ifdef VML_HAVE_FP_CONTRACT
+    //       return MF::vml_rint(*this);
+    // #else
+    //       return vec_round(v);      // use round instead of rint
+    // #endif
+  }
+  realvec round() const { return vec_round(v); }
+  realvec rsqrt() const {
+    realvec x = *this;
+    realvec r = vec_rsqrte(x.v); // this is only an approximation
+    // TODO: use fma
+    // two Newton iterations (see vml_rsqrt)
+    r += RV(0.5) * r * (RV(1.0) - x * r * r);
+    r += RV(0.5) * r * (RV(1.0) - x * r * r);
+    return r;
+  }
+  boolvec_t signbit() const {
+    return !RV(1.0).copysign(*this).as_int().as_bool();
+  }
+  realvec sin() const { return sind4(v); }
+  realvec sinh() const { return sinhd4(v); }
+  realvec sqrt() const {
+    // return vec_sqrtsw_nochk(v);
+    return *this * rsqrt();
+  }
+  realvec tan() const { return tand4(v); }
+  realvec tanh() const { return tanhd4(v); }
+  realvec trunc() const { return vec_trunc(v); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 4> boolvec<double, 4>::as_int() const { return v; }
+
+inline intvec<double, 4> boolvec<double, 4>::convert_int() const {
+  return ifthen(IV(I(1)), IV(I(0)));
+}
+
+inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+// intvec definitions
+
+inline intvec<double, 4> intvec<double, 4>::abs() const {
+  return MF::vml_abs(*this);
+}
+
+inline realvec<double, 4> intvec<double, 4>::as_float() const { return v; }
+
+inline intvec<double, 4> intvec<double, 4>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 4> intvec<double, 4>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<double, 4> intvec<double, 4>::convert_float() const {
+  return vec_cfid(v);
+}
+
+inline boolvec<double, 4> intvec<double, 4>::isignbit() const {
+  return MF::vml_isignbit(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_QPX_DOUBLE4_H
+#endif // #ifndef VEC_QPX_DOUBLE4_H
diff --git a/lib/kernel/vecmathlib/vec_sse_double1.h b/lib/kernel/vecmathlib/vec_sse_double1.h
index 5558356..d727de8 100644
--- a/lib/kernel/vecmathlib/vec_sse_double1.h
+++ b/lib/kernel/vecmathlib/vec_sse_double1.h
@@ -12,589 +12,493 @@
 
 // SSE2 intrinsics
 #include <emmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
 #endif
-#ifdef __SSE4_1__               // Intel's SSE 4.1
-#  include <smmintrin.h>
+#ifdef __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
 #endif
-#ifdef __SSE4A__                // AMD's SSE 4a
-#  include <ammintrin.h>
+#ifdef __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
 #endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_1
-  template<> struct boolvec<double,1>;
-  template<> struct intvec<double,1>;
-  template<> struct realvec<double,1>;
-  
-  
-  
-  template<>
-  struct boolvec<double,1>: floatprops<double>
-  {
-    static int const size = 1;
-    typedef bool scalar_t;
-    typedef uint_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-    // true values are non-zero, false values are zero
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(a) {}
-    boolvec(bool const* as): v(as[0]) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return v; }
-    boolvec_t& set_elt(int n, bool a) { return v=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return !v; }
-    
-    boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
-    boolvec_t operator||(boolvec_t x) const { return v || x.v; }
-    boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
-    boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
-    
-    bool all() const { return *this; }
-    bool any() const { return *this; }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,1>: floatprops<double>
-  {
-    static int const size = 1;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(int_t a): v(a) {}
-    intvec(int_t const* as): v(as[0]) {}
-    static intvec_t iota() { return intvec(I(0)); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return v; }
-    intvec_t& set_elt(int n, int_t a) { return v=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return U(v); }
-    boolvec_t convert_bool() const { return bool(v); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec_t operator+() const { return +v; }
-    intvec_t operator-() const { return -v; }
-    
-    intvec_t operator+(intvec_t x) const { return v+x.v; }
-    intvec_t operator-(intvec_t x) const { return v-x.v; }
-    intvec_t operator*(intvec_t x) const { return v*x.v; }
-    intvec_t operator/(intvec_t x) const { return v/x.v; }
-    intvec_t operator%(intvec_t x) const { return v%x.v; }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    intvec_t& operator*=(intvec_t const& x) { return *this=*this*x; }
-    intvec_t& operator/=(intvec_t const& x) { return *this=*this/x; }
-    intvec_t& operator%=(intvec_t const& x) { return *this=*this%x; }
-    
-    
-    
-    intvec_t operator~() const { return ~v; }
-    
-    intvec_t operator&(intvec_t x) const { return v&x.v; }
-    intvec_t operator|(intvec_t x) const { return v|x.v; }
-    intvec_t operator^(intvec_t x) const { return v^x.v; }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return U(v) >> U(n); }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const { return v>>n; }
-    intvec_t operator<<(int_t n) const { return v<<n; }
-    
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const { return v>>n; }
-    intvec_t operator<<(intvec_t n) const { return v<<n; }
-    
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const { return __builtin_clzll(v); }
-    intvec_t popcount() const { return __builtin_popcountll(v); }
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const { return v==x.v; }
-    boolvec_t operator!=(intvec_t const& x) const { return v!=x.v; }
-    boolvec_t operator<(intvec_t const& x) const { return v<x.v; }
-    boolvec_t operator<=(intvec_t const& x) const { return v<=x.v; }
-    boolvec_t operator>(intvec_t const& x) const { return v>x.v; }
-    boolvec_t operator>=(intvec_t const& x) const { return v>=x.v; }
-    
-    intvec_t abs() const { return std::abs(v); }
-    boolvec_t isignbit() const { return v<0; }
-    intvec_t max(intvec_t x) const { return std::max(v, x.v); }
-    intvec_t min(intvec_t x) const { return std::min(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,1>: floatprops<double>
-  {
-    static int const size = 1;
-    typedef real_t scalar_t;
-    typedef double vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:1*double>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-  private:
-    static __m128d from_double(double a) { return _mm_set_sd(a); }
-    static double to_double(__m128d a) { return _mm_cvtsd_f64(a); }
-  public:
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(real_t a): v(a) {}
-    realvec(real_t const* as): v(as[0]) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return v; }
-    realvec_t& set_elt(int n, real_t a) { return v=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      *p = v;
-    }
-    void storeu(real_t* p) const
-    {
-      *p = v;
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff);
+template <> struct boolvec<double, 1>;
+template <> struct intvec<double, 1>;
+template <> struct realvec<double, 1>;
+
+template <> struct boolvec<double, 1> : floatprops<double> {
+  static int const size = 1;
+  typedef bool scalar_t;
+  typedef uint_t bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+  // true values are non-zero, false values are zero
+
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(a) {}
+  boolvec(bool const *as) : v(as[0]) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return v; }
+  boolvec_t &set_elt(int n, bool a) { return v = a, *this; }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return !v; }
+
+  boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
+  boolvec_t operator||(boolvec_t x) const { return v || x.v; }
+  boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
+  boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
+
+  bool all() const { return *this; }
+  bool any() const { return *this; }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 1> : floatprops<double> {
+  static int const size = 1;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(int_t a) : v(a) {}
+  intvec(int_t const *as) : v(as[0]) {}
+  static intvec_t iota() { return intvec(I(0)); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const { return v; }
+  intvec_t &set_elt(int n, int_t a) { return v = a, *this; }
+
+  boolvec_t as_bool() const { return U(v); }
+  boolvec_t convert_bool() const { return bool(v); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec_t operator+() const { return +v; }
+  intvec_t operator-() const { return -v; }
+
+  intvec_t operator+(intvec_t x) const { return v + x.v; }
+  intvec_t operator-(intvec_t x) const { return v - x.v; }
+  intvec_t operator*(intvec_t x) const { return v * x.v; }
+  intvec_t operator/(intvec_t x) const { return v / x.v; }
+  intvec_t operator%(intvec_t x) const { return v % x.v; }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+  intvec_t &operator*=(intvec_t const &x) { return *this = *this * x; }
+  intvec_t &operator/=(intvec_t const &x) { return *this = *this / x; }
+  intvec_t &operator%=(intvec_t const &x) { return *this = *this % x; }
+
+  intvec_t operator~() const { return ~v; }
+
+  intvec_t operator&(intvec_t x) const { return v & x.v; }
+  intvec_t operator|(intvec_t x) const { return v | x.v; }
+  intvec_t operator^(intvec_t x) const { return v ^ x.v; }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const { return U(v) >> U(n); }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const { return v >> n; }
+  intvec_t operator<<(int_t n) const { return v << n; }
+
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const { return v >> n; }
+  intvec_t operator<<(intvec_t n) const { return v << n; }
+
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const { return __builtin_clzll(v); }
+  intvec_t popcount() const { return __builtin_popcountll(v); }
+
+  boolvec_t operator==(intvec_t const &x) const { return v == x.v; }
+  boolvec_t operator!=(intvec_t const &x) const { return v != x.v; }
+  boolvec_t operator<(intvec_t const &x) const { return v < x.v; }
+  boolvec_t operator<=(intvec_t const &x) const { return v <= x.v; }
+  boolvec_t operator>(intvec_t const &x) const { return v > x.v; }
+  boolvec_t operator>=(intvec_t const &x) const { return v >= x.v; }
+
+  intvec_t abs() const { return std::abs(v); }
+  boolvec_t isignbit() const { return v < 0; }
+  intvec_t max(intvec_t x) const { return std::max(v, x.v); }
+  intvec_t min(intvec_t x) const { return std::min(v, x.v); }
+};
+
+template <> struct realvec<double, 1> : floatprops<double> {
+  static int const size = 1;
+  typedef real_t scalar_t;
+  typedef double vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<SSE2:1*double>"; }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+private:
+  static __m128d from_double(double a) { return _mm_set_sd(a); }
+  static double to_double(__m128d a) { return _mm_cvtsd_f64(a); }
+
+public:
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(real_t a) : v(a) {}
+  realvec(real_t const *as) : v(as[0]) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const { return v; }
+  realvec_t &set_elt(int n, real_t a) { return v = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return *p;
+  }
+  static realvec_t loadu(real_t const *p) { return *p; }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loada(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return *this;
     }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return *this;
     }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      }
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loada(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    *p = v;
+  }
+  void storeu(real_t *p) const { *p = v; }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storea(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff, m);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
     }
-    
-    
-    
-    intvec_t as_int() const { return floatprops::as_int(v); }
-    intvec_t convert_int() const {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storea(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return floatprops::as_int(v); }
+  intvec_t convert_int() const {
 #ifdef __x86_64__
-      return _mm_cvttsd_si64(_mm_set_sd(v));
+    return _mm_cvttsd_si64(_mm_set_sd(v));
 #else
-      return floatprops::convert_int(v);
+    return floatprops::convert_int(v);
 #endif
-    }
-    
-    
-    
-    realvec_t operator+() const { return +v; }
-    realvec_t operator-() const { return -v; }
-    
-    realvec_t operator+(realvec_t x) const { return v+x.v; }
-    realvec_t operator-(realvec_t x) const { return v-x.v; }
-    realvec_t operator*(realvec_t x) const { return v*x.v; }
-    realvec_t operator/(realvec_t x) const { return v/x.v; }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const { return *this; }
-    real_t minval() const { return *this; }
-    real_t prod() const { return *this; }
-    real_t sum() const { return *this; }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const { return v==x.v; }
-    boolvec_t operator!=(realvec_t const& x) const { return v!=x.v; }
-    boolvec_t operator<(realvec_t const& x) const { return v<x.v; }
-    boolvec_t operator<=(realvec_t const& x) const { return v<=x.v; }
-    boolvec_t operator>(realvec_t const& x) const { return v>x.v; }
-    boolvec_t operator>=(realvec_t const& x) const { return v>=x.v; }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const
-    {
+  }
+
+  realvec_t operator+() const { return +v; }
+  realvec_t operator-() const { return -v; }
+
+  realvec_t operator+(realvec_t x) const { return v + x.v; }
+  realvec_t operator-(realvec_t x) const { return v - x.v; }
+  realvec_t operator*(realvec_t x) const { return v * x.v; }
+  realvec_t operator/(realvec_t x) const { return v / x.v; }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const { return *this; }
+  real_t minval() const { return *this; }
+  real_t prod() const { return *this; }
+  real_t sum() const { return *this; }
+
+  boolvec_t operator==(realvec_t const &x) const { return v == x.v; }
+  boolvec_t operator!=(realvec_t const &x) const { return v != x.v; }
+  boolvec_t operator<(realvec_t const &x) const { return v < x.v; }
+  boolvec_t operator<=(realvec_t const &x) const { return v <= x.v; }
+  boolvec_t operator>(realvec_t const &x) const { return v > x.v; }
+  boolvec_t operator>=(realvec_t const &x) const { return v >= x.v; }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const {
 #ifdef __SSE4_1__
-      return to_double(_mm_ceil_sd(from_double(v), from_double(v)));
+    return to_double(_mm_ceil_sd(from_double(v), from_double(v)));
 #else
-      return vml_std::ceil(v);
+    return vml_std::ceil(v);
 #endif
-    }
-    realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return vml_std::fabs(v); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const
-    {
+  }
+  realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return vml_std::fabs(v); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const {
 #ifdef __SSE4_1__
-      return to_double(_mm_floor_sd(from_double(v), from_double(v)));
+    return to_double(_mm_floor_sd(from_double(v), from_double(v)));
 #else
-      return vml_std::floor(v);
+    return vml_std::floor(v);
 #endif
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const
-    {
-      return to_double(_mm_max_sd(from_double(v), from_double(y.v)));
-    }
-    realvec_t fmin(realvec_t y) const
-    {
-      return to_double(_mm_min_sd(from_double(v), from_double(y.v)));
-    }
-    realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
-    realvec_t frexp(intvec_t* irp) const
-    {
-      int iri;
-      realvec_t r = vml_std::frexp(v, &iri);
-      int_t ir = iri;
-      if (isinf()) ir = std::numeric_limits<int_t>::max();
-      if (isnan()) ir = std::numeric_limits<int_t>::min();
-      irp->v = ir;
-      return r;
-    }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const
-    {
-      int_t r = vml_std::ilogb(v);
-      typedef std::numeric_limits<int_t> NL;
-      if (FP_ILOGB0 != NL::min() and v == R(0.0)) {
-        r = NL::min();
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const {
+    return to_double(_mm_max_sd(from_double(v), from_double(y.v)));
+  }
+  realvec_t fmin(realvec_t y) const {
+    return to_double(_mm_min_sd(from_double(v), from_double(y.v)));
+  }
+  realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
+  realvec_t frexp(intvec_t *irp) const {
+    int iri;
+    realvec_t r = vml_std::frexp(v, &iri);
+    int_t ir = iri;
+    if (isinf())
+      ir = std::numeric_limits<int_t>::max();
+    if (isnan())
+      ir = std::numeric_limits<int_t>::min();
+    irp->v = ir;
+    return r;
+  }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const {
+    int_t r = vml_std::ilogb(v);
+    typedef std::numeric_limits<int_t> NL;
+    if (FP_ILOGB0 != NL::min() and v == R(0.0)) {
+      r = NL::min();
 #if defined VML_HAVE_INF
-      } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
-        r = NL::max();
+    } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
+      r = NL::max();
 #endif
 #if defined VML_HAVE_NAN
-      } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v)) {
-        r = NL::min();
+    } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v)) {
+      r = NL::min();
 #endif
-      }
-      return r;
-    }
-    boolvec_t isfinite() const { return vml_std::isfinite(v); }
-    boolvec_t isinf() const { return vml_std::isinf(v); }
-    boolvec_t isnan() const
-    {
-      // This is wrong:
-      // return _mm_ucomineq_sd(from_double(v), from_double(v));
-      // This works:
-      // char r;
-      // __asm__("ucomisd %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
-      // return boolvec_t::scalar_t(r);
-      // This works as well:
-      return vml_std::isnan(v);
-    }
-    boolvec_t isnormal() const { return vml_std::isnormal(v); }
-    realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
-    realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return R(1.0)/v; }
-    realvec_t remainder(realvec_t y) const
-    {
-      return vml_std::remainder(v, y.v);
     }
-    realvec_t rint() const
-    {
+    return r;
+  }
+  boolvec_t isfinite() const { return vml_std::isfinite(v); }
+  boolvec_t isinf() const { return vml_std::isinf(v); }
+  boolvec_t isnan() const {
+    // This is wrong:
+    // return _mm_ucomineq_sd(from_double(v), from_double(v));
+    // This works:
+    // char r;
+    // __asm__("ucomisd %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
+    // return boolvec_t::scalar_t(r);
+    // This works as well:
+    return vml_std::isnan(v);
+  }
+  boolvec_t isnormal() const { return vml_std::isnormal(v); }
+  realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
+  realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return R(1.0) / v; }
+  realvec_t remainder(realvec_t y) const { return vml_std::remainder(v, y.v); }
+  realvec_t rint() const {
 #ifdef __SSE4_1__
-      return to_double(_mm_round_sd(from_double(v), from_double(v),
-                                    _MM_FROUND_TO_NEAREST_INT));
+    return to_double(_mm_round_sd(from_double(v), from_double(v),
+                                  _MM_FROUND_TO_NEAREST_INT));
 #else
-      return MF::vml_rint(*this);
+    return MF::vml_rint(*this);
 #endif
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return vml_std::signbit(v); }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const
-    {
-      return to_double(_mm_sqrt_sd(from_double(v), from_double(v)));
-    }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const
-    {
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return vml_std::signbit(v); }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const {
+    return to_double(_mm_sqrt_sd(from_double(v), from_double(v)));
+  }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const {
 #ifdef __SSE4_1__
-      return to_double(_mm_round_sd(from_double(v), from_double(v),
-                                    _MM_FROUND_TO_ZERO));
+    return to_double(
+        _mm_round_sd(from_double(v), from_double(v), _MM_FROUND_TO_ZERO));
 #else
-      return MF::vml_trunc(*this);
+    return MF::vml_trunc(*this);
 #endif
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,1> boolvec<double,1>::as_int() const
-  {
-    return I(v);
   }
-  
-  inline intvec<double,1> boolvec<double,1>::convert_int() const
-  {
-    return v;
-  }
-  
-  inline
-  boolvec<double,1> boolvec<double,1>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  inline
-  intvec<double,1> boolvec<double,1>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  inline
-  realvec<double,1> boolvec<double,1>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<double,1> intvec<double,1>::as_float() const
-  {
-    return FP::as_float(v);
-  }
-  
-  inline realvec<double,1> intvec<double,1>::convert_float() const
-  {
+};
+
+// boolvec definitions
+
+inline intvec<double, 1> boolvec<double, 1>::as_int() const { return I(v); }
+
+inline intvec<double, 1> boolvec<double, 1>::convert_int() const { return v; }
+
+inline boolvec<double, 1> boolvec<double, 1>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return v ? x : y;
+}
+
+inline intvec<double, 1> boolvec<double, 1>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return v ? x : y;
+}
+
+inline realvec<double, 1> boolvec<double, 1>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return v ? x : y;
+}
+
+// intvec definitions
+
+inline realvec<double, 1> intvec<double, 1>::as_float() const {
+  return FP::as_float(v);
+}
+
+inline realvec<double, 1> intvec<double, 1>::convert_float() const {
 #ifdef __x86_64__
-    return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v));
+  return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v));
 #else
-    return FP::convert_float(v);
+  return FP::convert_float(v);
 #endif
-  }
-  
-  inline intvec<double,1> intvec<double,1>::bitifthen(intvec_t x,
-                                                      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,1> intvec<double,1>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,1> intvec<double,1>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+}
+
+inline intvec<double, 1> intvec<double, 1>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 1> intvec<double, 1>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 1> intvec<double, 1>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_SSE_DOUBLE1_H
+#endif // #ifndef VEC_SSE_DOUBLE1_H
diff --git a/lib/kernel/vecmathlib/vec_sse_double2.h b/lib/kernel/vecmathlib/vec_sse_double2.h
index 11790c3..095f458 100644
--- a/lib/kernel/vecmathlib/vec_sse_double2.h
+++ b/lib/kernel/vecmathlib/vec_sse_double2.h
@@ -11,737 +11,600 @@
 
 // SSE2 intrinsics
 #include <emmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
 #endif
-#ifdef __SSE4_1__               // Intel's SSE 4.1
-#  include <smmintrin.h>
+#ifdef __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
 #endif
-#ifdef __SSE4A__                // AMD's SSE 4a
-#  include <ammintrin.h>
+#ifdef __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
 #endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_2
-  template<> struct boolvec<double,2>;
-  template<> struct intvec<double,2>;
-  template<> struct realvec<double,2>;
-  
-  
-  
-  template<>
-  struct boolvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef bool scalar_t;
-    typedef __m128d bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm_castsi128_pd(_mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec_t& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return _mm_xor_pd(boolvec(true), v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return _mm_and_pd(v, x.v); }
-    boolvec_t operator||(boolvec_t x) const { return _mm_or_pd(v, x.v); }
-    boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
-    boolvec_t operator!=(boolvec_t x) const { return _mm_xor_pd(v, x.v); }
-    
-    bool all() const
-    {
+template <> struct boolvec<double, 2>;
+template <> struct intvec<double, 2>;
+template <> struct realvec<double, 2>;
+
+template <> struct boolvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef bool scalar_t;
+  typedef __m128d bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {}
+  boolvec(bool const *as)
+      : v(_mm_castsi128_pd(
+            _mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec_t &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return _mm_xor_pd(boolvec(true), v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return _mm_and_pd(v, x.v); }
+  boolvec_t operator||(boolvec_t x) const { return _mm_or_pd(v, x.v); }
+  boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+  boolvec_t operator!=(boolvec_t x) const { return _mm_xor_pd(v, x.v); }
+
+  bool all() const {
 #if defined __AVX__
-      return ! (! *this).any();
+    return !(!*this).any();
 #else
-      return (*this)[0] && (*this)[1];
+    return (*this)[0] && (*this)[1];
 #endif
-    }
-    bool any() const
-    {
+  }
+  bool any() const {
 #if defined __AVX__
-      return ! bool(_mm_testz_pd(v, v));
+    return !bool(_mm_testz_pd(v, v));
 #else
-      return (*this)[0] || (*this)[1];
+    return (*this)[0] || (*this)[1];
 #endif
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef int_t scalar_t;
+  typedef __m128i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm_set1_epi64x(a)) {}
+  intvec(int_t const *as) : v(_mm_set_epi64x(as[1], as[0])) {}
+  static intvec_t iota() { return _mm_set_epi64x(1, 0); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return _mm_castsi128_pd(v); }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    // There is no intrinsic to compare to zero. Instead, we check
+    // whether x is positive and x-1 is negative.
+    intvec_t x = *this;
+    // We know that boolvec_t values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec_t operator+() const { return *this; }
+  intvec_t operator-() const { return IV(I(0)) - *this; }
+
+  intvec_t operator+(intvec_t x) const { return _mm_add_epi64(v, x.v); }
+  intvec_t operator-(intvec_t x) const { return _mm_sub_epi64(v, x.v); }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+  intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec_t operator&(intvec_t x) const {
+    return _mm_castpd_si128(
+        _mm_and_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v)));
+  }
+  intvec_t operator|(intvec_t x) const {
+    return _mm_castpd_si128(
+        _mm_or_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v)));
+  }
+  intvec_t operator^(intvec_t x) const {
+    return _mm_castpd_si128(
+        _mm_xor_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v)));
+  }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const { return _mm_srli_epi64(v, n); }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const {
+    // There is no _mm_srai_epi64. To emulate it, add the sign-bit
+    // bias (1 << (bits-1)) before shifting, and subtract the shifted
+    // bias (1 << (bits-1-n)) after shifting
+    intvec_t x = *this;
+    // Convert signed to unsigned
+    x += U(1) << (bits - 1);
+    // Shift
+    x = x.lsr(n);
+    // Undo conversion
+    x -= U(1) << (bits - 1 - n);
+    return x;
+  }
+  intvec_t operator<<(int_t n) const { return _mm_slli_epi64(v, n); }
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef int_t scalar_t;
-    typedef __m128i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm_set1_epi64x(a)) {}
-    intvec(int_t const* as): v(_mm_set_epi64x(as[1], as[0])) {}
-    static intvec_t iota() { return _mm_set_epi64x(1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm_castsi128_pd(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec_t x = *this;
-      // We know that boolvec_t values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec_t operator+() const { return *this; }
-    intvec_t operator-() const { return IV(I(0)) - *this; }
-    
-    intvec_t operator+(intvec_t x) const { return _mm_add_epi64(v, x.v); }
-    intvec_t operator-(intvec_t x) const { return _mm_sub_epi64(v, x.v); }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(v),
-                                         _mm_castsi128_pd(x.v)));
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(v),
-                                        _mm_castsi128_pd(x.v)));
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      return _mm_castpd_si128(_mm_xor_pd(_mm_castsi128_pd(v),
-                                         _mm_castsi128_pd(x.v)));
-    }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return _mm_srli_epi64(v, n); }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const
-    {
-      // There is no _mm_srai_epi64. To emulate it, add 0x80000000
-      // before shifting, and subtract the shifted 0x80000000 after
-      // shifting
-      intvec_t x = *this;
-      // Convert signed to unsiged
-      x += U(1) << (bits-1);
-      // Shift
-      x = x.lsr(n);
-      // Undo conversion
-      x -= U(1) << (bits-1-n);
-      return x;
-    }
-    intvec_t operator<<(int_t n) const { return _mm_slli_epi64(v, n); }
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef real_t scalar_t;
-    typedef __m128d vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:2*double>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm_set1_pd(a)) {}
-    realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm_load_pd(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm_loadu_pd(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
+    return r;
+  }
+  intvec_t operator<<(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm_store_pd(p, v);
+    return r;
+  }
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec_t const &x) const { return !(*this != x); }
+  boolvec_t operator!=(intvec_t const &x) const {
+    return (*this ^ x).convert_bool();
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+    // return (*this - x).as_bool();
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    void storeu(real_t* p) const
-    {
-      return _mm_storeu_pd(p, v);
+    return r;
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef real_t scalar_t;
+  typedef __m128d vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<SSE2:2*double>"; }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm_set1_pd(a)) {}
+  realvec(real_t const *as) : v(_mm_set_pd(as[1], as[0])) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm_load_pd(p);
+  }
+  static realvec_t loadu(real_t const *p) { return _mm_loadu_pd(p); }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm_store_pd(p, v);
+  }
+  void storeu(real_t *p) const { return _mm_storeu_pd(p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
 #if defined __AVX__
-        _mm_maskstore_pd(p, m.m.as_int(), v);
+      _mm_maskstore_pd(p, m.m.as_int(), v);
 #else
-        if      (m.m[0]) _mm_storel_pd(p  , v);
-        else if (m.m[1]) _mm_storeh_pd(p+1, v);
+      if (m.m[0])
+        _mm_storel_pd(p, v);
+      else if (m.m[1])
+        _mm_storeh_pd(p + 1, v);
 #endif
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if      (m.m[0]) _mm_storel_pd(p  , v);
-        else if (m.m[1]) _mm_storeh_pd(p+1, v);
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
     }
-    
-    
-    
-    intvec_t as_int() const { return _mm_castpd_si128(v); }
-    intvec_t convert_int() const
-    {
-      intvec_t r;
-      r.set_elt(0, floatprops::convert_int((*this)[0]));
-      r.set_elt(1, floatprops::convert_int((*this)[1]));
-      return r;
-    }
-    
-    
-    
-    realvec_t operator+() const { return *this; }
-    realvec_t operator-() const { return RV(0.0) - *this; }
-    
-    realvec_t operator+(realvec_t x) const { return _mm_add_pd(v, x.v); }
-    realvec_t operator-(realvec_t x) const { return _mm_sub_pd(v, x.v); }
-    realvec_t operator*(realvec_t x) const { return _mm_mul_pd(v, x.v); }
-    realvec_t operator/(realvec_t x) const { return _mm_div_pd(v, x.v); }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      return vml_std::fmax((*this)[0], (*this)[1]);
-    }
-    real_t minval() const
-    {
-      return vml_std::fmin((*this)[0], (*this)[1]);
-    }
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1];
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        _mm_storel_pd(p, v);
+      else if (m.m[1])
+        _mm_storeh_pd(p + 1, v);
     }
-    real_t sum() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return _mm_castpd_si128(v); }
+  intvec_t convert_int() const {
+    intvec_t r;
+    r.set_elt(0, floatprops::convert_int((*this)[0]));
+    r.set_elt(1, floatprops::convert_int((*this)[1]));
+    return r;
+  }
+
+  realvec_t operator+() const { return *this; }
+  realvec_t operator-() const { return RV(0.0) - *this; }
+
+  realvec_t operator+(realvec_t x) const { return _mm_add_pd(v, x.v); }
+  realvec_t operator-(realvec_t x) const { return _mm_sub_pd(v, x.v); }
+  realvec_t operator*(realvec_t x) const { return _mm_mul_pd(v, x.v); }
+  realvec_t operator/(realvec_t x) const { return _mm_div_pd(v, x.v); }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const { return vml_std::fmax((*this)[0], (*this)[1]); }
+  real_t minval() const { return vml_std::fmin((*this)[0], (*this)[1]); }
+  real_t prod() const { return (*this)[0] * (*this)[1]; }
+  real_t sum() const {
 #ifdef __SSE3__
-      return _mm_cvtsd_f64(_mm_hadd_pd(v, v));
+    return _mm_cvtsd_f64(_mm_hadd_pd(v, v));
 #else
-      return (*this)[0] + (*this)[1];
+    return (*this)[0] + (*this)[1];
 #endif
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      return _mm_cmpeq_pd(v, x.v);
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      return _mm_cmpneq_pd(v, x.v);
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      return _mm_cmplt_pd(v, x.v);
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      return _mm_cmple_pd(v, x.v);
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      return _mm_cmpgt_pd(v, x.v);
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      return _mm_cmpge_pd(v, x.v);
-    }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const
-    {
+  }
+
+  boolvec_t operator==(realvec_t const &x) const {
+    return _mm_cmpeq_pd(v, x.v);
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    return _mm_cmpneq_pd(v, x.v);
+  }
+  boolvec_t operator<(realvec_t const &x) const { return _mm_cmplt_pd(v, x.v); }
+  boolvec_t operator<=(realvec_t const &x) const {
+    return _mm_cmple_pd(v, x.v);
+  }
+  boolvec_t operator>(realvec_t const &x) const { return _mm_cmpgt_pd(v, x.v); }
+  boolvec_t operator>=(realvec_t const &x) const {
+    return _mm_cmpge_pd(v, x.v);
+  }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const {
 #ifdef __SSE4_1__
-      return _mm_ceil_pd(v);
+    return _mm_ceil_pd(v);
 #else
-      return MF::vml_ceil(*this);
+    return MF::vml_ceil(*this);
 #endif
- }
-    realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const
-    {
+  }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const {
 #ifdef __SSE4_1__
-      return _mm_floor_pd(v);
+    return _mm_floor_pd(v);
 #else
-      return MF::vml_floor(*this);
+    return MF::vml_floor(*this);
 #endif
- }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return _mm_max_pd(v, y.v); }
-    realvec_t fmin(realvec_t y) const { return _mm_min_pd(v, y.v); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return _mm_max_pd(v, y.v); }
+  realvec_t fmin(realvec_t y) const { return _mm_min_pd(v, y.v); }
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #ifdef VML_HAVE_NAN
-      return _mm_cmpunord_pd(v, v);
+    return _mm_cmpunord_pd(v, v);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); }
-    realvec_t remainder(realvec_t y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec_t rint() const
-    {
+  }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const {
 #ifdef __SSE4_1__
-      return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
+    return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
 #else
-      return MF::vml_rint(*this);
+    return MF::vml_rint(*this);
 #endif
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return v; }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return _mm_sqrt_pd(v); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const
-    {
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return v; }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return _mm_sqrt_pd(v); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const {
 #ifdef __SSE4_1__
-      return _mm_round_pd(v, _MM_FROUND_TO_ZERO);
+    return _mm_round_pd(v, _MM_FROUND_TO_ZERO);
 #else
-      return MF::vml_trunc(*this);
+    return MF::vml_trunc(*this);
 #endif
- }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,2> boolvec<double,2>::as_int() const
-  {
-    return _mm_castpd_si128(v);
-  }
-  
-  inline intvec<double,2> boolvec<double,2>::convert_int() const
-  {
-    //return ifthen(v, U(1), U(0));
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<double,2> boolvec<double,2>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline
-  intvec<double,2> boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<double,2> boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const
-  {
+  }
+};
+
+// boolvec definitions
+
+inline intvec<double, 2> boolvec<double, 2>::as_int() const {
+  return _mm_castpd_si128(v);
+}
+
+inline intvec<double, 2> boolvec<double, 2>::convert_int() const {
+  // return ifthen(v, U(1), U(0));
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<double, 2> boolvec<double, 2>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<double, 2> boolvec<double, 2>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<double, 2> boolvec<double, 2>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
 #ifdef __SSE4_1__
-    return _mm_blendv_pd(y.v, x.v, v);
+  return _mm_blendv_pd(y.v, x.v, v);
 #else
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
+  return ((-convert_int() & x.as_int()) | (~ - convert_int() & y.as_int()))
+      .as_float();
 #endif
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<double,2> intvec<double,2>::as_float() const
-  {
-    return _mm_castsi128_pd(v);
-  }
-  
-  inline realvec<double,2> intvec<double,2>::convert_float() const
-  {
-    realvec_t r;
-    r.set_elt(0, floatprops::convert_float((*this)[0]));
-    r.set_elt(1, floatprops::convert_float((*this)[1]));
-    return r;
-  }
-  
-  inline intvec<double,2> intvec<double,2>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::bitifthen(intvec_t x,
-                                                      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+}
+
+// intvec definitions
+
+inline realvec<double, 2> intvec<double, 2>::as_float() const {
+  return _mm_castsi128_pd(v);
+}
+
+inline realvec<double, 2> intvec<double, 2>::convert_float() const {
+  realvec_t r;
+  r.set_elt(0, floatprops::convert_float((*this)[0]));
+  r.set_elt(1, floatprops::convert_float((*this)[1]));
+  return r;
+}
+
+inline intvec<double, 2> intvec<double, 2>::abs() const {
+  return MF::vml_abs(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 2> intvec<double, 2>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_SSE_DOUBLE2_H
+#endif // #ifndef VEC_SSE_DOUBLE2_H
diff --git a/lib/kernel/vecmathlib/vec_sse_float1.h b/lib/kernel/vecmathlib/vec_sse_float1.h
index 9cee891..a84a046 100644
--- a/lib/kernel/vecmathlib/vec_sse_float1.h
+++ b/lib/kernel/vecmathlib/vec_sse_float1.h
@@ -12,583 +12,489 @@
 
 // SSE2 intrinsics
 #include <emmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
 #endif
-#ifdef __SSE4_1__               // Intel's SSE 4.1
-#  include <smmintrin.h>
+#ifdef __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
 #endif
-#ifdef __SSE4A__                // AMD's SSE 4a
-#  include <ammintrin.h>
+#ifdef __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
 #endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_1
-  template<> struct boolvec<float,1>;
-  template<> struct intvec<float,1>;
-  template<> struct realvec<float,1>;
-  
-  
-  
-  template<>
-  struct boolvec<float,1>: floatprops<float>
-  {
-    static int const size = 1;
-    typedef bool scalar_t;
-    typedef uint_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-    // true values are non-zero, false values are zero
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(a) {}
-    boolvec(bool const* as): v(as[0]) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return v; }
-    boolvec_t& set_elt(int n, bool a) { return v=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return !v; }
-    
-    boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
-    boolvec_t operator||(boolvec_t x) const { return v || x.v; }
-    boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
-    boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
-    
-    bool all() const { return *this; }
-    bool any() const { return *this; }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,1>: floatprops<float>
-  {
-    static int const size = 1;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(int_t a): v(a) {}
-    intvec(int_t const* as): v(as[0]) {}
-    static intvec_t iota() { return intvec(I(0)); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return v; }
-    intvec_t& set_elt(int n, int_t a) { return v=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return U(v); }
-    boolvec_t convert_bool() const { return bool(v); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec_t operator+() const { return +v; }
-    intvec_t operator-() const { return -v; }
-    
-    intvec_t operator+(intvec_t x) const { return v+x.v; }
-    intvec_t operator-(intvec_t x) const { return v-x.v; }
-    intvec_t operator*(intvec_t x) const { return v*x.v; }
-    intvec_t operator/(intvec_t x) const { return v/x.v; }
-    intvec_t operator%(intvec_t x) const { return v%x.v; }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    intvec_t& operator*=(intvec_t const& x) { return *this=*this*x; }
-    intvec_t& operator/=(intvec_t const& x) { return *this=*this/x; }
-    intvec_t& operator%=(intvec_t const& x) { return *this=*this%x; }
-    
-    
-    
-    intvec_t operator~() const { return ~v; }
-    
-    intvec_t operator&(intvec_t x) const { return v&x.v; }
-    intvec_t operator|(intvec_t x) const { return v|x.v; }
-    intvec_t operator^(intvec_t x) const { return v^x.v; }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return U(v) >> U(n); }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const { return v>>n; }
-    intvec_t operator<<(int_t n) const { return v<<n; }
-    
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const { return v>>n; }
-    intvec_t operator<<(intvec_t n) const { return v<<n; }
-    
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const { return __builtin_clz(v); }
-    intvec_t popcount() const { return __builtin_popcount(v); }
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const { return v==x.v; }
-    boolvec_t operator!=(intvec_t const& x) const { return v!=x.v; }
-    boolvec_t operator<(intvec_t const& x) const { return v<x.v; }
-    boolvec_t operator<=(intvec_t const& x) const { return v<=x.v; }
-    boolvec_t operator>(intvec_t const& x) const { return v>x.v; }
-    boolvec_t operator>=(intvec_t const& x) const { return v>=x.v; }
-    
-    intvec_t abs() const { return std::abs(v); }
-    boolvec_t isignbit() const { return v<0; }
-    intvec_t max(intvec_t x) const { return std::max(v, x.v); }
-    intvec_t min(intvec_t x) const { return std::min(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,1>: floatprops<float>
-  {
-    static int const size = 1;
-    typedef real_t scalar_t;
-    typedef float vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:1*float>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-  private:
-    static __m128 from_float(float a) { return _mm_set_ss(a); }
-    static float to_float(__m128 a) { return _mm_cvtss_f32(a); }
-  public:
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(real_t a): v(a) {}
-    realvec(real_t const* as): v(as[0]) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return v; }
-    realvec_t& set_elt(int n, real_t a) { return v=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      *p = v;
-    }
-    void storeu(real_t* p) const
-    {
-      *p = v;
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      }
+template <> struct boolvec<float, 1>;
+template <> struct intvec<float, 1>;
+template <> struct realvec<float, 1>;
+
+template <> struct boolvec<float, 1> : floatprops<float> {
+  static int const size = 1;
+  typedef bool scalar_t;
+  typedef uint_t bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+  // true values are non-zero, false values are zero
+
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(a) {}
+  boolvec(bool const *as) : v(as[0]) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return v; }
+  boolvec_t &set_elt(int n, bool a) { return v = a, *this; }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return !v; }
+
+  boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
+  boolvec_t operator||(boolvec_t x) const { return v || x.v; }
+  boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
+  boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
+
+  bool all() const { return *this; }
+  bool any() const { return *this; }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 1> : floatprops<float> {
+  static int const size = 1;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(int_t a) : v(a) {}
+  intvec(int_t const *as) : v(as[0]) {}
+  static intvec_t iota() { return intvec(I(0)); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const { return v; }
+  intvec_t &set_elt(int n, int_t a) { return v = a, *this; }
+
+  boolvec_t as_bool() const { return U(v); }
+  boolvec_t convert_bool() const { return bool(v); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec_t operator+() const { return +v; }
+  intvec_t operator-() const { return -v; }
+
+  intvec_t operator+(intvec_t x) const { return v + x.v; }
+  intvec_t operator-(intvec_t x) const { return v - x.v; }
+  intvec_t operator*(intvec_t x) const { return v * x.v; }
+  intvec_t operator/(intvec_t x) const { return v / x.v; }
+  intvec_t operator%(intvec_t x) const { return v % x.v; }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+  intvec_t &operator*=(intvec_t const &x) { return *this = *this * x; }
+  intvec_t &operator/=(intvec_t const &x) { return *this = *this / x; }
+  intvec_t &operator%=(intvec_t const &x) { return *this = *this % x; }
+
+  intvec_t operator~() const { return ~v; }
+
+  intvec_t operator&(intvec_t x) const { return v & x.v; }
+  intvec_t operator|(intvec_t x) const { return v | x.v; }
+  intvec_t operator^(intvec_t x) const { return v ^ x.v; }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const { return U(v) >> U(n); }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const { return v >> n; }
+  intvec_t operator<<(int_t n) const { return v << n; }
+
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const { return v >> n; }
+  intvec_t operator<<(intvec_t n) const { return v << n; }
+
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const { return __builtin_clz(v); }
+  intvec_t popcount() const { return __builtin_popcount(v); }
+
+  boolvec_t operator==(intvec_t const &x) const { return v == x.v; }
+  boolvec_t operator!=(intvec_t const &x) const { return v != x.v; }
+  boolvec_t operator<(intvec_t const &x) const { return v < x.v; }
+  boolvec_t operator<=(intvec_t const &x) const { return v <= x.v; }
+  boolvec_t operator>(intvec_t const &x) const { return v > x.v; }
+  boolvec_t operator>=(intvec_t const &x) const { return v >= x.v; }
+
+  intvec_t abs() const { return std::abs(v); }
+  boolvec_t isignbit() const { return v < 0; }
+  intvec_t max(intvec_t x) const { return std::max(v, x.v); }
+  intvec_t min(intvec_t x) const { return std::min(v, x.v); }
+};
+
+template <> struct realvec<float, 1> : floatprops<float> {
+  static int const size = 1;
+  typedef real_t scalar_t;
+  typedef float vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<SSE2:1*float>"; }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+private:
+  static __m128 from_float(float a) { return _mm_set_ss(a); }
+  static float to_float(__m128 a) { return _mm_cvtss_f32(a); }
+
+public:
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(real_t a) : v(a) {}
+  realvec(real_t const *as) : v(as[0]) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const { return v; }
+  realvec_t &set_elt(int n, real_t a) { return v = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return *p;
+  }
+  static realvec_t loadu(real_t const *p) { return *p; }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loada(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return *this;
     }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return *this;
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff, m);
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loada(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    *p = v;
+  }
+  void storeu(real_t *p) const { *p = v; }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storea(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
     }
-    
-    
-    
-    intvec_t as_int() const { return floatprops::as_int(v); }
-    intvec_t convert_int() const {
-      // return floatprops::convert_int(v);
-      return _mm_cvttss_si32(_mm_set_ss(v));
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
     }
-    
-    
-    
-    realvec_t operator+() const { return +v; }
-    realvec_t operator-() const { return -v; }
-    
-    realvec_t operator+(realvec_t x) const { return v+x.v; }
-    realvec_t operator-(realvec_t x) const { return v-x.v; }
-    realvec_t operator*(realvec_t x) const { return v*x.v; }
-    realvec_t operator/(realvec_t x) const { return v/x.v; }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const { return *this; }
-    real_t minval() const { return *this; }
-    real_t prod() const { return *this; }
-    real_t sum() const { return *this; }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const { return v==x.v; }
-    boolvec_t operator!=(realvec_t const& x) const { return v!=x.v; }
-    boolvec_t operator<(realvec_t const& x) const { return v<x.v; }
-    boolvec_t operator<=(realvec_t const& x) const { return v<=x.v; }
-    boolvec_t operator>(realvec_t const& x) const { return v>x.v; }
-    boolvec_t operator>=(realvec_t const& x) const { return v>=x.v; }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storea(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return floatprops::as_int(v); }
+  intvec_t convert_int() const {
+    // return floatprops::convert_int(v);
+    return _mm_cvttss_si32(_mm_set_ss(v));
+  }
+
+  realvec_t operator+() const { return +v; }
+  realvec_t operator-() const { return -v; }
+
+  realvec_t operator+(realvec_t x) const { return v + x.v; }
+  realvec_t operator-(realvec_t x) const { return v - x.v; }
+  realvec_t operator*(realvec_t x) const { return v * x.v; }
+  realvec_t operator/(realvec_t x) const { return v / x.v; }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const { return *this; }
+  real_t minval() const { return *this; }
+  real_t prod() const { return *this; }
+  real_t sum() const { return *this; }
+
+  boolvec_t operator==(realvec_t const &x) const { return v == x.v; }
+  boolvec_t operator!=(realvec_t const &x) const { return v != x.v; }
+  boolvec_t operator<(realvec_t const &x) const { return v < x.v; }
+  boolvec_t operator<=(realvec_t const &x) const { return v <= x.v; }
+  boolvec_t operator>(realvec_t const &x) const { return v > x.v; }
+  boolvec_t operator>=(realvec_t const &x) const { return v >= x.v; }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const {
 #ifdef __SSE4_1__
-      return to_float(_mm_ceil_ss(from_float(v), from_float(v)));
+    return to_float(_mm_ceil_ss(from_float(v), from_float(v)));
 #else
-      return vml_std::ceil(v);
+    return vml_std::ceil(v);
 #endif
-    }
-    realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return vml_std::fabs(v); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const
-    {
+  }
+  realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return vml_std::fabs(v); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const {
 #ifdef __SSE4_1__
-      return to_float(_mm_floor_ss(from_float(v), from_float(v)));
+    return to_float(_mm_floor_ss(from_float(v), from_float(v)));
 #else
-      return vml_std::floor(v);
+    return vml_std::floor(v);
 #endif
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const
-    {
-      return to_float(_mm_max_ss(from_float(v), from_float(y.v)));
-    }
-    realvec_t fmin(realvec_t y) const
-    {
-      return to_float(_mm_min_ss(from_float(v), from_float(y.v)));
-    }
-    realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
-    realvec_t frexp(intvec_t* irp) const
-    {
-      int iri;
-      realvec_t r = vml_std::frexp(v, &iri);
-      int_t ir = iri;
-      if (isinf()) ir = std::numeric_limits<int_t>::max();
-      if (isnan()) ir = std::numeric_limits<int_t>::min();
-      irp->v = ir;
-      return r;
-    }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const
-    {
-      int_t r = vml_std::ilogb(v);
-      typedef std::numeric_limits<int_t> NL;
-      if (FP_ILOGB0 != NL::min() and *this == RV(R(0.0))) {
-        r = NL::min();
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const {
+    return to_float(_mm_max_ss(from_float(v), from_float(y.v)));
+  }
+  realvec_t fmin(realvec_t y) const {
+    return to_float(_mm_min_ss(from_float(v), from_float(y.v)));
+  }
+  realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
+  realvec_t frexp(intvec_t *irp) const {
+    int iri;
+    realvec_t r = vml_std::frexp(v, &iri);
+    int_t ir = iri;
+    if (isinf())
+      ir = std::numeric_limits<int_t>::max();
+    if (isnan())
+      ir = std::numeric_limits<int_t>::min();
+    irp->v = ir;
+    return r;
+  }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const {
+    int_t r = vml_std::ilogb(v);
+    typedef std::numeric_limits<int_t> NL;
+    if (FP_ILOGB0 != NL::min() and *this == RV(R(0.0))) {
+      r = NL::min();
 #if defined VML_HAVE_INF
-      } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
-        r = NL::max();
+    } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
+      r = NL::max();
 #endif
 #if defined VML_HAVE_NAN
-      } else if (FP_ILOGBNAN != NL::min() and isnan()) {
-        r = NL::min();
+    } else if (FP_ILOGBNAN != NL::min() and isnan()) {
+      r = NL::min();
 #endif
-      }
-      return r;
     }
-    boolvec_t isfinite() const { return vml_std::isfinite(v); }
-    boolvec_t isinf() const { return vml_std::isinf(v); }
-    boolvec_t isnan() const
-    {
+    return r;
+  }
+  boolvec_t isfinite() const { return vml_std::isfinite(v); }
+  boolvec_t isinf() const { return vml_std::isinf(v); }
+  boolvec_t isnan() const {
 #if defined VML_HAVE_NAN
-      // This is wrong:
-      // return _mm_ucomineq_ss(from_float(v), from_float(v));
-      // This works:
-      // char r;
-      // __asm__("ucomiss %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
-      // return boolvec_t::scalar_t(r);
-      // This works as well:
-      return vml_std::isnan(v);
+    // This is wrong:
+    // return _mm_ucomineq_ss(from_float(v), from_float(v));
+    // This works:
+    // char r;
+    // __asm__("ucomiss %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
+    // return boolvec_t::scalar_t(r);
+    // This works as well:
+    return vml_std::isnan(v);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return vml_std::isnormal(v); }
-    realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
-    realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return R(1.0)/v; }
-    realvec_t remainder(realvec_t y) const
-    {
-      return vml_std::remainder(v, y.v);
-    }
-    realvec_t rint() const
-    {
+  }
+  boolvec_t isnormal() const { return vml_std::isnormal(v); }
+  realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
+  realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return R(1.0) / v; }
+  realvec_t remainder(realvec_t y) const { return vml_std::remainder(v, y.v); }
+  realvec_t rint() const {
 #ifdef __SSE4_1__
-      return to_float(_mm_round_ss(from_float(v), from_float(v),
-                                   _MM_FROUND_TO_NEAREST_INT));
+    return to_float(
+        _mm_round_ss(from_float(v), from_float(v), _MM_FROUND_TO_NEAREST_INT));
 #else
-      return MF::vml_rint(*this);
+    return MF::vml_rint(*this);
 #endif
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return vml_std::signbit(v); }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const
-    {
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return vml_std::signbit(v); }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const {
 #ifdef __SSE4_1__
-      return to_float(_mm_round_ss(from_float(v), from_float(v),
-                                   _MM_FROUND_TO_ZERO));
+    return to_float(
+        _mm_round_ss(from_float(v), from_float(v), _MM_FROUND_TO_ZERO));
 #else
-      return MF::vml_trunc(*this);
+    return MF::vml_trunc(*this);
 #endif
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,1> boolvec<float,1>::as_int() const
-  {
-    return I(v);
-  }
-  
-  inline intvec<float,1> boolvec<float,1>::convert_int() const
-  {
-    return v;
-  }
-  
-  inline
-  boolvec<float,1> boolvec<float,1>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  inline intvec<float,1> boolvec<float,1>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  inline
-  realvec<float,1> boolvec<float,1>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<float,1> intvec<float,1>::as_float() const
-  {
-    return FP::as_float(v);
-  }
-  
-  inline intvec<float,1> intvec<float,1>::bitifthen(intvec_t x,
-                                                    intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline realvec<float,1> intvec<float,1>::convert_float() const
-  {
-    // return FP::convert_float(v);
-    return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v));
   }
-  
-  inline intvec<float,1> intvec<float,1>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,1> intvec<float,1>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+};
+
+// boolvec definitions
+
+inline intvec<float, 1> boolvec<float, 1>::as_int() const { return I(v); }
+
+inline intvec<float, 1> boolvec<float, 1>::convert_int() const { return v; }
+
+inline boolvec<float, 1> boolvec<float, 1>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return v ? x : y;
+}
+
+inline intvec<float, 1> boolvec<float, 1>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return v ? x : y;
+}
+
+inline realvec<float, 1> boolvec<float, 1>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return v ? x : y;
+}
+
+// intvec definitions
+
+inline realvec<float, 1> intvec<float, 1>::as_float() const {
+  return FP::as_float(v);
+}
+
+inline intvec<float, 1> intvec<float, 1>::bitifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline realvec<float, 1> intvec<float, 1>::convert_float() const {
+  // return FP::convert_float(v);
+  return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v));
+}
+
+inline intvec<float, 1> intvec<float, 1>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 1> intvec<float, 1>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_SSE_FLOAT1_H
+#endif // #ifndef VEC_SSE_FLOAT1_H
diff --git a/lib/kernel/vecmathlib/vec_sse_float4.h b/lib/kernel/vecmathlib/vec_sse_float4.h
index 34ac64f..f8e8e80 100644
--- a/lib/kernel/vecmathlib/vec_sse_float4.h
+++ b/lib/kernel/vecmathlib/vec_sse_float4.h
@@ -11,766 +11,642 @@
 
 // SSE2 intrinsics
 #include <xmmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
 #endif
-#ifdef __SSSE3__                // Intel's SSSE 3
-#  include <tmmintrin.h>
+#ifdef __SSSE3__ // Intel's SSSE 3
+#include <tmmintrin.h>
 #endif
-#if defined __SSE4_1__          // Intel's SSE 4.1
-#  include <smmintrin.h>
+#if defined __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
 #endif
-#if defined __SSE4A__           // AMD's SSE 4a
-#  include <ammintrin.h>
+#if defined __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
 #endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_4
-  template<> struct boolvec<float,4>;
-  template<> struct intvec<float,4>;
-  template<> struct realvec<float,4>;
-  
-  
-  
-  template<>
-  struct boolvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef __m128 bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - int_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]),
-                                     from_bool(as[2]),
-                                     from_bool(as[1]),
-                                     from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec_t& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return _mm_xor_ps(boolvec(true), v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return _mm_and_ps(v, x.v); }
-    boolvec_t operator||(boolvec_t x) const { return _mm_or_ps(v, x.v); }
-    boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
-    boolvec_t operator!=(boolvec_t x) const { return _mm_xor_ps(v, x.v); }
-    
-    bool all() const
-    {
-      // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+template <> struct boolvec<float, 4>;
+template <> struct intvec<float, 4>;
+template <> struct realvec<float, 4>;
+
+template <> struct boolvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef __m128 bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -int_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {}
+  boolvec(bool const *as)
+      : v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]), from_bool(as[2]),
+                                         from_bool(as[1]), from_bool(as[0])))) {
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec_t &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return _mm_xor_ps(boolvec(true), v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return _mm_and_ps(v, x.v); }
+  boolvec_t operator||(boolvec_t x) const { return _mm_or_ps(v, x.v); }
+  boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+  boolvec_t operator!=(boolvec_t x) const { return _mm_xor_ps(v, x.v); }
+
+  bool all() const {
+// return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
 #if defined __AVX__
-      return ! (! *this).any();
+    return !(!*this).any();
 #else
-      boolvec_t x = *this;
-      x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1));
-      return x[0] && x[2];
+    boolvec_t x = *this;
+    x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1));
+    return x[0] && x[2];
 #endif
-    }
-    bool any() const
-    {
-      // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+  }
+  bool any() const {
+// return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
 #if defined __AVX__
-      return ! bool(_mm_testz_ps(v, v));
+    return !bool(_mm_testz_ps(v, v));
 #else
-      boolvec_t x = *this;
-      x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1));
-      return x[0] || x[2];
+    boolvec_t x = *this;
+    x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1));
+    return x[0] || x[2];
 #endif
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef __m128i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm_set1_epi32(a)) {}
+  intvec(int_t const *as) : v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {}
+  static intvec_t iota() { return _mm_set_epi32(3, 2, 1, 0); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return _mm_castsi128_ps(v); }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    return !IV(_mm_cmpeq_epi32(v, IV(0))).as_bool();
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec_t operator+() const { return *this; }
+  intvec_t operator-() const { return IV(0) - *this; }
+
+  intvec_t operator+(intvec_t x) const { return _mm_add_epi32(v, x.v); }
+  intvec_t operator-(intvec_t x) const { return _mm_sub_epi32(v, x.v); }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+  intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec_t operator&(intvec_t x) const {
+    return _mm_castps_si128(
+        _mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v)));
+  }
+  intvec_t operator|(intvec_t x) const {
+    return _mm_castps_si128(
+        _mm_or_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v)));
+  }
+  intvec_t operator^(intvec_t x) const {
+    return _mm_castps_si128(
+        _mm_xor_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v)));
+  }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const { return _mm_srli_epi32(v, n); }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const { return _mm_srai_epi32(v, n); }
+  intvec_t operator<<(int_t n) const { return _mm_slli_epi32(v, n); }
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef __m128i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm_set1_epi32(a)) {}
-    intvec(int_t const* as): v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {}
-    static intvec_t iota() { return _mm_set_epi32(3, 2, 1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm_castsi128_ps(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      return ! IV(_mm_cmpeq_epi32(v, IV(0))).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec_t operator+() const { return *this; }
-    intvec_t operator-() const { return IV(0) - *this; }
-    
-    intvec_t operator+(intvec_t x) const { return _mm_add_epi32(v, x.v); }
-    intvec_t operator-(intvec_t x) const { return _mm_sub_epi32(v, x.v); }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      return _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v),
-                                         _mm_castsi128_ps(x.v)));
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(v),
-                                        _mm_castsi128_ps(x.v)));
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(v),
-                                         _mm_castsi128_ps(x.v)));
-    }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return _mm_srli_epi32(v, n); }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const { return _mm_srai_epi32(v, n); }
-    intvec_t operator<<(int_t n) const { return _mm_slli_epi32(v, n); }
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef __m128 vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:4*float>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm_set1_ps(a)) {}
-    realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm_load_ps(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm_loadu_ps(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      if (ioff==0) return loada(p);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
+    return r;
+  }
+  intvec_t operator<<(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm_store_ps(p, v);
+    return r;
+  }
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec_t const &x) const { return !(*this != x); }
+  boolvec_t operator!=(intvec_t const &x) const {
+    return (*this ^ x).convert_bool();
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+    // return (*this - x).as_bool();
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    void storeu(real_t* p) const
-    {
-      return _mm_storeu_ps(p, v);
+    return r;
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef real_t scalar_t;
+  typedef __m128 vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<SSE2:4*float>"; }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm_set1_ps(a)) {}
+  realvec(real_t const *as) : v(_mm_set_ps(as[3], as[2], as[1], as[0])) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm_load_ps(p);
+  }
+  static realvec_t loadu(real_t const *p) { return _mm_loadu_ps(p); }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    if (ioff == 0)
+      return loada(p);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm_store_ps(p, v);
+  }
+  void storeu(real_t *p) const { return _mm_storeu_ps(p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
 #if defined __AVX__
-        _mm_maskstore_ps(p, m.m.as_int(), v);
+      _mm_maskstore_ps(p, m.m.as_int(), v);
 #else
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
 #endif
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return _mm_castps_si128(v); }
-    intvec_t convert_int() const { return _mm_cvttps_epi32(v); }
-    
-    
-    
-    realvec_t operator+() const { return *this; }
-    realvec_t operator-() const { return RV(0.0) - *this; }
-    
-    realvec_t operator+(realvec_t x) const { return _mm_add_ps(v, x.v); }
-    realvec_t operator-(realvec_t x) const { return _mm_sub_ps(v, x.v); }
-    realvec_t operator*(realvec_t x) const { return _mm_mul_ps(v, x.v); }
-    realvec_t operator/(realvec_t x) const { return _mm_div_ps(v, x.v); }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-      //                      vml_std::fmax((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
-      realvec_t y0022 = x0123.fmax(x1032);
-      return vml_std::fmax(y0022[0], y0022[2]);
-    }
-    real_t minval() const
-    {
-      // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-      //                      vml_std::fmin((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
-      realvec_t y0022 = x0123.fmin(x1032);
-      return vml_std::fmin(y0022[0], y0022[2]);
     }
-    real_t prod() const
-    {
-      // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
-      realvec_t y0022 = x0123 * x1032;
-      return y0022[0] * y0022[2];
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    real_t sum() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return _mm_castps_si128(v); }
+  intvec_t convert_int() const { return _mm_cvttps_epi32(v); }
+
+  realvec_t operator+() const { return *this; }
+  realvec_t operator-() const { return RV(0.0) - *this; }
+
+  realvec_t operator+(realvec_t x) const { return _mm_add_ps(v, x.v); }
+  realvec_t operator-(realvec_t x) const { return _mm_sub_ps(v, x.v); }
+  realvec_t operator*(realvec_t x) const { return _mm_mul_ps(v, x.v); }
+  realvec_t operator/(realvec_t x) const { return _mm_div_ps(v, x.v); }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+    //                      vml_std::fmax((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+    realvec_t y0022 = x0123.fmax(x1032);
+    return vml_std::fmax(y0022[0], y0022[2]);
+  }
+  real_t minval() const {
+    // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+    //                      vml_std::fmin((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+    realvec_t y0022 = x0123.fmin(x1032);
+    return vml_std::fmin(y0022[0], y0022[2]);
+  }
+  real_t prod() const {
+    // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+    realvec_t y0022 = x0123 * x1032;
+    return y0022[0] * y0022[2];
+  }
+  real_t sum() const {
 #ifdef __SSE3__
-      realvec_t x = *this;
-      x = _mm_hadd_ps(x.v, x.v);
-      x = _mm_hadd_ps(x.v, x.v);
-      return x[0];
+    realvec_t x = *this;
+    x = _mm_hadd_ps(x.v, x.v);
+    x = _mm_hadd_ps(x.v, x.v);
+    return x[0];
 #else
-      // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
-      realvec_t y0022 = x0123 + x1032;
-      return y0022[0] + y0022[2];
+    // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+    realvec_t y0022 = x0123 + x1032;
+    return y0022[0] + y0022[2];
 #endif
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      return _mm_cmpeq_ps(v, x.v);
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      return _mm_cmpneq_ps(v, x.v);
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      return _mm_cmplt_ps(v, x.v);
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      return _mm_cmple_ps(v, x.v);
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      return _mm_cmpgt_ps(v, x.v);
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      return _mm_cmpge_ps(v, x.v);
-    }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const
-    {
+  }
+
+  boolvec_t operator==(realvec_t const &x) const {
+    return _mm_cmpeq_ps(v, x.v);
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    return _mm_cmpneq_ps(v, x.v);
+  }
+  boolvec_t operator<(realvec_t const &x) const { return _mm_cmplt_ps(v, x.v); }
+  boolvec_t operator<=(realvec_t const &x) const {
+    return _mm_cmple_ps(v, x.v);
+  }
+  boolvec_t operator>(realvec_t const &x) const { return _mm_cmpgt_ps(v, x.v); }
+  boolvec_t operator>=(realvec_t const &x) const {
+    return _mm_cmpge_ps(v, x.v);
+  }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const {
 #ifdef __SSE4_1__
-      return _mm_ceil_ps(v);
+    return _mm_ceil_ps(v);
 #else
-      return MF::vml_ceil(*this);
+    return MF::vml_ceil(*this);
 #endif
-    }
-    realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const
-    {
+  }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const {
 #ifdef __SSE4_1__
-      return _mm_floor_ps(v);
+    return _mm_floor_ps(v);
 #else
-      return MF::vml_floor(*this);
+    return MF::vml_floor(*this);
 #endif
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return _mm_max_ps(v, y.v); }
-    realvec_t fmin(realvec_t y) const { return _mm_min_ps(v, y.v); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return _mm_max_ps(v, y.v); }
+  realvec_t fmin(realvec_t y) const { return _mm_min_ps(v, y.v); }
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #if defined VML_HAVE_NAN
-      return _mm_cmpunord_ps(v, v);
+    return _mm_cmpunord_ps(v, v);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const
-    {
-      realvec_t x = *this;
-      realvec_t r = _mm_rcp_ps(x); // this is only an approximation
-      r *= RV(2.0) - r*x;        // one Newton iteration (see vml_rcp)
-      return r;
-    }
-    realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
-    realvec_t rint() const
-    {
+  }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const {
+    realvec_t x = *this;
+    realvec_t r = _mm_rcp_ps(x); // this is only an approximation
+    r *= RV(2.0) - r * x;        // one Newton iteration (see vml_rcp)
+    return r;
+  }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const {
 #ifdef __SSE4_1__
-      return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
+    return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
 #else
-      return MF::vml_rint(*this);
+    return MF::vml_rint(*this);
 #endif
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const
-    {
-      realvec_t x = *this;
-      realvec_t r = _mm_rsqrt_ps(x);    // this is only an approximation
-      r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
-      return r;
-    }
-    boolvec_t signbit() const { return v; }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return _mm_sqrt_ps(v); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const
-    {
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const {
+    realvec_t x = *this;
+    realvec_t r = _mm_rsqrt_ps(x);      // this is only an approximation
+    r *= RV(1.5) - RV(0.5) * x * r * r; // one Newton iteration (see vml_rsqrt)
+    return r;
+  }
+  boolvec_t signbit() const { return v; }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return _mm_sqrt_ps(v); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const {
 #ifdef __SSE4_1__
-      return _mm_round_ps(v, _MM_FROUND_TO_ZERO);
+    return _mm_round_ps(v, _MM_FROUND_TO_ZERO);
 #else
-      return MF::vml_trunc(*this);
+    return MF::vml_trunc(*this);
 #endif
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,4> boolvec<float,4>::as_int() const
-  {
-    return _mm_castps_si128(v);
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::convert_int() const
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const
-  {
+  }
+};
+
+// boolvec definitions
+
+inline intvec<float, 4> boolvec<float, 4>::as_int() const {
+  return _mm_castps_si128(v);
+}
+
+inline intvec<float, 4> boolvec<float, 4>::convert_int() const {
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
 #ifdef __SSE4_1__
-    return _mm_blendv_ps(y.v, x.v, v);
+  return _mm_blendv_ps(y.v, x.v, v);
 #else
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
+  return ((-convert_int() & x.as_int()) | (~ - convert_int() & y.as_int()))
+      .as_float();
 #endif
-  }
+}
+
+// intvec definitions
 
-  
-  
-  // intvec definitions
-  
-  inline intvec<float,4> intvec<float,4>::abs() const
-  {
+inline intvec<float, 4> intvec<float, 4>::abs() const {
 #ifdef __SSSE3__
-    return _mm_abs_epi32(v);
+  return _mm_abs_epi32(v);
 #else
-    return MF::vml_abs(*this);
+  return MF::vml_abs(*this);
 #endif
-  }
-  
-  inline realvec<float,4> intvec<float,4>::as_float() const
-  {
-    return _mm_castsi128_ps(v);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::bitifthen(intvec_t x,
-                                                    intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<float,4> intvec<float,4>::convert_float() const
-  {
-    return _mm_cvtepi32_ps(v);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::max(intvec_t x) const
-  {
+}
+
+inline realvec<float, 4> intvec<float, 4>::as_float() const {
+  return _mm_castsi128_ps(v);
+}
+
+inline intvec<float, 4> intvec<float, 4>::bitifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<float, 4> intvec<float, 4>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<float, 4> intvec<float, 4>::convert_float() const {
+  return _mm_cvtepi32_ps(v);
+}
+
+inline intvec<float, 4> intvec<float, 4>::max(intvec_t x) const {
 #ifdef __SSE4_1__
-      return _mm_max_epi32(v, x.v);
+  return _mm_max_epi32(v, x.v);
 #else
-      return MF::vml_max(*this, v);
+  return MF::vml_max(*this, x);
 #endif
-  }
-  
-  inline intvec<float,4> intvec<float,4>::min(intvec_t x) const
-  {
+}
+
+inline intvec<float, 4> intvec<float, 4>::min(intvec_t x) const {
 #ifdef __SSE4_1__
-      return _mm_min_epi32(v, x.v);
+  return _mm_min_epi32(v, x.v);
 #else
-      return MF::vml_min(*this, v);
+  return MF::vml_min(*this, x);
 #endif
-  }
-  
-  inline intvec<float,4> intvec<float,4>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+}
+
+inline intvec<float, 4> intvec<float, 4>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_SSE_FLOAT4_H
+#endif // #ifndef VEC_SSE_FLOAT4_H
diff --git a/lib/kernel/vecmathlib/vec_test.h b/lib/kernel/vecmathlib/vec_test.h
index 46fc9d1..c27b75e 100644
--- a/lib/kernel/vecmathlib/vec_test.h
+++ b/lib/kernel/vecmathlib/vec_test.h
@@ -9,1474 +9,1280 @@
 
 #include <cmath>
 #ifndef VML_NO_IOSTREAM
-#  include <sstream>
+#include <sstream>
 #endif
 
+namespace vecmathlib {
 
+template <typename T, int N> struct booltestvec;
+template <typename T, int N> struct inttestvec;
+template <typename T, int N> struct realtestvec;
+
+template <typename T, int N> struct booltestvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef bool scalar_t;
+  typedef bool bvector_t[size];
+  static int const alignment = sizeof(bool);
+
+  typedef booltestvec boolvec_t;
+  typedef inttestvec<real_t, size> intvec_t;
+  typedef realtestvec<real_t, size> realvec_t;
+
+  // short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  booltestvec() {}
+  // can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // booltestvec(booltestvec const& x): v(x.v) {}
+  // booltestvec& operator=(booltestvec const& x) { return v=x.v, *this; }
+  // booltestvec(vector_t x): v(x) {}
+  booltestvec(bool a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  booltestvec(bool const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+
+  bool operator[](int n) const { return v[n]; }
+  boolvec_t &set_elt(int n, bool a) { return v[n] = a, *this; }
+
+  intvec_t as_int() const;      // defined after inttestvec
+  intvec_t convert_int() const; // defined after inttestvec
+
+  boolvec_t operator!() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = !v[d];
+    return res;
+  }
+
+  boolvec_t operator&&(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] && x.v[d];
+    return res;
+  }
+  boolvec_t operator||(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] || x.v[d];
+    return res;
+  }
+  boolvec_t operator==(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+
+  bool all() const {
+    bool res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = res && v[d];
+    return res;
+  }
+  bool any() const {
+    bool res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = res || v[d];
+    return res;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after inttestvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realtestvec
+};
+
+template <typename T, int N> struct inttestvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t[size];
+  static int const alignment = sizeof(int_t);
+
+  typedef booltestvec<real_t, size> boolvec_t;
+  typedef inttestvec intvec_t;
+  typedef realtestvec<real_t, size> realvec_t;
+
+  // short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  inttestvec() {}
+  // can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // inttestvec(inttestvec const& x): v(x.v) {}
+  // inttestvec& operator=(inttestvec const& x) { return v=x.v, *this; }
+  // inttestvec(vector_t x): v(x) {}
+  inttestvec(int_t a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  inttestvec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+  static intvec_t iota() {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = d;
+    return res;
+  }
+
+  int_t operator[](int n) const { return v[n]; }
+  intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; }
+
+  boolvec_t as_bool() const { return convert_bool(); }
+  boolvec_t convert_bool() const {
+    // result: convert_bool(0)=false, convert_bool(else)=true
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d];
+    return res;
+  }
+  realvec_t as_float() const;      // defined after realtestvec
+  realvec_t convert_float() const; // defined after realtestvec
+
+  intvec_t operator+() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = +v[d];
+    return res;
+  }
+  intvec_t operator-() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = -v[d];
+    return res;
+  }
+
+  intvec_t &operator+=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] += x.v[d];
+    return *this;
+  }
+  intvec_t &operator-=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] -= x.v[d];
+    return *this;
+  }
+  intvec_t &operator*=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] *= x.v[d];
+    return *this;
+  }
+  intvec_t &operator/=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] /= x.v[d];
+    return *this;
+  }
+  intvec_t &operator%=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] %= x.v[d];
+    return *this;
+  }
+
+  intvec_t operator+(intvec_t x) const {
+    intvec_t res = *this;
+    return res += x;
+  }
+  intvec_t operator-(intvec_t x) const {
+    intvec_t res = *this;
+    return res -= x;
+  }
+  intvec_t operator*(intvec_t x) const {
+    intvec_t res = *this;
+    return res *= x;
+  }
+  intvec_t operator/(intvec_t x) const {
+    intvec_t res = *this;
+    return res /= x;
+  }
+  intvec_t operator%(intvec_t x) const {
+    intvec_t res = *this;
+    return res %= x;
+  }
+
+  intvec_t operator~() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = ~v[d];
+    return res;
+  }
+
+  intvec_t &operator&=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] &= x.v[d];
+    return *this;
+  }
+  intvec_t &operator|=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] |= x.v[d];
+    return *this;
+  }
+  intvec_t &operator^=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] ^= x.v[d];
+    return *this;
+  }
+
+  intvec_t operator&(intvec_t x) const {
+    intvec_t res = *this;
+    return res &= x;
+  }
+  intvec_t operator|(intvec_t x) const {
+    intvec_t res = *this;
+    return res |= x;
+  }
+  intvec_t operator^(intvec_t x) const {
+    intvec_t res = *this;
+    return res ^= x;
+  }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const {
+    return MF::vml_bitifthen(*this, x, y);
+  }
+
+  intvec_t lsr(int_t n) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = I(U(v[d]) >> U(n));
+    return res;
+  }
+  intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
+  intvec_t &operator>>=(int_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] >>= n;
+    return *this;
+  }
+  intvec_t &operator<<=(int_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] <<= n;
+    return *this;
+  }
+  intvec_t operator>>(int_t n) const {
+    intvec_t res = *this;
+    return res >>= n;
+  }
+  intvec_t operator<<(int_t n) const {
+    intvec_t res = *this;
+    return res <<= n;
+  }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = I(U(v[d]) >> U(n.v[d]));
+    return res;
+  }
+  intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
+  intvec_t &operator>>=(intvec_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] >>= n.v[d];
+    return *this;
+  }
+  intvec_t &operator<<=(intvec_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] <<= n.v[d];
+    return *this;
+  }
+  intvec_t operator>>(intvec_t n) const {
+    intvec_t res = *this;
+    return res >>= n;
+  }
+  intvec_t operator<<(intvec_t n) const {
+    intvec_t res = *this;
+    return res <<= n;
+  }
+
+  intvec_t clz() const { return MF::vml_clz(*this); }
+  intvec_t popcount() const { return MF::vml_popcount(*this); }
+
+  boolvec_t operator==(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] < x.v[d];
+    return res;
+  }
+  boolvec_t operator<=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] <= x.v[d];
+    return res;
+  }
+  boolvec_t operator>(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] > x.v[d];
+    return res;
+  }
+  boolvec_t operator>=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] >= x.v[d];
+    return res;
+  }
+
+  intvec_t abs() const { return MF::vml_abs(*this); }
+  boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
+  intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
+  intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
+};
+
+template <typename T, int N> struct realtestvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef real_t scalar_t;
+  typedef real_t vector_t[size];
+  static int const alignment = sizeof(real_t);
 
-namespace vecmathlib {
-  
-  template<typename T, int N> struct booltestvec;
-  template<typename T, int N> struct inttestvec;
-  template<typename T, int N> struct realtestvec;
-  
-  
-  
-  template<typename T, int N>
-  struct booltestvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef bool scalar_t;
-    typedef bool bvector_t[size];
-    static int const alignment = sizeof(bool);
-    
-    typedef booltestvec boolvec_t;
-    typedef inttestvec<real_t, size> intvec_t;
-    typedef realtestvec<real_t, size> realvec_t;
-    
-    // short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    booltestvec() {}
-    // can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // booltestvec(booltestvec const& x): v(x.v) {}
-    // booltestvec& operator=(booltestvec const& x) { return v=x.v, *this; }
-    //booltestvec(vector_t x): v(x) {}
-    booltestvec(bool a) { for (int d=0; d<size; ++d) v[d]=a; }
-    booltestvec(bool const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    
-    bool operator[](int n) const { return v[n]; }
-    boolvec_t& set_elt(int n, bool a) { return v[n]=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after inttestvec
-    intvec_t convert_int() const; // defined after inttestvec
-    
-    
-    
-    boolvec_t operator!() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = !v[d];
-      return res;
-    }
-    
-    boolvec_t operator&&(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] && x.v[d];
-      return res;
-    }
-    boolvec_t operator||(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] || x.v[d];
-      return res;
-    }
-    boolvec_t operator==(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    
-    bool all() const
-    {
-      bool res = v[0];
-      for (int d=1; d<size; ++d) res = res && v[d];
-      return res;
-    }
-    bool any() const
-    {
-      bool res = v[0];
-      for (int d=1; d<size; ++d) res = res || v[d];
-      return res;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after inttestvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realtestvec
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct inttestvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t[size];
-    static int const alignment = sizeof(int_t);
-    
-    typedef booltestvec<real_t, size> boolvec_t;
-    typedef inttestvec intvec_t;
-    typedef realtestvec<real_t, size> realvec_t;
-    
-    // short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    inttestvec() {}
-    // can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // inttestvec(inttestvec const& x): v(x.v) {}
-    // inttestvec& operator=(inttestvec const& x) { return v=x.v, *this; }
-    //inttestvec(vector_t x): v(x) {}
-    inttestvec(int_t a) { for (int d=0; d<size; ++d) v[d]=a; }
-    inttestvec(int_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    static intvec_t iota()
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d]=d;
-      return res;
-    }
-    
-    int_t operator[](int n) const { return v[n]; }
-    intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return convert_bool(); }
-    boolvec_t convert_bool() const
-    {
-      // result: convert_bool(0)=false, convert_bool(else)=true
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d]=v[d];
-      return res;
-    }
-    realvec_t as_float() const;      // defined after realtestvec
-    realvec_t convert_float() const; // defined after realtestvec
-    
-    
-    
-    intvec_t operator+() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = + v[d];
-      return res;
-    }
-    intvec_t operator-() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = - v[d];
-      return res;
-    }
-    
-    intvec_t& operator+=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] += x.v[d];
-      return *this;
-    }
-    intvec_t& operator-=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] -= x.v[d];
-      return *this;
-    }
-    intvec_t& operator*=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] *= x.v[d];
-      return *this;
-    }
-    intvec_t& operator/=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] /= x.v[d];
-      return *this;
-    }
-    intvec_t& operator%=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] %= x.v[d];
-      return *this;
-    }
-    
-    intvec_t operator+(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res += x;
-    }
-    intvec_t operator-(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res -= x;
-    }
-    intvec_t operator*(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res *= x;
-    }
-    intvec_t operator/(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res /= x;
-    }
-    intvec_t operator%(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res %= x;
-    }
-    
-    
-    
-    intvec_t operator~() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = ~ v[d];
-      return res;
-    }
-    
-    intvec_t& operator&=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] &= x.v[d];
-      return *this;
-    }
-    intvec_t& operator|=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] |= x.v[d];
-      return *this;
-    }
-    intvec_t& operator^=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] ^= x.v[d];
-      return *this;
-    }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res &= x;
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res |= x;
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res ^= x;
-    }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const
-    {
-      return MF::vml_bitifthen(*this, x, y);
-    }
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n));
-      return res;
-    }
-    intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
-    intvec_t& operator>>=(int_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] >>= n;
-      return *this;
-    }
-    intvec_t& operator<<=(int_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] <<= n;
-      return *this;
-    }
-    intvec_t operator>>(int_t n) const
-    {
-      intvec_t res = *this;
-      return res >>= n;
-    }
-    intvec_t operator<<(int_t n) const
-    {
-      intvec_t res = *this;
-      return res <<= n;
-    }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n.v[d]));
-      return res;
-    }
-    intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
-    intvec_t& operator>>=(intvec_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] >>= n.v[d];
-      return *this;
-    }
-    intvec_t& operator<<=(intvec_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] <<= n.v[d];
-      return *this;
-    }
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t res = *this;
-      return res >>= n;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t res = *this;
-      return res <<= n;
-    }
-    
-    intvec_t clz() const { return MF::vml_clz(*this); }
-    intvec_t popcount() const { return MF::vml_popcount(*this); }
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
-      return res;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
-      return res;
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
-      return res;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
-      return res;
-    }
-    
-    intvec_t abs() const { return MF::vml_abs(*this); }
-    boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
-    intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
-    intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct realtestvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef real_t scalar_t;
-    typedef real_t vector_t[size];
-    static int const alignment = sizeof(real_t);
-    
 #ifndef VML_NO_IOSTREAM
-    static char const* name()
-    {
-      static std::string name_;
-      if (name_.empty()) {
-        std::stringstream buf;
-        buf << "<VML:" << N << "*" << FP::name() << ">";
-        name_ = buf.str();
-      }
-      return name_.c_str();
+  static char const *name() {
+    static std::string name_;
+    if (name_.empty()) {
+      std::stringstream buf;
+      buf << "<VML:" << N << "*" << FP::name() << ">";
+      name_ = buf.str();
     }
+    return name_.c_str();
+  }
 #endif
-    void barrier()
-    {
+  void barrier() {
 #if defined __GNUC__ && !defined __clang__ && !defined __ICC
-      // GCC crashes when +X is used as constraint
-#  if defined __SSE2__
-      for (int d=0; d<size; ++d) __asm__("": "+x"(v[d]));
-#  elif defined __PPC64__       // maybe also __PPC__
-      for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
-#  elif defined __arm__
-      for (int d=0; d<size; ++d) __asm__("": "+w"(v[d]));
-#  else
-#    error "Floating point barrier undefined on this architecture"
-#  endif
+// GCC crashes when +X is used as constraint
+#if defined __SSE2__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+x"(v[d]));
+#elif defined __PPC64__ // maybe also __PPC__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+f"(v[d]));
+#elif defined __arm__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+w"(v[d]));
+#else
+#error "Floating point barrier undefined on this architecture"
+#endif
 #elif defined __clang__
-      for (int d=0; d<size; ++d) __asm__("": "+X"(v[d]));
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+X"(v[d]));
 #elif defined __ICC
-      for (int d=0; d<size; ++d) {
-        real_t tmp = v[d];
-        __asm__("": "+X"(tmp));
-        v[d] = tmp;
-      }
+    for (int d = 0; d < size; ++d) {
+      real_t tmp = v[d];
+      __asm__("" : "+X"(tmp));
+      v[d] = tmp;
+    }
 #elif defined __IBMCPP__
-      for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+f"(v[d]));
 #else
-#  error "Floating point barrier undefined on this architecture"
+#error "Floating point barrier undefined on this architecture"
 #endif
-    }
-    
-    typedef booltestvec<real_t, size> boolvec_t;
-    typedef inttestvec<real_t, size> intvec_t;
-    typedef realtestvec realvec_t;
-    
-    // short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realtestvec() {}
-    // can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realtestvec(realtestvec const& x): v(x.v) {}
-    // realtestvec& operator=(realtestvec const& x) { return v=x.v, *this; }
-    //realtestvec(vector_t x): v(x) {}
-    realtestvec(real_t a) { for (int d=0; d<size; ++d) v[d]=a; }
-    realtestvec(real_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    
-    real_t operator[](int n) const { return v[n]; }
-    realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = p[d];
-      return res;
-    }
-    static realvec_t loadu(real_t const* p, size_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      return m.m.ifthen(loada(p), *this);
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      return m.m.ifthen(loadu(p), *this);
-    }
-    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
-    {
-      return m.m.ifthen(loadu(p, ioff), *this);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p);
-    }
-    void storeu(real_t* p) const
-    {
-      for (int d=0; d<size; ++d) p[d] = v[d];
-    }
-    void storeu(real_t* p, size_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p, m);
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d];
-    }
-    void storeu(real_t* p, size_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = FP::as_int(v[d]);
-      return res;
-    }
-    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
-    
-    
-    
-    realvec_t operator+() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = + v[d];
-      return res;
-    }
-    realvec_t operator-() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = - v[d];
-      return res;
-    }
-    
-    realvec_t& operator+=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] += x.v[d];
-      return *this;
-    }
-    realvec_t& operator-=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] -= x.v[d];
-      return *this;
-    }
-    realvec_t& operator*=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] *= x.v[d];
-      return *this;
-    }
-    realvec_t& operator/=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] /= x.v[d];
-      return *this;
-    }
-    
-    realvec_t operator+(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res += x;
-    }
-    realvec_t operator-(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res -= x;
-    }
-    realvec_t operator*(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res *= x;
-    }
-    realvec_t operator/(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res /= x;
-    }
-    
-    real_t maxval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res = vml_std::fmax(res, v[d]);
-      return res;
-    }
-    real_t minval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res = vml_std::fmin(res, v[d]);
-      return res;
-    }
-    real_t prod() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res *= v[d];
-      return res;
-    }
-    real_t sum() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res += v[d];
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
-      return res;
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
-      return res;
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
-      return res;
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
-      return res;
-    }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const { return MF::vml_ceil(*this); }
-    realvec_t copysign(realvec_t y) const
-    {
-      return MF::vml_copysign(*this, y);
-    }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const { return MF::vml_floor(*this); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return MF::vml_fmax(*this, y); }
-    realvec_t fmin(realvec_t y) const { return MF::vml_fmin(*this, y); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    intvec_t lrint() const { return MF::vml_lrint(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return MF::vml_rcp(*this); }
-    realvec_t remainder(realvec_t y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec_t rint() const { return MF::vml_rint(*this); }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return MF::vml_sqrt(*this); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const { return MF::vml_trunc(*this); }
-  };
-  
-  
-  
-  // booltestvec definitions
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::intvec_t
-  booltestvec<T,N>::as_int() const
-  {
-    return convert_int();
-  }
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::intvec_t
-  booltestvec<T,N>::convert_int() const
-  {
-    intvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d];
-    return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::boolvec_t
-  booltestvec<T,N>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    boolvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+
+  typedef booltestvec<real_t, size> boolvec_t;
+  typedef inttestvec<real_t, size> intvec_t;
+  typedef realtestvec realvec_t;
+
+  // short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realtestvec() {}
+  // can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realtestvec(realtestvec const& x): v(x.v) {}
+  // realtestvec& operator=(realtestvec const& x) { return v=x.v, *this; }
+  // realtestvec(vector_t x): v(x) {}
+  realtestvec(real_t a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  realtestvec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+
+  real_t operator[](int n) const { return v[n]; }
+  realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loadu(p);
+  }
+  static realvec_t loadu(real_t const *p) {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = p[d];
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::intvec_t
-  booltestvec<T,N>::ifthen(intvec_t x, intvec_t y) const
-  {
+  static realvec_t loadu(real_t const *p, size_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    return m.m.ifthen(loada(p), *this);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    return m.m.ifthen(loadu(p), *this);
+  }
+  realvec_t loadu(real_t const *p, size_t ioff, mask_t const &m) const {
+    return m.m.ifthen(loadu(p, ioff), *this);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p);
+  }
+  void storeu(real_t *p) const {
+    for (int d = 0; d < size; ++d)
+      p[d] = v[d];
+  }
+  void storeu(real_t *p, size_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p, m);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    for (int d = 0; d < size; ++d)
+      if (m.m[d])
+        p[d] = v[d];
+  }
+  void storeu(real_t *p, size_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const {
     intvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+    for (int d = 0; d < size; ++d)
+      res.v[d] = FP::as_int(v[d]);
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::realvec_t
-  booltestvec<T,N>::ifthen(realvec_t x, realvec_t y) const
-  {
+  intvec_t convert_int() const { return MF::vml_convert_int(*this); }
+
+  realvec_t operator+() const {
     realvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+    for (int d = 0; d < size; ++d)
+      res.v[d] = +v[d];
     return res;
   }
-
-  
-  
-  // inttestvec definitions
-  
-  template<typename T, int N>
-  inline
-  typename inttestvec<T,N>::realvec_t
-  inttestvec<T,N>::as_float() const
-  {
+  realvec_t operator-() const {
     realvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = FP::as_float(v[d]);
+    for (int d = 0; d < size; ++d)
+      res.v[d] = -v[d];
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename inttestvec<T,N>::realvec_t
-  inttestvec<T,N>::convert_float() const
-  {
-    return MF::vml_convert_float(*this);
-  }
-  
-
-
-  // Wrappers
-  
-  // booltestvec wrappers
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> as_int(booltestvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> convert_int(booltestvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline bool all(booltestvec<real_t, size> x) { return x.all(); }
-  
-  template<typename real_t, int size>
-  inline bool any(booltestvec<real_t, size> x) { return x.any(); }
-  
-  template<typename real_t, int size>
-  inline
-  booltestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
-                                   booltestvec<real_t, size> x,
-                                   booltestvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
-                                  inttestvec<real_t, size> x,
-                                  inttestvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
-                                   realtestvec<real_t, size> x,
-                                   realtestvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  
-  
-  // inttestvec wrappers
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> abs(inttestvec<real_t, size> x)
-  {
-    return x.abs();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> as_bool(inttestvec<real_t, size> x)
-  {
-    return x.as_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> as_float(inttestvec<real_t, size> x)
-  {
-    return x.as_float();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> bitifthen(inttestvec<real_t, size> x,
-                                     inttestvec<real_t, size> y,
-                                     inttestvec<real_t, size> z)
-  {
-    return x.bitifthen(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> clz(inttestvec<real_t, size> x)
-  {
-    return x.clz();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> convert_bool(inttestvec<real_t, size> x)
-  {
-    return x.convert_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> convert_float(inttestvec<real_t, size> x)
-  {
-    return x.convert_float();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isignbit(inttestvec<real_t, size> x)
-  {
-    return x.isignbit();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> lsr(inttestvec<real_t, size> x,
-                               typename inttestvec<real_t, size>::int_t n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> lsr(inttestvec<real_t, size> x,
-                               inttestvec<real_t, size> n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> max(inttestvec<real_t, size> x,
-                               inttestvec<real_t, size> y)
-  {
-    return x.max(y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> min(inttestvec<real_t, size> x,
-                               inttestvec<real_t, size> y)
-  {
-    return x.min(y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> popcount(inttestvec<real_t, size> x)
-  {
-    return x.popcount();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> rotate(inttestvec<real_t, size> x,
-                                  typename inttestvec<real_t, size>::int_t n)
-  {
-    return x.rotate(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> rotate(inttestvec<real_t, size> x,
-                                  inttestvec<real_t, size> n)
-  {
-    return x.rotate(n);
-  }
-  
-  
-  
-  // realtestvec wrappers
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size>
-  loada(real_t const* p,
-        realtestvec<real_t, size> x,
-        typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.loada(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size>
-  loadu(real_t const* p,
-        realtestvec<real_t, size> x,
-        typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size>
-  loadu(real_t const* p, size_t ioff,
-        realtestvec<real_t, size> x,
-        typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, ioff, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realtestvec<real_t, size> x, real_t* p)
-  {
-    return x.storea(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realtestvec<real_t, size> x, real_t* p)
-  {
-    return x.storeu(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realtestvec<real_t, size> x, real_t* p, size_t ioff)
-  {
-    return x.storeu(p, ioff);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realtestvec<real_t, size> x, real_t* p,
-                     typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.storea(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realtestvec<real_t, size> x, real_t* p,
-                     typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realtestvec<real_t, size> x, real_t* p, size_t ioff,
-                     typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, ioff, m);
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> as_int(realtestvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> convert_int(realtestvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t maxval(realtestvec<real_t, size> x)
-  {
-    return x.maxval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t minval(realtestvec<real_t, size> x)
-  {
-    return x.minval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t prod(realtestvec<real_t, size> x)
-  {
-    return x.prod();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t sum(realtestvec<real_t, size> x)
-  {
-    return x.sum();
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> acos(realtestvec<real_t, size> x)
-  {
-    return x.acos();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> acosh(realtestvec<real_t, size> x)
-  {
-    return x.acosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> asin(realtestvec<real_t, size> x)
-  {
-    return x.asin();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> asinh(realtestvec<real_t, size> x)
-  {
-    return x.asinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> atan(realtestvec<real_t, size> x)
-  {
-    return x.atan();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> atan2(realtestvec<real_t, size> x,
-                                         realtestvec<real_t, size> y)
-  {
-    return x.atan2(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> atanh(realtestvec<real_t, size> x)
-  {
-    return x.atanh();
-  }
-    
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> cbrt(realtestvec<real_t, size> x)
-  {
-    return x.cbrt();
-  }
-    
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> ceil(realtestvec<real_t, size> x)
-  {
-    return x.ceil();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> copysign(realtestvec<real_t, size> x,
-                                            realtestvec<real_t, size> y)
-  {
-    return x.copysign(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> cos(realtestvec<real_t, size> x)
-  {
-    return x.cos();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> cosh(realtestvec<real_t, size> x)
-  {
-    return x.cosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> exp(realtestvec<real_t, size> x)
-  {
-    return x.exp();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> exp10(realtestvec<real_t, size> x)
-  {
-    return x.exp10();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> exp2(realtestvec<real_t, size> x)
-  {
-    return x.exp2();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> expm1(realtestvec<real_t, size> x)
-  {
-    return x.expm1();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fabs(realtestvec<real_t, size> x)
-  {
-    return x.fabs();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> floor(realtestvec<real_t, size> x)
-  {
-    return x.floor();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fdim(realtestvec<real_t, size> x,
-                                        realtestvec<real_t, size> y)
-  {
-    return x.fdim(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fma(realtestvec<real_t, size> x,
-                                       realtestvec<real_t, size> y,
-                                       realtestvec<real_t, size> z)
-  {
-    return x.fma(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fmax(realtestvec<real_t, size> x,
-                                        realtestvec<real_t, size> y)
-  {
-    return x.fmax(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fmin(realtestvec<real_t, size> x,
-                                        realtestvec<real_t, size> y)
-  {
-    return x.fmin(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fmod(realtestvec<real_t, size> x,
-                                        realtestvec<real_t, size> y)
-  {
-    return x.fmod(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> frexp(realtestvec<real_t, size> x,
-                                         inttestvec<real_t, size>* r)
-  {
-    return x.frexp(r);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> hypot(realtestvec<real_t, size> x,
-                                         realtestvec<real_t, size> y)
-  {
-    return x.hypot(y);
-  }
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> ilogb(realtestvec<real_t, size> x)
-  {
-    return x.ilogb();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isfinite(realtestvec<real_t, size> x)
-  {
-    return x.isfinite();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isinf(realtestvec<real_t, size> x)
-  {
-    return x.isinf();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isnan(realtestvec<real_t, size> x)
-  {
-    return x.isnan();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isnormal(realtestvec<real_t, size> x)
-  {
-    return x.isnormal();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x,
-                                  typename inttestvec<real_t, size>::int_t n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x,
-                                  inttestvec<real_t, size> n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> log(realtestvec<real_t, size> x)
-  {
-    return x.log();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> log10(realtestvec<real_t, size> x)
-  {
-    return x.log10();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> log1p(realtestvec<real_t, size> x)
-  {
-    return x.log1p();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> log2(realtestvec<real_t, size> x)
-  {
-    return x.log2();
-  }
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> lrint(realtestvec<real_t, size> x)
-  {
-    return x.lrint();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> mad(realtestvec<real_t, size> x,
-                                       realtestvec<real_t, size> y,
-                                       realtestvec<real_t, size> z)
-  {
-    return x.mad(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> nextafter(realtestvec<real_t, size> x,
-                                             realtestvec<real_t, size> y)
-  {
-    return x.nextafter(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> pow(realtestvec<real_t, size> x,
-                                       realtestvec<real_t, size> y)
-  {
-    return x.pow(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> rcp(realtestvec<real_t, size> x)
-  {
-    return x.rcp();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> remainder(realtestvec<real_t, size> x,
-                                             realtestvec<real_t, size> y)
-  {
-    return x.remainder(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> rint(realtestvec<real_t, size> x)
-  {
-    return x.rint();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> round(realtestvec<real_t, size> x)
-  {
-    return x.round();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> rsqrt(realtestvec<real_t, size> x)
-  {
-    return x.rsqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> signbit(realtestvec<real_t, size> x)
-  {
-    return x.signbit();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> sin(realtestvec<real_t, size> x)
-  {
-    return x.sin();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> sinh(realtestvec<real_t, size> x)
-  {
-    return x.sinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> sqrt(realtestvec<real_t, size> x)
-  {
-    return x.sqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> tan(realtestvec<real_t, size> x)
-  {
-    return x.tan();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> tanh(realtestvec<real_t, size> x)
-  {
-    return x.tanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> trunc(realtestvec<real_t, size> x)
-  {
-    return x.trunc();
-  }
-  
-  
-  
-#ifndef VML_NO_IOSTREAM
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           booltestvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           inttestvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           realtestvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
+
+  realvec_t &operator+=(realvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] += x.v[d];
+    return *this;
+  }
+  realvec_t &operator-=(realvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] -= x.v[d];
+    return *this;
+  }
+  realvec_t &operator*=(realvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] *= x.v[d];
+    return *this;
+  }
+  realvec_t &operator/=(realvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] /= x.v[d];
+    return *this;
   }
+
+  realvec_t operator+(realvec_t x) const {
+    realvec_t res = *this;
+    return res += x;
+  }
+  realvec_t operator-(realvec_t x) const {
+    realvec_t res = *this;
+    return res -= x;
+  }
+  realvec_t operator*(realvec_t x) const {
+    realvec_t res = *this;
+    return res *= x;
+  }
+  realvec_t operator/(realvec_t x) const {
+    realvec_t res = *this;
+    return res /= x;
+  }
+
+  real_t maxval() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = vml_std::fmax(res, v[d]);
+    return res;
+  }
+  real_t minval() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = vml_std::fmin(res, v[d]);
+    return res;
+  }
+  real_t prod() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d)
+      res *= v[d];
+    return res;
+  }
+  real_t sum() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d)
+      res += v[d];
+    return res;
+  }
+
+  boolvec_t operator==(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+  boolvec_t operator<(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] < x.v[d];
+    return res;
+  }
+  boolvec_t operator<=(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] <= x.v[d];
+    return res;
+  }
+  boolvec_t operator>(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] > x.v[d];
+    return res;
+  }
+  boolvec_t operator>=(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] >= x.v[d];
+    return res;
+  }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const { return MF::vml_ceil(*this); }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const { return MF::vml_floor(*this); }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return MF::vml_fmax(*this, y); }
+  realvec_t fmin(realvec_t y) const { return MF::vml_fmin(*this, y); }
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  intvec_t lrint() const { return MF::vml_lrint(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return MF::vml_rcp(*this); }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const { return MF::vml_rint(*this); }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return MF::vml_sqrt(*this); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const { return MF::vml_trunc(*this); }
+};
+
+// booltestvec definitions
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::intvec_t booltestvec<T, N>::as_int() const {
+  return convert_int();
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::intvec_t
+booltestvec<T, N>::convert_int() const {
+  intvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d];
+  return res;
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::boolvec_t
+booltestvec<T, N>::ifthen(boolvec_t x, boolvec_t y) const {
+  boolvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d] ? x.v[d] : y.v[d];
+  return res;
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::intvec_t
+booltestvec<T, N>::ifthen(intvec_t x, intvec_t y) const {
+  intvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d] ? x.v[d] : y.v[d];
+  return res;
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::realvec_t
+booltestvec<T, N>::ifthen(realvec_t x, realvec_t y) const {
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d] ? x.v[d] : y.v[d];
+  return res;
+}
+
+// inttestvec definitions
+
+template <typename T, int N>
+inline typename inttestvec<T, N>::realvec_t inttestvec<T, N>::as_float() const {
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = FP::as_float(v[d]);
+  return res;
+}
+
+template <typename T, int N>
+inline typename inttestvec<T, N>::realvec_t
+inttestvec<T, N>::convert_float() const {
+  return MF::vml_convert_float(*this);
+}
+
+// Wrappers
+
+// booltestvec wrappers
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> as_int(booltestvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> convert_int(booltestvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline bool all(booltestvec<real_t, size> x) {
+  return x.all();
+}
+
+template <typename real_t, int size>
+inline bool any(booltestvec<real_t, size> x) {
+  return x.any();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
+                                        booltestvec<real_t, size> x,
+                                        booltestvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
+                                       inttestvec<real_t, size> x,
+                                       inttestvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
+                                        realtestvec<real_t, size> x,
+                                        realtestvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+// inttestvec wrappers
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> abs(inttestvec<real_t, size> x) {
+  return x.abs();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> as_bool(inttestvec<real_t, size> x) {
+  return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> as_float(inttestvec<real_t, size> x) {
+  return x.as_float();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> bitifthen(inttestvec<real_t, size> x,
+                                          inttestvec<real_t, size> y,
+                                          inttestvec<real_t, size> z) {
+  return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> clz(inttestvec<real_t, size> x) {
+  return x.clz();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> convert_bool(inttestvec<real_t, size> x) {
+  return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> convert_float(inttestvec<real_t, size> x) {
+  return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isignbit(inttestvec<real_t, size> x) {
+  return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size>
+lsr(inttestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> lsr(inttestvec<real_t, size> x,
+                                    inttestvec<real_t, size> n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> max(inttestvec<real_t, size> x,
+                                    inttestvec<real_t, size> y) {
+  return x.max(y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> min(inttestvec<real_t, size> x,
+                                    inttestvec<real_t, size> y) {
+  return x.min(y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> popcount(inttestvec<real_t, size> x) {
+  return x.popcount();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size>
+rotate(inttestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) {
+  return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> rotate(inttestvec<real_t, size> x,
+                                       inttestvec<real_t, size> n) {
+  return x.rotate(n);
+}
+
+// realtestvec wrappers
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+loada(real_t const *p, realtestvec<real_t, size> x,
+      typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+loadu(real_t const *p, realtestvec<real_t, size> x,
+      typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+loadu(real_t const *p, size_t ioff, realtestvec<real_t, size> x,
+      typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realtestvec<real_t, size> x, real_t *p) {
+  return x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p) {
+  return x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p, size_t ioff) {
+  return x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realtestvec<real_t, size> x, real_t *p,
+                   typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p,
+                   typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p, size_t ioff,
+                   typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> as_int(realtestvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> convert_int(realtestvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline real_t maxval(realtestvec<real_t, size> x) {
+  return x.maxval();
+}
+
+template <typename real_t, int size>
+inline real_t minval(realtestvec<real_t, size> x) {
+  return x.minval();
+}
+
+template <typename real_t, int size>
+inline real_t prod(realtestvec<real_t, size> x) {
+  return x.prod();
+}
+
+template <typename real_t, int size>
+inline real_t sum(realtestvec<real_t, size> x) {
+  return x.sum();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> acos(realtestvec<real_t, size> x) {
+  return x.acos();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> acosh(realtestvec<real_t, size> x) {
+  return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> asin(realtestvec<real_t, size> x) {
+  return x.asin();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> asinh(realtestvec<real_t, size> x) {
+  return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> atan(realtestvec<real_t, size> x) {
+  return x.atan();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> atan2(realtestvec<real_t, size> x,
+                                       realtestvec<real_t, size> y) {
+  return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> atanh(realtestvec<real_t, size> x) {
+  return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> cbrt(realtestvec<real_t, size> x) {
+  return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> ceil(realtestvec<real_t, size> x) {
+  return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> copysign(realtestvec<real_t, size> x,
+                                          realtestvec<real_t, size> y) {
+  return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> cos(realtestvec<real_t, size> x) {
+  return x.cos();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> cosh(realtestvec<real_t, size> x) {
+  return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> exp(realtestvec<real_t, size> x) {
+  return x.exp();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> exp10(realtestvec<real_t, size> x) {
+  return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> exp2(realtestvec<real_t, size> x) {
+  return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> expm1(realtestvec<real_t, size> x) {
+  return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fabs(realtestvec<real_t, size> x) {
+  return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> floor(realtestvec<real_t, size> x) {
+  return x.floor();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fdim(realtestvec<real_t, size> x,
+                                      realtestvec<real_t, size> y) {
+  return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fma(realtestvec<real_t, size> x,
+                                     realtestvec<real_t, size> y,
+                                     realtestvec<real_t, size> z) {
+  return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fmax(realtestvec<real_t, size> x,
+                                      realtestvec<real_t, size> y) {
+  return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fmin(realtestvec<real_t, size> x,
+                                      realtestvec<real_t, size> y) {
+  return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fmod(realtestvec<real_t, size> x,
+                                      realtestvec<real_t, size> y) {
+  return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> frexp(realtestvec<real_t, size> x,
+                                       inttestvec<real_t, size> *r) {
+  return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> hypot(realtestvec<real_t, size> x,
+                                       realtestvec<real_t, size> y) {
+  return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> ilogb(realtestvec<real_t, size> x) {
+  return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isfinite(realtestvec<real_t, size> x) {
+  return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isinf(realtestvec<real_t, size> x) {
+  return x.isinf();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isnan(realtestvec<real_t, size> x) {
+  return x.isnan();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isnormal(realtestvec<real_t, size> x) {
+  return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+ldexp(realtestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x,
+                                       inttestvec<real_t, size> n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log(realtestvec<real_t, size> x) {
+  return x.log();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log10(realtestvec<real_t, size> x) {
+  return x.log10();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log1p(realtestvec<real_t, size> x) {
+  return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log2(realtestvec<real_t, size> x) {
+  return x.log2();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> lrint(realtestvec<real_t, size> x) {
+  return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> mad(realtestvec<real_t, size> x,
+                                     realtestvec<real_t, size> y,
+                                     realtestvec<real_t, size> z) {
+  return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> nextafter(realtestvec<real_t, size> x,
+                                           realtestvec<real_t, size> y) {
+  return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> pow(realtestvec<real_t, size> x,
+                                     realtestvec<real_t, size> y) {
+  return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> rcp(realtestvec<real_t, size> x) {
+  return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> remainder(realtestvec<real_t, size> x,
+                                           realtestvec<real_t, size> y) {
+  return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> rint(realtestvec<real_t, size> x) {
+  return x.rint();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> round(realtestvec<real_t, size> x) {
+  return x.round();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> rsqrt(realtestvec<real_t, size> x) {
+  return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> signbit(realtestvec<real_t, size> x) {
+  return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> sin(realtestvec<real_t, size> x) {
+  return x.sin();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> sinh(realtestvec<real_t, size> x) {
+  return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> sqrt(realtestvec<real_t, size> x) {
+  return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> tan(realtestvec<real_t, size> x) {
+  return x.tan();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> tanh(realtestvec<real_t, size> x) {
+  return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> trunc(realtestvec<real_t, size> x) {
+  return x.trunc();
+}
+
+#ifndef VML_NO_IOSTREAM
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, booltestvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, inttestvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, realtestvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
 #endif
-  
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_TEST_H
+#endif // #ifndef VEC_TEST_H
diff --git a/lib/kernel/vecmathlib/vec_vsx_double2.h b/lib/kernel/vecmathlib/vec_vsx_double2.h
index 6725859..fa43a6f 100644
--- a/lib/kernel/vecmathlib/vec_vsx_double2.h
+++ b/lib/kernel/vecmathlib/vec_vsx_double2.h
@@ -13,679 +13,572 @@
 #include <altivec.h>
 
 #if defined __clang__
-#  define __vector vector
-#  define __pixel pixel
-#  define __bool bool
+#define __vector vector
+#define __pixel pixel
+#define __bool bool
 #elif defined __gcc__
-#  undef vector
-#  undef pixel
-#  undef bool
+#undef vector
+#undef pixel
+#undef bool
 #elif defined __xlC__
-#  define __bool bool
+#define __bool bool
 #else
-#  error "Unknown compiler"
+#error "Unknown compiler"
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_2
-  template<> struct boolvec<double,2>;
-  template<> struct intvec<double,2>;
-  template<> struct realvec<double,2>;
-  
-  
-  
-  template<>
-  struct boolvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef bool scalar_t;
-    typedef __vector __bool long long bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    // truth values are interpreted bit-wise
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v((bvector_t)vec_splats((unsigned long long)from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vec_nor(v, v); }
-    
-    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
-    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-    
-    bool all() const { return vec_all_ne(v, BV(false)); }
-    bool any() const { return vec_any_ne(v, BV(false)); }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef int_t scalar_t;
-    typedef __vector signed long long ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vec_splats((long long)a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota() { return (__vector signed long long){0, 1}; }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return (__vector __bool long long)v; }
-    boolvec_t convert_bool() const { return *this != IV(I(0)); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Permutation control words
-  private:
-    // 0123 4567 -> 1436
-    // exchange pairs
-    static __vector unsigned char perm_int_swap()
-    {
-      return
-        (__vector unsigned char)
-        {4,5,6,7, 16,17,18,19, 12,13,14,15, 24,25,26,27};
-    }
-    // 0123 4567 -> 0426
-    // broadcast high elements of pairs
-    static __vector unsigned char perm_int_bchi()
-    {
-      return
-        (__vector unsigned char)
-        {0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
-    }
-  public:
-    
-    
-
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return vec_neg(v); }
-    
-    intvec operator+(intvec x) const { return vec_add(v, x.v); }
-    intvec operator-(intvec x) const { return vec_sub(v, x.v); }
-    intvec operator*(intvec x) const { return vec_mul(v, x.v); }
-    intvec operator/(intvec x) const { return vec_div(v, x.v); }
-    intvec operator%(intvec x) const { return *this - *this / x * x; }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    intvec& operator*=(intvec const& x) { return *this=*this*x; }
-    intvec& operator/=(intvec const& x) { return *this=*this/x; }
-    intvec& operator%=(intvec const& x) { return *this=*this%x; }
-    
-    
-    
-    intvec operator~() const
-    {
-      return (__vector signed long long)vec_nor((__vector signed int)v, (__vector signed int)v);
-    }
-    
-    intvec operator&(intvec x) const
-    {
-      return (__vector signed long long)vec_and((__vector signed int)v, (__vector signed int)x.v);
-    }
-    intvec operator|(intvec x) const
-    {
-      return (__vector signed long long)vec_or ((__vector signed int)v, (__vector signed int)x.v);
-    }
-    intvec operator^(intvec x) const
-    {
-      return (__vector signed long long)vec_xor((__vector signed int)v, (__vector signed int)x.v);
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec lsr(int_t n) const { return lsr(IV(n)); }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      // return vec_sr(v, (__vector unsigned long long)n.v);
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      // return vec_sra(v, (__vector unsigned long long)n.v);
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      // return vec_sl(v, (__vector unsigned long long)n.v);
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      // return vec_cmpeq(v, x.v);
-      __vector signed int a = (__vector signed int)v;
-      __vector signed int b = (__vector signed int)x.v;
-      __vector __bool int c = vec_cmpeq(a, b);
-      __vector __bool int cx = vec_perm(c, c, perm_int_swap());
-      __vector __bool int r = vec_and(c, cx);
-      return (__vector __bool long long)r;
-    }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const
-    {
-      __vector signed int a = (__vector signed int)v;
-      __vector signed int b = (__vector signed int)x.v;
-      __vector __bool int lt = vec_cmplt(a, b);
-      __vector __bool int eq = vec_cmpeq(a, b);
-      __vector unsigned int ua = (__vector unsigned int)v;
-      __vector unsigned int ub = (__vector unsigned int)x.v;
-      __vector __bool int ult = vec_cmplt(ua, ub);
-      __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap());
-      __vector __bool int r = vec_or(lt, vec_and(eq, ultx));
-      r = vec_perm(r, r, perm_int_bchi());
-      return (__vector __bool long long)r;
-    }
-    boolvec_t operator<=(intvec const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return (*this >> (bits-1)).as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef real_t scalar_t;
-    typedef __vector double vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<VSX:2*double>"; }
-    void barrier() { __asm__("": "+v"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vec_splats(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vec_xld2(0, (real_t*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      // TODO: Can this handle unaligned access?
-      return vec_xld2(0, (real_t*)p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vec_xstd2(v, 0, p);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return (__vector signed long long) v; }
-    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return RV(0.0) - *this; }
-    
-    realvec operator+(realvec x) const { return vec_add(v, x.v); }
-    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
-    realvec operator*(realvec x) const { return vec_mul(v, x.v); }
-    realvec operator/(realvec x) const { return vec_div(v, x.v); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      return vml_std::fmax((*this)[0], (*this)[1]);
+template <> struct boolvec<double, 2>;
+template <> struct intvec<double, 2>;
+template <> struct realvec<double, 2>;
+
+template <> struct boolvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef bool scalar_t;
+  typedef __vector __bool long long bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values are -1, false values are 0
+  // truth values are interpreted bit-wise
+  static uint_t from_bool(bool a) { return -int_t(a); }
+  static bool to_bool(uint_t a) { return a; }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a)
+      : v((bvector_t)vec_splats((unsigned long long)from_bool(a))) {}
+  boolvec(bool const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return vec_nor(v, v); }
+
+  boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+  boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+  boolvec operator==(boolvec x) const { return !(*this != x); }
+  boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+
+  bool all() const { return vec_all_ne(v, BV(false)); }
+  bool any() const { return vec_any_ne(v, BV(false)); }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef int_t scalar_t;
+  typedef __vector signed long long ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vec_splats((long long)a)) {}
+  intvec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() { return (__vector signed long long){0, 1}; }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  // Vector casts do not change the bit battern
+  boolvec_t as_bool() const { return (__vector __bool long long)v; }
+  boolvec_t convert_bool() const { return *this != IV(I(0)); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Permutation control words
+private:
+  // 0123 4567 -> 1436
+  // exchange pairs
+  static __vector unsigned char perm_int_swap() {
+    return (__vector unsigned char){4,  5,  6,  7,  16, 17, 18, 19,
+                                    12, 13, 14, 15, 24, 25, 26, 27};
+  }
+  // 0123 4567 -> 0426
+  // broadcast high elements of pairs
+  static __vector unsigned char perm_int_bchi() {
+    return (__vector unsigned char){0, 1, 2,  3,  16, 17, 18, 19,
+                                    8, 9, 10, 11, 24, 25, 26, 27};
+  }
+
+public:
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return vec_neg(v); }
+
+  intvec operator+(intvec x) const { return vec_add(v, x.v); }
+  intvec operator-(intvec x) const { return vec_sub(v, x.v); }
+  intvec operator*(intvec x) const { return vec_mul(v, x.v); }
+  intvec operator/(intvec x) const { return vec_div(v, x.v); }
+  intvec operator%(intvec x) const { return *this - *this / x * x; }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+  intvec &operator*=(intvec const &x) { return *this = *this * x; }
+  intvec &operator/=(intvec const &x) { return *this = *this / x; }
+  intvec &operator%=(intvec const &x) { return *this = *this % x; }
+
+  intvec operator~() const {
+    return (__vector signed long long)vec_nor((__vector signed int)v,
+                                              (__vector signed int)v);
+  }
+
+  intvec operator&(intvec x) const {
+    return (__vector signed long long)vec_and((__vector signed int)v,
+                                              (__vector signed int)x.v);
+  }
+  intvec operator|(intvec x) const {
+    return (__vector signed long long)vec_or((__vector signed int)v,
+                                             (__vector signed int)x.v);
+  }
+  intvec operator^(intvec x) const {
+    return (__vector signed long long)vec_xor((__vector signed int)v,
+                                              (__vector signed int)x.v);
+  }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec lsr(int_t n) const { return lsr(IV(n)); }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { return *this >> IV(n); }
+  intvec operator<<(int_t n) const { return *this << IV(n); }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec lsr(intvec n) const {
+    // return vec_sr(v, (__vector unsigned long long)n.v);
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    real_t minval() const
-    {
-      return vml_std::fmin((*this)[0], (*this)[1]);
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const {
+    // return vec_sra(v, (__vector unsigned long long)n.v);
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
+    }
+    return r;
+  }
+  intvec operator<<(intvec n) const {
+    // return vec_sl(v, (__vector unsigned long long)n.v);
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1];
+    return r;
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec const &x) const {
+    // return vec_cmpeq(v, x.v);
+    __vector signed int a = (__vector signed int)v;
+    __vector signed int b = (__vector signed int)x.v;
+    __vector __bool int c = vec_cmpeq(a, b);
+    __vector __bool int cx = vec_perm(c, c, perm_int_swap());
+    __vector __bool int r = vec_and(c, cx);
+    return (__vector __bool long long)r;
+  }
+  boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(intvec const &x) const {
+    __vector signed int a = (__vector signed int)v;
+    __vector signed int b = (__vector signed int)x.v;
+    __vector __bool int lt = vec_cmplt(a, b);
+    __vector __bool int eq = vec_cmpeq(a, b);
+    __vector unsigned int ua = (__vector unsigned int)v;
+    __vector unsigned int ub = (__vector unsigned int)x.v;
+    __vector __bool int ult = vec_cmplt(ua, ub);
+    __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap());
+    __vector __bool int r = vec_or(lt, vec_and(eq, ultx));
+    r = vec_perm(r, r, perm_int_bchi());
+    return (__vector __bool long long)r;
+  }
+  boolvec_t operator<=(intvec const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return (*this >> (bits - 1)).as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef real_t scalar_t;
+  typedef __vector double vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<VSX:2*double>"; }
+  void barrier() { __asm__("" : "+v"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(vec_splats(a)) {}
+  realvec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return vec_xld2(0, (real_t *)p);
+  }
+  static realvec_t loadu(real_t const *p) {
+    // TODO: Can this handle unaligned access?
+    return vec_xld2(0, (real_t *)p);
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    real_t sum() const
-    {
-      return (*this)[0] + (*this)[1];
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return vec_ceil(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vec_abs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return vec_floor(v); }
-    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
-    realvec fmax(realvec y) const { return vec_max(v, y.v); }
-    realvec fmin(realvec y) const { return vec_min(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    vec_xstd2(v, 0, p);
+  }
+  void storeu(real_t *p) const {
+    // Vector stores would require vector loads, which would need to
+    // be atomic
+    // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+    // for good ideas
+    p[0] = (*this)[0];
+    p[1] = (*this)[1];
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      // Use vec_ste?
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
     }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec x = *this;
-      realvec r = vec_re(v);    // this is only an approximation
-      // TODO: use fma
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors
-      r += r * (RV(1.0) - x*r); // two Newton iterations (see vml_rcp)
-      r += r * (RV(1.0) - x*r);
-      return r;
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      // Use vec_ste?
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
     }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const { return vec_round(v); /* sic! */}
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const { return RV(1.0) / sqrt(); }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return vec_sqrt(v); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return vec_trunc(v); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,2> boolvec<double,2>::as_int() const
-  {
-    return (__vector signed long long) v;
-  }
-  
-  inline intvec<double,2> boolvec<double,2>::convert_int() const
-  {
-    return -(__vector signed long long)v;
-  }
-  
-  inline
-  boolvec<double,2> boolvec<double,2>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  intvec<double,2> boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  realvec<double,2> boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline intvec<double,2> intvec<double,2>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline realvec<double,2> intvec<double,2>::as_float() const
-  {
-    return (__vector double)v;
-  }
-  
-  inline intvec<double,2> intvec<double,2>::bitifthen(intvec_t x,
-						      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<double,2> intvec<double,2>::convert_float() const
-  {
-    // return vec_ctd(v, 0);
-    return MF::vml_convert_float(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return (__vector signed long long)v; }
+  intvec_t convert_int() const { return MF::vml_convert_int(*this); }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return -v; } // true negation: 0.0 - x would turn +0.0 into +0.0 instead of -0.0
+
+  realvec operator+(realvec x) const { return vec_add(v, x.v); }
+  realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+  realvec operator*(realvec x) const { return vec_mul(v, x.v); }
+  realvec operator/(realvec x) const { return vec_div(v, x.v); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const { return vml_std::fmax((*this)[0], (*this)[1]); }
+  real_t minval() const { return vml_std::fmin((*this)[0], (*this)[1]); }
+  real_t prod() const { return (*this)[0] * (*this)[1]; }
+  real_t sum() const { return (*this)[0] + (*this)[1]; }
+
+  boolvec_t operator==(realvec const &x) const { return vec_cmpeq(v, x.v); }
+  boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(realvec const &x) const { return vec_cmplt(v, x.v); }
+  boolvec_t operator<=(realvec const &x) const { return vec_cmple(v, x.v); }
+  boolvec_t operator>(realvec const &x) const { return vec_cmpgt(v, x.v); }
+  boolvec_t operator>=(realvec const &x) const { return vec_cmpge(v, x.v); }
+
+  realvec acos() const { return MF::vml_acos(*this); }
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const { return vec_ceil(v); }
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return vec_abs(v); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const { return vec_floor(v); }
+  realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+  realvec fmax(realvec y) const { return vec_max(v, y.v); }
+  realvec fmin(realvec y) const { return vec_min(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const {
+    realvec x = *this;
+    realvec r = vec_re(v); // this is only an approximation
+    // TODO: use fma
+    // Note: don't rewrite this expression, this may introduce
+    // cancellation errors
+    r += r * (RV(1.0) - x * r); // two Newton iterations (see vml_rcp)
+    r += r * (RV(1.0) - x * r);
+    return r;
+  }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const { return vec_round(v); /* sic! */ }
+  realvec round() const { return MF::vml_round(*this); }
+  realvec rsqrt() const { return RV(1.0) / sqrt(); }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); }
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const { return vec_sqrt(v); }
+  realvec tan() const { return MF::vml_tan(*this); }
+  realvec tanh() const { return MF::vml_tanh(*this); }
+  realvec trunc() const { return vec_trunc(v); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 2> boolvec<double, 2>::as_int() const {
+  return (__vector signed long long)v;
+}
+
+inline intvec<double, 2> boolvec<double, 2>::convert_int() const {
+  return -(__vector signed long long)v;
+}
+
+inline boolvec<double, 2> boolvec<double, 2>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+inline intvec<double, 2> boolvec<double, 2>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+inline realvec<double, 2> boolvec<double, 2>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+// intvec definitions
+
+inline intvec<double, 2> intvec<double, 2>::abs() const {
+  return MF::vml_abs(*this);
+}
+
+inline realvec<double, 2> intvec<double, 2>::as_float() const {
+  return (__vector double)v;
+}
+
+inline intvec<double, 2> intvec<double, 2>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 2> intvec<double, 2>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<double, 2> intvec<double, 2>::convert_float() const {
+  // return vec_ctd(v, 0);
+  return MF::vml_convert_float(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_VSX_DOUBLE2_H
+#endif // #ifndef VEC_VSX_DOUBLE2_H
diff --git a/lib/kernel/vecmathlib/vecmathlib.h b/lib/kernel/vecmathlib/vecmathlib.h
index 9accd24..0d72add 100644
--- a/lib/kernel/vecmathlib/vecmathlib.h
+++ b/lib/kernel/vecmathlib/vecmathlib.h
@@ -4,16 +4,14 @@
 #define VECMATHLIB_H
 
 #if defined VML_DEBUG || defined VML_NODEBUG
-#  if defined VML_DEBUG && defined VML_NODEBUG
-#    error "Only one of VML_DEBUG or VML_NODEBUG may be defined"
-#  endif
+#if defined VML_DEBUG && defined VML_NODEBUG
+#error "Only one of VML_DEBUG or VML_NODEBUG may be defined"
+#endif
 #else
 // default
-#  define VML_DEBUG
+#define VML_DEBUG
 #endif
 
-
-
 // FP settings
 
 // Possible effects of not having VML_HAVE_FP_CONTRACT:
@@ -23,7 +21,7 @@
 // - can evaluate functions with reduced precision (80% of significant digits)
 
 // default settings
-#undef VML_HAVE_DENORMALS       // TODO
+#undef VML_HAVE_DENORMALS // TODO
 #define VML_HAVE_FP_CONTRACT
 #define VML_HAVE_INF
 #define VML_HAVE_NAN
@@ -31,63 +29,59 @@
 
 // optimized settings
 #ifdef __FAST_MATH__
-#  undef VML_HAVE_DENORMALS
-#  undef VML_HAVE_FP_CONTRACT
-#  undef VML_HAVE_INF
-#  undef VML_HAVE_NAN
+#undef VML_HAVE_DENORMALS
+#undef VML_HAVE_FP_CONTRACT
+#undef VML_HAVE_INF
+#undef VML_HAVE_NAN
 #endif
 
 #ifdef VML_DEBUG
-#  define VML_CONFIG_DEBUG " debug"
+#define VML_CONFIG_DEBUG " debug"
 #else
-#  define VML_CONFIG_DEBUG " no-debug"
+#define VML_CONFIG_DEBUG " no-debug"
 #endif
 #ifdef VML_DENORMALS
-#  define VML_CONFIG_DENORMALS " denormals"
+#define VML_CONFIG_DENORMALS " denormals"
 #else
-#  define VML_CONFIG_DENORMALS " no-denormals"
+#define VML_CONFIG_DENORMALS " no-denormals"
 #endif
 #ifdef VML_FP_CONTRACT
-#  define VML_CONFIG_FP_CONTRACT " fp-contract"
+#define VML_CONFIG_FP_CONTRACT " fp-contract"
 #else
-#  define VML_CONFIG_FP_CONTRACT " no-fp-contract"
+#define VML_CONFIG_FP_CONTRACT " no-fp-contract"
 #endif
 #ifdef VML_INF
-#  define VML_CONFIG_INF " inf"
+#define VML_CONFIG_INF " inf"
 #else
-#  define VML_CONFIG_INF " no-inf"
+#define VML_CONFIG_INF " no-inf"
 #endif
 #ifdef VML_NAN
-#  define VML_CONFIG_NAN " nan"
+#define VML_CONFIG_NAN " nan"
 #else
-#  define VML_CONFIG_NAN " no-nan"
+#define VML_CONFIG_NAN " no-nan"
 #endif
 
 // TODO: introduce mad, as fast version of fma (check FP_FAST_FMA)
 // TODO: introduce ieee_isnan and friends
 // TODO: switch between isnan and ieee_isnan at an outside level
 
-
-
 // This workaround is needed for older libstdc++ versions such as the
 // one in Debian 6.0 when compiled with clang++
 // <http://lists.cs.uiuc.edu/pipermail/cfe-dev/2011-February/013207.html>.
 // The version time stamp used below is the one in Debian 6.0.
-#include <cstring>              // pull in __GLIBCXX__
+#include <cstring> // pull in __GLIBCXX__
 #if defined __GLIBCXX__ && __GLIBCXX__ <= 20101114
-namespace std { class type_info; }
+namespace std {
+class type_info;
+}
 #endif
 
-
-
 #include <cassert>
 
-
-
 #ifdef VML_DEBUG
-#  define VML_ASSERT(x) assert(x)
+#define VML_ASSERT(x) assert(x)
 #else
-#  define VML_ASSERT(x) ((void)0)
+#define VML_ASSERT(x) ((void)0)
 #endif
 
 // Scalarise all vector operations, and use libm's functions (mostly
@@ -96,146 +90,142 @@ namespace std { class type_info; }
 
 #ifdef __clang__
 // Use compiler-provided vector types
-#  include "vec_builtin.h"
+#include "vec_builtin.h"
 #endif
 
 // Scalarise all vector operations; don't use libm, use only
 // Vecmathlib's functions (mostly useful for testing Vecmathlib)
 #include "vec_test.h"
 
-#if defined __ARM_NEON__        // ARM NEON
-#  include "vec_neon_float2.h"
-#  include "vec_neon_float4.h"
-#  define VML_CONFIG_NEON " NEON"
-#else
-#  define VML_CONFIG_NEON
-#endif
-
-#if defined __SSE2__            // Intel SSE 2
-#  include "vec_sse_float1.h"
-#  include "vec_sse_float4.h"
-#  include "vec_sse_double1.h"
-#  include "vec_sse_double2.h"
-#  if defined __SSE3__
-#    define VML_CONFIG_SSE3 " SSE3"
-#  else
-#    define VML_CONFIG_SSE3
-#  endif
-#  if defined __SSSE3__
-#    define VML_CONFIG_SSSE3 " SSSE3"
-#  else
-#    define VML_CONFIG_SSSE3
-#  endif
-#  if defined __SSE4_1__
-#    define VML_CONFIG_SSE4_1 " SSE4.1"
-#  else
-#    define VML_CONFIG_SSE4_1
-#  endif
-#  if defined __SSE4a__
-#    define VML_CONFIG_SSE4a " SSE4a"
-#  else
-#    define VML_CONFIG_SSE4a
-#  endif
-#  define VML_CONFIG_SSE2 " SSE2" VML_CONFIG_SSE3 VML_CONFIG_SSSE3 VML_CONFIG_SSE4_1 VML_CONFIG_SSE4a
-#else
-#  define VML_CONFIG_SSE2
-#endif
-
-#if defined __AVX__             // Intel AVX
-#  include "vec_avx_fp8_32.h"
-#  include "vec_avx_fp16_16.h"
-#  include "vec_avx_float8.h"
-#  include "vec_avx_double4.h"
-#  define VML_CONFIG_AVX " AVX"
-#else
-#  define VML_CONFIG_AVX
-#endif
-
-#if defined __MIC__             // Intel MIC
+#if defined __ARM_NEON__ // ARM NEON
+#include "vec_neon_float2.h"
+#include "vec_neon_float4.h"
+#define VML_CONFIG_NEON " NEON"
+#else
+#define VML_CONFIG_NEON
+#endif
+
+#if defined __SSE2__ // Intel SSE 2
+#include "vec_sse_float1.h"
+#include "vec_sse_float4.h"
+#include "vec_sse_double1.h"
+#include "vec_sse_double2.h"
+#if defined __SSE3__
+#define VML_CONFIG_SSE3 " SSE3"
+#else
+#define VML_CONFIG_SSE3
+#endif
+#if defined __SSSE3__
+#define VML_CONFIG_SSSE3 " SSSE3"
+#else
+#define VML_CONFIG_SSSE3
+#endif
+#if defined __SSE4_1__
+#define VML_CONFIG_SSE4_1 " SSE4.1"
+#else
+#define VML_CONFIG_SSE4_1
+#endif
+#if defined __SSE4a__
+#define VML_CONFIG_SSE4a " SSE4a"
+#else
+#define VML_CONFIG_SSE4a
+#endif
+#define VML_CONFIG_SSE2                                                        \
+  " SSE2" VML_CONFIG_SSE3 VML_CONFIG_SSSE3 VML_CONFIG_SSE4_1 VML_CONFIG_SSE4a
+#else
+#define VML_CONFIG_SSE2
+#endif
+
+#if defined __AVX__ // Intel AVX
+#include "vec_avx_fp8_32.h"
+#include "vec_avx_fp16_16.h"
+#include "vec_avx_float8.h"
+#include "vec_avx_double4.h"
+#define VML_CONFIG_AVX " AVX"
+#else
+#define VML_CONFIG_AVX
+#endif
+
+#if defined __MIC__ // Intel MIC
 // TODO: single precision?
-#  include "vec_mic_double8.h"
-#  define VML_CONFIG_MIC " MIC"
+#include "vec_mic_double8.h"
+#define VML_CONFIG_MIC " MIC"
 #else
-#  define VML_CONFIG_MIC
+#define VML_CONFIG_MIC
 #endif
 
-#if defined __ALTIVEC__         // IBM Altivec
-#  include "vec_altivec_float4.h"
-#  define VML_CONFIG_ALTIVEC " Altivec"
+#if defined __ALTIVEC__ // IBM Altivec
+#include "vec_altivec_float4.h"
+#define VML_CONFIG_ALTIVEC " Altivec"
 #else
-#  define VML_CONFIG_ALTIVEC
+#define VML_CONFIG_ALTIVEC
 #endif
 #if defined __ALTIVEC__ && defined _ARCH_PWR7 // IBM VSX
-#  include "vec_vsx_double2.h"
-#  define VML_CONFIG_VSX " VSX"
+#include "vec_vsx_double2.h"
+#define VML_CONFIG_VSX " VSX"
 #else
-#  define VML_CONFIG_VSX
+#define VML_CONFIG_VSX
 #endif
 
 // TODO: IBM Blue Gene/P DoubleHummer
 
 #if defined __bgq__ && defined __VECTOR4DOUBLE__ // IBM Blue Gene/Q QPX
 // TODO: vec_qpx_float4
-#  include "vec_qpx_double4.h"
-#  define VML_CONFIG_QPX " QPX"
+#include "vec_qpx_double4.h"
+#define VML_CONFIG_QPX " QPX"
 #else
-#  define VML_CONFIG_QPX
+#define VML_CONFIG_QPX
 #endif
 
-#define VECMATHLIB_CONFIGURATION                                        \
-  "VecmathlibConfiguration"                                             \
-  VML_CONFIG_DEBUG                                                      \
-  VML_CONFIG_DENORMALS VML_CONFIG_FP_CONTRACT VML_CONFIG_INF VML_CONFIG_NAN \
-  VML_CONFIG_NEON                                                       \
-  VML_CONFIG_SSE2 VML_CONFIG_AVX VML_CONFIG_MIC                         \
-  VML_CONFIG_ALTIVEC VML_CONFIG_VSX                                     \
-  VML_CONFIG_QPX
-
-
+#define VECMATHLIB_CONFIGURATION                                               \
+  "VecmathlibConfiguration" VML_CONFIG_DEBUG VML_CONFIG_DENORMALS              \
+      VML_CONFIG_FP_CONTRACT VML_CONFIG_INF VML_CONFIG_NAN VML_CONFIG_NEON     \
+          VML_CONFIG_SSE2 VML_CONFIG_AVX VML_CONFIG_MIC VML_CONFIG_ALTIVEC     \
+              VML_CONFIG_VSX VML_CONFIG_QPX
 
 // Define "best" vector types
 namespace vecmathlib {
-  
+
 #if defined VECMATHLIB_HAVE_VEC_FLOAT_16
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 16
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 16
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_8
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 8
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 8
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_4
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 4
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 4
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_2
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 2
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 2
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_1
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 1
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 1
 #endif
-  
+
 #if defined VECMATHLIB_HAVE_VEC_DOUBLE_8
-#  define VECMATHLIB_MAX_DOUBLE_VECSIZE 8
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 8
 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_4
-#  define VECMATHLIB_MAX_DOUBLE_VECSIZE 4
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 4
 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2
-#  define VECMATHLIB_MAX_DOUBLE_VECSIZE 2
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 2
 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_1
-#  define VECMATHLIB_MAX_DOUBLE_VECSIZE 1
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 1
 #endif
-  
+
 #ifdef VECMATHLIB_MAX_FLOAT_VECSIZE
-  typedef realvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> float32_vec;
-  typedef intvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE>  int32_vec;
-  typedef boolvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> bool32_vec;
+typedef realvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> float32_vec;
+typedef intvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> int32_vec;
+typedef boolvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> bool32_vec;
 #else
-  typedef realpseudovec<float,1> float32_vec;
-  typedef intpseudovec<float,1>  int32_vec;
-  typedef boolpseudovec<float,1> bool32_vec;
+typedef realpseudovec<float, 1> float32_vec;
+typedef intpseudovec<float, 1> int32_vec;
+typedef boolpseudovec<float, 1> bool32_vec;
 #endif
-  
+
 #ifdef VECMATHLIB_MAX_DOUBLE_VECSIZE
-  typedef realvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> float64_vec;
-  typedef intvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE>  int64_vec;
-  typedef boolvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> bool64_vec;
+typedef realvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> float64_vec;
+typedef intvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> int64_vec;
+typedef boolvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> bool64_vec;
 #else
-  typedef realpseudovec<double,1> float64_vec;
-  typedef intpseudovec<double,1>  int64_vec;
-  typedef boolpseudovec<double,1> bool64_vec;
+typedef realpseudovec<double, 1> float64_vec;
+typedef intpseudovec<double, 1> int64_vec;
+typedef boolpseudovec<double, 1> bool64_vec;
 #endif
 }
 
diff --git a/lib/kernel/write_image.cl b/lib/kernel/write_image.cl
index 47f543b..0c824e0 100644
--- a/lib/kernel/write_image.cl
+++ b/lib/kernel/write_image.cl
@@ -34,9 +34,8 @@
 #endif
 
 /* writes pixel to coord in image */
-void pocl_write_pixel (void* color_, void* image, int4 coord)
+void pocl_write_pixel (void* color_, ADDRESS_SPACE dev_image_t* dev_image, int4 coord)
 {  
-  ADDRESS_SPACE dev_image_t* dev_image = *((ADDRESS_SPACE dev_image_t**)image);
   uint4 *color = (uint4*)color_;
   int i, idx;
   int width = dev_image->width;
@@ -88,7 +87,7 @@ void pocl_write_pixel (void* color_, void* image, int4 coord)
   {                                                                     \
   int4 coord4;                                                          \
   INITCOORD##__COORD__(coord4, coord);                                  \
-  pocl_write_pixel (&color, &image, coord4);                             \
+  pocl_write_pixel (&color, *(ADDRESS_SPACE dev_image_t**)&image, coord4);                             \
   }                                                                     \
 
 IMPLEMENT_WRITE_IMAGE_INT_COORD(image2d_t, uint4, ui, int2)
diff --git a/lib/llvmopencl/AllocasToEntry.cc b/lib/llvmopencl/AllocasToEntry.cc
index a3dffff..0b4193b 100644
--- a/lib/llvmopencl/AllocasToEntry.cc
+++ b/lib/llvmopencl/AllocasToEntry.cc
@@ -21,18 +21,13 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "config.h"
 #include <sstream>
 #include <iostream>
 
-#if !defined LLVM_3_2 && !defined LLVM_3_3
-#  include <llvm/IR/Constants.h>
-#endif
-#ifdef LLVM_3_2
-#  include <llvm/Instructions.h>
-#else
-#  include <llvm/IR/Instructions.h>
-#endif
+#include "config.h"
+
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
 
 #include "AllocasToEntry.h"
 
@@ -59,7 +54,7 @@ AllocasToEntry::runOnFunction(Function &F)
   // This solves problem with dynamic stack objects that are 
   // not supported by some targets (TCE).
   Function::iterator I = F.begin();
-  Instruction *firstInsertionPt = (I++)->getFirstInsertionPt();
+  Instruction *firstInsertionPt = &*(I++)->getFirstInsertionPt();
     
   bool changed = false;
   for (Function::iterator E = F.end(); I != E; ++I) {
diff --git a/lib/llvmopencl/AllocasToEntry.h b/lib/llvmopencl/AllocasToEntry.h
index 6cd6ae3..4d6a6d8 100644
--- a/lib/llvmopencl/AllocasToEntry.h
+++ b/lib/llvmopencl/AllocasToEntry.h
@@ -25,12 +25,8 @@
 #define _POCL_ALLOCAS_TO_ENTRY_H
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Function.h"
-#else
-#include "llvm/IR/Function.h"
-#endif
 
+#include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 
diff --git a/lib/llvmopencl/AutomaticLocals.cc b/lib/llvmopencl/AutomaticLocals.cc
index 8342b2c..eee1f95 100644
--- a/lib/llvmopencl/AutomaticLocals.cc
+++ b/lib/llvmopencl/AutomaticLocals.cc
@@ -23,30 +23,14 @@
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "config.h"
 #include "pocl.h"
-#include "Workgroup.h"
+
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#ifdef LLVM_3_1
-#include "llvm/Target/TargetData.h"
-#elif defined LLVM_3_2
-#include "llvm/DataLayout.h"
-#else
 #include "llvm/IR/DataLayout.h"
-#endif
 
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Argument.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -54,9 +38,9 @@ IGNORE_COMPILER_WARNING("-Wunused-parameter")
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
-#endif
 
 #include "LLVMUtils.h"
+#include "Workgroup.h"
 
 POP_COMPILER_DIAGS
 
@@ -85,9 +69,7 @@ static RegisterPass<AutomaticLocals> X("automatic-locals",
 
 void
 AutomaticLocals::getAnalysisUsage(AnalysisUsage &AU) const {
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DataLayout>();
-#elif (LLVM_OLDER_THAN_3_7)
+#if (LLVM_OLDER_THAN_3_7)
   AU.addRequired<DataLayoutPass>();
 #endif
 }
@@ -108,7 +90,7 @@ AutomaticLocals::runOnModule(Module &M)
     if (!Workgroup::isKernelToProcess(*mi))
       continue;
   
-    Function *F = mi;
+    Function *F = &*mi;
 
     Function *new_kernel = ProcessAutomaticLocals(F);
     if (new_kernel != F)
@@ -152,7 +134,7 @@ AutomaticLocals::ProcessAutomaticLocals(Function *F)
     std::string funcName = "";
     funcName = F->getName().str();
     if (is_automatic_local(funcName, *i)) {
-      locals.push_back(i);
+      locals.push_back(&*i);
       // Add the parameters to the end of the function parameter list.
       parameters.push_back(i->getType());
     }
@@ -179,13 +161,13 @@ AutomaticLocals::ProcessAutomaticLocals(Function *F)
          e = F->arg_end();
        i != e; ++i) {
     j->setName(i->getName());
-    vv[i] = j;
+    vv[&*i] = &*j;
     ++j;
   }
   
   for (int i = 0; j != new_kernel->arg_end(); ++i, ++j) {
     j->setName("_local" + Twine(i));
-    vv[locals[i]] = j;
+    vv[locals[i]] = &*j;
   }
                                  
   SmallVector<ReturnInst *, 1> ri;
diff --git a/lib/llvmopencl/Barrier.h b/lib/llvmopencl/Barrier.h
index eb25ffc..e46e7e7 100644
--- a/lib/llvmopencl/Barrier.h
+++ b/lib/llvmopencl/Barrier.h
@@ -21,17 +21,12 @@
 // THE SOFTWARE.
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Instructions.h"
-#include "llvm/Function.h"
-#include "llvm/Module.h"
-#else
+
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/IR/GlobalValue.h"
-#endif
 
 #include "llvm/Support/Casting.h"
 
diff --git a/lib/llvmopencl/BarrierBlock.cc b/lib/llvmopencl/BarrierBlock.cc
index d6c3a5b..4281515 100644
--- a/lib/llvmopencl/BarrierBlock.cc
+++ b/lib/llvmopencl/BarrierBlock.cc
@@ -20,15 +20,14 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "BarrierBlock.h"
-#include "Barrier.h"
+#include <cassert>
+
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Instructions.h"
-#else
+
 #include "llvm/IR/Instructions.h"
-#endif
-#include <cassert>
+
+#include "BarrierBlock.h"
+#include "Barrier.h"
 
 using namespace llvm;
 using namespace pocl;
diff --git a/lib/llvmopencl/BarrierBlock.h b/lib/llvmopencl/BarrierBlock.h
index a920142..1e9864a 100644
--- a/lib/llvmopencl/BarrierBlock.h
+++ b/lib/llvmopencl/BarrierBlock.h
@@ -21,11 +21,8 @@
 // THE SOFTWARE.
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/BasicBlock.h"
-#else
+
 #include "llvm/IR/BasicBlock.h"
-#endif
 
 #ifndef _POCL_BARRIER_BLOCK_H
 #define _POCL_BARRIER_BLOCK_H
diff --git a/lib/llvmopencl/BarrierTailReplication.cc b/lib/llvmopencl/BarrierTailReplication.cc
index 3a9b96b..1f059b3 100644
--- a/lib/llvmopencl/BarrierTailReplication.cc
+++ b/lib/llvmopencl/BarrierTailReplication.cc
@@ -21,28 +21,24 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <iostream>
+#include <algorithm>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "config.h"
-#include "BarrierTailReplication.h"
-#include "Barrier.h"
-#include "Workgroup.h"
+#include "pocl.h"
+
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#else
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
-#endif
 
+#include "BarrierTailReplication.h"
+#include "Barrier.h"
+#include "Workgroup.h"
 #include "VariableUniformityAnalysis.h"
 
-#include <iostream>
-#include <algorithm>
-
 POP_COMPILER_DIAGS
 
 using namespace llvm;
@@ -63,13 +59,8 @@ char BarrierTailReplication::ID = 0;
 void
 BarrierTailReplication::getAnalysisUsage(AnalysisUsage &AU) const
 {
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DominatorTree>();
-  AU.addPreserved<DominatorTree>();
-#else
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
-#endif
 #ifdef LLVM_OLDER_THAN_3_7
   AU.addRequired<LoopInfo>();
   AU.addPreserved<LoopInfo>();
@@ -91,12 +82,8 @@ BarrierTailReplication::runOnFunction(Function &F)
   std::cerr << "### BTR on " << F.getName().str() << std::endl;
 #endif
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DT = &getAnalysis<DominatorTree>();
-#else
   DTP = &getAnalysis<DominatorTreeWrapperPass>();
   DT = &DTP->getDomTree();
-#endif
 
 #ifdef LLVM_OLDER_THAN_3_7
   LI = &getAnalysis<LoopInfo>();
@@ -106,11 +93,7 @@ BarrierTailReplication::runOnFunction(Function &F)
 
   bool changed = ProcessFunction(F);
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DT->verifyAnalysis();
-#else
   DT->verifyDomTree();
-#endif
 
   LI->verifyAnalysis();
   /* The created tails might contain PHI nodes with operands 
@@ -120,7 +103,7 @@ BarrierTailReplication::runOnFunction(Function &F)
   for (Function::iterator i = F.begin(), e = F.end();
        i != e; ++i)
     {
-      llvm::BasicBlock *bb = i;
+      llvm::BasicBlock *bb = &*i;
       changed |= CleanupPHIs(bb);
     }      
 
@@ -236,17 +219,9 @@ BarrierTailReplication::ReplicateJoinedSubgraphs(BasicBlock *dominator,
       {
         // We have modified the function. Possibly created new loops.
         // Update analysis passes.
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-        DT->runOnFunction(*f);
-#else
         DTP->runOnFunction(*f);
-#endif
 
-#ifdef LLVM_3_1
-        LI->getBase().Calculate(DT->getBase());
-#else
         LI->runOnFunction(*f);
-#endif
       }
   }
   processed_bbs.insert(subgraph_entry);
@@ -381,7 +356,7 @@ BarrierTailReplication::ReplicateBasicBlocks(BasicBlockVector &new_graph,
     for (BasicBlock::iterator i2 = b->begin(), e2 = b->end();
 	 i2 != e2; ++i2) {
       Instruction *i = i2->clone();
-      reference_map.insert(std::make_pair(i2, i));
+      reference_map.insert(std::make_pair(&*i2, i));
       new_b->getInstList().push_back(i);
     }
 
@@ -437,7 +412,7 @@ BarrierTailReplication::UpdateReferences(const BasicBlockVector &graph,
     BasicBlock *b = *i;
     for (BasicBlock::iterator i2 = b->begin(), e2 = b->end();
          i2 != e2; ++i2) {
-      Instruction *i = i2;
+      Instruction *i = &*i2;
       RemapInstruction(i, reference_map,
                        RF_IgnoreMissingEntries | RF_NoModuleLevelChanges);
     }
diff --git a/lib/llvmopencl/BarrierTailReplication.h b/lib/llvmopencl/BarrierTailReplication.h
index 08efcd6..d485643 100644
--- a/lib/llvmopencl/BarrierTailReplication.h
+++ b/lib/llvmopencl/BarrierTailReplication.h
@@ -21,29 +21,19 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <map>
+#include <set>
+
 #ifndef POCL_BARRIER_TAIL_REPLICATION
 #define POCL_BARRIER_TAIL_REPLICATION
 
-#include "config.h"
 #include "pocl.h"
 
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Function.h"
-#else
 #include "llvm/IR/Function.h"
-#endif
-
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-#include "llvm/Analysis/Dominators.h"
-#else
 #include "llvm/IR/Dominators.h"
-#endif
-
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include <map>
-#include <set>
 
 namespace pocl {
   class Workgroup;
@@ -64,9 +54,7 @@ namespace pocl {
     typedef std::map<llvm::Value *, llvm::Value *> ValueValueMap;
 
     llvm::DominatorTree *DT;
-#if ! (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
     llvm::DominatorTreeWrapperPass *DTP;
-#endif
 
 #ifdef LLVM_OLDER_THAN_3_7
     llvm::LoopInfo *LI;
diff --git a/lib/llvmopencl/BreakConstantGEPs.cpp b/lib/llvmopencl/BreakConstantGEPs.cpp
index 8003772..b19e38f 100644
--- a/lib/llvmopencl/BreakConstantGEPs.cpp
+++ b/lib/llvmopencl/BreakConstantGEPs.cpp
@@ -18,38 +18,26 @@
 
 #define DEBUG_TYPE "break-constgeps"
 
+#include <iostream>
+#include <map>
+#include <utility>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "config.h"
 #include "pocl.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Constants.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instruction.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#else
+
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
-#endif
 #include "llvm/ADT/Statistic.h"
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-#include "llvm/Support/InstIterator.h"
-#else
 #include "llvm/IR/InstIterator.h"
-#endif
 
 #include "BreakConstantGEPs.h"
 #include "Workgroup.h"
 
-#include <iostream>
-#include <map>
-#include <utility>
-
 // Identifier variable for the pass
 char BreakConstantGEPs::ID = 0;
 
@@ -85,9 +73,7 @@ hasConstantGEP (Value * V) {
         CE->getOpcode() == Instruction::GetElementPtr ||
         CE->getOpcode() == Instruction::BitCast;
 
-#if !(defined(LLVM_3_2) || defined(LLVM_3_3))
     isGEPOrCast |= CE->getOpcode() == Instruction::AddrSpaceCast;
-#endif
     if (isGEPOrCast) {
       return CE;
     } else {
@@ -204,9 +190,7 @@ convertExpression (ConstantExpr * CE, Instruction * InsertPt) {
     case Instruction::FPExt:
     case Instruction::PtrToInt:
     case Instruction::IntToPtr:
-#if !(defined(LLVM_3_2) || defined(LLVM_3_3))
     case Instruction::AddrSpaceCast:
-#endif
     case Instruction::BitCast: {
       Instruction::CastOps Op = (Instruction::CastOps)(CE->getOpcode());
       NewInst = CastInst::Create (Op,
@@ -221,7 +205,7 @@ convertExpression (ConstantExpr * CE, Instruction * InsertPt) {
     case Instruction:: ICmp: {
       Instruction::OtherOps Op = (Instruction::OtherOps)(CE->getOpcode());
       NewInst = CmpInst::Create (Op,
-                                 CE->getPredicate(),
+                                 (llvm::CmpInst::Predicate)CE->getPredicate(),
                                  CE->getOperand(0),
                                  CE->getOperand(1),
                                  CE->getName(),
@@ -284,7 +268,7 @@ BreakConstantGEPs::runOnFunction (Function & F) {
       // Scan through the operands of this instruction.  If it is a constant
       // expression GEP, insert an instruction GEP before the instruction.
       //
-      Instruction * I = i;
+      Instruction * I = &*i;
       for (unsigned index = 0; index < I->getNumOperands(); ++index) {
         if (hasConstantGEP (I->getOperand(index))) {
           Worklist.push_back (I);
diff --git a/lib/llvmopencl/BreakConstantGEPs.h b/lib/llvmopencl/BreakConstantGEPs.h
index 6af75cd..983e497 100644
--- a/lib/llvmopencl/BreakConstantGEPs.h
+++ b/lib/llvmopencl/BreakConstantGEPs.h
@@ -20,14 +20,8 @@
 #define BREAKCONSTANTGEPS_H
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Module.h"
-#else
+
 #include "llvm/IR/Module.h"
-#endif
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-#include "llvm/Analysis/Dominators.h"
-#endif
 #include "llvm/Pass.h"
 
 using namespace llvm;
diff --git a/lib/llvmopencl/CanonicalizeBarriers.cc b/lib/llvmopencl/CanonicalizeBarriers.cc
index 89349d5..80027c7 100644
--- a/lib/llvmopencl/CanonicalizeBarriers.cc
+++ b/lib/llvmopencl/CanonicalizeBarriers.cc
@@ -21,31 +21,24 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "pocl.h"
-#include "config.h"
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Dominators.h"
+
 #include "CanonicalizeBarriers.h"
 #include "BarrierBlock.h"
 #include "Barrier.h"
 #include "Workgroup.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <iostream>
-
 #include "VariableUniformityAnalysis.h"
 
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#else
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#endif
-
-#if ! (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-#include "llvm/IR/Dominators.h"
-#endif
 POP_COMPILER_DIAGS
 
 using namespace llvm;
@@ -62,11 +55,7 @@ char CanonicalizeBarriers::ID = 0;
 void
 CanonicalizeBarriers::getAnalysisUsage(AnalysisUsage &AU) const
 {
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DominatorTree>();
-#else
   AU.addRequired<DominatorTreeWrapperPass>();
-#endif
   AU.addPreserved<VariableUniformityAnalysis>();    
 }
 
@@ -93,7 +82,7 @@ CanonicalizeBarriers::runOnFunction(Function &F)
   }
 
   for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
-    BasicBlock *b = i;
+    BasicBlock *b = &*i;
     TerminatorInst *t = b->getTerminator();
 
     const bool isExitNode = 
@@ -128,11 +117,7 @@ CanonicalizeBarriers::runOnFunction(Function &F)
     }
   }
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DT = getAnalysisIfAvailable<DominatorTree>();
-#else
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-#endif
   return ProcessFunction(F);
 }
 
@@ -150,11 +135,11 @@ CanonicalizeBarriers::ProcessFunction(Function &F) {
 
   for (Function::iterator i = F.begin(), e = F.end();
        i != e; ++i) {
-    BasicBlock *b = i;
+    BasicBlock *b = &*i;
     for (BasicBlock::iterator i = b->begin(), e = b->end();
          i != e; ++i) {
       if (isa<Barrier>(i)) {
-        Barriers.insert(i);
+        Barriers.insert(&*i);
       }
     }
   }
@@ -224,7 +209,7 @@ CanonicalizeBarriers::ProcessFunction(Function &F) {
     emptyRegionDeleted = false;
     for (Function::iterator i = F.begin(), e = F.end();
          i != e; ++i) {
-        BasicBlock *b = i;
+        BasicBlock *b = &*i;
         llvm::TerminatorInst *t = b->getTerminator();
         if (!Barrier::endsWithBarrier(b) || t->getNumSuccessors() != 1) 
           continue;
diff --git a/lib/llvmopencl/CanonicalizeBarriers.h b/lib/llvmopencl/CanonicalizeBarriers.h
index 7d7b878..99bac82 100644
--- a/lib/llvmopencl/CanonicalizeBarriers.h
+++ b/lib/llvmopencl/CanonicalizeBarriers.h
@@ -20,15 +20,13 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <set>
+
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Function.h"
-#else
+
 #include "llvm/IR/Function.h"
-#endif
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Pass.h"
-#include <set>
 
 namespace pocl {
   class Workgroup;
diff --git a/lib/llvmopencl/DebugHelpers.cc b/lib/llvmopencl/DebugHelpers.cc
index 7e2444b..0a7dc35 100644
--- a/lib/llvmopencl/DebugHelpers.cc
+++ b/lib/llvmopencl/DebugHelpers.cc
@@ -20,35 +20,26 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "CompilerWarnings.h"
-IGNORE_COMPILER_WARNING("-Wunused-parameter")
-
 #include <fstream>
 #include <iostream>
 #include <sstream>
 #include <set>
 
-#include "DebugHelpers.h"
+#include "CompilerWarnings.h"
+IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "config.h"
 #include "pocl.h"
 
-#include "Barrier.h"
-#include "BarrierBlock.h"
-#include "Workgroup.h"
-
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
-#endif
-
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
+#include "DebugHelpers.h"
+#include "Barrier.h"
+#include "BarrierBlock.h"
+#include "Workgroup.h"
+
 POP_COMPILER_DIAGS
 
 using namespace llvm;
@@ -177,14 +168,14 @@ void dumpCFG(
   }
 
   for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
-    BasicBlock *b = i;
+    BasicBlock *b = &*i;
     if (regionBBs.find(b) != regionBBs.end()) continue;
     printBasicBlock
       (b, s, highlights != NULL && highlights->find(b) != highlights->end());
   }
 
   for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
-    BasicBlock *b = i;
+    BasicBlock *b = &*i;
     printBranches
       (b, s, highlights != NULL && highlights->find(b) != highlights->end());
   }
@@ -200,7 +191,7 @@ bool chopBBs(llvm::Function &F, llvm::Pass &P) {
   do {
     fchanged = false;
     for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
-      BasicBlock *b = i;
+      BasicBlock *b = &*i;
       
       if (b->size() > MAX_INSTRUCTIONS_PER_BB + 1)
         {
@@ -214,7 +205,7 @@ bool chopBBs(llvm::Function &F, llvm::Pass &P) {
 #ifdef LLVM_OLDER_THAN_3_7
           SplitBlock(b, splitPoint, &P);
 #else
-          SplitBlock(b, splitPoint);
+          SplitBlock(b, &*splitPoint);
 #endif
           fchanged = true;
           break;
diff --git a/lib/llvmopencl/DebugHelpers.h b/lib/llvmopencl/DebugHelpers.h
index a5830cd..d83fb0a 100644
--- a/lib/llvmopencl/DebugHelpers.h
+++ b/lib/llvmopencl/DebugHelpers.h
@@ -24,20 +24,16 @@
 #define _POCL_DEBUG_HELPERS_H
 
 #include <string>
-
-#include "ParallelRegion.h"
+#if _MSC_VER
+#  include <set>
+#endif
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Function.h"
-#else
+
 #include "llvm/IR/Function.h"
-#endif
 #include "llvm/Pass.h"
 
-#if _MSC_VER
-#  include <set>
-#endif
+#include "ParallelRegion.h"
 
 namespace pocl {
   // View CFG with visual aids to debug kernel compiler problems.
diff --git a/lib/llvmopencl/Flatten.cc b/lib/llvmopencl/Flatten.cc
index 1756df1..fb7898c 100644
--- a/lib/llvmopencl/Flatten.cc
+++ b/lib/llvmopencl/Flatten.cc
@@ -22,21 +22,20 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <iostream>
+#include <string>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "config.h"
-#include <iostream>
-#include <string>
-#include "Workgroup.h"
+
 #include "llvm/Support/CommandLine.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Pass.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/Module.h"
-#endif
+
+#include "Workgroup.h"
 
 POP_COMPILER_DIAGS
 
@@ -71,21 +70,11 @@ Flatten::runOnModule(Module &M)
   bool changed = false;
   for (llvm::Module::iterator i = M.begin(), e = M.end(); i != e; ++i)
     {
-      llvm::Function *f = i;
+      llvm::Function *f = &*i;
       if (f->isDeclaration()) continue;
       if (KernelName == f->getName() || 
           (KernelName == "" && pocl::Workgroup::isKernelToProcess(*f)))
         {
-#ifdef LLVM_3_1
-          f->removeFnAttr(Attribute::AlwaysInline);
-          f->addFnAttr(Attribute::NoInline);
-#elif defined LLVM_3_2
-          AttrBuilder b;
-          f->removeFnAttr
-            (Attributes::get(M.getContext(), 
-                             b.addAttribute(Attributes::AlwaysInline)));
-          f->addFnAttr(Attributes::NoInline);
-#else
           AttributeSet attrs;
           f->removeAttributes(
               AttributeSet::FunctionIndex, 
@@ -94,7 +83,6 @@ Flatten::runOnModule(Module &M)
                AttributeSet::FunctionIndex, Attribute::AlwaysInline));
 
           f->addFnAttr(Attribute::NoInline);
-#endif
 
           f->setLinkage(llvm::GlobalValue::ExternalLinkage);
           changed = true;
@@ -104,16 +92,6 @@ Flatten::runOnModule(Module &M)
         } 
       else
         {
-#ifdef LLVM_3_1
-          f->removeFnAttr(Attribute::NoInline);
-          f->addFnAttr(Attribute::AlwaysInline);
-#elif defined LLVM_3_2
-          AttrBuilder b;
-          f->removeFnAttr(Attributes::get
-                          (M.getContext(), 
-                           b.addAttribute(Attributes::NoInline)));
-          f->addFnAttr(Attributes::AlwaysInline);
-#else
           AttributeSet attrs;
           f->removeAttributes(
               AttributeSet::FunctionIndex, 
@@ -121,7 +99,6 @@ Flatten::runOnModule(Module &M)
                                  AttributeSet::FunctionIndex, 
                                  Attribute::NoInline));
           f->addFnAttr(Attribute::AlwaysInline);
-#endif
 
           f->setLinkage(llvm::GlobalValue::InternalLinkage);
           changed = true;
@@ -166,11 +143,7 @@ Flatten::runOnModule(Module &M)
 
     for (Value::use_iterator i = v->use_begin(), e = v->use_end();
 	 i != e; ++i) {
-#if defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4
-      llvm::User *user = *i;
-#else
       llvm::User *user = i->getUser();
-#endif
       if (Instruction *ci = dyn_cast<Instruction>(user)) {
         // Prevent infinite looping on recursive functions
         // (though OpenCL does not allow this?)
diff --git a/lib/llvmopencl/GenerateHeader.cc b/lib/llvmopencl/GenerateHeader.cc
index a7f70e7..fa8510b 100644
--- a/lib/llvmopencl/GenerateHeader.cc
+++ b/lib/llvmopencl/GenerateHeader.cc
@@ -20,35 +20,22 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "config.h"
 #include "pocl.h"
-#include "Workgroup.h"
+
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#ifdef LLVM_3_1
-#include "llvm/Target/TargetData.h"
-#elif defined LLVM_3_2
-#include "llvm/DataLayout.h"
-#else
 #include "llvm/IR/DataLayout.h"
-#endif
 
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Argument.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -56,8 +43,8 @@ IGNORE_COMPILER_WARNING("-Wunused-parameter")
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
-#endif
 
+#include "Workgroup.h"
 #include "LLVMUtils.h"
 
 POP_COMPILER_DIAGS
@@ -99,13 +86,10 @@ static RegisterPass<GenerateHeader> X("generate-header",
 void
 GenerateHeader::getAnalysisUsage(AnalysisUsage &AU) const
 {
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DataLayout>();
-#elif (defined LLVM_OLDER_THAN_3_7)
-  AU.addRequired<DataLayoutPass>();
-#else
-  // In LLVM 3.7, DataLayout is not a pass anymore, it can be created from 
+  // In LLVM 3.7, DataLayout is not a pass anymore, it can be created from
   // a llvm::Module
+#ifdef LLVM_OLDER_THAN_3_7
+  AU.addRequired<DataLayoutPass>();
 #endif
 }
 
@@ -119,25 +103,15 @@ GenerateHeader::runOnModule(Module &M)
   // kernels
   FunctionMapping kernels;
 
-  #if LLVM_VERSION_MAJOR <= 3 && LLVM_VERSION_MINOR <6
-  string ErrorInfo;
-  #else
   std::error_code ErrorInfo;
-  #endif
 
-  #if defined LLVM_3_2 || defined LLVM_3_3 
-  raw_fd_ostream out(Header.c_str(), ErrorInfo, raw_fd_ostream::F_Append);
-  #elif defined LLVM_3_4 || defined LLVM_3_5
-  raw_fd_ostream out(Header.c_str(), ErrorInfo, sys::fs::F_Append);
-  #else
   raw_fd_ostream out(Header, ErrorInfo, sys::fs::F_Append);
-  #endif
 
   for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) {
     if (!Workgroup::isKernelToProcess(*mi))
       continue;
   
-    Function *F = mi;
+    Function *F = &*mi;
 
     ProcessPointers(F, out);    
     ProcessReqdWGSize(F, out);
@@ -165,7 +139,6 @@ GenerateHeader::runOnModule(Module &M)
   return changed;
 }
 
-#include <iostream>
 
 void
 GenerateHeader::ProcessReqdWGSize(Function *F,
@@ -178,13 +151,6 @@ GenerateHeader::ProcessReqdWGSize(Function *F,
   if (size_info) {
     for (unsigned i = 0, e = size_info->getNumOperands(); i != e; ++i) {
       llvm::MDNode *KernelSizeInfo = size_info->getOperand(i);
-#ifdef LLVM_OLDER_THAN_3_6
-      if (KernelSizeInfo->getOperand(0) != F) 
-        continue;
-      LocalSizeX = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(1)))->getLimitedValue();
-      LocalSizeY = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(2)))->getLimitedValue();
-      LocalSizeZ = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(3)))->getLimitedValue();
-#else
       if (dyn_cast<ValueAsMetadata>(KernelSizeInfo->getOperand(0).get())->getValue() != F) 
         continue;
       LocalSizeX = (llvm::cast<ConstantInt>(
@@ -196,7 +162,6 @@ GenerateHeader::ProcessReqdWGSize(Function *F,
       LocalSizeZ = (llvm::cast<ConstantInt>(
                      llvm::dyn_cast<ConstantAsMetadata>(
                        KernelSizeInfo->getOperand(3))->getValue()))->getLimitedValue();
-#endif
       break;
     }
   }
@@ -300,10 +265,7 @@ GenerateHeader::ProcessAutomaticLocals(Function *F,
                                        raw_fd_ostream &out)
 {
   Module *M = F->getParent();
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DataLayout &TDr = getAnalysis<DataLayout>();
-  DataLayout *TD=&TDr;
-#elif (defined LLVM_OLDER_THAN_3_7)
+#ifdef  LLVM_OLDER_THAN_3_7
   const DataLayout *TD = &getAnalysis<DataLayoutPass>().getDataLayout();
 #else
   const DataLayout DLayout(F->getParent());
@@ -327,7 +289,7 @@ GenerateHeader::ProcessAutomaticLocals(Function *F,
       // Additional checks might be needed here. For now
       // we assume any global starting with kernel name
       // is declaring a local variable.
-      locals.push_back(i);
+      locals.push_back(&*i);
       // Add the parameters to the end of the function parameter list.
       parameters.push_back(i->getType());
     }
diff --git a/lib/llvmopencl/ImplicitConditionalBarriers.cc b/lib/llvmopencl/ImplicitConditionalBarriers.cc
index 6b3c2c2..340d7bf 100644
--- a/lib/llvmopencl/ImplicitConditionalBarriers.cc
+++ b/lib/llvmopencl/ImplicitConditionalBarriers.cc
@@ -21,27 +21,22 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "config.h"
-#include "ImplicitConditionalBarriers.h"
-#include "Barrier.h"
-#include "BarrierBlock.h"
-#include "Workgroup.h"
+
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
-#endif
-
-#include <iostream>
 
+#include "ImplicitConditionalBarriers.h"
+#include "Barrier.h"
+#include "BarrierBlock.h"
+#include "Workgroup.h"
 #include "VariableUniformityAnalysis.h"
 
 POP_COMPILER_DIAGS
@@ -64,13 +59,8 @@ ImplicitConditionalBarriers::getAnalysisUsage(AnalysisUsage &AU) const
 {
   AU.addRequired<PostDominatorTree>();
   AU.addPreserved<PostDominatorTree>();
-  #if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DominatorTree>();
-  AU.addPreserved<DominatorTree>();
-  #else
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
-  #endif
   AU.addPreserved<VariableUniformityAnalysis>();
 }
 
@@ -83,11 +73,7 @@ BasicBlock*
 ImplicitConditionalBarriers::firstNonBackedgePredecessor(
     llvm::BasicBlock *bb) {
 
-    #if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-    DominatorTree *DT = &getAnalysis<DominatorTree>();
-    #else
     DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    #endif
 
     pred_iterator I = pred_begin(bb), E = pred_end(bb);
     if (I == E) return NULL;
@@ -110,7 +96,7 @@ ImplicitConditionalBarriers::runOnFunction(Function &F) {
   typedef std::vector<BasicBlock*> BarrierBlockIndex;
   BarrierBlockIndex conditionalBarriers;
   for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
-    BasicBlock *b = i;
+    BasicBlock *b = &*i;
     if (!Barrier::hasBarrier(b)) continue;
 
     // Unconditional barrier postdominates the entry node.
diff --git a/lib/llvmopencl/ImplicitConditionalBarriers.h b/lib/llvmopencl/ImplicitConditionalBarriers.h
index 32920fa..94e3404 100644
--- a/lib/llvmopencl/ImplicitConditionalBarriers.h
+++ b/lib/llvmopencl/ImplicitConditionalBarriers.h
@@ -21,12 +21,8 @@
 // THE SOFTWARE.
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Function.h"
-#else
-#include "llvm/IR/Function.h"
-#endif
 
+#include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/PostDominators.h"
 
diff --git a/lib/llvmopencl/ImplicitLoopBarriers.cc b/lib/llvmopencl/ImplicitLoopBarriers.cc
index 23fcef1..a6ba8b6 100644
--- a/lib/llvmopencl/ImplicitLoopBarriers.cc
+++ b/lib/llvmopencl/ImplicitLoopBarriers.cc
@@ -21,28 +21,21 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <iostream>
+
 #include "config.h"
-#include "ImplicitLoopBarriers.h"
-#include "Barrier.h"
-#include "Workgroup.h"
+
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
-#endif
-#if ! (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
 #include "llvm/IR/Dominators.h"
-#endif
 
+#include "ImplicitLoopBarriers.h"
+#include "Barrier.h"
+#include "Workgroup.h"
 #include "VariableUniformityAnalysis.h"
 
-#include <iostream>
-
 //#define DEBUG_ILOOP_BARRIERS
 
 using namespace llvm;
@@ -57,13 +50,8 @@ namespace {
 char ImplicitLoopBarriers::ID = 0;
 
 void ImplicitLoopBarriers::getAnalysisUsage(AnalysisUsage &AU) const {
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DominatorTree>();
-  AU.addPreserved<DominatorTree>();
-#else
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
-#endif
   AU.addRequired<VariableUniformityAnalysis>();
   AU.addPreserved<VariableUniformityAnalysis>();
 }
diff --git a/lib/llvmopencl/ImplicitLoopBarriers.h b/lib/llvmopencl/ImplicitLoopBarriers.h
index ff112dc..dd4fa12 100644
--- a/lib/llvmopencl/ImplicitLoopBarriers.h
+++ b/lib/llvmopencl/ImplicitLoopBarriers.h
@@ -20,9 +20,10 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "llvm/Analysis/LoopPass.h"
 #include <set>
 
+#include "llvm/Analysis/LoopPass.h"
+
 namespace pocl {
   class ImplicitLoopBarriers : public llvm::LoopPass {
     
diff --git a/lib/llvmopencl/IsolateRegions.cc b/lib/llvmopencl/IsolateRegions.cc
index 9120b7f..d160e22 100644
--- a/lib/llvmopencl/IsolateRegions.cc
+++ b/lib/llvmopencl/IsolateRegions.cc
@@ -20,18 +20,20 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "IsolateRegions.h"
-#include "Barrier.h"
-#include "Workgroup.h"
-#include "llvm/Analysis/RegionInfo.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "config.h"
 #include "pocl.h"
 
-#include <iostream>
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#include "IsolateRegions.h"
+#include "Barrier.h"
+#include "Workgroup.h"
 #include "VariableUniformityAnalysis.h"
 
 POP_COMPILER_DIAGS
@@ -169,11 +171,7 @@ IsolateRegions::addDummyBefore(llvm::Region *R, llvm::BasicBlock *bb) {
     if (R->contains(pred))
       regionPreds.push_back(pred);
   }
-#ifdef LLVM_3_0
-  llvm::BasicBlock* newExit = 
-    SplitBlockPredecessors
-    (bb, &regionPreds[0], regionPreds.size(), ".r_exit", this);
-#elif (defined LLVM_OLDER_THAN_3_7)
+#ifdef LLVM_OLDER_THAN_3_7
   llvm::BasicBlock* newExit = 
     SplitBlockPredecessors(bb, regionPreds, ".r_exit", this);
 #else
diff --git a/lib/llvmopencl/Kernel.cc b/lib/llvmopencl/Kernel.cc
index 8fc8485..fe17fae 100644
--- a/lib/llvmopencl/Kernel.cc
+++ b/lib/llvmopencl/Kernel.cc
@@ -21,27 +21,22 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "Kernel.h"
-#include "Barrier.h"
-#include <iostream>
+#include "pocl.h"
 
-#include "config.h"
-#ifdef LLVM_3_1
-#include "llvm/Support/IRBuilder.h"
-#elif defined LLVM_3_2
-#include "llvm/IRBuilder.h"
-#include "llvm/InlineAsm.h"
-#else
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
-#endif
-POP_COMPILER_DIAGS
 
+#include "Kernel.h"
+#include "Barrier.h"
 #include "DebugHelpers.h"
 
+POP_COMPILER_DIAGS
+
 using namespace llvm;
 using namespace pocl;
 
@@ -281,9 +276,7 @@ Kernel::addLocalSizeInitCode(size_t LocalSizeX, size_t LocalSizeY, size_t LocalS
   llvm::Module* M = getParent();
 
   int size_t_width = 32;
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  if (M->getPointerSize() == llvm::Module::Pointer64)
-#elif (defined LLVM_3_5 || defined LLVM_3_6) 
+#ifdef LLVM_OLDER_THAN_3_7
   // This breaks (?) if _local_size_x is not stored in AS0,
   // but it always will be as it's just a pseudo variable that
   // will be scalarized.
diff --git a/lib/llvmopencl/Kernel.h b/lib/llvmopencl/Kernel.h
index 28f078c..9ff9dd9 100644
--- a/lib/llvmopencl/Kernel.h
+++ b/lib/llvmopencl/Kernel.h
@@ -23,14 +23,11 @@
 #ifndef _POCL_KERNEL_H
 #define _POCL_KERNEL_H
 
-#include "ParallelRegion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LoopInfo.h"
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-#include "llvm/Analysis/Dominators.h"
-#else
 #include "llvm/IR/Dominators.h"
-#endif
+
+#include "ParallelRegion.h"
 
 namespace pocl {
 
diff --git a/lib/llvmopencl/LLVMFileUtils.cc b/lib/llvmopencl/LLVMFileUtils.cc
index be63ccb..1697f90 100644
--- a/lib/llvmopencl/LLVMFileUtils.cc
+++ b/lib/llvmopencl/LLVMFileUtils.cc
@@ -28,55 +28,15 @@
 
 #include "llvm/Bitcode/ReaderWriter.h"
 
-/* namespace of OpenFlags enum (F_Binary, F_Excl etc) */
-#if defined(LLVM_3_2) || defined(LLVM_3_3)
-#define OPEN_FLAGS_ENUM raw_fd_ostream
-#else
-#define OPEN_FLAGS_ENUM sys::fs
-#endif
-
-
-/* Older llvms:
- * llvm::error_code instead of std::error_code
- * "file existed" is an output argument, instead of "ignore if file existed"
- * different OpenFlags (no F_Binary)
- * File locking disabled: llvm::LockManager class doesn't exist in LLVM < 3.5
- */
-#if LLVM_OLDER_THAN_3_5
-
-#define DISABLE_LOCKMANAGER
-
-#include <llvm/Support/system_error.h>
-#define STD_ERROR_CODE llvm::error_code
-static bool existed;
-#define EXIST_ARG existed
-#define DEFAULT_OPEN_FLAGS OPEN_FLAGS_ENUM::F_Binary
-
-#else
-
 #include <llvm/Support/Errc.h>
-#define STD_ERROR_CODE std::error_code
-#define EXIST_ARG true
-#define DEFAULT_OPEN_FLAGS OPEN_FLAGS_ENUM::F_RW
-
-#endif
 
 #define RETURN_IF_EC if (ec) return ec.default_error_condition().value()
+#define OPEN_FOR_READ ec = sys::fs::openFileForRead(p, fd)
+#define OPEN_CREATE ec = sys::fs::openFileForWrite(p, fd, sys::fs::F_RW | sys::fs::F_Excl)
+#define OPEN_FOR_APPEND ec = sys::fs::openFileForWrite(p, fd, sys::fs::F_RW | sys::fs::F_Append)
 
-/* no openFile* functions in sys::fs before llvm 3.4, so fallback to open */
-#if defined(LLVM_3_2) || defined(LLVM_3_3)
-    #define OPEN_FOR_READ fd = open(path, O_RDONLY)
-    #define OPEN_CREATE fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR)
-    #define OPEN_FOR_APPEND fd = open(path, O_WRONLY | O_APPEND)
-    #define RETURN_IF_ERRNO if (fd < 0) return errno;
-#else
-    #define OPEN_FOR_READ ec = sys::fs::openFileForRead(p, fd)
-    #define OPEN_CREATE ec = sys::fs::openFileForWrite(p, fd, DEFAULT_OPEN_FLAGS | sys::fs::F_Excl)
-    #define OPEN_FOR_APPEND ec = sys::fs::openFileForWrite(p, fd, DEFAULT_OPEN_FLAGS | sys::fs::F_Append)
-    #define RETURN_IF_ERRNO RETURN_IF_EC
-#endif
-
-
+/* #define to disable locking completely */
+#undef DISABLE_LOCKMANAGER
 
 using namespace llvm;
 
@@ -84,7 +44,7 @@ using namespace llvm;
 
 int
 pocl_rm_rf(const char* path) {
-    STD_ERROR_CODE ec;
+    std::error_code ec;
     SmallString<128> DirNative;
 
     sys::path::native(Twine(path), DirNative);
@@ -95,13 +55,7 @@ pocl_rm_rf(const char* path) {
          Dir != DirEnd && !ec; Dir.increment(ec)) {
         Twine p = Dir->path();
         std::string s = p.str();
-#if defined(LLVM_3_2) || defined(LLVM_3_3)
-        sys::fs::file_status result;
-        sys::fs::status(p, result);
-        if (sys::fs::is_directory(result)) {
-#else
         if (sys::fs::is_directory(p)) {
-#endif
             DirSet.push_back(s);
         } else
             FileSet.push_back(s);
@@ -110,14 +64,14 @@ pocl_rm_rf(const char* path) {
 
     std::vector<std::string>::iterator it;
     for (it = FileSet.begin(); it != FileSet.end(); ++it) {
-        ec = sys::fs::remove(*it, EXIST_ARG);
+        ec = sys::fs::remove(*it, true);
         RETURN_IF_EC;
     }
 
     std::sort(DirSet.begin(), DirSet.end());
     std::vector<std::string>::reverse_iterator it2;
     for (it2 = DirSet.rbegin(); it2 != DirSet.rend(); ++it2) {
-        ec = sys::fs::remove(*it2, EXIST_ARG);
+        ec = sys::fs::remove(*it2, true);
         RETURN_IF_EC;
     }
 
@@ -128,14 +82,14 @@ pocl_rm_rf(const char* path) {
 int
 pocl_mkdir_p(const char* path) {
     Twine p(path);
-    STD_ERROR_CODE ec = sys::fs::create_directories(p, EXIST_ARG);
+    std::error_code ec = sys::fs::create_directories(p, true);
     return ec.default_error_condition().value();
 }
 
 int
 pocl_remove(const char* path) {
     Twine p(path);
-    STD_ERROR_CODE ec = sys::fs::remove(p, EXIST_ARG);
+    std::error_code ec = sys::fs::remove(p, true);
     return ec.default_error_condition().value();
 }
 
@@ -151,18 +105,18 @@ pocl_exists(const char* path) {
 int
 pocl_filesize(const char* path, uint64_t* res) {
     Twine p(path);
-    STD_ERROR_CODE ec = sys::fs::file_size(p, *res);
+    std::error_code ec = sys::fs::file_size(p, *res);
     return ec.default_error_condition().value();
 }
 
 int pocl_touch_file(const char* path) {
     Twine p(path);
-    STD_ERROR_CODE ec = sys::fs::remove(p, EXIST_ARG);
+    std::error_code ec = sys::fs::remove(p, true);
     RETURN_IF_EC;
 
     int fd;
     OPEN_CREATE;
-    RETURN_IF_ERRNO;
+    RETURN_IF_EC;
 
     return (close(fd) ? (-errno) : 0);
 
@@ -181,11 +135,11 @@ pocl_read_file(const char* path, char** content, uint64_t *filesize) {
     ssize_t fsize = (ssize_t)(*filesize);
     if (!errcode) {
         int fd;
-        STD_ERROR_CODE ec;
+        std::error_code ec;
         Twine p(path);
 
         OPEN_FOR_READ;
-        RETURN_IF_ERRNO;
+        RETURN_IF_EC;
 
         // +1 so we can later simply turn it into a C string, if needed
         *content = (char*)malloc(fsize+1);
@@ -210,7 +164,7 @@ int pocl_write_file(const char *path, const char* content,
                                     int         append,
                                     int         dont_rewrite) {
     int fd;
-    STD_ERROR_CODE ec;
+    std::error_code ec;
     Twine p(path);
 
     assert(path);
@@ -232,7 +186,7 @@ int pocl_write_file(const char *path, const char* content,
     else
         OPEN_CREATE;
 
-    RETURN_IF_ERRNO;
+    RETURN_IF_EC;
 
     if (write(fd, content, (ssize_t)count) < (ssize_t)count)
         return errno ? -errno : -1;
@@ -250,11 +204,7 @@ int pocl_write_module(void *module, const char* path, int dont_rewrite) {
     assert(path);
 
     Twine p(path);
-#ifdef LLVM_OLDER_THAN_3_6
-    std::string ec;
-#else
-    STD_ERROR_CODE ec;
-#endif
+    std::error_code ec;
 
     if (pocl_exists(path)) {
         if (dont_rewrite)
@@ -266,13 +216,8 @@ int pocl_write_module(void *module, const char* path, int dont_rewrite) {
         }
     }
 
-    raw_fd_ostream os(path, ec, DEFAULT_OPEN_FLAGS | OPEN_FLAGS_ENUM::F_Excl);
-#ifdef LLVM_OLDER_THAN_3_6
-    if (!ec.empty())
-        return 2;
-#else
+    raw_fd_ostream os(path, ec, sys::fs::F_RW | sys::fs::F_Excl);
     RETURN_IF_EC;
-#endif
 
     WriteBitcodeToFile((llvm::Module*)module, os);
     os.close();
diff --git a/lib/llvmopencl/LLVMUtils.cc b/lib/llvmopencl/LLVMUtils.cc
index b64b657..a0a496b 100644
--- a/lib/llvmopencl/LLVMUtils.cc
+++ b/lib/llvmopencl/LLVMUtils.cc
@@ -23,15 +23,9 @@
 #include "LLVMUtils.h"
 
 #include "pocl.h"
-#include "config.h"
 
-#ifdef LLVM_3_2
-#include <llvm/Module.h>
-#include <llvm/Metadata.h>
-#else
 #include <llvm/IR/Module.h>
 #include <llvm/IR/Metadata.h>
-#endif
 
 using namespace llvm;
 
@@ -61,24 +55,15 @@ regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels)
               Function *old_kernel = (*i).first;
               Function *new_kernel = (*i).second;
               Function *func_from_md;
-#ifdef LLVM_OLDER_THAN_3_6
-              func_from_md = dyn_cast<Function>(wgsizeMD->getOperand(0));
-#else
               func_from_md = dyn_cast<Function>(
                 dyn_cast<ValueAsMetadata>(wgsizeMD->getOperand(0))->getValue());
-#endif
               if (old_kernel == new_kernel || wgsizeMD->getNumOperands() == 0 ||
                   func_from_md != old_kernel) 
                 continue;
               // found a wg size metadata that points to the old kernel, copy its
               // operands except the first one to a new MDNode
-#ifdef LLVM_OLDER_THAN_3_6
-              SmallVector<Value*, 8> operands;
-              operands.push_back(new_kernel);
-#else
               SmallVector<Metadata*, 8> operands;
               operands.push_back(llvm::ValueAsMetadata::get(new_kernel));
-#endif
               for (unsigned opr = 1; opr < wgsizeMD->getNumOperands(); ++opr) {
                   operands.push_back(wgsizeMD->getOperand(opr));
               }
@@ -97,12 +82,8 @@ regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels)
   for (FunctionMapping::const_iterator i = kernels.begin(),
          e = kernels.end();
        i != e; ++i) {
-#ifdef LLVM_OLDER_THAN_3_6
-    MDNode *md = MDNode::get(M.getContext(), ArrayRef<Value *>((*i).second));
-#else
     MDNode *md = MDNode::get(M.getContext(), ArrayRef<Metadata *>(
       llvm::ValueAsMetadata::get((*i).second)));
-#endif
     nmd->addOperand(md);
   }
 }
diff --git a/lib/llvmopencl/LLVMUtils.h b/lib/llvmopencl/LLVMUtils.h
index ef86990..d0a7bf0 100644
--- a/lib/llvmopencl/LLVMUtils.h
+++ b/lib/llvmopencl/LLVMUtils.h
@@ -23,21 +23,14 @@
 #ifndef _POCL_LLVM_UTILS_H
 #define _POCL_LLVM_UTILS_H
 
-#include "pocl.h"
 #include <map>
 #include <string>
 
-#include "config.h"
+#include "pocl.h"
 
-#ifdef LLVM_3_2
-#include <llvm/Module.h>
-#include <llvm/Metadata.h>
-#include <llvm/DerivedTypes.h>
-#else
 #include <llvm/IR/Module.h>
 #include <llvm/IR/Metadata.h>
 #include <llvm/IR/DerivedTypes.h>
-#endif
 
 namespace llvm {
     class Module;
@@ -64,7 +57,7 @@ inline bool
 is_image_type(const llvm::Type& t) 
 {
   if (t.isPointerTy() && t.getPointerElementType()->isStructTy()) {
-    llvm::StringRef name = t.getPointerElementType()->getStructName().str();
+    llvm::StringRef name = t.getPointerElementType()->getStructName();
     if (name.startswith("opencl.image2d_t") || name.startswith("opencl.image3d_t") ||
         name.startswith("opencl.image1d_t") || name.startswith("struct._pocl_image"))
       return true;
@@ -77,7 +70,7 @@ is_sampler_type(const llvm::Type& t)
 {
   if (t.isPointerTy() && t.getPointerElementType()->isStructTy()) 
     {
-      llvm::StringRef name = t.getPointerElementType()->getStructName().str();
+      llvm::StringRef name = t.getPointerElementType()->getStructName();
       if (name.startswith("opencl.sampler_t_")) return true;     
     }
   return false;
diff --git a/lib/llvmopencl/LoopBarriers.cc b/lib/llvmopencl/LoopBarriers.cc
index 566a725..dc247dc 100644
--- a/lib/llvmopencl/LoopBarriers.cc
+++ b/lib/llvmopencl/LoopBarriers.cc
@@ -21,28 +21,18 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "config.h"
 #include "pocl.h"
 
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
-#endif
-
-#if ! (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
 #include "llvm/IR/Dominators.h"
-#endif
-
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <iostream>
 
 #include "LoopBarriers.h"
 #include "Barrier.h"
@@ -66,13 +56,8 @@ char LoopBarriers::ID = 0;
 void
 LoopBarriers::getAnalysisUsage(AnalysisUsage &AU) const
 {
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DominatorTree>();
-  AU.addPreserved<DominatorTree>();
-#else
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
-#endif
 
 }
 
@@ -85,19 +70,11 @@ LoopBarriers::runOnLoop(Loop *L, LPPassManager &LPM)
   if (!Workgroup::hasWorkgroupBarriers(*L->getHeader()->getParent()))
     return false;
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DT = &getAnalysis<DominatorTree>();
-#else
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-#endif
 
   bool changed = ProcessLoop(L, LPM);
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DT->verifyAnalysis();
-#else
   DT->verifyDomTree();
-#endif
 
   return changed;
 }
diff --git a/lib/llvmopencl/LoopBarriers.h b/lib/llvmopencl/LoopBarriers.h
index 404e3ce..1d31309 100644
--- a/lib/llvmopencl/LoopBarriers.h
+++ b/lib/llvmopencl/LoopBarriers.h
@@ -20,6 +20,8 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <set>
+
 #ifndef POCL_LOOP_BARRIERS_H
 #define POCL_LOOP_BARRIERS_H
 
@@ -27,7 +29,6 @@
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "llvm/Analysis/LoopPass.h"
-#include <set>
 
 POP_COMPILER_DIAGS
 
diff --git a/lib/llvmopencl/Makefile.in b/lib/llvmopencl/Makefile.in
index 267cfa0..e3d08ed 100644
--- a/lib/llvmopencl/Makefile.in
+++ b/lib/llvmopencl/Makefile.in
@@ -302,6 +302,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -309,6 +310,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -326,8 +328,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -403,6 +403,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/lib/llvmopencl/PHIsToAllocas.cc b/lib/llvmopencl/PHIsToAllocas.cc
index 9d0b118..3bd16cd 100644
--- a/lib/llvmopencl/PHIsToAllocas.cc
+++ b/lib/llvmopencl/PHIsToAllocas.cc
@@ -20,24 +20,18 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "PHIsToAllocas.h"
-#include "Workgroup.h"
-#include "WorkitemHandlerChooser.h"
-#include "WorkitemLoops.h"
-#include "VariableUniformityAnalysis.h"
+#include <iostream>
 
 #include "config.h"
 
-#ifdef LLVM_3_1
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/Support/TypeBuilder.h"
-#elif defined LLVM_3_2
-#include "llvm/IRBuilder.h"
-#include "llvm/TypeBuilder.h"
-#else
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/TypeBuilder.h"
-#endif
+
+#include "PHIsToAllocas.h"
+#include "Workgroup.h"
+#include "WorkitemHandlerChooser.h"
+#include "WorkitemLoops.h"
+#include "VariableUniformityAnalysis.h"
 
 namespace {
   static
@@ -45,8 +39,6 @@ namespace {
       "phistoallocas", "Convert all PHI nodes to allocas");
 }
 
-#include <iostream>
-
 namespace pocl {
 
 char PHIsToAllocas::ID = 0;
@@ -81,7 +73,7 @@ PHIsToAllocas::runOnFunction(Function &F) {
   for (Function::iterator bb = F.begin(); bb != F.end(); ++bb) {
     for (BasicBlock::iterator p = bb->begin(); 
          p != bb->end(); ++p) {
-        Instruction* instr = p;
+        Instruction* instr = &*p;
         if (isa<PHINode>(instr)) {
             PHIs.push_back(instr);
         }
@@ -128,7 +120,7 @@ PHIsToAllocas::BreakPHIToAllocas(PHINode* phi) {
 
   const bool OriginalPHIWasUniform = VUA.isUniform(function, phi);
 
-  IRBuilder<> builder(function->getEntryBlock().getFirstInsertionPt());
+  IRBuilder<> builder(&*(function->getEntryBlock().getFirstInsertionPt()));
 
   llvm::Instruction *alloca = 
     builder.CreateAlloca(phi->getType(), 0, allocaName);
diff --git a/lib/llvmopencl/PHIsToAllocas.h b/lib/llvmopencl/PHIsToAllocas.h
index c974b3e..8c77478 100644
--- a/lib/llvmopencl/PHIsToAllocas.h
+++ b/lib/llvmopencl/PHIsToAllocas.h
@@ -24,11 +24,8 @@
 #define _POCL_PHIS_TO_ALLOCAS_H
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Function.h"
-#else
+
 #include "llvm/IR/Function.h"
-#endif
 #include "llvm/Pass.h"
 
 namespace llvm {
diff --git a/lib/llvmopencl/ParallelRegion.cc b/lib/llvmopencl/ParallelRegion.cc
index b4b7537..7d53afe 100644
--- a/lib/llvmopencl/ParallelRegion.cc
+++ b/lib/llvmopencl/ParallelRegion.cc
@@ -22,29 +22,21 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "ParallelRegion.h"
-#include "Barrier.h"
-#include "Kernel.h"
-#include "config.h"
+#include <set>
+#include <sstream>
+#include <map>
+#include <algorithm>
+
 #include "pocl.h"
-#ifdef LLVM_3_1
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/ValueSymbolTable.h"
-#elif defined LLVM_3_2
-#include "llvm/IRBuilder.h"
-#include "llvm/ValueSymbolTable.h"
-#else
+
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/ValueSymbolTable.h"
-#endif
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
-#include <set>
-#include <sstream>
-#include <map>
-#include <algorithm>
-
+#include "ParallelRegion.h"
+#include "Barrier.h"
+#include "Kernel.h"
 #include "DebugHelpers.h"
 
 using namespace std;
@@ -78,7 +70,7 @@ ParallelRegion::GenerateTempNames(llvm::BasicBlock *bb)
 {
   for (llvm::BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; ++i)
     {
-      llvm::Instruction *instr = i;
+      llvm::Instruction *instr = &*i;
       if (instr->hasName() || !instr->isUsedOutsideOfBlock(bb)) continue;
       int tempCounter = 0;
       std::string tempName = "";
@@ -168,7 +160,7 @@ ParallelRegion::remap(ValueToValueMapTy &map)
 
     for (BasicBlock::iterator ii = (*i)->begin(), ee = (*i)->end();
          ii != ee; ++ii)
-      RemapInstruction(ii, map,
+      RemapInstruction(&*ii, map,
                        RF_IgnoreMissingEntries | RF_NoModuleLevelChanges);
 
 #ifdef DEBUG_REMAP
@@ -212,8 +204,12 @@ ParallelRegion::chainAfter(ParallelRegion *region)
     successor->getParent()->getBasicBlockList();
   
   for (iterator i = begin(), e = end(); i != e; ++i)
+
+#ifdef LLVM_OLDER_THAN_3_8
     bb_list.insertAfter(tail, *i);
-  
+#else
+    bb_list.insertAfter(tail->getIterator(), *i);
+#endif
   t->setSuccessor(0, entryBB());
 
   t = exitBB()->getTerminator();
@@ -296,9 +292,7 @@ ParallelRegion::insertLocalIdInit(llvm::BasicBlock* entry,
   Module *M = entry->getParent()->getParent();
 
   int size_t_width = 32;
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  if (M->getPointerSize() == llvm::Module::Pointer64)
-#elif (defined LLVM_3_5 || defined LLVM_3_6) 
+#ifdef LLVM_OLDER_THAN_3_7
   // This breaks (?) if _local_size_x is not stored in AS0,
   // but it always will be as it's just a pseudo variable that
   // will be scalarized.
@@ -370,10 +364,10 @@ ParallelRegion::Create(const SmallPtrSet<BasicBlock *, 8>& bbs, BasicBlock *entr
   // is the same as original function order.
   Function *F = entry->getParent();
   for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
-    BasicBlock *b = i;
+    BasicBlock *b = &*i;
     for (SmallPtrSetIterator<BasicBlock *> j = bbs.begin(); j != bbs.end(); ++j) {
       if (*j == b) {
-        new_region->push_back(i);
+        new_region->push_back(&*i);
         if (entry == *j)
             new_region->setEntryBBIndex(new_region->size() - 1);
         else if (exit == *j)
@@ -482,25 +476,6 @@ ParallelRegion::Verify()
 void
 ParallelRegion::AddParallelLoopMetadata(llvm::MDNode *identifier) {
 
-#ifdef LLVM_OLDER_THAN_3_6
-  for (iterator i = begin(), e = end(); i != e; ++i) {
-    BasicBlock* bb = *i;      
-    for (BasicBlock::iterator ii = bb->begin(), ee = bb->end();
-         ii != ee; ii++) {
-      if (ii->mayReadOrWriteMemory()) {
-        std::vector<Value*> loopIds;
-        MDNode *oldIds = ii->getMetadata("llvm.mem.parallel_loop_access");
-        if (oldIds != NULL) {
-          for (unsigned i = 0; i < oldIds->getNumOperands(); ++i) {
-            loopIds.push_back(oldIds->getOperand(i));
-          }
-        }
-        ii->setMetadata("llvm.mem.parallel_loop_access", 
-                        MDNode::get(bb->getContext(), loopIds));
-      }
-    }
-  }
-#else
   for (iterator i = begin(), e = end(); i != e; ++i) {
     BasicBlock* bb = *i;      
     for (BasicBlock::iterator ii = bb->begin(), ee = bb->end();
@@ -515,7 +490,6 @@ ParallelRegion::AddParallelLoopMetadata(llvm::MDNode *identifier) {
       }
     }
   }
-#endif
 }
 
 void
@@ -524,38 +498,6 @@ ParallelRegion::AddIDMetadata(
     std::size_t x, 
     std::size_t y, 
     std::size_t z) {
-#ifdef LLVM_OLDER_THAN_3_6 
-    int counter = 1;
-    Value *v1[] = {
-        MDString::get(context, "WI_region"),      
-        ConstantInt::get(Type::getInt32Ty(context), pRegionId)};      
-    MDNode* mdRegion = MDNode::get(context, v1);  
-    Value *v2[] = {
-        MDString::get(context, "WI_xyz"),      
-        ConstantInt::get(Type::getInt32Ty(context), x),
-        ConstantInt::get(Type::getInt32Ty(context), y),      
-        ConstantInt::get(Type::getInt32Ty(context), z)};      
-    MDNode* mdXYZ = MDNode::get(context, v2);  
-    Value *v[] = {
-        MDString::get(context, "WI_data"),      
-        mdRegion,
-        mdXYZ};
-    MDNode* md = MDNode::get(context, v);              
-    
-    for (iterator i = begin(), e = end(); i != e; ++i) {
-      BasicBlock* bb = *i;      
-      for (BasicBlock::iterator ii = bb->begin();
-            ii != bb->end(); ii++) {
-        Value *v3[] = {
-            MDString::get(context, "WI_counter"),      
-            ConstantInt::get(Type::getInt32Ty(context), counter)};      
-        MDNode* mdCounter = MDNode::get(context, v3);  
-        counter++;
-        ii->setMetadata("wi", md);
-        ii->setMetadata("wi_counter", mdCounter);
-      }
-    }
-#else
     int counter = 1;
     Metadata *v1[] = {
         MDString::get(context, "WI_region"),      
@@ -592,7 +534,6 @@ ParallelRegion::AddIDMetadata(
         ii->setMetadata("wi_counter", mdCounter);
       }
     }
-#endif
 }
 
 
@@ -654,7 +595,7 @@ llvm::Instruction*
 ParallelRegion::LocalIDZLoad()
 {
   if (LocalIDZLoadInstr != NULL) return LocalIDZLoadInstr;
-  IRBuilder<> builder(entryBB()->getFirstInsertionPt());
+  IRBuilder<> builder(&*(entryBB()->getFirstInsertionPt()));
   return LocalIDZLoadInstr = 
     builder.CreateLoad
     (entryBB()->getParent()->getParent()->getGlobalVariable(POCL_LOCAL_ID_Z_GLOBAL));
@@ -668,7 +609,7 @@ llvm::Instruction*
 ParallelRegion::LocalIDYLoad()
 {
   if (LocalIDYLoadInstr != NULL) return LocalIDYLoadInstr;
-  IRBuilder<> builder(entryBB()->getFirstInsertionPt());
+  IRBuilder<> builder(&*(entryBB()->getFirstInsertionPt()));
   return LocalIDYLoadInstr = 
     builder.CreateLoad
     (entryBB()->getParent()->getParent()->getGlobalVariable(POCL_LOCAL_ID_Y_GLOBAL));
@@ -682,7 +623,7 @@ llvm::Instruction*
 ParallelRegion::LocalIDXLoad()
 {
   if (LocalIDXLoadInstr != NULL) return LocalIDXLoadInstr;
-  IRBuilder<> builder(entryBB()->getFirstInsertionPt());
+  IRBuilder<> builder(&*(entryBB()->getFirstInsertionPt()));
   return LocalIDXLoadInstr = 
     builder.CreateLoad
     (entryBB()->getParent()->getParent()->getGlobalVariable(POCL_LOCAL_ID_X_GLOBAL));
@@ -720,31 +661,10 @@ ParallelRegion::InjectPrintF
        /*Name=*/"printf", M); 
     printfFunc->setCallingConv(CallingConv::C);
 
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-    AttrListPtr func_printf_PAL;
-#else
     AttributeSet func_printf_PAL;
-#endif
     {
-#ifdef LLVM_3_1
-      SmallVector<AttributeWithIndex, 4> Attrs;
-      AttributeWithIndex PAWI;
-      PAWI.Index = 1U; 
-      PAWI.Attrs = Attribute::NoCapture;
-      Attrs.push_back(PAWI);
-      PAWI.Index = 4294967295U; 
-      PAWI.Attrs = Attribute::NoUnwind;
-      Attrs.push_back(PAWI);
-      func_printf_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());
-#elif defined LLVM_3_2
-      SmallVector<AttributeWithIndex, 4> Attrs;
-      Attrs.push_back(AttributeWithIndex::get(M->getContext(), 1U, Attributes::NoCapture));
-      Attrs.push_back(AttributeWithIndex::get(M->getContext(), 4294967295U, Attributes::NoUnwind));
-      func_printf_PAL = AttrListPtr::get(M->getContext(), Attrs);
-#else
       func_printf_PAL.addAttribute( M->getContext(), 1U, Attribute::NoCapture);
       func_printf_PAL.addAttribute( M->getContext(), 4294967295U, Attribute::NoUnwind);
-#endif
     }
     printfFunc->setAttributes(func_printf_PAL);
   }
@@ -835,7 +755,7 @@ ParallelRegion::InjectVariablePrintouts()
       for (llvm::BasicBlock::iterator instr = bb->begin();
            instr != bb->end(); ++instr) 
         {
-          llvm::Instruction *instruction = instr;
+          llvm::Instruction *instruction = &*instr;
           if (isa<PointerType>(instruction->getType()) ||
               !instruction->hasName()) continue;
           std::string name = instruction->getName().str();
@@ -884,7 +804,7 @@ ParallelRegion::LocalizeIDLoads()
       for (llvm::BasicBlock::iterator instrI = bb->begin();
            instrI != bb->end(); ++instrI) 
         {
-	  llvm::Instruction *instr = instrI;
+          llvm::Instruction *instr = &*instrI;
 	  if (instr == LocalIDXLoadInstr ||
 	      instr == LocalIDYLoadInstr ||
 	      instr == LocalIDZLoadInstr) continue;
diff --git a/lib/llvmopencl/ParallelRegion.h b/lib/llvmopencl/ParallelRegion.h
index 6f32f3c..212346d 100644
--- a/lib/llvmopencl/ParallelRegion.h
+++ b/lib/llvmopencl/ParallelRegion.h
@@ -24,26 +24,19 @@
 #ifndef _POCL_PARALLEL_REGION_H
 #define _POCL_PARALLEL_REGION_H
 
-#include "BarrierBlock.h"
+#include <vector>
+#include <sstream>
+
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/BasicBlock.h"
-#include "llvm/LLVMContext.h"
-#else
+
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/LLVMContext.h"
-#endif
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-#include "llvm/Support/CFG.h"
-#else
 #include "llvm/IR/CFG.h"
-#endif
-
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include <vector>
-#include <sstream>
+
+#include "BarrierBlock.h"
 
 namespace pocl {
 
diff --git a/lib/llvmopencl/TargetAddressSpaces.cc b/lib/llvmopencl/TargetAddressSpaces.cc
index b94e218..c40e3c2 100644
--- a/lib/llvmopencl/TargetAddressSpaces.cc
+++ b/lib/llvmopencl/TargetAddressSpaces.cc
@@ -21,20 +21,15 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "config.h"
 #include <iostream>
 #include <string>
 #include <set>
 
-#ifdef LLVM_3_2
-# include <llvm/Instructions.h>
-# include <llvm/IntrinsicInst.h>
-#else
+#include "pocl.h"
+
 # include <llvm/IR/Instructions.h>
 # include <llvm/IR/Module.h>
 # include <llvm/IR/IntrinsicInst.h>
-#endif
-
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/Transforms/Utils/ValueMapper.h>
 #include <llvm/Transforms/Utils/Cloning.h>
@@ -43,7 +38,6 @@
 #include "TargetAddressSpaces.h"
 #include "Workgroup.h"
 #include "LLVMUtils.h"
-#include "pocl.h"
 
 #define DEBUG_TARGET_ADDRESS_SPACES
 
@@ -64,32 +58,117 @@ TargetAddressSpaces::TargetAddressSpaces() : ModulePass(ID) {
 }
 
 static Type *
-ConvertedType(llvm::Type *type, std::map<unsigned, unsigned> &addrSpaceMap) {
+ConvertedType(llvm::Type *type, std::map<unsigned, unsigned> &addrSpaceMap,
+              std::map<llvm::Type*, llvm::StructType*> &convertedStructsCache) {
 
   if (type->isPointerTy()) {
     unsigned AS = type->getPointerAddressSpace();
     unsigned newAS = addrSpaceMap[AS];
-    return PointerType::get(ConvertedType(type->getPointerElementType(), addrSpaceMap), newAS);
+    return PointerType::get(
+          ConvertedType(type->getPointerElementType(),
+                        addrSpaceMap, convertedStructsCache),
+          newAS);
   } else if (type->isArrayTy()) {
-    return ArrayType::get
-      (ConvertedType(type->getArrayElementType(), addrSpaceMap), type->getArrayNumElements());
-  } else { /* TODO: pointers inside structs */
+    return ArrayType::get(
+          ConvertedType(type->getArrayElementType(),
+                        addrSpaceMap, convertedStructsCache),
+          type->getArrayNumElements());
+#ifndef TCE_AVAILABLE
+  } else if (type->isStructTy()) {
+    if (convertedStructsCache[type])
+      return convertedStructsCache[type];
+
+    llvm::StructType* t = dyn_cast<llvm::StructType>(type);
+    llvm::StructType* tn;
+    if (!t->isLiteral()) {
+      std::string s = t->getName().str();
+      s += "_tas_struct";
+      tn = StructType::create(t->getContext(), s);
+      convertedStructsCache[type] = tn;
+    }
+    std::vector<llvm::Type*> newtypes;
+    for (llvm::StructType::element_iterator i = t->element_begin(),
+         e = t->element_end(); i < e; ++i) {
+      newtypes.push_back(ConvertedType(*i, addrSpaceMap, convertedStructsCache));
+    }
+    ArrayRef<Type*> a(newtypes);
+    if (t->isLiteral()) {
+      tn = StructType::get(t->getContext(), a, t->isPacked());
+      convertedStructsCache[type] = tn;
+    } else {
+      tn->setBody(a, t->isPacked());
+    }
+    return tn;
+#endif
+  } else {
     return type;
   }
 }
 
 static bool
-UpdateAddressSpace(llvm::Value& val, std::map<unsigned, unsigned> &addrSpaceMap) {
+UpdateAddressSpace(llvm::Value& val, std::map<unsigned, unsigned> &addrSpaceMap,
+                   std::map<llvm::Type*, llvm::StructType*> &convertedStructsCache) {
   Type *type = val.getType();
   if (!type->isPointerTy()) return false;
 
-  Type *newType = ConvertedType(type, addrSpaceMap);
+  Type *newType = ConvertedType(type, addrSpaceMap, convertedStructsCache);
   if (newType == type) return false;
 
   val.mutateType(newType);
   return true;
 }
 
+/* Removes AddrSpaceCastInst either as Inst or ConstantExpr, if they cast
+   to generic addrspace, or if they point to the same AS
+   ConstExpr removing is 2 step: CE -> convert to ASCI -> remove ASCI.
+
+   \param [in] v the ASCI to remove
+   \param [in] beforeinst in case of a ConstantExpr, after converting it to Instr
+               we need to insert it into BB; this is an Instr before
+               which we insert it (it's the CE itself)
+   \returns true if replacement took place (-> BB iterator needs to restart)
+*/
+static bool removeASCI(llvm::Value *v, llvm::Instruction *beforeinst,
+                     std::map<unsigned, unsigned> &addrSpaceMap,
+                     std::map<llvm::Type*, llvm::StructType*> &convertedStructsCache) {
+  if (isa<ConstantExpr>(v)) {
+      ConstantExpr *ce = dyn_cast<ConstantExpr>(v);
+      Value *in = ce->getAsInstruction();
+      AddrSpaceCastInst *asci = dyn_cast<AddrSpaceCastInst>(in);
+      assert(asci);
+      if (asci->getDestTy()->getPointerAddressSpace() == POCL_ADDRESS_SPACE_GENERIC) {
+        asci->insertBefore(beforeinst);
+        v->replaceAllUsesWith(in);
+        in->takeName(v);
+        return true;
+      } else
+        return false;
+  }
+  if (isa<AddrSpaceCastInst>(v)) {
+      AddrSpaceCastInst *as = dyn_cast<AddrSpaceCastInst>(v);
+      Type* SrcTy = as->getSrcTy();
+      Type* DstTy = as->getDestTy();
+      if (isa<PointerType>(SrcTy) && isa<PointerType>(DstTy)) {
+        if ((DstTy->getPointerAddressSpace() == SrcTy->getPointerAddressSpace())
+            || (DstTy->getPointerAddressSpace() == POCL_ADDRESS_SPACE_GENERIC))
+          {
+            if (DstTy->getPointerAddressSpace() == POCL_ADDRESS_SPACE_GENERIC)
+              UpdateAddressSpace(*as, addrSpaceMap, convertedStructsCache);
+            Value* srcVal = as->getOperand(0);
+            as->replaceAllUsesWith(srcVal);
+            as->eraseFromParent();
+            return true;
+          }
+      }
+  }
+
+  return false;
+
+}
+
+
+
+
 /**
  * After converting the pointer address spaces, there
  * might be llvm.memcpy.* or llvm.memset.* calls to wrong
@@ -104,10 +183,10 @@ FixMemIntrinsics(llvm::Function& F) {
   std::vector<llvm::MemIntrinsic*> intrinsics;
   for (llvm::Function::iterator bbi = F.begin(), bbe = F.end(); bbi != bbe;
        ++bbi) {
-    llvm::BasicBlock* bb = bbi;
+    llvm::BasicBlock* bb = &*bbi;
     for (llvm::BasicBlock::iterator ii = bb->begin(), ie = bb->end();
          ii != ie; ++ii) {
-      llvm::Instruction *instr = ii;
+      llvm::Instruction *instr = &*ii;
       if (!isa<llvm::MemIntrinsic>(instr)) continue;
       intrinsics.push_back(dyn_cast<llvm::MemIntrinsic>(instr));
     }
@@ -141,72 +220,35 @@ FixMemIntrinsics(llvm::Function& F) {
   }
 }
 
-bool
-TargetAddressSpaces::runOnModule(llvm::Module &M) {
 
-  llvm::StringRef arch(M.getTargetTriple());
 
-  std::map<unsigned, unsigned> addrSpaceMap;
 
-  if (arch.startswith("x86_64")) {
-    /* x86_64 supports flattening the address spaces at the backend, but
-       we still flatten them in pocl due to a couple of reasons.
+static void
+run(llvm::Module &M,
+    std::map<unsigned, unsigned> &addrSpaceMap,
+    bool handle_generic_AS) {
 
-       At least LLVM 3.5 exposes an issue with pocl's printf or another LLVM pass:
-       After the code emission optimizations there appears a
-       PHI node where the two alternative pointer assignments have different
-       address spaces:
-       %format.addr.2347 =
-          phi i8 addrspace(3)* [ %incdec.ptr58, %if.end56 ],
-                               [ %format.addr.1, %while.body45.preheader ]
+  std::map<llvm::Type*, llvm::StructType*> convertedStructsCache;
 
-       This leads to an LLVM crash when it tries to generate a no-op bitcast
-       while it won't be such due to the address space difference (I assume).
-       Workaround this by flattening the address spaces to 0 here also for
-       x86_64 until the real culprit is found.
-
-       Another reason is that LoopVectorizer of LLVM 3.7 crashes when it
-       tries to create a masked store intrinsics with the fake address space
-       ids, so we need to flatten them out before vectorizing.
-    */
-    addrSpaceMap[POCL_ADDRESS_SPACE_GLOBAL] =
-        addrSpaceMap[POCL_ADDRESS_SPACE_LOCAL] =
-        addrSpaceMap[POCL_ADDRESS_SPACE_CONSTANT] = 0;
+  /* Handle global variables. These should be fixed *after*
+     fixing the instruction referring to them.  If we fix
+     the address spaces before, there might be possible
+     illegal bitcasts casting the LLVM's global pointer to
+     another one, causing the CloneFunctionInto to crash when
+     it encounters such.
 
-  } else if (arch.startswith("arm")) {
-    /* Same thing happens here as with x86_64 above.
-     * NB: LLVM 3.5 on ARM did not need this yet, for some reason
-     */
-#if defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4 || defined_LLVM_3_5
-    return false;
-#else
-    addrSpaceMap[POCL_ADDRESS_SPACE_GLOBAL] =
-        addrSpaceMap[POCL_ADDRESS_SPACE_LOCAL] =
-        addrSpaceMap[POCL_ADDRESS_SPACE_CONSTANT] = 0;
-#endif
-  } else if (arch.startswith("tce")) {
-    /* TCE requires the remapping. */
-    addrSpaceMap[POCL_ADDRESS_SPACE_GLOBAL] = 3;
-    addrSpaceMap[POCL_ADDRESS_SPACE_LOCAL] = 4;
-    /* LLVM 3.2 detects 'constant' as cuda_constant (5) in the fake
-       address space map. Add it for compatibility. */
-    addrSpaceMap[5] = addrSpaceMap[POCL_ADDRESS_SPACE_CONSTANT] = 5;     
-  } else if (arch.startswith("mips")) {
-    addrSpaceMap[POCL_ADDRESS_SPACE_GLOBAL] =
-        addrSpaceMap[POCL_ADDRESS_SPACE_LOCAL] =
-        addrSpaceMap[POCL_ADDRESS_SPACE_CONSTANT] = 0;
-  } else if (arch.startswith("amdgcn") || arch.startswith("hsail")) {
-    addrSpaceMap[POCL_ADDRESS_SPACE_GLOBAL] = 1;
-    addrSpaceMap[POCL_ADDRESS_SPACE_LOCAL] = 3;
-    addrSpaceMap[POCL_ADDRESS_SPACE_CONSTANT] = 2;
-  } else {
-    /* Assume the fake address space map works directly in case not
-       overridden here.  */
-    return false;
+     Update: ^this seems not to be an issue anymore and this commit
+     seems to cause the problems it is trying to fix on hsa and
+     amd scanlargearrays....  Original commit:
+     dcbcd39811638bcb953afbbfdd2620fb8ab45af4
+  */
+  llvm::Module::global_iterator globalI = M.global_begin();
+  llvm::Module::global_iterator globalE = M.global_end();
+  for (; globalI != globalE; ++globalI) {
+    llvm::Value &global = *globalI;
+    UpdateAddressSpace(global, addrSpaceMap, convertedStructsCache);
   }
 
-  bool changed = false;
-
   FunctionMapping funcReplacements;
   std::vector<llvm::Function*> unhandledFuncs;
 
@@ -217,7 +259,7 @@ TargetAddressSpaces::runOnModule(llvm::Module &M) {
        functionI != functionE; ++functionI) {
     if (functionI->empty() || functionI->getName().startswith("_GLOBAL")) 
       continue;
-    unhandledFuncs.push_back(functionI);
+    unhandledFuncs.push_back(&*functionI);
   }
 
   for (std::vector<llvm::Function*>::iterator i = unhandledFuncs.begin(), 
@@ -230,10 +272,11 @@ TargetAddressSpaces::runOnModule(llvm::Module &M) {
     for (Function::const_arg_iterator i = F.arg_begin(),
            e = F.arg_end();
          i != e; ++i)
-      parameters.push_back(ConvertedType(i->getType(), addrSpaceMap));
+      parameters.push_back(
+            ConvertedType(i->getType(), addrSpaceMap, convertedStructsCache));
 
     llvm::FunctionType *ft = FunctionType::get
-      (ConvertedType(F.getReturnType(), addrSpaceMap),
+      (ConvertedType(F.getReturnType(), addrSpaceMap, convertedStructsCache),
        parameters, F.isVarArg());
 
     llvm::Function *newFunc = Function::Create(ft, F.getLinkage(), "", &M);
@@ -245,44 +288,82 @@ TargetAddressSpaces::runOnModule(llvm::Module &M) {
            e = F.arg_end();
          i != e; ++i) {
       j->setName(i->getName());
-      vv[i] = j;
+      vv[&*i] = &*j;
       ++j;
     }
 
     SmallVector<ReturnInst *, 1> ri;
 
+    if (handle_generic_AS) {
+    /* Remove generic address space casts. Converts types with generic AS to
+     * private AS and then removes redundant AS casting instructions */
+    for (llvm::Function::iterator bbi = F.begin(), bbe = F.end(); bbi != bbe;
+         ++bbi)
+      for (llvm::BasicBlock::iterator ii = bbi->begin(), ie = bbi->end(); ii != ie;
+           ++ii) {
+
+        llvm::Instruction *instr = &*ii;
+
+        if (isa<AddrSpaceCastInst>(instr)) {
+            if (removeASCI(instr, nullptr, addrSpaceMap, convertedStructsCache))
+              { ii = bbi->begin(); continue; }
+          }
+        if (isa<StoreInst>(instr)) {
+            StoreInst *st = dyn_cast<StoreInst>(instr);
+            Value *pt = st->getPointerOperand();
+            if (Operator::getOpcode(pt) == Instruction::AddrSpaceCast) {
+              if (removeASCI(pt, instr, addrSpaceMap, convertedStructsCache))
+                { ii = bbi->begin(); continue; }
+            } else
+              if (st->getPointerAddressSpace() == POCL_ADDRESS_SPACE_GENERIC)
+                UpdateAddressSpace(*pt, addrSpaceMap, convertedStructsCache);
+        }
+        if (isa<LoadInst>(instr)) {
+            LoadInst *ld = dyn_cast<LoadInst>(instr);
+            Value *pt = ld->getPointerOperand();
+            if (Operator::getOpcode(pt) == Instruction::AddrSpaceCast) {
+              if (removeASCI(pt, instr, addrSpaceMap, convertedStructsCache))
+                { ii = bbi->begin(); continue; }
+            } else
+              if (ld->getPointerAddressSpace() == POCL_ADDRESS_SPACE_GENERIC)
+                UpdateAddressSpace(*pt, addrSpaceMap, convertedStructsCache);
+        }
+        if (isa<GetElementPtrInst>(instr)) {
+            GetElementPtrInst *gep = dyn_cast<GetElementPtrInst>(instr);
+            Value *pt = gep->getPointerOperand();
+            if (Operator::getOpcode(pt) == Instruction::AddrSpaceCast) {
+                if (removeASCI(pt, instr, addrSpaceMap, convertedStructsCache))
+                  { ii = bbi->begin(); continue; }
+              } else {
+                if (gep->getPointerAddressSpace() == POCL_ADDRESS_SPACE_GENERIC)
+                  UpdateAddressSpace(*pt, addrSpaceMap, convertedStructsCache);
+              }
+        }
+
+      }
+    }
+
     class AddressSpaceReMapper : public ValueMapTypeRemapper {
     public:
-      AddressSpaceReMapper(std::map<unsigned, unsigned> &addrSpaceMap) :
-        addrSpaceMap_(addrSpaceMap) {}      
+      AddressSpaceReMapper(std::map<unsigned, unsigned> &addrSpaceMap,
+                           std::map<llvm::Type*, llvm::StructType*> *c) :
+        cStructCache_(c), addrSpaceMap_(addrSpaceMap) {}
+
       Type* remapType(Type *type) {
-        Type *newType = ConvertedType(type, addrSpaceMap_);
+        Type *newType = ConvertedType(type, addrSpaceMap_, *cStructCache_);
         if (newType == type) return type;
         return newType;
       }
     private:
+      std::map<llvm::Type*, llvm::StructType*> *cStructCache_;
       std::map<unsigned, unsigned>& addrSpaceMap_;
-    } asvtm(addrSpaceMap);
+    } asvtm(addrSpaceMap, &convertedStructsCache);
 
     CloneFunctionInto(newFunc, &F, vv, true, ri, "", NULL, &asvtm);
     FixMemIntrinsics(*newFunc);
     funcReplacements[&F] = newFunc;
   }
 
-  /* Handle global variables. These should be fixed *after*
-     fixing the instruction referring to them.  If we fix
-     the address spaces before, there might be possible
-     illegal bitcasts casting the LLVM's global pointer to
-     another one, causing the CloneFunctionInto to crash when
-     it encounters such.
-   */
-  llvm::Module::global_iterator globalI = M.global_begin();
-  llvm::Module::global_iterator globalE = M.global_end();
-  for (; globalI != globalE; ++globalI) {
-    llvm::Value &global = *globalI;
-    changed |= UpdateAddressSpace(global, addrSpaceMap);
-  }
-  
   /* Replace all references to the old function to the new one.
      Also, for LLVM 3.4, replace the pointercasts to bitcasts in
      case the new address spaces are the same in both sides. */
@@ -294,9 +375,8 @@ TargetAddressSpaces::runOnModule(llvm::Module &M) {
          ++bbi) 
       for (llvm::BasicBlock::iterator ii = bbi->begin(), ie = bbi->end(); ii != ie;
            ++ii) {
-        llvm::Instruction *instr = ii;
+        llvm::Instruction *instr = &*ii;
 
-#if !(defined(LLVM_3_2) || defined(LLVM_3_3))
         if (isa<AddrSpaceCastInst>(instr)) {
           // Convert (now illegal) addresspacecasts to bitcasts.
 
@@ -315,7 +395,6 @@ TargetAddressSpaces::runOnModule(llvm::Module &M) {
           ii = bbi->begin();
           continue;
         }
-#endif
         
         if (!isa<CallInst>(instr)) continue;
 
@@ -340,11 +419,7 @@ TargetAddressSpaces::runOnModule(llvm::Module &M) {
     if (i->first->getNumUses() > 0) {
       for (Value::use_iterator ui = i->first->use_begin(), 
              ue = i->first->use_end(); ui != ue; ++ui) {
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-        User* user = *ui;
-#else
         User* user = (*ui).getUser();
-#endif
         user->dump();
       }
       assert ("All users of the function were not fixed?" &&
@@ -356,7 +431,93 @@ TargetAddressSpaces::runOnModule(llvm::Module &M) {
     i = funcReplacements.begin();
   }
 
-  return true;
 }
 
+#define POCL_AS_FAKE_GENERIC 0
+#define POCL_AS_FAKE_GLOBAL 201
+#define POCL_AS_FAKE_LOCAL 202
+#define POCL_AS_FAKE_CONSTANT 203
+
+bool
+TargetAddressSpaces::runOnModule(llvm::Module &M) {
+
+  /* Annoying but we need to do two AS conversions.
+   * This is necessary because the Pocl fake AS numbers
+   * conflict with real AS numbers (for some devices).
+   * First we map the Pocl fake AS numbers higher (above 200),
+   * then we map that down to real device AS.
+   * The device map is chosen before anything is rewritten, so that an
+   * unrecognized target returns false with the module left untouched.
+   * \returns true if both remapping rounds were run, false otherwise. */
+
+  std::map<unsigned, unsigned> addrSpaceMapDown;
+  llvm::StringRef arch(M.getTargetTriple());
+
+  if (arch.startswith("x86_64")) {
+    /* x86_64 supports flattening the address spaces at the backend, but
+       we still flatten them in pocl due to a couple of reasons.
+
+       At least LLVM 3.5 exposes an issue with pocl's printf or another LLVM pass:
+       After the code emission optimizations there appears a
+       PHI node where the two alternative pointer assignments have different
+       address spaces:
+       %format.addr.2347 =
+          phi i8 addrspace(3)* [ %incdec.ptr58, %if.end56 ],
+                               [ %format.addr.1, %while.body45.preheader ]
+
+       This leads to an LLVM crash when it tries to generate a no-op bitcast
+       while it won't be such due to the address space difference (I assume).
+       Workaround this by flattening the address spaces to 0 here also for
+       x86_64 until the real culprit is found.
+
+       Another reason is that LoopVectorizer of LLVM 3.7 crashes when it
+       tries to create a masked store intrinsics with the fake address space
+       ids, so we need to flatten them out before vectorizing.
+    */
+    addrSpaceMapDown[POCL_AS_FAKE_GLOBAL] =
+        addrSpaceMapDown[POCL_AS_FAKE_LOCAL] =
+        addrSpaceMapDown[POCL_AS_FAKE_GENERIC] =
+        addrSpaceMapDown[POCL_AS_FAKE_CONSTANT] = 0;
+
+  } else if (arch.startswith("arm")) {
+    /* Same thing happens here as with x86_64 above.
+     */
+    addrSpaceMapDown[POCL_AS_FAKE_GLOBAL] =
+        addrSpaceMapDown[POCL_AS_FAKE_LOCAL] =
+        addrSpaceMapDown[POCL_AS_FAKE_GENERIC] =
+        addrSpaceMapDown[POCL_AS_FAKE_CONSTANT] = 0;
+  } else if (arch.startswith("tce")) {
+    /* TCE requires the remapping. */
+    addrSpaceMapDown[POCL_AS_FAKE_GENERIC] = 0;
+    addrSpaceMapDown[POCL_AS_FAKE_GLOBAL] = 3;
+    addrSpaceMapDown[POCL_AS_FAKE_LOCAL] = 4;
+    addrSpaceMapDown[POCL_AS_FAKE_CONSTANT] = 5;
+  } else if (arch.startswith("mips")) {
+    addrSpaceMapDown[POCL_AS_FAKE_GLOBAL] =
+    addrSpaceMapDown[POCL_AS_FAKE_LOCAL] =
+    addrSpaceMapDown[POCL_AS_FAKE_GENERIC] =
+    addrSpaceMapDown[POCL_AS_FAKE_CONSTANT] = 0;
+  } else if (arch.startswith("amdgcn") || arch.startswith("hsail")) {
+    addrSpaceMapDown[POCL_AS_FAKE_GENERIC] = 0;
+    addrSpaceMapDown[POCL_AS_FAKE_GLOBAL] = 1;
+    addrSpaceMapDown[POCL_AS_FAKE_LOCAL] = 3;
+    addrSpaceMapDown[POCL_AS_FAKE_CONSTANT] = 2;
+  } else {
+    /* Assume the fake address space map works directly in case not
+       overridden here.  */
+    return false;
+  }
+
+  std::map<unsigned, unsigned> addrSpaceMapUp;
+  addrSpaceMapUp[POCL_ADDRESS_SPACE_GLOBAL] = POCL_AS_FAKE_GLOBAL;
+  addrSpaceMapUp[POCL_ADDRESS_SPACE_LOCAL] = POCL_AS_FAKE_LOCAL;
+  addrSpaceMapUp[POCL_ADDRESS_SPACE_GENERIC] = POCL_AS_FAKE_GENERIC;
+  addrSpaceMapUp[POCL_ADDRESS_SPACE_CONSTANT] = POCL_AS_FAKE_CONSTANT;
+
+  run(M, addrSpaceMapUp, true);
+  run(M, addrSpaceMapDown, false);
+
+  return true;
+}
+
+} // namespace pocl
diff --git a/lib/llvmopencl/TargetAddressSpaces.h b/lib/llvmopencl/TargetAddressSpaces.h
index 1afb197..fc53dcd 100644
--- a/lib/llvmopencl/TargetAddressSpaces.h
+++ b/lib/llvmopencl/TargetAddressSpaces.h
@@ -25,12 +25,8 @@
 #define _POCL_TARGET_ADDRESS_SPACES_H
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Function.h"
-#else
-#include "llvm/IR/Function.h"
-#endif
 
+#include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
 
 namespace pocl {
diff --git a/lib/llvmopencl/VariableUniformityAnalysis.cc b/lib/llvmopencl/VariableUniformityAnalysis.cc
index f9565ab..dc86c2a 100644
--- a/lib/llvmopencl/VariableUniformityAnalysis.cc
+++ b/lib/llvmopencl/VariableUniformityAnalysis.cc
@@ -20,30 +20,20 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <sstream>
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "config.h"
 #include "pocl.h"
 
-#include <sstream>
-#include <iostream>
-
-#ifdef LLVM_3_2
-#include "llvm/Metadata.h"
-#include "llvm/Constants.h"
-#include "llvm/Module.h"
-#include "llvm/Instructions.h"
-#include "llvm/ValueSymbolTable.h"
-#include "llvm/DataLayout.h"
-#else
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/IR/DataLayout.h"
-#endif
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Analysis/PostDominators.h"
 
@@ -84,21 +74,10 @@ VariableUniformityAnalysis::getAnalysisUsage(llvm::AnalysisUsage &AU) const {
   AU.addPreserved<LoopInfoWrapperPass>();
 #endif
   // required by LoopInfo:
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DominatorTree>();
-  AU.addPreserved<DominatorTree>();
-#else
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addPreserved<DominatorTreeWrapperPass>();
-#endif
 
-#ifdef LLVM_3_1
-  AU.addRequired<TargetData>();
-  AU.addPreserved<TargetData>();
-#elif (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DataLayout>();
-  AU.addPreserved<DataLayout>();
-#elif (defined OLDER_THAN_LLVM_3_7)
+#ifdef LLVM_OLDER_THAN_3_7
   AU.addRequired<DataLayoutPass>();
   AU.addPreserved<DataLayoutPass>();
 #endif
@@ -330,11 +309,7 @@ VariableUniformityAnalysis::isUniform(llvm::Function *f, llvm::Value* v) {
     for (Instruction::use_iterator ui = instruction->use_begin(),
            ue = instruction->use_end();
          ui != ue; ++ui) {
-#if defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4
-      llvm::Instruction *user = cast<Instruction>(*ui);
-#else
       llvm::Instruction *user = cast<Instruction>(ui->getUser());
-#endif
       if (user == NULL) continue;
       
       llvm::StoreInst *store = dyn_cast<llvm::StoreInst>(user);
@@ -425,23 +400,11 @@ VariableUniformityAnalysis::isUniform(llvm::Function *f, llvm::Value* v) {
   // Atomic operations might look like uniform if only considering the operands
   // (access a global memory location of which ordering by default is not
   // constrained), but their semantics have ordering: Each work-item should get
-  // their own value from that memory location. isAtomic() check was introduced
-  // only in LLVM 3.6 so we have to use a more general mayWriteToMemory check
-  // with earlier versions. 
-#if defined(LLVM_3_2) || defined(LLVM_3_3) || defined(LLVM_3_4) || defined(LLVM_3_5)
-  // Volatile loads are set to mayWriteToMemory (perhaps because they might read
-  // an I/O register which might update at read) which are treated as a special case 
-  // here as we know this is not the case in OpenCL C memory accesses.
-  if (instr->mayWriteToMemory() && !isa<llvm::LoadInst>(instr)) {
-      setUniform(f, v, false);
-      return false;
-  }
-#else
+  // their own value from that memory location.
   if (instr->isAtomic()) {
       setUniform(f, v, false);
       return false;
   }
-#endif
 
   // not computed previously, scan all operands of the instruction
   // and figure out their uniformity recursively
diff --git a/lib/llvmopencl/VariableUniformityAnalysis.h b/lib/llvmopencl/VariableUniformityAnalysis.h
index 989282a..3a93ccc 100644
--- a/lib/llvmopencl/VariableUniformityAnalysis.h
+++ b/lib/llvmopencl/VariableUniformityAnalysis.h
@@ -24,12 +24,8 @@
 #define POCL_VARIABLE_UNIFORMITY_ANALYSIS_H
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Function.h"
-#else
-#include "llvm/IR/Function.h"
-#endif
 
+#include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
 
 namespace pocl {
diff --git a/lib/llvmopencl/WorkItemAliasAnalysis.cc b/lib/llvmopencl/WorkItemAliasAnalysis.cc
index a46a79f..5e0e920 100644
--- a/lib/llvmopencl/WorkItemAliasAnalysis.cc
+++ b/lib/llvmopencl/WorkItemAliasAnalysis.cc
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2012 Tampere University of Technology.
+    Copyright (c) 2012-2015 Tampere University of Technology.
 
     Permission is hereby granted, free of charge, to any person obtaining a
     copy of this software and associated documentation files (the "Software"),
@@ -27,33 +27,26 @@
  * @author Vladimír Guzma 2012
  */
 
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "config.h"
 #include "pocl.h"
-#include <iostream>
 
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Pass.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Metadata.h"
-#include "llvm/Constants.h"
-#else
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
-#endif
 
 POP_COMPILER_DIAGS
 
-
 using namespace llvm;
 
-namespace {
-
 #ifdef LLVM_OLDER_THAN_3_7
 typedef AliasAnalysis::AliasResult AliasResult;
 #else
@@ -64,6 +57,8 @@ typedef llvm::AliasResult AliasResult;
 /// WorkItemAliasAnalysis - This is a simple alias analysis
 /// implementation that uses pocl metadata to make sure memory accesses from
 /// different work items are not aliasing.
+///
+#ifdef LLVM_OLDER_THAN_3_8
 class WorkItemAliasAnalysis : public FunctionPass, public AliasAnalysis {
 public:
     static char ID; 
@@ -79,34 +74,106 @@ public:
         return this;
     }
 
-
 #ifdef LLVM_OLDER_THAN_3_7
     virtual void initializePass() {
         InitializeAliasAnalysis(this);
     }
     virtual bool runOnFunction(llvm::Function &) {
-      InitializeAliasAnalysis(this);
-      return false;
+        InitializeAliasAnalysis(this);
+        return false;
     }
 #else
     virtual bool runOnFunction(llvm::Function &F) {
-      InitializeAliasAnalysis(this, &F.getParent()->getDataLayout());
-      return false;
+        InitializeAliasAnalysis(this, &F.getParent()->getDataLayout());
+        return false;
     }
 #endif
     
     private:
         virtual void getAnalysisUsage(AnalysisUsage &AU) const;
         virtual AliasResult alias(const Location &LocA, const Location &LocB);
-    };
+};
+
+#else
+
+// LLVM 3.8+
+
+class WorkItemAAResult : public AAResultBase<WorkItemAAResult> {
+    friend AAResultBase<WorkItemAAResult>;
+    // Alias analysis result used for LLVM 3.8+; see alias() for the query.
+public:
+    static char ID;
+    // Every constructor just hands a TargetLibraryInfo on to AAResultBase.
+    WorkItemAAResult(const TargetLibraryInfo &TLI)
+        : AAResultBase(TLI) {}
+    WorkItemAAResult(const WorkItemAAResult &Arg)
+        : AAResultBase(Arg.TLI) {}
+    WorkItemAAResult(WorkItemAAResult &&Arg)
+        : AAResultBase(Arg.TLI) {}
+    // Alias query over two memory locations; defined out of line.
+    AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);
+};
+
+class WorkItemAA {
+public:
+    /// \brief Result type that run() returns.
+    typedef WorkItemAAResult Result;
+    /// \brief Opaque, unique identifier for this analysis pass.
+    static void *ID() { return (void *)&PassID; }
+    /// \brief Compute the analysis result for \p F using manager \p AM.
+    WorkItemAAResult run(Function &F, AnalysisManager<Function> *AM);
+
+    /// \brief Provide access to a name for this pass for debugging purposes.
+    static StringRef name() { return "WorkItemAliasAnalysis"; }
+
+private:
+    static char PassID; // its address is the value ID() returns
+};
+
+/// Legacy FunctionPass wrapper that owns the WorkItemAAResult object.
+class WorkItemAliasAnalysis : public FunctionPass {
+    std::unique_ptr<WorkItemAAResult> Result; // (re)created in runOnFunction()
+
+    virtual void anchor();
+
+public:
+    static char ID;
+
+    WorkItemAliasAnalysis() : FunctionPass(ID) {};
+    // Dereference Result, so only call these once runOnFunction() has run.
+    WorkItemAAResult &getResult() { return *Result; }
+    const WorkItemAAResult &getResult() const { return *Result; }
+
+    bool runOnFunction(Function &F) override;
+    void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+char WorkItemAA::PassID;
+char WorkItemAAResult::ID = 0;
+void WorkItemAliasAnalysis::anchor() {}
+// Build the result from the target library info (not from WorkItemAA itself, which would re-enter run()).
+WorkItemAAResult WorkItemAA::run(Function &F, AnalysisManager<Function> *AM) {
+    return WorkItemAAResult(AM->getResult<TargetLibraryAnalysis>(F));
+}
+// Legacy pass entry point: (re)creates Result; returns false since the IR is not changed here.
+bool WorkItemAliasAnalysis::runOnFunction(llvm::Function &F) {
+    auto &TLIWP = getAnalysis<TargetLibraryInfoWrapperPass>();
+    Result.reset(new WorkItemAAResult(TLIWP.getTLI()));
+    return false;
+}
+
+#endif // LLVM 3.8+
+
 // Register this pass...
 char WorkItemAliasAnalysis::ID = 0;
 RegisterPass<WorkItemAliasAnalysis>
     X("wi-aa", "Work item alias analysis.", false, false);
 // Register it also to pass group
-RegisterAnalysisGroup<AliasAnalysis> Y(X);  
+#ifdef LLVM_OLDER_THAN_3_8
+RegisterAnalysisGroup<AliasAnalysis> Y(X);
+#else
+RegisterAnalysisGroup<WorkItemAAResult> Y(X);
+#endif
 
 FunctionPass *createWorkItemAliasAnalysisPass() {
     return new WorkItemAliasAnalysis();
@@ -122,17 +189,24 @@ extern "C" {
 void
 WorkItemAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
     AU.setPreservesAll();
+#ifdef LLVM_OLDER_THAN_3_8
     AliasAnalysis::getAnalysisUsage(AU);
+#else
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+#endif
 }
 
-
 /**
  * Test if memory locations are from different work items from same region.
  * Then they can not alias.
  */
+
 AliasResult
-WorkItemAliasAnalysis::alias(const Location &LocA,
-                             const Location &LocB) {
+#ifdef LLVM_OLDER_THAN_3_8
+WorkItemAliasAnalysis::alias(const Location &LocA, const Location &LocB) {
+#else
+WorkItemAAResult::alias(const Location &LocA, const Location &LocB) {
+#endif
     // If either of the memory references is empty, it doesn't matter what the
     // pointer values are. This allows the code below to ignore this special
     // case.
@@ -157,15 +231,10 @@ WorkItemAliasAnalysis::alias(const Location &LocA,
             // Fall back to other AAs.
             const MDNode* mdRegionA = dyn_cast<MDNode>(mdA->getOperand(1));
             const MDNode* mdRegionB = dyn_cast<MDNode>(mdB->getOperand(1)); 
-#ifdef LLVM_OLDER_THAN_3_6
-            ConstantInt* C1 = dyn_cast<ConstantInt>(mdRegionA->getOperand(1));
-            ConstantInt* C2 = dyn_cast<ConstantInt>(mdRegionB->getOperand(1));
-#else
             ConstantInt* C1 = dyn_cast<ConstantInt>(
               dyn_cast<ConstantAsMetadata>(mdRegionA->getOperand(1))->getValue());
             ConstantInt* C2 = dyn_cast<ConstantInt>(
               dyn_cast<ConstantAsMetadata>(mdRegionB->getOperand(1))->getValue());
-#endif
             if (C1->getValue() == C2->getValue()) {
                 // Now we have both locations from same region. Check for different
                 // work items.
@@ -174,16 +243,6 @@ WorkItemAliasAnalysis::alias(const Location &LocA,
                 assert(iXYZ->getNumOperands() == 4);
                 assert(jXYZ->getNumOperands() == 4);
 
-#ifdef LLVM_OLDER_THAN_3_6               
-                ConstantInt *CIX = dyn_cast<ConstantInt>(iXYZ->getOperand(1));
-                ConstantInt *CJX = dyn_cast<ConstantInt>(jXYZ->getOperand(1));
-
-                ConstantInt *CIY = dyn_cast<ConstantInt>(iXYZ->getOperand(2));
-                ConstantInt *CJY = dyn_cast<ConstantInt>(jXYZ->getOperand(2));
-                
-                ConstantInt *CIZ = dyn_cast<ConstantInt>(iXYZ->getOperand(3));
-                ConstantInt *CJZ = dyn_cast<ConstantInt>(jXYZ->getOperand(3));
-#else
                 ConstantInt *CIX = 
                   dyn_cast<ConstantInt>(
                       dyn_cast<ConstantAsMetadata>(
@@ -210,7 +269,6 @@ WorkItemAliasAnalysis::alias(const Location &LocA,
                   dyn_cast<ConstantInt>(
                     dyn_cast<ConstantAsMetadata>(
                       jXYZ->getOperand(3))->getValue());
-#endif
                 
                 if ( !(CIX->getValue() == CJX->getValue()
                     && CIY->getValue() == CJY->getValue()
@@ -221,6 +279,10 @@ WorkItemAliasAnalysis::alias(const Location &LocA,
         }
     }
   
+#ifdef LLVM_OLDER_THAN_3_8
     // Forward the query to the next analysis.
     return AliasAnalysis::alias(LocA, LocB);
+#else
+    return AAResultBase::alias(LocA, LocB); // base-class default; calling ourselves here never returns
+#endif
 }
diff --git a/lib/llvmopencl/Workgroup.cc b/lib/llvmopencl/Workgroup.cc
index d06bd71..f8859ef 100644
--- a/lib/llvmopencl/Workgroup.cc
+++ b/lib/llvmopencl/Workgroup.cc
@@ -22,36 +22,18 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <cstdio>
+#include <map>
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "Barrier.h"
-#include "Workgroup.h"
+#include "pocl.h"
 
-#include "CanonicalizeBarriers.h"
-#include "BarrierTailReplication.h"
-#include "WorkitemReplication.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
-#include "config.h"
-#ifdef LLVM_3_1
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/Support/TypeBuilder.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Module.h"
-#elif defined LLVM_3_2
-#include "llvm/IRBuilder.h"
-#include "llvm/TypeBuilder.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/TypeBuilder.h"
 #include "llvm/IR/BasicBlock.h"
@@ -59,15 +41,16 @@ IGNORE_COMPILER_WARNING("-Wunused-parameter")
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Module.h"
-#endif
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include <cstdio>
-#include <map>
-#include <iostream>
 
-#include "pocl.h"
+#include "CanonicalizeBarriers.h"
+#include "BarrierTailReplication.h"
+#include "WorkitemReplication.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+
 
 #if _MSC_VER
 #  include "vccompat.hpp"
@@ -163,20 +146,7 @@ bool
 Workgroup::runOnModule(Module &M)
 {
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  if (M.getPointerSize() == llvm::Module::Pointer64)
-    {
-      TypeBuilder<PoclContext, true>::setSizeTWidth(64);
-    }
-  else if (M.getPointerSize() == llvm::Module::Pointer32) 
-    {
-      TypeBuilder<PoclContext, true>::setSizeTWidth(32);
-    }
-  else 
-    {
-      assert (false && "Target has an unsupported pointer width.");
-    }  
-#elif (defined LLVM_3_5 || defined LLVM_3_6)
+#ifdef LLVM_OLDER_THAN_3_7
   if (M.getDataLayout()->getPointerSize(0) == 8)
     {
       TypeBuilder<PoclContext, true>::setSizeTWidth(64);
@@ -211,13 +181,9 @@ Workgroup::runOnModule(Module &M)
 
   for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
     if (!isKernelToProcess(*i)) continue;
-    Function *L = createLauncher(M, i);
+    Function *L = createLauncher(M, &*i);
       
-#if defined LLVM_3_2
-    L->addFnAttr(Attributes::NoInline);
-#else
     L->addFnAttr(Attribute::NoInline);
-#endif
 
     privatizeContext(M, L);
 
@@ -260,7 +226,7 @@ createLauncher(Module &M, Function *F)
   SmallVector<Value *, 8> arguments;
   Function::arg_iterator ai = L->arg_begin();
   for (unsigned i = 0, e = F->getArgumentList().size(); i != e; ++i)  {
-    arguments.push_back(ai);
+    arguments.push_back(&*ai);
     ++ai;
   }  
 
@@ -274,11 +240,11 @@ createLauncher(Module &M, Function *F)
 
   IRBuilder<> builder(BasicBlock::Create(M.getContext(), "", L));
 
-#if defined LLVM_OLDER_THAN_3_7
+#ifdef LLVM_OLDER_THAN_3_7
   ptr = builder.CreateStructGEP(ai,
 				TypeBuilder<PoclContext, true>::WORK_DIM);
 #else
-  ptr = builder.CreateStructGEP(ai->getType()->getPointerElementType(), ai,
+  ptr = builder.CreateStructGEP(ai->getType()->getPointerElementType(), &*ai,
                                 TypeBuilder<PoclContext, true>::WORK_DIM);
 #endif
   gv = M.getGlobalVariable("_work_dim");
@@ -289,20 +255,18 @@ createLauncher(Module &M, Function *F)
 
 
   int size_t_width = 32;
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  if (M.getPointerSize() == llvm::Module::Pointer64)
-#elif (defined LLVM_OLDER_THAN_3_7)
+#ifdef LLVM_OLDER_THAN_3_7
   if (M.getDataLayout()->getPointerSize(0) == 8)
 #else
   if (M.getDataLayout().getPointerSize(0) == 8)
 #endif
     size_t_width = 64;
 
-#if defined LLVM_OLDER_THAN_3_7
+#ifdef LLVM_OLDER_THAN_3_7
   ptr = builder.CreateStructGEP(ai,
 				TypeBuilder<PoclContext, true>::GROUP_ID);
 #else
-  ptr = builder.CreateStructGEP(ai->getType()->getPointerElementType(), ai,
+  ptr = builder.CreateStructGEP(ai->getType()->getPointerElementType(), &*ai,
                                 TypeBuilder<PoclContext, true>::GROUP_ID);
 #endif
   for (int i = 0; i < 3; ++i) {
@@ -329,7 +293,7 @@ createLauncher(Module &M, Function *F)
   ptr = builder.CreateStructGEP(ai,
 				TypeBuilder<PoclContext, true>::NUM_GROUPS);
 #else
-  ptr = builder.CreateStructGEP(ai->getType()->getPointerElementType(), ai,
+  ptr = builder.CreateStructGEP(ai->getType()->getPointerElementType(), &*ai,
                                 TypeBuilder<PoclContext, true>::NUM_GROUPS);
 #endif
   for (int i = 0; i < 3; ++i) {
@@ -356,7 +320,7 @@ createLauncher(Module &M, Function *F)
   ptr = builder.CreateStructGEP(ai,
 				TypeBuilder<PoclContext, true>::GLOBAL_OFFSET);
 #else
-  ptr = builder.CreateStructGEP(ai->getType()->getPointerElementType(), ai,
+  ptr = builder.CreateStructGEP(ai->getType()->getPointerElementType(), &*ai,
                                 TypeBuilder<PoclContext, true>::GLOBAL_OFFSET);
 #endif
   for (int i = 0; i < 3; ++i) {
@@ -551,7 +515,7 @@ createWorkgroup(Module &M, Function *F)
        ii != ee; ++ii) {
     Type *t = ii->getType();
 
-    Value *gep = builder.CreateGEP(ai,
+    Value *gep = builder.CreateGEP(&*ai,
             ConstantInt::get(IntegerType::get(M.getContext(), 32), i));
     Value *pointer = builder.CreateLoad(gep);
 
@@ -559,17 +523,9 @@ createWorkgroup(Module &M, Function *F)
      * as is to the function, no need to load form it first. */
     Value *value;
     if (ii->hasByValAttr()) {
-#if defined(LLVM_3_2) || defined(LLVM_3_3)
-        value = builder.CreateBitCast(pointer, t);
-#else
         value = builder.CreatePointerCast(pointer, t);
-#endif
     } else {
-#if defined(LLVM_3_2) || defined(LLVM_3_3)
-        value = builder.CreateBitCast(pointer, t->getPointerTo());
-#else
         value = builder.CreatePointerCast(pointer, t->getPointerTo());
-#endif
         value = builder.CreateLoad(value);
     }
 
@@ -577,7 +533,7 @@ createWorkgroup(Module &M, Function *F)
     ++i;
   }
 
-  arguments.back() = ++ai;
+  arguments.back() = &*(++ai);
   
   builder.CreateCall(F, ArrayRef<Value*>(arguments));
   builder.CreateRetVoid();
@@ -611,11 +567,7 @@ createWorkgroupFast(Module &M, Function *F)
     dyn_cast<Function>(M.getOrInsertFunction(funcName + "_workgroup_fast", ft));
   assert(workgroup != NULL);
 
-#if defined LLVM_3_2
-  workgroup->addFnAttr(Attributes::NoInline);
-#else
   workgroup->addFnAttr(Attribute::NoInline);
-#endif
 
   builder.SetInsertPoint(BasicBlock::Create(M.getContext(), "", workgroup));
 
@@ -626,18 +578,14 @@ createWorkgroupFast(Module &M, Function *F)
   for (Function::const_arg_iterator ii = F->arg_begin(), ee = F->arg_end();
        ii != ee; ++i, ++ii) {
     Type *t = ii->getType();
-    Value *gep = builder.CreateGEP(ai, 
+    Value *gep = builder.CreateGEP(&*ai,
             ConstantInt::get(IntegerType::get(M.getContext(), 32), i));
     Value *pointer = builder.CreateLoad(gep);
      
     if (t->isPointerTy()) {
       if (!ii->hasByValAttr()) {
         /* Assume the pointer is directly in the arg array. */
-#if defined(LLVM_3_2) || defined(LLVM_3_3)
-        arguments.push_back(builder.CreateBitCast(pointer, t));
-#else
         arguments.push_back(builder.CreatePointerCast(pointer, t));
-#endif
         continue;
       }
 
@@ -649,13 +597,6 @@ createWorkgroupFast(Module &M, Function *F)
     /* If it's a pass by value pointer argument, we just pass the pointer
      * as is to the function, no need to load from it first. */
     Value *value;
-#if defined(LLVM_3_2) || defined(LLVM_3_3)
-    if (!ii->hasByValAttr() || ((PointerType*)t)->getAddressSpace() == 1)
-      value = builder.CreateBitCast
-        (pointer, t->getPointerTo(POCL_ADDRESS_SPACE_GLOBAL));
-    else
-      value = builder.CreateBitCast(pointer, t->getPointerTo());
-#else
 
     if (!ii->hasByValAttr() || ((PointerType*)t)->getAddressSpace() == 1)
       value = builder.CreatePointerCast
@@ -663,7 +604,6 @@ createWorkgroupFast(Module &M, Function *F)
     else
       value = builder.CreatePointerCast(pointer, t->getPointerTo());
 
-#endif
     if (!ii->hasByValAttr()) {
       value = builder.CreateLoad(value);
     }
@@ -671,7 +611,7 @@ createWorkgroupFast(Module &M, Function *F)
     arguments.push_back(value);
   }
 
-  arguments.back() = ++ai;
+  arguments.back() = &*(++ai);
   
   builder.CreateCall(F, ArrayRef<Value*>(arguments));
   builder.CreateRetVoid();
@@ -700,14 +640,10 @@ Workgroup::isKernelToProcess(const Function &F)
   for (unsigned i = 0, e = kernels->getNumOperands(); i != e; ++i) {
     if (kernels->getOperand(i)->getOperand(0) == NULL)
       continue; // globaldce might have removed uncalled kernels
-#ifdef LLVM_OLDER_THAN_3_6
-    Function *k = cast<Function>(kernels->getOperand(i)->getOperand(0));
-#else
     Function *k = 
       cast<Function>(
         dyn_cast<ValueAsMetadata>(kernels->getOperand(i)->getOperand(0))
           ->getValue());
-#endif
     if (&F == k)
       return true;
   }
@@ -724,7 +660,7 @@ Workgroup::hasWorkgroupBarriers(const Function &F)
 {
   for (llvm::Function::const_iterator i = F.begin(), e = F.end();
        i != e; ++i) {
-    const llvm::BasicBlock* bb = i;
+    const llvm::BasicBlock* bb = &*i;
     if (Barrier::hasBarrier(bb)) {
 
       // Ignore the implicit entry and exit barriers.
diff --git a/lib/llvmopencl/Workgroup.h b/lib/llvmopencl/Workgroup.h
index 48d2c3a..e406857 100644
--- a/lib/llvmopencl/Workgroup.h
+++ b/lib/llvmopencl/Workgroup.h
@@ -24,11 +24,8 @@
 #define _POCL_WORKGROUP_H
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Module.h"
-#else
+
 #include "llvm/IR/Module.h"
-#endif
 #include "llvm/Pass.h"
 
 namespace pocl {
diff --git a/lib/llvmopencl/WorkitemHandler.cc b/lib/llvmopencl/WorkitemHandler.cc
index 544d52f..82b8d0d 100644
--- a/lib/llvmopencl/WorkitemHandler.cc
+++ b/lib/llvmopencl/WorkitemHandler.cc
@@ -22,31 +22,24 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#include <sstream>
+#include <iostream>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "config.h"
-#include <sstream>
-#include <iostream>
+#include "pocl.h"
 
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Metadata.h"
-#include "llvm/Constants.h"
-#include "llvm/Module.h"
-#include "llvm/Instructions.h"
-#include "llvm/ValueSymbolTable.h"
-#else
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/ValueSymbolTable.h"
-#endif
 #include "llvm/Support/CommandLine.h"
+
 #include "WorkitemHandler.h"
 #include "Kernel.h"
 #include "DebugHelpers.h"
-#include "pocl.h"
 
 POP_COMPILER_DIAGS
 
@@ -92,13 +85,6 @@ WorkitemHandler::Initialize(Kernel *K) {
   if (size_info) {
     for (unsigned i = 0, e = size_info->getNumOperands(); i != e; ++i) {
       llvm::MDNode *KernelSizeInfo = size_info->getOperand(i);
-#ifdef LLVM_OLDER_THAN_3_6
-      if (KernelSizeInfo->getOperand(0) != K) 
-        continue;
-      LocalSizeX = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(1)))->getLimitedValue();
-      LocalSizeY = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(2)))->getLimitedValue();
-      LocalSizeZ = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(3)))->getLimitedValue();
-#else
       if (dyn_cast<ValueAsMetadata>(
         KernelSizeInfo->getOperand(0).get())->getValue() != K) 
         continue;
@@ -112,7 +98,6 @@ WorkitemHandler::Initialize(Kernel *K) {
       LocalSizeZ = (llvm::cast<ConstantInt>(
                      llvm::dyn_cast<ConstantAsMetadata>(
                        KernelSizeInfo->getOperand(3))->getValue()))->getLimitedValue();
-#endif
       break;
     }
   }
@@ -128,14 +113,7 @@ WorkitemHandler::Initialize(Kernel *K) {
 
   llvm::Type *localIdType; 
   size_t_width = 0;
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  if (M->getPointerSize() == llvm::Module::Pointer64)
-    size_t_width = 64;
-  else if (M->getPointerSize() == llvm::Module::Pointer32)
-    size_t_width = 32;
-  else
-    assert (false && "Only 32 and 64 bit size_t widths supported.");
-#elif (defined LLVM_OLDER_THAN_3_7)
+#ifdef LLVM_OLDER_THAN_3_7
   if (M->getDataLayout()->getPointerSize(0) == 8)
     size_t_width = 64;
   else if (M->getDataLayout()->getPointerSize(0) == 4)
@@ -159,16 +137,10 @@ WorkitemHandler::Initialize(Kernel *K) {
 }
 
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-bool
-WorkitemHandler::dominatesUse
-(llvm::DominatorTree *DT, Instruction &I, unsigned i) {
-#else
 bool
 WorkitemHandler::dominatesUse
 (llvm::DominatorTreeWrapperPass *DTP, Instruction &I, unsigned i) {
   DominatorTree *DT = &DTP->getDomTree();
-#endif
   Instruction *Op = cast<Instruction>(I.getOperand(i));
   BasicBlock *OpBlock = Op->getParent();
   PHINode *PN = dyn_cast<PHINode>(&I);
@@ -225,22 +197,16 @@ WorkitemHandler::dominatesUse
    the old one. This should ensure the reachability without 
    the costly dominance analysis.
 */
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-bool
-WorkitemHandler::fixUndominatedVariableUses(llvm::DominatorTree *DT, 
-                                            llvm::Function &F) 
-#else
 bool
 WorkitemHandler::fixUndominatedVariableUses(llvm::DominatorTreeWrapperPass *DT,
                                             llvm::Function &F)
-#endif
 {
   bool changed = false;
   DT->runOnFunction(F);
 
   for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) 
     {
-      llvm::BasicBlock *bb = i;
+      llvm::BasicBlock *bb = &*i;
       for (llvm::BasicBlock::iterator ins = bb->begin(), inse = bb->end();
            ins != inse; ++ins)
         {
diff --git a/lib/llvmopencl/WorkitemHandler.h b/lib/llvmopencl/WorkitemHandler.h
index b42682c..feb8e50 100644
--- a/lib/llvmopencl/WorkitemHandler.h
+++ b/lib/llvmopencl/WorkitemHandler.h
@@ -28,16 +28,9 @@
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "config.h"
-#if (defined LLVM_3_1 || defined LLVM_3_2)
-#include "llvm/Function.h"
-#else
-#include "llvm/IR/Function.h"
-#endif
 
-#if ! (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
+#include "llvm/IR/Function.h"
 #include "llvm/IR/Dominators.h"
-#endif
-
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 
@@ -64,13 +57,8 @@ namespace pocl {
   protected:
     
     void movePhiNodes(llvm::BasicBlock* src, llvm::BasicBlock* dst);
-    #if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-    bool fixUndominatedVariableUses(llvm::DominatorTree *DT, llvm::Function &F);
-    bool dominatesUse(llvm::DominatorTree *DT, llvm::Instruction &I, unsigned i);
-    #else
     bool fixUndominatedVariableUses(llvm::DominatorTreeWrapperPass *DT, llvm::Function &F);
     bool dominatesUse(llvm::DominatorTreeWrapperPass *DT, llvm::Instruction &I, unsigned i);
-    #endif
 
     unsigned size_t_width;
 
diff --git a/lib/llvmopencl/WorkitemHandlerChooser.cc b/lib/llvmopencl/WorkitemHandlerChooser.cc
index 8fac3b2..62b0db4 100644
--- a/lib/llvmopencl/WorkitemHandlerChooser.cc
+++ b/lib/llvmopencl/WorkitemHandlerChooser.cc
@@ -23,6 +23,11 @@
 
 #define DEBUG_TYPE "workitem-loops"
 
+#include <iostream>
+
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+
 #include "WorkitemHandlerChooser.h"
 #include "WorkitemLoops.h"
 #include "WorkitemReplication.h"
@@ -30,11 +35,6 @@
 #include "CanonicalizeBarriers.h"
 #include "Kernel.h"
 
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/LoopInfo.h"
-
-#include <iostream>
-
 using namespace llvm;
 using namespace pocl;
 
@@ -44,7 +44,6 @@ namespace {
       "workitem-handler-chooser", 
       "Finds the best way to handle work-items to produce a multi-WG function.",
       false, false);
-  
 }
 
 namespace pocl {
diff --git a/lib/llvmopencl/WorkitemLoops.cc b/lib/llvmopencl/WorkitemLoops.cc
index 859bbc1..e43a486 100644
--- a/lib/llvmopencl/WorkitemLoops.cc
+++ b/lib/llvmopencl/WorkitemLoops.cc
@@ -23,47 +23,31 @@
 
 #define DEBUG_TYPE "workitem-loops"
 
-#include "WorkitemLoops.h"
-#include "Workgroup.h"
-#include "Barrier.h"
-#include "Kernel.h"
-#include "config.h"
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
 #include "pocl.h"
 
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Support/CommandLine.h"
-#ifdef LLVM_3_1
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/Support/TypeBuilder.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/ValueSymbolTable.h"
-#elif defined LLVM_3_2
-#include "llvm/IRBuilder.h"
-#include "llvm/TypeBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/ValueSymbolTable.h"
-#else
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/TypeBuilder.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
-#endif
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
+#include "WorkitemLoops.h"
+#include "Workgroup.h"
+#include "Barrier.h"
+#include "Kernel.h"
 #include "WorkitemHandlerChooser.h"
 
-#include <iostream>
-#include <map>
-#include <sstream>
-#include <vector>
-
 //#define DUMP_CFGS
 
 #include "DebugHelpers.h"
@@ -95,14 +79,7 @@ WorkitemLoops::getAnalysisUsage(AnalysisUsage &AU) const
 #else
   AU.addRequired<LoopInfoWrapperPass>();
 #endif
-#ifdef LLVM_3_1
-  AU.addRequired<TargetData>();
-#endif
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DominatorTree>();
-#else
   AU.addRequired<DominatorTreeWrapperPass>();
-#endif
 
   AU.addRequired<VariableUniformityAnalysis>();
   AU.addPreserved<pocl::VariableUniformityAnalysis>();
@@ -122,12 +99,8 @@ WorkitemLoops::runOnFunction(Function &F)
       pocl::WorkitemHandlerChooser::POCL_WIH_LOOPS)
     return false;
 
-  #if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DT = &getAnalysis<DominatorTree>();
-  #else
   DTP = &getAnalysis<DominatorTreeWrapperPass>();
   DT = &DTP->getDomTree();
-  #endif
 #ifdef LLVM_OLDER_THAN_3_7
   LI = &getAnalysis<LoopInfo>();
 #else
@@ -151,11 +124,7 @@ WorkitemLoops::runOnFunction(Function &F)
   F.viewCFG();
 #endif
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  changed |= fixUndominatedVariableUses(DT, F);
-#else
   changed |= fixUndominatedVariableUses(DTP, F);
-#endif
 
 #if 0
   /* Split large BBs so we can print the Dot without it crashing. */
@@ -243,11 +212,7 @@ WorkitemLoops::CreateLoopAround
     BasicBlock::Create(C, "pregion_for_cond", F, exitBB);
 
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DT->runOnFunction(*F);
-#else
   DTP->runOnFunction(*F);
-#endif
 
   //  F->viewCFG();
   /* Fix the old edges jumping to the region to jump to the basic block
@@ -314,12 +279,10 @@ WorkitemLoops::CreateLoopAround
 
   /* This creation of the identifier metadata is copied from
      LLVM's MDBuilder::createAnonymousTBAARoot(). */
-#ifdef LLVM_3_7
-  MDNode *Dummy = MDNode::getTemporary(C, ArrayRef<Metadata*>()).release();
-#elif LLVM_OLDER_THAN_3_6
-  MDNode *Dummy = MDNode::getTemporary(C, ArrayRef<Value*>());
-#elif LLVM_OLDER_THAN_3_7
+#ifdef LLVM_OLDER_THAN_3_7
   MDNode *Dummy = MDNode::getTemporary(C, ArrayRef<Metadata*>());
+#else
+  MDNode *Dummy = MDNode::getTemporary(C, ArrayRef<Metadata*>()).release();
 #endif
 
   MDNode *Root = MDNode::get(C, Dummy);
@@ -332,11 +295,7 @@ WorkitemLoops::CreateLoopAround
   // We now have
   //   !1 = metadata !{metadata !1} <- self-referential root
 
-#ifdef LLVM_3_3
-  loopBranch->setMetadata("llvm.loop.parallel", Root);
-#else
   loopBranch->setMetadata("llvm.loop", Root);
-#endif
   region.AddParallelLoopMetadata(Root);
 
 
@@ -386,7 +345,7 @@ WorkitemLoops::ProcessFunction(Function &F)
           original_parallel_regions);
 #endif
 
-  IRBuilder<> builder(F.getEntryBlock().getFirstInsertionPt());
+  IRBuilder<> builder(&*(F.getEntryBlock().getFirstInsertionPt()));
   localIdXFirstVar = 
     builder.CreateAlloca
     (IntegerType::get(F.getContext(), size_t_width), 0, ".pocl.local_id_x_init");
@@ -583,7 +542,7 @@ WorkitemLoops::ProcessFunction(Function &F)
 
     if (!peeledRegion[pr]) continue;
     pr->insertPrologue(0, 0, 0);
-    builder.SetInsertPoint(pr->entryBB()->getFirstInsertionPt());
+    builder.SetInsertPoint(&*(pr->entryBB()->getFirstInsertionPt()));
     builder.CreateStore
       (ConstantInt::get(IntegerType::get(F.getContext(), size_t_width), 1), 
        localIdXFirstVar);       
@@ -623,7 +582,7 @@ WorkitemLoops::FixMultiRegionVariables(ParallelRegion *region)
       for (llvm::BasicBlock::iterator instr = bb->begin();
            instr != bb->end(); ++instr) 
         {
-          llvm::Instruction *instruction = instr;
+          llvm::Instruction *instruction = &*instr;
           instructionsInRegion.insert(instruction);
         }
     }
@@ -637,19 +596,15 @@ WorkitemLoops::FixMultiRegionVariables(ParallelRegion *region)
       for (llvm::BasicBlock::iterator instr = bb->begin();
            instr != bb->end(); ++instr) 
         {
-          llvm::Instruction *instruction = instr;
+          llvm::Instruction *instruction = &*instr;
 
-          if (ShouldNotBeContextSaved(instr)) continue;
+          if (ShouldNotBeContextSaved(&*instr)) continue;
 
           for (Instruction::use_iterator ui = instruction->use_begin(),
                  ue = instruction->use_end();
                ui != ue; ++ui) 
             {
-#if defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4
-              llvm::Instruction *user = dyn_cast<Instruction>(*ui);
-#else
               llvm::Instruction *user = dyn_cast<Instruction>(ui->getUser());
-#endif
 
               if (user == NULL) continue;
               // If the instruction is used outside this region inside another
@@ -696,12 +651,15 @@ WorkitemLoops::AddContextSave
     }
 
   /* Save the produced variable to the array. */
-  BasicBlock::iterator definition = dyn_cast<Instruction>(instruction);
-
+#ifdef LLVM_OLDER_THAN_3_8
+  BasicBlock::iterator definition = (dyn_cast<Instruction>(instruction));
+#else
+  BasicBlock::iterator definition = (dyn_cast<Instruction>(instruction))->getIterator();
+#endif
   ++definition;
   while (isa<PHINode>(definition)) ++definition;
 
-  IRBuilder<> builder(definition); 
+  IRBuilder<> builder(&*definition);
   std::vector<llvm::Value *> gepArgs;
   gepArgs.push_back(ConstantInt::get(IntegerType::get(instruction->getContext(), size_t_width), 0));
 
@@ -801,7 +759,8 @@ WorkitemLoops::GetContextArray(llvm::Instruction *instruction)
   if (contextArrays.find(varName) != contextArrays.end())
     return contextArrays[varName];
 
-  IRBuilder<> builder(instruction->getParent()->getParent()->getEntryBlock().getFirstInsertionPt());
+  BasicBlock &bb = instruction->getParent()->getParent()->getEntryBlock();
+  IRBuilder<> builder(&*(bb.getFirstInsertionPt()));
 
   llvm::Type *elementType;
   if (isa<AllocaInst>(instruction))
@@ -887,11 +846,7 @@ WorkitemLoops::AddContextSaveRestore
          ue = instruction->use_end();
        ui != ue; ++ui) 
     {
-#if defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4
-      llvm::Instruction *user = cast<Instruction>(*ui);
-#else
       llvm::Instruction *user = cast<Instruction>(ui->getUser());
-#endif
       if (user == NULL) continue;
       if (user == theStore) continue;
       uses.push_back(user);
diff --git a/lib/llvmopencl/WorkitemLoops.h b/lib/llvmopencl/WorkitemLoops.h
index 7c4429d..b4d56e8 100644
--- a/lib/llvmopencl/WorkitemLoops.h
+++ b/lib/llvmopencl/WorkitemLoops.h
@@ -23,17 +23,15 @@
 #ifndef _POCL_WORKITEM_LOOPS_H
 #define _POCL_WORKITEM_LOOPS_H
 
-#include "pocl.h"
+#include <map>
+#include <vector>
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-#include "llvm/Analysis/Dominators.h"
-#endif
+#include "pocl.h"
 
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
-#include <map>
-#include <vector>
+
 #include "WorkitemHandler.h"
 #include "ParallelRegion.h"
 
@@ -68,9 +66,7 @@ namespace pocl {
     llvm::LoopInfoWrapperPass *LI;
 #endif
     llvm::PostDominatorTree *PDT;
-#if ! (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
     llvm::DominatorTreeWrapperPass *DTP;
-#endif
 
     ParallelRegion::ParallelRegionVector *original_parallel_regions;
 
diff --git a/lib/llvmopencl/WorkitemReplication.cc b/lib/llvmopencl/WorkitemReplication.cc
index 2c52acf..c101217 100644
--- a/lib/llvmopencl/WorkitemReplication.cc
+++ b/lib/llvmopencl/WorkitemReplication.cc
@@ -24,48 +24,34 @@
 
 #define DEBUG_TYPE "workitem"
 
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
 #include "CompilerWarnings.h"
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
-#include "WorkitemReplication.h"
-#include "Workgroup.h"
-#include "Barrier.h"
-#include "Kernel.h"
+#include "pocl.h"
+
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Support/CommandLine.h"
-#include "config.h"
-#include "pocl.h"
-
-#ifdef LLVM_3_1
-#include "llvm/Support/IRBuilder.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/ValueSymbolTable.h"
-#elif defined LLVM_3_2
-#include "llvm/IRBuilder.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/ValueSymbolTable.h"
-#else
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
-#endif
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#include "WorkitemReplication.h"
+#include "Workgroup.h"
+#include "Barrier.h"
+#include "Kernel.h"
 #include "WorkitemHandlerChooser.h"
 #include "DebugHelpers.h"
 #include "VariableUniformityAnalysis.h"
 
-#include <iostream>
-#include <map>
-#include <sstream>
-#include <vector>
-
 //#define DEBUG_BB_MERGING
 //#define DUMP_RESULT_CFG
 //#define DEBUG_PR_REPLICATION
@@ -92,22 +78,14 @@ char WorkitemReplication::ID = 0;
 void
 WorkitemReplication::getAnalysisUsage(AnalysisUsage &AU) const
 {
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DominatorTree>();
-#else
   AU.addRequired<DominatorTreeWrapperPass>();
-#endif
 
 #ifdef LLVM_OLDER_THAN_3_7
   AU.addRequired<LoopInfo>();
 #else
   AU.addRequired<LoopInfoWrapperPass>();
 #endif
-#ifdef LLVM_3_1
-  AU.addRequired<TargetData>();
-#elif (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  AU.addRequired<DataLayout>();
-#elif (defined LLVM_OLDER_THAN_3_7)
+#ifdef LLVM_OLDER_THAN_3_7
   AU.addRequired<DataLayoutPass>();
 #endif
   AU.addRequired<pocl::WorkitemHandlerChooser>();
@@ -124,12 +102,8 @@ WorkitemReplication::runOnFunction(Function &F)
       pocl::WorkitemHandlerChooser::POCL_WIH_FULL_REPLICATION)
     return false;
 
-  #if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DT = &getAnalysis<DominatorTree>();
-  #else
   DTP = &getAnalysis<DominatorTreeWrapperPass>();
   DT = &DTP->getDomTree();
-  #endif
 
 #ifdef LLVM_OLDER_THAN_3_7
   LI = &getAnalysis<LoopInfo>();
@@ -143,11 +117,7 @@ WorkitemReplication::runOnFunction(Function &F)
   cfgPrinter->runOnFunction(F);
 #endif
 
-  #if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  changed |= fixUndominatedVariableUses(DT, F);
-  #else
   changed |= fixUndominatedVariableUses(DTP, F);
-  #endif
   return changed;
 }
 
@@ -169,8 +139,8 @@ WorkitemReplication::ProcessFunction(Function &F)
 
   BasicBlockVector original_bbs;
   for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
-      if (!Barrier::hasBarrier(i))
-        original_bbs.push_back(i);
+      if (!Barrier::hasBarrier(&*i))
+        original_bbs.push_back(&*i);
   }
 
 #ifdef LLVM_OLDER_THAN_3_7
@@ -202,15 +172,7 @@ WorkitemReplication::ProcessFunction(Function &F)
 #endif
   
   // Measure the required context (variables alive in more than one region).
-#ifdef LLVM_3_1
-  TargetData &TD = getAnalysis<TargetData>();
-#elif (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-  DataLayout &TD = getAnalysis<DataLayout>();
-#elif (defined LLVM_OLDER_THAN_3_7)
-  const DataLayout &TD = getAnalysis<DataLayoutPass>().getDataLayout();
-#else
   const DataLayout &TD = F.getParent()->getDataLayout();
-#endif
 
   for (SmallVector<ParallelRegion *, 8>::iterator
          i = original_parallel_regions->begin(), 
@@ -227,11 +189,7 @@ WorkitemReplication::ProcessFunction(Function &F)
         for (Value::use_iterator i4 = i3->use_begin(), e4 = i3->use_end();
              i4 != e4; ++i4) {
           // Instructions can only be used by instructions.
-#if defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4
-          llvm::Instruction *user = cast<Instruction>(*i4);
-#else
           llvm::Instruction *user = cast<Instruction>(i4->getUser());
-#endif
           
           if (find (pr->begin(), pr->end(), user->getParent()) ==
               pr->end()) {
diff --git a/lib/llvmopencl/WorkitemReplication.h b/lib/llvmopencl/WorkitemReplication.h
index 843ec4a..1fbf5b4 100644
--- a/lib/llvmopencl/WorkitemReplication.h
+++ b/lib/llvmopencl/WorkitemReplication.h
@@ -24,19 +24,16 @@
 #ifndef _POCL_WORKITEM_REPLICATION_H
 #define _POCL_WORKITEM_REPLICATION_H
 
-#include "config.h"
+#include <map>
+#include <vector>
+
 #include "pocl.h"
 
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-#include "llvm/Analysis/Dominators.h"
-#else
 #include "llvm/IR/Dominators.h"
-#endif
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
-#include <map>
-#include <vector>
+
 #include "WorkitemHandler.h"
 
 namespace pocl {
@@ -56,9 +53,7 @@ namespace pocl {
   private:
 
     llvm::DominatorTree *DT;
-#if ! (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
     llvm::DominatorTreeWrapperPass *DTP;
-#endif
 
 #ifdef LLVM_OLDER_THAN_3_7
     llvm::LoopInfo *LI;
diff --git a/lib/llvmopencl/linker.cpp b/lib/llvmopencl/linker.cpp
index 3208b1b..797b161 100644
--- a/lib/llvmopencl/linker.cpp
+++ b/lib/llvmopencl/linker.cpp
@@ -12,26 +12,20 @@
    licence. See file COPYING.
  */
 
+#include <list>
+#include <iostream>
+
 #include "config.h"
 #include "pocl.h"
 
-#ifdef LLVM_3_2
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#else
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
-#endif
-
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
-#include <list>
-#include <iostream>
-
 #include "linker.h"
+
 using namespace llvm;
 
 //#include <cstdio>
@@ -57,6 +51,90 @@ find_from_list(llvm::StringRef             needle,
     }
     return false;
 }
+
+/* Forces every pointer-to-"opencl.image*" struct argument of Func into
+ * POCL_ADDRESS_SPACE_GLOBAL by mutating the argument type in place. Func's
+ * FunctionType is left as is, so only CopyFunc uses this (kernel library). */
+static void fixOpenCLimageArguments(llvm::Function *Func) {
+  Function::arg_iterator b = Func->arg_begin();
+  Function::arg_iterator e = Func->arg_end();
+  for (; b != e; b++)  {
+      Argument *j = &*b;
+      Type *t = j->getType();
+      if (t->isPointerTy() && t->getPointerElementType()->isStructTy()) {
+        Type *pe_type = t->getPointerElementType();
+        if (pe_type->getStructName().startswith("opencl.image"))  {
+          // Same pointee struct type, but in the global address space.
+          Type *new_t = PointerType::get(pe_type, POCL_ADDRESS_SPACE_GLOBAL);
+          j->mutateType(new_t);  // Argument only; FunctionType unchanged
+        }// else
+          // (not an OpenCL image struct: the argument type is left untouched)
+      }
+  }
+
+}
+
+/* Fixes opencl.imageX_t pointer arguments that are not in the global address
+ * space. Returns F if nothing needs fixing; otherwise F is deleted and a clone
+ * with corrected argument types is returned. For user kernels, not the kernel
+ * library. */
+static llvm::Function *
+CloneFuncFixOpenCLImageT(llvm::Module *Mod, llvm::Function *F)
+{
+  assert(F && "No function to copy");
+  assert(!F->isDeclaration());
+
+  int changed = 0;  // set to 1 once any argument needs a new type
+  ValueToValueMapTy VVMap;
+  SmallVector<Type *, 8> sv;  // parameter types for the replacement function
+  for (Function::arg_iterator i = F->arg_begin(), e = F->arg_end();
+       i != e; ++i) {
+      Argument *j = &*i;
+      Type *t = j->getType();
+      Type *new_t = t;  // default: keep the argument type as it is
+      if (t->isPointerTy() && t->getPointerElementType()->isStructTy()) {
+          Type *pe_type = t->getPointerElementType();
+          if (pe_type->getStructName().startswith("opencl.image")) {
+            if (t->getPointerAddressSpace() != POCL_ADDRESS_SPACE_GLOBAL) {
+              new_t = PointerType::get(pe_type, POCL_ADDRESS_SPACE_GLOBAL);
+              changed = 1;
+              }
+            }
+        }
+      sv.push_back(new_t);
+    }
+
+    if (!changed)
+      return F;  // all image arguments already global: no clone needed
+
+    F->removeFromParent();  // presumably frees F's name for the clone below
+
+    FunctionType *NewFT = FunctionType::get(F->getReturnType(),
+                                         ArrayRef<Type *> (sv),
+                                         false);
+    assert(NewFT);
+    llvm::Function *DstFunc = nullptr;
+
+    DstFunc = Function::Create(NewFT, F->getLinkage(), F->getName(), Mod);
+
+    Function::arg_iterator j = DstFunc->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(),
+         e = F->arg_end();
+         i != e; ++i) {
+        j->setName(i->getName());
+        VVMap[&*i] = &*j;  // map each old argument to its retyped counterpart
+        ++j;
+    }
+
+    DstFunc->copyAttributesFrom(F);
+
+    SmallVector<ReturnInst*, 8> RI;          // Ignore returns cloned.
+    CloneFunctionInto(DstFunc, F, VVMap, true, RI);
+    delete F;  // F was unlinked above; only the returned clone remains valid
+
+    return DstFunc;
+}
+
 /* Find all functions in the calltree of F, append their
  * name to list.
  */
@@ -124,13 +202,14 @@ CopyFunc( const llvm::StringRef Name,
          e=SrcFunc->arg_end();
          i != e; ++i) {
         j->setName(i->getName());
-        VVMap[i]=j;
+        VVMap[&*i] = &*j;
         ++j;
     }
     if (!SrcFunc->isDeclaration()) {
         SmallVector<ReturnInst*, 8> RI;          // Ignore returns cloned.
         DB_PRINT("  cloning %s\n", Name.data());
         CloneFunctionInto(DstFunc, SrcFunc, VVMap, true, RI);
+        fixOpenCLimageArguments(DstFunc);
     } else {
         DB_PRINT("  found %s, but its a declaration, do nothing\n",
                  Name.data());
@@ -191,8 +270,22 @@ link(llvm::Module *krn, const llvm::Module *lib)
     ValueToValueMapTy vvm;
     std::list<llvm::StringRef> declared;
 
-    // Inspect the kernel, find undefined functions
     llvm::Module::iterator fi,fe;
+
+    // Find and fix opencl.imageX_t arguments
+    for (fi=krn->begin(), fe=krn->end();
+         fi != fe;
+         fi++) {
+        llvm::Function *f = &*fi;
+        if (f->isDeclaration())
+            continue;
+        // need to restart iteration if we replace a function
+        if (CloneFuncFixOpenCLImageT(krn, f) != f) {
+          fi = krn->begin();
+          }
+      }
+
+    // Inspect the kernel, find undefined functions
     for (fi=krn->begin(), fe=krn->end();
          fi != fe;
          fi++) {
@@ -204,7 +297,7 @@ link(llvm::Module *krn, const llvm::Module *lib)
 
         // Find all functions the kernel source calls
         // TODO: is there no direct way?
-        find_called_functions(fi, declared);
+        find_called_functions(&*fi, declared);
     }
     declared.sort(stringref_cmp);
     declared.unique(stringref_equal);
@@ -226,8 +319,8 @@ link(llvm::Module *krn, const llvm::Module *lib)
                                               (GlobalVariable*) 0,
                                               gi->getThreadLocalMode(),
                                               gi->getType()->getAddressSpace());
-        GV->copyAttributesFrom(gi);
-        vvm[gi]=GV;
+        GV->copyAttributesFrom(&*gi);
+        vvm[&*gi]=GV;
     }
 
     // For each undefined function in krn, clone it from the lib to the krn module,
@@ -247,10 +340,7 @@ link(llvm::Module *krn, const llvm::Module *lib)
          ai++) {
         DB_PRINT(" %s\n", ai->getName().data());
         GlobalAlias *GA =
-#if (defined LLVM_3_2 || defined LLVM_3_3 || defined LLVM_3_4)
-            new GlobalAlias(ai->getType(), ai->getLinkage(),
-                            ai->getName(), NULL, krn);
-#elif (defined LLVM_OLDER_THAN_3_7)
+#ifndef LLVM_3_7
             GlobalAlias::create(ai->getType(),
                                 ai->getType()->getAddressSpace(),
                                 ai->getLinkage(), ai->getName(), NULL, krn);
@@ -259,15 +349,15 @@ link(llvm::Module *krn, const llvm::Module *lib)
                                 ai->getLinkage(), ai->getName(), NULL, krn);
 #endif
 
-        GA->copyAttributesFrom(ai);
-        vvm[ai]=GA;
+        GA->copyAttributesFrom(&*ai);
+        vvm[&*ai]=GA;
     }
 
     // initialize the globals that were copied
     for (gi=lib->global_begin(), ge=lib->global_end();
          gi != ge;
          gi++) {
-        GlobalVariable *GV=cast<GlobalVariable>(vvm[gi]);
+        GlobalVariable *GV=cast<GlobalVariable>(vvm[&*gi]);
         if (gi->hasInitializer())
             GV->setInitializer(MapValue(gi->getInitializer(), vvm));
     }
@@ -282,11 +372,7 @@ link(llvm::Module *krn, const llvm::Module *lib)
         DB_PRINT(" %s:\n", NMD.getName().data());
         NamedMDNode *NewNMD=krn->getOrInsertNamedMetadata(NMD.getName());
         for (unsigned i=0, e=NMD.getNumOperands(); i != e; ++i)
-#ifdef LLVM_OLDER_THAN_3_6
-            NewNMD->addOperand(MapValue(NMD.getOperand(i), vvm));
-#else
             NewNMD->addOperand(MapMetadata(NMD.getOperand(i), vvm));
-#endif
     }
 }
 
diff --git a/lib/llvmopencl/linker.h b/lib/llvmopencl/linker.h
index 9bafaa3..a3131b5 100644
--- a/lib/llvmopencl/linker.h
+++ b/lib/llvmopencl/linker.h
@@ -2,11 +2,8 @@
 #define POCL_LINKER_H
 
 #include "config.h"
-#ifdef LLVM_3_2
-#include "llvm/Module.h"
-#else
+
 #include "llvm/IR/Module.h"
-#endif
 
 /**
  * Link in module lib to krn.
diff --git a/lib/poclu/Makefile.in b/lib/poclu/Makefile.in
index 368ac01..f90d0a7 100644
--- a/lib/poclu/Makefile.in
+++ b/lib/poclu/Makefile.in
@@ -274,6 +274,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -281,6 +282,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -298,8 +300,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -375,6 +375,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
index 319f445..e021881 100644
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
@@ -25,6 +25,12 @@
 
 if(UNIX)
 
+  if((X86_64 OR I386) AND KERNELLIB_HOST_DISTRO_VARIANTS)
+    set(KERNLIB_VARIANT "sse2")
+  else()
+    set(KERNLIB_VARIANT "${LLC_HOST_CPU}")
+  endif()
+
   # build-dir script
   set(LLVMOPENCL_LOCATION "$<TARGET_FILE:llvmopencl>")
   set(KERNEL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include")
diff --git a/scripts/Makefile.am b/scripts/Makefile.am
index 5363132..6d42002 100644
--- a/scripts/Makefile.am
+++ b/scripts/Makefile.am
@@ -50,7 +50,8 @@ do_subst = sed -e 's|[@]abs_top_srcdir[@]|$(abs_top_srcdir)|g'		\
                -e 's|[@]OPT[@]|$(OPT)|g'				\
                -e 's|[@]LLVM_LINK[@]|$(LLVM_LINK)|g'			\
                -e 's|[@]LLC[@]|$(LLC)|g' 				\
-               -e 's|[@]LLVM_VERSION[@]|$(LLVM_VERSION)|g' 
+               -e 's|[@]LLVM_VERSION[@]|$(LLVM_VERSION)|g'              \
+               -e 's|[@]HOST_DEVICE_EXTENSION_DEFINES[@]|$(HOST_DEVICE_EXTENSION_DEFINES)|g'
 
 pocl-standalone: %: %.in Makefile ../install-paths.h
 	$(do_subst) < $< > $@
diff --git a/scripts/Makefile.in b/scripts/Makefile.in
index f967e8e..3758cce 100644
--- a/scripts/Makefile.in
+++ b/scripts/Makefile.in
@@ -222,6 +222,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -229,6 +230,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -246,8 +248,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -323,6 +323,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -408,7 +409,8 @@ do_subst = sed -e 's|[@]abs_top_srcdir[@]|$(abs_top_srcdir)|g'		\
                -e 's|[@]OPT[@]|$(OPT)|g'				\
                -e 's|[@]LLVM_LINK[@]|$(LLVM_LINK)|g'			\
                -e 's|[@]LLC[@]|$(LLC)|g' 				\
-               -e 's|[@]LLVM_VERSION[@]|$(LLVM_VERSION)|g' 
+               -e 's|[@]LLVM_VERSION[@]|$(LLVM_VERSION)|g'              \
+               -e 's|[@]HOST_DEVICE_EXTENSION_DEFINES[@]|$(HOST_DEVICE_EXTENSION_DEFINES)|g'
 
 all: all-am
 
diff --git a/scripts/pocl-standalone.in b/scripts/pocl-standalone.in
index e527568..aab9662 100644
--- a/scripts/pocl-standalone.in
+++ b/scripts/pocl-standalone.in
@@ -54,7 +54,6 @@ then
 fi
 
 case $target in
-    cellspu-*) target_dir="cellspu";;
     tce*)     target_dir="tce"
                target="tce-tut-llvm"
                ;;
@@ -66,6 +65,7 @@ case $target in
               LLC_FLAGS=""
               LD_FLAGS="";;
     @OCL_KERNEL_TARGET@)   CLANG_FLAGS="@HOST_CLANG_FLAGS@"
+              DEVICE_CL_FLAGS="@HOST_DEVICE_EXTENSION_DEFINES@"
               LLC_FLAGS="@HOST_LLC_FLAGS@"
               LD_FLAGS="@HOST_LD_FLAGS@";;
     @TARGET@) CLANG_FLAGS="@TARGET_CLANG_FLAGS@"
@@ -85,13 +85,13 @@ kernel_bc="${tempdir}/kernel.bc"
 
 # BEGIN REMOVE ONCE INSTALLED
 pocl_kernel_compiler_lib=@abs_top_builddir@/lib/llvmopencl/.libs/llvmopencl.so
-@CLANG@ ${CLANG_FLAGS} $EXTRA_CLANG_FLAGS -c -emit-llvm -I@abs_top_builddir@ -include @abs_top_srcdir@/include/_kernel.h -o ${kernel_bc} -x cl $1
+@CLANG@ ${CLANG_FLAGS} ${DEVICE_CL_FLAGS} $EXTRA_CLANG_FLAGS -c -emit-llvm -I@abs_top_builddir@ -include @abs_top_srcdir@/include/_kernel.h -o ${kernel_bc} -x cl $1
 rm -f ${header}
 @OPT@ ${LLC_FLAGS} -load=$pocl_kernel_compiler_lib -generate-header -disable-output -header=${header} ${kernel_bc}
 if false ; then
 # END REMOVE ONCE INSTALLED
 pocl_kernel_compiler_lib=@pkglibdir@/llvmopencl.so
-@CLANG@ ${CLANG_FLAGS} $EXTRA_CLANG_FLAGS -c -emit-llvm -include @pkgdataincludedir@/_kernel.h -o ${kernel_bc} -x cl $1
+@CLANG@ ${CLANG_FLAGS} ${DEVICE_CL_FLAGS} $EXTRA_CLANG_FLAGS -c -emit-llvm -include @pkgdataincludedir@/_kernel.h -o ${kernel_bc} -x cl $1
 rm -f ${header}
 @OPT@ ${LLC_FLAGS} -load=$pocl_kernel_compiler_lib -generate-header -disable-output -header=${header} ${kernel_bc}
 # BEGIN REMOVE ONCE INSTALLED
diff --git a/scripts/pocl-standalone.in.cmake b/scripts/pocl-standalone.in.cmake
index 2ce711c..b1e793a 100644
--- a/scripts/pocl-standalone.in.cmake
+++ b/scripts/pocl-standalone.in.cmake
@@ -30,8 +30,6 @@ if [ -n "$POCL_VERBOSE" ]; then
     echo 0=$0 @=$@
 fi
 
-target=@OCL_KERNEL_TARGET@
-
 while getopts h:t:o: o
 do
     case "$o" in
@@ -54,21 +52,19 @@ then
 fi
 
 case $target in
-    cellspu-*) target_dir="cellspu";;
-    tce*)     target_dir="tce"
-               target="tce-tut-llvm"
-               ;;
-    *)         target_dir="host";;
-esac
-
-case $target in
-    tce*)     CLANG_FLAGS="@TCE_TARGET_CLANG_FLAGS@"
+    tce*)     target="tce-tut-llvm"
+              target_dir="tce"
+              KERNLIB="kernel-${target}.bc"
+              CLANG_FLAGS="@TCE_TARGET_CLANG_FLAGS@"
               LLC_FLAGS="@TCE_TARGET_LLC_FLAGS@"
-              LD_FLAGS="@@";;
-    cell*)    CLANG_FLAGS="@CELL_TARGET_CLANG_FLAGS@"
-              LLC_FLAGS="@CELL_TARGET_LLC_FLAGS@"
-              LD_FLAGS="@@";;
-    *)        CLANG_FLAGS="@HOST_CLANG_FLAGS@"
+              LD_FLAGS="@@"
+              ;;
+    *)        target=@OCL_KERNEL_TARGET@
+              target_dir="host"
+              cpu=@KERNLIB_VARIANT@
+              KERNLIB="kernel-${target}-${cpu}.bc"
+              CLANG_FLAGS="@HOST_CLANG_FLAGS@"
+              DEVICE_CL_FLAGS="@HOST_DEVICE_EXTENSION_DEFINES@"
               LLC_FLAGS="@HOST_LLC_FLAGS@"
               LD_FLAGS="@HOST_LD_FLAGS@";;
 # TODO
@@ -77,7 +73,7 @@ case $target in
 #              LD_FLAGS="@TARGET_LD_FLAGS@";;
 esac
 CLANG_FLAGS="$CLANG_FLAGS -fasm -fsigned-char -Xclang -ffake-address-space-map"
-echo $target
+
 # With fp-contract we get calls to fma with processors which do not
 # have fma instructions. These ruin the performance. Better to have
 # the mul+add separated in the IR.
@@ -89,15 +85,15 @@ mkdir ${tempdir}
 kernel_bc="${tempdir}/kernel.bc"
 
 pocl_kernel_compiler_lib=@LLVMOPENCL_LOCATION@
-@CLANG@ ${CLANG_FLAGS} $EXTRA_CLANG_FLAGS -c -emit-llvm @ADD_INCLUDE@ -include @KERNEL_INCLUDE_DIR@/_kernel.h -o ${kernel_bc} -x cl $1
+@CLANG@ ${CLANG_FLAGS} ${DEVICE_CL_FLAGS} $EXTRA_CLANG_FLAGS -c -emit-llvm @ADD_INCLUDE@ -include @KERNEL_INCLUDE_DIR@/_kernel.h -o ${kernel_bc} -x cl $1
 rm -f ${header}
-@LLVM_OPT@ ${LLC_FLAGS} -load=$pocl_kernel_compiler_lib -generate-header -disable-output -header=${header} ${kernel_bc}
+@LLVM_OPT@ ${LLC_FLAGS} -load=${pocl_kernel_compiler_lib} -generate-header -disable-output -header=${header} ${kernel_bc}
 
 linked_bc="${tempdir}/linked.bc"
 linked_out="${linked_bc}.out"
-full_target_dir=@FULL_TARGET_DIR@
 
-@LLVM_LINK@ -o ${linked_bc} ${kernel_bc} $full_target_dir/kernel-$target.bc
+
+@LLVM_LINK@ -o ${linked_bc} ${kernel_bc} @FULL_TARGET_DIR@/${KERNLIB}
 
 OPT_SWITCH="-O3"
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index be3d89f..b7d490e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -23,39 +23,78 @@
 #
 #=============================================================================
 
-function(add_test_custom RUN_CMD TEST_NAME RESULT_FILE)
-  foreach(LOOPVAR ${ARGN})
-    set(RUN_CMD "${RUN_CMD}####${LOOPVAR}")
+#function(add_test_custom RUN_CMD TEST_NAME RESULT_FILE)
+#  foreach(LOOPVAR ${ARGN})
+#    set(RUN_CMD "${RUN_CMD}####${LOOPVAR}")
+#  endforeach()
+#endfunction()
+
+include(CMakeParseArguments)
+
+# This is a wrapper around add_test
+# Solves several problems:
+# 1) allows expected outputs (optionally sorted)
+# 2) handles the exit status problem (test properties WILL_FAIL does not work if
+#    the test exits with !0 exit status)
+
+function(add_test_pocl)
+
+  set(options SORT_OUTPUT)
+  set(oneValueArgs EXPECTED_OUTPUT NAME WORKING_DIRECTORY)
+  set(multiValueArgs COMMAND)
+  cmake_parse_arguments(POCL_TEST "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+
+  #message(STATUS "POCL_TEST_NAME: ${POCL_TEST_NAME}")
+  #message(STATUS "POCL_TEST_COMMAND: ${POCL_TEST_COMMAND}")
+
+  unset(RUN_CMD)
+  foreach(LOOPVAR ${POCL_TEST_COMMAND})
+    if(NOT RUN_CMD)
+      set(RUN_CMD "${CMAKE_CURRENT_BINARY_DIR}/${LOOPVAR}")
+    else()
+      set(RUN_CMD "${RUN_CMD}####${LOOPVAR}")
+    endif()
   endforeach()
 
-  add_test( "${TEST_NAME}"
-    "${CMAKE_COMMAND}"
-    -Dtest_cmd=${RUN_CMD}
-    -Doutput_blessed=${CMAKE_CURRENT_SOURCE_DIR}/${RESULT_FILE}
-    -P "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake"
-  )
+  set(POCL_TEST_ARGLIST "NAME" "${POCL_TEST_NAME}")
+  if(POCL_TEST_WORKING_DIRECTORY)
+    list(APPEND POCL_TEST_ARGLIST "WORKING_DIRECTORY")
+    list(APPEND POCL_TEST_ARGLIST "${POCL_TEST_WORKING_DIRECTORY}")
+  endif()
+
+  list(APPEND POCL_TEST_ARGLIST "COMMAND" "${CMAKE_COMMAND}" "-Dtest_cmd=${RUN_CMD}")
+  if(POCL_TEST_EXPECTED_OUTPUT)
+    list(APPEND POCL_TEST_ARGLIST
+      "-Doutput_blessed=${CMAKE_CURRENT_SOURCE_DIR}/${POCL_TEST_EXPECTED_OUTPUT}")
+  endif()
+  if(POCL_TEST_SORT_OUTPUT)
+    list(APPEND POCL_TEST_ARGLIST "-Dsort_output=1")
+    endif()
+  list(APPEND POCL_TEST_ARGLIST "-P" "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake")
+
+  add_test(${POCL_TEST_ARGLIST} )
+  set_tests_properties("${POCL_TEST_NAME}" PROPERTIES
+                       PASS_REGULAR_EXPRESSION "OK"
+                       FAIL_REGULAR_EXPRESSION "FAIL")
+
 endfunction()
 
+
 add_test("pocl_version_check" "runtime/test_version")
 set_tests_properties("pocl_version_check"
   PROPERTIES
   ENVIRONMENT "POCL_DEVICES=basic"
-  PASS_REGULAR_EXPRESSION "basic")
+  PASS_REGULAR_EXPRESSION "basic"
+  LABELS "internal")
 
 #######################################################################
 
 add_subdirectory("kernel")
-if(HAVE_OPENCL_HPP)
-  add_subdirectory("regression")
-else()
-  message(STATUS "cl.hpp unavailable, skipping regression tests")
-endif()
+add_subdirectory("regression")
 add_subdirectory("runtime")
 add_subdirectory("workgroup")
 if(ENABLE_TCE)
-  #add_subdirectory("tce")
-endif()
-if(ENABLE_SPU)
-  #add_subdirectory("cell")
+  add_subdirectory("tce")
 endif()
 
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 5a1f539..f22d007 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -22,7 +22,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
-DIST_SUBDIRS = kernel regression runtime workgroup tce cell
+DIST_SUBDIRS = kernel regression runtime workgroup tce
 SUBDIRS = kernel regression runtime workgroup
 if TCE_AVAILABLE
 SUBDIRS += tce
diff --git a/tests/Makefile.in b/tests/Makefile.in
index 68a4a03..74bfdb9 100644
--- a/tests/Makefile.in
+++ b/tests/Makefile.in
@@ -252,6 +252,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -259,6 +260,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -276,8 +278,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -353,6 +353,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
@@ -412,7 +413,7 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
-DIST_SUBDIRS = kernel regression runtime workgroup tce cell
+DIST_SUBDIRS = kernel regression runtime workgroup tce
 SUBDIRS = kernel regression runtime workgroup $(am__append_1)
 TESTSUITE = $(srcdir)/testsuite
 EXTRA_DIST = testsuite.at testsuite-samples.at testsuite-viennacl.at testsuite-regression.at \
diff --git a/tests/cell/Makefile.am b/tests/cell/Makefile.am
deleted file mode 100644
index d7ed933..0000000
--- a/tests/cell/Makefile.am
+++ /dev/null
@@ -1,25 +0,0 @@
-# Process this file with automake to produce Makefile.in (in this,
-# and all subdirectories).
-# Makefile.am for tests/cell.
-# 
-# Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-SUBDIRS = hello
diff --git a/tests/cell/Makefile.in b/tests/cell/Makefile.in
deleted file mode 100644
index 97f6459..0000000
--- a/tests/cell/Makefile.in
+++ /dev/null
@@ -1,730 +0,0 @@
-# Makefile.in generated by automake 1.15 from Makefile.am.
-# @configure_input@
-
-# Copyright (C) 1994-2014 Free Software Foundation, Inc.
-
-# This Makefile.in is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
-# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE.
-
-@SET_MAKE@
-
-# Process this file with automake to produce Makefile.in (in this,
-# and all subdirectories).
-# Makefile.am for tests/cell.
-# 
-# Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-VPATH = @srcdir@
-am__is_gnu_make = { \
-  if test -z '$(MAKELEVEL)'; then \
-    false; \
-  elif test -n '$(MAKE_HOST)'; then \
-    true; \
-  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
-    true; \
-  else \
-    false; \
-  fi; \
-}
-am__make_running_with_option = \
-  case $${target_option-} in \
-      ?) ;; \
-      *) echo "am__make_running_with_option: internal error: invalid" \
-              "target option '$${target_option-}' specified" >&2; \
-         exit 1;; \
-  esac; \
-  has_opt=no; \
-  sane_makeflags=$$MAKEFLAGS; \
-  if $(am__is_gnu_make); then \
-    sane_makeflags=$$MFLAGS; \
-  else \
-    case $$MAKEFLAGS in \
-      *\\[\ \	]*) \
-        bs=\\; \
-        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
-          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
-    esac; \
-  fi; \
-  skip_next=no; \
-  strip_trailopt () \
-  { \
-    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
-  }; \
-  for flg in $$sane_makeflags; do \
-    test $$skip_next = yes && { skip_next=no; continue; }; \
-    case $$flg in \
-      *=*|--*) continue;; \
-        -*I) strip_trailopt 'I'; skip_next=yes;; \
-      -*I?*) strip_trailopt 'I';; \
-        -*O) strip_trailopt 'O'; skip_next=yes;; \
-      -*O?*) strip_trailopt 'O';; \
-        -*l) strip_trailopt 'l'; skip_next=yes;; \
-      -*l?*) strip_trailopt 'l';; \
-      -[dEDm]) skip_next=yes;; \
-      -[JT]) skip_next=yes;; \
-    esac; \
-    case $$flg in \
-      *$$target_option*) has_opt=yes; break;; \
-    esac; \
-  done; \
-  test $$has_opt = yes
-am__make_dryrun = (target_option=n; $(am__make_running_with_option))
-am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
-pkgdatadir = $(datadir)/@PACKAGE@
-pkgincludedir = $(includedir)/@PACKAGE@
-pkglibdir = $(libdir)/@PACKAGE@
-pkglibexecdir = $(libexecdir)/@PACKAGE@
-am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
-install_sh_DATA = $(install_sh) -c -m 644
-install_sh_PROGRAM = $(install_sh) -c
-install_sh_SCRIPT = $(install_sh) -c
-INSTALL_HEADER = $(INSTALL_DATA)
-transform = $(program_transform_name)
-NORMAL_INSTALL = :
-PRE_INSTALL = :
-POST_INSTALL = :
-NORMAL_UNINSTALL = :
-PRE_UNINSTALL = :
-POST_UNINSTALL = :
-build_triplet = @build@
-host_triplet = @host@
-target_triplet = @target@
-subdir = tests/cell
-ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
-	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
-	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
-	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \
-	$(top_srcdir)/configure.ac
-am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
-	$(ACLOCAL_M4)
-DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
-mkinstalldirs = $(install_sh) -d
-CONFIG_HEADER = $(top_builddir)/config.h
-CONFIG_CLEAN_FILES =
-CONFIG_CLEAN_VPATH_FILES =
-AM_V_P = $(am__v_P_@AM_V@)
-am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
-am__v_P_0 = false
-am__v_P_1 = :
-AM_V_GEN = $(am__v_GEN_@AM_V@)
-am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
-am__v_GEN_0 = @echo "  GEN     " $@;
-am__v_GEN_1 = 
-AM_V_at = $(am__v_at_@AM_V@)
-am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
-am__v_at_0 = @
-am__v_at_1 = 
-SOURCES =
-DIST_SOURCES =
-RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
-	ctags-recursive dvi-recursive html-recursive info-recursive \
-	install-data-recursive install-dvi-recursive \
-	install-exec-recursive install-html-recursive \
-	install-info-recursive install-pdf-recursive \
-	install-ps-recursive install-recursive installcheck-recursive \
-	installdirs-recursive pdf-recursive ps-recursive \
-	tags-recursive uninstall-recursive
-am__can_run_installinfo = \
-  case $$AM_UPDATE_INFO_DIR in \
-    n|no|NO) false;; \
-    *) (install-info --version) >/dev/null 2>&1;; \
-  esac
-RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
-  distclean-recursive maintainer-clean-recursive
-am__recursive_targets = \
-  $(RECURSIVE_TARGETS) \
-  $(RECURSIVE_CLEAN_TARGETS) \
-  $(am__extra_recursive_targets)
-AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
-	distdir
-am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
-# Read a list of newline-separated strings from the standard input,
-# and print each of them once, without duplicates.  Input order is
-# *not* preserved.
-am__uniquify_input = $(AWK) '\
-  BEGIN { nonempty = 0; } \
-  { items[$$0] = 1; nonempty = 1; } \
-  END { if (nonempty) { for (i in items) print i; }; } \
-'
-# Make sure the list of sources is unique.  This is necessary because,
-# e.g., the same source file might be shared among _SOURCES variables
-# for different programs/libraries.
-am__define_uniq_tagged_files = \
-  list='$(am__tagged_files)'; \
-  unique=`for i in $$list; do \
-    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-  done | $(am__uniquify_input)`
-ETAGS = etags
-CTAGS = ctags
-DIST_SUBDIRS = $(SUBDIRS)
-am__DIST_COMMON = $(srcdir)/Makefile.in
-DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
-am__relativize = \
-  dir0=`pwd`; \
-  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
-  sed_rest='s,^[^/]*/*,,'; \
-  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
-  sed_butlast='s,/*[^/]*$$,,'; \
-  while test -n "$$dir1"; do \
-    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
-    if test "$$first" != "."; then \
-      if test "$$first" = ".."; then \
-        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
-        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
-      else \
-        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
-        if test "$$first2" = "$$first"; then \
-          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
-        else \
-          dir2="../$$dir2"; \
-        fi; \
-        dir0="$$dir0"/"$$first"; \
-      fi; \
-    fi; \
-    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
-  done; \
-  reldir="$$dir2"
-ACLOCAL = @ACLOCAL@
-AMTAR = @AMTAR@
-AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
-AR = @AR@
-AUTOCONF = @AUTOCONF@
-AUTOHEADER = @AUTOHEADER@
-AUTOMAKE = @AUTOMAKE@
-AWK = @AWK@
-BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
-BOOST_LDFLAGS = @BOOST_LDFLAGS@
-BUILD_TIMESTAMP = @BUILD_TIMESTAMP@
-CC = @CC@
-CCDEPMODE = @CCDEPMODE@
-CFLAGS = @CFLAGS@
-CLANG = @CLANG@
-CLANGXX = @CLANGXX@
-CLANGXX_FLAGS = @CLANGXX_FLAGS@
-CLFLAGS = @CLFLAGS@
-CPP = @CPP@
-CPPFLAGS = @CPPFLAGS@
-CXX = @CXX@
-CXXCPP = @CXXCPP@
-CXXDEPMODE = @CXXDEPMODE@
-CXXFLAGS = @CXXFLAGS@
-CYGPATH_W = @CYGPATH_W@
-DEFS = @DEFS@
-DEPDIR = @DEPDIR@
-DLLTOOL = @DLLTOOL@
-DSYMUTIL = @DSYMUTIL@
-DUMPBIN = @DUMPBIN@
-ECHO_C = @ECHO_C@
-ECHO_N = @ECHO_N@
-ECHO_T = @ECHO_T@
-EGREP = @EGREP@
-EXEEXT = @EXEEXT@
-FGREP = @FGREP@
-FORCED_CLFLAGS = @FORCED_CLFLAGS@
-GLEW_CFLAGS = @GLEW_CFLAGS@
-GLEW_LIBS = @GLEW_LIBS@
-GREP = @GREP@
-HOST = @HOST@
-HOST_AS_FLAGS = @HOST_AS_FLAGS@
-HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
-HOST_CPU = @HOST_CPU@
-HOST_LD_FLAGS = @HOST_LD_FLAGS@
-HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
-HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
-HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
-HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
-HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
-HSAILASM = @HSAILASM@
-HSA_INCLUDES = @HSA_INCLUDES@
-HSA_LIBS = @HSA_LIBS@
-HWLOC_CFLAGS = @HWLOC_CFLAGS@
-HWLOC_LIBS = @HWLOC_LIBS@
-ICD_LD_FLAGS = @ICD_LD_FLAGS@
-INSTALL = @INSTALL@
-INSTALL_DATA = @INSTALL_DATA@
-INSTALL_PROGRAM = @INSTALL_PROGRAM@
-INSTALL_SCRIPT = @INSTALL_SCRIPT@
-INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
-KERNEL_COMPILER_LIB_VERSION = @KERNEL_COMPILER_LIB_VERSION@
-LD = @LD@
-LDFLAGS = @LDFLAGS@
-LD_FLAGS_BIN = @LD_FLAGS_BIN@
-LIBOBJS = @LIBOBJS@
-LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
-LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
-LIBTOOL = @LIBTOOL@
-LIB_AGE_VERSION = @LIB_AGE_VERSION@
-LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
-LIB_FIRST_VERSION = @LIB_FIRST_VERSION@
-LIB_REVISION_VERSION = @LIB_REVISION_VERSION@
-LIB_VERSION = @LIB_VERSION@
-LIPO = @LIPO@
-LLC = @LLC@
-LLVM_AS = @LLVM_AS@
-LLVM_CONFIG = @LLVM_CONFIG@
-LLVM_CXX_FLAGS = @LLVM_CXX_FLAGS@
-LLVM_LDFLAGS = @LLVM_LDFLAGS@
-LLVM_LIBS = @LLVM_LIBS@
-LLVM_LINK = @LLVM_LINK@
-LLVM_OPT = @LLVM_OPT@
-LLVM_VERSION = @LLVM_VERSION@
-LN_S = @LN_S@
-LTDL_LIBS = @LTDL_LIBS@
-LTLIBOBJS = @LTLIBOBJS@
-LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
-MAKEINFO = @MAKEINFO@
-MANIFEST_TOOL = @MANIFEST_TOOL@
-MKDIR_P = @MKDIR_P@
-NM = @NM@
-NMEDIT = @NMEDIT@
-OBJDUMP = @OBJDUMP@
-OBJEXT = @OBJEXT@
-OCL_ICD_CFLAGS = @OCL_ICD_CFLAGS@
-OCL_ICD_LIBS = @OCL_ICD_LIBS@
-OCL_KERNEL_ARCH = @OCL_KERNEL_ARCH@
-OCL_KERNEL_TARGET = @OCL_KERNEL_TARGET@
-OCL_KERNEL_TARGET_CPU = @OCL_KERNEL_TARGET_CPU@
-OCL_TARGETS = @OCL_TARGETS@
-OPENCL_CFLAGS = @OPENCL_CFLAGS@
-OPENCL_CMAKE = @OPENCL_CMAKE@
-OPENCL_EXTLIBS = @OPENCL_EXTLIBS@
-OPENCL_LIBS = @OPENCL_LIBS@
-OPT = @OPT@
-OTOOL = @OTOOL@
-OTOOL64 = @OTOOL64@
-PACKAGE = @PACKAGE@
-PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
-PACKAGE_NAME = @PACKAGE_NAME@
-PACKAGE_STRING = @PACKAGE_STRING@
-PACKAGE_TARNAME = @PACKAGE_TARNAME@
-PACKAGE_URL = @PACKAGE_URL@
-PACKAGE_VERSION = @PACKAGE_VERSION@
-PATH_SEPARATOR = @PATH_SEPARATOR@
-PKG_CONFIG = @PKG_CONFIG@
-PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
-PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
-POAT_TESTSUITES = @POAT_TESTSUITES@
-POCL_DEVICE_ADDRESS_BITS = @POCL_DEVICE_ADDRESS_BITS@
-PTHREAD_CC = @PTHREAD_CC@
-PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
-PTHREAD_LIBS = @PTHREAD_LIBS@
-RANLIB = @RANLIB@
-SDL_CFLAGS = @SDL_CFLAGS@
-SDL_LIBS = @SDL_LIBS@
-SED = @SED@
-SET_MAKE = @SET_MAKE@
-SHELL = @SHELL@
-STRIP = @STRIP@
-TARGET = @TARGET@
-TARGET_CLANG_FLAGS = @TARGET_CLANG_FLAGS@
-TARGET_CPU = @TARGET_CPU@
-TARGET_LLC_FLAGS = @TARGET_LLC_FLAGS@
-TARGET_SIZEOF_DOUBLE = @TARGET_SIZEOF_DOUBLE@
-TARGET_SIZEOF_HALF = @TARGET_SIZEOF_HALF@
-TARGET_SIZEOF_LONG = @TARGET_SIZEOF_LONG@
-TARGET_SIZEOF_VOID_P = @TARGET_SIZEOF_VOID_P@
-TCECC = @TCECC@
-TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
-TCE_AVAILABLE = @TCE_AVAILABLE@
-TCE_CONFIG = @TCE_CONFIG@
-VERSION = @VERSION@
-abs_builddir = @abs_builddir@
-abs_srcdir = @abs_srcdir@
-abs_top_builddir = @abs_top_builddir@
-abs_top_srcdir = @abs_top_srcdir@
-ac_ct_AR = @ac_ct_AR@
-ac_ct_CC = @ac_ct_CC@
-ac_ct_CXX = @ac_ct_CXX@
-ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
-acx_pthread_config = @acx_pthread_config@
-am__include = @am__include@
-am__leading_dot = @am__leading_dot@
-am__quote = @am__quote@
-am__tar = @am__tar@
-am__untar = @am__untar@
-bindir = @bindir@
-build = @build@
-build_alias = @build_alias@
-build_cpu = @build_cpu@
-build_os = @build_os@
-build_vendor = @build_vendor@
-builddir = @builddir@
-datadir = @datadir@
-datarootdir = @datarootdir@
-docdir = @docdir@
-dvidir = @dvidir@
-exec_prefix = @exec_prefix@
-host = @host@
-host_alias = @host_alias@
-host_cpu = @host_cpu@
-host_os = @host_os@
-host_vendor = @host_vendor@
-htmldir = @htmldir@
-includedir = @includedir@
-infodir = @infodir@
-install_sh = @install_sh@
-libdir = @libdir@
-libexecdir = @libexecdir@
-localedir = @localedir@
-localstatedir = @localstatedir@
-mandir = @mandir@
-mkdir_p = @mkdir_p@
-oldincludedir = @oldincludedir@
-pdfdir = @pdfdir@
-prefix = @prefix@
-program_transform_name = @program_transform_name@
-psdir = @psdir@
-sbindir = @sbindir@
-sharedstatedir = @sharedstatedir@
-srcdir = @srcdir@
-sysconfdir = @sysconfdir@
-target = @target@
-target_alias = @target_alias@
-target_cpu = @target_cpu@
-target_os = @target_os@
-target_vendor = @target_vendor@
-top_build_prefix = @top_build_prefix@
-top_builddir = @top_builddir@
-top_srcdir = @top_srcdir@
-SUBDIRS = hello
-all: all-recursive
-
-.SUFFIXES:
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
-	@for dep in $?; do \
-	  case '$(am__configure_deps)' in \
-	    *$$dep*) \
-	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
-	        && { if test -f $@; then exit 0; else break; fi; }; \
-	      exit 1;; \
-	  esac; \
-	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign tests/cell/Makefile'; \
-	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign tests/cell/Makefile
-Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
-	@case '$?' in \
-	  *config.status*) \
-	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
-	  *) \
-	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
-	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
-	esac;
-
-$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-
-$(top_srcdir)/configure:  $(am__configure_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(am__aclocal_m4_deps):
-
-mostlyclean-libtool:
-	-rm -f *.lo
-
-clean-libtool:
-	-rm -rf .libs _libs
-
-# This directory's subdirectories are mostly independent; you can cd
-# into them and run 'make' without going through this Makefile.
-# To change the values of 'make' variables: instead of editing Makefiles,
-# (1) if the variable is set in 'config.status', edit 'config.status'
-#     (which will cause the Makefiles to be regenerated when you run 'make');
-# (2) otherwise, pass the desired values on the 'make' command line.
-$(am__recursive_targets):
-	@fail=; \
-	if $(am__make_keepgoing); then \
-	  failcom='fail=yes'; \
-	else \
-	  failcom='exit 1'; \
-	fi; \
-	dot_seen=no; \
-	target=`echo $@ | sed s/-recursive//`; \
-	case "$@" in \
-	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
-	  *) list='$(SUBDIRS)' ;; \
-	esac; \
-	for subdir in $$list; do \
-	  echo "Making $$target in $$subdir"; \
-	  if test "$$subdir" = "."; then \
-	    dot_seen=yes; \
-	    local_target="$$target-am"; \
-	  else \
-	    local_target="$$target"; \
-	  fi; \
-	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
-	  || eval $$failcom; \
-	done; \
-	if test "$$dot_seen" = "no"; then \
-	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
-	fi; test -z "$$fail"
-
-ID: $(am__tagged_files)
-	$(am__define_uniq_tagged_files); mkid -fID $$unique
-tags: tags-recursive
-TAGS: tags
-
-tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
-	set x; \
-	here=`pwd`; \
-	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
-	  include_option=--etags-include; \
-	  empty_fix=.; \
-	else \
-	  include_option=--include; \
-	  empty_fix=; \
-	fi; \
-	list='$(SUBDIRS)'; for subdir in $$list; do \
-	  if test "$$subdir" = .; then :; else \
-	    test ! -f $$subdir/TAGS || \
-	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
-	  fi; \
-	done; \
-	$(am__define_uniq_tagged_files); \
-	shift; \
-	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
-	  test -n "$$unique" || unique=$$empty_fix; \
-	  if test $$# -gt 0; then \
-	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
-	      "$$@" $$unique; \
-	  else \
-	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
-	      $$unique; \
-	  fi; \
-	fi
-ctags: ctags-recursive
-
-CTAGS: ctags
-ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
-	$(am__define_uniq_tagged_files); \
-	test -z "$(CTAGS_ARGS)$$unique" \
-	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
-	     $$unique
-
-GTAGS:
-	here=`$(am__cd) $(top_builddir) && pwd` \
-	  && $(am__cd) $(top_srcdir) \
-	  && gtags -i $(GTAGS_ARGS) "$$here"
-cscopelist: cscopelist-recursive
-
-cscopelist-am: $(am__tagged_files)
-	list='$(am__tagged_files)'; \
-	case "$(srcdir)" in \
-	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
-	  *) sdir=$(subdir)/$(srcdir) ;; \
-	esac; \
-	for i in $$list; do \
-	  if test -f "$$i"; then \
-	    echo "$(subdir)/$$i"; \
-	  else \
-	    echo "$$sdir/$$i"; \
-	  fi; \
-	done >> $(top_builddir)/cscope.files
-
-distclean-tags:
-	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
-
-distdir: $(DISTFILES)
-	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	list='$(DISTFILES)'; \
-	  dist_files=`for file in $$list; do echo $$file; done | \
-	  sed -e "s|^$$srcdirstrip/||;t" \
-	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
-	case $$dist_files in \
-	  */*) $(MKDIR_P) `echo "$$dist_files" | \
-			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
-			   sort -u` ;; \
-	esac; \
-	for file in $$dist_files; do \
-	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
-	  if test -d $$d/$$file; then \
-	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
-	    if test -d "$(distdir)/$$file"; then \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
-	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
-	  else \
-	    test -f "$(distdir)/$$file" \
-	    || cp -p $$d/$$file "$(distdir)/$$file" \
-	    || exit 1; \
-	  fi; \
-	done
-	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
-	  if test "$$subdir" = .; then :; else \
-	    $(am__make_dryrun) \
-	      || test -d "$(distdir)/$$subdir" \
-	      || $(MKDIR_P) "$(distdir)/$$subdir" \
-	      || exit 1; \
-	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
-	    $(am__relativize); \
-	    new_distdir=$$reldir; \
-	    dir1=$$subdir; dir2="$(top_distdir)"; \
-	    $(am__relativize); \
-	    new_top_distdir=$$reldir; \
-	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
-	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
-	    ($(am__cd) $$subdir && \
-	      $(MAKE) $(AM_MAKEFLAGS) \
-	        top_distdir="$$new_top_distdir" \
-	        distdir="$$new_distdir" \
-		am__remove_distdir=: \
-		am__skip_length_check=: \
-		am__skip_mode_fix=: \
-	        distdir) \
-	      || exit 1; \
-	  fi; \
-	done
-check-am: all-am
-check: check-recursive
-all-am: Makefile
-installdirs: installdirs-recursive
-installdirs-am:
-install: install-recursive
-install-exec: install-exec-recursive
-install-data: install-data-recursive
-uninstall: uninstall-recursive
-
-install-am: all-am
-	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
-
-installcheck: installcheck-recursive
-install-strip:
-	if test -z '$(STRIP)'; then \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	      install; \
-	else \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
-	fi
-mostlyclean-generic:
-
-clean-generic:
-
-distclean-generic:
-	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
-
-maintainer-clean-generic:
-	@echo "This command is intended for maintainers to use"
-	@echo "it deletes files that may require special tools to rebuild."
-clean: clean-recursive
-
-clean-am: clean-generic clean-libtool mostlyclean-am
-
-distclean: distclean-recursive
-	-rm -f Makefile
-distclean-am: clean-am distclean-generic distclean-tags
-
-dvi: dvi-recursive
-
-dvi-am:
-
-html: html-recursive
-
-html-am:
-
-info: info-recursive
-
-info-am:
-
-install-data-am:
-
-install-dvi: install-dvi-recursive
-
-install-dvi-am:
-
-install-exec-am:
-
-install-html: install-html-recursive
-
-install-html-am:
-
-install-info: install-info-recursive
-
-install-info-am:
-
-install-man:
-
-install-pdf: install-pdf-recursive
-
-install-pdf-am:
-
-install-ps: install-ps-recursive
-
-install-ps-am:
-
-installcheck-am:
-
-maintainer-clean: maintainer-clean-recursive
-	-rm -f Makefile
-maintainer-clean-am: distclean-am maintainer-clean-generic
-
-mostlyclean: mostlyclean-recursive
-
-mostlyclean-am: mostlyclean-generic mostlyclean-libtool
-
-pdf: pdf-recursive
-
-pdf-am:
-
-ps: ps-recursive
-
-ps-am:
-
-uninstall-am:
-
-.MAKE: $(am__recursive_targets) install-am install-strip
-
-.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \
-	check-am clean clean-generic clean-libtool cscopelist-am ctags \
-	ctags-am distclean distclean-generic distclean-libtool \
-	distclean-tags distdir dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am install-dvi \
-	install-dvi-am install-exec install-exec-am install-html \
-	install-html-am install-info install-info-am install-man \
-	install-pdf install-pdf-am install-ps install-ps-am \
-	install-strip installcheck installcheck-am installdirs \
-	installdirs-am maintainer-clean maintainer-clean-generic \
-	mostlyclean mostlyclean-generic mostlyclean-libtool pdf pdf-am \
-	ps ps-am tags tags-am uninstall uninstall-am
-
-.PRECIOUS: Makefile
-
-
-# Tell versions [3.59,3.63) of GNU make to not export all variables.
-# Otherwise a system limit (for SysV at least) may be exceeded.
-.NOEXPORT:
diff --git a/tests/cell/hello/Makefile.am b/tests/cell/hello/Makefile.am
deleted file mode 100644
index ce2627b..0000000
--- a/tests/cell/hello/Makefile.am
+++ /dev/null
@@ -1,35 +0,0 @@
-# Process this file with automake to produce Makefile.in (in this,
-# and all subdirectories).
-# 
-# Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-noinst_PROGRAMS = host
-
-host_SOURCES = host.cpp 
-host_LDADD = ../../../lib/CL/libpocl.la ../../../lib/poclu/libpoclu.la @LD_FLAGS_BIN@
-host_CXXFLAGS = @PTHREAD_CFLAGS@ -std=c++11 -Wno-deprecated -Wno-deprecated-declarations
-
-AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/lib/CL \
-	-DSRCDIR='"$(abs_srcdir)"'
-
-
-run: host
-	@POCL_DEVICES="cellspu" ./host
diff --git a/tests/cell/hello/Makefile.in b/tests/cell/hello/Makefile.in
deleted file mode 100644
index 446dc63..0000000
--- a/tests/cell/hello/Makefile.in
+++ /dev/null
@@ -1,721 +0,0 @@
-# Makefile.in generated by automake 1.15 from Makefile.am.
-# @configure_input@
-
-# Copyright (C) 1994-2014 Free Software Foundation, Inc.
-
-# This Makefile.in is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
-# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-# PARTICULAR PURPOSE.
-
-@SET_MAKE@
-
-# Process this file with automake to produce Makefile.in (in this,
-# and all subdirectories).
-# 
-# Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-VPATH = @srcdir@
-am__is_gnu_make = { \
-  if test -z '$(MAKELEVEL)'; then \
-    false; \
-  elif test -n '$(MAKE_HOST)'; then \
-    true; \
-  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
-    true; \
-  else \
-    false; \
-  fi; \
-}
-am__make_running_with_option = \
-  case $${target_option-} in \
-      ?) ;; \
-      *) echo "am__make_running_with_option: internal error: invalid" \
-              "target option '$${target_option-}' specified" >&2; \
-         exit 1;; \
-  esac; \
-  has_opt=no; \
-  sane_makeflags=$$MAKEFLAGS; \
-  if $(am__is_gnu_make); then \
-    sane_makeflags=$$MFLAGS; \
-  else \
-    case $$MAKEFLAGS in \
-      *\\[\ \	]*) \
-        bs=\\; \
-        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
-          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
-    esac; \
-  fi; \
-  skip_next=no; \
-  strip_trailopt () \
-  { \
-    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
-  }; \
-  for flg in $$sane_makeflags; do \
-    test $$skip_next = yes && { skip_next=no; continue; }; \
-    case $$flg in \
-      *=*|--*) continue;; \
-        -*I) strip_trailopt 'I'; skip_next=yes;; \
-      -*I?*) strip_trailopt 'I';; \
-        -*O) strip_trailopt 'O'; skip_next=yes;; \
-      -*O?*) strip_trailopt 'O';; \
-        -*l) strip_trailopt 'l'; skip_next=yes;; \
-      -*l?*) strip_trailopt 'l';; \
-      -[dEDm]) skip_next=yes;; \
-      -[JT]) skip_next=yes;; \
-    esac; \
-    case $$flg in \
-      *$$target_option*) has_opt=yes; break;; \
-    esac; \
-  done; \
-  test $$has_opt = yes
-am__make_dryrun = (target_option=n; $(am__make_running_with_option))
-am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
-pkgdatadir = $(datadir)/@PACKAGE@
-pkgincludedir = $(includedir)/@PACKAGE@
-pkglibdir = $(libdir)/@PACKAGE@
-pkglibexecdir = $(libexecdir)/@PACKAGE@
-am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
-install_sh_DATA = $(install_sh) -c -m 644
-install_sh_PROGRAM = $(install_sh) -c
-install_sh_SCRIPT = $(install_sh) -c
-INSTALL_HEADER = $(INSTALL_DATA)
-transform = $(program_transform_name)
-NORMAL_INSTALL = :
-PRE_INSTALL = :
-POST_INSTALL = :
-NORMAL_UNINSTALL = :
-PRE_UNINSTALL = :
-POST_UNINSTALL = :
-build_triplet = @build@
-host_triplet = @host@
-target_triplet = @target@
-noinst_PROGRAMS = host$(EXEEXT)
-subdir = tests/cell/hello
-ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
-	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
-	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
-	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \
-	$(top_srcdir)/configure.ac
-am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
-	$(ACLOCAL_M4)
-DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
-mkinstalldirs = $(install_sh) -d
-CONFIG_HEADER = $(top_builddir)/config.h
-CONFIG_CLEAN_FILES =
-CONFIG_CLEAN_VPATH_FILES =
-PROGRAMS = $(noinst_PROGRAMS)
-am_host_OBJECTS = host-host.$(OBJEXT)
-host_OBJECTS = $(am_host_OBJECTS)
-host_DEPENDENCIES = ../../../lib/CL/libpocl.la \
-	../../../lib/poclu/libpoclu.la
-AM_V_lt = $(am__v_lt_@AM_V@)
-am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
-am__v_lt_0 = --silent
-am__v_lt_1 = 
-host_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(host_CXXFLAGS) \
-	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-AM_V_P = $(am__v_P_@AM_V@)
-am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
-am__v_P_0 = false
-am__v_P_1 = :
-AM_V_GEN = $(am__v_GEN_@AM_V@)
-am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
-am__v_GEN_0 = @echo "  GEN     " $@;
-am__v_GEN_1 = 
-AM_V_at = $(am__v_at_@AM_V@)
-am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
-am__v_at_0 = @
-am__v_at_1 = 
-DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
-depcomp = $(SHELL) $(top_srcdir)/config/depcomp
-am__depfiles_maybe = depfiles
-am__mv = mv -f
-CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
-	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
-LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \
-	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
-	$(AM_CXXFLAGS) $(CXXFLAGS)
-AM_V_CXX = $(am__v_CXX_@AM_V@)
-am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@)
-am__v_CXX_0 = @echo "  CXX     " $@;
-am__v_CXX_1 = 
-CXXLD = $(CXX)
-CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
-	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-AM_V_CXXLD = $(am__v_CXXLD_@AM_V@)
-am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
-am__v_CXXLD_0 = @echo "  CXXLD   " $@;
-am__v_CXXLD_1 = 
-SOURCES = $(host_SOURCES)
-DIST_SOURCES = $(host_SOURCES)
-am__can_run_installinfo = \
-  case $$AM_UPDATE_INFO_DIR in \
-    n|no|NO) false;; \
-    *) (install-info --version) >/dev/null 2>&1;; \
-  esac
-am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
-# Read a list of newline-separated strings from the standard input,
-# and print each of them once, without duplicates.  Input order is
-# *not* preserved.
-am__uniquify_input = $(AWK) '\
-  BEGIN { nonempty = 0; } \
-  { items[$$0] = 1; nonempty = 1; } \
-  END { if (nonempty) { for (i in items) print i; }; } \
-'
-# Make sure the list of sources is unique.  This is necessary because,
-# e.g., the same source file might be shared among _SOURCES variables
-# for different programs/libraries.
-am__define_uniq_tagged_files = \
-  list='$(am__tagged_files)'; \
-  unique=`for i in $$list; do \
-    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-  done | $(am__uniquify_input)`
-ETAGS = etags
-CTAGS = ctags
-am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/config/depcomp
-DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
-ACLOCAL = @ACLOCAL@
-AMTAR = @AMTAR@
-AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
-AR = @AR@
-AUTOCONF = @AUTOCONF@
-AUTOHEADER = @AUTOHEADER@
-AUTOMAKE = @AUTOMAKE@
-AWK = @AWK@
-BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
-BOOST_LDFLAGS = @BOOST_LDFLAGS@
-BUILD_TIMESTAMP = @BUILD_TIMESTAMP@
-CC = @CC@
-CCDEPMODE = @CCDEPMODE@
-CFLAGS = @CFLAGS@
-CLANG = @CLANG@
-CLANGXX = @CLANGXX@
-CLANGXX_FLAGS = @CLANGXX_FLAGS@
-CLFLAGS = @CLFLAGS@
-CPP = @CPP@
-CPPFLAGS = @CPPFLAGS@
-CXX = @CXX@
-CXXCPP = @CXXCPP@
-CXXDEPMODE = @CXXDEPMODE@
-CXXFLAGS = @CXXFLAGS@
-CYGPATH_W = @CYGPATH_W@
-DEFS = @DEFS@
-DEPDIR = @DEPDIR@
-DLLTOOL = @DLLTOOL@
-DSYMUTIL = @DSYMUTIL@
-DUMPBIN = @DUMPBIN@
-ECHO_C = @ECHO_C@
-ECHO_N = @ECHO_N@
-ECHO_T = @ECHO_T@
-EGREP = @EGREP@
-EXEEXT = @EXEEXT@
-FGREP = @FGREP@
-FORCED_CLFLAGS = @FORCED_CLFLAGS@
-GLEW_CFLAGS = @GLEW_CFLAGS@
-GLEW_LIBS = @GLEW_LIBS@
-GREP = @GREP@
-HOST = @HOST@
-HOST_AS_FLAGS = @HOST_AS_FLAGS@
-HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
-HOST_CPU = @HOST_CPU@
-HOST_LD_FLAGS = @HOST_LD_FLAGS@
-HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
-HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
-HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
-HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
-HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
-HSAILASM = @HSAILASM@
-HSA_INCLUDES = @HSA_INCLUDES@
-HSA_LIBS = @HSA_LIBS@
-HWLOC_CFLAGS = @HWLOC_CFLAGS@
-HWLOC_LIBS = @HWLOC_LIBS@
-ICD_LD_FLAGS = @ICD_LD_FLAGS@
-INSTALL = @INSTALL@
-INSTALL_DATA = @INSTALL_DATA@
-INSTALL_PROGRAM = @INSTALL_PROGRAM@
-INSTALL_SCRIPT = @INSTALL_SCRIPT@
-INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
-KERNEL_COMPILER_LIB_VERSION = @KERNEL_COMPILER_LIB_VERSION@
-LD = @LD@
-LDFLAGS = @LDFLAGS@
-LD_FLAGS_BIN = @LD_FLAGS_BIN@
-LIBOBJS = @LIBOBJS@
-LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
-LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
-LIBTOOL = @LIBTOOL@
-LIB_AGE_VERSION = @LIB_AGE_VERSION@
-LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
-LIB_FIRST_VERSION = @LIB_FIRST_VERSION@
-LIB_REVISION_VERSION = @LIB_REVISION_VERSION@
-LIB_VERSION = @LIB_VERSION@
-LIPO = @LIPO@
-LLC = @LLC@
-LLVM_AS = @LLVM_AS@
-LLVM_CONFIG = @LLVM_CONFIG@
-LLVM_CXX_FLAGS = @LLVM_CXX_FLAGS@
-LLVM_LDFLAGS = @LLVM_LDFLAGS@
-LLVM_LIBS = @LLVM_LIBS@
-LLVM_LINK = @LLVM_LINK@
-LLVM_OPT = @LLVM_OPT@
-LLVM_VERSION = @LLVM_VERSION@
-LN_S = @LN_S@
-LTDL_LIBS = @LTDL_LIBS@
-LTLIBOBJS = @LTLIBOBJS@
-LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
-MAKEINFO = @MAKEINFO@
-MANIFEST_TOOL = @MANIFEST_TOOL@
-MKDIR_P = @MKDIR_P@
-NM = @NM@
-NMEDIT = @NMEDIT@
-OBJDUMP = @OBJDUMP@
-OBJEXT = @OBJEXT@
-OCL_ICD_CFLAGS = @OCL_ICD_CFLAGS@
-OCL_ICD_LIBS = @OCL_ICD_LIBS@
-OCL_KERNEL_ARCH = @OCL_KERNEL_ARCH@
-OCL_KERNEL_TARGET = @OCL_KERNEL_TARGET@
-OCL_KERNEL_TARGET_CPU = @OCL_KERNEL_TARGET_CPU@
-OCL_TARGETS = @OCL_TARGETS@
-OPENCL_CFLAGS = @OPENCL_CFLAGS@
-OPENCL_CMAKE = @OPENCL_CMAKE@
-OPENCL_EXTLIBS = @OPENCL_EXTLIBS@
-OPENCL_LIBS = @OPENCL_LIBS@
-OPT = @OPT@
-OTOOL = @OTOOL@
-OTOOL64 = @OTOOL64@
-PACKAGE = @PACKAGE@
-PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
-PACKAGE_NAME = @PACKAGE_NAME@
-PACKAGE_STRING = @PACKAGE_STRING@
-PACKAGE_TARNAME = @PACKAGE_TARNAME@
-PACKAGE_URL = @PACKAGE_URL@
-PACKAGE_VERSION = @PACKAGE_VERSION@
-PATH_SEPARATOR = @PATH_SEPARATOR@
-PKG_CONFIG = @PKG_CONFIG@
-PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
-PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
-POAT_TESTSUITES = @POAT_TESTSUITES@
-POCL_DEVICE_ADDRESS_BITS = @POCL_DEVICE_ADDRESS_BITS@
-PTHREAD_CC = @PTHREAD_CC@
-PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
-PTHREAD_LIBS = @PTHREAD_LIBS@
-RANLIB = @RANLIB@
-SDL_CFLAGS = @SDL_CFLAGS@
-SDL_LIBS = @SDL_LIBS@
-SED = @SED@
-SET_MAKE = @SET_MAKE@
-SHELL = @SHELL@
-STRIP = @STRIP@
-TARGET = @TARGET@
-TARGET_CLANG_FLAGS = @TARGET_CLANG_FLAGS@
-TARGET_CPU = @TARGET_CPU@
-TARGET_LLC_FLAGS = @TARGET_LLC_FLAGS@
-TARGET_SIZEOF_DOUBLE = @TARGET_SIZEOF_DOUBLE@
-TARGET_SIZEOF_HALF = @TARGET_SIZEOF_HALF@
-TARGET_SIZEOF_LONG = @TARGET_SIZEOF_LONG@
-TARGET_SIZEOF_VOID_P = @TARGET_SIZEOF_VOID_P@
-TCECC = @TCECC@
-TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
-TCE_AVAILABLE = @TCE_AVAILABLE@
-TCE_CONFIG = @TCE_CONFIG@
-VERSION = @VERSION@
-abs_builddir = @abs_builddir@
-abs_srcdir = @abs_srcdir@
-abs_top_builddir = @abs_top_builddir@
-abs_top_srcdir = @abs_top_srcdir@
-ac_ct_AR = @ac_ct_AR@
-ac_ct_CC = @ac_ct_CC@
-ac_ct_CXX = @ac_ct_CXX@
-ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
-acx_pthread_config = @acx_pthread_config@
-am__include = @am__include@
-am__leading_dot = @am__leading_dot@
-am__quote = @am__quote@
-am__tar = @am__tar@
-am__untar = @am__untar@
-bindir = @bindir@
-build = @build@
-build_alias = @build_alias@
-build_cpu = @build_cpu@
-build_os = @build_os@
-build_vendor = @build_vendor@
-builddir = @builddir@
-datadir = @datadir@
-datarootdir = @datarootdir@
-docdir = @docdir@
-dvidir = @dvidir@
-exec_prefix = @exec_prefix@
-host = @host@
-host_alias = @host_alias@
-host_cpu = @host_cpu@
-host_os = @host_os@
-host_vendor = @host_vendor@
-htmldir = @htmldir@
-includedir = @includedir@
-infodir = @infodir@
-install_sh = @install_sh@
-libdir = @libdir@
-libexecdir = @libexecdir@
-localedir = @localedir@
-localstatedir = @localstatedir@
-mandir = @mandir@
-mkdir_p = @mkdir_p@
-oldincludedir = @oldincludedir@
-pdfdir = @pdfdir@
-prefix = @prefix@
-program_transform_name = @program_transform_name@
-psdir = @psdir@
-sbindir = @sbindir@
-sharedstatedir = @sharedstatedir@
-srcdir = @srcdir@
-sysconfdir = @sysconfdir@
-target = @target@
-target_alias = @target_alias@
-target_cpu = @target_cpu@
-target_os = @target_os@
-target_vendor = @target_vendor@
-top_build_prefix = @top_build_prefix@
-top_builddir = @top_builddir@
-top_srcdir = @top_srcdir@
-host_SOURCES = host.cpp 
-host_LDADD = ../../../lib/CL/libpocl.la ../../../lib/poclu/libpoclu.la @LD_FLAGS_BIN@
-host_CXXFLAGS = @PTHREAD_CFLAGS@ -std=c++11 -Wno-deprecated -Wno-deprecated-declarations
-AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/lib/CL \
-	-DSRCDIR='"$(abs_srcdir)"'
-
-all: all-am
-
-.SUFFIXES:
-.SUFFIXES: .cpp .lo .o .obj
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
-	@for dep in $?; do \
-	  case '$(am__configure_deps)' in \
-	    *$$dep*) \
-	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
-	        && { if test -f $@; then exit 0; else break; fi; }; \
-	      exit 1;; \
-	  esac; \
-	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign tests/cell/hello/Makefile'; \
-	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign tests/cell/hello/Makefile
-Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
-	@case '$?' in \
-	  *config.status*) \
-	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
-	  *) \
-	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
-	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
-	esac;
-
-$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-
-$(top_srcdir)/configure:  $(am__configure_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4):  $(am__aclocal_m4_deps)
-	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(am__aclocal_m4_deps):
-
-clean-noinstPROGRAMS:
-	@list='$(noinst_PROGRAMS)'; test -n "$$list" || exit 0; \
-	echo " rm -f" $$list; \
-	rm -f $$list || exit $$?; \
-	test -n "$(EXEEXT)" || exit 0; \
-	list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
-	echo " rm -f" $$list; \
-	rm -f $$list
-
-host$(EXEEXT): $(host_OBJECTS) $(host_DEPENDENCIES) $(EXTRA_host_DEPENDENCIES) 
-	@rm -f host$(EXEEXT)
-	$(AM_V_CXXLD)$(host_LINK) $(host_OBJECTS) $(host_LDADD) $(LIBS)
-
-mostlyclean-compile:
-	-rm -f *.$(OBJEXT)
-
-distclean-compile:
-	-rm -f *.tab.c
-
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/host-host.Po@am__quote@
-
-.cpp.o:
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $<
-
-.cpp.obj:
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
-
-.cpp.lo:
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
-
-host-host.o: host.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(host_CXXFLAGS) $(CXXFLAGS) -MT host-host.o -MD -MP -MF $(DEPDIR)/host-host.Tpo -c -o host-host.o `test -f 'host.cpp' || echo '$(srcdir)/'`host.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/host-host.Tpo $(DEPDIR)/host-host.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='host.cpp' object='host-host.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(host_CXXFLAGS) $(CXXFLAGS) -c -o host-host.o `test -f 'host.cpp' || echo '$(srcdir)/'`host.cpp
-
-host-host.obj: host.cpp
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(host_CXXFLAGS) $(CXXFLAGS) -MT host-host.obj -MD -MP -MF $(DEPDIR)/host-host.Tpo -c -o host-host.obj `if test -f 'host.cpp'; then $(CYGPATH_W) 'host.cpp'; else $(CYGPATH_W) '$(srcdir)/host.cpp'; fi`
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/host-host.Tpo $(DEPDIR)/host-host.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='host.cpp' object='host-host.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(host_CXXFLAGS) $(CXXFLAGS) -c -o host-host.obj `if test -f 'host.cpp'; then $(CYGPATH_W) 'host.cpp'; else $(CYGPATH_W) '$(srcdir)/host.cpp'; fi`
-
-mostlyclean-libtool:
-	-rm -f *.lo
-
-clean-libtool:
-	-rm -rf .libs _libs
-
-ID: $(am__tagged_files)
-	$(am__define_uniq_tagged_files); mkid -fID $$unique
-tags: tags-am
-TAGS: tags
-
-tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
-	set x; \
-	here=`pwd`; \
-	$(am__define_uniq_tagged_files); \
-	shift; \
-	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
-	  test -n "$$unique" || unique=$$empty_fix; \
-	  if test $$# -gt 0; then \
-	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
-	      "$$@" $$unique; \
-	  else \
-	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
-	      $$unique; \
-	  fi; \
-	fi
-ctags: ctags-am
-
-CTAGS: ctags
-ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
-	$(am__define_uniq_tagged_files); \
-	test -z "$(CTAGS_ARGS)$$unique" \
-	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
-	     $$unique
-
-GTAGS:
-	here=`$(am__cd) $(top_builddir) && pwd` \
-	  && $(am__cd) $(top_srcdir) \
-	  && gtags -i $(GTAGS_ARGS) "$$here"
-cscopelist: cscopelist-am
-
-cscopelist-am: $(am__tagged_files)
-	list='$(am__tagged_files)'; \
-	case "$(srcdir)" in \
-	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
-	  *) sdir=$(subdir)/$(srcdir) ;; \
-	esac; \
-	for i in $$list; do \
-	  if test -f "$$i"; then \
-	    echo "$(subdir)/$$i"; \
-	  else \
-	    echo "$$sdir/$$i"; \
-	  fi; \
-	done >> $(top_builddir)/cscope.files
-
-distclean-tags:
-	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
-
-distdir: $(DISTFILES)
-	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
-	list='$(DISTFILES)'; \
-	  dist_files=`for file in $$list; do echo $$file; done | \
-	  sed -e "s|^$$srcdirstrip/||;t" \
-	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
-	case $$dist_files in \
-	  */*) $(MKDIR_P) `echo "$$dist_files" | \
-			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
-			   sort -u` ;; \
-	esac; \
-	for file in $$dist_files; do \
-	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
-	  if test -d $$d/$$file; then \
-	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
-	    if test -d "$(distdir)/$$file"; then \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
-	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
-	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
-	    fi; \
-	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
-	  else \
-	    test -f "$(distdir)/$$file" \
-	    || cp -p $$d/$$file "$(distdir)/$$file" \
-	    || exit 1; \
-	  fi; \
-	done
-check-am: all-am
-check: check-am
-all-am: Makefile $(PROGRAMS)
-installdirs:
-install: install-am
-install-exec: install-exec-am
-install-data: install-data-am
-uninstall: uninstall-am
-
-install-am: all-am
-	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
-
-installcheck: installcheck-am
-install-strip:
-	if test -z '$(STRIP)'; then \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	      install; \
-	else \
-	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
-	fi
-mostlyclean-generic:
-
-clean-generic:
-
-distclean-generic:
-	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
-
-maintainer-clean-generic:
-	@echo "This command is intended for maintainers to use"
-	@echo "it deletes files that may require special tools to rebuild."
-clean: clean-am
-
-clean-am: clean-generic clean-libtool clean-noinstPROGRAMS \
-	mostlyclean-am
-
-distclean: distclean-am
-	-rm -rf ./$(DEPDIR)
-	-rm -f Makefile
-distclean-am: clean-am distclean-compile distclean-generic \
-	distclean-tags
-
-dvi: dvi-am
-
-dvi-am:
-
-html: html-am
-
-html-am:
-
-info: info-am
-
-info-am:
-
-install-data-am:
-
-install-dvi: install-dvi-am
-
-install-dvi-am:
-
-install-exec-am:
-
-install-html: install-html-am
-
-install-html-am:
-
-install-info: install-info-am
-
-install-info-am:
-
-install-man:
-
-install-pdf: install-pdf-am
-
-install-pdf-am:
-
-install-ps: install-ps-am
-
-install-ps-am:
-
-installcheck-am:
-
-maintainer-clean: maintainer-clean-am
-	-rm -rf ./$(DEPDIR)
-	-rm -f Makefile
-maintainer-clean-am: distclean-am maintainer-clean-generic
-
-mostlyclean: mostlyclean-am
-
-mostlyclean-am: mostlyclean-compile mostlyclean-generic \
-	mostlyclean-libtool
-
-pdf: pdf-am
-
-pdf-am:
-
-ps: ps-am
-
-ps-am:
-
-uninstall-am:
-
-.MAKE: install-am install-strip
-
-.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
-	clean-libtool clean-noinstPROGRAMS cscopelist-am ctags \
-	ctags-am distclean distclean-compile distclean-generic \
-	distclean-libtool distclean-tags distdir dvi dvi-am html \
-	html-am info info-am install install-am install-data \
-	install-data-am install-dvi install-dvi-am install-exec \
-	install-exec-am install-html install-html-am install-info \
-	install-info-am install-man install-pdf install-pdf-am \
-	install-ps install-ps-am install-strip installcheck \
-	installcheck-am installdirs maintainer-clean \
-	maintainer-clean-generic mostlyclean mostlyclean-compile \
-	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
-	tags tags-am uninstall uninstall-am
-
-.PRECIOUS: Makefile
-
-
-run: host
-	@POCL_DEVICES="cellspu" ./host
-
-# Tell versions [3.59,3.63) of GNU make to not export all variables.
-# Otherwise a system limit (for SysV at least) may be exceeded.
-.NOEXPORT:
diff --git a/tests/cell/hello/host.cpp b/tests/cell/hello/host.cpp
deleted file mode 100644
index ef6cdc1..0000000
--- a/tests/cell/hello/host.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Smoke tests the cell device driver.
-
-   Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
-   
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-   
-   The above copyright notice and this permission notice shall be included in
-   all copies or substantial portions of the Software.
-   
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-   THE SOFTWARE.
-*/
-
-// Enable OpenCL C++ exceptions
-#define CL_HPP_ENABLE_EXCEPTIONS
-#define CL_HPP_MINIMUM_OPENCL_VERSION 120
-#define CL_HPP_TARGET_OPENCL_VERSION 120
-#include <CL/cl2.hpp>
-
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <cassert>
-
-#include "poclu.h"
-
-static char
-kernelSourceCode[] = 
-"kernel \n"
-"void test_kernel(constant char *input,\n"
-"                 __global char *output,\n"
-"                 float a,\n"
-"                 int b) {\n"
-"    constant char* pos = input; \n"
-"    __global char *outpos = output;\n"
-"    while (*pos) {\n"
-"        *(outpos++) = *(pos++);\n"
-"    }\n"
-"    *outpos = '\\0';\n"
-"    output[1] = 'O'; \n"
-"}\n";
-
-int
-main(void)
-{
-    const int OUTPUT_SIZE = 6;
-    const char *input = "PING\0";
-    char output[OUTPUT_SIZE];
-    float a = 23456.0f;
-    int b = 2000001;   
-
-    try {
-        std::vector<cl::Platform> platformList;
-
-        // Pick platform
-        cl::Platform::get(&platformList);
-
-        // Pick first platform
-        cl_context_properties cprops[] = {
-            CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0};
-        cl::Context context(CL_DEVICE_TYPE_ACCELERATOR, cprops);
-
-        // Query the set of devices attached to the context
-        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
-        
-        assert (devices.size() == 1);
-
-        cl::Device device = devices.at(0);
-
-        assert (device.getInfo<CL_DEVICE_NAME>() == "cellspu");
-
-        a = poclu_bswap_cl_float (device(), a);
-        b = poclu_bswap_cl_int (device(), b);
-
-        // Create and program from source
-        cl::Program::Sources sources({kernelSourceCode});
-        cl::Program program(context, sources);
-
-        // Build program
-        program.build(devices);
-
-        cl::Buffer inputBuffer = cl::Buffer(
-            context, 
-            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
-            strlen (input), (void *) &input[0]);
-
-        // Create buffer for that uses the host ptr C
-        cl::Buffer outputBuffer = cl::Buffer(
-            context, 
-            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, 
-            OUTPUT_SIZE, (void *) &output[0]);
-
-        // Create kernel object
-        cl::Kernel kernel(program, "test_kernel");
-
-        // Set kernel args
-        kernel.setArg(0, inputBuffer);
-        kernel.setArg(1, outputBuffer);
-        kernel.setArg(2, a);
-        kernel.setArg(3, b);
-
-        // Create command queue
-        cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE);
- 
-        cl::Event enqEvent;
-
-        // Do the work
-        queue.enqueueNDRangeKernel(
-            kernel, 
-            cl::NullRange, 
-            cl::NDRange(1),
-            cl::NullRange,
-            NULL, &enqEvent);
- 
-        cl::Event mapEvent;
-        (int *) queue.enqueueMapBuffer(
-            outputBuffer,
-            CL_TRUE, // block 
-            CL_MAP_READ,
-            0, OUTPUT_SIZE, NULL, &mapEvent);
-       
-        if (std::string(output) == "PONG") 
-            std::cout << "OK\n";
-        else
-            std::cerr << "FAIL, received: " << output << "\n";
-
-        cl::Event unmapEvent;
-        // Finally release our hold on accessing the memory
-        queue.enqueueUnmapMemObject(
-            outputBuffer,
-            (void *) &output[0],
-            NULL,
-            &unmapEvent);
-
-        queue.finish();
-
-        assert (enqEvent.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() == CL_COMPLETE);
-        assert (mapEvent.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() == CL_COMPLETE);
-        assert (unmapEvent.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() == CL_COMPLETE);
-
-
-        assert (
-            enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() <=
-            enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());
-
-        assert (
-            enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>() <=
-            enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>());
-
-        assert (
-            enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() <
-            enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>());
-
-#if 0
-        std::cerr << "exec time: " 
-                  << enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() -
-            enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() << std::endl;
-#endif
-
-        assert (
-            mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() <=
-            mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());
-
-        assert (
-            mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>() <=
-            mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>());
-
-
-        assert (
-            mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() <=
-            mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>());
-
-        assert (
-            unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>() <=
-            unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());
-
-        assert (
-            unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>() <=
-            unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>());
-
-        assert (
-            unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() <=
-            unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>());
-
-        assert (enqEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() <=
-                mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>());
-
-        assert (mapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() <=
-                unmapEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>());
-
-    } 
-    catch (cl::Error err) {
-         std::cerr
-             << "ERROR: "
-             << err.what()
-             << "("
-             << err.err()
-             << ")"
-             << std::endl;
-
-         return EXIT_FAILURE;
-    }
-
-    return EXIT_SUCCESS;
-}
diff --git a/tests/kernel/CMakeLists.txt b/tests/kernel/CMakeLists.txt
index d0b6158..44ccbfd 100644
--- a/tests/kernel/CMakeLists.txt
+++ b/tests/kernel/CMakeLists.txt
@@ -41,23 +41,32 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${OPENCL_CFLAGS}")
 add_executable("kernel" "kernel.c") # test_as_type.cl test_bitselect.cl test_convert_sat_regression.cl test_convert_type_*.cl test_fabs.cl test_fmin_fmax_fma.cl test_hadd.cl test_min_max.cl test_length_distance.cl test_rotate.cl test_short16.cl test_sizeof.cl test_block.cl test_printf.cl
 target_link_libraries("kernel" ${POCLU_LINK_OPTIONS})
 
-add_test("kernel/test_as_type" "kernel" "test_as_type")
+add_test_pocl(NAME "kernel/test_as_type"
+              COMMAND "kernel" "test_as_type")
 
-add_test("kernel/test_convert_type_1" "kernel" "test_convert_type_1")
+add_test_pocl(NAME "kernel/test_convert_type_1"
+              COMMAND "kernel" "test_convert_type_1")
 
-add_test("kernel/test_convert_type_2" "kernel" "test_convert_type_2")
+add_test_pocl(NAME "kernel/test_convert_type_2"
+              COMMAND "kernel" "test_convert_type_2")
 
-add_test("kernel/test_convert_type_4" "kernel" "test_convert_type_4")
+add_test_pocl(NAME "kernel/test_convert_type_4"
+              COMMAND "kernel" "test_convert_type_4")
 
-add_test("kernel/test_convert_type_8" "kernel" "test_convert_type_8")
+add_test_pocl(NAME "kernel/test_convert_type_8"
+              COMMAND "kernel" "test_convert_type_8")
 
-add_test("kernel/test_convert_type_16" "kernel" "test_convert_type_16")
+add_test_pocl(NAME "kernel/test_convert_type_16"
+              COMMAND "kernel" "test_convert_type_16")
 
-add_test("kernel/test_bitselect" "kernel" "test_bitselect")
+add_test_pocl(NAME "kernel/test_bitselect"
+              COMMAND "kernel" "test_bitselect")
 
-add_test("kernel/test_hadd_loopvec" "kernel" "test_hadd")
+add_test_pocl(NAME "kernel/test_hadd_loopvec"
+              COMMAND "kernel" "test_hadd")
 
-add_test("kernel/test_hadd_loops" "kernel" "test_hadd")
+add_test_pocl(NAME "kernel/test_hadd_loops"
+              COMMAND "kernel" "test_hadd")
 
 set_tests_properties( "kernel/test_as_type" "kernel/test_bitselect"
   "kernel/test_convert_type_1" "kernel/test_convert_type_2" "kernel/test_convert_type_4"
@@ -76,23 +85,32 @@ set_tests_properties("kernel/test_hadd_loops"
 set_tests_properties("kernel/test_hadd_loopvec"
   PROPERTIES ENVIRONMENT "POCL_WORK_GROUP_METHOD=loopvec")
 
-add_test("kernel/test_min_max" "kernel" "test_min_max")
+add_test_pocl(NAME "kernel/test_min_max"
+              COMMAND "kernel" "test_min_max")
 
-add_test("kernel/test_length_distance" "kernel" "test_length_distance")
+add_test_pocl(NAME "kernel/test_length_distance"
+              COMMAND "kernel" "test_length_distance")
 
-add_test("kernel/test_fmin_fmax_fma" "kernel" "test_fmin_fmax_fma")
+add_test_pocl(NAME "kernel/test_fmin_fmax_fma"
+              COMMAND "kernel" "test_fmin_fmax_fma")
 
-add_test("kernel/test_convert_sat_regression" "kernel" "test_convert_sat_regression")
+add_test_pocl(NAME "kernel/test_convert_sat_regression"
+              COMMAND "kernel" "test_convert_sat_regression")
 
-add_test("kernel/test_rotate" "kernel" "test_rotate")
+add_test_pocl(NAME "kernel/test_rotate"
+              COMMAND "kernel" "test_rotate")
 
-add_test("kernel/test_fabs" "kernel" "test_fabs")
+add_test_pocl(NAME "kernel/test_fabs"
+              COMMAND "kernel" "test_fabs")
 
-add_test("kernel/test_short16" "kernel" "test_short16")
+add_test_pocl(NAME "kernel/test_short16"
+              COMMAND "kernel" "test_short16")
 
-add_test("kernel/test_frexp_modf" "kernel" "test_frexp_modf")
+add_test_pocl(NAME "kernel/test_frexp_modf"
+              COMMAND "kernel" "test_frexp_modf")
 
-add_test("kernel/test_local_struct_array" "kernel" "test_local_struct_array")
+add_test_pocl(NAME "kernel/test_local_struct_array"
+              COMMAND "kernel" "test_local_struct_array")
 
 set_tests_properties("kernel/test_min_max" "kernel/test_length_distance"
   "kernel/test_fmin_fmax_fma" "kernel/test_local_struct_array"
@@ -103,43 +121,14 @@ set_tests_properties("kernel/test_min_max" "kernel/test_length_distance"
     FAIL_REGULAR_EXPRESSION "FAIL"
     PASS_REGULAR_EXPRESSION "\nOK\n"
     PROCESSORS 1
-    DEPENDS "pocl_version_check")
-
-if(LLVM_3_2 AND POWERPC)
-  set_tests_properties("kernel/test_short16"
-    PROPERTIES WILL_FAIL 1)
-endif()
-
-if(LLVM_3_2 OR (LLVM_3_3 AND POWERPC))
-  set_tests_properties("kernel/test_rotate"
-    PROPERTIES WILL_FAIL 1)
-endif()
+    DEPENDS "pocl_version_check"
+    LABELS "internal;kernel")
 
 if(LLVM_ASSERTS_BUILD)
   set_tests_properties("kernel/test_local_struct_array"
     PROPERTIES WILL_FAIL 1)
 endif()
 
-# 3-element vector cases fail when vectorizer is enabled,
-# at least with Intel Core i5 and AMD FX8. Assume it fails on all others too.
-if(X86_64 AND (LLVM_3_5 OR LLVM_3_6))
-  set_tests_properties("kernel/test_hadd_loopvec"
-    PROPERTIES WILL_FAIL 1)
-endif()
-
-if(POWERPC)
-  set_tests_properties(
-  "kernel/test_convert_type_1" "kernel/test_convert_type_2" "kernel/test_convert_type_4"
-  "kernel/test_convert_type_8" "kernel/test_convert_type_16"
-    PROPERTIES WILL_FAIL 1)
-endif()
-
-if(POWERPC)
-  set_tests_properties("kernel/test_as_type" "kernel/test_fmin_fmax_fma"
-  "kernel/test_bitselect" "kernel/test_fabs" "test_hadd"
-    PROPERTIES WILL_FAIL 1)
-endif()
-
 
 ######################################################################
 if(MSVC)
@@ -154,9 +143,11 @@ target_link_libraries("sampler_address_clamp" ${POCLU_LINK_OPTIONS})
 add_executable("image_query_funcs" "image_query_funcs.c") #test_image_query_funcs.cl
 target_link_libraries("image_query_funcs" ${POCLU_LINK_OPTIONS})
 
-add_test("kernel/test_sampler_address_clamp" "sampler_address_clamp")
+add_test_pocl(NAME "kernel/test_sampler_address_clamp"
+              COMMAND "sampler_address_clamp")
 
-add_test("kernel/test_image_query_funcs" "image_query_funcs")
+add_test_pocl(NAME "kernel/test_image_query_funcs"
+              COMMAND "image_query_funcs")
 
 set_tests_properties( "kernel/test_sampler_address_clamp"
   "kernel/test_image_query_funcs"
@@ -164,35 +155,46 @@ set_tests_properties( "kernel/test_sampler_address_clamp"
     COST 4.0
     PASS_REGULAR_EXPRESSION "\nOK\n"
     PROCESSORS 1
-    DEPENDS "pocl_version_check")
+    DEPENDS "pocl_version_check"
+    LABELS "internal;kernel")
 
 ######################################################################
 
 add_executable("test_shuffle" "test_shuffle.cc")
 target_link_libraries("test_shuffle" ${POCLU_LINK_OPTIONS})
 
-add_test("kernel/test_shuffle_char" "test_shuffle" "char")
+add_test_pocl(NAME "kernel/test_shuffle_char"
+              COMMAND "test_shuffle" "char")
 
-add_test("kernel/test_shuffle_short" "test_shuffle" "short")
+add_test_pocl(NAME "kernel/test_shuffle_short"
+              COMMAND "test_shuffle" "short")
 
-add_test("kernel/test_shuffle_ushort" "test_shuffle" "ushort")
+add_test_pocl(NAME "kernel/test_shuffle_ushort"
+              COMMAND "test_shuffle" "ushort")
 
 if(NOT CL_DISABLE_HALF)
-  add_test("kernel/test_shuffle_half" "test_shuffle" "half")
+  add_test_pocl(NAME "kernel/test_shuffle_half"
+              COMMAND "test_shuffle" "half")
   set(HALF_TEST "kernel/test_shuffle_half")
 endif()
 
-add_test("kernel/test_shuffle_int" "test_shuffle" "int")
+add_test_pocl(NAME "kernel/test_shuffle_int"
+              COMMAND "test_shuffle" "int")
 
-add_test("kernel/test_shuffle_uint" "test_shuffle" "uint")
+add_test_pocl(NAME "kernel/test_shuffle_uint"
+              COMMAND "test_shuffle" "uint")
 
-add_test("kernel/test_shuffle_float" "test_shuffle" "float")
+add_test_pocl(NAME "kernel/test_shuffle_float"
+              COMMAND "test_shuffle" "float")
 
-add_test("kernel/test_shuffle_long" "test_shuffle" "long")
+add_test_pocl(NAME "kernel/test_shuffle_long"
+              COMMAND "test_shuffle" "long")
 
-add_test("kernel/test_shuffle_ulong" "test_shuffle" "ulong")
+add_test_pocl(NAME "kernel/test_shuffle_ulong"
+              COMMAND "test_shuffle" "ulong")
 
-add_test("kernel/test_shuffle_double" "test_shuffle" "double")
+add_test_pocl(NAME "kernel/test_shuffle_double"
+              COMMAND "test_shuffle" "double")
 
 set_tests_properties("kernel/test_shuffle_char" "kernel/test_shuffle_short"
   "kernel/test_shuffle_ushort"   ${HALF_TEST}
@@ -203,14 +205,16 @@ set_tests_properties("kernel/test_shuffle_char" "kernel/test_shuffle_short"
     COST 77
     PASS_REGULAR_EXPRESSION "OK\n"
     PROCESSORS 1
-    DEPENDS "pocl_version_check")
+    DEPENDS "pocl_version_check"
+    LABELS "internal;kernel")
 
 
 
 ######################################################################
 
 
-add_test("kernel/test_printf" "kernel" "test_printf")
+add_test_pocl(NAME "kernel/test_printf"
+              COMMAND "kernel" "test_printf")
 
 set_tests_properties("kernel/test_printf"
   PROPERTIES
@@ -241,15 +245,17 @@ OK"
     PROCESSORS 1
     DEPENDS "pocl_version_check")
 
-if(I386 OR (LLVM_VERSION VERSION_LESS "3.4"))
+if(I386)
   set_tests_properties("kernel/test_printf"
     PROPERTIES WILL_FAIL 1)
 endif()
 
 ######################################################################
 
-add_test_custom("${CMAKE_CURRENT_BINARY_DIR}/kernel" "kernel/test_sizeof_uint"
-                "test_sizeof_expout.txt" "test_sizeof")
+add_test_pocl(NAME "kernel/test_sizeof_uint"
+              EXPECTED_OUTPUT "test_sizeof_expout.txt"
+              COMMAND "kernel" "test_sizeof")
+
 
 ######################################################################
 
diff --git a/tests/kernel/Makefile.in b/tests/kernel/Makefile.in
index e2426dd..9480e04 100644
--- a/tests/kernel/Makefile.in
+++ b/tests/kernel/Makefile.in
@@ -280,6 +280,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -287,6 +288,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -304,8 +306,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -381,6 +381,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/tests/kernel/test_as_type.cl b/tests/kernel/test_as_type.cl
index 09d5b13..8d98f3d 100644
--- a/tests/kernel/test_as_type.cl
+++ b/tests/kernel/test_as_type.cl
@@ -1,11 +1,5 @@
 // TESTING: as_TYPEn
 
-#if __clang_major__ == 3 && __clang_minor__ < 4
-typedef const char* string;     /* for backward compatibility */
-#else
-typedef constant char* string;
-#endif
-
 
 _CL_NOINLINE
 void clear_bytes(uchar* p, uchar c, size_t n)
@@ -17,7 +11,7 @@ void clear_bytes(uchar* p, uchar c, size_t n)
 
 _CL_NOINLINE
 void compare_bytes(
-    string name,
+    constant char* name,
     const uchar* dst, size_t dst_size, size_t dst_elsize,
     const uchar* src, size_t src_size, size_t src_elsize)
 {
@@ -64,7 +58,7 @@ kernel void test_as_type()
     union { DST value; uchar raw[sizeof(DST)]; } dst;           \
     clear_bytes(src.raw, 0x44, sizeof(SRC));                    \
     clear_bytes(dst.raw, 0x99, sizeof(DST));                    \
-    src.value = *((SRC*)data);                                  \
+    src.value = *((private SRC*)data);                          \
     dst.value = as_##DST(src.value);                            \
     compare_bytes("as_" #DST "((" #SRC "))",                    \
         dst.raw, sizeof(DST), N, src.raw, sizeof(SRC), M);      \
diff --git a/tests/kernel/test_shuffle.cc b/tests/kernel/test_shuffle.cc
index db0b953..4ee8a0e 100644
--- a/tests/kernel/test_shuffle.cc
+++ b/tests/kernel/test_shuffle.cc
@@ -26,256 +26,308 @@ cl_device_id did;
 cl_platform_id pid;
 cl_command_queue queue;
 
-// One shuffle testcase
-// D: data type (in OCL host syntax)
-// M: mask type
-// n: output & mask number of vector elements
-// m: in1 & in2 number of vector elements
-// ocl_type: the type of D in OCL-C syntax
+#define ERRCHECK()  if (check_cl_error(errcode, __LINE__, __FUNCTION__)) abort();
+
+static const unsigned vecelts[5]={2,3,4,8,16};
+static const int stimuli[] = {4, 2, 69, 4, 5, 0, 45, 16, 4, 6, 1, 18, 28, 14,
+                 22, 16, 8, 2, 0, 31, 42, 11, 62, 88, 99, 23, 13};
+
 template <typename D, typename M>
-class testcase {
+class TestShuffle {
+  cl_mem mem_in1,
+  mem_in2,
+  mem_out,
+  mem_mask1,
+  mem_mask2;
+
+  cl_program prog;
+
+  D in1 [16] __attribute__ ((aligned (128)));
+  D in2 [16] __attribute__ ((aligned (128)));
+  D out [16] __attribute__ ((aligned (128)));
+  M mask1 [16] __attribute__ ((aligned (128)));
+  M mask2 [16] __attribute__ ((aligned (128)));
+  const char* ocl_type;
+  unsigned size;
+  cl_int errcode;
+
+private:
+  /* Prints into std::string all the OpenCL kernel sources for each n,m combo */
+  void testcase_src(std::string & src) {
+    char buf[1024];
+    int rv;
+    unsigned n, m;
+    const char* mask_type;
+    switch(sizeof(M)) {
+      case 1:
+        mask_type = "uchar"; break;
+      case 2:
+        mask_type = "ushort"; break;
+      case 4:
+        mask_type = "uint"; break;
+      case 8:
+        mask_type = "ulong"; break;
+      default:
+        mask_type = "UNKNOWN_MASK";
+    }
+
+    for(unsigned n_loop=0; n_loop<5; n_loop++) {
+        for(unsigned m_loop=0; m_loop<5; m_loop++) {
+
+            n = vecelts[n_loop];
+            m = vecelts[m_loop];
+            rv = 0;
+            buf[0] = 0;
+            rv=sprintf(buf,
+                           "__kernel void test_shuffle_%d_%d("
+                           "__global %s%d *in, __global %s%d *mask, __global %s%d *out) {\n"
+                           "*out = shuffle( *in, *mask);\n}\n",
+                           m, n, ocl_type, m, mask_type, n, ocl_type, n);
+            rv+=sprintf(buf+rv,
+                           "__kernel void test_shuffle2_%d_%d("
+                           "__global %s%d *in1, __global %s%d *in2, __global %s%d *mask, __global %s%d *out) {\n"
+                           "*out = shuffle2( *in1, *in2, *mask);\n}\n",
+                           m, n, ocl_type, m, ocl_type, m, mask_type, n, ocl_type, n);
+            src.append(buf);
+        }
+    }
+  }
+
+  #define nsize (n==3?4:n)
+  #define msize (m==3?4:m)
+  // assume out is filled with 'shuffle(in, mask)'	// return true if ok
+  bool output_matches_1(unsigned n, unsigned m)
+  {
+    bool error=false;
+    for(unsigned i=0; i<n; i++)
+      {
+        unsigned mm = mask1[i] % msize;
+        error |= (out[i] != in1[mm]);
+      }
+    return !error;
+  }
+
+  // assume out is filled with 'shuffle2(in1, in2, mask)'
+  // return true if ok
+  bool output_matches_2(unsigned n, unsigned m)
+  {
+    bool error=false;
+    for(unsigned i=0; i<n; i++)
+      {
+        unsigned msk = mask2[i] % (2*msize);
+        D correct = (msk < msize) ? in1[msk] : in2[msk-msize];
+        if (out[i] != correct) {
+            error |= true;
+            printf("element %d should be %d (mask %d), got %d\n", i, (int)correct, (int)mask2[i], (int)out[i]);
+          }
+      }
+    return !error;
+  }
+
+
+  // helpers: prints a vector as [0, 1, 2]
+  // cast to int, so vectors of 'char' come out correctly
+  void print_in1(unsigned n, unsigned m)
+  {
+    std::cout << "["<<(int)in1[0];
+    for(unsigned i=1; i<m; i++)
+      std::cout << ", " <<(int)in1[i];
+    std::cout << "]";
+  }
+  void print_in2(unsigned n, unsigned m)
+  {
+    std::cout << "["<<(int)in2[0];
+    for(unsigned i=1; i<m; i++)
+      std::cout << ", " <<(int)in2[i];
+    std::cout << "]";
+  }
+  void print_mask1(unsigned n, unsigned m)
+  {
+    std::cout << "["<<(int)mask1[0];
+    for(unsigned i=1; i<n; i++)
+      std::cout << ", " <<(int)mask1[i];
+    std::cout << "]";
+  }
+  void print_mask2(unsigned n, unsigned m)
+  {
+    std::cout << "["<<(int)mask2[0];
+    for(unsigned i=1; i<n; i++)
+      std::cout << ", " <<(int)mask2[i];
+    std::cout << "]";
+  }
+  void print_out(unsigned n, unsigned m)
+  {
+    std::cout << "["<<(int)out[0];
+    for(unsigned i=1; i<n; i++)
+      std::cout << ", " <<(int)out[i];
+    std::cout << "]";
+  }
+
+  /* Run one shuffle test, return true if successful*/
+
+  bool run_single_test(unsigned n, unsigned m){
+    bool rv=true;
+    cl_kernel krn, krn2;
+    char kern_name[128], kern_name2[128];
+
+    snprintf(kern_name, 128, "test_shuffle_%d_%d", m, n);
+    krn = clCreateKernel(prog, kern_name, &errcode);
+    ERRCHECK()
+
+    errcode = clSetKernelArg( krn, 0, sizeof(cl_mem), &mem_in1 );
+    ERRCHECK()
+    errcode = clSetKernelArg( krn, 1, sizeof(cl_mem), &mem_mask1 );
+    ERRCHECK()
+    errcode = clSetKernelArg( krn, 2, sizeof(cl_mem), &mem_out );
+    ERRCHECK()
+
+    errcode = clEnqueueTask( queue, krn, 0, NULL, NULL );
+    ERRCHECK()
+    errcode = clEnqueueReadBuffer( queue, mem_out, CL_TRUE, 0, size, out, 0, NULL, NULL );
+    ERRCHECK()
+    errcode = clFinish(queue);
+    ERRCHECK()
+
+    if(!output_matches_1(n, m))
+      {
+        std::cout << "Error in shuffle " << ocl_type << " " << m;
+        std::cout << " => " << ocl_type << " " << n << " :";
+        print_out(n, m);
+        std::cout << " = shuffle( ";
+        print_in1(n, m);
+        std::cout << ", ";
+        print_mask1(n, m);
+        std::cout << ");" << std::endl;
+        rv=false;
+      }
+
+    // Now test shuffle2()
+    clReleaseKernel(krn);
+
+    snprintf(kern_name2, 128, "test_shuffle2_%d_%d", m, n);
+    krn2 = clCreateKernel(prog, kern_name2, &errcode);
+    ERRCHECK()
+    errcode = clSetKernelArg( krn2, 0, sizeof(cl_mem), &mem_in1 );
+    ERRCHECK()
+    errcode = clSetKernelArg( krn2, 1, sizeof(cl_mem), &mem_in2 );
+    ERRCHECK()
+    errcode = clSetKernelArg( krn2, 2, sizeof(cl_mem), &mem_mask2 );
+    ERRCHECK()
+    errcode = clSetKernelArg( krn2, 3, sizeof(cl_mem), &mem_out );
+    ERRCHECK()
+    errcode = clEnqueueTask( queue, krn2, 0, NULL, NULL );
+    ERRCHECK()
+    errcode = clEnqueueReadBuffer( queue, mem_out, CL_TRUE, 0, size, out, 0, NULL, NULL );
+    ERRCHECK()
+    errcode = clFinish(queue);
+    ERRCHECK()
+
+    if(!output_matches_2(n, m))
+      {
+        std::cout << "Error in shuffle2 " << ocl_type << " " << m;
+        std::cout << " => " << ocl_type << " " << n << " :";
+        print_out(n, m);
+        std::cout << " = shuffle2( ";
+        print_in1(n, m);
+        std::cout << ", ";
+        print_in2(n, m);
+        std::cout << ", ";
+        print_mask2(n, m);
+        std::cout << ");" << std::endl;
+        rv=false;
+      }
+    clReleaseKernel(krn2);
+    return rv;
+  }
+
+
+
+
+
 public:
-	unsigned n, m;
-	unsigned nsize, msize;
-	D *in1;
-	D *in2;
-	D *out;
-	M *mask1;
-	M *mask2;
-	const char *d_type;
-
-  testcase(unsigned n_, unsigned m_, const char* ocl_label) :
-    n(n_), m(m_),
-    nsize(n==3?4:n), msize(m==3?4:m),
-    d_type(ocl_label) {
+  TestShuffle(const char* type) {
+    ocl_type = type;
+    size  = sizeof(D) * 16;
+    for(unsigned i=0; i<16; i++) {
+      mask1[i] = (M)stimuli[i];
+      mask2[i] = (M)stimuli[i];
+    }
+  }
+
+  unsigned run()
+  {
+
     // Fixed pseudorandom stimuli to make the test deterministic.
     // Random stimuli leads to randomly appearing/disappearing
     // problems which are irritating and hard to reproduce. Values which reduce
-    // to element 3 might produce an undefined value in case of 3 element inputs so 
+    // to element 3 might produce an undefined value in case of 3 element inputs so
     // let's not use them in the stimulus.
-    int stimuli[] = {4, 2, 69, 4, 5, 0, 45, 16, 4, 6, 1, 18, 28, 14, 
-                     22, 16, 8, 2, 0, 31, 42, 11, 62, 88, 99, 23, 13};
-    in1=new D[msize];       
-    in2=new D[msize];
-    out=new D[nsize];
-    mask1=new M[nsize];
-    mask2=new M[nsize];
-    
-    for(unsigned i=0; i<m; i++) {
-      in1[i]=(D)i;
-      in2[i]=(D)(i+m);
-    }
-    for(unsigned i=0; i<n; i++) {
-      mask1[i] = stimuli[i];
-      mask2[i] = stimuli[i];
-    }
-  }
 
-	int create_source(char *buf)
-	{
-		int rv;
-		rv=sprintf(buf,
-		           "__kernel void test_shuffle("
-		           "__global %s%d *in, __global %s%d *mask, __global %s%d *out) {\n"
-		           "*out = shuffle( *in, *mask);\n}\n",
-		           d_type, m, get_mask_type(), n, d_type, n);
-		rv+=sprintf(buf+rv,
-		           "__kernel void test_shuffle2("
-		           "__global %s%d *in1, __global %s%d *in2, __global %s%d *mask, __global %s%d *out) {\n"
-		           "*out = shuffle2( *in1, *in2, *mask);\n}\n",
-		           d_type, m, d_type, m, get_mask_type(), n, d_type, n);
-		return rv;
-	}
+    mem_in1 = clCreateBuffer(ctx,
+                             CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                             size, in1, &errcode);
+    ERRCHECK()
+    mem_in2 = clCreateBuffer(ctx,
+                             CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                             size, in2, &errcode);
+    ERRCHECK()
+    mem_mask1 = clCreateBuffer(ctx,
+                              CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                              size, mask1, &errcode);
+    ERRCHECK()
+    mem_mask2 = clCreateBuffer(ctx,
+                              CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                              size, mask2, &errcode);
+    ERRCHECK()
+    mem_out = clCreateBuffer(ctx,
+                             CL_MEM_WRITE_ONLY,
+                             size, NULL, &errcode);
+    ERRCHECK()
 
-	// assume out is filled with 'shuffle(in, mask)'
-	// return true if ok
-	bool output_matches_1()
-	{
-		bool error=false;
-		for(unsigned i=0; i<n; i++)
-		{
-			unsigned m = mask1[i] % msize;
-			error |= out[i] != in1[m];
-		}
-		return !error;
-	}
+    std::string source;
+    testcase_src(source);
 
-	// assume out is filled with 'shuffle2(in1, in2, mask)'
-	// return true if ok
-	bool output_matches_2()
-	{
-		bool error=false;
-		for(unsigned i=0; i<n; i++)
-		{
-			unsigned msk = mask2[i] % (2*msize);
-            D correct = (msk < msize) ? in1[msk] : in2[msk-msize];
-            if (out[i] != correct) {
-              error |= true;
-              printf("element %d should be %d (mask %d), got %d\n", i, (int)correct, (int)mask2[i], (int)out[i]);
-            }
-		}
-		return !error;
-	}
+    const char *c_src = source.c_str();
+    size_t srclen = source.size();
 
-	const char* get_mask_type()
-	{
-		switch(sizeof(M)) {
-		case 1:
-			return "uchar"; break;
-		case 2:
-			return "ushort"; break;
-		case 4:
-			return "uint"; break;
-		case 8:
-			return "ulong"; break;
-		default:
-			return NULL;
-		}
-	}
+    prog = clCreateProgramWithSource(ctx, 1, &c_src, &srclen, &errcode);
+    ERRCHECK()
+    errcode = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
+    ERRCHECK()
 
-	// helpers: prints a vector as [0, 1, 2]
-	// cast to int, so vectors of 'char' come out correctly
-	void print_in1()
-	{
-		std::cout << "["<<(int)in1[0];
-		for(unsigned i=1; i<m; i++)
-			std::cout << ", " <<(int)in1[i];
-		std::cout << "]";
-	}
-	void print_in2()
-	{
-		std::cout << "["<<(int)in2[0];
-		for(unsigned i=1; i<m; i++)
-			std::cout << ", " <<(int)in2[i];
-		std::cout << "]";
-	}
-	void print_mask1()
-	{
-		std::cout << "["<<(int)mask1[0];
-		for(unsigned i=1; i<n; i++)
-			std::cout << ", " <<(int)mask1[i];
-		std::cout << "]";
-	}
-	void print_mask2()
-	{
-		std::cout << "["<<(int)mask2[0];
-		for(unsigned i=1; i<n; i++)
-			std::cout << ", " <<(int)mask2[i];
-		std::cout << "]";
-	}
-	void print_out()
-	{
-		std::cout << "["<<(int)out[0];
-		for(unsigned i=1; i<n; i++)
-			std::cout << ", " <<(int)out[i];
-		std::cout << "]";
-	}
-};
+    unsigned errors = 0;
+    for(unsigned n_loop=0; n_loop<5; n_loop++) {
+          for(unsigned m_loop=0; m_loop<5; m_loop++) {
+              unsigned m = vecelts[m_loop];
+              for(unsigned i=0; i<m; i++) {
+                in2[i]=(D)(i+m);
+                in1[i] = (D)i;
+              }
+              if (!run_single_test(vecelts[n_loop], vecelts[m_loop]))
+                errors++;
+          }
+    }
 
+    clReleaseMemObject(mem_in1);
+    clReleaseMemObject(mem_in2);
+    clReleaseMemObject(mem_mask1);
+    clReleaseMemObject(mem_mask2);
+    clReleaseMemObject(mem_out);
+    clReleaseProgram(prog);
+
+    return errors;
+  }
+
+};
 
 
-/* Run one shuffle test, return true if successful*/
-template<typename D, typename M>
-bool runtest( int n, int m, const char* ocl_type){
-	char *buf = (char*) malloc(1024);
-	const char *src[1];
-	src[0]=buf;
-	int numchars;
-	cl_mem mem_in1, mem_in2, mem_out, mem_mask1, mem_mask2;
-	cl_program prog;
-	cl_kernel krn;
-	bool rv=true;
-
-	testcase<D,M> tc(n, m, ocl_type);
-	int size=sizeof(D);
-
-    int mAligned = m;
-    if (m == 3) mAligned = 4;
-
-    int nAligned = n;
-    if (n == 3) nAligned = 4;
-
-	mem_in1 = clCreateBuffer(ctx,
-	                         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-	                         size * mAligned, tc.in1, NULL);
-	mem_in2 = clCreateBuffer(ctx,
-	                         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-	                         size * nAligned, tc.in2, NULL);
-	mem_mask1 = clCreateBuffer(ctx,
-	                          CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-	                          size * nAligned, tc.mask1, NULL);
-	mem_mask2 = clCreateBuffer(ctx,
-	                          CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-	                          size * nAligned, tc.mask2, NULL);
-	mem_out = clCreateBuffer(ctx,
-	                         CL_MEM_WRITE_ONLY,
-	                         size * nAligned, NULL, NULL);
-
-	numchars = tc.create_source( buf );
-	buf[numchars]=0;
-
-	prog = clCreateProgramWithSource(ctx, 1, src, NULL, NULL);
-	clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
-	krn = clCreateKernel(prog, "test_shuffle", NULL);
-
-	clSetKernelArg( krn, 0, sizeof(cl_mem), &mem_in1 );
-	clSetKernelArg( krn, 1, sizeof(cl_mem), &mem_mask1 );
-	clSetKernelArg( krn, 2, sizeof(cl_mem), &mem_out );
-	clEnqueueTask( queue, krn, 0, NULL, NULL );
-	clEnqueueReadBuffer( queue, mem_out, CL_TRUE, 0, size*nAligned, tc.out, 0, NULL, NULL );
-	clFinish(queue);
-
-	if(!tc.output_matches_1()) {
-		std::cout << "Error in shuffle " << ocl_type << " " << m;
-		std::cout << " => " << ocl_type << " " << n << " :";
-		tc.print_out();
-		std::cout << " = shuffle( ";
-		tc.print_in1();
-		std::cout << ", ";
-		tc.print_mask1();
-		std::cout << ");" << std::endl;
-		rv=false;
-	}
 
-	// Now test shuffle2()
-	clReleaseKernel(krn);
-	krn = clCreateKernel(prog, "test_shuffle2", NULL);
-	clSetKernelArg( krn, 0, sizeof(cl_mem), &mem_in1 );
-	clSetKernelArg( krn, 1, sizeof(cl_mem), &mem_in2 );
-	clSetKernelArg( krn, 2, sizeof(cl_mem), &mem_mask2 );
-	clSetKernelArg( krn, 3, sizeof(cl_mem), &mem_out );
-	clEnqueueTask( queue, krn, 0, NULL, NULL );
-	clEnqueueReadBuffer( queue, mem_out, CL_TRUE, 0, size*nAligned, tc.out, 0, NULL, NULL );
-	clFinish(queue);
-
-	if(!tc.output_matches_2()) {
-		std::cout << "Error in shuffle2 " << ocl_type << " " << m;
-		std::cout << " => " << ocl_type << " " << n << " :";
-		tc.print_out();
-		std::cout << " = shuffle2( ";
-		tc.print_in1();
-		std::cout << ", ";
-		tc.print_in2();
-		std::cout << ", ";
-		tc.print_mask2();
-		std::cout << ");" << std::endl;
-		rv=false;
-	}
 
-	clReleaseMemObject(mem_in1);
-	clReleaseMemObject(mem_in2);
-	clReleaseMemObject(mem_mask1);
-	clReleaseMemObject(mem_mask2);
-	clReleaseMemObject(mem_out);
-	clReleaseKernel(krn);
-	clReleaseProgram(prog);
 
-	return rv;
-}
 
 int main( int argc, char *argv[])
 {
-	int num_errors = 0;
+	unsigned num_errors = 0;
 
 	if( argc != 2 ) {
 		std::cout << "give element type"<<std::endl;
@@ -292,51 +344,56 @@ int main( int argc, char *argv[])
 	 * templating mechanism to create the test for shorts instead
 	 * of halfs.
 	 */
-	int vecelts[5]={2,3,4,8,16};
-	for(unsigned n_loop=0; n_loop<5; n_loop++) {
-		for(unsigned m_loop=0; m_loop<5; m_loop++) {
-			bool rv;
-			if( strcmp("char", argv[1]) == 0 )
-				rv=runtest<cl_char, cl_uchar>
-				        (vecelts[n_loop], vecelts[m_loop], "char");
-			else if( strcmp("uchar", argv[1]) == 0 )
-				rv=runtest<cl_uchar, cl_uchar>
-				        (vecelts[n_loop], vecelts[m_loop], "uchar");
-			else if( strcmp("short", argv[1]) == 0 )
-				rv=runtest<cl_short, cl_ushort>
-				        (vecelts[n_loop], vecelts[m_loop], "short");
-			else if( strcmp("ushort", argv[1]) == 0 )
-				rv=runtest<cl_ushort, cl_ushort>
-				        (vecelts[n_loop], vecelts[m_loop], "ushort");
-			else if( strcmp("int", argv[1]) == 0 )
-				rv=runtest<cl_int, cl_uint>
-				        (vecelts[n_loop], vecelts[m_loop], "int");
-			else if( strcmp("uint", argv[1]) == 0 )
-				rv=runtest<cl_uint, cl_uint>
-				        (vecelts[n_loop], vecelts[m_loop], "uint");
-			else if( strcmp("long", argv[1]) == 0 )
-				rv=runtest<cl_long, cl_ulong>
-				        (vecelts[n_loop], vecelts[m_loop], "long");
-			else if( strcmp("ulong", argv[1]) == 0 )
-				rv=runtest<cl_ulong, cl_ulong>
-				        (vecelts[n_loop], vecelts[m_loop], "ulong");
-			else if( strcmp("half", argv[1]) == 0 )
-				rv=runtest<cl_half, cl_ushort>
-				        (vecelts[n_loop], vecelts[m_loop], "half");
-			else if( strcmp("float", argv[1]) == 0 )
-				rv=runtest<cl_float, cl_uint>
-				        (vecelts[n_loop], vecelts[m_loop], "float");
-			else if( strcmp("double", argv[1]) == 0 )
-				rv=runtest<cl_double, cl_ulong>
-				        (vecelts[n_loop], vecelts[m_loop], "double");
-			else {
-				std::cout << "Error: unknown type " << argv[1] << ": use OCL-C types"<<std::endl;
-				return -1;
-			}
-			if(rv==false)
-				num_errors++;
-		}
-	}
+	 if( strcmp("char", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_char, cl_uchar> t("char"); num_errors = t.run();
+
+	 } else if( strcmp("uchar", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_uchar, cl_uchar> t("uchar"); num_errors = t.run();
+
+	 } else if( strcmp("short", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_short, cl_ushort> t("short"); num_errors = t.run();
+
+	 } else if( strcmp("ushort", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_ushort, cl_ushort> t("ushort"); num_errors = t.run();
+
+	 } else if( strcmp("int", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_int, cl_uint> t("int"); num_errors = t.run();
+
+	 } else if( strcmp("uint", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_uint, cl_uint> t("uint"); num_errors = t.run();
+
+	 } else if( strcmp("long", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_long, cl_ulong> t("long"); num_errors = t.run();
+
+	 } else if( strcmp("ulong", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_ulong, cl_ulong> t("ulong"); num_errors = t.run();
+
+	 } else if( strcmp("half", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_half, cl_ushort> t("half"); num_errors = t.run();
+
+	 } else if( strcmp("float", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_float, cl_uint> t("float"); num_errors = t.run();
+
+	 } else if( strcmp("double", argv[1]) == 0 ) {
+
+	   TestShuffle<cl_double, cl_ulong> t("double"); num_errors = t.run();
+
+	 } else {
+
+	     std::cout << "Error: unknown type " << argv[1] << ": use OCL-C types"<<std::endl;
+	     return -1;
+
+	   }
 
 	if( num_errors == 0)
 		std::cout << "OK" << std::endl;
diff --git a/tests/package.m4 b/tests/package.m4
deleted file mode 100644
index c7320e0..0000000
--- a/tests/package.m4
+++ /dev/null
@@ -1,13 +0,0 @@
-# Signature of the current package.
-m4_define([AT_PACKAGE_NAME],
-  [pocl])
-m4_define([AT_PACKAGE_TARNAME],
-  [pocl])
-m4_define([AT_PACKAGE_VERSION],
-  [0.12])
-m4_define([AT_PACKAGE_STRING],
-  [pocl 0.12])
-m4_define([AT_PACKAGE_BUGREPORT],
-  [pocl-devel at lists.sourceforge.net])
-m4_define([AT_PACKAGE_URL],
-  [])
diff --git a/tests/regression/CMakeLists.txt b/tests/regression/CMakeLists.txt
index 3ce631a..5c06a1d 100644
--- a/tests/regression/CMakeLists.txt
+++ b/tests/regression/CMakeLists.txt
@@ -23,26 +23,35 @@
 #
 #=============================================================================
 
+# Mac OS X currently can't digest cl2.hpp, which all reg tests include
+if(NOT APPLE)
+
+set(C_PROGRAMS_TO_BUILD test_assign_loop_variable_to_privvar_makes_it_local
+     test_assign_loop_variable_to_privvar_makes_it_local_2)
+foreach(PROG ${C_PROGRAMS_TO_BUILD})
+  if(MSVC)
+    set_source_files_properties( "${PROG}.c" PROPERTIES LANGUAGE CXX )
+  endif(MSVC)
+  add_executable("${PROG}" "${PROG}.c")
+  target_link_libraries("${PROG}" ${POCLU_LINK_OPTIONS})
+endforeach()
+
+
 set(PROGRAMS_TO_BUILD test_barrier_between_for_loops test_early_return
   test_for_with_var_iteration_count test_id_dependent_computation
   test_locals test_loop_phi_replication test_multi_level_loops_with_barriers
   test_simple_for_with_a_barrier test_structs_as_args test_vectors_as_args
   test_barrier_before_return test_infinite_loop test_constant_array
   test_undominated_variable test_setargs test_null_arg
-  test_fors_with_var_iteration_counts)
+  test_fors_with_var_iteration_counts test_issue_231)
 
-#AM_LDFLAGS = ../../lib/poclu/libpoclu.la @OPENCL_LIBS@
-# POCLU_LINK_OPTIONS
 
-#AM_CXXFLAGS = @OPENCL_CFLAGS@ -Wno-deprecated -Wno-deprecated-declarations
 if (MSVC)
   add_compile_options(${OPENCL_CFLAGS})
 else ()
   add_compile_options("-std=c++11" "-Wno-deprecated" "-Wno-deprecated-declarations" ${OPENCL_CFLAGS})
 endif ()
  
-
-#AM_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_builddir)/include -I$(top_srcdir)/include -I$(top_srcdir)/lib/CL -DSRCDIR='"$(abs_srcdir)"'
 add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 include_directories("${CMAKE_SOURCE_DIR}/lib/CL")
 
@@ -51,16 +60,6 @@ foreach(PROG ${PROGRAMS_TO_BUILD})
   target_link_libraries("${PROG}" ${POCLU_LINK_OPTIONS})
 endforeach()
 
-set(C_PROGRAMS_TO_BUILD test_assign_loop_variable_to_privvar_makes_it_local
-     test_assign_loop_variable_to_privvar_makes_it_local_2)
-foreach(PROG ${C_PROGRAMS_TO_BUILD})
-  if(MSVC)
-    set_source_files_properties( "${PROG}.c" PROPERTIES LANGUAGE CXX )
-  endif(MSVC)
-  add_executable("${PROG}" "${PROG}.c")
-  target_link_libraries("${PROG}" ${POCLU_LINK_OPTIONS})
-endforeach()
-
 
 ######################################################################
 
@@ -68,158 +67,160 @@ endforeach()
 
 # repl
 
-add_test("\"regression/phi nodes not replicated (repl)\"" "test_loop_phi_replication")
+add_test_pocl(NAME "regression/phi_nodes_not_replicated_REPL" COMMAND "test_loop_phi_replication")
 
-add_test("\"regression/issues with local pointers (repl)\"" "test_locals")
+add_test_pocl(NAME "regression/issues_with_local_pointers_REPL" COMMAND "test_locals")
 
-add_test("\"regression/barrier between two for loops (repl)\"" "test_barrier_between_for_loops")
+add_test_pocl(NAME "regression/barrier_between_two_for_loops_REPL" COMMAND "test_barrier_between_for_loops")
 
-add_test("\"regression/simple for-loop with a barrier inside (repl)\"" "test_simple_for_with_a_barrier")
+add_test_pocl(NAME "regression/simple_for-loop_with_a_barrier_inside_REPL" COMMAND "test_simple_for_with_a_barrier")
 
-add_test("\"regression/for-loop with computation after the brexit (repl)\"" "test_multi_level_loops_with_barriers")
+add_test_pocl(NAME "regression/for-loop_with_computation_after_the_brexit_REPL" COMMAND "test_multi_level_loops_with_barriers")
 
-add_test("\"regression/for-loop with a variable iteration count (repl)\"" "test_for_with_var_iteration_count")
+add_test_pocl(NAME "regression/for-loop_with_a_variable_iteration_count_REPL" COMMAND "test_for_with_var_iteration_count")
 
-add_test("\"regression/early return before a barrier region (repl)\"" "test_early_return")
+add_test_pocl(NAME "regression/early_return_before_a_barrier_region_REPL" COMMAND "test_early_return")
 
-add_test("\"regression/id-dependent computation before kernel exit (repl)\"" "test_id_dependent_computation")
+add_test_pocl(NAME "regression/id-dependent_computation_before_kernel_exit_REPL" COMMAND "test_id_dependent_computation")
 
-add_test("\"regression/barrier just before return (repl)\"" "test_barrier_before_return")
+add_test_pocl(NAME "regression/barrier_just_before_return_REPL" COMMAND "test_barrier_before_return")
 
-add_test("\"regression/infinite loop (repl)\"" "test_infinite_loop")
+add_test_pocl(NAME "regression/infinite_loop_REPL" COMMAND "test_infinite_loop")
 
-add_test("\"regression/undominated variable from conditional barrier handling (repl)\"" "test_undominated_variable")
+add_test_pocl(NAME "regression/undominated_variable_from_conditional_barrier_handling_REPL" COMMAND "test_undominated_variable")
 
-add_test("\"regression/assigning a loop iterator variable to a private makes it local (repl)\""
-                           "test_assign_loop_variable_to_privvar_makes_it_local")
+add_test_pocl(NAME "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_REPL"
+              COMMAND "test_assign_loop_variable_to_privvar_makes_it_local")
 
-add_test("\"regression/assigning a loop iterator variable to a private makes it local 2 (repl)\""
-                           "test_assign_loop_variable_to_privvar_makes_it_local_2")
+add_test_pocl(NAME "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_REPL"
+              COMMAND "test_assign_loop_variable_to_privvar_makes_it_local_2")
 
-set_tests_properties("\"regression/phi nodes not replicated (repl)\""
-  "\"regression/issues with local pointers (repl)\""
-  "\"regression/barrier between two for loops (repl)\""
-  "\"regression/simple for-loop with a barrier inside (repl)\""
-  "\"regression/for-loop with computation after the brexit (repl)\""
-  "\"regression/for-loop with a variable iteration count (repl)\""
-  "\"regression/early return before a barrier region (repl)\""
-  "\"regression/id-dependent computation before kernel exit (repl)\""
-  "\"regression/barrier just before return (repl)\""
-  "\"regression/infinite loop (repl)\""
-  "\"regression/undominated variable from conditional barrier handling (repl)\""
-  "\"regression/assigning a loop iterator variable to a private makes it local (repl)\""
-  "\"regression/assigning a loop iterator variable to a private makes it local 2 (repl)\""
+set_tests_properties("regression/phi_nodes_not_replicated_REPL"
+  "regression/issues_with_local_pointers_REPL"
+  "regression/barrier_between_two_for_loops_REPL"
+  "regression/simple_for-loop_with_a_barrier_inside_REPL"
+  "regression/for-loop_with_computation_after_the_brexit_REPL"
+  "regression/for-loop_with_a_variable_iteration_count_REPL"
+  "regression/early_return_before_a_barrier_region_REPL"
+  "regression/id-dependent_computation_before_kernel_exit_REPL"
+  "regression/barrier_just_before_return_REPL"
+  "regression/infinite_loop_REPL"
+  "regression/undominated_variable_from_conditional_barrier_handling_REPL"
+  "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_REPL"
+  "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_REPL"
   PROPERTIES
     ENVIRONMENT "POCL_WORK_GROUP_METHOD=workitemrepl"
     COST 1.5
     PROCESSORS 1
-    DEPENDS "pocl_version_check")
+    DEPENDS "pocl_version_check"
+    LABELS "internal;regression;tce")
 
 
 # loops
 
-add_test("\"regression/phi nodes not replicated (loops)\"" "test_loop_phi_replication")
+add_test_pocl(NAME "regression/phi_nodes_not_replicated_LOOPS" COMMAND "test_loop_phi_replication")
 
-add_test("\"regression/issues with local pointers (loops)\"" "test_locals")
+add_test_pocl(NAME "regression/issues_with_local_pointers_LOOPS" COMMAND "test_locals")
 
-add_test("\"regression/barrier between two for loops (loops)\"" "test_barrier_between_for_loops")
+add_test_pocl(NAME "regression/barrier_between_two_for_loops_LOOPS" COMMAND "test_barrier_between_for_loops")
 
-add_test("\"regression/simple for-loop with a barrier inside (loops)\"" "test_simple_for_with_a_barrier")
+add_test_pocl(NAME "regression/simple_for-loop_with_a_barrier_inside_LOOPS" COMMAND "test_simple_for_with_a_barrier")
 
-add_test("\"regression/for-loop with computation after the brexit (loops)\"" "test_multi_level_loops_with_barriers")
+add_test_pocl(NAME "regression/for-loop_with_computation_after_the_brexit_LOOPS" COMMAND "test_multi_level_loops_with_barriers")
 
-add_test("\"regression/for-loop with a variable iteration count (loops)\"" "test_for_with_var_iteration_count")
+add_test_pocl(NAME "regression/for-loop_with_a_variable_iteration_count_LOOPS" COMMAND "test_for_with_var_iteration_count")
 
-add_test("\"regression/early return before a barrier region (loops)\"" "test_early_return")
+add_test_pocl(NAME "regression/early_return_before_a_barrier_region_LOOPS" COMMAND "test_early_return")
 
-add_test("\"regression/id-dependent computation before kernel exit (loops)\"" "test_id_dependent_computation")
+add_test_pocl(NAME "regression/id-dependent_computation_before_kernel_exit_LOOPS" COMMAND "test_id_dependent_computation")
 
-add_test("\"regression/barrier just before return (loops)\"" "test_barrier_before_return")
+add_test_pocl(NAME "regression/barrier_just_before_return_LOOPS" COMMAND "test_barrier_before_return")
 
-add_test("\"regression/infinite loop (loops)\"" "test_infinite_loop")
+add_test_pocl(NAME "regression/infinite_loop_LOOPS" COMMAND "test_infinite_loop")
 
-add_test("\"regression/undominated variable from conditional barrier handling (loops)\"" "test_undominated_variable")
+add_test_pocl(NAME "regression/undominated_variable_from_conditional_barrier_handling_LOOPS" COMMAND "test_undominated_variable")
 
-add_test("\"regression/assigning a loop iterator variable to a private makes it local (loops)\""
-                           "test_assign_loop_variable_to_privvar_makes_it_local")
+add_test_pocl(NAME "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_LOOPS"
+              COMMAND "test_assign_loop_variable_to_privvar_makes_it_local")
 
-add_test("\"regression/assigning a loop iterator variable to a private makes it local 2 (loops)\""
-                           "test_assign_loop_variable_to_privvar_makes_it_local_2")
+add_test_pocl(NAME "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_LOOPS"
+              COMMAND "test_assign_loop_variable_to_privvar_makes_it_local_2")
 
-set_tests_properties("\"regression/phi nodes not replicated (loops)\""
-  "\"regression/issues with local pointers (loops)\""
-  "\"regression/barrier between two for loops (loops)\""
-  "\"regression/simple for-loop with a barrier inside (loops)\""
-  "\"regression/for-loop with computation after the brexit (loops)\""
-  "\"regression/for-loop with a variable iteration count (loops)\""
-  "\"regression/early return before a barrier region (loops)\""
-  "\"regression/id-dependent computation before kernel exit (loops)\""
-  "\"regression/barrier just before return (loops)\""
-  "\"regression/infinite loop (loops)\""
-  "\"regression/undominated variable from conditional barrier handling (loops)\""
-  "\"regression/assigning a loop iterator variable to a private makes it local (loops)\""
-  "\"regression/assigning a loop iterator variable to a private makes it local 2 (loops)\""
+set_tests_properties("regression/phi_nodes_not_replicated_LOOPS"
+  "regression/issues_with_local_pointers_LOOPS"
+  "regression/barrier_between_two_for_loops_LOOPS"
+  "regression/simple_for-loop_with_a_barrier_inside_LOOPS"
+  "regression/for-loop_with_computation_after_the_brexit_LOOPS"
+  "regression/for-loop_with_a_variable_iteration_count_LOOPS"
+  "regression/early_return_before_a_barrier_region_LOOPS"
+  "regression/id-dependent_computation_before_kernel_exit_LOOPS"
+  "regression/barrier_just_before_return_LOOPS"
+  "regression/infinite_loop_LOOPS"
+  "regression/undominated_variable_from_conditional_barrier_handling_LOOPS"
+  "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_LOOPS"
+  "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_LOOPS"
   PROPERTIES
     ENVIRONMENT "POCL_WORK_GROUP_METHOD=workitemloops"
     COST 1.5
     PROCESSORS 1
-    DEPENDS "pocl_version_check")
+    DEPENDS "pocl_version_check"
+    LABELS "internal;regression;tce")
 
 
 # other
 
-add_test("\"regression/setting a buffer argument to NULL causes a segfault\"" "test_null_arg")
+add_test_pocl(NAME "regression/LoopVectorizer_crash_with_Haswell_and_Broadwell_-_issue_231" COMMAND "test_issue_231")
 
-add_test("\"regression/clSetKernelArg overwriting the previous kernel's args\"" "test_setargs")
+add_test_pocl(NAME "regression/setting_a_buffer_argument_to_NULL_causes_a_segfault" COMMAND "test_null_arg")
 
-add_test("\"regression/passing a constant array as an arg\"" "test_constant_array")
+add_test_pocl(NAME "regression/clSetKernelArg_overwriting_the_previous_kernel's_args" COMMAND "test_setargs")
 
-add_test("\"regression/case with multiple variable length loops and a barrier in one\"" "test_fors_with_var_iteration_counts")
+add_test_pocl(NAME "regression/passing_a_constant_array_as_an_arg" COMMAND "test_constant_array")
 
-#add_test("\"regression/struct kernel arguments\"" "test_structs_as_args")
+add_test_pocl(NAME "regression/case_with_multiple_variable_length_loops_and_a_barrier_in_one" COMMAND "test_fors_with_var_iteration_counts")
 
-#add_test("\"regression/vector kernel arguments\"" "test_vectors_as_args")
+# these 2 will fail
+add_test_pocl(NAME "regression/struct_kernel_arguments" COMMAND "test_structs_as_args")
 
-set_tests_properties("\"regression/setting a buffer argument to NULL causes a segfault\""
-  "\"regression/clSetKernelArg overwriting the previous kernel's args\""
-  "\"regression/passing a constant array as an arg\""
-  "\"regression/case with multiple variable length loops and a barrier in one\""
+add_test_pocl(NAME "regression/vector_kernel_arguments" COMMAND "test_vectors_as_args")
+
+set_tests_properties("regression/setting_a_buffer_argument_to_NULL_causes_a_segfault"
+  "regression/clSetKernelArg_overwriting_the_previous_kernel's_args"
+  "regression/passing_a_constant_array_as_an_arg"
+  "regression/case_with_multiple_variable_length_loops_and_a_barrier_in_one"
+  "regression/LoopVectorizer_crash_with_Haswell_and_Broadwell_-_issue_231"
+  "regression/struct_kernel_arguments" "regression/vector_kernel_arguments"
   PROPERTIES
     COST 1.5
     PROCESSORS 1
-    DEPENDS "pocl_version_check")
+    DEPENDS "pocl_version_check"
+    LABELS "internal;regression;tce")
+
+# The vector/struct kernel arguments are known to be flaky and
+# work by luck sometimes. Disable them for now.
+#if((LLVM_CXXFLAGS MATCHES "_DEBUG") OR (NOT LLVM_CXXFLAGS MATCHES "DNDEBUG"))
+#  set_tests_properties("regression/vector_kernel_arguments"
+#    PROPERTIES  WILL_FAIL 1)
+#endif()
 
-#  "\"regression/struct kernel arguments\""
-#  "\"regression/vector kernel arguments\""
+set_tests_properties("regression/struct_kernel_arguments"
+                      PROPERTIES  WILL_FAIL 1)
 
 ###################################################################
 
 if(POWERPC)
-  set_tests_properties("\"regression/for-loop with a variable iteration count (loops)\""
+  set_tests_properties("regression/for-loop_with_a_variable_iteration_count_LOOPS"
     PROPERTIES  WILL_FAIL 1)
-  if(LLVM_3_2)
-    set_tests_properties("\"regression/vector kernel arguments\""
-      PROPERTIES  WILL_FAIL 1)
-  endif()
 endif()
 
-# The vector/struct kernel arguments are known to be flaky and
-# work by luck sometimes. Disable them for now.
-#
-#if((LLVM_CXXFLAGS MATCHES "_DEBUG") OR (NOT LLVM_CXXFLAGS MATCHES "DNDEBUG"))
-#  set_tests_properties("\"regression/vector kernel arguments\""
-#    PROPERTIES  WILL_FAIL 1)
-#endif()
-
 # TODO infinite loop test: AT_SKIP_IF([ env | grep -q POCL_IMPLICIT_FINISH])
 
-#set_tests_properties("\"regression/struct kernel arguments\""
-#  PROPERTIES  WILL_FAIL 1)
-
 set_tests_properties(
-  "\"regression/assigning a loop iterator variable to a private makes it local 2 (repl)\""
-  "\"regression/assigning a loop iterator variable to a private makes it local 2 (loops)\""
+  "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_REPL"
+  "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_LOOPS"
     PROPERTIES PASS_REGULAR_EXPRESSION
 "changing the value at global_id: 6, local_id 2, group_id 1, to: 3
 value is changed at global_id: 6, local_id 2, group_id 1, to: 3
 ")
+
+endif()
diff --git a/tests/regression/Makefile.in b/tests/regression/Makefile.in
index fe5d465..32e1967 100644
--- a/tests/regression/Makefile.in
+++ b/tests/regression/Makefile.in
@@ -388,6 +388,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -395,6 +396,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -412,8 +414,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -489,6 +489,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/tests/regression/README.txt b/tests/regression/README.txt
new file mode 100644
index 0000000..110bf6c
--- /dev/null
+++ b/tests/regression/README.txt
@@ -0,0 +1,4 @@
+Regression tests for fixed bugs or known broken bugs (XFAIL).
+
+Each test should return EXIT_SUCCESS on success and EXIT_FAILURE on
+failure.
diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt
index 8739edd..be51584 100644
--- a/tests/runtime/CMakeLists.txt
+++ b/tests/runtime/CMakeLists.txt
@@ -53,41 +53,46 @@ endforeach()
 #######################################################################
 
 
-add_test("runtime/clGetDeviceInfo" "test_clGetDeviceInfo")
+add_test_pocl(NAME "runtime/clGetDeviceInfo" COMMAND "test_clGetDeviceInfo")
 
-add_test("runtime/clEnqueueNativeKernel" "test_clEnqueueNativeKernel")
+add_test_pocl(NAME "runtime/clEnqueueNativeKernel" COMMAND "test_clEnqueueNativeKernel")
 
-add_test("runtime/clGetEventInfo" "test_clGetEventInfo")
+add_test_pocl(NAME "runtime/clGetEventInfo" COMMAND "test_clGetEventInfo")
 
-add_test("runtime/clCreateProgramWithBinary" "test_clCreateProgramWithBinary")
+add_test_pocl(NAME "runtime/clCreateProgramWithBinary" COMMAND "test_clCreateProgramWithBinary")
 
-add_test(NAME "runtime/clBuildProgram"
-         WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
-         COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_clBuildProgram")
+add_test_pocl(NAME "runtime/clBuildProgram"
+              WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+              COMMAND "test_clBuildProgram")
 
-add_test(NAME "runtime/test_kernel_cache_includes"
-         WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-         COMMAND "${CMAKE_CURRENT_BINARY_DIR}/test_kernel_cache_includes")
+add_test_pocl(NAME "runtime/test_kernel_cache_includes"
+              WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+              COMMAND "test_kernel_cache_includes")
 
-add_test("runtime/clFinish" "test_clFinish")
+add_test_pocl(NAME "runtime/clFinish" COMMAND "test_clFinish")
 
-add_test("runtime/test_event_cycle" "test_event_cycle")
+add_test_pocl(NAME "runtime/test_event_cycle" COMMAND "test_event_cycle")
 
 # currently fails, see commit 13e5bc89a6b7675efbc
 #add_test("runtime/test_link_error" "test_link_error")
 
-add_test("runtime/test_read-copy-write-buffer" "test_read-copy-write-buffer")
+add_test_pocl(NAME "runtime/test_read-copy-write-buffer" COMMAND "test_read-copy-write-buffer")
 
-add_test("runtime/clCreateKernel" "test_clCreateKernel")
+add_test_pocl(NAME "runtime/clCreateKernel" COMMAND "test_clCreateKernel")
 
-add_test("runtime/clGetKernelArgInfo" "test_clGetKernelArgInfo")
+add_test_pocl(NAME "runtime/clGetKernelArgInfo" COMMAND "test_clGetKernelArgInfo")
 
-add_test_custom("${CMAKE_CURRENT_BINARY_DIR}/test_clSetEventCallback"
-                "runtime/clSetEventCallback" "test_clSetEventCallback_expout.txt" )
+add_test_pocl(NAME "runtime/clSetEventCallback"
+              COMMAND "test_clSetEventCallback"
+              EXPECTED_OUTPUT "test_clSetEventCallback_expout.txt" )
 
-add_test("runtime/clGetSupportedImageFormats" "test_clGetSupportedImageFormats")
+add_test_pocl(NAME "runtime/clGetSupportedImageFormats" COMMAND "test_clGetSupportedImageFormats")
 
-add_test("runtime/clCreateKernelsInProgram" "test_clCreateKernelsInProgram")
+add_test_pocl(NAME "runtime/clCreateKernelsInProgram" COMMAND "test_clCreateKernelsInProgram")
+
+add_test_pocl(NAME "runtime/clCreateSubDevices" COMMAND  "test_clCreateSubDevices")
+
+add_test_pocl(NAME "runtime/test_event_free" COMMAND  "test_event_free")
 
 set_tests_properties( "runtime/clGetDeviceInfo" "runtime/clEnqueueNativeKernel"
   "runtime/clGetEventInfo" "runtime/clCreateProgramWithBinary"
@@ -96,11 +101,12 @@ set_tests_properties( "runtime/clGetDeviceInfo" "runtime/clEnqueueNativeKernel"
   "runtime/clCreateKernel" "runtime/clGetKernelArgInfo"
   "runtime/test_kernel_cache_includes" "runtime/test_event_cycle"
   "runtime/test_read-copy-write-buffer" #"runtime/test_link_error"
+  "runtime/test_event_free" "runtime/clCreateSubDevices"
   PROPERTIES
     COST 2.0
     PROCESSORS 1
-    DEPENDS "pocl_version_check")
-
+    DEPENDS "pocl_version_check"
+    LABELS "internal;runtime")
 
 
 set_tests_properties("runtime/clGetSupportedImageFormats"
@@ -111,12 +117,6 @@ set_tests_properties("runtime/clCreateKernelsInProgram"
   PROPERTIES
     PASS_REGULAR_EXPRESSION "Hello\nWorld")
 
-if(LLVM_3_2)
-  set_tests_properties("runtime/clGetKernelArgInfo"
-    PROPERTIES WILL_FAIL 1)
-endif()
-
-
 
 set_tests_properties("runtime/clFinish"
   PROPERTIES
diff --git a/tests/runtime/Makefile.in b/tests/runtime/Makefile.in
index 6eb8d16..0da4b54 100644
--- a/tests/runtime/Makefile.in
+++ b/tests/runtime/Makefile.in
@@ -331,6 +331,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -338,6 +339,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -355,8 +357,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -432,6 +432,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/tests/runtime/macro_test.cl b/tests/runtime/macro_test.cl
new file mode 100644
index 0000000..e69de29
diff --git a/tests/runtime/test_clBuildProgram.c b/tests/runtime/test_clBuildProgram.c
index 61ba372..52288e1 100644
--- a/tests/runtime/test_clBuildProgram.c
+++ b/tests/runtime/test_clBuildProgram.c
@@ -27,6 +27,7 @@
 #include <string.h>
 #include <CL/cl.h>
 #include <poclu.h>
+#include "config.h"
 #include "pocl_tests.h"
 
 #define MAX_PLATFORMS 32
@@ -185,5 +186,18 @@ main(void){
   err |= clReleaseProgram(program);
   CHECK_OPENCL_ERROR_IN("'init' kernel name test clean-up");
 
+  // macro test
+  char* macro_kernel = poclu_read_file(SRCDIR "/tests/runtime/test_clBuildProgram_macros.cl" );
+  size_t s = strlen(macro_kernel);
+  program = clCreateProgramWithSource(context, 1, (const char**)&macro_kernel,
+                                      &s, &err);
+  CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource");
+
+  err = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL);
+  TEST_ASSERT(err == CL_SUCCESS);
+
+  err = clReleaseProgram(program);
+  CHECK_OPENCL_ERROR_IN("clReleaseProgram");
+
   return EXIT_SUCCESS;
 }
diff --git a/tests/runtime/test_clBuildProgram_macros.cl b/tests/runtime/test_clBuildProgram_macros.cl
new file mode 100644
index 0000000..c7aac1e
--- /dev/null
+++ b/tests/runtime/test_clBuildProgram_macros.cl
@@ -0,0 +1,15 @@
+#ifdef cl_khr_fp64
+#  pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#else
+#  error "cl_khr_fp64 macro undefined"
+#endif
+
+#ifdef cl_khr_global_int32_base_atomics
+#  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : disable
+#else
+#  error "cl_khr_global_int32_base_atomics macro undefined"
+#endif
+
+__kernel void kernel_1() {
+  printf("Hello World\n");
+}
diff --git a/tests/runtime/test_version.c b/tests/runtime/test_version.c
index 4c1a81f..a7a207b 100644
--- a/tests/runtime/test_version.c
+++ b/tests/runtime/test_version.c
@@ -31,7 +31,8 @@ int main(void)
 	if( rv != CL_SUCCESS )
 		return 1;
 	result[rvs]=0;	// spec doesn't say it is null-terminated.
-	if( strcmp( result, "OpenCL 1.2 pocl " PACKAGE_VERSION) != 0 ) {
+	if( strcmp( result, 
+	            "OpenCL " POCL_CL_VERSION " pocl " PACKAGE_VERSION ", LLVM " LLVM_VERSION) != 0 ) {
 		printf("Error: platform is: %s\n", result);
 		return 2;
 	}
diff --git a/lib/CL/devices/cellspu/CMakeLists.txt b/tests/tce/CMakeLists.txt
similarity index 81%
rename from lib/CL/devices/cellspu/CMakeLists.txt
rename to tests/tce/CMakeLists.txt
index 1524daf..8cca155 100644
--- a/lib/CL/devices/cellspu/CMakeLists.txt
+++ b/tests/tce/CMakeLists.txt
@@ -1,7 +1,7 @@
 #=============================================================================
 #   CMake build system files
 #
-#   Copyright (c) 2014 pocl developers
+#   Copyright (c) 2016 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -23,8 +23,12 @@
 #
 #=============================================================================
 
-if(MSVC)
-  set_source_files_properties( cellspu.h cellspu.c PROPERTIES LANGUAGE CXX )
-endif(MSVC)
-add_library("pocl-devices-cellspu" OBJECT cellspu.h cellspu.c)
-set(POCL_DEVICES_OBJS "${POCL_DEVICES_OBJS};$<TARGET_OBJECTS:pocl-devices-cellspu>" PARENT_SCOPE)
+add_compile_options(${TCE_INCLUDES})
+add_compile_options(${TCE_CXXFLAGS})
+
+add_subdirectory("ttasim")
+add_subdirectory("fp16")
+
+if(ENABLE_TCEMC)
+  add_subdirectory("tcemc")
+endif()
diff --git a/tests/tce/Makefile.in b/tests/tce/Makefile.in
index 90efc7c..67b45a5 100644
--- a/tests/tce/Makefile.in
+++ b/tests/tce/Makefile.in
@@ -253,6 +253,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -260,6 +261,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -277,8 +279,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -354,6 +354,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/example1/CMakeLists.txt b/tests/tce/fp16/CMakeLists.txt
similarity index 53%
copy from examples/example1/CMakeLists.txt
copy to tests/tce/fp16/CMakeLists.txt
index 1de70ef..ed5ccb6 100644
--- a/examples/example1/CMakeLists.txt
+++ b/tests/tce/fp16/CMakeLists.txt
@@ -1,7 +1,7 @@
 #=============================================================================
 #   CMake build system files
 #
-#   Copyright (c) 2014 pocl developers
+#   Copyright (c) 2016 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -23,31 +23,38 @@
 #
 #=============================================================================
 
-#AM_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -DSRCDIR='"$(abs_srcdir)"'
-add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
+add_executable("fp16_host" host.cpp)
+
+target_link_libraries("fp16_host" ${POCLU_LINK_OPTIONS} ${LD_FLAGS_BIN})
 
-# example1_CFLAGS = @OPENCL_CFLAGS@
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${OPENCL_CFLAGS}")
-add_compile_options(${OPENCL_CFLAGS})
+add_compile_options( -Wno-deprecated -Wno-deprecated-declarations)
+
+add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 
-if (MSVC)
-  set_source_files_properties( example1.c example1_exec.c PROPERTIES LANGUAGE CXX )
-endif(MSVC)
-add_executable("example1" example1.c example1_exec.c example1.cl)
+#add_custom_test("fp16_host" "tce/fp16/repl" "expected_out.txt" run)
+add_test_pocl(NAME "tce/fp16/repl" COMMAND "fp16_host" run)
 
-# example1_LDADD = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
-target_link_libraries("example1" ${POCLU_LINK_OPTIONS})
+#AT_SETUP([Half-precision floats on ttasim (repl)])
+set_tests_properties( "tce/fp16/repl"
+  PROPERTIES
+    COST 40.0
+    PROCESSORS 1
+    PASS_REGULAR_EXPRESSION "PING23456.000000 2000001OK"
+    LABELS "tce;tta;ttasim;half"
+    WILL_FAIL 1
+    ENVIRONMENT "POCL_DEVICES=ttasim;POCL_TTASIM0_PARAMETERS=${CMAKE_SOURCE_DIR}/tools/data/test_machine.adf;POCL_WORK_GROUP_METHOD=repl"
+    DEPENDS "pocl_version_check")
 
-add_test("spec_tests/example1_dot_product" "example1")
+#add_custom_test("fp16_host" "tce/fp16/loopvec" "expected_out.txt" run)
+add_test_pocl(NAME "tce/fp16/loopvec" COMMAND "fp16_host" run)
 
-set_tests_properties( "spec_tests/example1_dot_product"
+# AT_SETUP([Half-precision floats on ttasim (loopvec)])
+set_tests_properties( "tce/fp16/loopvec"
   PROPERTIES
     COST 40.0
-    PASS_REGULAR_EXPRESSION "[(]0[.]000000, 0[.]000000, 0[.]000000, 0[.]000000[)] [.] [(]0[.]000000, 0[.]000000, 0[.]000000, 0[.]000000[)] = 0[.]000000
-[(]1[.]000000, 1[.]000000, 1[.]000000, 1[.]000000[)] [.] [(]1[.]000000, 1[.]000000, 1[.]000000, 1[.]000000[)] = 4[.]000000
-[(]2[.]000000, 2[.]000000, 2[.]000000, 2[.]000000[)] [.] [(]2[.]000000, 2[.]000000, 2[.]000000, 2[.]000000[)] = 16[.]000000
-[(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] [.] [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] = 36[.]000000
-OK"
     PROCESSORS 1
-    LABELS "OpenCL_Spec"
+    PASS_REGULAR_EXPRESSION "PING23456.000000 2000001OK"
+    LABELS "tce;tta;ttasim;half"
+    WILL_FAIL 1
+    ENVIRONMENT "POCL_DEVICES=ttasim;POCL_TTASIM0_PARAMETERS=${CMAKE_SOURCE_DIR}/tools/data/test_machine.adf;POCL_WORK_GROUP_METHOD=loopvec"
     DEPENDS "pocl_version_check")
diff --git a/tests/tce/fp16/Makefile.in b/tests/tce/fp16/Makefile.in
index 1155616..1b3c28e 100644
--- a/tests/tce/fp16/Makefile.in
+++ b/tests/tce/fp16/Makefile.in
@@ -245,6 +245,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -252,6 +253,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -269,8 +271,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -346,6 +346,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/tests/tce/fp16/expected_out.txt b/tests/tce/fp16/expected_out.txt
new file mode 100644
index 0000000..d7a772c
--- /dev/null
+++ b/tests/tce/fp16/expected_out.txt
@@ -0,0 +1,17 @@
+through conversion: 42
+2.500000
+2.500000
+2.500000
+2.500000
+2.500000
+2.500000
+2.500000
+2.500000
+32.000000
+32.000000
+32.000000
+32.000000
+32.000000
+32.000000
+32.000000
+32.000000
diff --git a/examples/example2/CMakeLists.txt b/tests/tce/tcemc/CMakeLists.txt
similarity index 66%
copy from examples/example2/CMakeLists.txt
copy to tests/tce/tcemc/CMakeLists.txt
index 19c55d7..2cb0687 100644
--- a/examples/example2/CMakeLists.txt
+++ b/tests/tce/tcemc/CMakeLists.txt
@@ -1,7 +1,7 @@
 #=============================================================================
 #   CMake build system files
 #
-#   Copyright (c) 2014 pocl developers
+#   Copyright (c) 2016 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -23,28 +23,22 @@
 #
 #=============================================================================
 
-#AM_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -DSRCDIR='"$(abs_srcdir)"'
-add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
-
-# example1_CFLAGS = @OPENCL_CFLAGS@
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${OPENCL_CFLAGS}")
-add_compile_options(${OPENCL_CFLAGS})
+add_executable("tcemc_host" host.cpp)
 
+target_link_libraries("tcemc_host" ${POCLU_LINK_OPTIONS} ${LD_FLAGS_BIN})
 
-if (MSVC)
-  set_source_files_properties( example2.c PROPERTIES LANGUAGE CXX )
-endif(MSVC)
-add_executable("example2" example2.c example2.cl)
+add_compile_options( -Wno-deprecated -Wno-deprecated-declarations)
 
-# example1_LDADD = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
-target_link_libraries("example2" ${POCLU_LINK_OPTIONS})
+add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 
-add_test("spec_tests/example2_matrix_transpose" "example2")
+add_test_pocl(NAME "tce/tcemc" COMMAND "tcemc_host" run)
 
-set_tests_properties( "spec_tests/example2_matrix_transpose"
+#AT_SETUP([A basic TCEMC test])
+set_tests_properties( "tce/tcemc"
   PROPERTIES
-    COST 3.0
-    PASS_REGULAR_EXPRESSION "OK\n"
+    COST 40.0
     PROCESSORS 1
-    LABELS "OpenCL_Spec"
+    PASS_REGULAR_EXPRESSION "PING23456.000000 2000001OK"
+    LABELS "tce;tta;ttasim;tcemc"
+    ENVIRONMENT "POCL_DEVICES=ttasim;POCL_TTASIM0_PARAMETERS=${CMAKE_SOURCE_DIR}/tools/data/test_machine.adf"
     DEPENDS "pocl_version_check")
diff --git a/tests/tce/tcemc/Makefile.in b/tests/tce/tcemc/Makefile.in
index 797a4d6..4b33d36 100644
--- a/tests/tce/tcemc/Makefile.in
+++ b/tests/tce/tcemc/Makefile.in
@@ -245,6 +245,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -252,6 +253,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -269,8 +271,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -346,6 +346,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/examples/standalone/CMakeLists.txt b/tests/tce/ttasim/CMakeLists.txt
similarity index 66%
copy from examples/standalone/CMakeLists.txt
copy to tests/tce/ttasim/CMakeLists.txt
index 42d30d8..329b20e 100644
--- a/examples/standalone/CMakeLists.txt
+++ b/tests/tce/ttasim/CMakeLists.txt
@@ -1,7 +1,7 @@
 #=============================================================================
 #   CMake build system files
 #
-#   Copyright (c) 2014 pocl developers
+#   Copyright (c) 2016 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -23,18 +23,21 @@
 #
 #=============================================================================
 
-#EXTRA_DIST = standalone.cl
+add_executable("ttasim_host" host.cpp)
 
-#noinst_DATA = $(EXTRA_DIST:.cl=.bc)
+target_link_libraries("ttasim_host" ${POCLU_LINK_OPTIONS} ${LD_FLAGS_BIN})
 
-#.cl.bc:
-# ../../scripts/pocl-standalone -h $(@:.bc=.h) -o $@ $<
+add_compile_options( -Wno-deprecated -Wno-deprecated-declarations)
 
-#clean-local:
-# rm -f $(EXTRA_DIST:.cl=.bc) $(EXTRA_DIST:.cl=.h)
+add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 
-add_test("pocl-standalone" "/bin/sh" "${CMAKE_BINARY_DIR}/scripts/pocl-standalone"
-          -h "${CMAKE_BINARY_DIR}/standalone.h"
-          -o "${CMAKE_BINARY_DIR}/standalone.bc"
-          "${CMAKE_CURRENT_SOURCE_DIR}/standalone.cl")
+add_test_pocl(NAME "tce/ttasim" COMMAND "ttasim_host")
 
+set_tests_properties( "tce/ttasim"
+  PROPERTIES
+    COST 40.0
+    PROCESSORS 1
+    PASS_REGULAR_EXPRESSION "PING23456.000000 2000001OK"
+    LABELS "tce;tta;ttasim"
+    ENVIRONMENT "POCL_DEVICES=ttasim;POCL_TTASIM0_PARAMETERS=${CMAKE_SOURCE_DIR}/tools/data/test_machine.adf"
+    DEPENDS "pocl_version_check")
diff --git a/tests/tce/ttasim/Makefile.in b/tests/tce/ttasim/Makefile.in
index ec13ce2..bfd3aab 100644
--- a/tests/tce/ttasim/Makefile.in
+++ b/tests/tce/ttasim/Makefile.in
@@ -244,6 +244,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -251,6 +252,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -268,8 +270,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -345,6 +345,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/tests/testsuite b/tests/testsuite
deleted file mode 100755
index 78d61f1..0000000
--- a/tests/testsuite
+++ /dev/null
@@ -1,16385 +0,0 @@
-#! /bin/sh
-# Generated from testsuite.at by GNU Autoconf 2.69.
-#
-# Copyright (C) 2009-2012 Free Software Foundation, Inc.
-#
-# This test suite is free software; the Free Software Foundation gives
-# unlimited permission to copy, distribute and modify it.
-## -------------------- ##
-## M4sh Initialization. ##
-## -------------------- ##
-
-# Be more Bourne compatible
-DUALCASE=1; export DUALCASE # for MKS sh
-if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
-  emulate sh
-  NULLCMD=:
-  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
-  # is contrary to our usage.  Disable this feature.
-  alias -g '${1+"$@"}'='"$@"'
-  setopt NO_GLOB_SUBST
-else
-  case `(set -o) 2>/dev/null` in #(
-  *posix*) :
-    set -o posix ;; #(
-  *) :
-     ;;
-esac
-fi
-
-
-as_nl='
-'
-export as_nl
-# Printing a long string crashes Solaris 7 /usr/bin/printf.
-as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
-as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
-as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
-# Prefer a ksh shell builtin over an external printf program on Solaris,
-# but without wasting forks for bash or zsh.
-if test -z "$BASH_VERSION$ZSH_VERSION" \
-    && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
-  as_echo='print -r --'
-  as_echo_n='print -rn --'
-elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
-  as_echo='printf %s\n'
-  as_echo_n='printf %s'
-else
-  if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
-    as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
-    as_echo_n='/usr/ucb/echo -n'
-  else
-    as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
-    as_echo_n_body='eval
-      arg=$1;
-      case $arg in #(
-      *"$as_nl"*)
-	expr "X$arg" : "X\\(.*\\)$as_nl";
-	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
-      esac;
-      expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
-    '
-    export as_echo_n_body
-    as_echo_n='sh -c $as_echo_n_body as_echo'
-  fi
-  export as_echo_body
-  as_echo='sh -c $as_echo_body as_echo'
-fi
-
-# The user is always right.
-if test "${PATH_SEPARATOR+set}" != set; then
-  PATH_SEPARATOR=:
-  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
-    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
-      PATH_SEPARATOR=';'
-  }
-fi
-
-
-# IFS
-# We need space, tab and new line, in precisely that order.  Quoting is
-# there to prevent editors from complaining about space-tab.
-# (If _AS_PATH_WALK were called with IFS unset, it would disable word
-# splitting by setting IFS to empty value.)
-IFS=" ""	$as_nl"
-
-# Find who we are.  Look in the path if we contain no directory separator.
-as_myself=
-case $0 in #((
-  *[\\/]* ) as_myself=$0 ;;
-  *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
-  done
-IFS=$as_save_IFS
-
-     ;;
-esac
-# We did not find ourselves, most probably we were run as `sh COMMAND'
-# in which case we are not to be found in the path.
-if test "x$as_myself" = x; then
-  as_myself=$0
-fi
-if test ! -f "$as_myself"; then
-  $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
-  exit 1
-fi
-
-# Unset variables that we do not need and which cause bugs (e.g. in
-# pre-3.0 UWIN ksh).  But do not cause bugs in bash 2.01; the "|| exit 1"
-# suppresses any "Segmentation fault" message there.  '((' could
-# trigger a bug in pdksh 5.2.14.
-for as_var in BASH_ENV ENV MAIL MAILPATH
-do eval test x\${$as_var+set} = xset \
-  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
-done
-PS1='$ '
-PS2='> '
-PS4='+ '
-
-# NLS nuisances.
-LC_ALL=C
-export LC_ALL
-LANGUAGE=C
-export LANGUAGE
-
-# CDPATH.
-(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
-
-if test "x$CONFIG_SHELL" = x; then
-  as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then :
-  emulate sh
-  NULLCMD=:
-  # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which
-  # is contrary to our usage.  Disable this feature.
-  alias -g '\${1+\"\$@\"}'='\"\$@\"'
-  setopt NO_GLOB_SUBST
-else
-  case \`(set -o) 2>/dev/null\` in #(
-  *posix*) :
-    set -o posix ;; #(
-  *) :
-     ;;
-esac
-fi
-"
-  as_required="as_fn_return () { (exit \$1); }
-as_fn_success () { as_fn_return 0; }
-as_fn_failure () { as_fn_return 1; }
-as_fn_ret_success () { return 0; }
-as_fn_ret_failure () { return 1; }
-
-exitcode=0
-as_fn_success || { exitcode=1; echo as_fn_success failed.; }
-as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; }
-as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; }
-as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; }
-if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then :
-
-else
-  exitcode=1; echo positional parameters were not saved.
-fi
-test x\$exitcode = x0 || exit 1
-test -x / || exit 1"
-  as_suggested="  as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO
-  as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
-  eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
-  test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1
-test \$(( 1 + 1 )) = 2 || exit 1"
-  if (eval "$as_required") 2>/dev/null; then :
-  as_have_required=yes
-else
-  as_have_required=no
-fi
-  if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then :
-
-else
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-as_found=false
-for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  as_found=:
-  case $as_dir in #(
-	 /*)
-	   for as_base in sh bash ksh sh5; do
-	     # Try only shells that exist, to save several forks.
-	     as_shell=$as_dir/$as_base
-	     if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
-		    { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then :
-  CONFIG_SHELL=$as_shell as_have_required=yes
-		   if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then :
-  break 2
-fi
-fi
-	   done;;
-       esac
-  as_found=false
-done
-$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } &&
-	      { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then :
-  CONFIG_SHELL=$SHELL as_have_required=yes
-fi; }
-IFS=$as_save_IFS
-
-
-      if test "x$CONFIG_SHELL" != x; then :
-  export CONFIG_SHELL
-             # We cannot yet assume a decent shell, so we have to provide a
-# neutralization value for shells without unset; and this also
-# works around shells that cannot unset nonexistent variables.
-# Preserve -v and -x to the replacement shell.
-BASH_ENV=/dev/null
-ENV=/dev/null
-(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
-case $- in # ((((
-  *v*x* | *x*v* ) as_opts=-vx ;;
-  *v* ) as_opts=-v ;;
-  *x* ) as_opts=-x ;;
-  * ) as_opts= ;;
-esac
-exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
-# Admittedly, this is quite paranoid, since all the known shells bail
-# out after a failed `exec'.
-$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
-exit 255
-fi
-
-    if test x$as_have_required = xno; then :
-  $as_echo "$0: This script requires a shell more modern than all"
-  $as_echo "$0: the shells that I found on your system."
-  if test x${ZSH_VERSION+set} = xset ; then
-    $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should"
-    $as_echo "$0: be upgraded to zsh 4.3.4 or later."
-  else
-    $as_echo "$0: Please tell bug-autoconf at gnu.org about your system,
-$0: including any error possibly output before this
-$0: message. Then install a modern shell, or manually run
-$0: the script under such a shell if you do have one."
-  fi
-  exit 1
-fi
-fi
-fi
-SHELL=${CONFIG_SHELL-/bin/sh}
-export SHELL
-# Unset more variables known to interfere with behavior of common tools.
-CLICOLOR_FORCE= GREP_OPTIONS=
-unset CLICOLOR_FORCE GREP_OPTIONS
-
-## --------------------- ##
-## M4sh Shell Functions. ##
-## --------------------- ##
-# as_fn_unset VAR
-# ---------------
-# Portably unset VAR.
-as_fn_unset ()
-{
-  { eval $1=; unset $1;}
-}
-as_unset=as_fn_unset
-
-# as_fn_set_status STATUS
-# -----------------------
-# Set $? to STATUS, without forking.
-as_fn_set_status ()
-{
-  return $1
-} # as_fn_set_status
-
-# as_fn_exit STATUS
-# -----------------
-# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
-as_fn_exit ()
-{
-  set +e
-  as_fn_set_status $1
-  exit $1
-} # as_fn_exit
-
-# as_fn_mkdir_p
-# -------------
-# Create "$as_dir" as a directory, including parents if necessary.
-as_fn_mkdir_p ()
-{
-
-  case $as_dir in #(
-  -*) as_dir=./$as_dir;;
-  esac
-  test -d "$as_dir" || eval $as_mkdir_p || {
-    as_dirs=
-    while :; do
-      case $as_dir in #(
-      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
-      *) as_qdir=$as_dir;;
-      esac
-      as_dirs="'$as_qdir' $as_dirs"
-      as_dir=`$as_dirname -- "$as_dir" ||
-$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
-	 X"$as_dir" : 'X\(//\)[^/]' \| \
-	 X"$as_dir" : 'X\(//\)$' \| \
-	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X"$as_dir" |
-    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)[^/].*/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\).*/{
-	    s//\1/
-	    q
-	  }
-	  s/.*/./; q'`
-      test -d "$as_dir" && break
-    done
-    test -z "$as_dirs" || eval "mkdir $as_dirs"
-  } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
-
-
-} # as_fn_mkdir_p
-
-# as_fn_executable_p FILE
-# -----------------------
-# Test if FILE is an executable regular file.
-as_fn_executable_p ()
-{
-  test -f "$1" && test -x "$1"
-} # as_fn_executable_p
-# as_fn_append VAR VALUE
-# ----------------------
-# Append the text in VALUE to the end of the definition contained in VAR. Take
-# advantage of any shell optimizations that allow amortized linear growth over
-# repeated appends, instead of the typical quadratic growth present in naive
-# implementations.
-if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
-  eval 'as_fn_append ()
-  {
-    eval $1+=\$2
-  }'
-else
-  as_fn_append ()
-  {
-    eval $1=\$$1\$2
-  }
-fi # as_fn_append
-
-# as_fn_arith ARG...
-# ------------------
-# Perform arithmetic evaluation on the ARGs, and store the result in the
-# global $as_val. Take advantage of shells that can avoid forks. The arguments
-# must be portable across $(()) and expr.
-if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
-  eval 'as_fn_arith ()
-  {
-    as_val=$(( $* ))
-  }'
-else
-  as_fn_arith ()
-  {
-    as_val=`expr "$@" || test $? -eq 1`
-  }
-fi # as_fn_arith
-
-
-# as_fn_error STATUS ERROR [LINENO LOG_FD]
-# ----------------------------------------
-# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
-# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
-# script with STATUS, using 1 if that was 0.
-as_fn_error ()
-{
-  as_status=$1; test $as_status -eq 0 && as_status=1
-  if test "$4"; then
-    as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-    $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
-  fi
-  $as_echo "$as_me: error: $2" >&2
-  as_fn_exit $as_status
-} # as_fn_error
-
-if expr a : '\(a\)' >/dev/null 2>&1 &&
-   test "X`expr 00001 : '.*\(...\)'`" = X001; then
-  as_expr=expr
-else
-  as_expr=false
-fi
-
-if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
-  as_basename=basename
-else
-  as_basename=false
-fi
-
-as_me=`$as_basename -- "$0" ||
-$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
-	 X"$0" : 'X\(//\)$' \| \
-	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X/"$0" |
-    sed '/^.*\/\([^/][^/]*\)\/*$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\/\(\/\/\)$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\/\(\/\).*/{
-	    s//\1/
-	    q
-	  }
-	  s/.*/./; q'`
-
-if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
-  as_dirname=dirname
-else
-  as_dirname=false
-fi
-
-# Avoid depending upon Character Ranges.
-as_cr_letters='abcdefghijklmnopqrstuvwxyz'
-as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-as_cr_Letters=$as_cr_letters$as_cr_LETTERS
-as_cr_digits='0123456789'
-as_cr_alnum=$as_cr_Letters$as_cr_digits
-
-
-  as_lineno_1=$LINENO as_lineno_1a=$LINENO
-  as_lineno_2=$LINENO as_lineno_2a=$LINENO
-  eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" &&
-  test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || {
-  # Blame Lee E. McMahon (1931-1989) for sed's syntax.  :-)
-  sed -n '
-    p
-    /[$]LINENO/=
-  ' <$as_myself |
-    sed '
-      s/[$]LINENO.*/&-/
-      t lineno
-      b
-      :lineno
-      N
-      :loop
-      s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
-      t loop
-      s/-\n.*//
-    ' >$as_me.lineno &&
-  chmod +x "$as_me.lineno" ||
-    { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
-
-  # If we had to re-execute with $CONFIG_SHELL, we're ensured to have
-  # already done that, so ensure we don't try to do so again and fall
-  # in an infinite loop.  This has already happened in practice.
-  _as_can_reexec=no; export _as_can_reexec
-  # Don't try to exec as it changes $[0], causing all sort of problems
-  # (the dirname of $[0] is not the place where we might find the
-  # original and so on.  Autoconf is especially sensitive to this).
-  . "./$as_me.lineno"
-  # Exit status is that of the last command.
-  exit
-}
-
-ECHO_C= ECHO_N= ECHO_T=
-case `echo -n x` in #(((((
--n*)
-  case `echo 'xy\c'` in
-  *c*) ECHO_T='	';;	# ECHO_T is single tab character.
-  xy)  ECHO_C='\c';;
-  *)   echo `echo ksh88 bug on AIX 6.1` > /dev/null
-       ECHO_T='	';;
-  esac;;
-*)
-  ECHO_N='-n';;
-esac
-
-rm -f conf$$ conf$$.exe conf$$.file
-if test -d conf$$.dir; then
-  rm -f conf$$.dir/conf$$.file
-else
-  rm -f conf$$.dir
-  mkdir conf$$.dir 2>/dev/null
-fi
-if (echo >conf$$.file) 2>/dev/null; then
-  if ln -s conf$$.file conf$$ 2>/dev/null; then
-    as_ln_s='ln -s'
-    # ... but there are two gotchas:
-    # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
-    # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
-    # In both cases, we have to default to `cp -pR'.
-    ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
-      as_ln_s='cp -pR'
-  elif ln conf$$.file conf$$ 2>/dev/null; then
-    as_ln_s=ln
-  else
-    as_ln_s='cp -pR'
-  fi
-else
-  as_ln_s='cp -pR'
-fi
-rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
-rmdir conf$$.dir 2>/dev/null
-
-if mkdir -p . 2>/dev/null; then
-  as_mkdir_p='mkdir -p "$as_dir"'
-else
-  test -d ./-p && rmdir ./-p
-  as_mkdir_p=false
-fi
-
-as_test_x='test -x'
-as_executable_p=as_fn_executable_p
-
-# Sed expression to map a string onto a valid CPP name.
-as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
-
-# Sed expression to map a string onto a valid variable name.
-as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
-
-
-
-
-
-SHELL=${CONFIG_SHELL-/bin/sh}
-
-# How were we run?
-at_cli_args="$@"
-
-
-# Not all shells have the 'times' builtin; the subshell is needed to make
-# sure we discard the 'times: not found' message from the shell.
-at_times_p=false
-(times) >/dev/null 2>&1 && at_times_p=:
-
-# CLI Arguments to pass to the debugging scripts.
-at_debug_args=
-# -e sets to true
-at_errexit_p=false
-# Shall we be verbose?  ':' means no, empty means yes.
-at_verbose=:
-at_quiet=
-# Running several jobs in parallel, 0 means as many as test groups.
-at_jobs=1
-at_traceon=:
-at_trace_echo=:
-at_check_filter_trace=:
-
-# Shall we keep the debug scripts?  Must be `:' when the suite is
-# run by a debug script, so that the script doesn't remove itself.
-at_debug_p=false
-# Display help message?
-at_help_p=false
-# Display the version message?
-at_version_p=false
-# List test groups?
-at_list_p=false
-# --clean
-at_clean=false
-# Test groups to run
-at_groups=
-# Whether to rerun failed tests.
-at_recheck=
-# Whether a write failure occurred
-at_write_fail=0
-
-# The directory we run the suite in.  Default to . if no -C option.
-at_dir=`pwd`
-# An absolute reference to this testsuite script.
-case $as_myself in
-  [\\/]* | ?:[\\/]* ) at_myself=$as_myself ;;
-  * ) at_myself=$at_dir/$as_myself ;;
-esac
-# Whether -C is in effect.
-at_change_dir=false
-
-# Whether to enable colored test results.
-at_color=auto
-# List of the tested programs.
-at_tested=''
-# As many question marks as there are digits in the last test group number.
-# Used to normalize the test group numbers so that `ls' lists them in
-# numerical order.
-at_format='???'
-# Description of all the test groups.
-at_help_all="1;testsuite.at:29;check for pocl version;;
-2;testsuite.at:37;example1: dot product;tce hsa;
-3;testsuite.at:44;example1: dot product (SPIR64);spir;
-4;testsuite.at:54;example1: dot product (SPIR32);spir;
-5;testsuite.at:64;example2: matrix transpose;tce hsa;
-6;testsuite.at:70;example2a: matrix transpose (automatic locals);tce hsa;
-7;testsuite.at:78;Kernel functions convert_char*;short16;
-8;testsuite.at:89;Kernel functions printf;printf;
-9;testsuite.at:99;Kernel functions as_type;conversion;
-10;testsuite.at:109;Kernel functions convert_type - scalars;conversion;
-11;testsuite.at:118;Kernel functions convert_type - vector of 2;conversion;
-12;testsuite.at:127;Kernel functions convert_type - vector of 4;conversion;
-13;testsuite.at:136;Kernel functions convert_type - vector of 8;conversion;
-14;testsuite.at:145;Kernel functions convert_type - vector of 16;conversion;
-15;testsuite.at:156;Kernel functions min and max when the operands are of different sign;min_max;
-16;testsuite.at:167;Kernel functions length, distance, and normalize;length_distance;
-17;testsuite.at:177;Kernel functions fmin, fmax, fma;fmin_fmax_fma;
-18;testsuite.at:187;Kernel functions frexp modf;frexp_modf;
-19;testsuite.at:200;A saturating conversion from long to uint;convert_sat_regression;
-20;testsuite.at:210;Kernel functions abs bitselect clz max min popcount;;
-21;testsuite.at:221;Kernel functions fabs signbit isfinite isinf isnan isnormal copysign ilogb ldexp;;
-22;testsuite.at:234;Kernel functions abs abs_diff add_sat hadd mad_hi mad_sat mul_hi rhadd sub_sat (loopvec);;
-23;testsuite.at:248;Kernel functions abs abs_diff add_sat hadd mad_hi mad_sat mul_hi rhadd sub_sat (loops);;
-24;testsuite.at:257;Kernel functions << >> rotate;rotate;
-25;testsuite.at:275;Trigonometric functions;;
-26;testsuite.at:282;Sampler address clamp;;
-27;testsuite.at:290;Image query functions;;
-28;testsuite.at:298;Kernel functions: shuffle charN;shuffle long;
-29;testsuite.at:306;Kernel functions: shuffle shortN;shuffle long;
-30;testsuite.at:314;Kernel functions: shuffle ushortN;shuffle long;
-31;testsuite.at:322;Kernel functions: shuffle halfN;shuffle long;
-32;testsuite.at:333;Kernel functions: shuffle intN;shuffle long;
-33;testsuite.at:341;Kernel functions: shuffle uintN;shuffle long;
-34;testsuite.at:349;Kernel functions: shuffle floatN;shuffle long;
-35;testsuite.at:357;Kernel functions: shuffle longN;shuffle long;
-36;testsuite.at:365;Kernel functions: shuffle ulongN;shuffle long;
-37;testsuite.at:373;Kernel functions: shuffle doubleN;shuffle long;
-38;testsuite.at:383;Scalar wave equation;scalarwave;
-39;testsuite-workgroup.at:28;unconditional barriers (full replication);workgroup;
-40;testsuite-workgroup.at:35;unconditional barriers (loops);workgroup;
-41;testsuite-workgroup.at:42;unbarriered for loops (full replication);workgroup;
-42;testsuite-workgroup.at:49;unbarriered for loops (loops);workgroup;
-43;testsuite-workgroup.at:56;barriered for loops (full replication);workgroup;
-44;testsuite-workgroup.at:63;barriered for loops (loops);workgroup;
-45;testsuite-workgroup.at:70;conditional barrier (full replication);condbar workgroup;
-46;testsuite-workgroup.at:77;conditional barrier (loops);condbar workgroup;
-47;testsuite-workgroup.at:84;b-loop with none of the WIs reaching the barrier (full replication);b-loop workgroup;
-48;testsuite-workgroup.at:91;b-loop with none of the WIs reaching the barrier (loops);b-loop workgroup;
-49;testsuite-workgroup.at:98;forcing horizontal parallelization to some outer loops (repl);workgroup outerlooppar;
-50;testsuite-workgroup.at:105;forcing horizontal parallelization to some outer loops (loops);workgroup outerlooppar;
-51;testsuite-workgroup.at:112;different implicit barrier injection scenarios (loops);workgroup implicit;
-52;testsuite-workgroup.at:119;loop with two paths to the latch (full replication);twolatchpaths workgroup;
-53;testsuite-workgroup.at:126;loop with two paths to the latch (loops);twolatchpaths workgroup;
-54;testsuite-workgroup.at:133;b-loop with two latches (full replication);twolatches workgroup;
-55;testsuite-workgroup.at:140;b-loop with two latches (loops);twolatches workgroup;
-56;testsuite-workgroup.at:147;workgroup_sizes: work-items get wrong ids (full replication);id workgroup;
-57;testsuite-workgroup.at:155;workgroup_sizes: work-items get wrong ids (loop);id workgroup;
-58;testsuite-regression.at:4;phi nodes not replicated (repl) - lp:927573;regression;
-59;testsuite-regression.at:10;phi nodes not replicated (loops) - lp:927573;regression;
-60;testsuite-regression.at:16;issues with local pointers (repl) - lp:918801;regression locals tce;
-61;testsuite-regression.at:22;issues with local pointers (loops) - lp:918801;regression locals tce;
-62;testsuite-regression.at:28;barrier between two for loops (repl);regression tce;
-63;testsuite-regression.at:34;barrier between two for loops (loops);regression tce;
-64;testsuite-regression.at:40;simple for-loop with a barrier inside (repl);regression;
-65;testsuite-regression.at:46;simple for-loop with a barrier inside (loops);regression;
-66;testsuite-regression.at:52;for-loop with computation after the brexit (repl) - lp:938123;regression tce;
-67;testsuite-regression.at:58;for-loop with computation after the brexit (loops) - lp:938123;regression tce;
-68;testsuite-regression.at:64;for-loop with a variable iteration count (repl) - lp:938883;regression;
-69;testsuite-regression.at:70;for-loop with a variable iteration count (loops) - lp:938883;regression;
-70;testsuite-regression.at:79;early return before a barrier region (repl) - lp:940248;regression early-return tce;
-71;testsuite-regression.at:85;early return before a barrier region (loops) - lp:940248;regression early-return tce;
-72;testsuite-regression.at:92;id-dependent computation before kernel exit (repl) - lp:940549;regression;
-73;testsuite-regression.at:98;id-dependent computation before kernel exit (loops) - lp:940549;regression;
-74;testsuite-regression.at:104;struct kernel arguments - lp:987905;regression struct;
-75;testsuite-regression.at:113;vector kernel arguments - lp:987905;regression vectorarg;
-76;testsuite-regression.at:127;barrier just before return (repl) - lp:1012030;regression struct;
-77;testsuite-regression.at:133;barrier just before return (loops) - lp:1012030;regression struct;
-78;testsuite-regression.at:139;infinite loop (repl) - lp:941558;regression infinite-loop tce;
-79;testsuite-regression.at:146;infinite loop (loops) - lp:941558;regression infinite-loop tce;
-80;testsuite-regression.at:153;passing a constant array as an arg - lp:1032203;regression const-array tce;
-81;testsuite-regression.at:159;undominated variable from conditional barrier handling (repl) - lp:1045835;regression undominated;
-82;testsuite-regression.at:165;undominated variable from conditional barrier handling (loops) - lp:1045835;regression undominated;
-83;testsuite-regression.at:174;clSetKernelArg overwriting the previous kernel's args - lp:1075134;regression setkernelarg;
-84;testsuite-regression.at:180;setting a buffer argument to NULL causes a segfault - lp:1109030;regression nullarg;
-85;testsuite-regression.at:186;sizeof(uint);sizeof;
-86;testsuite-regression.at:193;block;block;
-87;testsuite-regression.at:203;case with multiple variable length loops and a barrier in one;regression varlengthloops;
-88;testsuite-regression.at:209;assigning a loop iterator variable to a private makes it local - issue 94 (repl);regression looppriv;
-89;testsuite-regression.at:218;assigning a loop iterator variable to a private makes it local - issue 94 (loops);regression looppriv;
-90;testsuite-regression.at:227;assigning a loop iterator variable to a private makes it local 2 - issue 102 (repl);regression looppriv;
-91;testsuite-regression.at:236;assigning a loop iterator variable to a private makes it local 2 - issue 102 (loops);regression looppriv;
-92;testsuite-regression.at:247;local struct arrays produce illegal AS casts;regression local_struct_array;
-93;testsuite-regression.at:263;LoopVectorizer crash with Haswell and Broadwell - issue 231;regression issue_231;
-94;testsuite-runtime.at:4;clGetDeviceInfo;runtime;
-95;testsuite-runtime.at:9;clEnqueueNativeKernel;runtime;
-96;testsuite-runtime.at:14;clGetEventInfo;runtime;
-97;testsuite-runtime.at:19;read/copy/write buffer;runtime;
-98;testsuite-runtime.at:24;event cycle;runtime;
-99;testsuite-runtime.at:29;event freeing;runtime;
-100;testsuite-runtime.at:34;clCreateProgramWithBinary;runtime;
-101;testsuite-runtime.at:40;clBuildProgram;runtime;
-102;testsuite-runtime.at:46;test_kernel_cache_includes;runtime;
-103;testsuite-runtime.at:53;clBuildProgram link error;runtime;
-104;testsuite-runtime.at:60;clFinish;runtime;
-105;testsuite-runtime.at:66;clSetEventCallback;runtime;
-106;testsuite-runtime.at:73;clGetSupportedImageFormats;runtime;
-107;testsuite-runtime.at:78;clCreateKernelsInProgram;runtime;
-108;testsuite-runtime.at:86;clCreateKernel;runtime;
-109;testsuite-runtime.at:92;clGetKernelArgInfo;runtime;
-110;testsuite-runtime.at:98;clCreateSubDevices;runtime;
-111;testsuite-tce.at:4;A basic ttasim driver test;tce tta ttasim;
-112;testsuite-tce.at:12;Half-precision floats on ttasim (repl);tce tta ttasim half;
-113;testsuite-tce.at:37;Half-precision floats on ttasim (loopvec);tce tta ttasim half;
-114;testsuite-tce.at:61;A basic TCEMC test;tce tta ttasim tcemc;
-115;testsuite-samples.at:8;Building the sources against pocl;booksamples buildsamples;
-116;testsuite-samples.at:28;Run Chapter 2: Hello World;booksamples helloworld;
-117;testsuite-samples.at:38;Run Chapter 3: OpenCLConvolution;booksamples;
-118;testsuite-samples.at:52;Run Chapter 6: HelloBinaryWorld;booksamples hellobinaryworld;
-119;testsuite-samples.at:66;Run Chapter 7: SimpleBufferSubBuffer;booksamples simplebuffersubbuffer;
-120;testsuite-samples.at:79;Run Chapter 8: ImageFilter2D;booksamples imagefilter2d;
-121;testsuite-samples.at:102;Run Chapter 12: VectorAdd (C++ bindings);booksamples;
-122;testsuite-viennacl.at:17;fft;viennacl fft;
-123;testsuite-viennacl.at:21;custom-context;viennacl custom-context;
-124;testsuite-viennacl.at:25;custom-kernels;viennacl custom-kernels;
-125;testsuite-viennacl.at:29;scheduler;viennacl scheduler;
-126;testsuite-viennacl.at:39;bandwidth-reduction;viennacl bandwidth-reduction long;
-127;testsuite-viennacl.at:55;blas3_solve_double-test-opencl;viennacl blas3_solve_double-test-opencl long;
-128;testsuite-viennacl.at:71;external_linkage-opencl;viennacl external_linkage-opencl;
-129;testsuite-viennacl.at:92;global_variables-test-opencl;viennacl global_variables-test-opencl;
-130;testsuite-viennacl.at:96;iterators-test-opencl;viennacl iterators-test-opencl;
-131;testsuite-viennacl.at:104;matrix_col_double-test-opencl long;viennacl matrix_col_double-test-opencl long;
-132;testsuite-viennacl.at:110;matrix_col_float-test-opencl long;viennacl matrix_col_float-test-opencl long;
-133;testsuite-viennacl.at:116;matrix_col_int-test-opencl;viennacl matrix_col_int-test-opencl;
-134;testsuite-viennacl.at:124;matrix_row_double-test-opencl long;viennacl matrix_row_double-test-opencl long;
-135;testsuite-viennacl.at:130;matrix_row_float-test-opencl long;viennacl matrix_row_float-test-opencl long;
-136;testsuite-viennacl.at:136;matrix_row_int-test-opencl;viennacl matrix_row_int-test-opencl;
-137;testsuite-viennacl.at:144;matrix_vector_int-test-opencl;viennacl matrix_vector_int-test-opencl;
-138;testsuite-viennacl.at:152;matrix_vector-test-opencl long;viennacl matrix_vector-test-opencl long;
-139;testsuite-viennacl.at:156;nmf-test-opencl;viennacl nmf-test-opencl;
-140;testsuite-viennacl.at:164;scalar-test-opencl;viennacl scalar-test-opencl;
-141;testsuite-viennacl.at:168;structured-matrices-test-opencl;viennacl structured-matrices-test-opencl;
-142;testsuite-viennacl.at:176;vector_double-test-opencl;viennacl vector_double-test-opencl;
-143;testsuite-rodinia.at:19;backprop;rodinia backprop long;
-144;testsuite-rodinia.at:31;bfs;rodinia bfs;
-145;testsuite-rodinia.at:42;cfd;rodinia cfd long;
-146;testsuite-rodinia.at:54;lud;rodinia lud;
-147;testsuite-rodinia.at:61;hotspot;rodinia hotspot long;
-148;testsuite-rodinia.at:69;kmeans;rodinia kmeans;
-149;testsuite-rodinia.at:78;lavaMD;rodinia lavamd;
-150;testsuite-rodinia.at:91;pathfinder;rodinia pathfinder;
-151;testsuite-rodinia.at:102;srad;rodinia srad;
-152;testsuite-parboil.at:17;spmv;parboil spmv;
-153;testsuite-parboil.at:26;stencil;parboil stencil;
-154;testsuite-parboil.at:33;tpacf;parboil tpacf;
-155;testsuite-parboil.at:44;cutcp;parboil cutcp;
-156;testsuite-parboil.at:51;mri-gridding;parboil mri-gridding long;
-157;testsuite-parboil.at:63;sad;parboil sad;
-158;testsuite-parboil.at:72;bfs;parboil bfs bfs-parboil long;
-159;testsuite-parboil.at:84;histo;parboil histo;
-160;testsuite-parboil.at:93;sgemm;parboil sgemm;
-161;testsuite-parboil.at:100;mri-q;parboil mri-q;
-162;testsuite-parboil.at:107;lbm;parboil lbm long;
-163;testsuite-amd.at:11;aesencryptdecrypt-repl;amdsdk aesencryptdecrypt-repl long;
-164;testsuite-amd.at:19;aesencryptdecrypt-loops;amdsdk aesencryptdecrypt-loops;
-165;testsuite-amd.at:25;atomiccounters;amdsdk atomiccounters;
-166;testsuite-amd.at:33;bitonicsort;amdsdk bitonicsort;
-167;testsuite-amd.at:39;binarysearch;amdsdk binarysearch;
-168;testsuite-amd.at:45;binomialoption-repl;amdsdk binomialoption-repl long;
-169;testsuite-amd.at:53;binomialoption-loops;amdsdk binomialoption-loops;
-170;testsuite-amd.at:59;blackscholes;amdsdk blackscholes;
-171;testsuite-amd.at:65;blackscholesdp;amdsdk blackscholesdp cl_amd_fp64;
-172;testsuite-amd.at:78;boxfilter;amdsdk boxfilter;
-173;testsuite-amd.at:85;dct;amdsdk dct;
-174;testsuite-amd.at:93;devicefission;amdsdk devicefission;
-175;testsuite-amd.at:99;dwthaar1d;amdsdk dwthaar1d;
-176;testsuite-amd.at:108;fastwalshtransform;amdsdk fastwalshtransform;
-177;testsuite-amd.at:114;floydwarshall;amdsdk floydwarshall;
-178;testsuite-amd.at:120;fluidsimulation2d;amdsdk fluidsimulation2d cl_amd_fp64;
-179;testsuite-amd.at:131;helloworld;amdsdk helloworld;
-180;testsuite-amd.at:138;histogram-repl;amdsdk histogram-repl long;
-181;testsuite-amd.at:144;histogram-loops;amdsdk histogram-loops;
-182;testsuite-amd.at:150;imageoverlap;amdsdk imageoverlap;
-183;testsuite-amd.at:158;ludecomposition;amdsdk ludecomposition cl_amd_fp64;
-184;testsuite-amd.at:170;mandelbrot;amdsdk mandelbrot;
-185;testsuite-amd.at:178;matrixmul;amdsdk matrixmul;
-186;testsuite-amd.at:186;matrixmulimage;amdsdk matrixmulimage;
-187;testsuite-amd.at:192;matrixtranspose;amdsdk matrixtranspose;
-188;testsuite-amd.at:198;memorymodel-repl;amdsdk memorymodel-repl long;
-189;testsuite-amd.at:204;memorymodel-loops;amdsdk memorymodel-loops;
-190;testsuite-amd.at:210;montecarloasian;amdsdk montecarloasian;
-191;testsuite-amd.at:216;montecarloasiandp;amdsdk montecarloasiandp cl_amd_fp64;
-192;testsuite-amd.at:226;nbody;amdsdk nbody;
-193;testsuite-amd.at:233;prefixsum;amdsdk prefixsum;
-194;testsuite-amd.at:239;quasirandomsequence;amdsdk quasirandomsequence;
-195;testsuite-amd.at:245;radixsort;amdsdk radixsort;
-196;testsuite-amd.at:251;recursivegaussian;amdsdk recursivegaussian;
-197;testsuite-amd.at:257;reduction;amdsdk reduction;
-198;testsuite-amd.at:263;scanlargearrays;amdsdk scanlargearrays;
-199;testsuite-amd.at:270;simpleconvolution;amdsdk simpleconvolution;
-200;testsuite-amd.at:276;simpleimage;amdsdk simpleimage;
-201;testsuite-amd.at:283;sobelfilter;amdsdk sobelfilter;
-202;testsuite-amd.at:289;template;amdsdk template;
-203;testsuite-amd.at:295;templatec;amdsdk templatec;
-204;testsuite-amd.at:301;transferoverlap;amdsdk transferoverlap;
-205;testsuite-amd.at:307;urng;amdsdk urng;
-206;testsuite-amdsdk2_9.at:20;asyncdatatransfer;amdsdk2.9 amdsdk long asyncdatatransfer;
-207;testsuite-amdsdk2_9.at:29;atomiccounters;amdsdk2.9 amdsdk long atomiccounters;
-208;testsuite-amdsdk2_9.at:37;basicdebug;amdsdk2.9 amdsdk long basicdebug;
-209;testsuite-amdsdk2_9.at:49;binarysearch;hsa amdsdk2.9 amdsdk long binarysearch;
-210;testsuite-amdsdk2_9.at:55;binomialoption-repl;amdsdk2.9 amdsdk long binomialoption-repl;
-211;testsuite-amdsdk2_9.at:63;binomialoption-loops;hsa amdsdk2.9 amdsdk long binomialoption-loops;
-212;testsuite-amdsdk2_9.at:69;binomialoptionmultigpu;amdsdk2.9 amdsdk long binomialoptionmultigpu;
-213;testsuite-amdsdk2_9.at:76;bitonicsort;hsa amdsdk2.9 amdsdk long bitonicsort;
-214;testsuite-amdsdk2_9.at:82;blackscholes;hsa amdsdk2.9 amdsdk long blackscholes;
-215;testsuite-amdsdk2_9.at:88;blackscholesdp;amdsdk2.9 amdsdk long blackscholesdp cl_amd_fp64;
-216;testsuite-amdsdk2_9.at:99;boxfilter;amdsdk2.9 amdsdk long boxfilter;
-217;testsuite-amdsdk2_9.at:106;boxfilterGL;amdsdk2.9 amdsdk long boxfiltergl;
-218;testsuite-amdsdk2_9.at:115;bufferbandwidth;amdsdk2.9 amdsdk long bufferbandwidth;
-219;testsuite-amdsdk2_9.at:125;bufferImageInterop;amdsdk2.9 amdsdk long bufferimageinterop;
-220;testsuite-amdsdk2_9.at:134;concurrentkernel;amdsdk2.9 amdsdk long concurrentkernel;
-221;testsuite-amdsdk2_9.at:141;constantbandwidth;amdsdk2.9 amdsdk long constantbandwidth;
-222;testsuite-amdsdk2_9.at:150;cpluspluswrapper;amdsdk2.9 amdsdk long cpluspluswrapper;
-223;testsuite-amdsdk2_9.at:156;dct;hsa amdsdk2.9 amdsdk long dct;
-224;testsuite-amdsdk2_9.at:162;devicefission;amdsdk2.9 amdsdk long devicefission;
-225;testsuite-amdsdk2_9.at:168;devicefission11ext;amdsdk2.9 amdsdk long devicefission11ext;
-226;testsuite-amdsdk2_9.at:176;dwthaar1d;amdsdk2.9 amdsdk long dwthaar1d;
-227;testsuite-amdsdk2_9.at:185;dwthaar1dcppkernel;amdsdk2.9 amdsdk long dwthaar1dcppkernel;
-228;testsuite-amdsdk2_9.at:193;eigenvalue;amdsdk2.9 amdsdk long eigenvalue;
-229;testsuite-amdsdk2_9.at:201;fastwalshtransform;hsa amdsdk2.9 amdsdk long fastwalshtransform;
-230;testsuite-amdsdk2_9.at:207;floydwarshall;hsa amdsdk2.9 amdsdk long floydwarshall;
-231;testsuite-amdsdk2_9.at:213;fft;amdsdk2.9 amdsdk long fft;
-232;testsuite-amdsdk2_9.at:221;fluidsimulation2d;amdsdk2.9 amdsdk long fluidsimulation2d cl_amd_fp64;
-233;testsuite-amdsdk2_9.at:232;gaussiannoise;amdsdk2.9 amdsdk long gaussiannoise;
-234;testsuite-amdsdk2_9.at:238;gaussiannoisegl;amdsdk2.9 amdsdk long gaussiannoisegl;
-235;testsuite-amdsdk2_9.at:253;hdrtonemapping;amdsdk2.9 amdsdk long hdrtonemapping;
-236;testsuite-amdsdk2_9.at:259;helloworld;hsa amdsdk2.9 amdsdk long helloworld;
-237;testsuite-amdsdk2_9.at:266;histogram-repl;amdsdk2.9 amdsdk long histogram-repl;
-238;testsuite-amdsdk2_9.at:272;histogram-loops;hsa amdsdk2.9 amdsdk long histogram-loops;
-239;testsuite-amdsdk2_9.at:278;histogramatomic;amdsdk2.9 amdsdk long histogramatomic;
-240;testsuite-amdsdk2_9.at:284;imagebandwidth;amdsdk2.9 amdsdk long imagebandwidth;
-241;testsuite-amdsdk2_9.at:292;imageoverlap;amdsdk2.9 amdsdk long imageoverlap;
-242;testsuite-amdsdk2_9.at:298;introstatickcppkernel;amdsdk2.9 amdsdk long introstatickcppkernel;
-243;testsuite-amdsdk2_9.at:306;kernellauch;amdsdk2.9 amdsdk long kernellauch;
-244;testsuite-amdsdk2_9.at:314;kmeansautoclustering;amdsdk2.9 amdsdk long kmeansautoclustering;
-245;testsuite-amdsdk2_9.at:323;ldsbandwidth;amdsdk2.9 amdsdk long ldsbandwidth;
-246;testsuite-amdsdk2_9.at:331;ludecomposition;amdsdk2.9 amdsdk long ludecomposition cl_amd_fp64;
-247;testsuite-amdsdk2_9.at:341;mandelbrot;amdsdk2.9 amdsdk long mandelbrot;
-248;testsuite-amdsdk2_9.at:350;matrixmuldouble;amdsdk2.9 amdsdk long matrixmuldouble;
-249;testsuite-amdsdk2_9.at:356;matrixmulimage;amdsdk2.9 amdsdk long matrixmulimage;
-250;testsuite-amdsdk2_9.at:362;matrixmultiplication;hsa amdsdk2.9 amdsdk long matrixmultiplication;
-251;testsuite-amdsdk2_9.at:370;matrixtranspose;hsa amdsdk2.9 amdsdk long matrixtranspose;
-252;testsuite-amdsdk2_9.at:376;memorymodel-repl;amdsdk2.9 amdsdk long memorymodel-repl;
-253;testsuite-amdsdk2_9.at:382;memorymodel-loops;amdsdk2.9 amdsdk long memorymodel-loops;
-254;testsuite-amdsdk2_9.at:388;memoryoptimizations;amdsdk2.9 amdsdk long memoryoptimizations;
-255;testsuite-amdsdk2_9.at:418;merzennetwister;amdsdk2.9 amdsdk long merzennetwister;
-256;testsuite-amdsdk2_9.at:426;montecarloasian;amdsdk2.9 amdsdk long montecarloasian;
-257;testsuite-amdsdk2_9.at:436;montecarloasiandp;amdsdk2.9 amdsdk long montecarloasiandp cl_amd_fp64;
-258;testsuite-amdsdk2_9.at:446;montecarloasianmultigpu;amdsdk2.9 amdsdk long montecarloasianmultigpu;
-259;testsuite-amdsdk2_9.at:452;nbody;amdsdk2.9 amdsdk long nbody;
-260;testsuite-amdsdk2_9.at:460;prefixsum;hsa amdsdk2.9 amdsdk long prefixsum;
-261;testsuite-amdsdk2_9.at:466;quasirandomsequence;hsa amdsdk2.9 amdsdk long quasirandomsequence;
-262;testsuite-amdsdk2_9.at:472;radixsort;amdsdk2.9 amdsdk long radixsort;
-263;testsuite-amdsdk2_9.at:478;recursivegaussian;amdsdk2.9 amdsdk long recursivegaussian;
-264;testsuite-amdsdk2_9.at:484;reduction;amdsdk2.9 amdsdk long reduction;
-265;testsuite-amdsdk2_9.at:490;scanlargearrays;hsa amdsdk2.9 amdsdk long scanlargearrays;
-266;testsuite-amdsdk2_9.at:497;simpleconvolution;hsa amdsdk2.9 amdsdk long simpleconvolution;
-267;testsuite-amdsdk2_9.at:503;simplegl;amdsdk2.9 amdsdk long simplegl;
-268;testsuite-amdsdk2_9.at:511;simpleimage;amdsdk2.9 amdsdk long simpleimage;
-269;testsuite-amdsdk2_9.at:518;soaversusaos;amdsdk2.9 amdsdk long soaversusaos;
-270;testsuite-amdsdk2_9.at:527;sobelfilter;hsa amdsdk2.9 amdsdk long sobelfilter;
-271;testsuite-amdsdk2_9.at:533;sobelfilterimage;amdsdk2.9 amdsdk long sobelfilterimage;
-272;testsuite-amdsdk2_9.at:541;stringsearch;amdsdk2.9 amdsdk long stringsearch;
-273;testsuite-amdsdk2_9.at:548;template;amdsdk2.9 amdsdk long template;
-274;testsuite-amdsdk2_9.at:554;transferoverlap;amdsdk2.9 amdsdk long transferoverlap;
-275;testsuite-amdsdk2_9.at:560;transferoverlapcpp;amdsdk2.9 amdsdk long transferoverlapcpp;
-276;testsuite-amdsdk2_9.at:568;unsharpmask;amdsdk2.9 amdsdk long unsharpmask;
-277;testsuite-amdsdk2_9.at:576;urng;hsa amdsdk2.9 amdsdk long urng;
-278;testsuite-amdsdk2_9.at:582;urngnoisegl;amdsdk2.9 amdsdk long urngnoisegl;
-279;testsuite-vexcl.at:10;fft;vexcl fft;
-280;testsuite-vexcl.at:19;generator;vexcl generator;
-281;testsuite-vexcl.at:25;multiple_objects;vexcl multiple_objects;
-282;testsuite-vexcl.at:31;multivector_arithmetics;vexcl multivector_arithmetics;
-283;testsuite-vexcl.at:37;multivector_create;vexcl multivector_create;
-284;testsuite-vexcl.at:43;random;vexcl random;
-285;testsuite-vexcl.at:49;spmv;vexcl spmv;
-286;testsuite-vexcl.at:55;stencil;vexcl stencil;
-287;testsuite-vexcl.at:63;vector_arithmetics;vexcl vector_arithmetics;
-288;testsuite-vexcl.at:69;vector_copy;vexcl vector_copy;
-289;testsuite-vexcl.at:75;vector_create;vexcl vector_create;
-290;testsuite-halide.at:11;tutorial12;halide tutorial12;
-291;testsuite-halide.at:21;bilateral_grid;halide bilateral_grid;
-292;testsuite-halide.at:26;interpolate;halide interpolate;
-293;testsuite-halide.at:31;local_laplacian;halide local_laplacian;
-294;testsuite-cloverleaf.at:11;cloverleaf;cloverleaf;
-295;testsuite-piglit.at:3;Piglit testsuite with LLVM 3.5;piglit long;
-296;testsuite-opencv.at:9;UMat;opencv umat;
-297;testsuite-opencv.at:14;Core_UMat;opencv core_umat;
-298;testsuite-opencv.at:19;Image2D;opencv image2d;
-299;testsuite-opencv.at:24;UMatBasicTests;opencv umatbasictests;
-300;testsuite-opencv.at:29;UMatTestReshape;opencv umattestreshape;
-301;testsuite-opencv.at:34;UMatTestRoi;opencv umattestroi;
-302;testsuite-opencv.at:39;UMatTestSizeOperations;opencv umattestsizeoperations;
-303;testsuite-opencv.at:44;UMatTestUMatOperations;opencv umattestumatoperations;
-304;testsuite-opencv.at:51;OCL_Channels/Merge;opencv ocl_channels/merge;
-305;testsuite-opencv.at:56;OCL_Channels/Split;opencv ocl_channels/split;
-306;testsuite-opencv.at:61;OCL_Channels/MixChannels;opencv ocl_channels/mixchannels;
-307;testsuite-opencv.at:66;OCL_Channels/InsertChannel;opencv ocl_channels/insertchannel;
-308;testsuite-opencv.at:71;OCL_Channels/ExtractChannel;opencv ocl_channels/extractchannel;
-309;testsuite-opencv.at:78;Lut;opencv lut;
-310;testsuite-opencv.at:83;Add;opencv add;
-311;testsuite-opencv.at:88;Subtract;opencv subtract;
-312;testsuite-opencv.at:93;Mul;opencv mul;
-313;testsuite-opencv.at:98;Div;opencv div;
-314;testsuite-opencv.at:103;Min;opencv min;
-315;testsuite-opencv.at:108;Max;opencv max;
-316;testsuite-opencv.at:113;AddWeighted;opencv addweighted;
-317;testsuite-opencv.at:118;Absdiff;opencv absdiff;
-318;testsuite-opencv.at:123;CartToPolar;opencv carttopolar;
-319;testsuite-opencv.at:128;PolarToCart;opencv polartocart;
-320;testsuite-opencv.at:133;Transpose;opencv transpose;
-321;testsuite-opencv.at:138;Bitwise_and;opencv bitwise_and;
-322;testsuite-opencv.at:143;Bitwise_or;opencv bitwise_or;
-323;testsuite-opencv.at:148;Bitwise_xor;opencv bitwise_xor;
-324;testsuite-opencv.at:153;Bitwise_not;opencv bitwise_not;
-325;testsuite-opencv.at:158;Compare;opencv compare;
-326;testsuite-opencv.at:163;Pow;opencv pow;
-327;testsuite-opencv.at:168;SetIdentity;opencv setidentity;
-328;testsuite-opencv.at:173;Repeat;opencv repeat;
-329;testsuite-opencv.at:178;CountNonZero;opencv countnonzero;
-330;testsuite-opencv.at:183;Sum;opencv sum;
-331;testsuite-opencv.at:188;MeanStdDev;opencv meanstddev;
-332;testsuite-opencv.at:193;Log;opencv log;
-333;testsuite-opencv.at:198;Exp;opencv exp;
-334;testsuite-opencv.at:203;Phase;opencv phase;
-335;testsuite-opencv.at:208;Magnitude;opencv magnitude;
-336;testsuite-opencv.at:213;Flip;opencv flip;
-337;testsuite-opencv.at:218;MinMaxIdx;opencv minmaxidx;
-338;testsuite-opencv.at:223;MinMaxIdx_Mask;opencv minmaxidx_mask;
-339;testsuite-opencv.at:228;Norm;opencv norm;
-340;testsuite-opencv.at:233;UMatDot;opencv umatdot;
-341;testsuite-opencv.at:238;Sqrt;opencv sqrt;
-342;testsuite-opencv.at:243;Normalize;opencv normalize;
-343;testsuite-opencv.at:248;InRange;opencv inrange;
-344;testsuite-opencv.at:253;ConvertScaleAbs;opencv convertscaleabs;
-345;testsuite-opencv.at:258;ScaleAdd;opencv scaleadd;
-346;testsuite-opencv.at:263;PatchNaNs;opencv patchnans;
-347;testsuite-opencv.at:268;Psnr;opencv psnr;
-348;testsuite-opencv.at:273;ReduceSum;opencv reducesum;
-349;testsuite-opencv.at:278;ReduceMax;opencv reducemax;
-350;testsuite-opencv.at:283;ReduceAvg;opencv reduceavg;
-351;testsuite-opencv.at:290;Gemm;opencv gemm;
-352;testsuite-opencv.at:295;Dft;opencv dft;
-353;testsuite-opencv.at:302;MultiSpectrums;opencv multispectrums;
-354;testsuite-opencv.at:309;ConvertTo;opencv convertto;
-355;testsuite-opencv.at:314;CopyTo;opencv copyto;
-356;testsuite-opencv.at:319;SetTo;opencv setto;
-357;testsuite-opencv.at:324;UMatExpr;opencv umatexpr;
-"
-# List of the all the test groups.
-at_groups_all=`$as_echo "$at_help_all" | sed 's/;.*//'`
-
-# at_fn_validate_ranges NAME...
-# -----------------------------
-# Validate and normalize the test group number contained in each variable
-# NAME. Leading zeroes are treated as decimal.
-at_fn_validate_ranges ()
-{
-  for at_grp
-  do
-    eval at_value=\$$at_grp
-    if test $at_value -lt 1 || test $at_value -gt 357; then
-      $as_echo "invalid test group: $at_value" >&2
-      exit 1
-    fi
-    case $at_value in
-      0*) # We want to treat leading 0 as decimal, like expr and test, but
-	  # AS_VAR_ARITH treats it as octal if it uses $(( )).
-	  # With XSI shells, ${at_value#${at_value%%[1-9]*}} avoids the
-	  # expr fork, but it is not worth the effort to determine if the
-	  # shell supports XSI when the user can just avoid leading 0.
-	  eval $at_grp='`expr $at_value + 0`' ;;
-    esac
-  done
-}
-
-at_prev=
-for at_option
-do
-  # If the previous option needs an argument, assign it.
-  if test -n "$at_prev"; then
-    at_option=$at_prev=$at_option
-    at_prev=
-  fi
-
-  case $at_option in
-  *=?*) at_optarg=`expr "X$at_option" : '[^=]*=\(.*\)'` ;;
-  *)    at_optarg= ;;
-  esac
-
-  # Accept the important Cygnus configure options, so we can diagnose typos.
-
-  case $at_option in
-    --help | -h )
-	at_help_p=:
-	;;
-
-    --list | -l )
-	at_list_p=:
-	;;
-
-    --version | -V )
-	at_version_p=:
-	;;
-
-    --clean | -c )
-	at_clean=:
-	;;
-
-    --color )
-	at_color=always
-	;;
-    --color=* )
-	case $at_optarg in
-	no | never | none) at_color=never ;;
-	auto | tty | if-tty) at_color=auto ;;
-	always | yes | force) at_color=always ;;
-	*) at_optname=`echo " $at_option" | sed 's/^ //; s/=.*//'`
-	   as_fn_error $? "unrecognized argument to $at_optname: $at_optarg" ;;
-	esac
-	;;
-
-    --debug | -d )
-	at_debug_p=:
-	;;
-
-    --errexit | -e )
-	at_debug_p=:
-	at_errexit_p=:
-	;;
-
-    --verbose | -v )
-	at_verbose=; at_quiet=:
-	;;
-
-    --trace | -x )
-	at_traceon='set -x'
-	at_trace_echo=echo
-	at_check_filter_trace=at_fn_filter_trace
-	;;
-
-    [0-9] | [0-9][0-9] | [0-9][0-9][0-9] | [0-9][0-9][0-9][0-9])
-	at_fn_validate_ranges at_option
-	as_fn_append at_groups "$at_option$as_nl"
-	;;
-
-    # Ranges
-    [0-9]- | [0-9][0-9]- | [0-9][0-9][0-9]- | [0-9][0-9][0-9][0-9]-)
-	at_range_start=`echo $at_option |tr -d X-`
-	at_fn_validate_ranges at_range_start
-	at_range=`$as_echo "$at_groups_all" | \
-	  sed -ne '/^'$at_range_start'$/,$p'`
-	as_fn_append at_groups "$at_range$as_nl"
-	;;
-
-    -[0-9] | -[0-9][0-9] | -[0-9][0-9][0-9] | -[0-9][0-9][0-9][0-9])
-	at_range_end=`echo $at_option |tr -d X-`
-	at_fn_validate_ranges at_range_end
-	at_range=`$as_echo "$at_groups_all" | \
-	  sed -ne '1,/^'$at_range_end'$/p'`
-	as_fn_append at_groups "$at_range$as_nl"
-	;;
-
-    [0-9]-[0-9] | [0-9]-[0-9][0-9] | [0-9]-[0-9][0-9][0-9] | \
-    [0-9]-[0-9][0-9][0-9][0-9] | [0-9][0-9]-[0-9][0-9] | \
-    [0-9][0-9]-[0-9][0-9][0-9] | [0-9][0-9]-[0-9][0-9][0-9][0-9] | \
-    [0-9][0-9][0-9]-[0-9][0-9][0-9] | \
-    [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9] | \
-    [0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9] )
-	at_range_start=`expr $at_option : '\(.*\)-'`
-	at_range_end=`expr $at_option : '.*-\(.*\)'`
-	if test $at_range_start -gt $at_range_end; then
-	  at_tmp=$at_range_end
-	  at_range_end=$at_range_start
-	  at_range_start=$at_tmp
-	fi
-	at_fn_validate_ranges at_range_start at_range_end
-	at_range=`$as_echo "$at_groups_all" | \
-	  sed -ne '/^'$at_range_start'$/,/^'$at_range_end'$/p'`
-	as_fn_append at_groups "$at_range$as_nl"
-	;;
-
-    # Directory selection.
-    --directory | -C )
-	at_prev=--directory
-	;;
-    --directory=* )
-	at_change_dir=:
-	at_dir=$at_optarg
-	if test x- = "x$at_dir" ; then
-	  at_dir=./-
-	fi
-	;;
-
-    # Parallel execution.
-    --jobs | -j )
-	at_jobs=0
-	;;
-    --jobs=* | -j[0-9]* )
-	if test -n "$at_optarg"; then
-	  at_jobs=$at_optarg
-	else
-	  at_jobs=`expr X$at_option : 'X-j\(.*\)'`
-	fi
-	case $at_jobs in *[!0-9]*)
-	  at_optname=`echo " $at_option" | sed 's/^ //; s/[0-9=].*//'`
-	  as_fn_error $? "non-numeric argument to $at_optname: $at_jobs" ;;
-	esac
-	;;
-
-    # Keywords.
-    --keywords | -k )
-	at_prev=--keywords
-	;;
-    --keywords=* )
-	at_groups_selected=$at_help_all
-	at_save_IFS=$IFS
-	IFS=,
-	set X $at_optarg
-	shift
-	IFS=$at_save_IFS
-	for at_keyword
-	do
-	  at_invert=
-	  case $at_keyword in
-	  '!'*)
-	    at_invert="-v"
-	    at_keyword=`expr "X$at_keyword" : 'X!\(.*\)'`
-	    ;;
-	  esac
-	  # It is on purpose that we match the test group titles too.
-	  at_groups_selected=`$as_echo "$at_groups_selected" |
-	      grep -i $at_invert "^[1-9][^;]*;.*[; ]$at_keyword[ ;]"`
-	done
-	# Smash the keywords.
-	at_groups_selected=`$as_echo "$at_groups_selected" | sed 's/;.*//'`
-	as_fn_append at_groups "$at_groups_selected$as_nl"
-	;;
-    --recheck)
-	at_recheck=:
-	;;
-
-    *=*)
-	at_envvar=`expr "x$at_option" : 'x\([^=]*\)='`
-	# Reject names that are not valid shell variable names.
-	case $at_envvar in
-	  '' | [0-9]* | *[!_$as_cr_alnum]* )
-	    as_fn_error $? "invalid variable name: \`$at_envvar'" ;;
-	esac
-	at_value=`$as_echo "$at_optarg" | sed "s/'/'\\\\\\\\''/g"`
-	# Export now, but save eval for later and for debug scripts.
-	export $at_envvar
-	as_fn_append at_debug_args " $at_envvar='$at_value'"
-	;;
-
-     *) $as_echo "$as_me: invalid option: $at_option" >&2
-	$as_echo "Try \`$0 --help' for more information." >&2
-	exit 1
-	;;
-  esac
-done
-
-# Verify our last option didn't require an argument
-if test -n "$at_prev"; then :
-  as_fn_error $? "\`$at_prev' requires an argument"
-fi
-
-# The file containing the suite.
-at_suite_log=$at_dir/$as_me.log
-
-# Selected test groups.
-if test -z "$at_groups$at_recheck"; then
-  at_groups=$at_groups_all
-else
-  if test -n "$at_recheck" && test -r "$at_suite_log"; then
-    at_oldfails=`sed -n '
-      /^Failed tests:$/,/^Skipped tests:$/{
-	s/^[ ]*\([1-9][0-9]*\):.*/\1/p
-      }
-      /^Unexpected passes:$/,/^## Detailed failed tests/{
-	s/^[ ]*\([1-9][0-9]*\):.*/\1/p
-      }
-      /^## Detailed failed tests/q
-      ' "$at_suite_log"`
-    as_fn_append at_groups "$at_oldfails$as_nl"
-  fi
-  # Sort the tests, removing duplicates.
-  at_groups=`$as_echo "$at_groups" | sort -nu | sed '/^$/d'`
-fi
-
-if test x"$at_color" = xalways \
-   || { test x"$at_color" = xauto && test -t 1; }; then
-  at_red=`printf '\033[0;31m'`
-  at_grn=`printf '\033[0;32m'`
-  at_lgn=`printf '\033[1;32m'`
-  at_blu=`printf '\033[1;34m'`
-  at_std=`printf '\033[m'`
-else
-  at_red= at_grn= at_lgn= at_blu= at_std=
-fi
-
-# Help message.
-if $at_help_p; then
-  cat <<_ATEOF || at_write_fail=1
-Usage: $0 [OPTION]... [VARIABLE=VALUE]... [TESTS]
-
-Run all the tests, or the selected TESTS, given by numeric ranges, and
-save a detailed log file.  Upon failure, create debugging scripts.
-
-Do not change environment variables directly.  Instead, set them via
-command line arguments.  Set \`AUTOTEST_PATH' to select the executables
-to exercise.  Each relative directory is expanded as build and source
-directories relative to the top level of this distribution.
-E.g., from within the build directory /tmp/foo-1.0, invoking this:
-
-  $ $0 AUTOTEST_PATH=bin
-
-is equivalent to the following, assuming the source directory is /src/foo-1.0:
-
-  PATH=/tmp/foo-1.0/bin:/src/foo-1.0/bin:\$PATH $0
-_ATEOF
-cat <<_ATEOF || at_write_fail=1
-
-Operation modes:
-  -h, --help     print the help message, then exit
-  -V, --version  print version number, then exit
-  -c, --clean    remove all the files this test suite might create and exit
-  -l, --list     describes all the tests, or the selected TESTS
-_ATEOF
-cat <<_ATEOF || at_write_fail=1
-
-Execution tuning:
-  -C, --directory=DIR
-                 change to directory DIR before starting
-      --color[=never|auto|always]
-                 disable colored test results, or enable even without terminal
-  -j, --jobs[=N]
-                 Allow N jobs at once; infinite jobs with no arg (default 1)
-  -k, --keywords=KEYWORDS
-                 select the tests matching all the comma-separated KEYWORDS
-                 multiple \`-k' accumulate; prefixed \`!' negates a KEYWORD
-      --recheck  select all tests that failed or passed unexpectedly last time
-  -e, --errexit  abort as soon as a test fails; implies --debug
-  -v, --verbose  force more detailed output
-                 default for debugging scripts
-  -d, --debug    inhibit clean up and top-level logging
-                 default for debugging scripts
-  -x, --trace    enable tests shell tracing
-_ATEOF
-cat <<_ATEOF || at_write_fail=1
-
-Report bugs to <pocl-devel at lists.sourceforge.net>.
-_ATEOF
-  exit $at_write_fail
-fi
-
-# List of tests.
-if $at_list_p; then
-  cat <<_ATEOF || at_write_fail=1
-pocl 0.12 test suite test groups:
-
- NUM: FILE-NAME:LINE     TEST-GROUP-NAME
-      KEYWORDS
-
-_ATEOF
-  # Pass an empty line as separator between selected groups and help.
-  $as_echo "$at_groups$as_nl$as_nl$at_help_all" |
-    awk 'NF == 1 && FS != ";" {
-	   selected[$ 1] = 1
-	   next
-	 }
-	 /^$/ { FS = ";" }
-	 NF > 0 {
-	   if (selected[$ 1]) {
-	     printf " %3d: %-18s %s\n", $ 1, $ 2, $ 3
-	     if ($ 4) {
-	       lmax = 79
-	       indent = "     "
-	       line = indent
-	       len = length (line)
-	       n = split ($ 4, a, " ")
-	       for (i = 1; i <= n; i++) {
-		 l = length (a[i]) + 1
-		 if (i > 1 && len + l > lmax) {
-		   print line
-		   line = indent " " a[i]
-		   len = length (line)
-		 } else {
-		   line = line " " a[i]
-		   len += l
-		 }
-	       }
-	       if (n)
-		 print line
-	     }
-	   }
-	 }' || at_write_fail=1
-  exit $at_write_fail
-fi
-if $at_version_p; then
-  $as_echo "$as_me (pocl 0.12)" &&
-  cat <<\_ATEOF || at_write_fail=1
-
-Copyright (C) 2012 Free Software Foundation, Inc.
-This test suite is free software; the Free Software Foundation gives
-unlimited permission to copy, distribute and modify it.
-_ATEOF
-  exit $at_write_fail
-fi
-
-# Should we print banners?  Yes if more than one test is run.
-case $at_groups in #(
-  *$as_nl* )
-      at_print_banners=: ;; #(
-  * ) at_print_banners=false ;;
-esac
-# Text for banner N, set to a single space once printed.
-# Banner 1. testsuite.at:35
-# Category starts at test group 2.
-at_banner_text_1="OpenCL specification tests"
-# Banner 2. testsuite.at:76
-# Category starts at test group 7.
-at_banner_text_2="Kernel runtime library"
-# Banner 3. testsuite.at:381
-# Category starts at test group 38.
-at_banner_text_3="Full applications"
-# Banner 4. testsuite-workgroup.at:26
-# Category starts at test group 39.
-at_banner_text_4="Workgroup creation tests"
-# Banner 5. testsuite-regression.at:2
-# Category starts at test group 58.
-at_banner_text_5="Regression tests"
-# Banner 6. testsuite-runtime.at:2
-# Category starts at test group 94.
-at_banner_text_6="Runtime library tests"
-# Banner 7. testsuite-tce.at:2
-# Category starts at test group 111.
-at_banner_text_7="TCE tests"
-# Banner 8. testsuite-samples.at:6
-# Category starts at test group 115.
-at_banner_text_8="OpenCL Programming Guide Samples"
-# Banner 9. testsuite-viennacl.at:15
-# Category starts at test group 122.
-at_banner_text_9="ViennaCL 1.5.1 tests"
-# Banner 10. testsuite-rodinia.at:17
-# Category starts at test group 143.
-at_banner_text_10="Rodinia 2.0.1 tests"
-# Banner 11. testsuite-parboil.at:15
-# Category starts at test group 152.
-at_banner_text_11="Parboil tests"
-# Banner 12. testsuite-amd.at:9
-# Category starts at test group 163.
-at_banner_text_12="AMD APP SDK tests"
-# Banner 13. testsuite-amdsdk2_9.at:18
-# Category starts at test group 206.
-at_banner_text_13="AMD APP SDK 2.9 tests"
-# Banner 14. testsuite-vexcl.at:8
-# Category starts at test group 279.
-at_banner_text_14="VexCL tests"
-# Banner 15. testsuite-halide.at:9
-# Category starts at test group 290.
-at_banner_text_15="Halide OpenCL examples"
-# Banner 16. testsuite-cloverleaf.at:9
-# Category starts at test group 294.
-at_banner_text_16="CloverLeaf"
-# Banner 17. testsuite-piglit.at:1
-# Category starts at test group 295.
-at_banner_text_17="Piglit OpenCL tests"
-# Banner 18. testsuite-opencv.at:7
-# Category starts at test group 296.
-at_banner_text_18="OpenCV UMat tests"
-# Banner 19. testsuite-opencv.at:49
-# Category starts at test group 304.
-at_banner_text_19="OpenCV Channels test"
-# Banner 20. testsuite-opencv.at:76
-# Category starts at test group 309.
-at_banner_text_20="OpenCV Arithm tests"
-# Banner 21. testsuite-opencv.at:300
-# Category starts at test group 353.
-at_banner_text_21="OpenCV ImgProc test"
-# Banner 22. testsuite-opencv.at:307
-# Category starts at test group 354.
-at_banner_text_22="OpenCV MatrixOperation tests"
-
-# Take any -C into account.
-if $at_change_dir ; then
-  test x != "x$at_dir" && cd "$at_dir" \
-    || as_fn_error $? "unable to change directory"
-  at_dir=`pwd`
-fi
-
-# Load the config files for any default variable assignments.
-for at_file in atconfig atlocal
-do
-  test -r $at_file || continue
-  . ./$at_file || as_fn_error $? "invalid content: $at_file"
-done
-
-# Autoconf <=2.59b set at_top_builddir instead of at_top_build_prefix:
-: "${at_top_build_prefix=$at_top_builddir}"
-
-# Perform any assignments requested during argument parsing.
-eval "$at_debug_args"
-
-# atconfig delivers names relative to the directory the test suite is
-# in, but the groups themselves are run in testsuite-dir/group-dir.
-if test -n "$at_top_srcdir"; then
-  builddir=../..
-  for at_dir_var in srcdir top_srcdir top_build_prefix
-  do
-    eval at_val=\$at_$at_dir_var
-    case $at_val in
-      [\\/$]* | ?:[\\/]* ) at_prefix= ;;
-      *) at_prefix=../../ ;;
-    esac
-    eval "$at_dir_var=\$at_prefix\$at_val"
-  done
-fi
-
-## -------------------- ##
-## Directory structure. ##
-## -------------------- ##
-
-# This is the set of directories and files used by this script
-# (non-literals are capitalized):
-#
-# TESTSUITE         - the testsuite
-# TESTSUITE.log     - summarizes the complete testsuite run
-# TESTSUITE.dir/    - created during a run, remains after -d or failed test
-# + at-groups/      - during a run: status of all groups in run
-# | + NNN/          - during a run: meta-data about test group NNN
-# | | + check-line  - location (source file and line) of current AT_CHECK
-# | | + status      - exit status of current AT_CHECK
-# | | + stdout      - stdout of current AT_CHECK
-# | | + stder1      - stderr, including trace
-# | | + stderr      - stderr, with trace filtered out
-# | | + test-source - portion of testsuite that defines group
-# | | + times       - timestamps for computing duration
-# | | + pass        - created if group passed
-# | | + xpass       - created if group xpassed
-# | | + fail        - created if group failed
-# | | + xfail       - created if group xfailed
-# | | + skip        - created if group skipped
-# + at-stop         - during a run: end the run if this file exists
-# + at-source-lines - during a run: cache of TESTSUITE line numbers for extraction
-# + 0..NNN/         - created for each group NNN, remains after -d or failed test
-# | + TESTSUITE.log - summarizes the group results
-# | + ...           - files created during the group
-
-# The directory the whole suite works in.
-# Should be absolute to let the user `cd' at will.
-at_suite_dir=$at_dir/$as_me.dir
-# The file containing the suite ($at_dir might have changed since earlier).
-at_suite_log=$at_dir/$as_me.log
-# The directory containing helper files per test group.
-at_helper_dir=$at_suite_dir/at-groups
-# Stop file: if it exists, do not start new jobs.
-at_stop_file=$at_suite_dir/at-stop
-# The fifo used for the job dispatcher.
-at_job_fifo=$at_suite_dir/at-job-fifo
-
-if $at_clean; then
-  test -d "$at_suite_dir" &&
-    find "$at_suite_dir" -type d ! -perm -700 -exec chmod u+rwx \{\} \;
-  rm -f -r "$at_suite_dir" "$at_suite_log"
-  exit $?
-fi
-
-# Don't take risks: use only absolute directories in PATH.
-#
-# For stand-alone test suites (ie. atconfig was not found),
-# AUTOTEST_PATH is relative to `.'.
-#
-# For embedded test suites, AUTOTEST_PATH is relative to the top level
-# of the package.  Then expand it into build/src parts, since users
-# may create executables in both places.
-AUTOTEST_PATH=`$as_echo "$AUTOTEST_PATH" | sed "s|:|$PATH_SEPARATOR|g"`
-at_path=
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $AUTOTEST_PATH $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    test -n "$at_path" && as_fn_append at_path $PATH_SEPARATOR
-case $as_dir in
-  [\\/]* | ?:[\\/]* )
-    as_fn_append at_path "$as_dir"
-    ;;
-  * )
-    if test -z "$at_top_build_prefix"; then
-      # Stand-alone test suite.
-      as_fn_append at_path "$as_dir"
-    else
-      # Embedded test suite.
-      as_fn_append at_path "$at_top_build_prefix$as_dir$PATH_SEPARATOR"
-      as_fn_append at_path "$at_top_srcdir/$as_dir"
-    fi
-    ;;
-esac
-  done
-IFS=$as_save_IFS
-
-
-# Now build and simplify PATH.
-#
-# There might be directories that don't exist, but don't redirect
-# builtins' (eg., cd) stderr directly: Ultrix's sh hates that.
-at_new_path=
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $at_path
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    test -d "$as_dir" || continue
-case $as_dir in
-  [\\/]* | ?:[\\/]* ) ;;
-  * ) as_dir=`(cd "$as_dir" && pwd) 2>/dev/null` ;;
-esac
-case $PATH_SEPARATOR$at_new_path$PATH_SEPARATOR in
-  *$PATH_SEPARATOR$as_dir$PATH_SEPARATOR*) ;;
-  $PATH_SEPARATOR$PATH_SEPARATOR) at_new_path=$as_dir ;;
-  *) as_fn_append at_new_path "$PATH_SEPARATOR$as_dir" ;;
-esac
-  done
-IFS=$as_save_IFS
-
-PATH=$at_new_path
-export PATH
-
-# Setting up the FDs.
-
-
-
-# 5 is the log file.  Not to be overwritten if `-d'.
-if $at_debug_p; then
-  at_suite_log=/dev/null
-else
-  : >"$at_suite_log"
-fi
-exec 5>>"$at_suite_log"
-
-# Banners and logs.
-$as_echo "## --------------------- ##
-## pocl 0.12 test suite. ##
-## --------------------- ##"
-{
-  $as_echo "## --------------------- ##
-## pocl 0.12 test suite. ##
-## --------------------- ##"
-  echo
-
-  $as_echo "$as_me: command line was:"
-  $as_echo "  \$ $0 $at_cli_args"
-  echo
-
-  # If ChangeLog exists, list a few lines in case it might help determining
-  # the exact version.
-  if test -n "$at_top_srcdir" && test -f "$at_top_srcdir/ChangeLog"; then
-    $as_echo "## ---------- ##
-## ChangeLog. ##
-## ---------- ##"
-    echo
-    sed 's/^/| /;10q' "$at_top_srcdir/ChangeLog"
-    echo
-  fi
-
-  {
-cat <<_ASUNAME
-## --------- ##
-## Platform. ##
-## --------- ##
-
-hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
-uname -m = `(uname -m) 2>/dev/null || echo unknown`
-uname -r = `(uname -r) 2>/dev/null || echo unknown`
-uname -s = `(uname -s) 2>/dev/null || echo unknown`
-uname -v = `(uname -v) 2>/dev/null || echo unknown`
-
-/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
-/bin/uname -X     = `(/bin/uname -X) 2>/dev/null     || echo unknown`
-
-/bin/arch              = `(/bin/arch) 2>/dev/null              || echo unknown`
-/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null       || echo unknown`
-/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
-/usr/bin/hostinfo      = `(/usr/bin/hostinfo) 2>/dev/null      || echo unknown`
-/bin/machine           = `(/bin/machine) 2>/dev/null           || echo unknown`
-/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null       || echo unknown`
-/bin/universe          = `(/bin/universe) 2>/dev/null          || echo unknown`
-
-_ASUNAME
-
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    $as_echo "PATH: $as_dir"
-  done
-IFS=$as_save_IFS
-
-}
-  echo
-
-  # Contents of the config files.
-  for at_file in atconfig atlocal
-  do
-    test -r $at_file || continue
-    $as_echo "$as_me: $at_file:"
-    sed 's/^/| /' $at_file
-    echo
-  done
-} >&5
-
-
-## ------------------------- ##
-## Autotest shell functions. ##
-## ------------------------- ##
-
-# at_fn_banner NUMBER
-# -------------------
-# Output banner NUMBER, provided the testsuite is running multiple groups and
-# this particular banner has not yet been printed.
-at_fn_banner ()
-{
-  $at_print_banners || return 0
-  eval at_banner_text=\$at_banner_text_$1
-  test "x$at_banner_text" = "x " && return 0
-  eval "at_banner_text_$1=\" \""
-  if test -z "$at_banner_text"; then
-    $at_first || echo
-  else
-    $as_echo "$as_nl$at_banner_text$as_nl"
-  fi
-} # at_fn_banner
-
-# at_fn_check_prepare_notrace REASON LINE
-# ---------------------------------------
-# Perform AT_CHECK preparations for the command at LINE for an untraceable
-# command; REASON is the reason for disabling tracing.
-at_fn_check_prepare_notrace ()
-{
-  $at_trace_echo "Not enabling shell tracing (command contains $1)"
-  $as_echo "$2" >"$at_check_line_file"
-  at_check_trace=: at_check_filter=:
-  : >"$at_stdout"; : >"$at_stderr"
-}
-
-# at_fn_check_prepare_trace LINE
-# ------------------------------
-# Perform AT_CHECK preparations for the command at LINE for a traceable
-# command.
-at_fn_check_prepare_trace ()
-{
-  $as_echo "$1" >"$at_check_line_file"
-  at_check_trace=$at_traceon at_check_filter=$at_check_filter_trace
-  : >"$at_stdout"; : >"$at_stderr"
-}
-
-# at_fn_check_prepare_dynamic COMMAND LINE
-# ----------------------------------------
-# Decide if COMMAND at LINE is traceable at runtime, and call the appropriate
-# preparation function.
-at_fn_check_prepare_dynamic ()
-{
-  case $1 in
-    *$as_nl*)
-      at_fn_check_prepare_notrace 'an embedded newline' "$2" ;;
-    *)
-      at_fn_check_prepare_trace "$2" ;;
-  esac
-}
-
-# at_fn_filter_trace
-# ------------------
-# Remove the lines in the file "$at_stderr" generated by "set -x" and print
-# them to stderr.
-at_fn_filter_trace ()
-{
-  mv "$at_stderr" "$at_stder1"
-  grep '^ *+' "$at_stder1" >&2
-  grep -v '^ *+' "$at_stder1" >"$at_stderr"
-}
-
-# at_fn_log_failure FILE-LIST
-# ---------------------------
-# Copy the files in the list on stdout with a "> " prefix, and exit the shell
-# with a failure exit code.
-at_fn_log_failure ()
-{
-  for file
-    do $as_echo "$file:"; sed 's/^/> /' "$file"; done
-  echo 1 > "$at_status_file"
-  exit 1
-}
-
-# at_fn_check_skip EXIT-CODE LINE
-# -------------------------------
-# Check whether EXIT-CODE is a special exit code (77 or 99), and if so exit
-# the test group subshell with that same exit code. Use LINE in any report
-# about test failure.
-at_fn_check_skip ()
-{
-  case $1 in
-    99) echo 99 > "$at_status_file"; at_failed=:
-	$as_echo "$2: hard failure"; exit 99;;
-    77) echo 77 > "$at_status_file"; exit 77;;
-  esac
-}
-
-# at_fn_check_status EXPECTED EXIT-CODE LINE
-# ------------------------------------------
-# Check whether EXIT-CODE is the EXPECTED exit code, and if so do nothing.
-# Otherwise, if it is 77 or 99, exit the test group subshell with that same
-# exit code; if it is anything else print an error message referring to LINE,
-# and fail the test.
-at_fn_check_status ()
-{
-  case $2 in
-    $1 ) ;;
-    77) echo 77 > "$at_status_file"; exit 77;;
-    99) echo 99 > "$at_status_file"; at_failed=:
-	$as_echo "$3: hard failure"; exit 99;;
-    *) $as_echo "$3: exit code was $2, expected $1"
-      at_failed=:;;
-  esac
-}
-
-# at_fn_diff_devnull FILE
-# -----------------------
-# Emit a diff between /dev/null and FILE. Uses "test -s" to avoid useless diff
-# invocations.
-at_fn_diff_devnull ()
-{
-  test -s "$1" || return 0
-  $at_diff "$at_devnull" "$1"
-}
-
-# at_fn_test NUMBER
-# -----------------
-# Parse out test NUMBER from the tail of this file.
-at_fn_test ()
-{
-  eval at_sed=\$at_sed$1
-  sed "$at_sed" "$at_myself" > "$at_test_source"
-}
-
-# at_fn_create_debugging_script
-# -----------------------------
-# Create the debugging script $at_group_dir/run which will reproduce the
-# current test group.
-at_fn_create_debugging_script ()
-{
-  {
-    echo "#! /bin/sh" &&
-    echo 'test "${ZSH_VERSION+set}" = set && alias -g '\''${1+"$@"}'\''='\''"$@"'\''' &&
-    $as_echo "cd '$at_dir'" &&
-    $as_echo "exec \${CONFIG_SHELL-$SHELL} \"$at_myself\" -v -d $at_debug_args $at_group \${1+\"\$@\"}" &&
-    echo 'exit 1'
-  } >"$at_group_dir/run" &&
-  chmod +x "$at_group_dir/run"
-}
-
-## -------------------------------- ##
-## End of autotest shell functions. ##
-## -------------------------------- ##
-{
-  $as_echo "## ---------------- ##
-## Tested programs. ##
-## ---------------- ##"
-  echo
-} >&5
-
-# Report what programs are being tested.
-for at_program in : $at_tested
-do
-  test "$at_program" = : && continue
-  case $at_program in
-    [\\/]* | ?:[\\/]* ) $at_program_=$at_program ;;
-    * )
-    as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    test -f "$as_dir/$at_program" && break
-  done
-IFS=$as_save_IFS
-
-    at_program_=$as_dir/$at_program ;;
-  esac
-  if test -f "$at_program_"; then
-    {
-      $as_echo "$at_srcdir/testsuite.at:26: $at_program_ --version"
-      "$at_program_" --version </dev/null
-      echo
-    } >&5 2>&1
-  else
-    as_fn_error $? "cannot find $at_program" "$LINENO" 5
-  fi
-done
-
-{
-  $as_echo "## ------------------ ##
-## Running the tests. ##
-## ------------------ ##"
-} >&5
-
-at_start_date=`date`
-at_start_time=`date +%s 2>/dev/null`
-$as_echo "$as_me: starting at: $at_start_date" >&5
-
-# Create the master directory if it doesn't already exist.
-as_dir="$at_suite_dir"; as_fn_mkdir_p ||
-  as_fn_error $? "cannot create \`$at_suite_dir'" "$LINENO" 5
-
-# Can we diff with `/dev/null'?  DU 5.0 refuses.
-if diff /dev/null /dev/null >/dev/null 2>&1; then
-  at_devnull=/dev/null
-else
-  at_devnull=$at_suite_dir/devnull
-  >"$at_devnull"
-fi
-
-# Use `diff -u' when possible.
-if at_diff=`diff -u "$at_devnull" "$at_devnull" 2>&1` && test -z "$at_diff"
-then
-  at_diff='diff -u'
-else
-  at_diff=diff
-fi
-
-# Get the last needed group.
-for at_group in : $at_groups; do :; done
-
-# Extract the start and end lines of each test group at the tail
-# of this file
-awk '
-BEGIN { FS="" }
-/^#AT_START_/ {
-  start = NR
-}
-/^#AT_STOP_/ {
-  test = substr ($ 0, 10)
-  print "at_sed" test "=\"1," start "d;" (NR-1) "q\""
-  if (test == "'"$at_group"'") exit
-}' "$at_myself" > "$at_suite_dir/at-source-lines" &&
-. "$at_suite_dir/at-source-lines" ||
-  as_fn_error $? "cannot create test line number cache" "$LINENO" 5
-rm -f "$at_suite_dir/at-source-lines"
-
-# Set number of jobs for `-j'; avoid more jobs than test groups.
-set X $at_groups; shift; at_max_jobs=$#
-if test $at_max_jobs -eq 0; then
-  at_jobs=1
-fi
-if test $at_jobs -ne 1 &&
-   { test $at_jobs -eq 0 || test $at_jobs -gt $at_max_jobs; }; then
-  at_jobs=$at_max_jobs
-fi
-
-# If parallel mode, don't output banners, don't split summary lines.
-if test $at_jobs -ne 1; then
-  at_print_banners=false
-  at_quiet=:
-fi
-
-# Set up helper dirs.
-rm -rf "$at_helper_dir" &&
-mkdir "$at_helper_dir" &&
-cd "$at_helper_dir" &&
-{ test -z "$at_groups" || mkdir $at_groups; } ||
-as_fn_error $? "testsuite directory setup failed" "$LINENO" 5
-
-# Functions for running a test group.  We leave the actual
-# test group execution outside of a shell function in order
-# to avoid hitting zsh 4.x exit status bugs.
-
-# at_fn_group_prepare
-# -------------------
-# Prepare for running a test group.
-at_fn_group_prepare ()
-{
-  # The directory for additional per-group helper files.
-  at_job_dir=$at_helper_dir/$at_group
-  # The file containing the location of the last AT_CHECK.
-  at_check_line_file=$at_job_dir/check-line
-  # The file containing the exit status of the last command.
-  at_status_file=$at_job_dir/status
-  # The files containing the output of the tested commands.
-  at_stdout=$at_job_dir/stdout
-  at_stder1=$at_job_dir/stder1
-  at_stderr=$at_job_dir/stderr
-  # The file containing the code for a test group.
-  at_test_source=$at_job_dir/test-source
-  # The file containing dates.
-  at_times_file=$at_job_dir/times
-
-  # Be sure to come back to the top test directory.
-  cd "$at_suite_dir"
-
-  # Clearly separate the test groups when verbose.
-  $at_first || $at_verbose echo
-
-  at_group_normalized=$at_group
-
-  eval 'while :; do
-    case $at_group_normalized in #(
-    '"$at_format"'*) break;;
-    esac
-    at_group_normalized=0$at_group_normalized
-  done'
-
-
-  # Create a fresh directory for the next test group, and enter.
-  # If one already exists, the user may have invoked ./run from
-  # within that directory; we remove the contents, but not the
-  # directory itself, so that we aren't pulling the rug out from
-  # under the shell's notion of the current directory.
-  at_group_dir=$at_suite_dir/$at_group_normalized
-  at_group_log=$at_group_dir/$as_me.log
-  if test -d "$at_group_dir"; then
-  find "$at_group_dir" -type d ! -perm -700 -exec chmod u+rwx {} \;
-  rm -fr "$at_group_dir"/* "$at_group_dir"/.[!.] "$at_group_dir"/.??*
-fi ||
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: test directory for $at_group_normalized could not be cleaned" >&5
-$as_echo "$as_me: WARNING: test directory for $at_group_normalized could not be cleaned" >&2;}
-  # Be tolerant if the above `rm' was not able to remove the directory.
-  as_dir="$at_group_dir"; as_fn_mkdir_p
-
-  echo 0 > "$at_status_file"
-
-  # In verbose mode, append to the log file *and* show on
-  # the standard output; in quiet mode only write to the log.
-  if test -z "$at_verbose"; then
-    at_tee_pipe='tee -a "$at_group_log"'
-  else
-    at_tee_pipe='cat >> "$at_group_log"'
-  fi
-}
-
-# at_fn_group_banner ORDINAL LINE DESC PAD [BANNER]
-# -------------------------------------------------
-# Declare the test group ORDINAL, located at LINE with group description DESC,
-# and residing under BANNER. Use PAD to align the status column.
-at_fn_group_banner ()
-{
-  at_setup_line="$2"
-  test -n "$5" && at_fn_banner $5
-  at_desc="$3"
-  case $1 in
-    [0-9])      at_desc_line="  $1: ";;
-    [0-9][0-9]) at_desc_line=" $1: " ;;
-    *)          at_desc_line="$1: "  ;;
-  esac
-  as_fn_append at_desc_line "$3$4"
-  $at_quiet $as_echo_n "$at_desc_line"
-  echo "#                             -*- compilation -*-" >> "$at_group_log"
-}
-
-# at_fn_group_postprocess
-# -----------------------
-# Perform cleanup after running a test group.
-at_fn_group_postprocess ()
-{
-  # Be sure to come back to the suite directory, in particular
-  # since below we might `rm' the group directory we are in currently.
-  cd "$at_suite_dir"
-
-  if test ! -f "$at_check_line_file"; then
-    sed "s/^ */$as_me: WARNING: /" <<_ATEOF
-      A failure happened in a test group before any test could be
-      run. This means that test suite is improperly designed.  Please
-      report this failure to <pocl-devel at lists.sourceforge.net>.
-_ATEOF
-    $as_echo "$at_setup_line" >"$at_check_line_file"
-    at_status=99
-  fi
-  $at_verbose $as_echo_n "$at_group. $at_setup_line: "
-  $as_echo_n "$at_group. $at_setup_line: " >> "$at_group_log"
-  case $at_xfail:$at_status in
-    yes:0)
-	at_msg="UNEXPECTED PASS"
-	at_res=xpass
-	at_errexit=$at_errexit_p
-	at_color=$at_red
-	;;
-    no:0)
-	at_msg="ok"
-	at_res=pass
-	at_errexit=false
-	at_color=$at_grn
-	;;
-    *:77)
-	at_msg='skipped ('`cat "$at_check_line_file"`')'
-	at_res=skip
-	at_errexit=false
-	at_color=$at_blu
-	;;
-    no:* | *:99)
-	at_msg='FAILED ('`cat "$at_check_line_file"`')'
-	at_res=fail
-	at_errexit=$at_errexit_p
-	at_color=$at_red
-	;;
-    yes:*)
-	at_msg='expected failure ('`cat "$at_check_line_file"`')'
-	at_res=xfail
-	at_errexit=false
-	at_color=$at_lgn
-	;;
-  esac
-  echo "$at_res" > "$at_job_dir/$at_res"
-  # In parallel mode, output the summary line only afterwards.
-  if test $at_jobs -ne 1 && test -n "$at_verbose"; then
-    $as_echo "$at_desc_line $at_color$at_msg$at_std"
-  else
-    # Make sure there is a separator even with long titles.
-    $as_echo " $at_color$at_msg$at_std"
-  fi
-  at_log_msg="$at_group. $at_desc ($at_setup_line): $at_msg"
-  case $at_status in
-    0|77)
-      # $at_times_file is only available if the group succeeded.
-      # We're not including the group log, so the success message
-      # is written in the global log separately.  But we also
-      # write to the group log in case they're using -d.
-      if test -f "$at_times_file"; then
-	at_log_msg="$at_log_msg     ("`sed 1d "$at_times_file"`')'
-	rm -f "$at_times_file"
-      fi
-      $as_echo "$at_log_msg" >> "$at_group_log"
-      $as_echo "$at_log_msg" >&5
-
-      # Cleanup the group directory, unless the user wants the files
-      # or the success was unexpected.
-      if $at_debug_p || test $at_res = xpass; then
-	at_fn_create_debugging_script
-	if test $at_res = xpass && $at_errexit; then
-	  echo stop > "$at_stop_file"
-	fi
-      else
-	if test -d "$at_group_dir"; then
-	  find "$at_group_dir" -type d ! -perm -700 -exec chmod u+rwx \{\} \;
-	  rm -fr "$at_group_dir"
-	fi
-	rm -f "$at_test_source"
-      fi
-      ;;
-    *)
-      # Upon failure, include the log into the testsuite's global
-      # log.  The failure message is written in the group log.  It
-      # is later included in the global log.
-      $as_echo "$at_log_msg" >> "$at_group_log"
-
-      # Upon failure, keep the group directory for autopsy, and create
-      # the debugging script.  With -e, do not start any further tests.
-      at_fn_create_debugging_script
-      if $at_errexit; then
-	echo stop > "$at_stop_file"
-      fi
-      ;;
-  esac
-}
-
-
-## ------------ ##
-## Driver loop. ##
-## ------------ ##
-
-
-if (set -m && set +m && set +b) >/dev/null 2>&1; then
-  set +b
-  at_job_control_on='set -m' at_job_control_off='set +m' at_job_group=-
-else
-  at_job_control_on=: at_job_control_off=: at_job_group=
-fi
-
-for at_signal in 1 2 15; do
-  trap 'set +x; set +e
-	$at_job_control_off
-	at_signal='"$at_signal"'
-	echo stop > "$at_stop_file"
-	trap "" $at_signal
-	at_pgids=
-	for at_pgid in `jobs -p 2>/dev/null`; do
-	  at_pgids="$at_pgids $at_job_group$at_pgid"
-	done
-	test -z "$at_pgids" || kill -$at_signal $at_pgids 2>/dev/null
-	wait
-	if test "$at_jobs" -eq 1 || test -z "$at_verbose"; then
-	  echo >&2
-	fi
-	at_signame=`kill -l $at_signal 2>&1 || echo $at_signal`
-	set x $at_signame
-	test 1 -gt 2 && at_signame=$at_signal
-	{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: caught signal $at_signame, bailing out" >&5
-$as_echo "$as_me: WARNING: caught signal $at_signame, bailing out" >&2;}
-	as_fn_arith 128 + $at_signal && exit_status=$as_val
-	as_fn_exit $exit_status' $at_signal
-done
-
-rm -f "$at_stop_file"
-at_first=:
-
-if test $at_jobs -ne 1 &&
-     rm -f "$at_job_fifo" &&
-     test -n "$at_job_group" &&
-     ( mkfifo "$at_job_fifo" && trap 'exit 1' PIPE STOP TSTP ) 2>/dev/null
-then
-  # FIFO job dispatcher.
-
-  trap 'at_pids=
-	for at_pid in `jobs -p`; do
-	  at_pids="$at_pids $at_job_group$at_pid"
-	done
-	if test -n "$at_pids"; then
-	  at_sig=TSTP
-	  test "${TMOUT+set}" = set && at_sig=STOP
-	  kill -$at_sig $at_pids 2>/dev/null
-	fi
-	kill -STOP $$
-	test -z "$at_pids" || kill -CONT $at_pids 2>/dev/null' TSTP
-
-  echo
-  # Turn jobs into a list of numbers, starting from 1.
-  at_joblist=`$as_echo "$at_groups" | sed -n 1,${at_jobs}p`
-
-  set X $at_joblist
-  shift
-  for at_group in $at_groups; do
-    $at_job_control_on 2>/dev/null
-    (
-      # Start one test group.
-      $at_job_control_off
-      if $at_first; then
-	exec 7>"$at_job_fifo"
-      else
-	exec 6<&-
-      fi
-      trap 'set +x; set +e
-	    trap "" PIPE
-	    echo stop > "$at_stop_file"
-	    echo >&7
-	    as_fn_exit 141' PIPE
-      at_fn_group_prepare
-      if cd "$at_group_dir" &&
-	 at_fn_test $at_group &&
-	 . "$at_test_source"
-      then :; else
-	{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unable to parse test group: $at_group" >&5
-$as_echo "$as_me: WARNING: unable to parse test group: $at_group" >&2;}
-	at_failed=:
-      fi
-      at_fn_group_postprocess
-      echo >&7
-    ) &
-    $at_job_control_off
-    if $at_first; then
-      at_first=false
-      exec 6<"$at_job_fifo" 7>"$at_job_fifo"
-    fi
-    shift # Consume one token.
-    if test $# -gt 0; then :; else
-      read at_token <&6 || break
-      set x $*
-    fi
-    test -f "$at_stop_file" && break
-  done
-  exec 7>&-
-  # Read back the remaining ($at_jobs - 1) tokens.
-  set X $at_joblist
-  shift
-  if test $# -gt 0; then
-    shift
-    for at_job
-    do
-      read at_token
-    done <&6
-  fi
-  exec 6<&-
-  wait
-else
-  # Run serially, avoid forks and other potential surprises.
-  for at_group in $at_groups; do
-    at_fn_group_prepare
-    if cd "$at_group_dir" &&
-       at_fn_test $at_group &&
-       . "$at_test_source"; then :; else
-      { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unable to parse test group: $at_group" >&5
-$as_echo "$as_me: WARNING: unable to parse test group: $at_group" >&2;}
-      at_failed=:
-    fi
-    at_fn_group_postprocess
-    test -f "$at_stop_file" && break
-    at_first=false
-  done
-fi
-
-# Wrap up the test suite with summary statistics.
-cd "$at_helper_dir"
-
-# Use ?..???? when the list must remain sorted, the faster * otherwise.
-at_pass_list=`for f in */pass; do echo $f; done | sed '/\*/d; s,/pass,,'`
-at_skip_list=`for f in */skip; do echo $f; done | sed '/\*/d; s,/skip,,'`
-at_xfail_list=`for f in */xfail; do echo $f; done | sed '/\*/d; s,/xfail,,'`
-at_xpass_list=`for f in ?/xpass ??/xpass ???/xpass ????/xpass; do
-		 echo $f; done | sed '/?/d; s,/xpass,,'`
-at_fail_list=`for f in ?/fail ??/fail ???/fail ????/fail; do
-		echo $f; done | sed '/?/d; s,/fail,,'`
-
-set X $at_pass_list $at_xpass_list $at_xfail_list $at_fail_list $at_skip_list
-shift; at_group_count=$#
-set X $at_xpass_list; shift; at_xpass_count=$#; at_xpass_list=$*
-set X $at_xfail_list; shift; at_xfail_count=$#
-set X $at_fail_list; shift; at_fail_count=$#; at_fail_list=$*
-set X $at_skip_list; shift; at_skip_count=$#
-
-as_fn_arith $at_group_count - $at_skip_count && at_run_count=$as_val
-as_fn_arith $at_xpass_count + $at_fail_count && at_unexpected_count=$as_val
-as_fn_arith $at_xfail_count + $at_fail_count && at_total_fail_count=$as_val
-
-# Back to the top directory.
-cd "$at_dir"
-rm -rf "$at_helper_dir"
-
-# Compute the duration of the suite.
-at_stop_date=`date`
-at_stop_time=`date +%s 2>/dev/null`
-$as_echo "$as_me: ending at: $at_stop_date" >&5
-case $at_start_time,$at_stop_time in
-  [0-9]*,[0-9]*)
-    as_fn_arith $at_stop_time - $at_start_time && at_duration_s=$as_val
-    as_fn_arith $at_duration_s / 60 && at_duration_m=$as_val
-    as_fn_arith $at_duration_m / 60 && at_duration_h=$as_val
-    as_fn_arith $at_duration_s % 60 && at_duration_s=$as_val
-    as_fn_arith $at_duration_m % 60 && at_duration_m=$as_val
-    at_duration="${at_duration_h}h ${at_duration_m}m ${at_duration_s}s"
-    $as_echo "$as_me: test suite duration: $at_duration" >&5
-    ;;
-esac
-
-echo
-$as_echo "## ------------- ##
-## Test results. ##
-## ------------- ##"
-echo
-{
-  echo
-  $as_echo "## ------------- ##
-## Test results. ##
-## ------------- ##"
-  echo
-} >&5
-
-if test $at_run_count = 1; then
-  at_result="1 test"
-  at_were=was
-else
-  at_result="$at_run_count tests"
-  at_were=were
-fi
-if $at_errexit_p && test $at_unexpected_count != 0; then
-  if test $at_xpass_count = 1; then
-    at_result="$at_result $at_were run, one passed"
-  else
-    at_result="$at_result $at_were run, one failed"
-  fi
-  at_result="$at_result unexpectedly and inhibited subsequent tests."
-  at_color=$at_red
-else
-  # Don't you just love exponential explosion of the number of cases?
-  at_color=$at_red
-  case $at_xpass_count:$at_fail_count:$at_xfail_count in
-    # So far, so good.
-    0:0:0) at_result="$at_result $at_were successful." at_color=$at_grn ;;
-    0:0:*) at_result="$at_result behaved as expected." at_color=$at_lgn ;;
-
-    # Some unexpected failures
-    0:*:0) at_result="$at_result $at_were run,
-$at_fail_count failed unexpectedly." ;;
-
-    # Some failures, both expected and unexpected
-    0:*:1) at_result="$at_result $at_were run,
-$at_total_fail_count failed ($at_xfail_count expected failure)." ;;
-    0:*:*) at_result="$at_result $at_were run,
-$at_total_fail_count failed ($at_xfail_count expected failures)." ;;
-
-    # No unexpected failures, but some xpasses
-    *:0:*) at_result="$at_result $at_were run,
-$at_xpass_count passed unexpectedly." ;;
-
-    # No expected failures, but failures and xpasses
-    *:1:0) at_result="$at_result $at_were run,
-$at_unexpected_count did not behave as expected ($at_fail_count unexpected failure)." ;;
-    *:*:0) at_result="$at_result $at_were run,
-$at_unexpected_count did not behave as expected ($at_fail_count unexpected failures)." ;;
-
-    # All of them.
-    *:*:1) at_result="$at_result $at_were run,
-$at_xpass_count passed unexpectedly,
-$at_total_fail_count failed ($at_xfail_count expected failure)." ;;
-    *:*:*) at_result="$at_result $at_were run,
-$at_xpass_count passed unexpectedly,
-$at_total_fail_count failed ($at_xfail_count expected failures)." ;;
-  esac
-
-  if test $at_skip_count = 0 && test $at_run_count -gt 1; then
-    at_result="All $at_result"
-  fi
-fi
-
-# Now put skips in the mix.
-case $at_skip_count in
-  0) ;;
-  1) at_result="$at_result
-1 test was skipped." ;;
-  *) at_result="$at_result
-$at_skip_count tests were skipped." ;;
-esac
-
-if test $at_unexpected_count = 0; then
-  echo "$at_color$at_result$at_std"
-  echo "$at_result" >&5
-else
-  echo "${at_color}ERROR: $at_result$at_std" >&2
-  echo "ERROR: $at_result" >&5
-  {
-    echo
-    $as_echo "## ------------------------ ##
-## Summary of the failures. ##
-## ------------------------ ##"
-
-    # Summary of failed and skipped tests.
-    if test $at_fail_count != 0; then
-      echo "Failed tests:"
-      $SHELL "$at_myself" $at_fail_list --list
-      echo
-    fi
-    if test $at_skip_count != 0; then
-      echo "Skipped tests:"
-      $SHELL "$at_myself" $at_skip_list --list
-      echo
-    fi
-    if test $at_xpass_count != 0; then
-      echo "Unexpected passes:"
-      $SHELL "$at_myself" $at_xpass_list --list
-      echo
-    fi
-    if test $at_fail_count != 0; then
-      $as_echo "## ---------------------- ##
-## Detailed failed tests. ##
-## ---------------------- ##"
-      echo
-      for at_group in $at_fail_list
-      do
-	at_group_normalized=$at_group
-
-  eval 'while :; do
-    case $at_group_normalized in #(
-    '"$at_format"'*) break;;
-    esac
-    at_group_normalized=0$at_group_normalized
-  done'
-
-	cat "$at_suite_dir/$at_group_normalized/$as_me.log"
-	echo
-      done
-      echo
-    fi
-    if test -n "$at_top_srcdir"; then
-      sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
-## ${at_top_build_prefix}config.log ##
-_ASBOX
-      sed 's/^/| /' ${at_top_build_prefix}config.log
-      echo
-    fi
-  } >&5
-
-  sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
-## $as_me.log was created. ##
-_ASBOX
-
-  echo
-  if $at_debug_p; then
-    at_msg='per-test log files'
-  else
-    at_msg="\`${at_testdir+${at_testdir}/}$as_me.log'"
-  fi
-  $as_echo "Please send $at_msg and all information you think might help:
-
-   To: <pocl-devel at lists.sourceforge.net>
-   Subject: [pocl 0.12] $as_me: $at_fail_list${at_fail_list:+ failed${at_xpass_list:+, }}$at_xpass_list${at_xpass_list:+ passed unexpectedly}
-
-You may investigate any problem if you feel able to do so, in which
-case the test suite provides a good starting point.  Its output may
-be found below \`${at_testdir+${at_testdir}/}$as_me.dir'.
-"
-  exit 1
-fi
-
-exit 0
-
-## ------------- ##
-## Actual tests. ##
-## ------------- ##
-#AT_START_1
-at_fn_group_banner 1 'testsuite.at:29' \
-  "check for pocl version" "                         "
-at_xfail=no
-(
-  $as_echo "1. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:30: POCL_DEVICES=basic \$abs_top_builddir/tests/runtime/test_version"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic $abs_top_builddir/tests/runtime/test_version" "testsuite.at:30"
-( $at_check_trace; POCL_DEVICES=basic $abs_top_builddir/tests/runtime/test_version
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "basic
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:30"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_1
-#AT_START_2
-at_fn_group_banner 2 'testsuite.at:37' \
-  "example1: dot product" "                          " 1
-at_xfail=no
-(
-  $as_echo "2. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:39: \$abs_top_builddir/examples/example1/example1"
-at_fn_check_prepare_dynamic "$abs_top_builddir/examples/example1/example1" "testsuite.at:39"
-( $at_check_trace; $abs_top_builddir/examples/example1/example1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/example_expout.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:39"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_2
-#AT_START_3
-at_fn_group_banner 3 'testsuite.at:44' \
-  "example1: dot product (SPIR64)" "                 " 1
-at_xfail=no
-(
-  $as_echo "3. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# This SPIR example works because it does not use local memory nor
-# call builtins that are mangled with address spaces.
-$as_echo "testsuite.at:47" >"$at_check_line_file"
-(! grep "#define POCL_DEVICE_ADDRESS_BITS 64" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite.at:47"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:49: \$abs_top_builddir/examples/example1-spir64/example1-spir"
-at_fn_check_prepare_dynamic "$abs_top_builddir/examples/example1-spir64/example1-spir" "testsuite.at:49"
-( $at_check_trace; $abs_top_builddir/examples/example1-spir64/example1-spir
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/example_expout.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:49"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_3
-#AT_START_4
-at_fn_group_banner 4 'testsuite.at:54' \
-  "example1: dot product (SPIR32)" "                 " 1
-at_xfail=no
-(
-  $as_echo "4. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# This SPIR example works because it does not use local memory nor
-# call builtins that are mangled with address spaces.
-$as_echo "testsuite.at:57" >"$at_check_line_file"
-(! grep "#define POCL_DEVICE_ADDRESS_BITS 32" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite.at:57"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:59: \$abs_top_builddir/examples/example1-spir32/example1-spir32"
-at_fn_check_prepare_dynamic "$abs_top_builddir/examples/example1-spir32/example1-spir32" "testsuite.at:59"
-( $at_check_trace; $abs_top_builddir/examples/example1-spir32/example1-spir32
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/example_expout.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:59"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_4
-#AT_START_5
-at_fn_group_banner 5 'testsuite.at:64' \
-  "example2: matrix transpose" "                     " 1
-at_xfail=no
-(
-  $as_echo "5. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:66: \$abs_top_builddir/examples/example2/example2"
-at_fn_check_prepare_dynamic "$abs_top_builddir/examples/example2/example2" "testsuite.at:66"
-( $at_check_trace; $abs_top_builddir/examples/example2/example2
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:66"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_5
-#AT_START_6
-at_fn_group_banner 6 'testsuite.at:70' \
-  "example2a: matrix transpose (automatic locals)" " " 1
-at_xfail=no
-(
-  $as_echo "6. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:72: \$abs_top_builddir/examples/example2a/example2a"
-at_fn_check_prepare_dynamic "$abs_top_builddir/examples/example2a/example2a" "testsuite.at:72"
-( $at_check_trace; $abs_top_builddir/examples/example2a/example2a
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:72"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_6
-#AT_START_7
-at_fn_group_banner 7 'testsuite.at:78' \
-  "Kernel functions convert_char*" "                 " 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&
-             grep -q "#define LLVM_3_2" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "7. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-cat >expout <<'_ATEOF'
-Running test test_short16...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:86: \$abs_top_builddir/tests/kernel/kernel test_short16"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_short16" "testsuite.at:86"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_short16
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:86"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_7
-#AT_START_8
-at_fn_group_banner 8 'testsuite.at:89' \
-  "Kernel functions printf" "                        " 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q i686 && at_xfail=yes
-      egrep -q "#define LLVM_3_2|#define LLVM_3_3" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "8. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# On 32-bit x86, accessing int4 via va_arg segfaults (wrong alignment?)
-
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:94: \$abs_top_builddir/tests/kernel/kernel test_printf"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_printf" "testsuite.at:94"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_printf
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/test_printf_expout.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:94"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_8
-#AT_START_9
-at_fn_group_banner 9 'testsuite.at:99' \
-  "Kernel functions as_type" "                       " 2
-at_xfail=no
-(
-  $as_echo "9. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-$as_echo "testsuite.at:100" >"$at_check_line_file"
-(grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite.at:100"
-
-cat >expout <<'_ATEOF'
-Running test test_as_type...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:106: \$abs_top_builddir/tests/kernel/kernel test_as_type"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_as_type" "testsuite.at:106"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_as_type
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:106"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_9
-#AT_START_10
-at_fn_group_banner 10 'testsuite.at:109' \
-  "Kernel functions convert_type - scalars" "        " 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64 && at_xfail=yes
-(
-  $as_echo "10. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# ppc codegen issue, see bug #26
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:113: \$abs_top_builddir/tests/kernel/kernel test_convert_type_1"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_convert_type_1" "testsuite.at:113"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_convert_type_1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Running test test_convert_type_1...
-OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:113"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_10
-#AT_START_11
-at_fn_group_banner 11 'testsuite.at:118' \
-  "Kernel functions convert_type - vector of 2" "    " 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64 && at_xfail=yes
-(
-  $as_echo "11. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# ppc codegen issue, see bug #26
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:122: \$abs_top_builddir/tests/kernel/kernel test_convert_type_2"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_convert_type_2" "testsuite.at:122"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_convert_type_2
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Running test test_convert_type_2...
-OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:122"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_11
-#AT_START_12
-at_fn_group_banner 12 'testsuite.at:127' \
-  "Kernel functions convert_type - vector of 4" "    " 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64 && at_xfail=yes
-(
-  $as_echo "12. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# ppc codegen issue, see bug #26
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:131: \$abs_top_builddir/tests/kernel/kernel test_convert_type_4"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_convert_type_4" "testsuite.at:131"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_convert_type_4
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Running test test_convert_type_4...
-OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:131"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_12
-#AT_START_13
-at_fn_group_banner 13 'testsuite.at:136' \
-  "Kernel functions convert_type - vector of 8" "    " 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64 && at_xfail=yes
-(
-  $as_echo "13. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# ppc codegen issue, see bug #26
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:140: \$abs_top_builddir/tests/kernel/kernel test_convert_type_8"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_convert_type_8" "testsuite.at:140"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_convert_type_8
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Running test test_convert_type_8...
-OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:140"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_13
-#AT_START_14
-at_fn_group_banner 14 'testsuite.at:145' \
-  "Kernel functions convert_type - vector of 16" "   " 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64 && at_xfail=yes
-(
-  $as_echo "14. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# ppc codegen issue, see bug #26
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:149: \$abs_top_builddir/tests/kernel/kernel test_convert_type_16"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_convert_type_16" "testsuite.at:149"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_convert_type_16
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Running test test_convert_type_16...
-OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:149"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_14
-#AT_START_15
-at_fn_group_banner 15 'testsuite.at:156' \
-  "Kernel functions min and max when the operands are of different sign" "" 2
-at_xfail=no
-(
-  $as_echo "15. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# This used to produces an LLVM 3.3 bug that appeared only with Intel CPUs
-# without SSE4.2. http://llvm.org/bugs/show_bug.cgi?id=15977
-
-cat >expout <<'_ATEOF'
-Running test test_min_max...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:164: \$abs_top_builddir/tests/kernel/kernel test_min_max"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_min_max" "testsuite.at:164"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_min_max
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:164"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_15
-#AT_START_16
-at_fn_group_banner 16 'testsuite.at:167' \
-  "Kernel functions length, distance, and normalize" "" 2
-at_xfail=no
-(
-  $as_echo "16. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# This tests in particular for unintended overflow
-
-cat >expout <<'_ATEOF'
-Running test test_length_distance...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:174: \$abs_top_builddir/tests/kernel/kernel test_length_distance"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_length_distance" "testsuite.at:174"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_length_distance
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:174"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_16
-#AT_START_17
-at_fn_group_banner 17 'testsuite.at:177' \
-  "Kernel functions fmin, fmax, fma" "               " 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc  && at_xfail=yes
-(
-  $as_echo "17. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-cat >expout <<'_ATEOF'
-Running test test_fmin_fmax_fma...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:184: \$abs_top_builddir/tests/kernel/kernel test_fmin_fmax_fma"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_fmin_fmax_fma" "testsuite.at:184"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_fmin_fmax_fma
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:184"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_17
-#AT_START_18
-at_fn_group_banner 18 'testsuite.at:187' \
-  "Kernel functions frexp modf" "                    " 2
-at_xfail=no
-      grep -q "#undef USE_VECMATHLIB" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "18. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-Running test test_frexp_modf...
-frexp(8e2f): 0.8 10
-modf(1.5f): 0.5 1.000000
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:195: \$abs_top_builddir/tests/kernel/kernel test_frexp_modf"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_frexp_modf" "testsuite.at:195"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_frexp_modf
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:195"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_18
-#AT_START_19
-at_fn_group_banner 19 'testsuite.at:200' \
-  "A saturating conversion from long to uint" "      " 2
-at_xfail=no
-(
-  $as_echo "19. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-Running test test_convert_sat_regression...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:206: \$abs_top_builddir/tests/kernel/kernel test_convert_sat_regression"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_convert_sat_regression" "testsuite.at:206"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_convert_sat_regression
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:206"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_19
-#AT_START_20
-at_fn_group_banner 20 'testsuite.at:210' \
-  "Kernel functions abs bitselect clz max min popcount" "" 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc && at_xfail=yes
-(
-  $as_echo "20. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-#Fails on tce due to bug #1160383
-#AT_KEYWORDS([tce])
-
-cat >expout <<'_ATEOF'
-Running test test_bitselect...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:218: \$abs_top_builddir/tests/kernel/kernel test_bitselect"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_bitselect" "testsuite.at:218"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_bitselect
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:218"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_20
-#AT_START_21
-at_fn_group_banner 21 'testsuite.at:221' \
-  "Kernel functions fabs signbit isfinite isinf isnan isnormal copysign ilogb ldexp" "" 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64 && at_xfail=yes
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&
-             grep -q "#define LLVM_3_2" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "21. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-#Fails with what looks like rounding with vecmathlib on ppc32. ppc64 uninvestigated
-
-
-
-cat >expout <<'_ATEOF'
-Running test test_fabs...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:231: \$abs_top_builddir/tests/kernel/kernel test_fabs"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_fabs" "testsuite.at:231"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_fabs
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:231"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_21
-#AT_START_22
-at_fn_group_banner 22 'testsuite.at:234' \
-  "Kernel functions abs abs_diff add_sat hadd mad_hi mad_sat mul_hi rhadd sub_sat (loopvec)" "" 2
-at_xfail=no
-      grep OCL_KERNEL_TARGET $abs_top_builddir/config.h | cut -d\" -f2 | grep -q x86_64 &&
-             grep -q "#define LLVM_3_5" $abs_top_builddir/config.h && at_xfail=yes
-      grep OCL_KERNEL_TARGET $abs_top_builddir/config.h | cut -d\" -f2 | grep -q x86_64 &&
-             grep -q "#define LLVM_3_6" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "22. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# 3-element vector cases fail when vectorizer is enabled,
-# at least with Intel Core i5 and AMD FX8. Assume it fails on all others too.
-
-
-cat >expout <<'_ATEOF'
-Running test test_hadd...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:245: POCL_WORK_GROUP_METHOD=loopvec \$abs_top_builddir/tests/kernel/kernel test_hadd 2>&1"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=loopvec $abs_top_builddir/tests/kernel/kernel test_hadd 2>&1" "testsuite.at:245"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loopvec $abs_top_builddir/tests/kernel/kernel test_hadd 2>&1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:245"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_22
-#AT_START_23
-at_fn_group_banner 23 'testsuite.at:248' \
-  "Kernel functions abs abs_diff add_sat hadd mad_hi mad_sat mul_hi rhadd sub_sat (loops)" "" 2
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc && at_xfail=yes
-(
-  $as_echo "23. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-Running test test_hadd...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:254: POCL_WORK_GROUP_METHOD=loops \$abs_top_builddir/tests/kernel/kernel test_hadd 2>&1"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/kernel/kernel test_hadd 2>&1" "testsuite.at:254"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/kernel/kernel test_hadd 2>&1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:254"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_23
-#AT_START_24
-at_fn_group_banner 24 'testsuite.at:257' \
-  "Kernel functions << >> rotate" "                  " 2
-at_xfail=no
-      grep -q "#define LLVM_3_2" $abs_top_builddir/config.h && at_xfail=yes
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&
-             grep -q "#define LLVM_3_3" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "24. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-# Fails because of bugs in rotate.
-# At least in Debian 6.0/x86_64/LLVM 3.2 and
-# Ubuntu 12.04/x86/LLVM 3.2
-
-# tce fails currently this test so removed it from the keywords.
-# https://bugs.launchpad.net/tce/+bug/1180309
-#LLVM 3.3 regression on ppc - looks like an optimization+ppc codegen bug
-
-
-cat >expout <<'_ATEOF'
-Running test test_rotate...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:272: \$abs_top_builddir/tests/kernel/kernel test_rotate"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_rotate" "testsuite.at:272"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_rotate
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:272"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_24
-#AT_START_25
-at_fn_group_banner 25 'testsuite.at:275' \
-  "Trigonometric functions" "                        " 2
-at_xfail=no
-(
-  $as_echo "25. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:279: \$abs_top_builddir/examples/trig/trig"
-at_fn_check_prepare_dynamic "$abs_top_builddir/examples/trig/trig" "testsuite.at:279"
-( $at_check_trace; $abs_top_builddir/examples/trig/trig
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:279"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_25
-#AT_START_26
-at_fn_group_banner 26 'testsuite.at:282' \
-  "Sampler address clamp" "                          " 2
-at_xfail=no
-(
-  $as_echo "26. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-cat >expout <<'_ATEOF'
-Running test test_sampler_address_clamp...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:287: \$abs_top_builddir/tests/kernel/sampler_address_clamp"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/sampler_address_clamp" "testsuite.at:287"
-( $at_check_trace; $abs_top_builddir/tests/kernel/sampler_address_clamp
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:287"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_26
-#AT_START_27
-at_fn_group_banner 27 'testsuite.at:290' \
-  "Image query functions" "                          " 2
-at_xfail=no
-(
-  $as_echo "27. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-cat >expout <<'_ATEOF'
-Running test test_image_query_funcs...
-OK
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:295: \$abs_top_builddir/tests/kernel/image_query_funcs"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/image_query_funcs" "testsuite.at:295"
-( $at_check_trace; $abs_top_builddir/tests/kernel/image_query_funcs
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:295"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_27
-#AT_START_28
-at_fn_group_banner 28 'testsuite.at:298' \
-  "Kernel functions: shuffle charN" "                " 2
-at_xfail=no
-(
-  $as_echo "28. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:303: \$abs_top_builddir/tests/kernel/test_shuffle char"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle char" "testsuite.at:303"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle char
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:303"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_28
-#AT_START_29
-at_fn_group_banner 29 'testsuite.at:306' \
-  "Kernel functions: shuffle shortN" "               " 2
-at_xfail=no
-(
-  $as_echo "29. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:311: \$abs_top_builddir/tests/kernel/test_shuffle short"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle short" "testsuite.at:311"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle short
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:311"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_29
-#AT_START_30
-at_fn_group_banner 30 'testsuite.at:314' \
-  "Kernel functions: shuffle ushortN" "              " 2
-at_xfail=no
-(
-  $as_echo "30. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:319: \$abs_top_builddir/tests/kernel/test_shuffle ushort"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle ushort" "testsuite.at:319"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle ushort
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:319"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_30
-#AT_START_31
-at_fn_group_banner 31 'testsuite.at:322' \
-  "Kernel functions: shuffle halfN" "                " 2
-at_xfail=no
-      grep -q "#define LLVM_3_3" $abs_top_builddir/config.h ||
-             grep -q "#define LLVM_3_4" $abs_top_builddir/config.h  && at_xfail=yes
-(
-  $as_echo "31. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-$as_echo "testsuite.at:327" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite.at:327"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:330: \$abs_top_builddir/tests/kernel/test_shuffle half"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle half" "testsuite.at:330"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle half
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:330"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_31
-#AT_START_32
-at_fn_group_banner 32 'testsuite.at:333' \
-  "Kernel functions: shuffle intN" "                 " 2
-at_xfail=no
-(
-  $as_echo "32. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:338: \$abs_top_builddir/tests/kernel/test_shuffle int"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle int" "testsuite.at:338"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle int
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:338"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_32
-#AT_START_33
-at_fn_group_banner 33 'testsuite.at:341' \
-  "Kernel functions: shuffle uintN" "                " 2
-at_xfail=no
-(
-  $as_echo "33. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:346: \$abs_top_builddir/tests/kernel/test_shuffle uint"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle uint" "testsuite.at:346"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle uint
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:346"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_33
-#AT_START_34
-at_fn_group_banner 34 'testsuite.at:349' \
-  "Kernel functions: shuffle floatN" "               " 2
-at_xfail=no
-(
-  $as_echo "34. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:354: \$abs_top_builddir/tests/kernel/test_shuffle float"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle float" "testsuite.at:354"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle float
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:354"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_34
-#AT_START_35
-at_fn_group_banner 35 'testsuite.at:357' \
-  "Kernel functions: shuffle longN" "                " 2
-at_xfail=no
-(
-  $as_echo "35. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:362: \$abs_top_builddir/tests/kernel/test_shuffle long"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle long" "testsuite.at:362"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle long
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:362"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_35
-#AT_START_36
-at_fn_group_banner 36 'testsuite.at:365' \
-  "Kernel functions: shuffle ulongN" "               " 2
-at_xfail=no
-(
-  $as_echo "36. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:370: \$abs_top_builddir/tests/kernel/test_shuffle ulong"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle ulong" "testsuite.at:370"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle ulong
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:370"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_36
-#AT_START_37
-at_fn_group_banner 37 'testsuite.at:373' \
-  "Kernel functions: shuffle doubleN" "              " 2
-at_xfail=no
-(
-  $as_echo "37. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-OK
-_ATEOF
-]
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:378: \$abs_top_builddir/tests/kernel/test_shuffle double"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/test_shuffle double" "testsuite.at:378"
-( $at_check_trace; $abs_top_builddir/tests/kernel/test_shuffle double
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:378"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_37
-#AT_START_38
-at_fn_group_banner 38 'testsuite.at:383' \
-  "Scalar wave equation" "                           " 3
-at_xfail=no
-(
-  $as_echo "38. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-#this is a check for if doubles are available
-$as_echo "testsuite.at:386" >"$at_check_line_file"
-(grep DISABLE_LONG $abs_top_builddir/Makefile) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite.at:386"
-{ set +x
-$as_echo "$at_srcdir/testsuite.at:387: \$abs_top_builddir/examples/scalarwave/scalarwave"
-at_fn_check_prepare_dynamic "$abs_top_builddir/examples/scalarwave/scalarwave" "testsuite.at:387"
-( $at_check_trace; $abs_top_builddir/examples/scalarwave/scalarwave
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/scalarwave_expout.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite.at:387"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_38
-#AT_START_39
-at_fn_group_banner 39 'testsuite-workgroup.at:28' \
-  "unconditional barriers (full replication)" "      " 4
-at_xfail=no
-(
-  $as_echo "39. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:30: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/workgroup/run_kernel basic_barriers.cl 2 2 2 2"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel basic_barriers.cl 2 2 2 2" "testsuite-workgroup.at:30"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel basic_barriers.cl 2 2 2 2
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/basic_barriers_2_2_2_2.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:30"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_39
-#AT_START_40
-at_fn_group_banner 40 'testsuite-workgroup.at:35' \
-  "unconditional barriers (loops)" "                 " 4
-at_xfail=no
-(
-  $as_echo "40. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:37: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/workgroup/run_kernel basic_barriers.cl 2 2 2 2"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel basic_barriers.cl 2 2 2 2" "testsuite-workgroup.at:37"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel basic_barriers.cl 2 2 2 2
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/basic_barriers_2_2_2_2.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:37"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_40
-#AT_START_41
-at_fn_group_banner 41 'testsuite-workgroup.at:42' \
-  "unbarriered for loops (full replication)" "       " 4
-at_xfail=no
-(
-  $as_echo "41. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:44: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/workgroup/run_kernel forloops.cl 2 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel forloops.cl 2 2 1 1" "testsuite-workgroup.at:44"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel forloops.cl 2 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/forloops_2_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:44"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_41
-#AT_START_42
-at_fn_group_banner 42 'testsuite-workgroup.at:49' \
-  "unbarriered for loops (loops)" "                  " 4
-at_xfail=no
-(
-  $as_echo "42. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:51: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/workgroup/run_kernel forloops.cl 2 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel forloops.cl 2 2 1 1" "testsuite-workgroup.at:51"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel forloops.cl 2 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/forloops_2_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:51"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_42
-#AT_START_43
-at_fn_group_banner 43 'testsuite-workgroup.at:56' \
-  "barriered for loops (full replication)" "         " 4
-at_xfail=no
-(
-  $as_echo "43. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:58: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/workgroup/run_kernel loopbarriers.cl 2 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel loopbarriers.cl 2 2 1 1" "testsuite-workgroup.at:58"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel loopbarriers.cl 2 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/loopbarriers_2_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:58"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_43
-#AT_START_44
-at_fn_group_banner 44 'testsuite-workgroup.at:63' \
-  "barriered for loops (loops)" "                    " 4
-at_xfail=no
-(
-  $as_echo "44. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:65: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/workgroup/run_kernel loopbarriers.cl 2 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel loopbarriers.cl 2 2 1 1" "testsuite-workgroup.at:65"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel loopbarriers.cl 2 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/loopbarriers_2_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:65"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_44
-#AT_START_45
-at_fn_group_banner 45 'testsuite-workgroup.at:70' \
-  "conditional barrier (full replication)" "         " 4
-at_xfail=no
-(
-  $as_echo "45. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:72: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/workgroup/run_kernel conditional_barriers.cl 1 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel conditional_barriers.cl 1 2 1 1" "testsuite-workgroup.at:72"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel conditional_barriers.cl 1 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/cond_barriers_1_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:72"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_45
-#AT_START_46
-at_fn_group_banner 46 'testsuite-workgroup.at:77' \
-  "conditional barrier (loops)" "                    " 4
-at_xfail=no
-(
-  $as_echo "46. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:79: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/workgroup/run_kernel conditional_barriers.cl 1 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel conditional_barriers.cl 1 2 1 1" "testsuite-workgroup.at:79"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel conditional_barriers.cl 1 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/cond_barriers_1_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:79"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_46
-#AT_START_47
-at_fn_group_banner 47 'testsuite-workgroup.at:84' \
-  "b-loop with none of the WIs reaching the barrier (full replication)" "" 4
-at_xfail=no
-(
-  $as_echo "47. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:86: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/workgroup/run_kernel tricky_for.cl 1 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel tricky_for.cl 1 2 1 1" "testsuite-workgroup.at:86"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel tricky_for.cl 1 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/tricky_for_1_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:86"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_47
-#AT_START_48
-at_fn_group_banner 48 'testsuite-workgroup.at:91' \
-  "b-loop with none of the WIs reaching the barrier (loops)" "" 4
-at_xfail=no
-(
-  $as_echo "48. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:93: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/workgroup/run_kernel tricky_for.cl 1 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel tricky_for.cl 1 2 1 1" "testsuite-workgroup.at:93"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel tricky_for.cl 1 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/tricky_for_1_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:93"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_48
-#AT_START_49
-at_fn_group_banner 49 'testsuite-workgroup.at:98' \
-  "forcing horizontal parallelization to some outer loops (repl)" "" 4
-at_xfail=no
-(
-  $as_echo "49. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:100: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/workgroup/run_kernel outerlooppar.cl 2 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel outerlooppar.cl 2 2 1 1" "testsuite-workgroup.at:100"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel outerlooppar.cl 2 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/outerlooppar_2_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:100"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_49
-#AT_START_50
-at_fn_group_banner 50 'testsuite-workgroup.at:105' \
-  "forcing horizontal parallelization to some outer loops (loops)" "" 4
-at_xfail=no
-(
-  $as_echo "50. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:107: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/workgroup/run_kernel outerlooppar.cl 2 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel outerlooppar.cl 2 2 1 1" "testsuite-workgroup.at:107"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel outerlooppar.cl 2 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/outerlooppar_2_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:107"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_50
-#AT_START_51
-at_fn_group_banner 51 'testsuite-workgroup.at:112' \
-  "different implicit barrier injection scenarios (loops)" "" 4
-at_xfail=no
-(
-  $as_echo "51. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:114: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=loops \$abs_top_builddir/tests/workgroup/run_kernel implicit_barriers.cl 1 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/workgroup/run_kernel implicit_barriers.cl 1 2 1 1" "testsuite-workgroup.at:114"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/workgroup/run_kernel implicit_barriers.cl 1 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/implicit_barriers_1_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:114"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_51
-#AT_START_52
-at_fn_group_banner 52 'testsuite-workgroup.at:119' \
-  "loop with two paths to the latch (full replication)" "" 4
-at_xfail=no
-(
-  $as_echo "52. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:121: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/workgroup/run_kernel for_bug.cl 1 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel for_bug.cl 1 2 1 1" "testsuite-workgroup.at:121"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel for_bug.cl 1 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/for_bug_1_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:121"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_52
-#AT_START_53
-at_fn_group_banner 53 'testsuite-workgroup.at:126' \
-  "loop with two paths to the latch (loops)" "       " 4
-at_xfail=no
-(
-  $as_echo "53. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:128: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/workgroup/run_kernel for_bug.cl 1 2 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel for_bug.cl 1 2 1 1" "testsuite-workgroup.at:128"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel for_bug.cl 1 2 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/for_bug_1_2_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:128"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_53
-#AT_START_54
-at_fn_group_banner 54 'testsuite-workgroup.at:133' \
-  "b-loop with two latches (full replication)" "     " 4
-at_xfail=no
-(
-  $as_echo "54. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:135: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/workgroup/run_kernel multilatch_bloop.cl 1 3 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel multilatch_bloop.cl 1 3 1 1" "testsuite-workgroup.at:135"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel multilatch_bloop.cl 1 3 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/multilatch_bloop_1_3_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:135"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_54
-#AT_START_55
-at_fn_group_banner 55 'testsuite-workgroup.at:140' \
-  "b-loop with two latches (loops)" "                " 4
-at_xfail=no
-(
-  $as_echo "55. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:142: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=loops \$abs_top_builddir/tests/workgroup/run_kernel multilatch_bloop.cl 1 3 1 1"
-at_fn_check_prepare_dynamic "POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/workgroup/run_kernel multilatch_bloop.cl 1 3 1 1" "testsuite-workgroup.at:142"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/workgroup/run_kernel multilatch_bloop.cl 1 3 1 1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/multilatch_bloop_1_3_1_1.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:142"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_55
-#AT_START_56
-at_fn_group_banner 56 'testsuite-workgroup.at:147' \
-  "workgroup_sizes: work-items get wrong ids (full replication)" "" 4
-at_xfail=no
-(
-  $as_echo "56. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:149: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/workgroup/run_kernel print_all_ids.cl 1 1 1 4 | sort"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-workgroup.at:149"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel print_all_ids.cl 1 1 1 4 | sort
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/print_all_ids_114114.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:149"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_56
-#AT_START_57
-at_fn_group_banner 57 'testsuite-workgroup.at:155' \
-  "workgroup_sizes: work-items get wrong ids (loop)" "" 4
-at_xfail=no
-(
-  $as_echo "57. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-workgroup.at:157: POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/workgroup/run_kernel print_all_ids.cl 1 1 1 4 | sort"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-workgroup.at:157"
-( $at_check_trace; POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel print_all_ids.cl 1 1 1 4 | sort
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/workgroup/print_all_ids_114114.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-workgroup.at:157"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_57
-#AT_START_58
-at_fn_group_banner 58 'testsuite-regression.at:4' \
-  "phi nodes not replicated (repl) - lp:927573" "    " 5
-at_xfail=no
-(
-  $as_echo "58. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:6" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:6"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:7: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_loop_phi_replication"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_loop_phi_replication" "testsuite-regression.at:7"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_loop_phi_replication
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:7"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_58
-#AT_START_59
-at_fn_group_banner 59 'testsuite-regression.at:10' \
-  "phi nodes not replicated (loops) - lp:927573" "   " 5
-at_xfail=no
-(
-  $as_echo "59. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:12" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:12"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:13: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_loop_phi_replication"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_loop_phi_replication" "testsuite-regression.at:13"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_loop_phi_replication
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:13"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_59
-#AT_START_60
-at_fn_group_banner 60 'testsuite-regression.at:16' \
-  "issues with local pointers (repl) - lp:918801" "  " 5
-at_xfail=no
-(
-  $as_echo "60. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:18" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:18"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:19: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_locals"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_locals" "testsuite-regression.at:19"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_locals
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:19"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_60
-#AT_START_61
-at_fn_group_banner 61 'testsuite-regression.at:22' \
-  "issues with local pointers (loops) - lp:918801" " " 5
-at_xfail=no
-(
-  $as_echo "61. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:24" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:24"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:25: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_locals"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_locals" "testsuite-regression.at:25"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_locals
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:25"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_61
-#AT_START_62
-at_fn_group_banner 62 'testsuite-regression.at:28' \
-  "barrier between two for loops (repl)" "           " 5
-at_xfail=no
-(
-  $as_echo "62. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:30" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:30"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:31: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_barrier_between_for_loops"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_barrier_between_for_loops" "testsuite-regression.at:31"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_barrier_between_for_loops
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:31"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_62
-#AT_START_63
-at_fn_group_banner 63 'testsuite-regression.at:34' \
-  "barrier between two for loops (loops)" "          " 5
-at_xfail=no
-(
-  $as_echo "63. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:36" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:36"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:37: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_barrier_between_for_loops"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_barrier_between_for_loops" "testsuite-regression.at:37"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_barrier_between_for_loops
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:37"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_63
-#AT_START_64
-at_fn_group_banner 64 'testsuite-regression.at:40' \
-  "simple for-loop with a barrier inside (repl)" "   " 5
-at_xfail=no
-(
-  $as_echo "64. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:42" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:42"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:43: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_simple_for_with_a_barrier"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_simple_for_with_a_barrier" "testsuite-regression.at:43"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_simple_for_with_a_barrier
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:43"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_64
-#AT_START_65
-at_fn_group_banner 65 'testsuite-regression.at:46' \
-  "simple for-loop with a barrier inside (loops)" "  " 5
-at_xfail=no
-(
-  $as_echo "65. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:48" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:48"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:49: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_simple_for_with_a_barrier"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_simple_for_with_a_barrier" "testsuite-regression.at:49"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_simple_for_with_a_barrier
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:49"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_65
-#AT_START_66
-at_fn_group_banner 66 'testsuite-regression.at:52' \
-  "for-loop with computation after the brexit (repl) - lp:938123" "" 5
-at_xfail=no
-(
-  $as_echo "66. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:54" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:54"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:55: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_multi_level_loops_with_barriers"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_multi_level_loops_with_barriers" "testsuite-regression.at:55"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_multi_level_loops_with_barriers
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:55"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_66
-#AT_START_67
-at_fn_group_banner 67 'testsuite-regression.at:58' \
-  "for-loop with computation after the brexit (loops) - lp:938123" "" 5
-at_xfail=no
-(
-  $as_echo "67. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:60" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:60"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:61: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_multi_level_loops_with_barriers"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_multi_level_loops_with_barriers" "testsuite-regression.at:61"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_multi_level_loops_with_barriers
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:61"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_67
-#AT_START_68
-at_fn_group_banner 68 'testsuite-regression.at:64' \
-  "for-loop with a variable iteration count (repl) - lp:938883" "" 5
-at_xfail=no
-(
-  $as_echo "68. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:66" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:66"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:67: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_for_with_var_iteration_count"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_for_with_var_iteration_count" "testsuite-regression.at:67"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_for_with_var_iteration_count
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:67"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_68
-#AT_START_69
-at_fn_group_banner 69 'testsuite-regression.at:70' \
-  "for-loop with a variable iteration count (loops) - lp:938883" "" 5
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&\
-  grep -q "define LLVM_3_1" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "69. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:72" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:72"
-#this broke on ppc in commit 525, for LLVM 3.1
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:76: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_for_with_var_iteration_count"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_for_with_var_iteration_count" "testsuite-regression.at:76"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_for_with_var_iteration_count
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:76"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_69
-#AT_START_70
-at_fn_group_banner 70 'testsuite-regression.at:79' \
-  "early return before a barrier region (repl) - lp:940248" "" 5
-at_xfail=no
-(
-  $as_echo "70. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:81" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:81"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:82: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_early_return"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_early_return" "testsuite-regression.at:82"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_early_return
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:82"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_70
-#AT_START_71
-at_fn_group_banner 71 'testsuite-regression.at:85' \
-  "early return before a barrier region (loops) - lp:940248" "" 5
-at_xfail=no
-(
-  $as_echo "71. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:87" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:87"
-#AT_SKIP_IF(true)
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:89: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_early_return"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_early_return" "testsuite-regression.at:89"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_early_return
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:89"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_71
-#AT_START_72
-at_fn_group_banner 72 'testsuite-regression.at:92' \
-  "id-dependent computation before kernel exit (repl) - lp:940549" "" 5
-at_xfail=no
-(
-  $as_echo "72. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:94" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:94"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:95: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_id_dependent_computation"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_id_dependent_computation" "testsuite-regression.at:95"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_id_dependent_computation
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:95"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_72
-#AT_START_73
-at_fn_group_banner 73 'testsuite-regression.at:98' \
-  "id-dependent computation before kernel exit (loops) - lp:940549" "" 5
-at_xfail=no
-(
-  $as_echo "73. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:100" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:100"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:101: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_id_dependent_computation"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_id_dependent_computation" "testsuite-regression.at:101"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_id_dependent_computation
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:101"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_73
-#AT_START_74
-at_fn_group_banner 74 'testsuite-regression.at:104' \
-  "struct kernel arguments - lp:987905" "            " 5
-at_xfail=yes
-(
-  $as_echo "74. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-# Skip for now as this passes and fails depending on the target (ABI).
-$as_echo "testsuite-regression.at:107" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:107"
-$as_echo "testsuite-regression.at:108" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:108"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:110: \$abs_top_builddir/tests/regression/test_structs_as_args"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/regression/test_structs_as_args" "testsuite-regression.at:110"
-( $at_check_trace; $abs_top_builddir/tests/regression/test_structs_as_args
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:110"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_74
-#AT_START_75
-at_fn_group_banner 75 'testsuite-regression.at:113' \
-  "vector kernel arguments - lp:987905" "            " 5
-at_xfail=no
-      grep LLVM_CXX_FLAGS $abs_top_builddir/config.log | grep -q -v DNDEBUG && at_xfail=yes
-      grep LLVM_CXX_FLAGS $abs_top_builddir/config.log | grep -q _DEBUG && at_xfail=yes
-(
-  $as_echo "75. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-$as_echo "testsuite-regression.at:114" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:114"
-# Skip for now as this passes and fails depending on the target (ABI).
-
-$as_echo "testsuite-regression.at:117" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:117"
-# This test fails with an assert. If asserts are off, it succeeds.
-# It fails depending if the argument list generated to the kernel
-# function happens to be 1:1 with the clKernelSetArgs indices or not
-# and that depends on the CC/ABI of the target at hand.
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:124: \$abs_top_builddir/tests/regression/test_vectors_as_args"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/regression/test_vectors_as_args" "testsuite-regression.at:124"
-( $at_check_trace; $abs_top_builddir/tests/regression/test_vectors_as_args
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:124"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_75
-#AT_START_76
-at_fn_group_banner 76 'testsuite-regression.at:127' \
-  "barrier just before return (repl) - lp:1012030" " " 5
-at_xfail=no
-(
-  $as_echo "76. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:129" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:129"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:130: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_barrier_before_return"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_barrier_before_return" "testsuite-regression.at:130"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_barrier_before_return
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:130"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_76
-#AT_START_77
-at_fn_group_banner 77 'testsuite-regression.at:133' \
-  "barrier just before return (loops) - lp:1012030" "" 5
-at_xfail=no
-(
-  $as_echo "77. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:135" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:135"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:136: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_barrier_before_return"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_barrier_before_return" "testsuite-regression.at:136"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_barrier_before_return
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:136"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_77
-#AT_START_78
-at_fn_group_banner 78 'testsuite-regression.at:139' \
-  "infinite loop (repl) - lp:941558" "               " 5
-at_xfail=no
-(
-  $as_echo "78. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:141" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:141"
-$as_echo "testsuite-regression.at:142" >"$at_check_line_file"
-( env | grep -q POCL_IMPLICIT_FINISH) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:142"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:143: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_infinite_loop"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_infinite_loop" "testsuite-regression.at:143"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_infinite_loop
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:143"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_78
-#AT_START_79
-at_fn_group_banner 79 'testsuite-regression.at:146' \
-  "infinite loop (loops) - lp:941558" "              " 5
-at_xfail=no
-(
-  $as_echo "79. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:148" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:148"
-$as_echo "testsuite-regression.at:149" >"$at_check_line_file"
-( env | grep -q POCL_IMPLICIT_FINISH) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:149"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:150: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_infinite_loop"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_infinite_loop" "testsuite-regression.at:150"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_infinite_loop
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:150"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_79
-#AT_START_80
-at_fn_group_banner 80 'testsuite-regression.at:153' \
-  "passing a constant array as an arg - lp:1032203" "" 5
-at_xfail=no
-(
-  $as_echo "80. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:155" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:155"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:156: \$abs_top_builddir/tests/regression/test_constant_array"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/regression/test_constant_array" "testsuite-regression.at:156"
-( $at_check_trace; $abs_top_builddir/tests/regression/test_constant_array
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:156"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_80
-#AT_START_81
-at_fn_group_banner 81 'testsuite-regression.at:159' \
-  "undominated variable from conditional barrier handling (repl) - lp:1045835" "" 5
-at_xfail=no
-(
-  $as_echo "81. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:161" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:161"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:162: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_undominated_variable"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_undominated_variable" "testsuite-regression.at:162"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_undominated_variable
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:162"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_81
-#AT_START_82
-at_fn_group_banner 82 'testsuite-regression.at:165' \
-  "undominated variable from conditional barrier handling (loops) - lp:1045835" "" 5
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&\
-  grep -q "define LLVM_3_1" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "82. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:167" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:167"
-#this broke on ppc in commit 525, for LLVM 3.1
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:171: POCL_WORK_GROUP_METHOD=workitemloops \$abs_top_builddir/tests/regression/test_undominated_variable"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_undominated_variable" "testsuite-regression.at:171"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_undominated_variable
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:171"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_82
-#AT_START_83
-at_fn_group_banner 83 'testsuite-regression.at:174' \
-  "clSetKernelArg overwriting the previous kernel's args - lp:1075134" "" 5
-at_xfail=no
-(
-  $as_echo "83. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:176" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:176"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:177: \$abs_top_builddir/tests/regression/test_setargs"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/regression/test_setargs" "testsuite-regression.at:177"
-( $at_check_trace; $abs_top_builddir/tests/regression/test_setargs
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:177"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_83
-#AT_START_84
-at_fn_group_banner 84 'testsuite-regression.at:180' \
-  "setting a buffer argument to NULL causes a segfault - lp:1109030" "" 5
-at_xfail=no
-(
-  $as_echo "84. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:182" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:182"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:183: \$abs_top_builddir/tests/regression/test_null_arg"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/regression/test_null_arg" "testsuite-regression.at:183"
-( $at_check_trace; $abs_top_builddir/tests/regression/test_null_arg
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:183"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_84
-#AT_START_85
-at_fn_group_banner 85 'testsuite-regression.at:186' \
-  "sizeof(uint)" "                                   " 5
-at_xfail=no
-(
-  $as_echo "85. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:188: \$abs_top_builddir/tests/kernel/kernel test_sizeof"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_sizeof" "testsuite-regression.at:188"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_sizeof
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/kernel/test_sizeof_expout.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:188"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_85
-#AT_START_86
-at_fn_group_banner 86 'testsuite-regression.at:193' \
-  "block" "                                          " 5
-at_xfail=no
-      grep -q "#define LLVM_3_4" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "86. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:195" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:195"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:196: \$abs_top_builddir/tests/kernel/kernel test_block"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_block" "testsuite-regression.at:196"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_block
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/kernel/test_block_expout.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:196"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-# LLVM 3.4's crashes with an illegal const expr cast. Unresolved.
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_86
-#AT_START_87
-at_fn_group_banner 87 'testsuite-regression.at:203' \
-  "case with multiple variable length loops and a barrier in one" "" 5
-at_xfail=no
-(
-  $as_echo "87. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-regression.at:205" >"$at_check_line_file"
-(! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:205"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:206: POCL_WORK_GROUP_METHOD=workitemrepl \$abs_top_builddir/tests/regression/test_fors_with_var_iteration_counts"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_fors_with_var_iteration_counts" "testsuite-regression.at:206"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_fors_with_var_iteration_counts
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:206"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_87
-#AT_START_88
-at_fn_group_banner 88 'testsuite-regression.at:209' \
-  "assigning a loop iterator variable to a private makes it local - issue 94 (repl)" "" 5
-at_xfail=no
-(
-  $as_echo "88. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-Changed value at global_id: 67599, local_id 3, group_id 16899, to: 854
-Value is changed at global_id: 67599, local_id 3, group_id 16899, to: 854
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:215: POCL_WORK_GROUP_METHOD=repl \$abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=repl $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local" "testsuite-regression.at:215"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:215"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_88
-#AT_START_89
-at_fn_group_banner 89 'testsuite-regression.at:218' \
-  "assigning a loop iterator variable to a private makes it local - issue 94 (loops)" "" 5
-at_xfail=no
-(
-  $as_echo "89. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-Changed value at global_id: 67599, local_id 3, group_id 16899, to: 854
-Value is changed at global_id: 67599, local_id 3, group_id 16899, to: 854
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:224: POCL_WORK_GROUP_METHOD=loops \$abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local" "testsuite-regression.at:224"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:224"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_89
-#AT_START_90
-at_fn_group_banner 90 'testsuite-regression.at:227' \
-  "assigning a loop iterator variable to a private makes it local 2 - issue 102 (repl)" "" 5
-at_xfail=no
-(
-  $as_echo "90. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-changing the value at global_id: 6, local_id 2, group_id 1, to: 3
-value is changed at global_id: 6, local_id 2, group_id 1, to: 3
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:233: POCL_WORK_GROUP_METHOD=repl \$abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local_2"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=repl $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local_2" "testsuite-regression.at:233"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local_2
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:233"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_90
-#AT_START_91
-at_fn_group_banner 91 'testsuite-regression.at:236' \
-  "assigning a loop iterator variable to a private makes it local 2 - issue 102 (loops)" "" 5
-at_xfail=no
-(
-  $as_echo "91. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-cat >expout <<'_ATEOF'
-changing the value at global_id: 6, local_id 2, group_id 1, to: 3
-value is changed at global_id: 6, local_id 2, group_id 1, to: 3
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:242: POCL_WORK_GROUP_METHOD=loops \$abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local_2"
-at_fn_check_prepare_dynamic "POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local_2" "testsuite-regression.at:242"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local_2
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:242"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_91
-#AT_START_92
-at_fn_group_banner 92 'testsuite-regression.at:247' \
-  "local struct arrays produce illegal AS casts" "   " 5
-at_xfail=no
-      grep -q "#define LLVM_BUILT_WITH_ASSERTS" $abs_top_builddir/config.h && \
-! grep -q "#define LLVM_3_2" $abs_top_builddir/config.h && \
-! grep -q "#define LLVM_3_3" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "92. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-# The HSA branch of LLVM 3.7 is probably outdated in comparison
-# to the upstream trunk and makes this case fail.
-$as_echo "testsuite-regression.at:251" >"$at_check_line_file"
-(grep -q "#define BUILD_HSA" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-regression.at:251"
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:252: \$abs_top_builddir/tests/kernel/kernel test_local_struct_array"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/kernel/kernel test_local_struct_array" "testsuite-regression.at:252"
-( $at_check_trace; $abs_top_builddir/tests/kernel/kernel test_local_struct_array
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Running test test_local_struct_array...
-OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:252"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_92
-#AT_START_93
-at_fn_group_banner 93 'testsuite-regression.at:263' \
-  "LoopVectorizer crash with Haswell and Broadwell - issue 231" "" 5
-at_xfail=no
-(
-  $as_echo "93. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-regression.at:265: \$abs_top_builddir/tests/regression/test_issue_231"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/regression/test_issue_231" "testsuite-regression.at:265"
-( $at_check_trace; $abs_top_builddir/tests/regression/test_issue_231
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-regression.at:265"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_93
-#AT_START_94
-at_fn_group_banner 94 'testsuite-runtime.at:4' \
-  "clGetDeviceInfo" "                                " 6
-at_xfail=no
-(
-  $as_echo "94. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:6: \$abs_top_builddir/tests/runtime/test_clGetDeviceInfo"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_clGetDeviceInfo" "testsuite-runtime.at:6"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clGetDeviceInfo
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:6"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_94
-#AT_START_95
-at_fn_group_banner 95 'testsuite-runtime.at:9' \
-  "clEnqueueNativeKernel" "                          " 6
-at_xfail=no
-(
-  $as_echo "95. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:11: \$abs_top_builddir/tests/runtime/test_clEnqueueNativeKernel"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_clEnqueueNativeKernel" "testsuite-runtime.at:11"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clEnqueueNativeKernel
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:11"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_95
-#AT_START_96
-at_fn_group_banner 96 'testsuite-runtime.at:14' \
-  "clGetEventInfo" "                                 " 6
-at_xfail=no
-(
-  $as_echo "96. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:16: \$abs_top_builddir/tests/runtime/test_clGetEventInfo"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_clGetEventInfo" "testsuite-runtime.at:16"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clGetEventInfo
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:16"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_96
-#AT_START_97
-at_fn_group_banner 97 'testsuite-runtime.at:19' \
-  "read/copy/write buffer" "                         " 6
-at_xfail=no
-(
-  $as_echo "97. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:21: \$abs_top_builddir/tests/runtime/test_read-copy-write-buffer"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_read-copy-write-buffer" "testsuite-runtime.at:21"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_read-copy-write-buffer
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:21"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_97
-#AT_START_98
-at_fn_group_banner 98 'testsuite-runtime.at:24' \
-  "event cycle" "                                    " 6
-at_xfail=no
-(
-  $as_echo "98. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:26: \$abs_top_builddir/tests/runtime/test_event_cycle"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_event_cycle" "testsuite-runtime.at:26"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_event_cycle
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:26"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_98
-#AT_START_99
-at_fn_group_banner 99 'testsuite-runtime.at:29' \
-  "event freeing" "                                  " 6
-at_xfail=no
-(
-  $as_echo "99. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:31: \$abs_top_builddir/tests/runtime/test_event_free"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_event_free" "testsuite-runtime.at:31"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_event_free
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:31"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_99
-#AT_START_100
-at_fn_group_banner 100 'testsuite-runtime.at:34' \
-  "clCreateProgramWithBinary" "                      " 6
-at_xfail=no
-(
-  $as_echo "100. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:36: \$abs_top_builddir/tests/runtime/test_clCreateProgramWithBinary"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_clCreateProgramWithBinary" "testsuite-runtime.at:36"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clCreateProgramWithBinary
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:36"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_100
-#AT_START_101
-at_fn_group_banner 101 'testsuite-runtime.at:40' \
-  "clBuildProgram" "                                 " 6
-at_xfail=no
-(
-  $as_echo "101. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:42: cd \$abs_top_srcdir/tests/runtime/; \$abs_top_builddir/tests/runtime/test_clBuildProgram"
-at_fn_check_prepare_dynamic "cd $abs_top_srcdir/tests/runtime/; $abs_top_builddir/tests/runtime/test_clBuildProgram" "testsuite-runtime.at:42"
-( $at_check_trace; cd $abs_top_srcdir/tests/runtime/; $abs_top_builddir/tests/runtime/test_clBuildProgram
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo stdout:; cat "$at_stdout"
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:42"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_101
-#AT_START_102
-at_fn_group_banner 102 'testsuite-runtime.at:46' \
-  "test_kernel_cache_includes" "                     " 6
-at_xfail=no
-(
-  $as_echo "102. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:48: cd \$abs_top_builddir/tests/runtime/; \$abs_top_builddir/tests/runtime/test_kernel_cache_includes"
-at_fn_check_prepare_dynamic "cd $abs_top_builddir/tests/runtime/; $abs_top_builddir/tests/runtime/test_kernel_cache_includes" "testsuite-runtime.at:48"
-( $at_check_trace; cd $abs_top_builddir/tests/runtime/; $abs_top_builddir/tests/runtime/test_kernel_cache_includes
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/runtime/test_kernel_cache_includes_expout.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:48"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_102
-#AT_START_103
-at_fn_group_banner 103 'testsuite-runtime.at:53' \
-  "clBuildProgram link error" "                      " 6
-at_xfail=yes
-(
-  $as_echo "103. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:56: \$abs_top_builddir/tests/runtime/test_link_error"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_link_error" "testsuite-runtime.at:56"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_link_error
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:56"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_103
-#AT_START_104
-at_fn_group_banner 104 'testsuite-runtime.at:60' \
-  "clFinish" "                                       " 6
-at_xfail=no
-(
-  $as_echo "104. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:62: \$abs_top_builddir/tests/runtime/test_clFinish | grep \"ABABC\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-runtime.at:62"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clFinish | grep "ABABC"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "ABABC
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:62"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_104
-#AT_START_105
-at_fn_group_banner 105 'testsuite-runtime.at:66' \
-  "clSetEventCallback" "                             " 6
-at_xfail=no
-(
-  $as_echo "105. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:68: \$abs_top_builddir/tests/runtime/test_clSetEventCallback"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_clSetEventCallback" "testsuite-runtime.at:68"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clSetEventCallback
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/tests/runtime/test_clSetEventCallback_expout.txt)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:68"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_105
-#AT_START_106
-at_fn_group_banner 106 'testsuite-runtime.at:73' \
-  "clGetSupportedImageFormats" "                     " 6
-at_xfail=no
-(
-  $as_echo "106. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:75: POCL_DEVICES=\"pthread pthread\" \$abs_top_builddir/tests/runtime/test_clGetSupportedImageFormats"
-at_fn_check_prepare_dynamic "POCL_DEVICES=\"pthread pthread\" $abs_top_builddir/tests/runtime/test_clGetSupportedImageFormats" "testsuite-runtime.at:75"
-( $at_check_trace; POCL_DEVICES="pthread pthread" $abs_top_builddir/tests/runtime/test_clGetSupportedImageFormats
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:75"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_106
-#AT_START_107
-at_fn_group_banner 107 'testsuite-runtime.at:78' \
-  "clCreateKernelsInProgram" "                       " 6
-at_xfail=no
-(
-  $as_echo "107. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:80: \$abs_top_builddir/tests/runtime/test_clCreateKernelsInProgram "
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_clCreateKernelsInProgram " "testsuite-runtime.at:80"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clCreateKernelsInProgram
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Hello
-World
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:80"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_107
-#AT_START_108
-at_fn_group_banner 108 'testsuite-runtime.at:86' \
-  "clCreateKernel" "                                 " 6
-at_xfail=no
-(
-  $as_echo "108. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:88: \$abs_top_builddir/tests/runtime/test_clCreateKernel "
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_clCreateKernel " "testsuite-runtime.at:88"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clCreateKernel
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:88"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_108
-#AT_START_109
-at_fn_group_banner 109 'testsuite-runtime.at:92' \
-  "clGetKernelArgInfo" "                             " 6
-at_xfail=no
-      grep -q "#define LLVM_3_2" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "109. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:95: \$abs_top_builddir/tests/runtime/test_clGetKernelArgInfo"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_clGetKernelArgInfo" "testsuite-runtime.at:95"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clGetKernelArgInfo
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo stdout:; cat "$at_stdout"
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:95"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_109
-#AT_START_110
-at_fn_group_banner 110 'testsuite-runtime.at:98' \
-  "clCreateSubDevices" "                             " 6
-at_xfail=no
-(
-  $as_echo "110. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-runtime.at:100: \$abs_top_builddir/tests/runtime/test_clCreateSubDevices"
-at_fn_check_prepare_dynamic "$abs_top_builddir/tests/runtime/test_clCreateSubDevices" "testsuite-runtime.at:100"
-( $at_check_trace; $abs_top_builddir/tests/runtime/test_clCreateSubDevices
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-runtime.at:100"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_110
-#AT_START_111
-at_fn_group_banner 111 'testsuite-tce.at:4' \
-  "A basic ttasim driver test" "                     " 7
-at_xfail=no
-(
-  $as_echo "111. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-tce.at:6" >"$at_check_line_file"
-(! grep -c "define TCE_AVAILABLE" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-tce.at:6"
-{ set +x
-$as_echo "$at_srcdir/testsuite-tce.at:7: make -s -C \$abs_top_builddir/tests/tce/ttasim run | egrep -v \"^make\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-tce.at:7"
-( $at_check_trace; make -s -C $abs_top_builddir/tests/tce/ttasim run | egrep -v "^make"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "PING23456.000000 2000001OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-tce.at:7"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_111
-#AT_START_112
-at_fn_group_banner 112 'testsuite-tce.at:12' \
-  "Half-precision floats on ttasim (repl)" "         " 7
-at_xfail=no
-      grep -q "#define LLVM_3_2" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "112. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-$as_echo "testsuite-tce.at:15" >"$at_check_line_file"
-(! grep -c "define TCE_AVAILABLE" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-tce.at:15"
-{ set +x
-$as_echo "$at_srcdir/testsuite-tce.at:16: POCL_WORK_GROUP_METHOD=repl make -s -C \$abs_top_builddir/tests/tce/fp16 run | egrep -v \"^make\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-tce.at:16"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl make -s -C $abs_top_builddir/tests/tce/fp16 run | egrep -v "^make"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "through conversion: 42
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-tce.at:16"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_112
-#AT_START_113
-at_fn_group_banner 113 'testsuite-tce.at:37' \
-  "Half-precision floats on ttasim (loopvec)" "      " 7
-at_xfail=no
-(
-  $as_echo "113. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-tce.at:39" >"$at_check_line_file"
-(! grep -c "define TCEMC_AVAILABLE" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-tce.at:39"
-{ set +x
-$as_echo "$at_srcdir/testsuite-tce.at:40: make -s -C \$abs_top_builddir/tests/tce/fp16 run | egrep -v \"^make\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-tce.at:40"
-( $at_check_trace; make -s -C $abs_top_builddir/tests/tce/fp16 run | egrep -v "^make"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "through conversion: 42
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-tce.at:40"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_113
-#AT_START_114
-at_fn_group_banner 114 'testsuite-tce.at:61' \
-  "A basic TCEMC test" "                             " 7
-at_xfail=no
-(
-  $as_echo "114. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-tce.at:63" >"$at_check_line_file"
-(! grep -c "define TCEMC_AVAILABLE" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-tce.at:63"
-{ set +x
-$as_echo "$at_srcdir/testsuite-tce.at:64: make -s -C \$abs_top_builddir/tests/tce/tcemc run | egrep -v \"^make\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-tce.at:64"
-( $at_check_trace; make -s -C $abs_top_builddir/tests/tce/tcemc run | egrep -v "^make"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "PING23456.000000 2000001OK
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-tce.at:64"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_114
-#AT_START_115
-at_fn_group_banner 115 'testsuite-samples.at:8' \
-  "Building the sources against pocl" "              " 8
-at_xfail=no
-(
-  $as_echo "115. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-  $as_echo "testsuite-samples.at:10" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" samples "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-samples.at:10"
-
-cat >expout <<'_ATEOF'
-Built target HelloWorld
-Built target OpenCLInfo
-Built target OpenCLConvolutionChap3
-Built target HelloBinaryWorld
-Built target SimpleBufferSubBuffer
-Built target ImageFilter2D
-Built target vecadd
-Built target histogram
-Built target Dijkstra
-Built target spmv
-_ATEOF
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-samples.at:23: sed -i 's/#include \"bmpLoader.hpp\"/\\/\\/#include \"bmpLoader.hpp/g' \\
-\$abs_top_srcdir/examples/opencl-book-samples/checkout/src/Chapter_12/Sinewave/sinewave.cpp;
-make -sC \$abs_top_builddir/examples/opencl-book-samples clean build 2>&1 | grep 'Built target' | cut -c8-"
-at_fn_check_prepare_notrace 'an embedded newline' "testsuite-samples.at:23"
-( $at_check_trace; sed -i 's/#include "bmpLoader.hpp"/\/\/#include "bmpLoader.hpp/g' \
-$abs_top_srcdir/examples/opencl-book-samples/checkout/src/Chapter_12/Sinewave/sinewave.cpp;
-make -sC $abs_top_builddir/examples/opencl-book-samples clean build 2>&1 | grep 'Built target' | cut -c8-
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-samples.at:23"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_115
-#AT_START_116
-at_fn_group_banner 116 'testsuite-samples.at:28' \
-  "Run Chapter 2: Hello World" "                     " 8
-at_xfail=no
-(
-  $as_echo "116. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-  $as_echo "testsuite-samples.at:30" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" samples "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-samples.at:30"
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-samples.at:32: cd \$abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_2/HelloWorld; ./HelloWorld | grep -v \"Could not create GPU context, trying CPU\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-samples.at:32"
-( $at_check_trace; cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_2/HelloWorld; ./HelloWorld | grep -v "Could not create GPU context, trying CPU"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "`cat $abs_top_srcdir/examples/opencl-book-samples/HelloWorld.stdout`
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-samples.at:32"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_116
-#AT_START_117
-at_fn_group_banner 117 'testsuite-samples.at:38' \
-  "Run Chapter 3: OpenCLConvolution" "               " 8
-at_xfail=no
-(
-  $as_echo "117. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-  $as_echo "testsuite-samples.at:40" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" samples "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-samples.at:40"
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-samples.at:42:
-cd \$abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_3/OpenCLConvolution
-ln -sf \$abs_top_srcdir/examples/opencl-book-samples/checkout/src/Chapter_3/OpenCLConvolution/Convolution.cl
-./OpenCLConvolutionChap3
-"
-at_fn_check_prepare_notrace 'an embedded newline' "testsuite-samples.at:42"
-( $at_check_trace;
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_3/OpenCLConvolution
-ln -sf $abs_top_srcdir/examples/opencl-book-samples/checkout/src/Chapter_3/OpenCLConvolution/Convolution.cl
-./OpenCLConvolutionChap3
-
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "`cat $abs_top_srcdir/examples/opencl-book-samples/OpenCLConvolutionChap3.stdout`
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-samples.at:42"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_117
-#AT_START_118
-at_fn_group_banner 118 'testsuite-samples.at:52' \
-  "Run Chapter 6: HelloBinaryWorld" "                " 8
-at_xfail=no
-(
-  $as_echo "118. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-  $as_echo "testsuite-samples.at:54" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" samples "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-samples.at:54"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-samples.at:55:
-cd \$abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_6/HelloBinaryWorld
-rm -f HelloWorld.cl.bin
-./HelloBinaryWorld | grep -v \"Could not create GPU\"
-./HelloBinaryWorld | grep -v \"Could not create GPU\"
-"
-at_fn_check_prepare_notrace 'an embedded newline' "testsuite-samples.at:55"
-( $at_check_trace;
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_6/HelloBinaryWorld
-rm -f HelloWorld.cl.bin
-./HelloBinaryWorld | grep -v "Could not create GPU"
-./HelloBinaryWorld | grep -v "Could not create GPU"
-
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "`cat $abs_top_srcdir/examples/opencl-book-samples/HelloBinaryWorld.stdout`
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-samples.at:55"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_118
-#AT_START_119
-at_fn_group_banner 119 'testsuite-samples.at:66' \
-  "Run Chapter 7: SimpleBufferSubBuffer" "           " 8
-at_xfail=no
-(
-  $as_echo "119. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-  $as_echo "testsuite-samples.at:68" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" samples "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-samples.at:68"
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-samples.at:70:
-cd \$abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_7/SimpleBufferSubBuffer
-./SimpleBufferSubBuffer --useMap | egrep -v \"_TYPE_GPU|16 17 18 19\"
-"
-at_fn_check_prepare_notrace 'an embedded newline' "testsuite-samples.at:70"
-( $at_check_trace;
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_7/SimpleBufferSubBuffer
-./SimpleBufferSubBuffer --useMap | egrep -v "_TYPE_GPU|16 17 18 19"
-
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "`cat $abs_top_srcdir/examples/opencl-book-samples/SimpleBufferSubBuffer.stdout`
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-samples.at:70"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_119
-#AT_START_120
-at_fn_group_banner 120 'testsuite-samples.at:79' \
-  "Run Chapter 8: ImageFilter2D" "                   " 8
-at_xfail=no
-(
-  $as_echo "120. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-  $as_echo "testsuite-samples.at:81" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" samples "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-samples.at:81"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-samples.at:82:
-cd \${abs_top_srcdir}/examples/opencl-book-samples/checkout/src/Chapter_8/ImageFilter2D ;
-cp \${abs_top_srcdir}/examples/opencl-book-samples/ImageFilter2D.cl ./
-sed \"6s/float/constant float/\" -i ImageFilter2D.cl ;
-sed '418cclFinish(commandQueue);' -i ImageFilter2D.cpp ;
-cd \${abs_top_builddir}/examples/opencl-book-samples/build/src/Chapter_8/ImageFilter2D ;
-make
-"
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-samples.at:82"
-( $at_check_trace;
-cd ${abs_top_srcdir}/examples/opencl-book-samples/checkout/src/Chapter_8/ImageFilter2D ;
-cp ${abs_top_srcdir}/examples/opencl-book-samples/ImageFilter2D.cl ./
-sed "6s/float/constant float/" -i ImageFilter2D.cl ;
-sed '418cclFinish(commandQueue);' -i ImageFilter2D.cpp ;
-cd ${abs_top_builddir}/examples/opencl-book-samples/build/src/Chapter_8/ImageFilter2D ;
-make
-
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo stdout:; cat "$at_stdout"
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-samples.at:82"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-samples.at:91:
-cd \$abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_8/ImageFilter2D
-./ImageFilter2D \$abs_top_srcdir/examples/opencl-book-samples/checkout/src/Chapter_19/oclFlow/data/minicooper/frame10.png output.png
-"
-at_fn_check_prepare_notrace 'an embedded newline' "testsuite-samples.at:91"
-( $at_check_trace;
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_8/ImageFilter2D
-./ImageFilter2D $abs_top_srcdir/examples/opencl-book-samples/checkout/src/Chapter_19/oclFlow/data/minicooper/frame10.png output.png
-
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Could not create GPU context, trying CPU...
-
-Executed program succesfully.
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-samples.at:91"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_120
-#AT_START_121
-at_fn_group_banner 121 'testsuite-samples.at:102' \
-  "Run Chapter 12: VectorAdd (C++ bindings)" "       " 8
-at_xfail=no
-(
-  $as_echo "121. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-
-  $as_echo "testsuite-samples.at:104" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" samples "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-samples.at:104"
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-samples.at:106:
-cd \$abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_12/VectorAdd
-./vecadd
-"
-at_fn_check_prepare_notrace 'an embedded newline' "testsuite-samples.at:106"
-( $at_check_trace;
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_12/VectorAdd
-./vecadd
-
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "`cat $abs_top_srcdir/examples/opencl-book-samples/VectorAdd.stdout`
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-samples.at:106"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_121
-#AT_START_122
-at_fn_group_banner 122 'testsuite-viennacl.at:17' \
-  "fft" "                                            " 9
-at_xfail=no
-(
-  $as_echo "122. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:17" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:17"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename examples/tutorial/fft).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:18: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/fft 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:18"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/fft 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:18"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_122
-#AT_START_123
-at_fn_group_banner 123 'testsuite-viennacl.at:21' \
-  "custom-context" "                                 " 9
-at_xfail=no
-(
-  $as_echo "123. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:21" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:21"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename examples/tutorial/custom-context).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:22: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/custom-context 2>&1 | sed -e '''1d;2d;3d;4d;8d;9d'"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:22"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/custom-context 2>&1 | sed -e '''1d;2d;3d;4d;8d;9d'
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:22"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_123
-#AT_START_124
-at_fn_group_banner 124 'testsuite-viennacl.at:25' \
-  "custom-kernels" "                                 " 9
-at_xfail=no
-(
-  $as_echo "124. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:25" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:25"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename examples/tutorial/custom-kernels).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:26: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/custom-kernels 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:26"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/custom-kernels 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:26"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_124
-#AT_START_125
-at_fn_group_banner 125 'testsuite-viennacl.at:29' \
-  "scheduler" "                                      " 9
-at_xfail=no
-(
-  $as_echo "125. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:29" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:29"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename examples/tutorial/scheduler).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:30: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/scheduler 2>&1 | sed -e '''1d;2d'"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:30"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/scheduler 2>&1 | sed -e '''1d;2d'
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:30"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_125
-#AT_START_126
-at_fn_group_banner 126 'testsuite-viennacl.at:39' \
-  "bandwidth-reduction" "                            " 9
-at_xfail=no
-(
-  $as_echo "126. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:39" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:39"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename examples/tutorial/bandwidth-reduction).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:40: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/bandwidth-reduction 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:40"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/examples/tutorial/bandwidth-reduction 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:40"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_126
-#AT_START_127
-at_fn_group_banner 127 'testsuite-viennacl.at:55' \
-  "blas3_solve_double-test-opencl" "                 " 9
-at_xfail=yes
-(
-  $as_echo "127. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:55" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:55"
-
-#rouning/codegen/float mode errors:
-#- * lower_tag:       passed! 5.16861e-07
-#+ * lower_tag:       passed! 0
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/blas3_solve_double-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:60: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/blas3_solve_double-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:60"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/blas3_solve_double-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:60"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_127
-#AT_START_128
-at_fn_group_banner 128 'testsuite-viennacl.at:71' \
-  "external_linkage-opencl" "                        " 9
-at_xfail=no
-(
-  $as_echo "128. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:71" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:71"
-
-# This is a buggy test as it relies on the destruction order
-# of global objects. Wrong order results in a crash due to
-# a dangling OpenCL object pointer.
-$as_echo "testsuite-viennacl.at:75" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:75"
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/external_linkage-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:76: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/external_linkage-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:76"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/external_linkage-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:76"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_128
-#AT_START_129
-at_fn_group_banner 129 'testsuite-viennacl.at:92' \
-  "global_variables-test-opencl" "                   " 9
-at_xfail=no
-(
-  $as_echo "129. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:92" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:92"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/global_variables-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:93: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/global_variables-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:93"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/global_variables-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:93"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_129
-#AT_START_130
-at_fn_group_banner 130 'testsuite-viennacl.at:96' \
-  "iterators-test-opencl" "                          " 9
-at_xfail=no
-(
-  $as_echo "130. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:96" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:96"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/iterators-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:97: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/iterators-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:97"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/iterators-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:97"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_130
-#AT_START_131
-at_fn_group_banner 131 'testsuite-viennacl.at:104' \
-  "matrix_col_double-test-opencl long" "             " 9
-at_xfail=no
-(
-  $as_echo "131. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:104" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:104"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/matrix_col_double-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:105: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_col_double-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:105"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_col_double-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:105"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_131
-#AT_START_132
-at_fn_group_banner 132 'testsuite-viennacl.at:110' \
-  "matrix_col_float-test-opencl long" "              " 9
-at_xfail=no
-(
-  $as_echo "132. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:110" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:110"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/matrix_col_float-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:111: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_col_float-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:111"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_col_float-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:111"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_132
-#AT_START_133
-at_fn_group_banner 133 'testsuite-viennacl.at:116' \
-  "matrix_col_int-test-opencl" "                     " 9
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv && at_xfail=yes
-(
-  $as_echo "133. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:116" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:116"
-
-#uninvestigated
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/matrix_col_int-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:119: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_col_int-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:119"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_col_int-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:119"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_133
-#AT_START_134
-at_fn_group_banner 134 'testsuite-viennacl.at:124' \
-  "matrix_row_double-test-opencl long" "             " 9
-at_xfail=no
-(
-  $as_echo "134. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:124" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:124"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/matrix_row_double-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:125: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_row_double-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:125"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_row_double-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:125"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_134
-#AT_START_135
-at_fn_group_banner 135 'testsuite-viennacl.at:130' \
-  "matrix_row_float-test-opencl long" "              " 9
-at_xfail=no
-(
-  $as_echo "135. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:130" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:130"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/matrix_row_float-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:131: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_row_float-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:131"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_row_float-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:131"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_135
-#AT_START_136
-at_fn_group_banner 136 'testsuite-viennacl.at:136' \
-  "matrix_row_int-test-opencl" "                     " 9
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv && at_xfail=yes
-(
-  $as_echo "136. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:136" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:136"
-
-#uninvestigated
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/matrix_row_int-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:139: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_row_int-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:139"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_row_int-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:139"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_136
-#AT_START_137
-at_fn_group_banner 137 'testsuite-viennacl.at:144' \
-  "matrix_vector_int-test-opencl" "                  " 9
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv && at_xfail=yes
-(
-  $as_echo "137. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:144" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:144"
-
-#uninvestigated
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/matrix_vector_int-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:147: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_vector_int-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:147"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_vector_int-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:147"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_137
-#AT_START_138
-at_fn_group_banner 138 'testsuite-viennacl.at:152' \
-  "matrix_vector-test-opencl long" "                 " 9
-at_xfail=no
-(
-  $as_echo "138. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:152" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:152"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/matrix_vector-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:153: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_vector-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:153"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/matrix_vector-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:153"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_138
-#AT_START_139
-at_fn_group_banner 139 'testsuite-viennacl.at:156' \
-  "nmf-test-opencl" "                                " 9
-at_xfail=yes
-(
-  $as_echo "139. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:156" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:156"
-
-# ViennaCL borks with "Generator: Key not found in map"
-#Note: uncomment a few other tests if fixing this
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/nmf-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:160: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/nmf-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:160"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/nmf-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:160"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_139
-#AT_START_140
-at_fn_group_banner 140 'testsuite-viennacl.at:164' \
-  "scalar-test-opencl" "                             " 9
-at_xfail=no
-(
-  $as_echo "140. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:164" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:164"
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/scalar-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:165: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/scalar-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:165"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/scalar-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:165"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_140
-#AT_START_141
-at_fn_group_banner 141 'testsuite-viennacl.at:168' \
-  "structured-matrices-test-opencl" "                " 9
-at_xfail=no
-(
-  $as_echo "141. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:168" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:168"
-
-#float/double precision errors. Might not even be
-#errors, the test seems to pass.
-#TODO, investigate, skip result passing with sed
-$as_echo "testsuite-viennacl.at:172" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:172"
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/structured-matrices-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:173: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/structured-matrices-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:173"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/structured-matrices-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:173"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_141
-#AT_START_142
-at_fn_group_banner 142 'testsuite-viennacl.at:176' \
-  "vector_double-test-opencl" "                      " 9
-at_xfail=no
-      egrep -q "#define LLVM_3_4" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "142. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-viennacl.at:176" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" viennacl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-viennacl.at:176"
-
-#fails with "Could not find a dominating alternative variable"
-#Note: uncomment a few other tests if fixing this
-
-
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename tests/vector_double-test-opencl).stdout > expout
-  { set +x
-$as_echo "$at_srcdir/testsuite-viennacl.at:180: \$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/vector_double-test-opencl 2>&1 | sed -e ''"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-viennacl.at:180"
-( $at_check_trace; $abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/tests/vector_double-test-opencl 2>&1 | sed -e ''
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-$at_diff expout "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-viennacl.at:180"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_142
-#AT_START_143
-at_fn_group_banner 143 'testsuite-rodinia.at:19' \
-  "backprop" "                                       " 10
-at_xfail=no
-(
-  $as_echo "143. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-rodinia.at:19" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" rodinia "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:19"
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  $as_echo "testsuite-rodinia.at:19" >"$at_check_line_file"
-(grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:19"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:20: make build-backprop -sC \${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v \"^make\"  | egrep -v \"^backprop_ocl.cpp:\" "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-rodinia.at:20"
-( $at_check_trace; make build-backprop -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make"  | egrep -v "^backprop_ocl.cpp:"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "  [Building Rodinia backprop]
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:20"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:21: cd \$abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/backprop; ./run | grep ."
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-rodinia.at:21"
-( $at_check_trace; cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/backprop; ./run | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Random number generator seed: 7
-Input layer size : 65536
-Starting training kernel
-num_devices = 1
-Performing GPU computation
-Finish the training for one iteration
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:21"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_143
-#AT_START_144
-at_fn_group_banner 144 'testsuite-rodinia.at:31' \
-  "bfs" "                                            " 10
-at_xfail=no
-(
-  $as_echo "144. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-rodinia.at:31" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" rodinia "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:31"
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  $as_echo "testsuite-rodinia.at:31" >"$at_check_line_file"
-(grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:31"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:32: make build-bfs -sC \${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v \"^make\"  | egrep -v \"^CLHelper.h:|^In file included from bfs.cpp:\"  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-rodinia.at:32"
-( $at_check_trace; make build-bfs -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make"  | egrep -v "^CLHelper.h:|^In file included from bfs.cpp:"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "  [Building Rodinia bfs]
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:32"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-# Test passes NVIDIA-specific parameters to clBuildProgram which is now
-# detected as an unsupported parameter by pocl.
-$as_echo "testsuite-rodinia.at:35" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:35"
-{ set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:36: cd \$abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/bfs; ./run 2>&1|grep -v \"incomplete\" | grep -v \"argument unused\" | grep ."
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-rodinia.at:36"
-( $at_check_trace; cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/bfs; ./run 2>&1|grep -v "incomplete" | grep -v "argument unused" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Reading File
---cambine:passed:-)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:36"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_144
-#AT_START_145
-at_fn_group_banner 145 'testsuite-rodinia.at:42' \
-  "cfd" "                                            " 10
-at_xfail=no
-(
-  $as_echo "145. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-rodinia.at:42" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" rodinia "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:42"
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  $as_echo "testsuite-rodinia.at:42" >"$at_check_line_file"
-(grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:42"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:43: make build-cfd -sC \${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v \"^make\" | egrep -v \"^euler3d.cpp:\"  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-rodinia.at:43"
-( $at_check_trace; make build-cfd -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make" | egrep -v "^euler3d.cpp:"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "  [Building Rodinia cfd]
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:43"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:44: cd \$abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/cfd; ./run 2>&1| grep . | egrep -v \"incomplete|device.name\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-rodinia.at:44"
-( $at_check_trace; cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/cfd; ./run 2>&1| grep . | egrep -v "incomplete|device.name"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "--cambine: nel=97046, nelr=97152
-Starting...
-Saving solution...
-Saved solution...
-Cleaning up...
-Done...
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:44"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_145
-#AT_START_146
-at_fn_group_banner 146 'testsuite-rodinia.at:54' \
-  "lud" "                                            " 10
-at_xfail=no
-(
-  $as_echo "146. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-rodinia.at:54" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" rodinia "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:54"
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  $as_echo "testsuite-rodinia.at:54" >"$at_check_line_file"
-(grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:54"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:55: make build-lud -sC \${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v \"^make\"  | egrep -v \"^lud.cpp:|^../common/common.c:\"  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-rodinia.at:55"
-( $at_check_trace; make build-lud -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make"  | egrep -v "^lud.cpp:|^../common/common.c:"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "  [Building Rodinia lud]
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:55"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:56: cd \$abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/lud/ocl; ./run | grep \"Time con\" | cut -c -18 | grep ."
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-rodinia.at:56"
-( $at_check_trace; cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/lud/ocl; ./run | grep "Time con" | cut -c -18 | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Time consumed(ms):
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:56"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_146
-#AT_START_147
-at_fn_group_banner 147 'testsuite-rodinia.at:61' \
-  "hotspot" "                                        " 10
-at_xfail=no
-(
-  $as_echo "147. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-rodinia.at:61" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" rodinia "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:61"
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  $as_echo "testsuite-rodinia.at:61" >"$at_check_line_file"
-(grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:61"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:62: make build-hotspot -sC \${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v \"^make\"  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-rodinia.at:62"
-( $at_check_trace; make build-hotspot -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "  [Building Rodinia hotspot]
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:62"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:63: cd \$abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/hotspot; ./run | grep \"time\" | cut -d ' ' -f1 | grep ."
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-rodinia.at:63"
-( $at_check_trace; cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/hotspot; ./run | grep "time" | cut -d ' ' -f1 | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Kernel
-Total
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:63"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_147
-#AT_START_148
-at_fn_group_banner 148 'testsuite-rodinia.at:69' \
-  "kmeans" "                                         " 10
-at_xfail=no
-(
-  $as_echo "148. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-rodinia.at:69" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" rodinia "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:69"
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  $as_echo "testsuite-rodinia.at:69" >"$at_check_line_file"
-(grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:69"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:70: make build-kmeans -sC \${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v \"^make\"  | egrep -v \"^kmeans.cpp:|^read_input.c:\"  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-rodinia.at:70"
-( $at_check_trace; make build-kmeans -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make"  | egrep -v "^kmeans.cpp:|^read_input.c:"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "  [Building Rodinia kmeans]
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:70"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:71: cd \$abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/kmeans; ./run | grep \"Number of\" | grep ."
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-rodinia.at:71"
-( $at_check_trace; cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/kmeans; ./run | grep "Number of" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Number of objects: 494020
-Number of features: 34
-Number of Iteration: 1
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:71"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_148
-#AT_START_149
-at_fn_group_banner 149 'testsuite-rodinia.at:78' \
-  "lavaMD" "                                         " 10
-at_xfail=no
-(
-  $as_echo "149. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-rodinia.at:78" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" rodinia "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:78"
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  $as_echo "testsuite-rodinia.at:78" >"$at_check_line_file"
-(grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:78"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:79: make build-lavaMD -sC \${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v \"^make\"  | egrep -v \"^./kernel/kernel_gpu_opencl_wrapper.c:|^rm:\"  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-rodinia.at:79"
-( $at_check_trace; make build-lavaMD -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make"  | egrep -v "^./kernel/kernel_gpu_opencl_wrapper.c:|^rm:"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "  [Building Rodinia lavaMD]
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:79"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-# This test case uses struct kernel arguments which currently do not
-# work correctly due to assuming the ABI maps variables 1:1 to kernel
-# arguments and at least AMD64 seems to sometimes split the struct
-# arguments to multiple scalar arguments. This (falsely) passes with
-# the pthread device and crashes with the basic device.
-$as_echo "testsuite-rodinia.at:85" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:85"
-{ set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:86: cd \$abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/lavaMD; ./run | grep \"Total time:\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-rodinia.at:86"
-( $at_check_trace; cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/lavaMD; ./run | grep "Total time:"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Total time:
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:86"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_149
-#AT_START_150
-at_fn_group_banner 150 'testsuite-rodinia.at:91' \
-  "pathfinder" "                                     " 10
-at_xfail=no
-(
-  $as_echo "150. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-rodinia.at:91" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" rodinia "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:91"
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  $as_echo "testsuite-rodinia.at:91" >"$at_check_line_file"
-(grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:91"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:92: make build-pathfinder -sC \${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v \"^make\"  | egrep -v \"^OpenCL.cpp:\"  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-rodinia.at:92"
-( $at_check_trace; make build-pathfinder -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make"  | egrep -v "^OpenCL.cpp:"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "  [Building Rodinia pathfinder]
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:92"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-# This started to fail at around 2014-12-03.
-$as_echo "testsuite-rodinia.at:94" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:94"
-{ set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:95: cd \$abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/pathfinder; \\
-POCL_MAX_WORK_GROUP_SIZE=2 ./run 2>&1 | grep -v \"pocl warning:\" ; \\
-cat result.txt 2>&1 | egrep -v \"DEVICE_|PROFILE|VERSION|NAME|EXTENSIONS\" | grep ."
-at_fn_check_prepare_notrace 'an embedded newline' "testsuite-rodinia.at:95"
-( $at_check_trace; cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/pathfinder; \
-POCL_MAX_WORK_GROUP_SIZE=2 ./run 2>&1 | grep -v "pocl warning:" ; \
-cat result.txt 2>&1 | egrep -v "DEVICE_|PROFILE|VERSION|NAME|EXTENSIONS" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "$(cat $abs_top_srcdir/examples/Rodinia/pathfinder.stdout)
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:95"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_150
-#AT_START_151
-at_fn_group_banner 151 'testsuite-rodinia.at:102' \
-  "srad" "                                           " 10
-at_xfail=no
-(
-  $as_echo "151. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-rodinia.at:102" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" rodinia "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:102"
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  $as_echo "testsuite-rodinia.at:102" >"$at_check_line_file"
-(grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-rodinia.at:102"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:103: make build-srad -sC \${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v \"^make\"  | egrep -v \"^main.cpp:\"  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-rodinia.at:103"
-( $at_check_trace; make build-srad -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make"  | egrep -v "^main.cpp:"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "  [Building Rodinia srad]
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:103"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-rodinia.at:104: cd \$abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/srad; \\
-POCL_MAX_WORK_GROUP_SIZE=2 ./run 2>&1 | grep \"Total time\" | grep ."
-at_fn_check_prepare_notrace 'an embedded newline' "testsuite-rodinia.at:104"
-( $at_check_trace; cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/srad; \
-POCL_MAX_WORK_GROUP_SIZE=2 ./run 2>&1 | grep "Total time" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Total time:
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-rodinia.at:104"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_151
-#AT_START_152
-at_fn_group_banner 152 'testsuite-parboil.at:17' \
-  "spmv" "                                           " 11
-at_xfail=no
-      grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "152. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:17" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:17"
-
-#This fails when pocl has ICD enabled.
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:20: make build-spmv -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:20"
-( $at_check_trace; make build-spmv -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:20"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:21: make run-spmv -sC \$abs_top_builddir/examples/Parboil | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:21"
-( $at_check_trace; make run-spmv -sC $abs_top_builddir/examples/Parboil | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:21"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_152
-#AT_START_153
-at_fn_group_banner 153 'testsuite-parboil.at:26' \
-  "stencil" "                                        " 11
-at_xfail=no
-(
-  $as_echo "153. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:26" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:26"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:27: make build-stencil -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:27"
-( $at_check_trace; make build-stencil -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:27"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:28: make run-stencil -sC \$abs_top_builddir/examples/Parboil | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:28"
-( $at_check_trace; make run-stencil -sC $abs_top_builddir/examples/Parboil | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:28"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_153
-#AT_START_154
-at_fn_group_banner 154 'testsuite-parboil.at:33' \
-  "tpacf" "                                          " 11
-at_xfail=yes
-(
-  $as_echo "154. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:33" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:33"
-
-# This probably is in infinite loop with wiloops.
-$as_echo "testsuite-parboil.at:35" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:35"
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:36: make build-tpacf -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:36"
-( $at_check_trace; make build-tpacf -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:36"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:37: make run-tpacf -sC \$abs_top_builddir/examples/Parboil | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:37"
-( $at_check_trace; make run-tpacf -sC $abs_top_builddir/examples/Parboil | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:37"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-# Result verification error.
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_154
-#AT_START_155
-at_fn_group_banner 155 'testsuite-parboil.at:44' \
-  "cutcp" "                                          " 11
-at_xfail=no
-(
-  $as_echo "155. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:44" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:44"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:45: make build-cutcp -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:45"
-( $at_check_trace; make build-cutcp -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:45"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:46: make run-cutcp -sC \$abs_top_builddir/examples/Parboil | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:46"
-( $at_check_trace; make run-cutcp -sC $abs_top_builddir/examples/Parboil | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:46"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_155
-#AT_START_156
-at_fn_group_banner 156 'testsuite-parboil.at:51' \
-  "mri-gridding" "                                   " 11
-at_xfail=no
-(
-  $as_echo "156. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:51" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:51"
-
-# Takes forever to compile with the repl method.
-# Also tries to create an illegal work group size and has potentially
-# erroneous kernels?
-$as_echo "testsuite-parboil.at:55" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:55"
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:56: make build-mri-gridding -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:56"
-( $at_check_trace; make build-mri-gridding -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:56"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:57: make run-mri-gridding -sC \$abs_top_builddir/examples/Parboil | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:57"
-( $at_check_trace; make run-mri-gridding -sC $abs_top_builddir/examples/Parboil | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:57"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-#AT_XFAIL_IF(true)
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_156
-#AT_START_157
-at_fn_group_banner 157 'testsuite-parboil.at:63' \
-  "sad" "                                            " 11
-at_xfail=yes
-(
-  $as_echo "157. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:63" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:63"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:64: make build-sad -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:64"
-( $at_check_trace; make build-sad -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:64"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:65: make run-sad -sC \$abs_top_builddir/examples/Parboil | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:65"
-( $at_check_trace; make run-sad -sC $abs_top_builddir/examples/Parboil | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:65"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-# Requires read_imageui
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_157
-#AT_START_158
-at_fn_group_banner 158 'testsuite-parboil.at:72' \
-  "bfs" "                                            " 11
-at_xfail=no
-(
-  $as_echo "158. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:72" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:72"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:73: make build-bfs -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:73"
-( $at_check_trace; make build-bfs -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:73"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:74: make run-bfs -sC \$abs_top_builddir/examples/Parboil 2>&1 | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:74"
-( $at_check_trace; make run-bfs -sC $abs_top_builddir/examples/Parboil 2>&1 | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:74"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-# LLVM 3.4's SROA crashes with this. Reported in http://llvm.org/bugs/show_bug.cgi?id=15907
-# However, this is fixed with a TCE-patched version of 3.4, so let's just
-# skip it for 3.4 for now.
-$as_echo "testsuite-parboil.at:80" >"$at_check_line_file"
-(grep -q "#define LLVM_3_4" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:80"
-#AT_XFAIL_IF([grep -q "#define LLVM_3_4" $abs_top_builddir/config.h])
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_158
-#AT_START_159
-at_fn_group_banner 159 'testsuite-parboil.at:84' \
-  "histo" "                                          " 11
-at_xfail=yes
-(
-  $as_echo "159. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:84" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:84"
-
-# Illegal kernels with array parameters to functions.
-$as_echo "testsuite-parboil.at:86" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:86"
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:87: make build-histo -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:87"
-( $at_check_trace; make build-histo -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:87"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:88: make run-histo -sC \$abs_top_builddir/examples/Parboil | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:88"
-( $at_check_trace; make run-histo -sC $abs_top_builddir/examples/Parboil | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:88"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_159
-#AT_START_160
-at_fn_group_banner 160 'testsuite-parboil.at:93' \
-  "sgemm" "                                          " 11
-at_xfail=no
-(
-  $as_echo "160. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:93" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:93"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:94: make build-sgemm -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:94"
-( $at_check_trace; make build-sgemm -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:94"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:95: make run-sgemm -sC \$abs_top_builddir/examples/Parboil 2>&1| grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:95"
-( $at_check_trace; make run-sgemm -sC $abs_top_builddir/examples/Parboil 2>&1| grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:95"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_160
-#AT_START_161
-at_fn_group_banner 161 'testsuite-parboil.at:100' \
-  "mri-q" "                                          " 11
-at_xfail=no
-(
-  $as_echo "161. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:100" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:100"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:101: make build-mri-q -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:101"
-( $at_check_trace; make build-mri-q -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:101"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:102: make run-mri-q -sC \$abs_top_builddir/examples/Parboil | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:102"
-( $at_check_trace; make run-mri-q -sC $abs_top_builddir/examples/Parboil | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:102"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_161
-#AT_START_162
-at_fn_group_banner 162 'testsuite-parboil.at:107' \
-  "lbm" "                                            " 11
-at_xfail=no
-(
-  $as_echo "162. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-parboil.at:107" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" parboil "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-parboil.at:107"
-
-
-  { set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:108: make build-lbm -sC \${abs_top_builddir}/examples/Parboil 2>&1 | grep \"Parboil parallel benchmark suite\" | grep .  "
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-parboil.at:108"
-( $at_check_trace; make build-lbm -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep .
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Parboil parallel benchmark suite, version 0.2
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:108"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-parboil.at:109: make run-lbm -sC \$abs_top_builddir/examples/Parboil | grep Pass"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-parboil.at:109"
-( $at_check_trace; make run-lbm -sC $abs_top_builddir/examples/Parboil | grep Pass
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Pass
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-parboil.at:109"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-#AT_XFAIL_IF(true)
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_162
-#AT_START_163
-at_fn_group_banner 163 'testsuite-amd.at:11' \
-  "aesencryptdecrypt-repl" "                         " 12
-at_xfail=no
-(
-  $as_echo "163. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:11" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:11"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:11" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:11"
-
-#This fails, and on LLVM 3.3 it takes more than an hour (on modest PPC hardware) to detect it.
-$as_echo "testsuite-amd.at:13" >"$at_check_line_file"
-(grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:13"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:14: POCL_WORK_GROUP_METHOD=repl make test_AESEncryptDecrypt -sC \$abs_top_builddir/examples/AMD | grep \"Encryption Passed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:14"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl make test_AESEncryptDecrypt -sC $abs_top_builddir/examples/AMD | grep "Encryption Passed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Encryption Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:14"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_163
-#AT_START_164
-at_fn_group_banner 164 'testsuite-amd.at:19' \
-  "aesencryptdecrypt-loops" "                        " 12
-at_xfail=no
-(
-  $as_echo "164. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:19" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:19"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:19" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:19"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:20: POCL_WORK_GROUP_METHOD=loops make test_AESEncryptDecrypt -sC \$abs_top_builddir/examples/AMD | grep \"Encryption Passed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:20"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops make test_AESEncryptDecrypt -sC $abs_top_builddir/examples/AMD | grep "Encryption Passed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Encryption Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:20"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_164
-#AT_START_165
-at_fn_group_banner 165 'testsuite-amd.at:25' \
-  "atomiccounters" "                                 " 12
-at_xfail=yes
-(
-  $as_echo "165. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:25" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:25"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:25" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:25"
-
-# Expected Error: Device does not support cl_ext_atomic_counters_32 extension!
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:28: make test_AtomicCounters -sC \$abs_top_builddir/examples/AMD | grep \"Encryption Passed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:28"
-( $at_check_trace; make test_AtomicCounters -sC $abs_top_builddir/examples/AMD | grep "Encryption Passed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Encryption Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:28"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_165
-#AT_START_166
-at_fn_group_banner 166 'testsuite-amd.at:33' \
-  "bitonicsort" "                                    " 12
-at_xfail=no
-(
-  $as_echo "166. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:33" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:33"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:33" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:33"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:34: make test_BitonicSort -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:34"
-( $at_check_trace; make test_BitonicSort -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:34"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_166
-#AT_START_167
-at_fn_group_banner 167 'testsuite-amd.at:39' \
-  "binarysearch" "                                   " 12
-at_xfail=no
-(
-  $as_echo "167. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:39" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:39"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:39" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:39"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:40: make test_BinarySearch -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:40"
-( $at_check_trace; make test_BinarySearch -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:40"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_167
-#AT_START_168
-at_fn_group_banner 168 'testsuite-amd.at:45' \
-  "binomialoption-repl" "                            " 12
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&
-             grep -q "define LLVM_3_2" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "168. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:45" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:45"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:45" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:45"
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:48: POCL_WORK_GROUP_METHOD=repl make test_BinomialOption -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:48"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl make test_BinomialOption -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:48"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_168
-#AT_START_169
-at_fn_group_banner 169 'testsuite-amd.at:53' \
-  "binomialoption-loops" "                           " 12
-at_xfail=no
-(
-  $as_echo "169. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:53" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:53"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:53" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:53"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:54: POCL_WORK_GROUP_METHOD=loops make test_BinomialOption -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:54"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops make test_BinomialOption -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:54"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_169
-#AT_START_170
-at_fn_group_banner 170 'testsuite-amd.at:59' \
-  "blackscholes" "                                   " 12
-at_xfail=no
-(
-  $as_echo "170. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:59" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:59"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:59" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:59"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:60: make test_BlackScholes -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:60"
-( $at_check_trace; make test_BlackScholes -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:60"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_170
-#AT_START_171
-at_fn_group_banner 171 'testsuite-amd.at:65' \
-  "blackscholesdp" "                                 " 12
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc && at_xfail=yes
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv &&
-             (grep -q "define LLVM_3_2" $abs_top_builddir/config.h ||
-              grep -q "define LLVM_3_3" $abs_top_builddir/config.h ) && at_xfail=yes
-(
-  $as_echo "171. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:65" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:65"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:65" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:65"
-
-
-#this causes assert on LLVM 3.1
-$as_echo "testsuite-amd.at:68" >"$at_check_line_file"
-( grep "#define LLVM_3_1" $abs_top_builddir/config.h ) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:68"
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:73: make test_BlackScholesDP -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:73"
-( $at_check_trace; make test_BlackScholesDP -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:73"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_171
-#AT_START_172
-at_fn_group_banner 172 'testsuite-amd.at:78' \
-  "boxfilter" "                                      " 12
-at_xfail=no
-(
-  $as_echo "172. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:78" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:78"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:78" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:78"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:79: make test_BoxFilter -sC \$abs_top_builddir/examples/AMD | egrep \"Passed|failed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:79"
-( $at_check_trace; make test_BoxFilter -sC $abs_top_builddir/examples/AMD | egrep "Passed|failed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-Verifying results...Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:79"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_172
-#AT_START_173
-at_fn_group_banner 173 'testsuite-amd.at:85' \
-  "dct" "                                            " 12
-at_xfail=no
-(
-  $as_echo "173. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:85" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:85"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:85" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:85"
-
-#uninvestigated miscompilation. regression from 0.8
-$as_echo "testsuite-amd.at:87" >"$at_check_line_file"
-(grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:87"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:88: make test_DCT -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:88"
-( $at_check_trace; make test_DCT -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:88"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_173
-#AT_START_174
-at_fn_group_banner 174 'testsuite-amd.at:93' \
-  "devicefission" "                                  " 12
-at_xfail=no
-(
-  $as_echo "174. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:93" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:93"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:93" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:93"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:94: make test_DeviceFission -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:94"
-( $at_check_trace; make test_DeviceFission -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:94"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_174
-#AT_START_175
-at_fn_group_banner 175 'testsuite-amd.at:99' \
-  "dwthaar1d" "                                      " 12
-at_xfail=no
-      egrep -q "#define LLVM_3_4" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "175. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:99" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:99"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:99" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:99"
-
-# 3.5 introduced the noduplicate attribute which, when
-# used with barrier(), fixes this.
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:103: make test_DwtHaar1D -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:103"
-( $at_check_trace; make test_DwtHaar1D -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:103"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_175
-#AT_START_176
-at_fn_group_banner 176 'testsuite-amd.at:108' \
-  "fastwalshtransform" "                             " 12
-at_xfail=no
-(
-  $as_echo "176. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:108" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:108"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:108" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:108"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:109: make test_FastWalshTransform -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:109"
-( $at_check_trace; make test_FastWalshTransform -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:109"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_176
-#AT_START_177
-at_fn_group_banner 177 'testsuite-amd.at:114' \
-  "floydwarshall" "                                  " 12
-at_xfail=no
-(
-  $as_echo "177. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:114" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:114"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:114" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:114"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:115: make test_FloydWarshall -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:115"
-( $at_check_trace; make test_FloydWarshall -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:115"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_177
-#AT_START_178
-at_fn_group_banner 178 'testsuite-amd.at:120' \
-  "fluidsimulation2d" "                              " 12
-at_xfail=yes
-(
-  $as_echo "178. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:120" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:120"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:120" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:120"
-
-# error: can't convert between vector values of different size ('uint' (aka 'unsigned int') and 'int8')
-# It should be a legal implicit conversion according to 6.3 Operators. Some other error makes it
-# break with Intel OCL also.
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:126: make test_FluidSimulation2D -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:126"
-( $at_check_trace; make test_FluidSimulation2D -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:126"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_178
-#AT_START_179
-at_fn_group_banner 179 'testsuite-amd.at:131' \
-  "helloworld" "                                     " 12
-at_xfail=no
-(
-  $as_echo "179. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:131" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:131"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:131" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:131"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:132: make test_HelloWorld -sC \$abs_top_builddir/examples/AMD | egrep \"GdkknVnqkc|HelloWorld\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:132"
-( $at_check_trace; make test_HelloWorld -sC $abs_top_builddir/examples/AMD | egrep "GdkknVnqkc|HelloWorld"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "GdkknVnqkc
-HelloWorld
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:132"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_179
-#AT_START_180
-at_fn_group_banner 180 'testsuite-amd.at:138' \
-  "histogram-repl" "                                 " 12
-at_xfail=no
-(
-  $as_echo "180. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:138" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:138"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:138" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:138"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:139: POCL_WORK_GROUP_METHOD=repl make test_Histogram -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:139"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl make test_Histogram -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:139"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_180
-#AT_START_181
-at_fn_group_banner 181 'testsuite-amd.at:144' \
-  "histogram-loops" "                                " 12
-at_xfail=no
-(
-  $as_echo "181. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:144" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:144"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:144" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:144"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:145: POCL_WORK_GROUP_METHOD=loops make test_Histogram -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:145"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops make test_Histogram -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:145"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_181
-#AT_START_182
-at_fn_group_banner 182 'testsuite-amd.at:150' \
-  "imageoverlap" "                                   " 12
-at_xfail=no
-(
-  $as_echo "182. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:150" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:150"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:150" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:150"
-
-# doen't work because of image indexing, sdk 2.9 version works
-$as_echo "testsuite-amd.at:152" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:152"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:153: make test_ImageOverlap -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:153"
-( $at_check_trace; make test_ImageOverlap -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Verifying result - Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:153"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_182
-#AT_START_183
-at_fn_group_banner 183 'testsuite-amd.at:158' \
-  "ludecomposition" "                                " 12
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc && at_xfail=yes
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv &&
-             (grep -q "define LLVM_3_2" $abs_top_builddir/config.h ||
-              grep -q "define LLVM_3_3" $abs_top_builddir/config.h ) && at_xfail=yes
-(
-  $as_echo "183. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:158" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:158"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:158" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:158"
-
-
-#test uses doubles
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:165: make test_LUDecomposition -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:165"
-( $at_check_trace; make test_LUDecomposition -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:165"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_183
-#AT_START_184
-at_fn_group_banner 184 'testsuite-amd.at:170' \
-  "mandelbrot" "                                     " 12
-at_xfail=no
-(
-  $as_echo "184. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:170" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:170"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:170" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:170"
-
-$as_echo "testsuite-amd.at:171" >"$at_check_line_file"
-( grep "undef HAVE_GLEW" $abs_top_builddir/config.h ) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:171"
-# undefined symbol: _Z7std_fmaDv4_fS_S_ with VML
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:173: make test_Mandelbrot -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:173"
-( $at_check_trace; make test_Mandelbrot -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:173"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_184
-#AT_START_185
-at_fn_group_banner 185 'testsuite-amd.at:178' \
-  "matrixmul" "                                      " 12
-at_xfail=no
-(
-  $as_echo "185. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:178" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:178"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:178" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:178"
-
-#uninvestigated miscompilation. regression from 0.8
-$as_echo "testsuite-amd.at:180" >"$at_check_line_file"
-(grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:180"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:181: make test_MatrixMultiplication -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:181"
-( $at_check_trace; make test_MatrixMultiplication -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:181"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_185
-#AT_START_186
-at_fn_group_banner 186 'testsuite-amd.at:186' \
-  "matrixmulimage" "                                 " 12
-at_xfail=no
-(
-  $as_echo "186. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:186" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:186"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:186" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:186"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:187: make test_MatrixMulImage -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:187"
-( $at_check_trace; make test_MatrixMulImage -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:187"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_186
-#AT_START_187
-at_fn_group_banner 187 'testsuite-amd.at:192' \
-  "matrixtranspose" "                                " 12
-at_xfail=no
-(
-  $as_echo "187. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:192" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:192"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:192" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:192"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:193: make test_MatrixTranspose -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:193"
-( $at_check_trace; make test_MatrixTranspose -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:193"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_187
-#AT_START_188
-at_fn_group_banner 188 'testsuite-amd.at:198' \
-  "memorymodel-repl" "                               " 12
-at_xfail=no
-(
-  $as_echo "188. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:198" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:198"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:198" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:198"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:199: POCL_WORK_GROUP_METHOD=repl make test_MemoryModel -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:199"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl make test_MemoryModel -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:199"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_188
-#AT_START_189
-at_fn_group_banner 189 'testsuite-amd.at:204' \
-  "memorymodel-loops" "                              " 12
-at_xfail=no
-(
-  $as_echo "189. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:204" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:204"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:204" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:204"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:205: POCL_WORK_GROUP_METHOD=loops make test_MemoryModel -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:205"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops make test_MemoryModel -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:205"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_189
-#AT_START_190
-at_fn_group_banner 190 'testsuite-amd.at:210' \
-  "montecarloasian" "                                " 12
-at_xfail=no
-(
-  $as_echo "190. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:210" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:210"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:210" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:210"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:211: make test_MonteCarloAsian -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:211"
-( $at_check_trace; make test_MonteCarloAsian -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:211"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_190
-#AT_START_191
-at_fn_group_banner 191 'testsuite-amd.at:216' \
-  "montecarloasiandp" "                              " 12
-at_xfail=yes
-(
-  $as_echo "191. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:216" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:216"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:216" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:216"
-
-
-# error: can't convert between vector values of different size ('double4' and 'int')
-# It should be a legal implicit conversion according to 6.3 Operators. Works also with Intel OCL
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:221: make test_MonteCarloAsianDP -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:221"
-( $at_check_trace; make test_MonteCarloAsianDP -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:221"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_191
-#AT_START_192
-at_fn_group_banner 192 'testsuite-amd.at:226' \
-  "nbody" "                                          " 12
-at_xfail=no
-(
-  $as_echo "192. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:226" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:226"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:226" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:226"
-
-$as_echo "testsuite-amd.at:227" >"$at_check_line_file"
-( grep "undef HAVE_GLEW" $abs_top_builddir/config.h ) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:227"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:228: make test_NBody -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:228"
-( $at_check_trace; make test_NBody -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:228"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_192
-#AT_START_193
-at_fn_group_banner 193 'testsuite-amd.at:233' \
-  "prefixsum" "                                      " 12
-at_xfail=no
-(
-  $as_echo "193. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:233" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:233"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:233" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:233"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:234: make test_PrefixSum -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:234"
-( $at_check_trace; make test_PrefixSum -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:234"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_193
-#AT_START_194
-at_fn_group_banner 194 'testsuite-amd.at:239' \
-  "quasirandomsequence" "                            " 12
-at_xfail=no
-(
-  $as_echo "194. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:239" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:239"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:239" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:239"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:240: make test_QuasiRandomSequence -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:240"
-( $at_check_trace; make test_QuasiRandomSequence -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:240"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_194
-#AT_START_195
-at_fn_group_banner 195 'testsuite-amd.at:245' \
-  "radixsort" "                                      " 12
-at_xfail=no
-(
-  $as_echo "195. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:245" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:245"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:245" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:245"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:246: make test_RadixSort -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:246"
-( $at_check_trace; make test_RadixSort -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:246"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_195
-#AT_START_196
-at_fn_group_banner 196 'testsuite-amd.at:251' \
-  "recursivegaussian" "                              " 12
-at_xfail=no
-(
-  $as_echo "196. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:251" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:251"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:251" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:251"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:252: make test_RecursiveGaussian -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:252"
-( $at_check_trace; make test_RecursiveGaussian -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:252"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_196
-#AT_START_197
-at_fn_group_banner 197 'testsuite-amd.at:257' \
-  "reduction" "                                      " 12
-at_xfail=no
-(
-  $as_echo "197. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:257" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:257"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:257" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:257"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:258: make test_Reduction -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:258"
-( $at_check_trace; make test_Reduction -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:258"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_197
-#AT_START_198
-at_fn_group_banner 198 'testsuite-amd.at:263' \
-  "scanlargearrays" "                                " 12
-at_xfail=no
-(
-  $as_echo "198. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:263" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:263"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:263" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:263"
-
-# Fails with vectorization. With wiloops and no unrolling, the vectorization won't apply.
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:265: make test_ScanLargeArrays -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:265"
-( $at_check_trace; make test_ScanLargeArrays -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:265"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_198
-#AT_START_199
-at_fn_group_banner 199 'testsuite-amd.at:270' \
-  "simpleconvolution" "                              " 12
-at_xfail=no
-(
-  $as_echo "199. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:270" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:270"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:270" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:270"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:271: make test_SimpleConvolution -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:271"
-( $at_check_trace; make test_SimpleConvolution -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:271"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_199
-#AT_START_200
-at_fn_group_banner 200 'testsuite-amd.at:276' \
-  "simpleimage" "                                    " 12
-at_xfail=no
-(
-  $as_echo "200. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:276" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:276"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:276" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:276"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:277: make test_SimpleImage -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:277"
-( $at_check_trace; make test_SimpleImage -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Verifying 2D copy result - Passed!
-Verifying 3D copy result - Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:277"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_200
-#AT_START_201
-at_fn_group_banner 201 'testsuite-amd.at:283' \
-  "sobelfilter" "                                    " 12
-at_xfail=no
-(
-  $as_echo "201. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:283" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:283"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:283" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:283"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:284: make test_SobelFilter -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:284"
-( $at_check_trace; make test_SobelFilter -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:284"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_201
-#AT_START_202
-at_fn_group_banner 202 'testsuite-amd.at:289' \
-  "template" "                                       " 12
-at_xfail=no
-(
-  $as_echo "202. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:289" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:289"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:289" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:289"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:290: make test_Template -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:290"
-( $at_check_trace; make test_Template -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:290"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_202
-#AT_START_203
-at_fn_group_banner 203 'testsuite-amd.at:295' \
-  "templatec" "                                      " 12
-at_xfail=no
-(
-  $as_echo "203. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:295" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:295"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:295" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:295"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:296: make test_TemplateC -sC \$abs_top_builddir/examples/AMD | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:296"
-( $at_check_trace; make test_TemplateC -sC $abs_top_builddir/examples/AMD | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:296"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_203
-#AT_START_204
-at_fn_group_banner 204 'testsuite-amd.at:301' \
-  "transferoverlap" "                                " 12
-at_xfail=no
-(
-  $as_echo "204. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:301" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:301"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:301" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:301"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:302: make test_TransferOverlap -sC \$abs_top_builddir/examples/AMD | grep \"Passed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:302"
-( $at_check_trace; make test_TransferOverlap -sC $abs_top_builddir/examples/AMD | grep "Passed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:302"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_204
-#AT_START_205
-at_fn_group_banner 205 'testsuite-amd.at:307' \
-  "urng" "                                           " 12
-at_xfail=no
-(
-  $as_echo "205. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amd.at:307" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amd "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:307"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amd.at:307" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amd.at:307"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amd.at:308: make test_URNG -sC \$abs_top_builddir/examples/AMD | grep Passed | cut -c -7"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amd.at:308"
-( $at_check_trace; make test_URNG -sC $abs_top_builddir/examples/AMD | grep Passed | cut -c -7
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amd.at:308"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_205
-#AT_START_206
-at_fn_group_banner 206 'testsuite-amdsdk2_9.at:20' \
-  "asyncdatatransfer" "                              " 13
-at_xfail=no
-(
-  $as_echo "206. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:20" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:20"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:20" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:20"
-
-# needs asynch properties implemented
-$as_echo "testsuite-amdsdk2_9.at:22" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:22"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:23: make test_AsyncDataTransfer -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep \"Passed\" | sed -e 's/^ \\t*//'"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:23"
-( $at_check_trace; make test_AsyncDataTransfer -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Passed" | sed -e 's/^ \t*//'
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "SyncKernel verification  : Passed!
-AsyncKernel verification : Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:23"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_206
-#AT_START_207
-at_fn_group_banner 207 'testsuite-amdsdk2_9.at:29' \
-  "atomiccounters" "                                 " 13
-at_xfail=yes
-(
-  $as_echo "207. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:29" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:29"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:29" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:29"
-
-# Expected Error: Device does not support cl_ext_atomic_counters_32 extension!
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:32: make test_AtomicCounters -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep \"Encryption Passed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:32"
-( $at_check_trace; make test_AtomicCounters -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Encryption Passed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Encryption Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:32"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_207
-#AT_START_208
-at_fn_group_banner 208 'testsuite-amdsdk2_9.at:37' \
-  "basicdebug" "                                     " 13
-at_xfail=no
-(
-  $as_echo "208. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:37" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:37"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:37" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:37"
-
-# This tests debugging features by executing a kernel that writes
-# out of bounds of a local array. No point testing it here as the
-# result should be undefined (basic device crashes, pthread device
-# silently passes). It passes if the kernel's local array size is
-# increased so there is no out of bounds error.
-$as_echo "testsuite-amdsdk2_9.at:43" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:43"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:44: make test_BasicDebug -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:44"
-( $at_check_trace; make test_BasicDebug -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:44"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_208
-#AT_START_209
-at_fn_group_banner 209 'testsuite-amdsdk2_9.at:49' \
-  "binarysearch" "                                   " 13
-at_xfail=no
-(
-  $as_echo "209. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:49" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:49"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:49" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:49"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:50: make test_BinarySearch -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:50"
-( $at_check_trace; make test_BinarySearch -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:50"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_209
-#AT_START_210
-at_fn_group_banner 210 'testsuite-amdsdk2_9.at:55' \
-  "binomialoption-repl" "                            " 13
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&
-             grep -q "define LLVM_3_2" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "210. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:55" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:55"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:55" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:55"
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:58: POCL_WORK_GROUP_METHOD=repl make test_BinomialOption -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:58"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:58"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_210
-#AT_START_211
-at_fn_group_banner 211 'testsuite-amdsdk2_9.at:63' \
-  "binomialoption-loops" "                           " 13
-at_xfail=no
-(
-  $as_echo "211. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:63" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:63"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:63" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:63"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:64: POCL_WORK_GROUP_METHOD=loops make test_BinomialOption -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:64"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:64"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_211
-#AT_START_212
-at_fn_group_banner 212 'testsuite-amdsdk2_9.at:69' \
-  "binomialoptionmultigpu" "                         " 13
-at_xfail=no
-(
-  $as_echo "212. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:69" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:69"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:69" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:69"
-
-$as_echo "testsuite-amdsdk2_9.at:70" >"$at_check_line_file"
-( ! test -e $abs_top_builddir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/cl/BinomialOptionMultiGPU/bin/x86_64/Release/BinomialOptionMultiGPU) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:70"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:71: make test_BinomialOptionMultiGPU -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:71"
-( $at_check_trace; make test_BinomialOptionMultiGPU -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:71"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_212
-#AT_START_213
-at_fn_group_banner 213 'testsuite-amdsdk2_9.at:76' \
-  "bitonicsort" "                                    " 13
-at_xfail=no
-(
-  $as_echo "213. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:76" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:76"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:76" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:76"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:77: make test_BitonicSort -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:77"
-( $at_check_trace; make test_BitonicSort -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:77"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_213
-#AT_START_214
-at_fn_group_banner 214 'testsuite-amdsdk2_9.at:82' \
-  "blackscholes" "                                   " 13
-at_xfail=no
-(
-  $as_echo "214. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:82" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:82"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:82" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:82"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:83: make test_BlackScholes -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:83"
-( $at_check_trace; make test_BlackScholes -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:83"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_214
-#AT_START_215
-at_fn_group_banner 215 'testsuite-amdsdk2_9.at:88' \
-  "blackscholesdp" "                                 " 13
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc && at_xfail=yes
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv && at_xfail=yes
-(
-  $as_echo "215. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:88" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:88"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:88" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:88"
-
-
-#this causes assert on LLVM 3.1
-$as_echo "testsuite-amdsdk2_9.at:91" >"$at_check_line_file"
-( grep "#define LLVM_3_1" $abs_top_builddir/config.h ) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:91"
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:94: make test_BlackScholesDP -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:94"
-( $at_check_trace; make test_BlackScholesDP -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:94"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_215
-#AT_START_216
-at_fn_group_banner 216 'testsuite-amdsdk2_9.at:99' \
-  "boxfilter" "                                      " 13
-at_xfail=no
-(
-  $as_echo "216. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:99" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:99"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:99" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:99"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:100: make test_BoxFilter -sC \$abs_top_builddir/examples/AMDSDK2.9 | egrep \"Passed|failed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:100"
-( $at_check_trace; make test_BoxFilter -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-Verifying results...Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:100"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_216
-#AT_START_217
-at_fn_group_banner 217 'testsuite-amdsdk2_9.at:106' \
-  "boxfilterGL" "                                    " 13
-at_xfail=yes
-(
-  $as_echo "217. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:106" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:106"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:106" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:106"
-
-# doesnt work
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:109: make test_BoxFilterGL -sC \$abs_top_builddir/examples/AMDSDK2.9 | egrep \"Passed|failed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:109"
-( $at_check_trace; make test_BoxFilterGL -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-Verifying results...Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:109"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_217
-#AT_START_218
-at_fn_group_banner 218 'testsuite-amdsdk2_9.at:115' \
-  "bufferbandwidth" "                                " 13
-at_xfail=no
-(
-  $as_echo "218. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:115" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:115"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:115" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:115"
-
-# Device does not support cl_khr_local_int32_base_atomics extension!
-# AT_XFAIL_IF(true)
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:118: make test_BufferBandwidth -sC \$abs_top_builddir/examples/AMDSDK2.9 | egrep \"Passed|failed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:118"
-( $at_check_trace; make test_BufferBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo " Verification Passed!
- Verification Passed!
-Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:118"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_218
-#AT_START_219
-at_fn_group_banner 219 'testsuite-amdsdk2_9.at:125' \
-  "bufferImageInterop" "                             " 13
-at_xfail=yes
-(
-  $as_echo "219. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:125" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:125"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:125" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:125"
-
-# Error: Selected device doesn't support Buffer-Image
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:128: make test_BufferImageInterop -sC \$abs_top_builddir/examples/AMDSDK2.9 | egrep \"Passed|failed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:128"
-( $at_check_trace; make test_BufferImageInterop -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:128"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_219
-#AT_START_220
-at_fn_group_banner 220 'testsuite-amdsdk2_9.at:134' \
-  "concurrentkernel" "                               " 13
-at_xfail=no
-(
-  $as_echo "220. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:134" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:134"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:134" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:134"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:135: make test_ConcurrentKernel -sC \$abs_top_builddir/examples/AMDSDK2.9 | egrep \"Passed|failed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:135"
-( $at_check_trace; make test_ConcurrentKernel -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo " Sequential Kernel verification : Passed!
- Concurrent Kernel verification : Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:135"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_220
-#AT_START_221
-at_fn_group_banner 221 'testsuite-amdsdk2_9.at:141' \
-  "constantbandwidth" "                              " 13
-at_xfail=no
-(
-  $as_echo "221. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:141" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:141"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:141" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:141"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:142: make test_ConstantBandwidth -sC \$abs_top_builddir/examples/AMDSDK2.9 | egrep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:142"
-( $at_check_trace; make test_ConstantBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-Passed!
-Passed!
-Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:142"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_221
-#AT_START_222
-at_fn_group_banner 222 'testsuite-amdsdk2_9.at:150' \
-  "cpluspluswrapper" "                               " 13
-at_xfail=no
-(
-  $as_echo "222. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:150" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:150"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:150" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:150"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:151: make test_CplusplusWrapper -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:151"
-( $at_check_trace; make test_CplusplusWrapper -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:151"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_222
-#AT_START_223
-at_fn_group_banner 223 'testsuite-amdsdk2_9.at:156' \
-  "dct" "                                            " 13
-at_xfail=no
-(
-  $as_echo "223. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:156" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:156"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:156" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:156"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:157: make test_DCT -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:157"
-( $at_check_trace; make test_DCT -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:157"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_223
-#AT_START_224
-at_fn_group_banner 224 'testsuite-amdsdk2_9.at:162' \
-  "devicefission" "                                  " 13
-at_xfail=no
-(
-  $as_echo "224. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:162" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:162"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:162" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:162"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:163: make test_DeviceFission -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:163"
-( $at_check_trace; make test_DeviceFission -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:163"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_224
-#AT_START_225
-at_fn_group_banner 225 'testsuite-amdsdk2_9.at:168' \
-  "devicefission11ext" "                             " 13
-at_xfail=yes
-(
-  $as_echo "225. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:168" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:168"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:168" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:168"
-
-# Expected Error: Device does not support cl_ext_device_fission extension!
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:171: make test_DeviceFission11Ext -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:171"
-( $at_check_trace; make test_DeviceFission11Ext -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:171"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_225
-#AT_START_226
-at_fn_group_banner 226 'testsuite-amdsdk2_9.at:176' \
-  "dwthaar1d" "                                      " 13
-at_xfail=no
-      egrep -q "#define LLVM_3_4" $abs_top_builddir/config.h && at_xfail=yes
-(
-  $as_echo "226. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:176" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:176"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:176" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:176"
-
-# 3.5 introduced the noduplicate attribute which, when
-# used with barrier(), fixes this.
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:180: make test_DwtHaar1D -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:180"
-( $at_check_trace; make test_DwtHaar1D -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:180"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_226
-#AT_START_227
-at_fn_group_banner 227 'testsuite-amdsdk2_9.at:185' \
-  "dwthaar1dcppkernel" "                             " 13
-at_xfail=yes
-(
-  $as_echo "227. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:185" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:185"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:185" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:185"
-
-# Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:188: make test_DwtHaar1DCPPKernel -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:188"
-( $at_check_trace; make test_DwtHaar1DCPPKernel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:188"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_227
-#AT_START_228
-at_fn_group_banner 228 'testsuite-amdsdk2_9.at:193' \
-  "eigenvalue" "                                     " 13
-at_xfail=yes
-(
-  $as_echo "228. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:193" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:193"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:193" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:193"
-
-# Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:196: make test_EigenValue -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:196"
-( $at_check_trace; make test_EigenValue -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:196"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_228
-#AT_START_229
-at_fn_group_banner 229 'testsuite-amdsdk2_9.at:201' \
-  "fastwalshtransform" "                             " 13
-at_xfail=no
-(
-  $as_echo "229. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:201" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:201"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:201" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:201"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:202: make test_FastWalshTransform -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:202"
-( $at_check_trace; make test_FastWalshTransform -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:202"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_229
-#AT_START_230
-at_fn_group_banner 230 'testsuite-amdsdk2_9.at:207' \
-  "floydwarshall" "                                  " 13
-at_xfail=no
-(
-  $as_echo "230. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:207" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:207"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:207" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:207"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:208: make test_FloydWarshall -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:208"
-( $at_check_trace; make test_FloydWarshall -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:208"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_230
-#AT_START_231
-at_fn_group_banner 231 'testsuite-amdsdk2_9.at:213' \
-  "fft" "                                            " 13
-at_xfail=no
-(
-  $as_echo "231. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:213" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:213"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:213" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:213"
-
-# Build parameter clc++ not supported
-$as_echo "testsuite-amdsdk2_9.at:215" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:215"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:216: make test_FFT -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:216"
-( $at_check_trace; make test_FFT -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:216"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_231
-#AT_START_232
-at_fn_group_banner 232 'testsuite-amdsdk2_9.at:221' \
-  "fluidsimulation2d" "                              " 13
-at_xfail=yes
-(
-  $as_echo "232. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:221" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:221"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:221" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:221"
-
-# error: can't convert between vector values of different size ('uint' (aka 'unsigned int') and 'int8')
-# It should be a legal implicit conversion according to 6.3 Operators. Some other error makes it
-# break with Intel OCL also.
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:227: make test_FluidSimulation2D -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:227"
-( $at_check_trace; make test_FluidSimulation2D -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:227"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_232
-#AT_START_233
-at_fn_group_banner 233 'testsuite-amdsdk2_9.at:232' \
-  "gaussiannoise" "                                  " 13
-at_xfail=no
-(
-  $as_echo "233. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:232" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:232"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:232" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:232"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:233: make test_GaussianNoise -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed | sed 's/ //g'"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:233"
-( $at_check_trace; make test_GaussianNoise -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed | sed 's/ //g'
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:233"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_233
-#AT_START_234
-at_fn_group_banner 234 'testsuite-amdsdk2_9.at:238' \
-  "gaussiannoisegl" "                                " 13
-at_xfail=no
-(
-  $as_echo "234. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:238" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:238"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:238" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:238"
-
-#doesnt work
-$as_echo "testsuite-amdsdk2_9.at:240" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:240"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:241: make test_GaussianNoiseGL -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:241"
-( $at_check_trace; make test_GaussianNoiseGL -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:241"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_234
-#AT_START_235
-at_fn_group_banner 235 'testsuite-amdsdk2_9.at:253' \
-  "hdrtonemapping" "                                 " 13
-at_xfail=no
-(
-  $as_echo "235. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:253" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:253"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:253" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:253"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:254: make test_HDRToneMapping -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:254"
-( $at_check_trace; make test_HDRToneMapping -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:254"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_235
-#AT_START_236
-at_fn_group_banner 236 'testsuite-amdsdk2_9.at:259' \
-  "helloworld" "                                     " 13
-at_xfail=no
-(
-  $as_echo "236. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:259" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:259"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:259" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:259"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:260: make test_HelloWorld -sC \$abs_top_builddir/examples/AMDSDK2.9 | egrep \"GdkknVnqkc|HelloWorld\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:260"
-( $at_check_trace; make test_HelloWorld -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "GdkknVnqkc|HelloWorld"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "GdkknVnqkc
-HelloWorld
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:260"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_236
-#AT_START_237
-at_fn_group_banner 237 'testsuite-amdsdk2_9.at:266' \
-  "histogram-repl" "                                 " 13
-at_xfail=no
-(
-  $as_echo "237. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:266" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:266"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:266" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:266"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:267: POCL_WORK_GROUP_METHOD=repl make test_Histogram -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:267"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl make test_Histogram -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:267"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_237
-#AT_START_238
-at_fn_group_banner 238 'testsuite-amdsdk2_9.at:272' \
-  "histogram-loops" "                                " 13
-at_xfail=no
-(
-  $as_echo "238. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:272" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:272"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:272" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:272"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:273: POCL_WORK_GROUP_METHOD=loops make test_Histogram -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:273"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops make test_Histogram -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:273"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_238
-#AT_START_239
-at_fn_group_banner 239 'testsuite-amdsdk2_9.at:278' \
-  "histogramatomic" "                                " 13
-at_xfail=no
-(
-  $as_echo "239. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:278" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:278"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:278" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:278"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:279: make test_HistogramAtomics -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:279"
-( $at_check_trace; make test_HistogramAtomics -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:279"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_239
-#AT_START_240
-at_fn_group_banner 240 'testsuite-amdsdk2_9.at:284' \
-  "imagebandwidth" "                                 " 13
-at_xfail=no
-(
-  $as_echo "240. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:284" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:284"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:284" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:284"
-
-# AT_SKIP_IF(true)
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:286: make test_ImageBandwidth -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:286"
-( $at_check_trace; make test_ImageBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:286"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_240
-#AT_START_241
-at_fn_group_banner 241 'testsuite-amdsdk2_9.at:292' \
-  "imageoverlap" "                                   " 13
-at_xfail=no
-(
-  $as_echo "241. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:292" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:292"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:292" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:292"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:293: make test_ImageOverlap -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:293"
-( $at_check_trace; make test_ImageOverlap -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Verifying result - Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:293"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_241
-#AT_START_242
-at_fn_group_banner 242 'testsuite-amdsdk2_9.at:298' \
-  "introstatickcppkernel" "                          " 13
-at_xfail=yes
-(
-  $as_echo "242. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:298" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:298"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:298" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:298"
-
-# Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:301: make test_IntroStaticCPPKernel -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:301"
-( $at_check_trace; make test_IntroStaticCPPKernel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Verifying result - Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:301"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_242
-#AT_START_243
-at_fn_group_banner 243 'testsuite-amdsdk2_9.at:306' \
-  "kernellauch" "                                    " 13
-at_xfail=no
-(
-  $as_echo "243. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:306" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:306"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:306" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:306"
-
-# Device does not support cl_khr_local_int32_base_atomics extension!
-# works anyway
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:309: make test_KernelLaunch -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed!"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:309"
-( $at_check_trace; make test_KernelLaunch -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed!
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:309"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_243
-#AT_START_244
-at_fn_group_banner 244 'testsuite-amdsdk2_9.at:314' \
-  "kmeansautoclustering" "                           " 13
-at_xfail=yes
-(
-  $as_echo "244. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:314" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:314"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:314" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:314"
-
-# doesn't find opencl library for some reason
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:317: make test_KmeansAutoclustering -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:317"
-( $at_check_trace; make test_KmeansAutoclustering -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:317"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_244
-#AT_START_245
-at_fn_group_banner 245 'testsuite-amdsdk2_9.at:323' \
-  "ldsbandwidth" "                                   " 13
-at_xfail=no
-(
-  $as_echo "245. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:323" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:323"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:323" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:323"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:324: make test_LDSBandwidth -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:324"
-( $at_check_trace; make test_LDSBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-Passed!
-Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:324"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_245
-#AT_START_246
-at_fn_group_banner 246 'testsuite-amdsdk2_9.at:331' \
-  "ludecomposition" "                                " 13
-at_xfail=no
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc && at_xfail=yes
-      grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv && at_xfail=yes
-(
-  $as_echo "246. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:331" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:331"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:331" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:331"
-
-
-#test uses doubles
-
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:336: make test_LUDecomposition -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:336"
-( $at_check_trace; make test_LUDecomposition -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:336"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_246
-#AT_START_247
-at_fn_group_banner 247 'testsuite-amdsdk2_9.at:341' \
-  "mandelbrot" "                                     " 13
-at_xfail=no
-(
-  $as_echo "247. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:341" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:341"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:341" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:341"
-
-$as_echo "testsuite-amdsdk2_9.at:342" >"$at_check_line_file"
-( grep "undef HAVE_GLEW" $abs_top_builddir/config.h ) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:342"
-$as_echo "testsuite-amdsdk2_9.at:343" >"$at_check_line_file"
-( ! test -e $abs_top_builddir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/cl/Mandelbrot/bin/x86_64/Release/Mandelbrot) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:343"
-# undefined symbol: _Z7std_fmaDv4_fS_S_ with VML
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:345: make test_Mandelbrot -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:345"
-( $at_check_trace; make test_Mandelbrot -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:345"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_247
-#AT_START_248
-at_fn_group_banner 248 'testsuite-amdsdk2_9.at:350' \
-  "matrixmuldouble" "                                " 13
-at_xfail=no
-(
-  $as_echo "248. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:350" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:350"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:350" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:350"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:351: make test_MatrixMulDouble -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:351"
-( $at_check_trace; make test_MatrixMulDouble -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:351"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_248
-#AT_START_249
-at_fn_group_banner 249 'testsuite-amdsdk2_9.at:356' \
-  "matrixmulimage" "                                 " 13
-at_xfail=no
-(
-  $as_echo "249. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:356" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:356"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:356" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:356"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:357: make test_MatrixMulImage -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:357"
-( $at_check_trace; make test_MatrixMulImage -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:357"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_249
-#AT_START_250
-at_fn_group_banner 250 'testsuite-amdsdk2_9.at:362' \
-  "matrixmultiplication" "                           " 13
-at_xfail=no
-(
-  $as_echo "250. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:362" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:362"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:362" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:362"
-
-# pocl error: encountered unimplemented part of the OpenCL specs in clCreateImage2D.c:119
-#AT_XFAIL_IF(true)
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:365: make test_MatrixMultiplication -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:365"
-( $at_check_trace; make test_MatrixMultiplication -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:365"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_250
-#AT_START_251
-at_fn_group_banner 251 'testsuite-amdsdk2_9.at:370' \
-  "matrixtranspose" "                                " 13
-at_xfail=no
-(
-  $as_echo "251. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:370" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:370"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:370" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:370"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:371: make test_MatrixTranspose -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:371"
-( $at_check_trace; make test_MatrixTranspose -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:371"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_251
-#AT_START_252
-at_fn_group_banner 252 'testsuite-amdsdk2_9.at:376' \
-  "memorymodel-repl" "                               " 13
-at_xfail=no
-(
-  $as_echo "252. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:376" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:376"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:376" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:376"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:377: POCL_WORK_GROUP_METHOD=repl make test_MemoryModel -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:377"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=repl make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:377"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_252
-#AT_START_253
-at_fn_group_banner 253 'testsuite-amdsdk2_9.at:382' \
-  "memorymodel-loops" "                              " 13
-at_xfail=no
-(
-  $as_echo "253. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:382" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:382"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:382" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:382"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:383: POCL_WORK_GROUP_METHOD=loops make test_MemoryModel -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:383"
-( $at_check_trace; POCL_WORK_GROUP_METHOD=loops make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:383"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_253
-#AT_START_254
-at_fn_group_banner 254 'testsuite-amdsdk2_9.at:388' \
-  "memoryoptimizations" "                            " 13
-at_xfail=no
-(
-  $as_echo "254. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:388" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:388"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:388" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:388"
-
-#Device does not support global_int32_base_atomics
-#AT_SKIP_IF(true)
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:391: make test_MemoryOptimizations -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:391"
-( $at_check_trace; make test_MemoryOptimizations -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:391"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_254
-#AT_START_255
-at_fn_group_banner 255 'testsuite-amdsdk2_9.at:418' \
-  "merzennetwister" "                                " 13
-at_xfail=no
-(
-  $as_echo "255. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:418" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:418"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:418" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:418"
-
-#Build parameter clc++ is not supported
-$as_echo "testsuite-amdsdk2_9.at:420" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:420"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:421: make test_MersenneTwister -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:421"
-( $at_check_trace; make test_MersenneTwister -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:421"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_255
-#AT_START_256
-at_fn_group_banner 256 'testsuite-amdsdk2_9.at:426' \
-  "montecarloasian" "                                " 13
-at_xfail=yes
-(
-  $as_echo "256. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:426" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:426"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:426" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:426"
-
-# kernel compilation fails due to
-# error: can't convert between vector values of different size ('float4' and 'int')
-# It should be a legal implicit conversion according to 6.3 Operators. Works also with Intel OCL
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:431: make test_MonteCarloAsian -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:431"
-( $at_check_trace; make test_MonteCarloAsian -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:431"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_256
-#AT_START_257
-at_fn_group_banner 257 'testsuite-amdsdk2_9.at:436' \
-  "montecarloasiandp" "                              " 13
-at_xfail=yes
-(
-  $as_echo "257. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:436" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:436"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:436" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:436"
-
-
-# error: can't convert between vector values of different size ('double4' and 'int')
-# It should be a legal implicit conversion according to 6.3 Operators. Works also with Intel OCL
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:441: make test_MonteCarloAsianDP -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:441"
-( $at_check_trace; make test_MonteCarloAsianDP -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:441"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_257
-#AT_START_258
-at_fn_group_banner 258 'testsuite-amdsdk2_9.at:446' \
-  "montecarloasianmultigpu" "                        " 13
-at_xfail=no
-(
-  $as_echo "258. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:446" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:446"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:446" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:446"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:447: make test_MonteCarloAsianMultiGPU -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:447"
-( $at_check_trace; make test_MonteCarloAsianMultiGPU -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:447"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_258
-#AT_START_259
-at_fn_group_banner 259 'testsuite-amdsdk2_9.at:452' \
-  "nbody" "                                          " 13
-at_xfail=no
-(
-  $as_echo "259. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:452" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:452"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:452" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:452"
-
-$as_echo "testsuite-amdsdk2_9.at:453" >"$at_check_line_file"
-( grep "undef HAVE_GLEW" $abs_top_builddir/config.h ) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:453"
-$as_echo "testsuite-amdsdk2_9.at:454" >"$at_check_line_file"
-( ! test -e $abs_top_builddir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/cl/NBody/bin/x86_64/Release/NBody) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:454"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:455: make test_NBody -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:455"
-( $at_check_trace; make test_NBody -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:455"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_259
-#AT_START_260
-at_fn_group_banner 260 'testsuite-amdsdk2_9.at:460' \
-  "prefixsum" "                                      " 13
-at_xfail=no
-(
-  $as_echo "260. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:460" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:460"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:460" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:460"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:461: make test_PrefixSum -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:461"
-( $at_check_trace; make test_PrefixSum -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:461"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_260
-#AT_START_261
-at_fn_group_banner 261 'testsuite-amdsdk2_9.at:466' \
-  "quasirandomsequence" "                            " 13
-at_xfail=no
-(
-  $as_echo "261. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:466" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:466"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:466" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:466"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:467: make test_QuasiRandomSequence -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:467"
-( $at_check_trace; make test_QuasiRandomSequence -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:467"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_261
-#AT_START_262
-at_fn_group_banner 262 'testsuite-amdsdk2_9.at:472' \
-  "radixsort" "                                      " 13
-at_xfail=no
-(
-  $as_echo "262. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:472" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:472"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:472" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:472"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:473: make test_RadixSort -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:473"
-( $at_check_trace; make test_RadixSort -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:473"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_262
-#AT_START_263
-at_fn_group_banner 263 'testsuite-amdsdk2_9.at:478' \
-  "recursivegaussian" "                              " 13
-at_xfail=no
-(
-  $as_echo "263. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:478" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:478"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:478" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:478"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:479: make test_RecursiveGaussian -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:479"
-( $at_check_trace; make test_RecursiveGaussian -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:479"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_263
-#AT_START_264
-at_fn_group_banner 264 'testsuite-amdsdk2_9.at:484' \
-  "reduction" "                                      " 13
-at_xfail=no
-(
-  $as_echo "264. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:484" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:484"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:484" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:484"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:485: make test_Reduction -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:485"
-( $at_check_trace; make test_Reduction -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:485"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_264
-#AT_START_265
-at_fn_group_banner 265 'testsuite-amdsdk2_9.at:490' \
-  "scanlargearrays" "                                " 13
-at_xfail=no
-(
-  $as_echo "265. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:490" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:490"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:490" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:490"
-
-# Fails with vectorization. With wiloops and no unrolling, the vectorization won't apply.
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:492: make test_ScanLargeArrays -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:492"
-( $at_check_trace; make test_ScanLargeArrays -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:492"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_265
-#AT_START_266
-at_fn_group_banner 266 'testsuite-amdsdk2_9.at:497' \
-  "simpleconvolution" "                              " 13
-at_xfail=no
-(
-  $as_echo "266. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:497" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:497"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:497" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:497"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:498: make test_SimpleConvolution -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:498"
-( $at_check_trace; make test_SimpleConvolution -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:498"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_266
-#AT_START_267
-at_fn_group_banner 267 'testsuite-amdsdk2_9.at:503' \
-  "simplegl" "                                       " 13
-at_xfail=yes
-(
-  $as_echo "267. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:503" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:503"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:503" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:503"
-
-# doesn't find opecl library
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:506: make test_SimpleGL -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:506"
-( $at_check_trace; make test_SimpleGL -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:506"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_267
-#AT_START_268
-at_fn_group_banner 268 'testsuite-amdsdk2_9.at:511' \
-  "simpleimage" "                                    " 13
-at_xfail=no
-(
-  $as_echo "268. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:511" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:511"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:511" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:511"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:512: make test_SimpleImage -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:512"
-( $at_check_trace; make test_SimpleImage -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Verifying 2D copy result - Passed!
-Verifying 3D copy result - Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:512"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_268
-#AT_START_269
-at_fn_group_banner 269 'testsuite-amdsdk2_9.at:518' \
-  "soaversusaos" "                                   " 13
-at_xfail=no
-(
-  $as_echo "269. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:518" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:518"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:518" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:518"
-
-#Build Options are : -x clc++ -D num1=4096 -D num2=4096
-#Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS
-$as_echo "testsuite-amdsdk2_9.at:521" >"$at_check_line_file"
-at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:521"
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:522: make test_SoAversusAoS -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:522"
-( $at_check_trace; make test_SoAversusAoS -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:522"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_269
-#AT_START_270
-at_fn_group_banner 270 'testsuite-amdsdk2_9.at:527' \
-  "sobelfilter" "                                    " 13
-at_xfail=no
-(
-  $as_echo "270. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:527" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:527"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:527" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:527"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:528: make test_SobelFilter -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:528"
-( $at_check_trace; make test_SobelFilter -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:528"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_270
-#AT_START_271
-at_fn_group_banner 271 'testsuite-amdsdk2_9.at:533' \
-  "sobelfilterimage" "                               " 13
-at_xfail=yes
-(
-  $as_echo "271. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:533" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:533"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:533" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:533"
-
-# segfault
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:536: make test_SobelFilterImage -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:536"
-( $at_check_trace; make test_SobelFilterImage -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:536"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_271
-#AT_START_272
-at_fn_group_banner 272 'testsuite-amdsdk2_9.at:541' \
-  "stringsearch" "                                   " 13
-at_xfail=no
-(
-  $as_echo "272. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:541" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:541"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:541" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:541"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:542: make test_StringSearch -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:542"
-( $at_check_trace; make test_StringSearch -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:542"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_272
-#AT_START_273
-at_fn_group_banner 273 'testsuite-amdsdk2_9.at:548' \
-  "template" "                                       " 13
-at_xfail=no
-(
-  $as_echo "273. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:548" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:548"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:548" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:548"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:549: make test_Template -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:549"
-( $at_check_trace; make test_Template -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:549"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_273
-#AT_START_274
-at_fn_group_banner 274 'testsuite-amdsdk2_9.at:554' \
-  "transferoverlap" "                                " 13
-at_xfail=no
-(
-  $as_echo "274. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:554" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:554"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:554" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:554"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:555: make test_TransferOverlap -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep \"Passed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:555"
-( $at_check_trace; make test_TransferOverlap -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Passed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:555"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_274
-#AT_START_275
-at_fn_group_banner 275 'testsuite-amdsdk2_9.at:560' \
-  "transferoverlapcpp" "                             " 13
-at_xfail=yes
-(
-  $as_echo "275. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:560" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:560"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:560" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:560"
-
-# Expected Error: Device does not support cl_khr_local_int32_base_atomics extension! and segfault
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:563: make test_TransferOverlapCPP -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep \"Passed\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:563"
-( $at_check_trace; make test_TransferOverlapCPP -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Passed"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:563"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_275
-#AT_START_276
-at_fn_group_banner 276 'testsuite-amdsdk2_9.at:568' \
-  "unsharpmask" "                                    " 13
-at_xfail=yes
-(
-  $as_echo "276. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:568" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:568"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:568" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:568"
-
-# doesn't find opencl library
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:571: make test_UnsharpMask -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:571"
-( $at_check_trace; make test_UnsharpMask -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:571"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_276
-#AT_START_277
-at_fn_group_banner 277 'testsuite-amdsdk2_9.at:576' \
-  "urng" "                                           " 13
-at_xfail=no
-(
-  $as_echo "277. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:576" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:576"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:576" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:576"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:577: make test_URNG -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed | cut -c -7"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:577"
-( $at_check_trace; make test_URNG -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed | cut -c -7
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:577"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_277
-#AT_START_278
-at_fn_group_banner 278 'testsuite-amdsdk2_9.at:582' \
-  "urngnoisegl" "                                    " 13
-at_xfail=yes
-(
-  $as_echo "278. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-amdsdk2_9.at:582" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" amdsdk2_9 "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:582"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-amdsdk2_9.at:582" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-amdsdk2_9.at:582"
-
-# Error: clGetPlatformIDs failed. Error code : CL_PLATFORM_NOT_FOUND_KHR
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-amdsdk2_9.at:585: make test_URNGNoiseGL -sC \$abs_top_builddir/examples/AMDSDK2.9 | grep Passed | cut -c -7"
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-amdsdk2_9.at:585"
-( $at_check_trace; make test_URNGNoiseGL -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed | cut -c -7
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "Passed!
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-amdsdk2_9.at:585"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_278
-#AT_START_279
-at_fn_group_banner 279 'testsuite-vexcl.at:10' \
-  "fft" "                                            " 14
-at_xfail=no
-(
-  $as_echo "279. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:10" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:10"
-
-# Warnings ignored:
-# clang: warning: argument unused during compilation: '-cl-mad-enable'
-# clang: warning: argument unused during compilation: '-cl-fast-relaxed-math'
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:14: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./fft 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:14"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./fft 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:14"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_279
-#AT_START_280
-at_fn_group_banner 280 'testsuite-vexcl.at:19' \
-  "generator" "                                      " 14
-at_xfail=no
-(
-  $as_echo "280. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:19" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:19"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:20: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./generator 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:20"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./generator 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:20"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_280
-#AT_START_281
-at_fn_group_banner 281 'testsuite-vexcl.at:25' \
-  "multiple_objects" "                               " 14
-at_xfail=no
-(
-  $as_echo "281. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:25" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:25"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:26: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./multiple_objects 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:26"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./multiple_objects 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:26"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_281
-#AT_START_282
-at_fn_group_banner 282 'testsuite-vexcl.at:31' \
-  "multivector_arithmetics" "                        " 14
-at_xfail=no
-(
-  $as_echo "282. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:31" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:31"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:32: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./multivector_arithmetics 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:32"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./multivector_arithmetics 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:32"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_282
-#AT_START_283
-at_fn_group_banner 283 'testsuite-vexcl.at:37' \
-  "multivector_create" "                             " 14
-at_xfail=no
-(
-  $as_echo "283. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:37" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:37"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:38: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./multivector_create 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:38"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./multivector_create 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:38"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_283
-#AT_START_284
-at_fn_group_banner 284 'testsuite-vexcl.at:43' \
-  "random" "                                         " 14
-at_xfail=no
-(
-  $as_echo "284. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:43" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:43"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:44: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./random 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:44"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./random 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:44"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_284
-#AT_START_285
-at_fn_group_banner 285 'testsuite-vexcl.at:49' \
-  "spmv" "                                           " 14
-at_xfail=no
-(
-  $as_echo "285. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:49" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:49"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:50: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./spmv 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:50"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./spmv 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:50"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_285
-#AT_START_286
-at_fn_group_banner 286 'testsuite-vexcl.at:55' \
-  "stencil" "                                        " 14
-at_xfail=yes
-(
-  $as_echo "286. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:55" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:55"
-
-# Crash.
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:58: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./stencil 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:58"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./stencil 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:58"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_286
-#AT_START_287
-at_fn_group_banner 287 'testsuite-vexcl.at:63' \
-  "vector_arithmetics" "                             " 14
-at_xfail=no
-(
-  $as_echo "287. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:63" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:63"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:64: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./vector_arithmetics 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:64"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./vector_arithmetics 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:64"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_287
-#AT_START_288
-at_fn_group_banner 288 'testsuite-vexcl.at:69' \
-  "vector_copy" "                                    " 14
-at_xfail=no
-(
-  $as_echo "288. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:69" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:69"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:70: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./vector_copy 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:70"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./vector_copy 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:70"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_288
-#AT_START_289
-at_fn_group_banner 289 'testsuite-vexcl.at:75' \
-  "vector_create" "                                  " 14
-at_xfail=no
-(
-  $as_echo "289. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-vexcl.at:75" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" vexcl "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-vexcl.at:75"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-vexcl.at:76: cd \$abs_top_builddir/examples/VexCL/vexcl/tests ; ./vector_create 2>&1 | grep \"No errors\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-vexcl.at:76"
-( $at_check_trace; cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./vector_create 2>&1 | grep "No errors"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo >>"$at_stdout"; $as_echo "*** No errors detected
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-vexcl.at:76"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_289
-#AT_START_290
-at_fn_group_banner 290 'testsuite-halide.at:11' \
-  "tutorial12" "                                     " 15
-at_xfail=no
-(
-  $as_echo "290. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-halide.at:11" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" Halide "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-halide.at:11"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-halide.at:11" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-halide.at:11"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-halide.at:12: cd \$abs_top_builddir/examples/Halide/Halide/tutorial ;
-LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ../bin/tutorial_lesson_12_using_the_gpu | cut -d ' ' -f 2"
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-halide.at:12"
-( $at_check_trace; cd $abs_top_builddir/examples/Halide/Halide/tutorial ;
-LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ../bin/tutorial_lesson_12_using_the_gpu | cut -d ' ' -f 2
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo "performance
-milliseconds
-performance
-milliseconds
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-halide.at:12"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_290
-#AT_START_291
-at_fn_group_banner 291 'testsuite-halide.at:21' \
-  "bilateral_grid" "                                 " 15
-at_xfail=no
-(
-  $as_echo "291. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-halide.at:21" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" Halide "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-halide.at:21"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-halide.at:21" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-halide.at:21"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-halide.at:22: cd \$abs_top_builddir/examples/Halide/Halide/apps/bilateral_grid ;
-LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ./filter ../images/gray.png out.png 0.1"
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-halide.at:22"
-( $at_check_trace; cd $abs_top_builddir/examples/Halide/Halide/apps/bilateral_grid ;
-LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ./filter ../images/gray.png out.png 0.1
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo stdout:; cat "$at_stdout"
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-halide.at:22"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_291
-#AT_START_292
-at_fn_group_banner 292 'testsuite-halide.at:26' \
-  "interpolate" "                                    " 15
-at_xfail=no
-(
-  $as_echo "292. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-halide.at:26" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" Halide "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-halide.at:26"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-halide.at:26" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-halide.at:26"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-halide.at:27: cd \$abs_top_builddir/examples/Halide/Halide/apps/interpolate ;
-LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ./interpolate ../images/rgba.png out.png"
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-halide.at:27"
-( $at_check_trace; cd $abs_top_builddir/examples/Halide/Halide/apps/interpolate ;
-LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ./interpolate ../images/rgba.png out.png
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo stdout:; cat "$at_stdout"
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-halide.at:27"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_292
-#AT_START_293
-at_fn_group_banner 293 'testsuite-halide.at:31' \
-  "local_laplacian" "                                " 15
-at_xfail=no
-(
-  $as_echo "293. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-halide.at:31" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" Halide "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-halide.at:31"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-halide.at:31" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-halide.at:31"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-halide.at:32: cd \$abs_top_builddir/examples/Halide/Halide/apps/local_laplacian ;
-LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ./process ../images/rgb.png 8 1 1 out.png"
-at_fn_check_prepare_notrace 'a ${...} parameter expansion' "testsuite-halide.at:32"
-( $at_check_trace; cd $abs_top_builddir/examples/Halide/Halide/apps/local_laplacian ;
-LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ./process ../images/rgb.png 8 1 1 out.png
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo stdout:; cat "$at_stdout"
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-halide.at:32"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_293
-#AT_START_294
-at_fn_group_banner 294 'testsuite-cloverleaf.at:11' \
-  "cloverleaf" "                                     " 16
-at_xfail=no
-(
-  $as_echo "294. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-cloverleaf.at:11" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" CloverLeaf "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-cloverleaf.at:11"
-  # AMD APP SDK tests require ICD loader with
-  $as_echo "testsuite-cloverleaf.at:11" >"$at_check_line_file"
-(! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-cloverleaf.at:11"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-cloverleaf.at:12: cd \$abs_top_builddir/examples/CloverLeaf/CloverLeaf_OpenCL ;
-./clover_leaf 2>&1 |grep First | cut -c-6
-"
-at_fn_check_prepare_notrace 'an embedded newline' "testsuite-cloverleaf.at:12"
-( $at_check_trace; cd $abs_top_builddir/examples/CloverLeaf/CloverLeaf_OpenCL ;
-./clover_leaf 2>&1 |grep First | cut -c-6
-
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-echo >>"$at_stdout"; $as_echo " First
-" | \
-  $at_diff - "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-cloverleaf.at:12"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_294
-#AT_START_295
-at_fn_group_banner 295 'testsuite-piglit.at:3' \
-  "Piglit testsuite with LLVM 3.5" "                 " 17
-at_xfail=no
-(
-  $as_echo "295. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-$as_echo "testsuite-piglit.at:5" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" piglit "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-piglit.at:5"
-$as_echo "testsuite-piglit.at:6" >"$at_check_line_file"
-(! grep -q "#define LLVM_3_5" $abs_top_builddir/config.h) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-piglit.at:6"
-{ set +x
-$as_echo "$at_srcdir/testsuite-piglit.at:7: cd \$abs_top_builddir/examples/piglit/; ./produce_results.sh "
-at_fn_check_prepare_dynamic "cd $abs_top_builddir/examples/piglit/; ./produce_results.sh " "testsuite-piglit.at:7"
-( $at_check_trace; cd $abs_top_builddir/examples/piglit/; ./produce_results.sh
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-echo stderr:; cat "$at_stderr"
-echo stdout:; cat "$at_stdout"
-at_fn_check_skip $at_status "$at_srcdir/testsuite-piglit.at:7"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-piglit.at:8: cd \$abs_top_builddir/examples/piglit/; LC_ALL=C comm -23 sorted_ref_llvm_3.5 sorted_result"
-at_fn_check_prepare_dynamic "cd $abs_top_builddir/examples/piglit/; LC_ALL=C comm -23 sorted_ref_llvm_3.5 sorted_result" "testsuite-piglit.at:8"
-( $at_check_trace; cd $abs_top_builddir/examples/piglit/; LC_ALL=C comm -23 sorted_ref_llvm_3.5 sorted_result
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 0 $at_status "$at_srcdir/testsuite-piglit.at:8"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_295
-#AT_START_296
-at_fn_group_banner 296 'testsuite-opencv.at:9' \
-  "UMat" "                                           " 18
-at_xfail=no
-(
-  $as_echo "296. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:9" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:9"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:10: make test_UMat -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:10"
-( $at_check_trace; make test_UMat -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:10"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_296
-#AT_START_297
-at_fn_group_banner 297 'testsuite-opencv.at:14' \
-  "Core_UMat" "                                      " 18
-at_xfail=no
-(
-  $as_echo "297. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:14" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:14"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:15: make test_Core_UMat -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:15"
-( $at_check_trace; make test_Core_UMat -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:15"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_297
-#AT_START_298
-at_fn_group_banner 298 'testsuite-opencv.at:19' \
-  "Image2D" "                                        " 18
-at_xfail=no
-(
-  $as_echo "298. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:19" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:19"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:20: make test_Image2D -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:20"
-( $at_check_trace; make test_Image2D -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:20"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_298
-#AT_START_299
-at_fn_group_banner 299 'testsuite-opencv.at:24' \
-  "UMatBasicTests" "                                 " 18
-at_xfail=no
-(
-  $as_echo "299. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:24" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:24"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:25: make test_UMat/UMatBasicTests -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:25"
-( $at_check_trace; make test_UMat/UMatBasicTests -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:25"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_299
-#AT_START_300
-at_fn_group_banner 300 'testsuite-opencv.at:29' \
-  "UMatTestReshape" "                                " 18
-at_xfail=no
-(
-  $as_echo "300. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:29" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:29"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:30: make test_UMat/UMatTestReshape -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:30"
-( $at_check_trace; make test_UMat/UMatTestReshape -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:30"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_300
-#AT_START_301
-at_fn_group_banner 301 'testsuite-opencv.at:34' \
-  "UMatTestRoi" "                                    " 18
-at_xfail=no
-(
-  $as_echo "301. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:34" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:34"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:35: make test_UMat/UMatTestRoi -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:35"
-( $at_check_trace; make test_UMat/UMatTestRoi -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:35"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_301
-#AT_START_302
-at_fn_group_banner 302 'testsuite-opencv.at:39' \
-  "UMatTestSizeOperations" "                         " 18
-at_xfail=no
-(
-  $as_echo "302. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:39" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:39"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:40: make test_UMat/UMatTestSizeOperations -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:40"
-( $at_check_trace; make test_UMat/UMatTestSizeOperations -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:40"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_302
-#AT_START_303
-at_fn_group_banner 303 'testsuite-opencv.at:44' \
-  "UMatTestUMatOperations" "                         " 18
-at_xfail=no
-(
-  $as_echo "303. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:44" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:44"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:45: make test_UMat/UMatTestUMatOperations -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:45"
-( $at_check_trace; make test_UMat/UMatTestUMatOperations -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:45"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_303
-#AT_START_304
-at_fn_group_banner 304 'testsuite-opencv.at:51' \
-  "OCL_Channels/Merge" "                             " 19
-at_xfail=no
-(
-  $as_echo "304. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:51" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:51"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:52: make test_OCL_Channels/Merge -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:52"
-( $at_check_trace; make test_OCL_Channels/Merge -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:52"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_304
-#AT_START_305
-at_fn_group_banner 305 'testsuite-opencv.at:56' \
-  "OCL_Channels/Split" "                             " 19
-at_xfail=no
-(
-  $as_echo "305. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:56" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:56"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:57: make test_OCL_Channels/Split -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:57"
-( $at_check_trace; make test_OCL_Channels/Split -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:57"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_305
-#AT_START_306
-at_fn_group_banner 306 'testsuite-opencv.at:61' \
-  "OCL_Channels/MixChannels" "                       " 19
-at_xfail=no
-(
-  $as_echo "306. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:61" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:61"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:62: make test_OCL_Channels/MixChannels -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:62"
-( $at_check_trace; make test_OCL_Channels/MixChannels -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:62"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_306
-#AT_START_307
-at_fn_group_banner 307 'testsuite-opencv.at:66' \
-  "OCL_Channels/InsertChannel" "                     " 19
-at_xfail=no
-(
-  $as_echo "307. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:66" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:66"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:67: make test_OCL_Channels/InsertChannels -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:67"
-( $at_check_trace; make test_OCL_Channels/InsertChannels -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:67"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_307
-#AT_START_308
-at_fn_group_banner 308 'testsuite-opencv.at:71' \
-  "OCL_Channels/ExtractChannel" "                    " 19
-at_xfail=no
-(
-  $as_echo "308. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:71" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:71"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:72: make test_OCL_Channels/ExtractChannels -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:72"
-( $at_check_trace; make test_OCL_Channels/ExtractChannels -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:72"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_308
-#AT_START_309
-at_fn_group_banner 309 'testsuite-opencv.at:78' \
-  "Lut" "                                            " 20
-at_xfail=no
-(
-  $as_echo "309. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:78" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:78"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:79: make test_OCL_Arithm/Lut -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:79"
-( $at_check_trace; make test_OCL_Arithm/Lut -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:79"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_309
-#AT_START_310
-at_fn_group_banner 310 'testsuite-opencv.at:83' \
-  "Add" "                                            " 20
-at_xfail=no
-(
-  $as_echo "310. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:83" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:83"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:84: make test_OCL_Arithm/Add -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:84"
-( $at_check_trace; make test_OCL_Arithm/Add -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:84"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_310
-#AT_START_311
-at_fn_group_banner 311 'testsuite-opencv.at:88' \
-  "Subtract" "                                       " 20
-at_xfail=no
-(
-  $as_echo "311. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:88" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:88"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:89: make test_OCL_Arithm/Subtract -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:89"
-( $at_check_trace; make test_OCL_Arithm/Subtract -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:89"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_311
-#AT_START_312
-at_fn_group_banner 312 'testsuite-opencv.at:93' \
-  "Mul" "                                            " 20
-at_xfail=no
-(
-  $as_echo "312. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:93" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:93"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:94: make test_OCL_Arithm/Mul -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:94"
-( $at_check_trace; make test_OCL_Arithm/Mul -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:94"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_312
-#AT_START_313
-at_fn_group_banner 313 'testsuite-opencv.at:98' \
-  "Div" "                                            " 20
-at_xfail=no
-(
-  $as_echo "313. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:98" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:98"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:99: make test_OCL_Arithm/Div -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:99"
-( $at_check_trace; make test_OCL_Arithm/Div -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:99"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_313
-#AT_START_314
-at_fn_group_banner 314 'testsuite-opencv.at:103' \
-  "Min" "                                            " 20
-at_xfail=no
-(
-  $as_echo "314. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:103" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:103"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:104: make test_OCL_Arithm/Min -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:104"
-( $at_check_trace; make test_OCL_Arithm/Min -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:104"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_314
-#AT_START_315
-at_fn_group_banner 315 'testsuite-opencv.at:108' \
-  "Max" "                                            " 20
-at_xfail=no
-(
-  $as_echo "315. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:108" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:108"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:109: make test_OCL_Arithm/Max -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:109"
-( $at_check_trace; make test_OCL_Arithm/Max -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:109"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_315
-#AT_START_316
-at_fn_group_banner 316 'testsuite-opencv.at:113' \
-  "AddWeighted" "                                    " 20
-at_xfail=no
-(
-  $as_echo "316. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:113" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:113"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:114: make test_OCL_Arithm/AddWeighted -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:114"
-( $at_check_trace; make test_OCL_Arithm/AddWeighted -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:114"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_316
-#AT_START_317
-at_fn_group_banner 317 'testsuite-opencv.at:118' \
-  "Absdiff" "                                        " 20
-at_xfail=no
-(
-  $as_echo "317. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:118" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:118"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:119: make test_OCL_Arithm/Absdiff -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:119"
-( $at_check_trace; make test_OCL_Arithm/Absdiff -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:119"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_317
-#AT_START_318
-at_fn_group_banner 318 'testsuite-opencv.at:123' \
-  "CartToPolar" "                                    " 20
-at_xfail=no
-(
-  $as_echo "318. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:123" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:123"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:124: make test_OCL_Arithm/CartToPolar -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:124"
-( $at_check_trace; make test_OCL_Arithm/CartToPolar -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:124"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_318
-#AT_START_319
-at_fn_group_banner 319 'testsuite-opencv.at:128' \
-  "PolarToCart" "                                    " 20
-at_xfail=no
-(
-  $as_echo "319. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:128" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:128"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:129: make test_OCL_Arithm/PolarToCart -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:129"
-( $at_check_trace; make test_OCL_Arithm/PolarToCart -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:129"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_319
-#AT_START_320
-at_fn_group_banner 320 'testsuite-opencv.at:133' \
-  "Transpose" "                                      " 20
-at_xfail=no
-(
-  $as_echo "320. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:133" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:133"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:134: make test_OCL_Arithm/Transpose -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:134"
-( $at_check_trace; make test_OCL_Arithm/Transpose -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:134"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_320
-#AT_START_321
-at_fn_group_banner 321 'testsuite-opencv.at:138' \
-  "Bitwise_and" "                                    " 20
-at_xfail=no
-(
-  $as_echo "321. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:138" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:138"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:139: make test_OCL_Arithm/Bitwise_and -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:139"
-( $at_check_trace; make test_OCL_Arithm/Bitwise_and -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:139"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_321
-#AT_START_322
-at_fn_group_banner 322 'testsuite-opencv.at:143' \
-  "Bitwise_or" "                                     " 20
-at_xfail=no
-(
-  $as_echo "322. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:143" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:143"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:144: make test_OCL_Arithm/Bitwise_or -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:144"
-( $at_check_trace; make test_OCL_Arithm/Bitwise_or -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:144"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_322
-#AT_START_323
-at_fn_group_banner 323 'testsuite-opencv.at:148' \
-  "Bitwise_xor" "                                    " 20
-at_xfail=no
-(
-  $as_echo "323. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:148" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:148"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:149: make test_OCL_Arithm/Bitwise_xor -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:149"
-( $at_check_trace; make test_OCL_Arithm/Bitwise_xor -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:149"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_323
-#AT_START_324
-at_fn_group_banner 324 'testsuite-opencv.at:153' \
-  "Bitwise_not" "                                    " 20
-at_xfail=no
-(
-  $as_echo "324. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:153" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:153"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:154: make test_OCL_Arithm/Bitwise_not -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:154"
-( $at_check_trace; make test_OCL_Arithm/Bitwise_not -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:154"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_324
-#AT_START_325
-at_fn_group_banner 325 'testsuite-opencv.at:158' \
-  "Compare" "                                        " 20
-at_xfail=no
-(
-  $as_echo "325. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:158" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:158"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:159: make test_OCL_Arithm/Compare -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:159"
-( $at_check_trace; make test_OCL_Arithm/Compare -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:159"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_325
-#AT_START_326
-at_fn_group_banner 326 'testsuite-opencv.at:163' \
-  "Pow" "                                            " 20
-at_xfail=no
-(
-  $as_echo "326. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:163" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:163"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:164: make test_OCL_Arithm/Pow -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:164"
-( $at_check_trace; make test_OCL_Arithm/Pow -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:164"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_326
-#AT_START_327
-at_fn_group_banner 327 'testsuite-opencv.at:168' \
-  "SetIdentity" "                                    " 20
-at_xfail=no
-(
-  $as_echo "327. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:168" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:168"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:169: make test_OCL_Arithm/SetIdentity -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:169"
-( $at_check_trace; make test_OCL_Arithm/SetIdentity -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:169"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_327
-#AT_START_328
-at_fn_group_banner 328 'testsuite-opencv.at:173' \
-  "Repeat" "                                         " 20
-at_xfail=no
-(
-  $as_echo "328. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:173" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:173"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:174: make test_OCL_Arithm/Repeat -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:174"
-( $at_check_trace; make test_OCL_Arithm/Repeat -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:174"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_328
-#AT_START_329
-at_fn_group_banner 329 'testsuite-opencv.at:178' \
-  "CountNonZero" "                                   " 20
-at_xfail=no
-(
-  $as_echo "329. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:178" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:178"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:179: make test_OCL_Arithm/CountNonZero -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:179"
-( $at_check_trace; make test_OCL_Arithm/CountNonZero -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:179"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_329
-#AT_START_330
-at_fn_group_banner 330 'testsuite-opencv.at:183' \
-  "Sum" "                                            " 20
-at_xfail=no
-(
-  $as_echo "330. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:183" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:183"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:184: make test_OCL_Arithm/Sum -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:184"
-( $at_check_trace; make test_OCL_Arithm/Sum -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:184"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_330
-#AT_START_331
-at_fn_group_banner 331 'testsuite-opencv.at:188' \
-  "MeanStdDev" "                                     " 20
-at_xfail=no
-(
-  $as_echo "331. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:188" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:188"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:189: make test_OCL_Arithm/MeanStdDev -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:189"
-( $at_check_trace; make test_OCL_Arithm/MeanStdDev -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:189"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_331
-#AT_START_332
-at_fn_group_banner 332 'testsuite-opencv.at:193' \
-  "Log" "                                            " 20
-at_xfail=no
-(
-  $as_echo "332. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:193" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:193"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:194: make test_OCL_Arithm/Log -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:194"
-( $at_check_trace; make test_OCL_Arithm/Log -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:194"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_332
-#AT_START_333
-at_fn_group_banner 333 'testsuite-opencv.at:198' \
-  "Exp" "                                            " 20
-at_xfail=no
-(
-  $as_echo "333. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:198" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:198"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:199: make test_OCL_Arithm/Exp -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:199"
-( $at_check_trace; make test_OCL_Arithm/Exp -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:199"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_333
-#AT_START_334
-at_fn_group_banner 334 'testsuite-opencv.at:203' \
-  "Phase" "                                          " 20
-at_xfail=no
-(
-  $as_echo "334. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:203" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:203"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:204: make test_OCL_Arithm/Phase -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:204"
-( $at_check_trace; make test_OCL_Arithm/Phase -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:204"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_334
-#AT_START_335
-at_fn_group_banner 335 'testsuite-opencv.at:208' \
-  "Magnitude" "                                      " 20
-at_xfail=no
-(
-  $as_echo "335. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:208" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:208"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:209: make test_OCL_Arithm/Magnitude -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:209"
-( $at_check_trace; make test_OCL_Arithm/Magnitude -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:209"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_335
-#AT_START_336
-at_fn_group_banner 336 'testsuite-opencv.at:213' \
-  "Flip" "                                           " 20
-at_xfail=no
-(
-  $as_echo "336. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:213" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:213"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:214: make test_OCL_Arithm/Flip -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:214"
-( $at_check_trace; make test_OCL_Arithm/Flip -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:214"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_336
-#AT_START_337
-at_fn_group_banner 337 'testsuite-opencv.at:218' \
-  "MinMaxIdx" "                                      " 20
-at_xfail=no
-(
-  $as_echo "337. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:218" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:218"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:219: make test_OCL_Arithm/MinMaxIdx -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:219"
-( $at_check_trace; make test_OCL_Arithm/MinMaxIdx -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:219"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_337
-#AT_START_338
-at_fn_group_banner 338 'testsuite-opencv.at:223' \
-  "MinMaxIdx_Mask" "                                 " 20
-at_xfail=no
-(
-  $as_echo "338. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:223" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:223"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:224: make test_OCL_Arithm/MinMaxIdx_Mask -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:224"
-( $at_check_trace; make test_OCL_Arithm/MinMaxIdx_Mask -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:224"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_338
-#AT_START_339
-at_fn_group_banner 339 'testsuite-opencv.at:228' \
-  "Norm" "                                           " 20
-at_xfail=no
-(
-  $as_echo "339. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:228" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:228"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:229: make test_OCL_Arithm/Norm -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:229"
-( $at_check_trace; make test_OCL_Arithm/Norm -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:229"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_339
-#AT_START_340
-at_fn_group_banner 340 'testsuite-opencv.at:233' \
-  "UMatDot" "                                        " 20
-at_xfail=no
-(
-  $as_echo "340. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:233" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:233"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:234: make test_OCL_Arithm/UMatDot -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:234"
-( $at_check_trace; make test_OCL_Arithm/UMatDot -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:234"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_340
-#AT_START_341
-at_fn_group_banner 341 'testsuite-opencv.at:238' \
-  "Sqrt" "                                           " 20
-at_xfail=no
-(
-  $as_echo "341. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:238" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:238"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:239: make test_OCL_Arithm/Sqrt -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:239"
-( $at_check_trace; make test_OCL_Arithm/Sqrt -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:239"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_341
-#AT_START_342
-at_fn_group_banner 342 'testsuite-opencv.at:243' \
-  "Normalize" "                                      " 20
-at_xfail=no
-(
-  $as_echo "342. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:243" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:243"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:244: make test_OCL_Arithm/Normalize -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:244"
-( $at_check_trace; make test_OCL_Arithm/Normalize -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:244"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_342
-#AT_START_343
-at_fn_group_banner 343 'testsuite-opencv.at:248' \
-  "InRange" "                                        " 20
-at_xfail=no
-(
-  $as_echo "343. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:248" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:248"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:249: make test_OCL_Arithm/InRange -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:249"
-( $at_check_trace; make test_OCL_Arithm/InRange -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:249"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_343
-#AT_START_344
-at_fn_group_banner 344 'testsuite-opencv.at:253' \
-  "ConvertScaleAbs" "                                " 20
-at_xfail=no
-(
-  $as_echo "344. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:253" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:253"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:254: make test_OCL_Arithm/ConvertScaleAbs -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:254"
-( $at_check_trace; make test_OCL_Arithm/ConvertScaleAbs -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:254"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_344
-#AT_START_345
-at_fn_group_banner 345 'testsuite-opencv.at:258' \
-  "ScaleAdd" "                                       " 20
-at_xfail=no
-(
-  $as_echo "345. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:258" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:258"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:259: make test_OCL_Arithm/ScaleAdd -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:259"
-( $at_check_trace; make test_OCL_Arithm/ScaleAdd -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:259"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_345
-#AT_START_346
-at_fn_group_banner 346 'testsuite-opencv.at:263' \
-  "PatchNaNs" "                                      " 20
-at_xfail=no
-(
-  $as_echo "346. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:263" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:263"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:264: make test_OCL_Arithm/PatchNaNs -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:264"
-( $at_check_trace; make test_OCL_Arithm/PatchNaNs -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:264"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_346
-#AT_START_347
-at_fn_group_banner 347 'testsuite-opencv.at:268' \
-  "Psnr" "                                           " 20
-at_xfail=no
-(
-  $as_echo "347. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:268" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:268"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:269: make test_OCL_Arithm/Psnr -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:269"
-( $at_check_trace; make test_OCL_Arithm/Psnr -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:269"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_347
-#AT_START_348
-at_fn_group_banner 348 'testsuite-opencv.at:273' \
-  "ReduceSum" "                                      " 20
-at_xfail=no
-(
-  $as_echo "348. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:273" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:273"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:274: make test_OCL_Arithm/ReduceSum -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:274"
-( $at_check_trace; make test_OCL_Arithm/ReduceSum -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:274"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_348
-#AT_START_349
-at_fn_group_banner 349 'testsuite-opencv.at:278' \
-  "ReduceMax" "                                      " 20
-at_xfail=no
-(
-  $as_echo "349. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:278" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:278"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:279: make test_OCL_Arithm/ReduceMax -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:279"
-( $at_check_trace; make test_OCL_Arithm/ReduceMax -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:279"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_349
-#AT_START_350
-at_fn_group_banner 350 'testsuite-opencv.at:283' \
-  "ReduceAvg" "                                      " 20
-at_xfail=no
-(
-  $as_echo "350. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:283" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:283"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:284: make test_OCL_Arithm/ReduceAvg -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:284"
-( $at_check_trace; make test_OCL_Arithm/ReduceAvg -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:284"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_350
-#AT_START_351
-at_fn_group_banner 351 'testsuite-opencv.at:290' \
-  "Gemm" "                                           " 20
-at_xfail=no
-(
-  $as_echo "351. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:290" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:290"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:291: make test_OCL_Core/Gemm -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:291"
-( $at_check_trace; make test_OCL_Core/Gemm -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:291"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_351
-#AT_START_352
-at_fn_group_banner 352 'testsuite-opencv.at:295' \
-  "Dft" "                                            " 20
-at_xfail=no
-(
-  $as_echo "352. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:295" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:295"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:296: make test_OCL_Core/Dft -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:296"
-( $at_check_trace; make test_OCL_Core/Dft -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:296"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_352
-#AT_START_353
-at_fn_group_banner 353 'testsuite-opencv.at:302' \
-  "MultiSpectrums" "                                 " 21
-at_xfail=no
-(
-  $as_echo "353. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:302" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:302"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:303: make test_OCL_OCL_ImgProc/MultiSpectrums -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:303"
-( $at_check_trace; make test_OCL_OCL_ImgProc/MultiSpectrums -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:303"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_353
-#AT_START_354
-at_fn_group_banner 354 'testsuite-opencv.at:309' \
-  "ConvertTo" "                                      " 22
-at_xfail=no
-(
-  $as_echo "354. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:309" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:309"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:310: make test_OCL_MatrixOperation/ConvertTo -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:310"
-( $at_check_trace; make test_OCL_MatrixOperation/ConvertTo -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:310"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_354
-#AT_START_355
-at_fn_group_banner 355 'testsuite-opencv.at:314' \
-  "CopyTo" "                                         " 22
-at_xfail=no
-(
-  $as_echo "355. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:314" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:314"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:315: make test_OCL_MatrixOperation/CopyTo -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:315"
-( $at_check_trace; make test_OCL_MatrixOperation/CopyTo -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:315"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_355
-#AT_START_356
-at_fn_group_banner 356 'testsuite-opencv.at:319' \
-  "SetTo" "                                          " 22
-at_xfail=no
-(
-  $as_echo "356. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:319" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:319"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:320: make test_OCL_MatrixOperation/SetTo -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:320"
-( $at_check_trace; make test_OCL_MatrixOperation/SetTo -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:320"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_356
-#AT_START_357
-at_fn_group_banner 357 'testsuite-opencv.at:324' \
-  "UMatExpr" "                                       " 22
-at_xfail=no
-(
-  $as_echo "357. $at_setup_line: testing $at_desc ..."
-  $at_traceon
-
-
-  $as_echo "testsuite-opencv.at:324" >"$at_check_line_file"
-(case " $POAT_TESTSUITES " in #(
-  *" OpenCV "*) :
-    false ;; #(
-  *) :
-    : ;;
-esac) \
-  && at_fn_check_skip 77 "$at_srcdir/testsuite-opencv.at:324"
-
-{ set +x
-$as_echo "$at_srcdir/testsuite-opencv.at:325: make test_OCL_MatrixOperation/UMatExpr -sC \$abs_top_builddir/examples/OpenCV | grep \"FAILED\""
-at_fn_check_prepare_notrace 'a shell pipeline' "testsuite-opencv.at:325"
-( $at_check_trace; make test_OCL_MatrixOperation/UMatExpr -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"
-) >>"$at_stdout" 2>>"$at_stderr" 5>&-
-at_status=$? at_failed=false
-$at_check_filter
-at_fn_diff_devnull "$at_stderr" || at_failed=:
-at_fn_diff_devnull "$at_stdout" || at_failed=:
-at_fn_check_status 1 $at_status "$at_srcdir/testsuite-opencv.at:325"
-$at_failed && at_fn_log_failure
-$at_traceon; }
-
-  set +x
-  $at_times_p && times >"$at_times_file"
-) 5>&1 2>&1 7>&- | eval $at_tee_pipe
-read at_status <"$at_status_file"
-#AT_STOP_357
diff --git a/tests/testsuite-amdsdk2_9.at b/tests/testsuite-amdsdk2_9.at
index 8989935..fd93f57 100644
--- a/tests/testsuite-amdsdk2_9.at
+++ b/tests/testsuite-amdsdk2_9.at
@@ -524,7 +524,7 @@ AT_CHECK_UNQUOTED([make test_SoAversusAoS -sC $abs_top_builddir/examples/AMDSDK2
 ])     
 AT_CLEANUP
 
-POAT_AMDSDK_HSA_SETUP([sobelfilter])
+POAT_AMDSDK_SETUP([sobelfilter])
 AT_CHECK_UNQUOTED([make test_SobelFilter -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
 [Passed!
 ])     
diff --git a/tests/testsuite-amdsdk2_9.at b/tests/testsuite-amdsdk3_0.at
similarity index 59%
copy from tests/testsuite-amdsdk2_9.at
copy to tests/testsuite-amdsdk3_0.at
index 8989935..d20a4a8 100644
--- a/tests/testsuite-amdsdk2_9.at
+++ b/tests/testsuite-amdsdk3_0.at
@@ -1,26 +1,27 @@
 m4_define([POAT_AMDSDK_SETUP],[
   AT_SETUP([$1])
-  AT_KEYWORDS([amdsdk2.9 amdsdk long $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" amdsdk2_9 "*],[false],[:])])
+  AT_KEYWORDS([amdsdk3.0 amdsdk long $1 $2])
+  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" amdsdk3_0 "*],[false],[:])])
   # AMD APP SDK tests require ICD loader with
   AT_SKIP_IF([! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h])
 ])
 
 m4_define([POAT_AMDSDK_HSA_SETUP],[
   AT_SETUP([$1])
-  AT_KEYWORDS([hsa amdsdk2.9 amdsdk long $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" amdsdk2_9 "*],[false],[:])])
+  AT_KEYWORDS([hsa amdsdk3.0 amdsdk long $1 $2])
+  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" amdsdk3_0 "*],[false],[:])])
   # AMD APP SDK tests require ICD loader with
   AT_SKIP_IF([! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h])
 ])
 
+##########################################################################
 
-AT_BANNER([AMD APP SDK 2.9 tests])
+AT_BANNER([AMD APP SDK 3.0 tests])
 
 POAT_AMDSDK_SETUP([asyncdatatransfer])
 # needs asynch properties implemented
 AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_AsyncDataTransfer -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Passed" | sed -e 's/^[ \t]*//'], 0, 
+AT_CHECK_UNQUOTED([make test_AsyncDataTransfer -sC $abs_top_builddir/examples/AMDSDK3.0 | grep "Passed" | sed -e 's/^[ \t]*//'], 0,
 [SyncKernel verification  : Passed!
 AsyncKernel verification : Passed!
 ])
@@ -29,7 +30,7 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([atomiccounters])
 # Expected Error: Device does not support cl_ext_atomic_counters_32 extension!
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_AtomicCounters -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Encryption Passed"], 0, 
+AT_CHECK_UNQUOTED([make test_AtomicCounters -sC $abs_top_builddir/examples/AMDSDK3.0 | grep "Encryption Passed"], 0,
 [Encryption Passed!
 ])
 AT_CLEANUP
@@ -41,13 +42,15 @@ POAT_AMDSDK_SETUP([basicdebug])
 # silently passes). It passes if the kernel's local array size is
 # increased so there is no out of bounds error.
 AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_BasicDebug -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_BasicDebug -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ], ignore)
 AT_CLEANUP
 
-POAT_AMDSDK_HSA_SETUP([binarysearch])
-AT_CHECK_UNQUOTED([make test_BinarySearch -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+POAT_AMDSDK_HSA_SETUP([binarysearchdevicesideenqueue])
+# requires dev side queue
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_BinarySearchDeviceSideEnqueue -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -55,49 +58,47 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([binomialoption-repl])
 AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&
              grep -q "define LLVM_3_2" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([binomialoption-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([binomialoptionmultigpu])
-AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/cl/BinomialOptionMultiGPU/bin/x86_64/Release/BinomialOptionMultiGPU])
-AT_CHECK_UNQUOTED([make test_BinomialOptionMultiGPU -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK3.0/AMD-APP-SDK-v3.0-RC-lnx64/samples/opencl/cl/BinomialOptionMultiGPU/bin/x86_64/Release/BinomialOptionMultiGPU])
+AT_CHECK_UNQUOTED([make test_BinomialOptionMultiGPU -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([bitonicsort])
-AT_CHECK_UNQUOTED([make test_BitonicSort -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_BitonicSort -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([blackscholes])
-AT_CHECK_UNQUOTED([make test_BlackScholes -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_BlackScholes -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([blackscholesdp])
 AT_KEYWORDS([cl_amd_fp64])
-#this causes assert on LLVM 3.1
-AT_SKIP_IF([ grep "#define LLVM_3_1" $abs_top_builddir/config.h ])
 AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
 AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv])
-AT_CHECK_UNQUOTED([make test_BlackScholesDP -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_BlackScholesDP -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ], ignore)
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([boxfilter])
-AT_CHECK_UNQUOTED([make test_BoxFilter -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
+AT_CHECK_UNQUOTED([make test_BoxFilter -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
 [Passed!
 Verifying results...Passed!
 ])
@@ -106,16 +107,16 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([boxfilterGL])
 # doesnt work
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BoxFilterGL -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
+AT_CHECK_UNQUOTED([make test_BoxFilterGL -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
 [Passed!
 Verifying results...Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([bufferbandwidth])
-# Device does not support cl_khr_local_int32_base_atomics extension!
-# AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BufferBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
+# freezes/takes forever
+AT_SKIP_IF(true)
+AT_CHECK_UNQUOTED([make test_BufferBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
 [ Verification Passed!
  Verification Passed!
 Passed!
@@ -125,21 +126,35 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([bufferImageInterop])
 # Error: Selected device doesn't support Buffer-Image
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BufferImageInterop -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
+AT_CHECK_UNQUOTED([make test_BufferImageInterop -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
 [Passed!
 Passed!
 ])
 AT_CLEANUP
 
+POAT_AMDSDK_SETUP([builtinscan])
+# requires work_group_scan_inclusive_add, work_group_barrier & work_group_broadcast
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_BuiltInScan -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
+[OK
+])
+AT_CLEANUP
+
+POAT_AMDSDK_HSA_SETUP([calcpie])
+AT_CHECK_UNQUOTED([make test_CalcPie -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
+[Passed!
+])
+AT_CLEANUP
+
 POAT_AMDSDK_SETUP([concurrentkernel])
-AT_CHECK_UNQUOTED([make test_ConcurrentKernel -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
+AT_CHECK_UNQUOTED([make test_ConcurrentKernel -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
 [ Sequential Kernel verification : Passed!
  Concurrent Kernel verification : Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([constantbandwidth])
-AT_CHECK_UNQUOTED([make test_ConstantBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_ConstantBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep Passed], 0,
 [Passed!
 Passed!
 Passed!
@@ -148,19 +163,30 @@ Passed!
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([cpluspluswrapper])
-AT_CHECK_UNQUOTED([make test_CplusplusWrapper -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+# insists on AMD platform
+AT_SKIP_IF(true)
+AT_CHECK_UNQUOTED([make test_CplusplusWrapper -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([dct])
-AT_CHECK_UNQUOTED([make test_DCT -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_DCT -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([deviceenqueuebfs])
+# requires dev queue
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_DeviceEnqueueBFS -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([devicefission])
-AT_CHECK_UNQUOTED([make test_DeviceFission -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_DeviceFission -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -168,7 +194,7 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([devicefission11ext])
 # Expected Error: Device does not support cl_ext_device_fission extension!
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_DeviceFission11Ext -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_DeviceFission11Ext -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -177,7 +203,7 @@ POAT_AMDSDK_SETUP([dwthaar1d])
 # 3.5 introduced the noduplicate attribute which, when
 # used with barrier(), fixes this.
 AT_XFAIL_IF([egrep -q "#define LLVM_3_4" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([make test_DwtHaar1D -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_DwtHaar1D -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -185,7 +211,7 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([dwthaar1dcppkernel])
 # Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_DwtHaar1DCPPKernel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_DwtHaar1DCPPKernel -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -193,19 +219,21 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([eigenvalue])
 # Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_EigenValue -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_EigenValue -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
-POAT_AMDSDK_HSA_SETUP([fastwalshtransform])
-AT_CHECK_UNQUOTED([make test_FastWalshTransform -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+POAT_AMDSDK_SETUP([extractprimes])
+# requires dev queue
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_ExtractPrimes -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
-POAT_AMDSDK_HSA_SETUP([floydwarshall])
-AT_CHECK_UNQUOTED([make test_FloydWarshall -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+POAT_AMDSDK_HSA_SETUP([fastwalshtransform])
+AT_CHECK_UNQUOTED([make test_FastWalshTransform -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -213,7 +241,26 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([fft])
 # Build parameter clc++ not supported
 AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_FFT -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_FFT -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([finegrainsvm])
+# freezes with every device - requires async running queue
+AT_SKIP_IF(true)
+AT_CHECK_UNQUOTED([make test_FineGrainSVM -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+])
+AT_CLEANUP
+
+POAT_AMDSDK_HSA_SETUP([finegrainsvmcas])
+AT_CHECK_UNQUOTED([make test_FineGrainSVMCAS -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+ignore, ignore)
+AT_CLEANUP
+
+POAT_AMDSDK_HSA_SETUP([floydwarshall])
+AT_CHECK_UNQUOTED([make test_FloydWarshall -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -224,13 +271,13 @@ POAT_AMDSDK_SETUP([fluidsimulation2d])
 # break with Intel OCL also.
 AT_XFAIL_IF(true)
 AT_KEYWORDS([cl_amd_fp64])
-AT_CHECK_UNQUOTED([make test_FluidSimulation2D -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_FluidSimulation2D -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ], ignore)
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([gaussiannoise])
-AT_CHECK_UNQUOTED([make test_GaussianNoise -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed | sed 's/ //g'], 0, 
+AT_CHECK_UNQUOTED([make test_GaussianNoise -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed | sed 's/ //g'], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -238,94 +285,110 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([gaussiannoisegl])
 #doesnt work
 AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_GaussianNoiseGL -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed! 
+AT_CHECK_UNQUOTED([make test_GaussianNoiseGL -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
 ])
 AT_CLEANUP
 
 #Benchmark test
 #POAT_AMDSDK_SETUP([globalmemorybandwidth])
-#AT_CHECK_UNQUOTED([make test_GlobalMemoryBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+#AT_CHECK_UNQUOTED([make test_GlobalMemoryBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 #[Passed!
 #])
 #AT_CLEANUP
 
 POAT_AMDSDK_SETUP([hdrtonemapping])
-AT_CHECK_UNQUOTED([make test_HDRToneMapping -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_HDRToneMapping -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!!
 ], ignore)
 AT_CLEANUP
 
+POAT_AMDSDK_HSA_SETUP([heatpde])
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_HeatPDE -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed"], 0,
+[Passed])
+AT_CLEANUP
+
 POAT_AMDSDK_HSA_SETUP([helloworld])
-AT_CHECK_UNQUOTED([make test_HelloWorld -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "GdkknVnqkc|HelloWorld"], 0, 
+AT_CHECK_UNQUOTED([make test_HelloWorld -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "GdkknVnqkc|HelloWorld"], 0,
 [GdkknVnqkc
 HelloWorld
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([histogram-repl])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_Histogram -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_Histogram -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([histogram-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_Histogram -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_Histogram -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
-POAT_AMDSDK_SETUP([histogramatomic])
-AT_CHECK_UNQUOTED([make test_HistogramAtomics -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+POAT_AMDSDK_SETUP([histogramatomics])
+AT_CHECK_UNQUOTED([make test_HistogramAtomics -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([imagebandwidth])
-# AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_ImageBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+# GPU not found. Exiting application
+AT_SKIP_IF(true)
+AT_CHECK_UNQUOTED([make test_ImageBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
+POAT_AMDSDK_SETUP([imagebinarization])
+# requires work_group_barrier
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_ImageBinarization -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+])
+AT_CLEANUP
 
 POAT_AMDSDK_SETUP([imageoverlap])
-AT_CHECK_UNQUOTED([make test_ImageOverlap -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_ImageOverlap -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Verifying result - Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([introstatickcppkernel])
 # Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_IntroStaticCPPKernel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_IntroStaticCPPKernel -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Verifying result - Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([kernellauch])
-# Device does not support cl_khr_local_int32_base_atomics extension! 
-# works anyway
-AT_CHECK_UNQUOTED([make test_KernelLaunch -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed!], 0, 
+# GPU not found. Exiting application
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_KernelLaunch -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed!], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([kmeansautoclustering])
 # doesn't find opencl library for some reason
-AT_XFAIL_IF(true) 
-AT_CHECK_UNQUOTED([make test_KmeansAutoclustering -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_SKIP_IF(true)
+AT_CHECK_UNQUOTED([make test_KmeansAutoclustering -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
 Benchmark test
 POAT_AMDSDK_SETUP([ldsbandwidth])
-AT_CHECK_UNQUOTED([make test_LDSBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+# requires dev queue
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_LDSBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 Passed!
 Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([ludecomposition])
@@ -333,28 +396,29 @@ AT_KEYWORDS([cl_amd_fp64])
 #test uses doubles
 AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
 AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv])
-AT_CHECK_UNQUOTED([make test_LUDecomposition -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_LUDecomposition -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ], ignore)
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([mandelbrot])
 AT_SKIP_IF([ grep "undef HAVE_GLEW" $abs_top_builddir/config.h ])
-AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/cl/Mandelbrot/bin/x86_64/Release/Mandelbrot])
+AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK3.0/AMD-APP-SDK-v3.0-RC-lnx64/samples/opencl/cl/Mandelbrot/bin/x86_64/Release/Mandelbrot])
 # undefined symbol: _Z7std_fmaDv4_fS_S_ with VML
-AT_CHECK_UNQUOTED([make test_Mandelbrot -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_Mandelbrot -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ], ignore)
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([matrixmuldouble])
-AT_CHECK_UNQUOTED([make test_MatrixMulDouble -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_KEYWORDS([cl_amd_fp64])
+AT_CHECK_UNQUOTED([make test_MatrixMulDouble -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([matrixmulimage])
-AT_CHECK_UNQUOTED([make test_MatrixMulImage -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_MatrixMulImage -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ], ignore)
 AT_CLEANUP
@@ -362,33 +426,31 @@ AT_CLEANUP
 POAT_AMDSDK_HSA_SETUP([matrixmultiplication])
 # pocl error: encountered unimplemented part of the OpenCL specs in clCreateImage2D.c:119
 #AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_MatrixMultiplication -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_MatrixMultiplication -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([matrixtranspose])
-AT_CHECK_UNQUOTED([make test_MatrixTranspose -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_MatrixTranspose -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([memorymodel-repl])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([memorymodel-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([memoryoptimizations])
-#Device does not support global_int32_base_atomics
-#AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_MemoryOptimizations -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_MemoryOptimizations -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 Passed!
 Passed!
@@ -415,87 +477,133 @@ Passed!
 ], ignore)
 AT_CLEANUP
 
-POAT_AMDSDK_SETUP([merzennetwister])
+POAT_AMDSDK_SETUP([mersennetwister])
 #Build parameter clc++ is not supported
 AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_MersenneTwister -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_MersenneTwister -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ], ignore)
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([montecarloasian])
-# kernel compilation fails due to 
-# error: can't convert between vector values of different size ('float4' and 'int')
-# It should be a legal implicit conversion according to 6.3 Operators. Works also with Intel OCL
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_MonteCarloAsian -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_MonteCarloAsian -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
+
 POAT_AMDSDK_SETUP([montecarloasiandp])
-AT_KEYWORDS([cl_amd_fp64])
-# error: can't convert between vector values of different size ('double4' and 'int')
-# It should be a legal implicit conversion according to 6.3 Operators. Works also with Intel OCL
+# passes arguments via a struct
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_MonteCarloAsianDP -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_MonteCarloAsianDP -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ], ignore)
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([montecarloasianmultigpu])
-AT_CHECK_UNQUOTED([make test_MonteCarloAsianMultiGPU -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_MonteCarloAsianMultiGPU -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([nbody])
 AT_SKIP_IF([ grep "undef HAVE_GLEW" $abs_top_builddir/config.h ])
-AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/cl/NBody/bin/x86_64/Release/NBody])
-AT_CHECK_UNQUOTED([make test_NBody -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK3.0/AMD-APP-SDK-v3.0-RC-lnx64/samples/opencl/cl/NBody/bin/x86_64/Release/NBody])
+AT_CHECK_UNQUOTED([make test_NBody -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
+POAT_AMDSDK_SETUP([pipeproducerconsumerkernels])
+# no pipe support yet
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_PipeProducerConsumerKernels -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+], ignore)
+AT_CLEANUP
+
 POAT_AMDSDK_HSA_SETUP([prefixsum])
-AT_CHECK_UNQUOTED([make test_PrefixSum -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_PrefixSum -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([quasirandomsequence])
-AT_CHECK_UNQUOTED([make test_QuasiRandomSequence -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_QuasiRandomSequence -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([radixsort])
-AT_CHECK_UNQUOTED([make test_RadixSort -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_RadixSort -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+], ignore)
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([rangeminimumquery])
+# requires work_group_reduce_min
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_RangeMinimumQuery -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-], ignore)     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([recursivegaussian])
-AT_CHECK_UNQUOTED([make test_RecursiveGaussian -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_RecursiveGaussian -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([recursivegaussianprogramscope])
+# uses in-source global variable
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_RecursiveGaussian_ProgramScope -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([reduction])
-AT_CHECK_UNQUOTED([make test_Reduction -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_Reduction -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([regiongrowingsegmentation])
+# requires dev queue
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_RegionGrowingSegmentation -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([scanlargearrays])
 # Fails with vectorization. With wiloops and no unrolling, the vectorization won't apply.
-AT_CHECK_UNQUOTED([make test_ScanLargeArrays -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_ScanLargeArrays -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([simpleconvolution])
-AT_CHECK_UNQUOTED([make test_SimpleConvolution -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_SimpleConvolution -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Verifying non-Separable Convolution Kernel result - Passed!
+Verifying Separable Convolution Kernel result - Passed!
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([simpledepthimage])
+#  *** ERROR ***  Requested image format is not supported
+# Error: clCreateImage failed.(oclImage) Error code : CL_IMAGE_FORMAT_NOT_SUPPORTED
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_SimpleDepthImage -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Verifying 2D copy result - Passed!
+Verifying 3D copy result - Passed!
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([simplegenericaddressspace])
+# requires work_group_barrier
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_SimpleGenericAddressSpace -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -503,64 +611,95 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([simplegl])
 # doesn't find opecl library
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SimpleGL -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_SimpleGL -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([simpleimage])
-AT_CHECK_UNQUOTED([make test_SimpleImage -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_SimpleImage -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Verifying 2D copy result - Passed!
 Verifying 3D copy result - Passed!
-])     
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([simplepipe])
+# pipe not implemented
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_SimplePipe -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[OK
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([simplespir])
+# Device side queue is unimplemented
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_SimpleSPIR -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[OK
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([soaversusaos])
-#Build Options are : -x clc++ -D num1=4096 -D num2=4096 
+#Build Options are : -x clc++ -D num1=4096 -D num2=4096
 #Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS
 AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_SoAversusAoS -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_SoAversusAoS -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
-POAT_AMDSDK_HSA_SETUP([sobelfilter])
-AT_CHECK_UNQUOTED([make test_SobelFilter -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+POAT_AMDSDK_SETUP([sobelfilter])
+AT_CHECK_UNQUOTED([make test_SobelFilter -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([sobelfilterimage])
 # segfault
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SobelFilterImage -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_SobelFilterImage -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([stringsearch])
-AT_CHECK_UNQUOTED([make test_StringSearch -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_StringSearch -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
 Passed!
-])     
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([svmatomicsbinarytreeinsert])
+# expected failure; XFAIL must be declared before the check, not inside its arguments
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_SVMAtomicsBinaryTreeInsert -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+])
+AT_CLEANUP
+
+POAT_AMDSDK_SETUP([svmbinarytreesearch])
+AT_XFAIL_IF(true)
+AT_CHECK_UNQUOTED([make test_SVMBinaryTreeSearch -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
+[Passed!
+])
 AT_CLEANUP
 
+
 POAT_AMDSDK_SETUP([template])
-AT_CHECK_UNQUOTED([make test_Template -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_Template -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([transferoverlap])
-AT_CHECK_UNQUOTED([make test_TransferOverlap -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Passed"], 0, 
+AT_CHECK_UNQUOTED([make test_TransferOverlap -sC $abs_top_builddir/examples/AMDSDK3.0 | grep "Passed"], 0,
 [Passed!
 ])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([transferoverlapcpp])
-# Expected Error: Device does not support cl_khr_local_int32_base_atomics extension! and segfault
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_TransferOverlapCPP -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Passed"], 0, 
+AT_CHECK_UNQUOTED([make test_TransferOverlapCPP -sC $abs_top_builddir/examples/AMDSDK3.0 | grep "Passed"], 0,
 [Passed!
 ])
 AT_CLEANUP
@@ -568,21 +707,21 @@ AT_CLEANUP
 POAT_AMDSDK_SETUP([unsharpmask])
 # doesn't find opencl library
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_UnsharpMask -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
+AT_CHECK_UNQUOTED([make test_UnsharpMask -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
 [Passed!
-])     
+])
 AT_CLEANUP
 
 POAT_AMDSDK_HSA_SETUP([urng])
-AT_CHECK_UNQUOTED([make test_URNG -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed | cut -c -7], 0, 
-[Passed! 
-])     
+AT_CHECK_UNQUOTED([make test_URNG -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed | cut -c -7], 0,
+[Passed!
+])
 AT_CLEANUP
 
 POAT_AMDSDK_SETUP([urngnoisegl])
 # Error: clGetPlatformIDs failed. Error code : CL_PLATFORM_NOT_FOUND_KHR
 AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_URNGNoiseGL -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed | cut -c -7], 0, 
-[Passed! 
-])     
+AT_CHECK_UNQUOTED([make test_URNGNoiseGL -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed | cut -c -7], 0,
+[Passed!
+])
 AT_CLEANUP
diff --git a/tests/testsuite-regression.at b/tests/testsuite-regression.at
index f0b28f7..f62d8ad 100644
--- a/tests/testsuite-regression.at
+++ b/tests/testsuite-regression.at
@@ -14,13 +14,13 @@ AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regressio
 AT_CLEANUP
 
 AT_SETUP([issues with local pointers (repl) - lp:918801])
-AT_KEYWORDS([regression locals tce])
+AT_KEYWORDS([regression locals])
 AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
 AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_locals], 0)
 AT_CLEANUP
 
 AT_SETUP([issues with local pointers (loops) - lp:918801])
-AT_KEYWORDS([regression locals tce])
+AT_KEYWORDS([regression locals])
 AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
 AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_locals], 0)
 AT_CLEANUP
diff --git a/tests/testsuite.at b/tests/testsuite.at
index a216bab..114e370 100644
--- a/tests/testsuite.at
+++ b/tests/testsuite.at
@@ -232,12 +232,6 @@ AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_fabs], 0, expout)
 AT_CLEANUP
 
 AT_SETUP([Kernel functions abs abs_diff add_sat hadd mad_hi mad_sat mul_hi rhadd sub_sat (loopvec)])
-# 3-element vector cases fail when vectorizer is enabled,
-# at least with Intel Core i5 and AMD FX8. Assume it fails on all others too.
-AT_XFAIL_IF([grep OCL_KERNEL_TARGET $abs_top_builddir/config.h | cut -d\" -f2 | grep -q x86_64 &&
-             grep -q "#define LLVM_3_5" $abs_top_builddir/config.h])
-AT_XFAIL_IF([grep OCL_KERNEL_TARGET $abs_top_builddir/config.h | cut -d\" -f2 | grep -q x86_64 &&
-             grep -q "#define LLVM_3_6" $abs_top_builddir/config.h])
 AT_DATA([expout],
 [Running test test_hadd...
 OK
@@ -389,6 +383,21 @@ AT_CHECK_UNQUOTED([$abs_top_builddir/examples/scalarwave/scalarwave], 0,
 ])
 AT_CLEANUP
 
+AT_SETUP([Intel SVM Coarse-grained])
+AT_KEYWORDS([intel_svm])
+AT_SKIP_IF([grep "define TCE_AVAILABLE" $abs_top_builddir/config.h])
+AT_SKIP_IF([test ! -f $abs_top_srcdir/examples/IntelSVM/source/SVMBasicCoarseGrained/svmbasic])
+AT_CHECK([make test_CoarseGrained -sC $abs_top_builddir/examples/IntelSVM | grep -q PASSED], 0)
+AT_CLEANUP
+
+AT_SETUP([Intel SVM Fine-grained])
+AT_KEYWORDS([intel_svm])
+AT_SKIP_IF([grep "define TCE_AVAILABLE" $abs_top_builddir/config.h])
+AT_SKIP_IF([test ! -f $abs_top_srcdir/examples/IntelSVM/source/SVMBasicFineGrained/svmbasic])
+AT_CHECK([make test_FineGrained -sC $abs_top_builddir/examples/IntelSVM | grep -q PASSED], 0)
+AT_CLEANUP
+
+
 #m4_include(testsuite-llvmopencl.at)
 m4_include(testsuite-workgroup.at)
 m4_include(testsuite-regression.at)
@@ -400,6 +409,7 @@ m4_include(testsuite-rodinia.at)
 m4_include(testsuite-parboil.at)
 m4_include(testsuite-amd.at)
 m4_include(testsuite-amdsdk2_9.at)
+m4_include(testsuite-amdsdk3_0.at)
 m4_include(testsuite-vexcl.at)
 m4_include(testsuite-halide.at)
 m4_include(testsuite-cloverleaf.at)
diff --git a/tests/viennacl.at b/tests/viennacl.at
new file mode 100644
index 0000000..7de6c4d
--- /dev/null
+++ b/tests/viennacl.at
@@ -0,0 +1,7 @@
+# short-cut to run only the ViennaCL tests.
+# run with 'make TESTSUITE=viennacl check'
+
+AT_INIT()
+AT_COLOR_TESTS
+m4_include(testsuite-viennacl.at)
+
diff --git a/tests/workgroup/CMakeLists.txt b/tests/workgroup/CMakeLists.txt
index e783151..04e8511 100644
--- a/tests/workgroup/CMakeLists.txt
+++ b/tests/workgroup/CMakeLists.txt
@@ -23,38 +23,11 @@
 #
 #=============================================================================
 
-function(add_test_workgroup TEST_NAME RESULT_FILE CL_FILE)
-  add_test_custom("${CMAKE_CURRENT_BINARY_DIR}/run_kernel" "${TEST_NAME}"
-                  "${RESULT_FILE}" "${CL_FILE}" ${ARGN})
-endfunction()
-
-function(add_test_workgroup_sorted TEST_NAME RESULT_FILE CL_FILE)
-  set(RUN_CMD "${CMAKE_CURRENT_BINARY_DIR}/run_kernel")
-  foreach(LOOPVAR  "${CL_FILE}" ${ARGN})
-    set(RUN_CMD "${RUN_CMD}####${LOOPVAR}")
-  endforeach()
-
-  add_test("${TEST_NAME}"
-    "${CMAKE_COMMAND}"
-    -Dtest_cmd=${RUN_CMD}
-    -Dsort_output=1
-    -Doutput_blessed=${CMAKE_CURRENT_SOURCE_DIR}/${RESULT_FILE}
-    -P "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake"
-  )
-endfunction()
-
-
-#AM_LDFLAGS = ../../lib/poclu/libpoclu.la @OPENCL_LIBS@
-#POCLU_LINK_OPTIONS
-
-#AM_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -I$(top_srcdir)/lib/CL -DSRCDIR='"$(abs_srcdir)"' @OPENCL_CFLAGS@
+
 add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 include_directories("${CMAKE_SOURCE_DIR}/lib/CL")
 add_compile_options(${OPENCL_CFLAGS})
 
-#EXTRA_DIST = basic_barriers.cl conditional_barriers.cl forloops.cl forloops_2_2_1_1.stdout loopbarriers.cl basic_barriers_2_2_2_2.stdout tricky_for.cl outerlooppar.cl outerlooppar_2_2_1_1.stdout for_bug.cl for_bug_1_2_1_1.stdout multilatch_bloop.cl multilatch_bloop_1_3_1_1.stdout print_all_ids.cl print_all_ids_114114.txt implicit_barriers.cl implicit_barriers_1_2_1_1.stdout
-
-#noinst_PROGRAMS = run_kernel
 if(MSVC)
   set_source_files_properties( run_kernel.c PROPERTIES LANGUAGE CXX )
 endif(MSVC)
@@ -62,76 +35,117 @@ add_executable("run_kernel" "run_kernel.c")
 target_link_libraries("run_kernel" ${POCLU_LINK_OPTIONS})
 
 
-
 # repl
-add_test_workgroup("\"workgroup/unconditional barriers (full replication)\"" "basic_barriers_2_2_2_2.stdout" "basic_barriers.cl" 2 2 2 2 )
-
-add_test_workgroup("\"workgroup/unbarriered for loops (full replication)\"" "forloops_2_2_1_1.stdout" "forloops.cl" 2 2 1 1)
-
-add_test_workgroup("\"workgroup/barriered for loops (full replication)\"" "loopbarriers_2_2_1_1.stdout" "loopbarriers.cl" 2 2 1 1 )
-
-add_test_workgroup("\"workgroup/conditional barrier (full replication)\"" "cond_barriers_1_2_1_1.stdout" "conditional_barriers.cl" 1 2 1 1)
-
-add_test_workgroup("\"workgroup/b-loop with none of the WIs reaching the barrier (full replication)\"" "tricky_for_1_2_1_1.stdout" "tricky_for.cl" 1 2 1 1)
-
-add_test_workgroup("\"workgroup/forcing horizontal parallelization to some outer loops (full replication)\"" "outerlooppar_2_2_1_1.stdout" "outerlooppar.cl" 2 2 1 1)
-
-add_test_workgroup("\"workgroup/loop with two paths to the latch (full replication)\"" "for_bug_1_2_1_1.stdout" "for_bug.cl" 1 2 1 1)
-
-add_test_workgroup("\"workgroup/b-loop with two latches (full replication)\"" "multilatch_bloop_1_3_1_1.stdout" "multilatch_bloop.cl" 1 3 1 1)
-
-add_test_workgroup_sorted("\"workgroup/workgroup_sizes: work-items get wrong ids (full replication)\"" "print_all_ids_114114.txt" "print_all_ids.cl" 1 1 1 4)
-
-set_tests_properties( "\"workgroup/unconditional barriers (full replication)\""
-  "\"workgroup/unbarriered for loops (full replication)\""
-  "\"workgroup/barriered for loops (full replication)\""
-  "\"workgroup/conditional barrier (full replication)\""
-  "\"workgroup/b-loop with none of the WIs reaching the barrier (full replication)\""
-  "\"workgroup/forcing horizontal parallelization to some outer loops (full replication)\""
-  "\"workgroup/loop with two paths to the latch (full replication)\""
-  "\"workgroup/b-loop with two latches (full replication)\""
-  "\"workgroup/workgroup_sizes: work-items get wrong ids (full replication)\""
+add_test_pocl(NAME "workgroup/unconditional_barriers_REPL"
+              EXPECTED_OUTPUT "basic_barriers_2_2_2_2.stdout"
+              COMMAND "run_kernel" "basic_barriers.cl" 2 2 2 2)
+
+add_test_pocl(NAME "workgroup/unbarriered_for_loops_REPL"
+              EXPECTED_OUTPUT "forloops_2_2_1_1.stdout"
+              COMMAND "run_kernel" "forloops.cl" 2 2 1 1)
+
+add_test_pocl(NAME "workgroup/barriered_for_loops_REPL"
+              EXPECTED_OUTPUT "loopbarriers_2_2_1_1.stdout"
+              COMMAND "run_kernel" "loopbarriers.cl" 2 2 1 1)
+
+add_test_pocl(NAME "workgroup/conditional_barrier_REPL"
+              EXPECTED_OUTPUT "cond_barriers_1_2_1_1.stdout"
+              COMMAND "run_kernel" "conditional_barriers.cl" 1 2 1 1)
+
+add_test_pocl(NAME "workgroup/b_loop_with_none_of_the_WIs_reaching_the_barrier_REPL"
+              EXPECTED_OUTPUT "tricky_for_1_2_1_1.stdout"
+              COMMAND "run_kernel" "tricky_for.cl" 1 2 1 1)
+
+add_test_pocl(NAME "workgroup/forcing_horizontal_parallelization_to_some_outer_loops_REPL"
+              EXPECTED_OUTPUT "outerlooppar_2_2_1_1.stdout"
+              COMMAND "run_kernel" "outerlooppar.cl" 2 2 1 1)
+
+add_test_pocl(NAME "workgroup/loop_with_two_paths_to_the_latch_REPL"
+              EXPECTED_OUTPUT "for_bug_1_2_1_1.stdout"
+              COMMAND "run_kernel" "for_bug.cl" 1 2 1 1)
+
+add_test_pocl(NAME "workgroup/b_loop_with_two_latches_REPL"
+              EXPECTED_OUTPUT "multilatch_bloop_1_3_1_1.stdout"
+              COMMAND "run_kernel" "multilatch_bloop.cl" 1 3 1 1)
+
+add_test_pocl(NAME "workgroup/workgroup_sizes_work_items_get_wrong_ids_REPL"
+              SORT_OUTPUT
+              EXPECTED_OUTPUT "print_all_ids_114114.txt"
+              COMMAND "run_kernel" "print_all_ids.cl" 1 1 1 4)
+
+set_tests_properties( "workgroup/unconditional_barriers_REPL"
+  "workgroup/unbarriered_for_loops_REPL"
+  "workgroup/barriered_for_loops_REPL"
+  "workgroup/conditional_barrier_REPL"
+  "workgroup/b_loop_with_none_of_the_WIs_reaching_the_barrier_REPL"
+  "workgroup/forcing_horizontal_parallelization_to_some_outer_loops_REPL"
+  "workgroup/loop_with_two_paths_to_the_latch_REPL"
+  "workgroup/b_loop_with_two_latches_REPL"
+  "workgroup/workgroup_sizes_work_items_get_wrong_ids_REPL"
   PROPERTIES
     COST 2.0
     PROCESSORS 1
     LABELS "workgroup"
     ENVIRONMENT "POCL_DEVICES=basic;POCL_WORK_GROUP_METHOD=workitemrepl"
-    DEPENDS "pocl_version_check")
+    DEPENDS "pocl_version_check"
+    LABELS "internal;workgroup")
 
 # loops
-add_test_workgroup("\"workgroup/different implicit barrier injection scenarios (loops)\"" "implicit_barriers_1_2_1_1.stdout" "implicit_barriers.cl" 1 2 1 1)
-
-add_test_workgroup("\"workgroup/unconditional barriers (loops)\"" "basic_barriers_2_2_2_2.stdout" "basic_barriers.cl" 2 2 2 2 )
-
-add_test_workgroup("\"workgroup/unbarriered for loops (loops)\"" "forloops_2_2_1_1.stdout" "forloops.cl" 2 2 1 1)
-
-add_test_workgroup("\"workgroup/barriered for loops (loops)\"" "loopbarriers_2_2_1_1.stdout" "loopbarriers.cl" 2 2 1 1 )
-
-add_test_workgroup("\"workgroup/conditional barrier (loops)\"" "cond_barriers_1_2_1_1.stdout" "conditional_barriers.cl" 1 2 1 1)
-
-add_test_workgroup("\"workgroup/b-loop with none of the WIs reaching the barrier (loops)\"" "tricky_for_1_2_1_1.stdout" "tricky_for.cl" 1 2 1 1)
-
-add_test_workgroup("\"workgroup/forcing horizontal parallelization to some outer loops (loops)\"" "outerlooppar_2_2_1_1.stdout" "outerlooppar.cl" 2 2 1 1)
-
-add_test_workgroup("\"workgroup/loop with two paths to the latch (loops)\"" "for_bug_1_2_1_1.stdout" "for_bug.cl" 1 2 1 1)
-
-add_test_workgroup("\"workgroup/b-loop with two latches (loops)\"" "multilatch_bloop_1_3_1_1.stdout" "multilatch_bloop.cl" 1 3 1 1)
-
-add_test_workgroup_sorted("\"workgroup/workgroup_sizes: work-items get wrong ids (loops)\"" "print_all_ids_114114.txt" "print_all_ids.cl" 1 1 1 4)
-
-
-set_tests_properties( "\"workgroup/unconditional barriers (loops)\""
-  "\"workgroup/unbarriered for loops (loops)\""
-  "\"workgroup/barriered for loops (loops)\""
-  "\"workgroup/conditional barrier (loops)\""
-  "\"workgroup/b-loop with none of the WIs reaching the barrier (loops)\""
-  "\"workgroup/forcing horizontal parallelization to some outer loops (loops)\""
-  "\"workgroup/loop with two paths to the latch (loops)\""
-  "\"workgroup/b-loop with two latches (loops)\""
-  "\"workgroup/workgroup_sizes: work-items get wrong ids (loops)\""
+add_test_pocl(NAME "workgroup/different_implicit_barrier_injection_scenarios_LOOPS"
+              EXPECTED_OUTPUT "implicit_barriers_1_2_1_1.stdout"
+              COMMAND "run_kernel" "implicit_barriers.cl" 1 2 1 1)
+
+add_test_pocl(NAME "workgroup/unconditional_barriers_LOOPS"
+              EXPECTED_OUTPUT "basic_barriers_2_2_2_2.stdout"
+              COMMAND "run_kernel" "basic_barriers.cl" 2 2 2 2)
+
+add_test_pocl(NAME "workgroup/unbarriered_for_loops_LOOPS"
+              EXPECTED_OUTPUT "forloops_2_2_1_1.stdout"
+              COMMAND "run_kernel" "forloops.cl" 2 2 1 1)
+
+add_test_pocl(NAME "workgroup/barriered_for_loops_LOOPS"
+              EXPECTED_OUTPUT "loopbarriers_2_2_1_1.stdout"
+              COMMAND "run_kernel" "loopbarriers.cl" 2 2 1 1)
+
+add_test_pocl(NAME "workgroup/conditional_barrier_LOOPS"
+              EXPECTED_OUTPUT "cond_barriers_1_2_1_1.stdout"
+              COMMAND "run_kernel" "conditional_barriers.cl" 1 2 1 1)
+
+add_test_pocl(NAME "workgroup/b_loop_with_none_of_the_WIs_reaching_the_barrier_LOOPS"
+              EXPECTED_OUTPUT "tricky_for_1_2_1_1.stdout"
+              COMMAND "run_kernel" "tricky_for.cl" 1 2 1 1)
+
+add_test_pocl(NAME "workgroup/forcing_horizontal_parallelization_to_some_outer_loops_LOOPS"
+              EXPECTED_OUTPUT "outerlooppar_2_2_1_1.stdout"
+              COMMAND "run_kernel" "outerlooppar.cl" 2 2 1 1)
+
+add_test_pocl(NAME "workgroup/loop_with_two_paths_to_the_latch_LOOPS"
+              EXPECTED_OUTPUT "for_bug_1_2_1_1.stdout"
+              COMMAND "run_kernel" "for_bug.cl" 1 2 1 1)
+
+add_test_pocl(NAME "workgroup/b_loop_with_two_latches_LOOPS"
+              EXPECTED_OUTPUT "multilatch_bloop_1_3_1_1.stdout"
+              COMMAND "run_kernel" "multilatch_bloop.cl" 1 3 1 1)
+
+add_test_pocl(NAME "workgroup/workgroup_sizes_work_items_get_wrong_ids_LOOPS"
+              SORT_OUTPUT
+              EXPECTED_OUTPUT "print_all_ids_114114.txt"
+              COMMAND "run_kernel" "print_all_ids.cl" 1 1 1 4)
+
+
+set_tests_properties( "workgroup/unconditional_barriers_LOOPS"
+  "workgroup/unbarriered_for_loops_LOOPS"
+  "workgroup/barriered_for_loops_LOOPS"
+  "workgroup/conditional_barrier_LOOPS"
+  "workgroup/b_loop_with_none_of_the_WIs_reaching_the_barrier_LOOPS"
+  "workgroup/forcing_horizontal_parallelization_to_some_outer_loops_LOOPS"
+  "workgroup/loop_with_two_paths_to_the_latch_LOOPS"
+  "workgroup/b_loop_with_two_latches_LOOPS"
+  "workgroup/workgroup_sizes_work_items_get_wrong_ids_LOOPS"
   PROPERTIES
     COST 2.0
     PROCESSORS 1
     LABELS "workgroup"
     ENVIRONMENT "POCL_DEVICES=basic;POCL_WORK_GROUP_METHOD=workitemloops"
-    DEPENDS "pocl_version_check")
+    DEPENDS "pocl_version_check"
+    LABELS "internal;workgroup")
diff --git a/tests/workgroup/Makefile.in b/tests/workgroup/Makefile.in
index e488b9a..d3f77d5 100644
--- a/tests/workgroup/Makefile.in
+++ b/tests/workgroup/Makefile.in
@@ -242,6 +242,7 @@ HOST = @HOST@
 HOST_AS_FLAGS = @HOST_AS_FLAGS@
 HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
 HOST_CPU = @HOST_CPU@
+HOST_DEVICE_EXTENSION_DEFINES = @HOST_DEVICE_EXTENSION_DEFINES@
 HOST_LD_FLAGS = @HOST_LD_FLAGS@
 HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
 HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
@@ -249,6 +250,7 @@ HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
 HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
 HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
 HSAILASM = @HSAILASM@
+HSA_DEVICE_EXTENSION_DEFINES = @HSA_DEVICE_EXTENSION_DEFINES@
 HSA_INCLUDES = @HSA_INCLUDES@
 HSA_LIBS = @HSA_LIBS@
 HWLOC_CFLAGS = @HWLOC_CFLAGS@
@@ -266,8 +268,6 @@ LD_FLAGS_BIN = @LD_FLAGS_BIN@
 LIBOBJS = @LIBOBJS@
 LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
 LIBS = @LIBS@
-LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
-LIBSPE_LIBS = @LIBSPE_LIBS@
 LIBTOOL = @LIBTOOL@
 LIB_AGE_VERSION = @LIB_AGE_VERSION@
 LIB_CURRENT_VERSION = @LIB_CURRENT_VERSION@
@@ -343,6 +343,7 @@ TCECC = @TCECC@
 TCEMC_AVAILABLE = @TCEMC_AVAILABLE@
 TCE_AVAILABLE = @TCE_AVAILABLE@
 TCE_CONFIG = @TCE_CONFIG@
+TCE_DEVICE_EXTENSION_DEFINES = @TCE_DEVICE_EXTENSION_DEFINES@
 VERSION = @VERSION@
 abs_builddir = @abs_builddir@
 abs_srcdir = @abs_srcdir@
diff --git a/tools/scripts/run_all_tests b/tools/scripts/run_all_tests
new file mode 100755
index 0000000..dde4a93
--- /dev/null
+++ b/tools/scripts/run_all_tests
@@ -0,0 +1,75 @@
+#!/bin/bash
+# run_all_tests - Runs a longer test suite using different device drivers etc.
+#
+# Copyright (c) 2012 Pekka Jääskeläinen / Tampere Univ. of Technology
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+failures=no
+
+function run {
+    log=`mktemp`
+    if $* > $log 2>&1;
+        then
+        echo "OK"
+        rm -f $log
+        else
+        echo "FAIL! See $log."
+        failures=yes
+        fi
+}
+
+function run_silently {
+   log=`mktemp`
+   if $* > $log 2>&1;
+        then
+        rm -f $log
+        else
+        echo "FAIL! See $log."
+        failures=yes
+        fi
+}
+
+echo -n "Running 'make check' without ocl-icd and the default device..."
+run_silently ./configure --enable-testsuites=all --disable-icd $*
+run_silently make clean
+run_silently make 
+run make check
+
+echo -n "Running 'make check' with ocl-icd and the default device..."
+run_silently ./configure --enable-testsuites=all $*
+run_silently make clean
+run_silently make 
+run make check
+
+# The Basic device has broken tests at the moment. Re-enable these runs once they are fixed.
+#echo -n "Running 'make check' with the 'basic' device (without ICD)..."
+#export POCL_DEVICES="basic"
+#run make check
+
+#echo -n "Running 'make check' with multiple devices (without ICD)..."
+#export POCL_DEVICES="pthread basic pthread pthread"
+#run make check
+
+if test "$failures" == "yes";
+then
+    exit 1
+else
+    exit 0
+fi
\ No newline at end of file
diff --git a/tools/scripts/run_hsa_tests b/tools/scripts/run_hsa_tests
index f54433a..37e2bc2 100755
--- a/tools/scripts/run_hsa_tests
+++ b/tools/scripts/run_hsa_tests
@@ -23,4 +23,8 @@
 
 export POCL_BUILDING=1
 export POCL_DEVICES="hsa"
-make check TESTSUITEFLAGS="-k hsa"
+if [ ! -f "CMakeCache.txt" ]; then
+  make check TESTSUITEFLAGS="-k hsa"
+else
+  ctest -L hsa $@
+fi
diff --git a/lib/kernel/cellspu/Makefile.am b/tools/scripts/run_tta_tests
old mode 100644
new mode 100755
similarity index 72%
rename from lib/kernel/cellspu/Makefile.am
rename to tools/scripts/run_tta_tests
index 9c89c51..c0a0665
--- a/lib/kernel/cellspu/Makefile.am
+++ b/tools/scripts/run_tta_tests
@@ -1,6 +1,7 @@
-# Process this file with automake to produce Makefile.in
-# 
-# Copyright (c) 2011 Universidad Rey Juan Carlos
+#!/bin/bash
+# run_tta_tests - Runs tests against the tta device drivers.
+#
+# Copyright (c) 2012 Pekka Jääskeläinen / Tampere Univ. of Technology
 # 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,16 +21,13 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+export POCL_BUILDING=1
+export POCL_DEVICES="ttasim"
+D=$(dirname "$(readlink -f "$0")")
+export POCL_TTASIM0_PARAMETERS="$D/../data/test_machine.adf"
 
-
-KERNEL_TARGET = tce
-
-# Use TARGET flags:
-CLANG_FLAGS = @TARGET_CLANG_FLAGS@ -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off
-LLC_FLAGS   = @TARGET_LLC_FLAGS@
-LD_FLAGS    = @TARGET_LD_FLAGS@
-
-include ../rules.mk
-include ../sources.mk
-
-EXTRA_DIST = CMakeLists.txt
+if [ ! -f "CMakeCache.txt" ]; then
+  make check TESTSUITEFLAGS="-k tce"
+else
+  ctest -L tce $@
+fi

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/collab-maint/pocl.git



More information about the Pkg-opencl-commits mailing list