[oclgrind] 01/01: New upstream version 18.3

James Price jprice-guest at moszumanska.debian.org
Sat Mar 10 08:42:03 UTC 2018


This is an automated email from the git hooks/post-receive script.

jprice-guest pushed a commit to annotated tag upstream/18.3
in repository oclgrind.

commit bb8001c37ce43a4d14275ed4258322334e49c26b
Author: James Price <j.price at bristol.ac.uk>
Date:   Fri Mar 9 21:38:54 2018 +0000

    New upstream version 18.3
---
 .gitignore                                         |   53 +-
 .travis-deps.sh                                    |    8 +-
 .travis.yml                                        |    4 +-
 CMakeLists.txt                                     |  141 ++-
 Makefile.am                                        |  133 ---
 NEWS                                               |   17 +
 README.md                                          |   40 +-
 cmake_config.h.in => config.h.in                   |    2 +
 configure.ac                                       |  148 ---
 m4/m4_ax_check_compile_flag.m4                     |   74 --
 src/CL/cl.h                                        |  610 +++++++-----
 src/CL/cl_d3d10.h                                  |   23 +-
 src/CL/cl_d3d11.h                                  |   24 +-
 src/CL/cl_dx9_media_sharing.h                      |   25 +-
 src/CL/cl_egl.h                                    |   29 +-
 src/CL/cl_ext.h                                    |  476 +++++++--
 src/CL/cl_gl.h                                     |    9 +-
 src/CL/cl_gl_ext.h                                 |    7 +-
 src/CL/cl_platform.h                               |  371 ++++---
 src/CL/opencl.h                                    |    7 +-
 src/core/Context.cpp                               |   10 +-
 src/core/Kernel.cpp                                |   91 +-
 src/core/Kernel.h                                  |    4 -
 src/core/KernelInvocation.cpp                      |   67 +-
 src/core/KernelInvocation.h                        |    4 +-
 src/core/Memory.cpp                                |   34 +-
 src/core/Memory.h                                  |    4 +-
 src/core/Program.cpp                               |  383 ++++---
 src/core/Program.h                                 |   12 +
 src/core/WorkGroup.cpp                             |   10 +-
 src/core/WorkGroup.h                               |    1 +
 src/core/WorkItem.cpp                              |  336 +++++--
 src/core/WorkItem.h                                |   11 +-
 src/core/WorkItemBuiltins.cpp                      |  395 ++++----
 src/core/clc.h                                     | 1041 --------------------
 src/core/common.cpp                                |  134 ++-
 src/core/common.h                                  |   28 +-
 src/core/gen_clc_h.cmake                           |   11 -
 src/core/gen_clc_h.sh                              |   18 -
 src/core/gen_opencl-c.h.cmake                      |   14 +
 src/kernel/Simulation.cpp                          |   23 +-
 src/kernel/oclgrind-kernel.cpp                     |  114 ++-
 src/plugins/InstructionCounter.cpp                 |   94 +-
 src/plugins/InstructionCounter.h                   |   16 +-
 src/plugins/InteractiveDebugger.cpp                |  167 +---
 src/plugins/Logger.cpp                             |   12 +-
 src/plugins/RaceDetector.cpp                       |    4 +-
 src/runtime/icd.h                                  |    4 +
 src/runtime/oclgrind.cpp                           |  152 ++-
 src/runtime/runtime.cpp                            |  573 ++++++++++-
 tests/apps/CMakeLists.txt                          |    7 +-
 tests/apps/image/image.c                           |    2 +-
 tests/apps/vecadd/vecadd.c                         |    2 +-
 tests/common/common.c                              |    4 +-
 tests/common/common.h                              |    2 +-
 tests/kernels/CMakeLists.txt                       |   28 +-
 tests/kernels/TESTS                                |   10 +-
 tests/kernels/atomics/atom_add.cl                  |    6 +
 tests/kernels/atomics/atom_add.ref                 |    2 +
 tests/kernels/atomics/atom_add.sim                 |    6 +
 tests/kernels/atomics/atomic_cmpxchg_read_race.cl  |    2 +-
 tests/kernels/atomics/atomic_cmpxchg_read_race.ref |    2 +-
 tests/kernels/atomics/atomic_minmax_signed.cl      |   16 +
 tests/kernels/atomics/atomic_minmax_signed.ref     |   13 +
 tests/kernels/atomics/atomic_minmax_signed.sim     |   18 +
 tests/kernels/atomics/atomic_race_after.cl         |    4 +-
 tests/kernels/atomics/atomic_race_after.ref        |    6 +-
 tests/kernels/atomics/atomic_race_after.sim        |    1 +
 tests/kernels/atomics/atomic_race_before.cl        |    2 +-
 tests/kernels/atomics/atomic_race_before.ref       |    2 +-
 tests/kernels/bugs/const_gep_expr_pointee_type.cl  |    2 +
 tests/kernels/bugs/llvm_memcpyopt_bug.cl           |   15 +
 tests/kernels/bugs/llvm_memcpyopt_bug.ref          |    5 +
 tests/kernels/bugs/llvm_memcpyopt_bug.sim          |    6 +
 tests/kernels/interactive/struct_member.cl         |   19 +
 tests/kernels/interactive/struct_member.inp        |   21 +
 tests/kernels/interactive/struct_member.ref        |   31 +
 tests/kernels/interactive/struct_member.sim        |    9 +
 tests/kernels/misc/global_variables.cl             |   14 +
 tests/kernels/misc/global_variables.ref            |    7 +
 tests/kernels/misc/global_variables.sim            |    8 +
 tests/kernels/misc/non_uniform_work_groups.cl      |   16 +
 tests/kernels/misc/non_uniform_work_groups.ref     |   34 +
 tests/kernels/misc/non_uniform_work_groups.sim     |    7 +
 tests/kernels/misc/printf.cl                       |    9 +
 tests/kernels/misc/printf.ref                      |    3 +
 tests/kernels/misc/printf.sim                      |   12 +
 tests/kernels/misc/reduce.cl                       |   24 +-
 tests/kernels/misc/reduce.ref                      |   19 +-
 tests/kernels/misc/reduce.sim                      |    8 +-
 tests/kernels/misc/switch_case.cl                  |   25 +
 tests/kernels/misc/switch_case.ref                 |    9 +
 tests/kernels/misc/switch_case.sim                 |   16 +
 .../uninitialized/padded_nested_struct_memcpy.cl   |    4 +-
 .../uninitialized/padded_nested_struct_memcpy.ref  |    4 +-
 .../uninitialized/padded_struct_alloca_fp.cl       |    2 +-
 .../uninitialized/padded_struct_alloca_fp.ref      |   13 +-
 .../uninitialized/padded_struct_alloca_fp.sim      |    2 +-
 .../uninitialized/padded_struct_memcpy_fp.cl       |    2 +-
 .../uninitialized/padded_struct_memcpy_fp.ref      |   13 +-
 .../uninitialized/padded_struct_memcpy_fp.sim      |    2 +-
 .../uninitialized_padded_nested_struct_memcpy.cl   |    2 +-
 .../uninitialized_padded_nested_struct_memcpy.ref  |    2 +-
 .../uninitialized_padded_struct_memcpy.ref         |   13 +-
 .../uninitialized_padded_struct_memcpy.sim         |    2 +-
 tests/kernels/wait_event/wait_event_invalid.cl     |    2 +-
 tests/run_test.py                                  |   43 +-
 tests/runtime/CMakeLists.txt                       |   13 +-
 tests/runtime/build_program.c                      |   73 ++
 tests/runtime/build_program.ref                    |    6 +
 tests/runtime/inc/nospace/header.h                 |    1 +
 tests/runtime/inc/with space/header.h              |    1 +
 tests/runtime/map_buffer.c                         |    2 +-
 tests/runtime/sampler.c                            |  138 +++
 tests/runtime/sampler.ref                          |    1 +
 115 files changed, 3803 insertions(+), 2960 deletions(-)

diff --git a/.gitignore b/.gitignore
index 34ed68f..86aeee5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,50 +1,27 @@
-# Autotools generated files
-aclocal.m4
-autom4te.cache
-compile
-config.guess
-config.h
-config.h.in
-config.log
-config.status
-config.sub
-configure
-depcomp
-.deps
-*.dirstamp
-install-sh
-ltmain.sh
-m4/libtool.m4
-m4/ltoptions.m4
-m4/ltsugar.m4
-m4/ltversion.m4
-m4/lt~obsolete.m4
+# CMake output
+*.cmake
+!src/core/gen_clc_h.cmake
+CMakeCache.txt
+CMakeFiles/
 Makefile
-Makefile.in
-missing
-stamp-h1
-test-driver
+config.h
+.ninja_deps
+.ninja_log
+build.ninja
+rules.ninja
 
 # Compiler output
 *.o
-*.lo
-.libs
-liboclgrind.la
-liboclgrind-rt.la
-liboclgrind-rt-icd.la
-libtool
+*.so
+*.dylib
+include
 /oclgrind
 oclgrind.icd
 oclgrind-kernel
 src/core/clc_h.cpp
-
-# Test output
-test-suite.log
-*.trs
-*.log
-*.out
-*.diff
+tests/apps/image/image
 tests/apps/vecadd/vecadd
+tests/runtime/map_buffer
 
 # Misc
 oclgrind-*.tar.gz
diff --git a/.travis-deps.sh b/.travis-deps.sh
index 628d1f8..bf0ab9e 100644
--- a/.travis-deps.sh
+++ b/.travis-deps.sh
@@ -3,7 +3,7 @@
 if [ "$TRAVIS_OS_NAME" == "linux" ]
 then
     # Add repositories
-    sudo add-apt-repository -y 'deb http://apt.llvm.org/trusty/ llvm-toolchain-trusty-3.9 main'
+    sudo add-apt-repository -y 'deb http://apt.llvm.org/trusty/ llvm-toolchain-trusty-4.0 main'
     wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
     sudo apt-get update -qq
 
@@ -11,9 +11,9 @@ then
     sudo apt-get remove llvm
 
     # Install Clang + LLVM
-    sudo apt-get install -y llvm-3.9-dev libclang-3.9-dev clang-3.9
-    sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-3.9 20
-    sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-3.9 20
+    sudo apt-get install -y llvm-4.0-dev libclang-4.0-dev clang-4.0
+    sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-4.0 20
+    sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-4.0 20
     sudo rm -f /usr/local/clang-3.5.0/bin/clang
     sudo rm -f /usr/local/clang-3.5.0/bin/clang++
 
diff --git a/.travis.yml b/.travis.yml
index 8a94414..f6a805e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,7 +20,7 @@ before_install:
   - bash ./.travis-deps.sh
 
 script:
-  - if [ "${TRAVIS_OS_NAME}" = "linux" ]; then cmake . -DLLVM_DIR=/usr/lib/llvm-3.9/lib/cmake/llvm ; fi
-  - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then cmake . -DLLVM_DIR=/usr/local/Cellar/llvm/3.8.1/share/llvm/cmake ; fi
+  - if [ "${TRAVIS_OS_NAME}" = "linux" ]; then cmake . -DLLVM_DIR=/usr/lib/llvm-4.0/lib/cmake/llvm ; fi
+  - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then cmake . -DLLVM_DIR=/usr/local/opt/llvm/lib/cmake/llvm ; fi
   - make -j 2
   - ctest --output-on-failure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4838fd5..ca522c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,12 +8,13 @@
 
 cmake_minimum_required(VERSION 2.8.12)
 project(Oclgrind)
-set(Oclgrind_VERSION_MAJOR 16)
-set(Oclgrind_VERSION_MINOR 10)
+set(Oclgrind_VERSION_MAJOR 18)
+set(Oclgrind_VERSION_MINOR 3)
 
 include(CheckIncludeFiles)
 include(CheckIncludeFileCXX)
 include(CheckLibraryExists)
+include(TestBigEndian)
 
 # Enable C99 for GCC (required for tests)
 if (CMAKE_COMPILER_IS_GNUCC)
@@ -42,6 +43,9 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-availability")
 endif()
 
+# Detect endianness
+test_big_endian(IS_BIG_ENDIAN)
+
 
 # Find LLVM
 find_package(LLVM REQUIRED CONFIG NO_CMAKE_BUILDS_PATH)
@@ -49,10 +53,18 @@ message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
 message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
 
 # Check LLVM version
-if (${LLVM_PACKAGE_VERSION} VERSION_LESS "3.6")
-  message(FATAL_ERROR "LLVM version must be >= 3.6")
+if (${LLVM_PACKAGE_VERSION} VERSION_LESS "3.9")
+  message(FATAL_ERROR "LLVM version must be >= 3.9")
+endif()
+
+# Warn about LLVM 4.0 InstCombine/addrspacecast bug
+if ("${LLVM_VERSION_MAJOR}" STREQUAL "4" AND
+    "${LLVM_VERSION_PATCH}" STREQUAL "0")
+   message(WARNING
+   "There is a known bug with LLVM 4.0.0 that will cause "
+   "Oclgrind to fail on some kernels with a message about "
+   "`addrspacecast`. Use LLVM 3.9 or 4.0.1+ if possible.")
 endif()
-set(LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
 
 # Add flags for LLVM
 add_definitions(${LLVM_DEFINITIONS})
@@ -61,14 +73,9 @@ link_directories(${LLVM_LIBRARY_DIRS})
 
 # Get LLVM libraries for linking
 llvm_map_components_to_libnames(LLVM_LIBS
-  bitreader bitwriter core instrumentation ipo irreader
+  bitreader bitwriter core coverage instrumentation ipo irreader
   linker lto mcparser objcarcopts option target)
 
-if (NOT (${LLVM_PACKAGE_VERSION} VERSION_LESS "3.9"))
-  llvm_map_components_to_libnames(LLVM_COVERAGE coverage)
-  list(APPEND LLVM_LIBS ${LLVM_COVERAGE})
-endif()
-
 if (NOT (${LLVM_PACKAGE_VERSION} VERSION_LESS "4.0"))
   llvm_map_components_to_libnames(LLVM_COROUTINES coroutines)
   list(APPEND LLVM_LIBS ${LLVM_COROUTINES})
@@ -109,6 +116,18 @@ if ("${CLANG_LIB}" STREQUAL "CLANG_LIB-NOTFOUND")
   message(FATAL_ERROR "Clang libraries not found (set CLANG_ROOT)")
 endif()
 
+# Get path to Clang's opencl-c.h header
+get_filename_component(CLANG_LIB_DIR "${CLANG_LIB}" DIRECTORY)
+set(CLANG_FULL_VERSION
+    "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}")
+set(CLANG_OPENCL_C_H
+    "${CLANG_LIB_DIR}/clang/${CLANG_FULL_VERSION}/include/opencl-c.h")
+if (NOT EXISTS "${CLANG_OPENCL_C_H}")
+  message(FATAL_ERROR "\nClang opencl-c.h not found:\n\t${CLANG_OPENCL_C_H}")
+else()
+  message(STATUS "Using opencl-c.h: ${CLANG_OPENCL_C_H}")
+endif()
+
 # Check for clang
 find_program(CLANG clang
              PATHS "${CLANG_ROOT}/bin" "${LLVM_TOOLS_BINARY_DIR}"
@@ -127,8 +146,6 @@ if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
   include_directories(${READLINE_DIR}/include)
   link_directories(${READLINE_DIR}/lib)
 
-  message(STATUS ${CMAKE_REQUIRED_LIBRARIES})
-
   check_include_files("stdio.h;readline/readline.h" HAVE_READLINE_H)
   check_include_files("stdio.h;readline/history.h" HAVE_HISTORY_H)
   check_library_exists(readline readline
@@ -157,12 +174,12 @@ endif()
 set(LIBDIR_SUFFIX "${_LIBDIR_SUFFIX}"
     CACHE STRING "Suffix for installed library directory")
 
-# Generate stringified clc.h
+# Generate stringified opencl-c.h
 add_custom_command(
-  OUTPUT src/core/clc_h.cpp
-  COMMAND ${CMAKE_COMMAND} -DSOURCE_FILE=${CMAKE_SOURCE_DIR}/src/core/clc.h
-    -P ${CMAKE_SOURCE_DIR}/src/core/gen_clc_h.cmake
-  DEPENDS src/core/clc.h src/core/gen_clc_h.cmake
+  OUTPUT src/core/opencl-c.h.cpp
+  COMMAND ${CMAKE_COMMAND} -DSOURCE_FILE=${CLANG_OPENCL_C_H}
+    -P ${CMAKE_SOURCE_DIR}/src/core/gen_opencl-c.h.cmake
+  DEPENDS ${CLANG_OPENCL_C_H} src/core/gen_opencl-c.h.cmake
 )
 
 include_directories("src/" "${PROJECT_BINARY_DIR}")
@@ -186,7 +203,7 @@ set(CORE_HEADERS
 
 add_library(oclgrind ${CORE_LIB_TYPE}
   ${CORE_HEADERS}
-  src/core/clc_h.cpp
+  src/core/opencl-c.h.cpp
   src/core/common.cpp
   src/core/Context.cpp
   src/core/half.cpp
@@ -244,6 +261,13 @@ endif()
 add_library(oclgrind-rt SHARED ${RUNTIME_SOURCES} ${DLL_EXPORTS})
 target_link_libraries(oclgrind-rt ${CMAKE_DL_LIBS} oclgrind)
 
+if (UNIX AND NOT APPLE)
+  # Change the SONAME of the library so that it gets recognized by dlopen
+  set_target_properties(oclgrind-rt PROPERTIES
+                        NO_SONAME ON
+                        LINK_FLAGS "-Wl,-soname,libOpenCL.so")
+endif()
+
 add_executable(oclgrind-exe src/runtime/oclgrind.cpp)
 set_target_properties(oclgrind-exe PROPERTIES OUTPUT_NAME oclgrind)
 target_compile_definitions(oclgrind-exe PRIVATE
@@ -255,60 +279,87 @@ add_executable(oclgrind-kernel
   src/kernel/Simulation.cpp)
 target_link_libraries(oclgrind-kernel oclgrind)
 
-set(CLC_HEADERS
- ${CMAKE_BINARY_DIR}/include/oclgrind/clc.h
- ${CMAKE_BINARY_DIR}/include/oclgrind/clc32.pch
- ${CMAKE_BINARY_DIR}/include/oclgrind/clc64.pch
+set(OPENCL_C_H
+ ${CMAKE_BINARY_DIR}/include/oclgrind/opencl-c.h
+ ${CMAKE_BINARY_DIR}/include/oclgrind/opencl-c-1.2-32.pch
+ ${CMAKE_BINARY_DIR}/include/oclgrind/opencl-c-1.2-64.pch
+ ${CMAKE_BINARY_DIR}/include/oclgrind/opencl-c-2.0-32.pch
+ ${CMAKE_BINARY_DIR}/include/oclgrind/opencl-c-2.0-64.pch
 )
 
-add_custom_target(CLC_HEADERS ALL DEPENDS ${CLC_HEADERS})
+add_custom_target(OPENCL_C_HEADERS ALL DEPENDS ${OPENCL_C_H})
 
 add_custom_command(
-  OUTPUT include/oclgrind/clc.h
+  OUTPUT include/oclgrind/opencl-c.h
   POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E
-    copy ${CMAKE_SOURCE_DIR}/src/core/clc.h include/oclgrind/clc.h
-  DEPENDS src/core/clc.h)
+    copy ${CLANG_OPENCL_C_H} include/oclgrind/opencl-c.h
+  DEPENDS ${CLANG_OPENCL_C_H})
 
-# Generate precompiled headers for clc.h
-set(CLC_SYSROOT "${CMAKE_BINARY_DIR}/include/oclgrind/")
+# Generate precompiled headers for opencl-c.h
+set(OPENCL_C_H_SYSROOT "${CMAKE_BINARY_DIR}/include/oclgrind/")
 if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
-  string(REPLACE "/" "\\" CLC_SYSROOT "${CLC_SYSROOT}")
+  string(REPLACE "/" "\\" OPENCL_C_H_SYSROOT "${OPENCL_C_H_SYSROOT}")
 endif()
 add_custom_command(
-  OUTPUT include/oclgrind/clc32.pch
+  OUTPUT include/oclgrind/opencl-c-1.2-32.pch
   POST_BUILD
   COMMAND
     ${CLANG}
-    -cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin
+    -cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin -fgnu89-inline
     -emit-pch -triple spir-unknown-unknown
-    -relocatable-pch -isysroot "${CLC_SYSROOT}"
-    include/oclgrind/clc.h
-    -o include/oclgrind/clc32.pch
-  DEPENDS include/oclgrind/clc.h
+    -relocatable-pch -isysroot "${OPENCL_C_H_SYSROOT}"
+    include/oclgrind/opencl-c.h
+    -o include/oclgrind/opencl-c-1.2-32.pch
+  DEPENDS include/oclgrind/opencl-c.h
 )
 add_custom_command(
-  OUTPUT include/oclgrind/clc64.pch
+  OUTPUT include/oclgrind/opencl-c-1.2-64.pch
   POST_BUILD
   COMMAND
     ${CLANG}
-    -cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin
+    -cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin -fgnu89-inline
     -emit-pch -triple spir64-unknown-unknown
-    -relocatable-pch -isysroot "${CLC_SYSROOT}"
-    include/oclgrind/clc.h
-    -o include/oclgrind/clc64.pch
-  DEPENDS include/oclgrind/clc.h
+    -relocatable-pch -isysroot "${OPENCL_C_H_SYSROOT}"
+    include/oclgrind/opencl-c.h
+    -o include/oclgrind/opencl-c-1.2-64.pch
+  DEPENDS include/oclgrind/opencl-c.h
+)
+add_custom_command(
+  OUTPUT include/oclgrind/opencl-c-2.0-32.pch
+  POST_BUILD
+  COMMAND
+    ${CLANG}
+    -cc1 -x cl -cl-std=CL2.0 -O0 -fno-builtin -fgnu89-inline
+    -emit-pch -triple spir-unknown-unknown
+    -relocatable-pch -isysroot "${OPENCL_C_H_SYSROOT}"
+    include/oclgrind/opencl-c.h
+    -o include/oclgrind/opencl-c-2.0-32.pch
+  DEPENDS include/oclgrind/opencl-c.h
+)
+add_custom_command(
+  OUTPUT include/oclgrind/opencl-c-2.0-64.pch
+  POST_BUILD
+  COMMAND
+    ${CLANG}
+    -cc1 -x cl -cl-std=CL2.0 -O0 -fno-builtin -fgnu89-inline
+    -emit-pch -triple spir64-unknown-unknown
+    -relocatable-pch -isysroot "${OPENCL_C_H_SYSROOT}"
+    include/oclgrind/opencl-c.h
+    -o include/oclgrind/opencl-c-2.0-64.pch
+  DEPENDS include/oclgrind/opencl-c.h
 )
 
 
 # Generate config.h
-configure_file("cmake_config.h.in" "config.h")
+set(LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
+configure_file("config.h.in" "config.h")
 
 
 # Generate ICD loader if not on Windows
 if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
   file(GENERATE OUTPUT ${CMAKE_BINARY_DIR}/oclgrind.icd
-       CONTENT "$<TARGET_FILE:oclgrind-rt-icd>\n")
+       CONTENT "${CMAKE_INSTALL_PREFIX}/lib${LIBDIR_SUFFIX}/$<TARGET_FILE_NAME:oclgrind-rt-icd>\n")
 endif()
 
 install(TARGETS
@@ -318,7 +369,7 @@ install(TARGETS
   oclgrind oclgrind-rt oclgrind-rt-icd
   DESTINATION "lib${LIBDIR_SUFFIX}")
 install(FILES
-  ${CORE_HEADERS} ${CLC_HEADERS}
+  ${CORE_HEADERS} ${OPENCL_C_H}
   DESTINATION include/oclgrind)
 if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
   install(FILES
diff --git a/Makefile.am b/Makefile.am
deleted file mode 100644
index 144a027..0000000
--- a/Makefile.am
+++ /dev/null
@@ -1,133 +0,0 @@
-# Makefile.am (Oclgrind)
-# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
-# University of Bristol. All rights reserved.
-#
-# This program is provided under a three-clause BSD license. For full
-# license terms please see the LICENSE file distributed with this
-# source code.
-
-AUTOMAKE_OPTIONS = subdir-objects
-ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4
-
-SUBDIRS = . tests
-
-AM_CPPFLAGS = -I$(top_srcdir)/src/ -Wall -fno-rtti
-
-# Suppress warnings from OpenCL runtime API headers
-if USING_CLANG
-AM_CPPFLAGS += -Wno-ignored-attributes -Wno-gcc-compat -Wno-availability
-endif USING_CLANG
-
-lib_LTLIBRARIES = liboclgrind.la liboclgrind-rt.la liboclgrind-rt-icd.la
-
-
-LLVM_COMPONENTS = bitreader bitwriter core instrumentation ipo	\
-irreader linker mcparser objcarcopts option target
-
-if LLVM_39_OR_NEWER
-LLVM_COMPONENTS += coverage
-endif LLVM_39_OR_NEWER
-
-if LLVM_40_OR_NEWER
-LLVM_COMPONENTS += coroutines
-endif LLVM_40_OR_NEWER
-
-LLVM_LIBS = `$(llvm_config) --system-libs --libs $(LLVM_COMPONENTS)`
-
-liboclgrind_la_SOURCES = src/core/common.h src/core/common.cpp		\
- src/core/Context.h src/core/Context.cpp src/core/half.h		\
- src/core/half.cpp src/core/Kernel.h src/core/Kernel.cpp		\
- src/core/KernelInvocation.h src/core/KernelInvocation.cpp		\
- src/core/Memory.h src/core/Memory.cpp src/core/Plugin.h		\
- src/core/Plugin.cpp src/core/Program.h src/core/Program.cpp		\
- src/core/Queue.h src/core/Queue.cpp src/core/WorkItem.h		\
- src/core/WorkItem.cpp src/core/WorkItemBuiltins.cpp			\
- src/core/WorkGroup.h src/core/WorkGroup.cpp				\
- src/plugins/InstructionCounter.h src/plugins/InstructionCounter.cpp	\
- src/plugins/InteractiveDebugger.h src/plugins/InteractiveDebugger.cpp	\
- src/plugins/Logger.h src/plugins/Logger.cpp src/plugins/MemCheck.h	\
- src/plugins/MemCheck.cpp src/plugins/RaceDetector.h			\
- src/plugins/RaceDetector.cpp src/plugins/Uninitialized.h \
- src/plugins/Uninitialized.cpp
-nodist_liboclgrind_la_SOURCES = src/core/clc_h.cpp config.h
-liboclgrind_la_LDFLAGS = -lclangFrontend -lclangDriver		\
--lclangSerialization -lclangCodeGen -lclangParse -lclangSema	\
--lclangAnalysis -lclangEdit -lclangAST -lclangLex -lclangBasic	\
-${LLVM_LIBS} $(oclgrind_extra_libs) -shared
-oclgrind_includedir = $(includedir)/oclgrind
-oclgrind_include_HEADERS = src/core/common.h src/core/Context.h	\
- src/core/half.h src/core/Kernel.h src/core/KernelInvocation.h	\
- src/core/Memory.h src/core/Plugin.h src/core/Program.h		\
- src/core/Queue.h src/core/WorkItem.h src/core/WorkGroup.h
-src/core/clc_h.cpp: src/core/gen_clc_h.sh	src/core/clc.h
-	$(top_srcdir)/src/core/gen_clc_h.sh $(top_srcdir)/src/core/clc.h $@
-
-install-data-hook:
-	cp -p src/include/oclgrind/clc.h      $(DESTDIR)$(includedir)/oclgrind/
-	cp -p src/include/oclgrind/clc32.pch  $(DESTDIR)$(includedir)/oclgrind/
-	cp -p src/include/oclgrind/clc64.pch  $(DESTDIR)$(includedir)/oclgrind/
-
-uninstall-hook:
-	rm -rf $(DESTDIR)$(includedir)/oclgrind/clc.h
-	rm -rf $(DESTDIR)$(includedir)/oclgrind/clc32.pch
-	rm -rf $(DESTDIR)$(includedir)/oclgrind/clc64.pch
-
-RUNTIME_SOURCES = src/runtime/async_queue.h				\
- src/runtime/async_queue.cpp src/runtime/icd.h src/runtime/runtime.cpp
-
-liboclgrind_rt_la_SOURCES = $(RUNTIME_SOURCES)
-liboclgrind_rt_la_LIBADD = liboclgrind.la
-liboclgrind_rt_la_LDFLAGS = -shared
-
-liboclgrind_rt_icd_la_CPPFLAGS = -DOCLGRIND_ICD $(AM_CPPFLAGS)
-liboclgrind_rt_icd_la_SOURCES = $(RUNTIME_SOURCES)
-liboclgrind_rt_icd_la_LIBADD = liboclgrind.la
-liboclgrind_rt_icd_la_LDFLAGS = -shared
-
-bin_PROGRAMS = oclgrind oclgrind-kernel
-
-oclgrind_SOURCES = src/runtime/oclgrind.cpp
-oclgrind_CPPFLAGS = -DLIBDIR_SUFFIX=""
-
-oclgrind_kernel_SOURCES = src/kernel/oclgrind-kernel.cpp	\
- src/kernel/Simulation.h src/kernel/Simulation.cpp
-oclgrind_kernel_LDADD = liboclgrind.la
-
-noinst_SCRIPTS = oclgrind.icd \
- src/include/oclgrind/clc.h \
- src/include/oclgrind/clc32.pch \
- src/include/oclgrind/clc64.pch
-oclgrind.icd: liboclgrind-rt-icd.la
-	printf $(libdir)/ >$@
-	$(GREP) dlname $< | $(AWK) -F "'" '{print $$2}' >>$@
-
-src/include/oclgrind/clc.h: $(top_srcdir)/src/core/clc.h
-	mkdir -p src/include/oclgrind
-	cp $< $@
-
-src/include/oclgrind/clc32.pch: src/include/oclgrind/clc.h
-	$(clang) \
-		-cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin \
-		-emit-pch -triple spir-unknown-unknown \
-		-relocatable-pch \
-		-isysroot $(abs_builddir)/src/include/oclgrind \
-		$< -o $@
-src/include/oclgrind/clc64.pch: src/include/oclgrind/clc.h
-	$(clang) \
-		-cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin \
-		-emit-pch -triple spir64-unknown-unknown \
-		-relocatable-pch \
-		-isysroot $(abs_builddir)/src/include/oclgrind \
-		$< -o $@
-
-EXTRA_DIST = NEWS src/core/gen_clc_h.sh src/core/clc.h src/CL/cl.h	\
- src/CL/cl_gl.h src/CL/cl_platform.h src/CL/cl_ext.h			\
- src/CL/cl_gl_ext.h src/CL/cl_egl.h src/CL/cl_d3d10.h			\
- src/CL/cl_d3d11.h src/CL/cl_dx9_media_sharing.h src/CL/opencl.h	\
- CMakeLists.txt tests/apps/CMakeLists.txt tests/kernels/CMakeLists.txt	\
- tests/runtime/CMakeLists.txt cmake_config.h.in				\
- src/core/gen_clc_h.cmake src/runtime/icd.def src/runtime/runtime.def	\
- src/install/INSTALL.darwin src/install/INSTALL.linux			\
- src/install/INSTALL.windows src/install/install.bat			\
- src/install/uninstall.bat src/install/oclgrind-icd.reg
-CLEANFILES = src/core/clc_h.cpp $(noinst_SCRIPTS)
diff --git a/NEWS b/NEWS
index c9c996b..b147239 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,23 @@ For more information, please visit the Oclgrind Wiki:
 https://github.com/jrprice/Oclgrind/wiki
 
 
+Oclgrind 18.3
+=============
+Added support for the following OpenCL 2.0 features:
+- Program-scope global variables
+- Non-uniform work-group sizes
+- Several miscellaneous API and kernel functions
+
+Other changes:
+- Switched to using Clang's builtin OpenCL header (opencl-c.h)
+- Added support for LLVM 4.0, 5.0 and 6.0
+- Dropped support for LLVM 3.6, 3.7 and 3.8
+- Added --{global,constant,local}-mem-size and --max-wgsize options
+- Removed autotools build system
+- Improved support for big-endian systems
+- Various minor bug fixes
+
+
 Oclgrind 16.10
 ==============
 This release incorporates the following changes:
diff --git a/README.md b/README.md
index 986c1bd..29edcff 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,3 @@
-========
 Oclgrind
 ========
 
@@ -21,7 +20,7 @@ Binary releases can be found on the GitHub releases page:
 
 Build dependencies
 ------------------
-To build this project, you will need LLVM and Clang 3.6 (or newer)
+To build this project, you will need LLVM and Clang 3.9 (or newer)
 development libraries and headers. If you build LLVM from source, it
 is recommended to enable optimizations to significantly improve the
 performance of Oclgrind (set `CMAKE_BUILD_TYPE` to `Release` or
@@ -73,43 +72,6 @@ copying the `oclgrind.icd` file from the build directory to
 `/etc/OpenCL/vendors/`.
 
 
-Building on Linux and OS X (autotools)
---------------------------------------
-An autotools build system is also provided as an alternative to
-CMake. This will likely be removed in a future version of Oclgrind.
-
-If you are building directly from the GitHub repository, you will need
-to run `autoreconf -i` to generate the necessary build files. This is
-not required if you are using a released source package.
-
-Run `configure` to generate the Makefile, optionally using
-`--prefix=PATH` to specify the target installation directory. If you
-don't have the LLVM includes and libraries on your search path, you
-can specify the location of your LLVM installation using the
-`--with-llvm=PATH` option. For example:
-
-    ./configure --prefix=$PWD/build/ --with-llvm=PATH/TO/LLVM/INSTALL
-
-This path should be the directory in which LLVM is installed (e.g. the
-path specified to `--prefix` or `CMAKE_INSTALL_PREFIX` when LLVM was
-built). If the Clang includes and libraries are not on your search
-path or in the same location as LLVM, you can use the
-`--with-clang=PATH` option to specify its root directory.
-
-Next, build and install with make:
-
-    make
-    make check
-    make install
-
-If installing to a non-default location, you should add the `bin/`
-directory to the `PATH` environment variable in order to make use of
-the `oclgrind` command. If you wish to use Oclgrind via the OpenCL ICD
-loader (optional), then you should create an ICD loading point by
-copying the `oclgrind.icd` file from the build directory to
-`/etc/OpenCL/vendors/`.
-
-
 Building on Windows
 -------------------
 Building Oclgrind on Windows requires Visual Studio 2013 (or newer),
diff --git a/cmake_config.h.in b/config.h.in
similarity index 79%
rename from cmake_config.h.in
rename to config.h.in
index 3794dc8..d95b988 100644
--- a/cmake_config.h.in
+++ b/config.h.in
@@ -3,3 +3,5 @@
 #define HAVE_READLINE @HAVE_READLINE@
 
 #define LLVM_VERSION @LLVM_VERSION@
+
+#define IS_BIG_ENDIAN @IS_BIG_ENDIAN@
diff --git a/configure.ac b/configure.ac
deleted file mode 100644
index 3da5e46..0000000
--- a/configure.ac
+++ /dev/null
@@ -1,148 +0,0 @@
-# configure.ac (Oclgrind)
-# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
-# University of Bristol. All rights reserved.
-#
-# This program is provided under a three-clause BSD license. For full
-# license terms please see the LICENSE file distributed with this
-# source code.
-
-AC_INIT([Oclgrind], [16.10], ,
-        [oclgrind], [https://github.com/jrprice/Oclgrind])
-AC_PREREQ([2.63])
-AC_CONFIG_SRCDIR([src/])
-AM_INIT_AUTOMAKE([foreign 1.12])
-AC_LANG(C++)
-AC_PROG_CXX
-AC_CONFIG_MACRO_DIR([m4])
-AC_CONFIG_HEADERS([config.h])
-AC_CONFIG_FILES([Makefile tests/Makefile])
-
-LT_INIT
-
-# Check if we're compiling with Clang
-AS_CASE([`$CC --version`], [*clang*], [using_clang=yes])
-AM_CONDITIONAL([USING_CLANG], [test "$using_clang" == "yes"])
-
-oclgrind_extra_libs=
-
-# Check for C++11
-AX_CHECK_COMPILE_FLAG([-std=c++11], [],
-                      [AC_MSG_ERROR([C++11 support is required])])
-CXXFLAGS="$CXXFLAGS -std=c++11"
-
-
-# --with-llvm option to specify root of LLVM installation
-AC_ARG_WITH(
-        llvm,
-        [AS_HELP_STRING([--with-llvm],
-                       [directory containing LLVM installation])],
-        [export PATH="$withval/bin:$PATH"])
-
-# Get path to llvm-config
-AC_CHECK_PROG(llvm_config, [llvm-config], `which llvm-config`)
-if test -z $llvm_config; then
-  AC_MSG_ERROR([llvm-config not found (use --with-llvm=)])
-fi
-
-# Check version of LLVM
-AC_MSG_CHECKING([llvm version])
-llvm_full_version=`$llvm_config --version`
-llvm_version=`echo $llvm_full_version | cut -b 1,3`
-AC_MSG_RESULT($llvm_full_version)
-if test $llvm_version -lt 36; then
-  AC_MSG_ERROR([LLVM version must be >= 3.6])
-fi
-AM_CONDITIONAL([LLVM_39_OR_NEWER], [test $llvm_version -ge 39])
-AM_CONDITIONAL([LLVM_40_OR_NEWER], [test $llvm_version -ge 40])
-AC_DEFINE_UNQUOTED([LLVM_VERSION],
-                   [$llvm_version],
-                   [Version of LLVM we are building against])
-
-# Add flags for LLVM
-CPPFLAGS="$CPPFLAGS `$llvm_config --cppflags`"
-LDFLAGS="$LDFLAGS `$llvm_config --ldflags`"
-
-
-# --with-clang option to specify root of Clang installation
-AC_ARG_WITH(
-        clang,
-        [AS_HELP_STRING([--with-clang],
-                       [directory containing Clang installation])],
-        [export PATH="$withval/bin:$PATH"
-         CPPFLAGS="$CPPFLAGS -I$withval/include/"
-         LDFLAGS="$LDFLAGS -L$withval/lib/"])
-
-CPPFLAGS_old="$CPPFLAGS"
-CPPFLAGS="$CPPFLAGS -std=c++11"
-
-# Check for Clang binaries, headers and libraries
-AC_CHECK_PROG(clang, [clang], `which clang`)
-AC_CHECK_HEADERS(
-        [clang/CodeGen/CodeGenAction.h],
-        [:],
-        [AC_MSG_ERROR([Clang headers not found (use --with-clang=)])])
-AC_CHECK_LIB(
-        [clangFrontend],
-        [main],
-        [:],
-        [AC_MSG_ERROR([Clang libraries not found (use --with-clang)])])
-
-CPPFLAGS="$CPPFLAGS_old"
-
-
-# GNU readline library (for interactive debugger)
-AC_ARG_WITH(
-        [readline],
-        AS_HELP_STRING([--with-readline],
-                       [location of GNU readline library]),
-        [CPPFLAGS="$CPPFLAGS -I$withval/include";
-         LDFLAGS="$LDFLAGS -L$withval/lib"])
-
-have_readline=true
-AC_CHECK_HEADER(
-        [readline/readline.h],
-        [:],
-        [have_readline=false])
-AC_CHECK_HEADER(
-        [readline/history.h],
-        [:],
-        [have_readline=false])
-AC_CHECK_LIB(
-        [readline],
-        [readline],
-        [:],
-        [have_readline=false])
-AC_CHECK_LIB(
-        [readline],
-        [add_history],
-        [:],
-        [have_readline=false])
-if test $have_readline = true; then
-    AC_DEFINE([HAVE_READLINE], [1], [Define to 1 if GNU readline found])
-    oclgrind_extra_libs="$oclgrind_extra_libs -lreadline"
-else
-    AC_MSG_WARN([GNU readline library not found (use --with-readline)])
-fi
-
-
-AC_SUBST([oclgrind_extra_libs], [$oclgrind_extra_libs])
-
-
-# Check if Python is available (required to run tests)
-AM_PATH_PYTHON(,,[:])
-AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :])
-
-# Kernel tests
-KERNEL_TESTS=""
-KERNEL_TEST_INPUTS=""
-m4_foreach([name], m4_split(m4_include(tests/kernels/TESTS), m4_newline),
-[
-    KERNEL_TESTS="$KERNEL_TESTS kernels/"name".sim"
-    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS kernels/"name".sim"
-    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS kernels/"name".cl"
-    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS kernels/"name".ref"
-])
-AC_SUBST(KERNEL_TESTS, $KERNEL_TESTS)
-AC_SUBST(KERNEL_TEST_INPUTS, $KERNEL_TEST_INPUTS)
-
-AC_OUTPUT
diff --git a/m4/m4_ax_check_compile_flag.m4 b/m4/m4_ax_check_compile_flag.m4
deleted file mode 100644
index ca36397..0000000
--- a/m4/m4_ax_check_compile_flag.m4
+++ /dev/null
@@ -1,74 +0,0 @@
-# ===========================================================================
-#   http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
-# ===========================================================================
-#
-# SYNOPSIS
-#
-#   AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT])
-#
-# DESCRIPTION
-#
-#   Check whether the given FLAG works with the current language's compiler
-#   or gives an error.  (Warnings, however, are ignored)
-#
-#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
-#   success/failure.
-#
-#   If EXTRA-FLAGS is defined, it is added to the current language's default
-#   flags (e.g. CFLAGS) when the check is done.  The check is thus made with
-#   the flags: "CFLAGS EXTRA-FLAGS FLAG".  This can for example be used to
-#   force the compiler to issue an error when a bad flag is given.
-#
-#   INPUT gives an alternative input source to AC_COMPILE_IFELSE.
-#
-#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
-#   macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
-#
-# LICENSE
-#
-#   Copyright (c) 2008 Guido U. Draheim <guidod at gmx.de>
-#   Copyright (c) 2011 Maarten Bosmans <mkbosmans at gmail.com>
-#
-#   This program is free software: you can redistribute it and/or modify it
-#   under the terms of the GNU General Public License as published by the
-#   Free Software Foundation, either version 3 of the License, or (at your
-#   option) any later version.
-#
-#   This program is distributed in the hope that it will be useful, but
-#   WITHOUT ANY WARRANTY; without even the implied warranty of
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-#   Public License for more details.
-#
-#   You should have received a copy of the GNU General Public License along
-#   with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-#   As a special exception, the respective Autoconf Macro's copyright owner
-#   gives unlimited permission to copy, distribute and modify the configure
-#   scripts that are the output of Autoconf when processing the Macro. You
-#   need not follow the terms of the GNU General Public License when using
-#   or distributing such scripts, even though portions of the text of the
-#   Macro appear in them. The GNU General Public License (GPL) does govern
-#   all other use of the material that constitutes the Autoconf Macro.
-#
-#   This special exception to the GPL applies to versions of the Autoconf
-#   Macro released by the Autoconf Archive. When you make and distribute a
-#   modified version of the Autoconf Macro, you may extend this special
-#   exception to the GPL to apply to your modified version as well.
-
-#serial 4
-
-AC_DEFUN([AX_CHECK_COMPILE_FLAG],
-[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF
-AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
-AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
-  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
-  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
-  AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
-    [AS_VAR_SET(CACHEVAR,[yes])],
-    [AS_VAR_SET(CACHEVAR,[no])])
-  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
-AS_VAR_IF(CACHEVAR,yes,
-  [m4_default([$2], :)],
-  [m4_default([$3], :)])
-AS_VAR_POPDEF([CACHEVAR])dnl
-])dnl AX_CHECK_COMPILE_FLAGS
diff --git a/src/CL/cl.h b/src/CL/cl.h
index 203c659..6167f32 100644
--- a/src/CL/cl.h
+++ b/src/CL/cl.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -24,11 +29,7 @@
 #ifndef __OPENCL_CL_H
 #define __OPENCL_CL_H
 
-#ifdef __APPLE__
-#include <OpenCL/cl_platform.h>
-#else
-#include <CL/cl_platform.h>
-#endif	
+#include "CL/cl_platform.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -46,7 +47,7 @@ typedef struct _cl_kernel *         cl_kernel;
 typedef struct _cl_event *          cl_event;
 typedef struct _cl_sampler *        cl_sampler;
 
-typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ 
+typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
 typedef cl_ulong            cl_bitfield;
 typedef cl_bitfield         cl_device_type;
 typedef cl_uint             cl_platform_info;
@@ -55,16 +56,19 @@ typedef cl_bitfield         cl_device_fp_config;
 typedef cl_uint             cl_device_mem_cache_type;
 typedef cl_uint             cl_device_local_mem_type;
 typedef cl_bitfield         cl_device_exec_capabilities;
+typedef cl_bitfield         cl_device_svm_capabilities;
 typedef cl_bitfield         cl_command_queue_properties;
 typedef intptr_t            cl_device_partition_property;
 typedef cl_bitfield         cl_device_affinity_domain;
 
 typedef intptr_t            cl_context_properties;
 typedef cl_uint             cl_context_info;
+typedef cl_bitfield         cl_queue_properties;
 typedef cl_uint             cl_command_queue_info;
 typedef cl_uint             cl_channel_order;
 typedef cl_uint             cl_channel_type;
 typedef cl_bitfield         cl_mem_flags;
+typedef cl_bitfield         cl_svm_mem_flags;
 typedef cl_uint             cl_mem_object_type;
 typedef cl_uint             cl_mem_info;
 typedef cl_bitfield         cl_mem_migration_flags;
@@ -74,6 +78,8 @@ typedef cl_uint             cl_addressing_mode;
 typedef cl_uint             cl_filter_mode;
 typedef cl_uint             cl_sampler_info;
 typedef cl_bitfield         cl_map_flags;
+typedef intptr_t            cl_pipe_properties;
+typedef cl_uint             cl_pipe_info;
 typedef cl_uint             cl_program_info;
 typedef cl_uint             cl_program_build_info;
 typedef cl_uint             cl_program_binary_type;
@@ -87,7 +93,8 @@ typedef cl_uint             cl_kernel_work_group_info;
 typedef cl_uint             cl_event_info;
 typedef cl_uint             cl_command_type;
 typedef cl_uint             cl_profiling_info;
-
+typedef cl_bitfield         cl_sampler_properties;
+typedef cl_uint             cl_kernel_exec_info;
 
 typedef struct _cl_image_format {
     cl_channel_order        image_channel_order;
@@ -104,7 +111,13 @@ typedef struct _cl_image_desc {
     size_t                  image_slice_pitch;
     cl_uint                 num_mip_levels;
     cl_uint                 num_samples;
-    cl_mem                  buffer;
+#ifdef __GNUC__
+    __extension__   /* Prevents warnings about anonymous union in -pedantic builds */
+#endif
+    union {
+      cl_mem                  buffer;
+      cl_mem                  mem_object;
+    };
 } cl_image_desc;
 
 typedef struct _cl_buffer_region {
@@ -176,11 +189,14 @@ typedef struct _cl_buffer_region {
 #define CL_INVALID_COMPILER_OPTIONS                 -66
 #define CL_INVALID_LINKER_OPTIONS                   -67
 #define CL_INVALID_DEVICE_PARTITION_COUNT           -68
+#define CL_INVALID_PIPE_SIZE                        -69
+#define CL_INVALID_DEVICE_QUEUE                     -70
 
 /* OpenCL Version */
 #define CL_VERSION_1_0                              1
 #define CL_VERSION_1_1                              1
 #define CL_VERSION_1_2                              1
+#define CL_VERSION_2_0                              1
 
 /* cl_bool */
 #define CL_FALSE                                    0
@@ -204,82 +220,98 @@ typedef struct _cl_buffer_region {
 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
 
 /* cl_device_info */
-#define CL_DEVICE_TYPE                              0x1000
-#define CL_DEVICE_VENDOR_ID                         0x1001
-#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
-#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
-#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
-#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
-#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
-#define CL_DEVICE_ADDRESS_BITS                      0x100D
-#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
-#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
-#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
-#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
-#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
-#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
-#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
-#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
-#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
-#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
-#define CL_DEVICE_MAX_SAMPLERS                      0x1018
-#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
-#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
-#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
-#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
-#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
-#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
-#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
-#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
-#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
-#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
-#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
-#define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
-#define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
-#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
-#define CL_DEVICE_AVAILABLE                         0x1027
-#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
-#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
-#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
-#define CL_DEVICE_NAME                              0x102B
-#define CL_DEVICE_VENDOR                            0x102C
-#define CL_DRIVER_VERSION                           0x102D
-#define CL_DEVICE_PROFILE                           0x102E
-#define CL_DEVICE_VERSION                           0x102F
-#define CL_DEVICE_EXTENSIONS                        0x1030
-#define CL_DEVICE_PLATFORM                          0x1031
-#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
-/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
-#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
-#define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
-#define CL_DEVICE_OPENCL_C_VERSION                  0x103D
-#define CL_DEVICE_LINKER_AVAILABLE                  0x103E
-#define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
-#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
-#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
-#define CL_DEVICE_PARENT_DEVICE                     0x1042
-#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
-#define CL_DEVICE_PARTITION_PROPERTIES              0x1044
-#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
-#define CL_DEVICE_PARTITION_TYPE                    0x1046
-#define CL_DEVICE_REFERENCE_COUNT                   0x1047
-#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
-#define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
-#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
-#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
+#define CL_DEVICE_TYPE                                  0x1000
+#define CL_DEVICE_VENDOR_ID                             0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                     0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS              0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE                   0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES                   0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR           0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT          0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT            0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG           0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT          0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE         0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY                   0x100C
+#define CL_DEVICE_ADDRESS_BITS                          0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS                   0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS                  0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                    0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                     0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                    0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                     0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                    0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                     0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                         0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                    0x1017
+#define CL_DEVICE_MAX_SAMPLERS                          0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN                   0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE              0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                      0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE                 0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE             0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE                 0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                       0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE              0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                     0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                        0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                        0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT              0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION            0x1025
+#define CL_DEVICE_ENDIAN_LITTLE                         0x1026
+#define CL_DEVICE_AVAILABLE                             0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                    0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES                0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                      0x102A    /* deprecated */
+#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES              0x102A
+#define CL_DEVICE_NAME                                  0x102B
+#define CL_DEVICE_VENDOR                                0x102C
+#define CL_DRIVER_VERSION                               0x102D
+#define CL_DEVICE_PROFILE                               0x102E
+#define CL_DEVICE_VERSION                               0x102F
+#define CL_DEVICE_EXTENSIONS                            0x1030
+#define CL_DEVICE_PLATFORM                              0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG                      0x1032
+#define CL_DEVICE_HALF_FP_CONFIG                        0x1033
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF           0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY                   0x1035   /* deprecated */
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR              0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT             0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT               0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG              0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT             0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE            0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF              0x103C
+#define CL_DEVICE_OPENCL_C_VERSION                      0x103D
+#define CL_DEVICE_LINKER_AVAILABLE                      0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS                      0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE                 0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE                  0x1041
+#define CL_DEVICE_PARENT_DEVICE                         0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES             0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES                  0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN             0x1045
+#define CL_DEVICE_PARTITION_TYPE                        0x1046
+#define CL_DEVICE_REFERENCE_COUNT                       0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC           0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE                    0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                 0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT          0x104B
+#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS             0x104C
+#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE              0x104D
+#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES            0x104E
+#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE        0x104F
+#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE              0x1050
+#define CL_DEVICE_MAX_ON_DEVICE_QUEUES                  0x1051
+#define CL_DEVICE_MAX_ON_DEVICE_EVENTS                  0x1052
+#define CL_DEVICE_SVM_CAPABILITIES                      0x1053
+#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE  0x1054
+#define CL_DEVICE_MAX_PIPE_ARGS                         0x1055
+#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS          0x1056
+#define CL_DEVICE_PIPE_MAX_PACKET_SIZE                  0x1057
+#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT   0x1058
+#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT     0x1059
+#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT      0x105A
 
 /* cl_device_fp_config - bitfield */
 #define CL_FP_DENORM                                (1 << 0)
@@ -307,6 +339,8 @@ typedef struct _cl_buffer_region {
 /* cl_command_queue_properties - bitfield */
 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
 #define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+#define CL_QUEUE_ON_DEVICE                          (1 << 2)
+#define CL_QUEUE_ON_DEVICE_DEFAULT                  (1 << 3)
 
 /* cl_context_info  */
 #define CL_CONTEXT_REFERENCE_COUNT                  0x1080
@@ -317,38 +351,48 @@ typedef struct _cl_buffer_region {
 /* cl_context_properties */
 #define CL_CONTEXT_PLATFORM                         0x1084
 #define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
-    
+
 /* cl_device_partition_property */
 #define CL_DEVICE_PARTITION_EQUALLY                 0x1086
 #define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
-    
+
 /* cl_device_affinity_domain */
-#define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
-#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
-#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
-#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
-#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
-#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA               (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE           (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE           (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE           (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE           (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+
+/* cl_device_svm_capabilities */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS                       (1 << 3)
 
 /* cl_command_queue_info */
 #define CL_QUEUE_CONTEXT                            0x1090
 #define CL_QUEUE_DEVICE                             0x1091
 #define CL_QUEUE_REFERENCE_COUNT                    0x1092
 #define CL_QUEUE_PROPERTIES                         0x1093
+#define CL_QUEUE_SIZE                               0x1094
 
-/* cl_mem_flags - bitfield */
+/* cl_mem_flags and cl_svm_mem_flags - bitfield */
 #define CL_MEM_READ_WRITE                           (1 << 0)
 #define CL_MEM_WRITE_ONLY                           (1 << 1)
 #define CL_MEM_READ_ONLY                            (1 << 2)
 #define CL_MEM_USE_HOST_PTR                         (1 << 3)
 #define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)
-// reserved                                         (1 << 6)    
+/* reserved                                         (1 << 6)    */
 #define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
 #define CL_MEM_HOST_READ_ONLY                       (1 << 8)
 #define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER                (1 << 10)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_SVM_ATOMICS                          (1 << 11)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_KERNEL_READ_AND_WRITE                (1 << 12)
 
 /* cl_mem_migration_flags - bitfield */
 #define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
@@ -370,6 +414,11 @@ typedef struct _cl_buffer_region {
 #define CL_RGBx                                     0x10BC
 #define CL_DEPTH                                    0x10BD
 #define CL_DEPTH_STENCIL                            0x10BE
+#define CL_sRGB                                     0x10BF
+#define CL_sRGBx                                    0x10C0
+#define CL_sRGBA                                    0x10C1
+#define CL_sBGRA                                    0x10C2
+#define CL_ABGR                                     0x10C3
 
 /* cl_channel_type */
 #define CL_SNORM_INT8                               0x10D0
@@ -397,6 +446,7 @@ typedef struct _cl_buffer_region {
 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
 #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
 #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+#define CL_MEM_OBJECT_PIPE                          0x10F7
 
 /* cl_mem_info */
 #define CL_MEM_TYPE                                 0x1100
@@ -408,6 +458,7 @@ typedef struct _cl_buffer_region {
 #define CL_MEM_CONTEXT                              0x1106
 #define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
 #define CL_MEM_OFFSET                               0x1108
+#define CL_MEM_USES_SVM_POINTER                     0x1109
 
 /* cl_image_info */
 #define CL_IMAGE_FORMAT                             0x1110
@@ -422,6 +473,10 @@ typedef struct _cl_buffer_region {
 #define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
 #define CL_IMAGE_NUM_SAMPLES                        0x111A
 
+/* cl_pipe_info */
+#define CL_PIPE_PACKET_SIZE                         0x1120
+#define CL_PIPE_MAX_PACKETS                         0x1121
+
 /* cl_addressing_mode */
 #define CL_ADDRESS_NONE                             0x1130
 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
@@ -439,6 +494,9 @@ typedef struct _cl_buffer_region {
 #define CL_SAMPLER_NORMALIZED_COORDS                0x1152
 #define CL_SAMPLER_ADDRESSING_MODE                  0x1153
 #define CL_SAMPLER_FILTER_MODE                      0x1154
+#define CL_SAMPLER_MIP_FILTER_MODE                  0x1155
+#define CL_SAMPLER_LOD_MIN                          0x1156
+#define CL_SAMPLER_LOD_MAX                          0x1157
 
 /* cl_map_flags - bitfield */
 #define CL_MAP_READ                                 (1 << 0)
@@ -461,7 +519,8 @@ typedef struct _cl_buffer_region {
 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
 #define CL_PROGRAM_BUILD_LOG                        0x1183
 #define CL_PROGRAM_BINARY_TYPE                      0x1184
-    
+#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185
+
 /* cl_program_binary_type */
 #define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
@@ -500,12 +559,13 @@ typedef struct _cl_buffer_region {
 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
 #define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
 #define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
-    
+
 /* cl_kernel_arg_type_qualifer */
 #define CL_KERNEL_ARG_TYPE_NONE                     0
 #define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
 #define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
 #define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
+#define CL_KERNEL_ARG_TYPE_PIPE                     (1 << 3)
 
 /* cl_kernel_work_group_info */
 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
@@ -515,6 +575,10 @@ typedef struct _cl_buffer_region {
 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
 #define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
 
+/* cl_kernel_exec_info */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS                0x11B6
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM   0x11B7
+
 /* cl_event_info  */
 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
 #define CL_EVENT_COMMAND_TYPE                       0x11D1
@@ -548,6 +612,11 @@ typedef struct _cl_buffer_region {
 #define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
 #define CL_COMMAND_FILL_BUFFER                      0x1207
 #define CL_COMMAND_FILL_IMAGE                       0x1208
+#define CL_COMMAND_SVM_FREE                         0x1209
+#define CL_COMMAND_SVM_MEMCPY                       0x120A
+#define CL_COMMAND_SVM_MEMFILL                      0x120B
+#define CL_COMMAND_SVM_MAP                          0x120C
+#define CL_COMMAND_SVM_UNMAP                        0x120D
 
 /* command execution status */
 #define CL_COMPLETE                                 0x0
@@ -563,6 +632,7 @@ typedef struct _cl_buffer_region {
 #define CL_PROFILING_COMMAND_SUBMIT                 0x1281
 #define CL_PROFILING_COMMAND_START                  0x1282
 #define CL_PROFILING_COMMAND_END                    0x1283
+#define CL_PROFILING_COMMAND_COMPLETE               0x1284
 
 /********************************************************************************************************/
 
@@ -572,28 +642,28 @@ clGetPlatformIDs(cl_uint          /* num_entries */,
                  cl_platform_id * /* platforms */,
                  cl_uint *        /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
 
-extern CL_API_ENTRY cl_int CL_API_CALL 
-clGetPlatformInfo(cl_platform_id   /* platform */, 
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id   /* platform */,
                   cl_platform_info /* param_name */,
-                  size_t           /* param_value_size */, 
+                  size_t           /* param_value_size */,
                   void *           /* param_value */,
                   size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 /* Device APIs */
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetDeviceIDs(cl_platform_id   /* platform */,
-               cl_device_type   /* device_type */, 
-               cl_uint          /* num_entries */, 
-               cl_device_id *   /* devices */, 
+               cl_device_type   /* device_type */,
+               cl_uint          /* num_entries */,
+               cl_device_id *   /* devices */,
                cl_uint *        /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetDeviceInfo(cl_device_id    /* device */,
-                cl_device_info  /* param_name */, 
-                size_t          /* param_value_size */, 
+                cl_device_info  /* param_name */,
+                size_t          /* param_value_size */,
                 void *          /* param_value */,
                 size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-    
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clCreateSubDevices(cl_device_id                         /* in_device */,
                    const cl_device_partition_property * /* properties */,
@@ -603,10 +673,10 @@ clCreateSubDevices(cl_device_id                         /* in_device */,
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
-    
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
-    
+
 /* Context APIs  */
 extern CL_API_ENTRY cl_context CL_API_CALL
 clCreateContext(const cl_context_properties * /* properties */,
@@ -630,18 +700,18 @@ extern CL_API_ENTRY cl_int CL_API_CALL
 clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clGetContextInfo(cl_context         /* context */, 
-                 cl_context_info    /* param_name */, 
-                 size_t             /* param_value_size */, 
-                 void *             /* param_value */, 
+clGetContextInfo(cl_context         /* context */,
+                 cl_context_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
                  size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 /* Command Queue APIs */
 extern CL_API_ENTRY cl_command_queue CL_API_CALL
-clCreateCommandQueue(cl_context                     /* context */, 
-                     cl_device_id                   /* device */, 
-                     cl_command_queue_properties    /* properties */,
-                     cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateCommandQueueWithProperties(cl_context               /* context */,
+                                   cl_device_id             /* device */,
+                                   const cl_queue_properties *    /* properties */,
+                                   cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
@@ -675,10 +745,18 @@ extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateImage(cl_context              /* context */,
               cl_mem_flags            /* flags */,
               const cl_image_format * /* image_format */,
-              const cl_image_desc *   /* image_desc */, 
+              const cl_image_desc *   /* image_desc */,
               void *                  /* host_ptr */,
               cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
-                        
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreatePipe(cl_context                 /* context */,
+             cl_mem_flags               /* flags */,
+             cl_uint                    /* pipe_packet_size */,
+             cl_uint                    /* pipe_max_packets */,
+             const cl_pipe_properties * /* properties */,
+             cl_int *                   /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
 
@@ -692,33 +770,50 @@ clGetSupportedImageFormats(cl_context           /* context */,
                            cl_uint              /* num_entries */,
                            cl_image_format *    /* image_formats */,
                            cl_uint *            /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
-                                    
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetMemObjectInfo(cl_mem           /* memobj */,
-                   cl_mem_info      /* param_name */, 
+                   cl_mem_info      /* param_name */,
                    size_t           /* param_value_size */,
                    void *           /* param_value */,
                    size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetImageInfo(cl_mem           /* image */,
-               cl_image_info    /* param_name */, 
+               cl_image_info    /* param_name */,
                size_t           /* param_value_size */,
                void *           /* param_value */,
                size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clSetMemObjectDestructorCallback(  cl_mem /* memobj */, 
-                                    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
-                                    void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;  
+clGetPipeInfo(cl_mem           /* pipe */,
+              cl_pipe_info     /* param_name */,
+              size_t           /* param_value_size */,
+              void *           /* param_value */,
+              size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback(cl_mem /* memobj */,
+                                 void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+                                 void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;
+
+/* SVM Allocation APIs */
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAlloc(cl_context       /* context */,
+           cl_svm_mem_flags /* flags */,
+           size_t           /* size */,
+           cl_uint          /* alignment */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFree(cl_context        /* context */,
+          void *            /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;
 
 /* Sampler APIs */
 extern CL_API_ENTRY cl_sampler CL_API_CALL
-clCreateSampler(cl_context          /* context */,
-                cl_bool             /* normalized_coords */, 
-                cl_addressing_mode  /* addressing_mode */, 
-                cl_filter_mode      /* filter_mode */,
-                cl_int *            /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateSamplerWithProperties(cl_context                     /* context */,
+                              const cl_sampler_properties *  /* sampler_properties */,
+                              cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
@@ -732,7 +827,7 @@ clGetSamplerInfo(cl_sampler         /* sampler */,
                  size_t             /* param_value_size */,
                  void *             /* param_value */,
                  size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-                            
+
 /* Program Object APIs  */
 extern CL_API_ENTRY cl_program CL_API_CALL
 clCreateProgramWithSource(cl_context        /* context */,
@@ -767,7 +862,7 @@ extern CL_API_ENTRY cl_int CL_API_CALL
 clBuildProgram(cl_program           /* program */,
                cl_uint              /* num_devices */,
                const cl_device_id * /* device_list */,
-               const char *         /* options */, 
+               const char *         /* options */,
                void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
                void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0;
 
@@ -775,7 +870,7 @@ extern CL_API_ENTRY cl_int CL_API_CALL
 clCompileProgram(cl_program           /* program */,
                  cl_uint              /* num_devices */,
                  const cl_device_id * /* device_list */,
-                 const char *         /* options */, 
+                 const char *         /* options */,
                  cl_uint              /* num_input_headers */,
                  const cl_program *   /* input_headers */,
                  const char **        /* header_include_names */,
@@ -786,7 +881,7 @@ extern CL_API_ENTRY cl_program CL_API_CALL
 clLinkProgram(cl_context           /* context */,
               cl_uint              /* num_devices */,
               const cl_device_id * /* device_list */,
-              const char *         /* options */, 
+              const char *         /* options */,
               cl_uint              /* num_input_programs */,
               const cl_program *   /* input_programs */,
               void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
@@ -811,7 +906,7 @@ clGetProgramBuildInfo(cl_program            /* program */,
                       size_t                /* param_value_size */,
                       void *                /* param_value */,
                       size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-                            
+
 /* Kernel Object APIs */
 extern CL_API_ENTRY cl_kernel CL_API_CALL
 clCreateKernel(cl_program      /* program */,
@@ -837,6 +932,17 @@ clSetKernelArg(cl_kernel    /* kernel */,
                const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointer(cl_kernel    /* kernel */,
+                         cl_uint      /* arg_index */,
+                         const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfo(cl_kernel            /* kernel */,
+                    cl_kernel_exec_info  /* param_name */,
+                    size_t               /* param_value_size */,
+                    const void *         /* param_value */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
 clGetKernelInfo(cl_kernel       /* kernel */,
                 cl_kernel_info  /* param_name */,
                 size_t          /* param_value_size */,
@@ -870,11 +976,11 @@ clGetEventInfo(cl_event         /* event */,
                size_t           /* param_value_size */,
                void *           /* param_value */,
                size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-                            
+
 extern CL_API_ENTRY cl_event CL_API_CALL
 clCreateUserEvent(cl_context    /* context */,
-                  cl_int *      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;               
-                            
+                  cl_int *      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
 
@@ -884,7 +990,7 @@ clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clSetUserEventStatus(cl_event   /* event */,
                      cl_int     /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
-                     
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clSetEventCallback( cl_event    /* event */,
                     cl_int      /* command_exec_callback_type */,
@@ -898,7 +1004,7 @@ clGetEventProfilingInfo(cl_event            /* event */,
                         size_t              /* param_value_size */,
                         void *              /* param_value */,
                         size_t *            /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-                                
+
 /* Flush and Finish APIs */
 extern CL_API_ENTRY cl_int CL_API_CALL
 clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
@@ -912,84 +1018,84 @@ clEnqueueReadBuffer(cl_command_queue    /* command_queue */,
                     cl_mem              /* buffer */,
                     cl_bool             /* blocking_read */,
                     size_t              /* offset */,
-                    size_t              /* size */, 
+                    size_t              /* size */,
                     void *              /* ptr */,
                     cl_uint             /* num_events_in_wait_list */,
                     const cl_event *    /* event_wait_list */,
                     cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueReadBufferRect(cl_command_queue    /* command_queue */,
                         cl_mem              /* buffer */,
                         cl_bool             /* blocking_read */,
                         const size_t *      /* buffer_offset */,
-                        const size_t *      /* host_offset */, 
+                        const size_t *      /* host_offset */,
                         const size_t *      /* region */,
                         size_t              /* buffer_row_pitch */,
                         size_t              /* buffer_slice_pitch */,
                         size_t              /* host_row_pitch */,
-                        size_t              /* host_slice_pitch */,                        
+                        size_t              /* host_slice_pitch */,
                         void *              /* ptr */,
                         cl_uint             /* num_events_in_wait_list */,
                         const cl_event *    /* event_wait_list */,
                         cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
-                            
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueWriteBuffer(cl_command_queue   /* command_queue */, 
-                     cl_mem             /* buffer */, 
-                     cl_bool            /* blocking_write */, 
-                     size_t             /* offset */, 
-                     size_t             /* size */, 
-                     const void *       /* ptr */, 
-                     cl_uint            /* num_events_in_wait_list */, 
-                     const cl_event *   /* event_wait_list */, 
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue   /* command_queue */,
+                     cl_mem             /* buffer */,
+                     cl_bool            /* blocking_write */,
+                     size_t             /* offset */,
+                     size_t             /* size */,
+                     const void *       /* ptr */,
+                     cl_uint            /* num_events_in_wait_list */,
+                     const cl_event *   /* event_wait_list */,
                      cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_0;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueWriteBufferRect(cl_command_queue    /* command_queue */,
                          cl_mem              /* buffer */,
                          cl_bool             /* blocking_write */,
                          const size_t *      /* buffer_offset */,
-                         const size_t *      /* host_offset */, 
+                         const size_t *      /* host_offset */,
                          const size_t *      /* region */,
                          size_t              /* buffer_row_pitch */,
                          size_t              /* buffer_slice_pitch */,
                          size_t              /* host_row_pitch */,
-                         size_t              /* host_slice_pitch */,                        
+                         size_t              /* host_slice_pitch */,
                          const void *        /* ptr */,
                          cl_uint             /* num_events_in_wait_list */,
                          const cl_event *    /* event_wait_list */,
                          cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueFillBuffer(cl_command_queue   /* command_queue */,
-                    cl_mem             /* buffer */, 
-                    const void *       /* pattern */, 
-                    size_t             /* pattern_size */, 
-                    size_t             /* offset */, 
-                    size_t             /* size */, 
-                    cl_uint            /* num_events_in_wait_list */, 
-                    const cl_event *   /* event_wait_list */, 
+                    cl_mem             /* buffer */,
+                    const void *       /* pattern */,
+                    size_t             /* pattern_size */,
+                    size_t             /* offset */,
+                    size_t             /* size */,
+                    cl_uint            /* num_events_in_wait_list */,
+                    const cl_event *   /* event_wait_list */,
                     cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyBuffer(cl_command_queue    /* command_queue */, 
+clEnqueueCopyBuffer(cl_command_queue    /* command_queue */,
                     cl_mem              /* src_buffer */,
-                    cl_mem              /* dst_buffer */, 
+                    cl_mem              /* dst_buffer */,
                     size_t              /* src_offset */,
                     size_t              /* dst_offset */,
-                    size_t              /* size */, 
+                    size_t              /* size */,
                     cl_uint             /* num_events_in_wait_list */,
                     const cl_event *    /* event_wait_list */,
                     cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueCopyBufferRect(cl_command_queue    /* command_queue */, 
+clEnqueueCopyBufferRect(cl_command_queue    /* command_queue */,
                         cl_mem              /* src_buffer */,
-                        cl_mem              /* dst_buffer */, 
+                        cl_mem              /* dst_buffer */,
                         const size_t *      /* src_origin */,
                         const size_t *      /* dst_origin */,
-                        const size_t *      /* region */, 
+                        const size_t *      /* region */,
                         size_t              /* src_row_pitch */,
                         size_t              /* src_slice_pitch */,
                         size_t              /* dst_row_pitch */,
@@ -997,15 +1103,15 @@ clEnqueueCopyBufferRect(cl_command_queue    /* command_queue */,
                         cl_uint             /* num_events_in_wait_list */,
                         const cl_event *    /* event_wait_list */,
                         cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueReadImage(cl_command_queue     /* command_queue */,
                    cl_mem               /* image */,
-                   cl_bool              /* blocking_read */, 
+                   cl_bool              /* blocking_read */,
                    const size_t *       /* origin[3] */,
                    const size_t *       /* region[3] */,
                    size_t               /* row_pitch */,
-                   size_t               /* slice_pitch */, 
+                   size_t               /* slice_pitch */,
                    void *               /* ptr */,
                    cl_uint              /* num_events_in_wait_list */,
                    const cl_event *     /* event_wait_list */,
@@ -1014,11 +1120,11 @@ clEnqueueReadImage(cl_command_queue     /* command_queue */,
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueWriteImage(cl_command_queue    /* command_queue */,
                     cl_mem              /* image */,
-                    cl_bool             /* blocking_write */, 
+                    cl_bool             /* blocking_write */,
                     const size_t *      /* origin[3] */,
                     const size_t *      /* region[3] */,
                     size_t              /* input_row_pitch */,
-                    size_t              /* input_slice_pitch */, 
+                    size_t              /* input_slice_pitch */,
                     const void *        /* ptr */,
                     cl_uint             /* num_events_in_wait_list */,
                     const cl_event *    /* event_wait_list */,
@@ -1026,21 +1132,21 @@ clEnqueueWriteImage(cl_command_queue    /* command_queue */,
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueFillImage(cl_command_queue   /* command_queue */,
-                   cl_mem             /* image */, 
-                   const void *       /* fill_color */, 
-                   const size_t *     /* origin[3] */, 
-                   const size_t *     /* region[3] */, 
-                   cl_uint            /* num_events_in_wait_list */, 
-                   const cl_event *   /* event_wait_list */, 
+                   cl_mem             /* image */,
+                   const void *       /* fill_color */,
+                   const size_t *     /* origin[3] */,
+                   const size_t *     /* region[3] */,
+                   cl_uint            /* num_events_in_wait_list */,
+                   const cl_event *   /* event_wait_list */,
                    cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
-                            
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueCopyImage(cl_command_queue     /* command_queue */,
                    cl_mem               /* src_image */,
-                   cl_mem               /* dst_image */, 
+                   cl_mem               /* dst_image */,
                    const size_t *       /* src_origin[3] */,
                    const size_t *       /* dst_origin[3] */,
-                   const size_t *       /* region[3] */, 
+                   const size_t *       /* region[3] */,
                    cl_uint              /* num_events_in_wait_list */,
                    const cl_event *     /* event_wait_list */,
                    cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
@@ -1048,9 +1154,9 @@ clEnqueueCopyImage(cl_command_queue     /* command_queue */,
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
                            cl_mem           /* src_image */,
-                           cl_mem           /* dst_buffer */, 
+                           cl_mem           /* dst_buffer */,
                            const size_t *   /* src_origin[3] */,
-                           const size_t *   /* region[3] */, 
+                           const size_t *   /* region[3] */,
                            size_t           /* dst_offset */,
                            cl_uint          /* num_events_in_wait_list */,
                            const cl_event * /* event_wait_list */,
@@ -1059,10 +1165,10 @@ clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
                            cl_mem           /* src_buffer */,
-                           cl_mem           /* dst_image */, 
+                           cl_mem           /* dst_image */,
                            size_t           /* src_offset */,
                            const size_t *   /* dst_origin[3] */,
-                           const size_t *   /* region[3] */, 
+                           const size_t *   /* region[3] */,
                            cl_uint          /* num_events_in_wait_list */,
                            const cl_event * /* event_wait_list */,
                            cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
@@ -1070,7 +1176,7 @@ clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
 extern CL_API_ENTRY void * CL_API_CALL
 clEnqueueMapBuffer(cl_command_queue /* command_queue */,
                    cl_mem           /* buffer */,
-                   cl_bool          /* blocking_map */, 
+                   cl_bool          /* blocking_map */,
                    cl_map_flags     /* map_flags */,
                    size_t           /* offset */,
                    size_t           /* size */,
@@ -1081,9 +1187,9 @@ clEnqueueMapBuffer(cl_command_queue /* command_queue */,
 
 extern CL_API_ENTRY void * CL_API_CALL
 clEnqueueMapImage(cl_command_queue  /* command_queue */,
-                  cl_mem            /* image */, 
-                  cl_bool           /* blocking_map */, 
-                  cl_map_flags      /* map_flags */, 
+                  cl_mem            /* image */,
+                  cl_bool           /* blocking_map */,
+                  cl_map_flags      /* map_flags */,
                   const size_t *    /* origin[3] */,
                   const size_t *    /* region[3] */,
                   size_t *          /* image_row_pitch */,
@@ -1122,17 +1228,10 @@ clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
                        cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueTask(cl_command_queue  /* command_queue */,
-              cl_kernel         /* kernel */,
-              cl_uint           /* num_events_in_wait_list */,
-              const cl_event *  /* event_wait_list */,
-              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
-					  void (CL_CALLBACK * /*user_func*/)(void *), 
+					  void (CL_CALLBACK * /*user_func*/)(void *),
                       void *            /* args */,
-                      size_t            /* cb_args */, 
+                      size_t            /* cb_args */,
                       cl_uint           /* num_mem_objects */,
                       const cl_mem *    /* mem_list */,
                       const void **     /* args_mem_loc */,
@@ -1141,74 +1240,145 @@ clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
                       cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+clEnqueueMarkerWithWaitList(cl_command_queue  /* command_queue */,
                             cl_uint           /* num_events_in_wait_list */,
                             const cl_event *  /* event_wait_list */,
                             cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+clEnqueueBarrierWithWaitList(cl_command_queue  /* command_queue */,
                              cl_uint           /* num_events_in_wait_list */,
                              const cl_event *  /* event_wait_list */,
                              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
 
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFree(cl_command_queue  /* command_queue */,
+                 cl_uint           /* num_svm_pointers */,
+                 void *[]          /* svm_pointers[] */,
+                 void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+                                                        cl_uint          /* num_svm_pointers */,
+                                                        void *[]         /* svm_pointers[] */,
+                                                        void *           /* user_data */),
+                 void *            /* user_data */,
+                 cl_uint           /* num_events_in_wait_list */,
+                 const cl_event *  /* event_wait_list */,
+                 cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpy(cl_command_queue  /* command_queue */,
+                   cl_bool           /* blocking_copy */,
+                   void *            /* dst_ptr */,
+                   const void *      /* src_ptr */,
+                   size_t            /* size */,
+                   cl_uint           /* num_events_in_wait_list */,
+                   const cl_event *  /* event_wait_list */,
+                   cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFill(cl_command_queue  /* command_queue */,
+                    void *            /* svm_ptr */,
+                    const void *      /* pattern */,
+                    size_t            /* pattern_size */,
+                    size_t            /* size */,
+                    cl_uint           /* num_events_in_wait_list */,
+                    const cl_event *  /* event_wait_list */,
+                    cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMap(cl_command_queue  /* command_queue */,
+                cl_bool           /* blocking_map */,
+                cl_map_flags      /* flags */,
+                void *            /* svm_ptr */,
+                size_t            /* size */,
+                cl_uint           /* num_events_in_wait_list */,
+                const cl_event *  /* event_wait_list */,
+                cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmap(cl_command_queue  /* command_queue */,
+                  void *            /* svm_ptr */,
+                  cl_uint           /* num_events_in_wait_list */,
+                  const cl_event *  /* event_wait_list */,
+                  cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+
 
 /* Extension function access
  *
  * Returns the extension function address for the given function name,
  * or NULL if a valid function can not be found.  The client must
- * check to make sure the address is not NULL, before using or 
+ * check to make sure the address is not NULL, before using or
  * calling the returned function address.
  */
-extern CL_API_ENTRY void * CL_API_CALL 
+extern CL_API_ENTRY void * CL_API_CALL
 clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
                                          const char *   /* func_name */) CL_API_SUFFIX__VERSION_1_2;
-    
 
-// Deprecated OpenCL 1.1 APIs
+
+/* Deprecated OpenCL 1.1 APIs */
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
 clCreateImage2D(cl_context              /* context */,
                 cl_mem_flags            /* flags */,
                 const cl_image_format * /* image_format */,
                 size_t                  /* image_width */,
                 size_t                  /* image_height */,
-                size_t                  /* image_row_pitch */, 
+                size_t                  /* image_row_pitch */,
                 void *                  /* host_ptr */,
                 cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-    
+
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
 clCreateImage3D(cl_context              /* context */,
                 cl_mem_flags            /* flags */,
                 const cl_image_format * /* image_format */,
-                size_t                  /* image_width */, 
+                size_t                  /* image_width */,
                 size_t                  /* image_height */,
-                size_t                  /* image_depth */, 
-                size_t                  /* image_row_pitch */, 
-                size_t                  /* image_slice_pitch */, 
+                size_t                  /* image_depth */,
+                size_t                  /* image_row_pitch */,
+                size_t                  /* image_slice_pitch */,
                 void *                  /* host_ptr */,
                 cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-    
+
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
 clEnqueueMarker(cl_command_queue    /* command_queue */,
                 cl_event *          /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-    
+
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
 clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
                         cl_uint          /* num_events */,
                         const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-    
+
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
 clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
 clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-    
+
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
 clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 
+/* OpenCL 1.2 APIs deprecated in OpenCL 2.0 */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context                     /* context */,
+                     cl_device_id                   /* device */,
+                     cl_command_queue_properties    /* properties */,
+                     cl_int *                       /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL
+clCreateSampler(cl_context          /* context */,
+                cl_bool             /* normalized_coords */,
+                cl_addressing_mode  /* addressing_mode */,
+                cl_filter_mode      /* filter_mode */,
+                cl_int *            /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue  /* command_queue */,
+              cl_kernel         /* kernel */,
+              cl_uint           /* num_events_in_wait_list */,
+              const cl_event *  /* event_wait_list */,
+              cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+
 #ifdef __cplusplus
 }
 #endif
 
 #endif  /* __OPENCL_CL_H */
-
diff --git a/src/CL/cl_d3d10.h b/src/CL/cl_d3d10.h
index 81b0d37..d5960a4 100644
--- a/src/CL/cl_d3d10.h
+++ b/src/CL/cl_d3d10.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -43,31 +48,31 @@ typedef cl_uint cl_d3d10_device_set_khr;
 
 /******************************************************************************/
 
-// Error Codes
+/* Error Codes */
 #define CL_INVALID_D3D10_DEVICE_KHR                  -1002
 #define CL_INVALID_D3D10_RESOURCE_KHR                -1003
 #define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
 #define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
 
-// cl_d3d10_device_source_nv
+/* cl_d3d10_device_source_nv */
 #define CL_D3D10_DEVICE_KHR                          0x4010
 #define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
 
-// cl_d3d10_device_set_nv
+/* cl_d3d10_device_set_nv */
 #define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
 #define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
 
-// cl_context_info
+/* cl_context_info */
 #define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
 #define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
 
-// cl_mem_info
+/* cl_mem_info */
 #define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
 
-// cl_image_info
+/* cl_image_info */
 #define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
 
-// cl_command_type
+/* cl_command_type */
 #define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
 #define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
 
@@ -122,5 +127,5 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
 }
 #endif
 
-#endif  // __OPENCL_CL_D3D10_H
+#endif  /* __OPENCL_CL_D3D10_H */
 
diff --git a/src/CL/cl_d3d11.h b/src/CL/cl_d3d11.h
index d3c8bdc..38cc21a 100644
--- a/src/CL/cl_d3d11.h
+++ b/src/CL/cl_d3d11.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -43,31 +48,31 @@ typedef cl_uint cl_d3d11_device_set_khr;
 
 /******************************************************************************/
 
-// Error Codes
+/* Error Codes */
 #define CL_INVALID_D3D11_DEVICE_KHR                  -1006
 #define CL_INVALID_D3D11_RESOURCE_KHR                -1007
 #define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008
 #define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
 
-// cl_d3d11_device_source
+/* cl_d3d11_device_source */
 #define CL_D3D11_DEVICE_KHR                          0x4019
 #define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
 
-// cl_d3d11_device_set
+/* cl_d3d11_device_set */
 #define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
 #define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
 
-// cl_context_info
+/* cl_context_info */
 #define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
 #define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
 
-// cl_mem_info
+/* cl_mem_info */
 #define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
 
-// cl_image_info
+/* cl_image_info */
 #define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
 
-// cl_command_type
+/* cl_command_type */
 #define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
 #define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
 
@@ -122,5 +127,4 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
 }
 #endif
 
-#endif  // __OPENCL_CL_D3D11_H
-
+#endif  /* __OPENCL_CL_D3D11_H */
diff --git a/src/CL/cl_dx9_media_sharing.h b/src/CL/cl_dx9_media_sharing.h
index 1ef543a..2729e8b 100644
--- a/src/CL/cl_dx9_media_sharing.h
+++ b/src/CL/cl_dx9_media_sharing.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -33,7 +38,7 @@
 extern "C" {
 #endif
 
-/******************************************************************************
+/******************************************************************************/
 /* cl_khr_dx9_media_sharing                                                   */
 #define cl_khr_dx9_media_sharing 1
 
@@ -52,34 +57,34 @@ typedef struct _cl_dx9_surface_info_khr
 
 /******************************************************************************/
 
-// Error Codes
+/* Error Codes */
 #define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
 #define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
 #define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012
 #define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
 
-// cl_media_adapter_type_khr
+/* cl_media_adapter_type_khr */
 #define CL_ADAPTER_D3D9_KHR                              0x2020
 #define CL_ADAPTER_D3D9EX_KHR                            0x2021
 #define CL_ADAPTER_DXVA_KHR                              0x2022
 
-// cl_media_adapter_set_khr
+/* cl_media_adapter_set_khr */
 #define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023
 #define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
 
-// cl_context_info
+/* cl_context_info */
 #define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
 #define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
 #define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
 
-// cl_mem_info
+/* cl_mem_info */
 #define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
 #define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
 
-// cl_image_info
+/* cl_image_info */
 #define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
 
-// cl_command_type
+/* cl_command_type */
 #define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B
 #define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C
 
@@ -123,5 +128,5 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn
 }
 #endif
 
-#endif  // __OPENCL_CL_DX9_MEDIA_SHARING_H
+#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
 
diff --git a/src/CL/cl_egl.h b/src/CL/cl_egl.h
index c1bd4f3..a765bd5 100644
--- a/src/CL/cl_egl.h
+++ b/src/CL/cl_egl.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -28,8 +33,6 @@
 
 #else
 #include <CL/cl.h>
-#include <EGL/egl.h>
-#include <EGL/eglext.h>
 #endif  
 
 #ifdef __cplusplus
@@ -52,6 +55,9 @@ typedef void* CLeglImageKHR;
 /* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
 typedef void* CLeglDisplayKHR;
 
+/* CLeglSyncKHR is an opaque handle to an EGLSync object */
+typedef void* CLeglSyncKHR;
+
 /* properties passed to clCreateFromEGLImageKHR */
 typedef intptr_t cl_egl_image_properties_khr;
 
@@ -112,17 +118,16 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
 #define cl_khr_egl_event 1
 
 extern CL_API_ENTRY cl_event CL_API_CALL
-clCreateEventFromEGLSyncKHR(cl_context /* context */,
-                            EGLSyncKHR /* sync */,
-                            EGLDisplay /* display */,
-                            cl_int *   /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateEventFromEGLSyncKHR(cl_context      /* context */,
+                            CLeglSyncKHR    /* sync */,
+                            CLeglDisplayKHR /* display */,
+                            cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
-	cl_context context,
-	EGLSyncKHR sync,
-	EGLDisplay display,
-	cl_int *   errcode_ret);
-
+	cl_context      context,
+	CLeglSyncKHR    sync,
+	CLeglDisplayKHR display,
+	cl_int *        errcode_ret);
 
 #ifdef __cplusplus
 }
diff --git a/src/CL/cl_ext.h b/src/CL/cl_ext.h
index 5ab2c13..5f71134 100644
--- a/src/CL/cl_ext.h
+++ b/src/CL/cl_ext.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008-2013 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -33,12 +38,7 @@
 extern "C" {
 #endif
 
-#ifdef __APPLE__
-        #include <OpenCL/cl.h>
-    #include <AvailabilityMacros.h>
-#else
-        #include <CL/cl.h>
-#endif
+#include "CL/cl.h"
 
 /* cl_khr_fp16 extension - no extension #define since it has no functions  */
 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
@@ -47,12 +47,12 @@ extern "C" {
  *
  * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
  *
- * Registers a user callback function that will be called when the memory object is deleted and its resources 
- * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback 
- * stack associated with memobj. The registered user callback functions are called in the reverse order in 
- * which they were registered. The user callback functions are called and then the memory object is deleted 
- * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be 
- * notified when the memory referenced by host_ptr, specified when the memory object is created and used as 
+ * Registers a user callback function that will be called when the memory object is deleted and its resources
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in
+ * which they were registered. The user callback functions are called and then the memory object is deleted
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
  * the storage bits for the memory object, can be reused or freed.
  *
  * The application may not call CL api's with the cl_mem object passed to the pfn_notify.
@@ -61,9 +61,9 @@ extern "C" {
  * before using.
  */
 #define cl_APPLE_SetMemObjectDestructor 1
-cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
-                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
-                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
+cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */,
+                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;
 
 
 /* Context Logging Functions
@@ -72,29 +72,29 @@ cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */,
  * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
  * before using.
  *
- * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger 
+ * clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
  */
 #define cl_APPLE_ContextLoggingFunctions 1
-extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */, 
-                                            const void * /* private_info */, 
-                                            size_t       /* cb */, 
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */,
+                                            const void * /* private_info */,
+                                            size_t       /* cb */,
                                             void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
 
 /* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
-extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */, 
-                                          const void * /* private_info */, 
-                                          size_t       /* cb */, 
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */,
+                                          const void * /* private_info */,
+                                          size_t       /* cb */,
                                           void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
 
 /* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
-extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */, 
-                                          const void * /* private_info */, 
-                                          size_t       /* cb */, 
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */,
+                                          const void * /* private_info */,
+                                          size_t       /* cb */,
                                           void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
 
 
-/************************ 
-* cl_khr_icd extension *                                                  
+/************************
+* cl_khr_icd extension *
 ************************/
 #define cl_khr_icd 1
 
@@ -129,31 +129,31 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
  * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
  * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
  */
-    
+
 /*************************************
  * cl_khr_initalize_memory extension *
  *************************************/
-    
-#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x200E
-    
-    
+
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030
+
+
 /**************************************
  * cl_khr_terminate_context extension *
  **************************************/
-    
-#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x200F
-#define CL_CONTEXT_TERMINATE_KHR                    0x2010
+
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x2031
+#define CL_CONTEXT_TERMINATE_KHR                    0x2032
 
 #define cl_khr_terminate_context 1
 extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
 
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
-    
-    
+
+
 /*
  * Extension: cl_khr_spir
  *
- * This extension adds support to create an OpenCL program object from a 
+ * This extension adds support to create an OpenCL program object from a
  * Standard Portable Intermediate Representation (SPIR) instance
  */
 
@@ -178,22 +178,28 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
 *********************************/
 #define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
 
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM                      0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
+
 #ifdef CL_VERSION_1_1
    /***********************************
     * cl_ext_device_fission extension *
     ***********************************/
     #define cl_ext_device_fission   1
-    
+
     extern CL_API_ENTRY cl_int CL_API_CALL
-    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
-    
-    typedef CL_API_ENTRY cl_int 
+    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef CL_API_ENTRY cl_int
     (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
 
     extern CL_API_ENTRY cl_int CL_API_CALL
-    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
-    
-    typedef CL_API_ENTRY cl_int 
+    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef CL_API_ENTRY cl_int
     (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
 
     typedef cl_ulong  cl_device_partition_property_ext;
@@ -204,7 +210,7 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
                             cl_device_id * /*out_devices*/,
                             cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
 
-    typedef CL_API_ENTRY cl_int 
+    typedef CL_API_ENTRY cl_int
     ( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
                                                 const cl_device_partition_property_ext * /* properties */,
                                                 cl_uint /*num_entries*/,
@@ -216,19 +222,19 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
     #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
     #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
     #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
-    
+
     /* clDeviceGetInfo selectors */
     #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
     #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
     #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
     #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
     #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
-    
+
     /* error codes */
     #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
     #define CL_INVALID_PARTITION_COUNT_EXT              -1058
     #define CL_INVALID_PARTITION_NAME_EXT               -1059
-    
+
     /* CL_AFFINITY_DOMAINs */
     #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
     #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
@@ -236,7 +242,7 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
     #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
     #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
     #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
-    
+
     /* cl_device_partition_property_ext list terminators */
     #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
     #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
@@ -248,7 +254,7 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
 
 #define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
 
-#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0      
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0
 #define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
 #define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
 #define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
@@ -271,11 +277,11 @@ clGetDeviceImageInfoQCOM(cl_device_id             device,
 
 typedef struct _cl_mem_ext_host_ptr
 {
-    // Type of external memory allocation.
-    // Legal values will be defined in layered extensions.
+    /* Type of external memory allocation. */
+    /* Legal values will be defined in layered extensions. */
     cl_uint  allocation_type;
-            
-    // Host cache policy for this external memory allocation.
+
+    /* Host cache policy for this external memory allocation. */
     cl_uint  host_cache_policy;
 
 } cl_mem_ext_host_ptr;
@@ -288,20 +294,368 @@ typedef struct _cl_mem_ext_host_ptr
 
 typedef struct _cl_mem_ion_host_ptr
 {
-    // Type of external memory allocation.
-    // Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations.
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
     cl_mem_ext_host_ptr  ext_host_ptr;
 
-    // ION file descriptor
+    /* ION file descriptor */
     int                  ion_filedesc;
-            
-    // Host pointer to the ION allocated memory
+
+    /* Host pointer to the ION allocated memory */
     void*                ion_hostptr;
 
 } cl_mem_ion_host_ptr;
 
 #endif /* CL_VERSION_1_1 */
 
+#if defined(CL_VERSION_1_2)
+
+/******************************************
+ * cl_img_yuv_image extension *
+ ******************************************/
+
+/* Image formats used in clCreateImage */
+#define CL_NV21_IMG                                 0x40D0
+#define CL_YV12_IMG                                 0x40D1
+
+/******************************************
+ * cl_img_cached_allocations extension *
+ ******************************************/
+
+/* Flag values used by clCreateBuffer */
+#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG         	(1 << 26)
+#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG           	(1 << 27)
+
+/******************************************
+ * cl_img_use_gralloc_ptr extension *
+ ******************************************/
+
+/* Flag values used by clCreateBuffer */
+#define CL_MEM_USE_GRALLOC_PTR_IMG                 	(1 << 28)
+
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG      0x40D2
+#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG      0x40D3
+
+/* Error code from clEnqueueReleaseGrallocObjectsIMG */
+#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG        0x40D4
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGrallocObjectsIMG(cl_command_queue      /* command_queue */,
+                                  cl_uint               /* num_objects */,
+                                  const cl_mem *        /* mem_objects */,
+                                  cl_uint               /* num_events_in_wait_list */,
+                                  const cl_event *      /* event_wait_list */,
+                                  cl_event *            /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGrallocObjectsIMG(cl_command_queue      /* command_queue */,
+                                  cl_uint               /* num_objects */,
+                                  const cl_mem *        /* mem_objects */,
+                                  cl_uint               /* num_events_in_wait_list */,
+                                  const cl_event *      /* event_wait_list */,
+                                  cl_event *            /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+#endif /* CL_VERSION_1_2 */
+
+#ifdef CL_VERSION_2_0
+/*********************************
+* cl_khr_sub_groups extension
+*********************************/
+#define cl_khr_sub_groups 1
+
+typedef cl_uint  cl_kernel_sub_group_info;
+
+/* cl_khr_sub_group_info */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR	0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR		0x2034
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
+						   cl_device_id /*in_device*/,
+						   cl_kernel_sub_group_info /* param_name */,
+						   size_t /*input_value_size*/,
+						   const void * /*input_value*/,
+						   size_t /*param_value_size*/,
+						   void* /*param_value*/,
+						   size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0;
+
+typedef CL_API_ENTRY cl_int
+     ( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
+						      cl_device_id /*in_device*/,
+						      cl_kernel_sub_group_info /* param_name */,
+						      size_t /*input_value_size*/,
+						      const void * /*input_value*/,
+						      size_t /*param_value_size*/,
+						      void* /*param_value*/,
+						      size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
+
+/******************************************
+ * cl_arm_shared_virtual_memory extension *
+ ******************************************/
+
+#ifdef CL_VERSION_1_2
+
+/* Used by clGetDeviceInfo */
+#define CL_DEVICE_SVM_CAPABILITIES_ARM                  0x40B6
+
+/* Used by clGetMemObjectInfo */
+#define CL_MEM_USES_SVM_POINTER_ARM                     0x40B7
+
+/* Used by clSetKernelExecInfoARM: */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                0x40B8
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM   0x40B9
+
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_SVM_FREE_ARM                         0x40BA
+#define CL_COMMAND_SVM_MEMCPY_ARM                       0x40BB
+#define CL_COMMAND_SVM_MEMFILL_ARM                      0x40BC
+#define CL_COMMAND_SVM_MAP_ARM                          0x40BD
+#define CL_COMMAND_SVM_UNMAP_ARM                        0x40BE
+
+/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS_ARM                       (1 << 3)
+
+/* Flag values used by clSVMAllocARM: */
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                (1 << 10)
+#define CL_MEM_SVM_ATOMICS_ARM                          (1 << 11)
+
+typedef cl_bitfield cl_svm_mem_flags_arm;
+typedef cl_uint     cl_kernel_exec_info_arm;
+typedef cl_bitfield cl_device_svm_capabilities_arm;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAllocARM(cl_context       /* context */,
+              cl_svm_mem_flags_arm /* flags */,
+              size_t           /* size */,
+              cl_uint          /* alignment */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFreeARM(cl_context        /* context */,
+             void *            /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFreeARM(cl_command_queue  /* command_queue */,
+                    cl_uint           /* num_svm_pointers */,
+                    void *[]          /* svm_pointers[] */,
+                    void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+                                                           cl_uint          /* num_svm_pointers */,
+                                                           void *[]         /* svm_pointers[] */,
+                                                           void *           /* user_data */),
+                    void *            /* user_data */,
+                    cl_uint           /* num_events_in_wait_list */,
+                    const cl_event *  /* event_wait_list */,
+                    cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpyARM(cl_command_queue  /* command_queue */,
+                      cl_bool           /* blocking_copy */,
+                      void *            /* dst_ptr */,
+                      const void *      /* src_ptr */,
+                      size_t            /* size */,
+                      cl_uint           /* num_events_in_wait_list */,
+                      const cl_event *  /* event_wait_list */,
+                      cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFillARM(cl_command_queue  /* command_queue */,
+                       void *            /* svm_ptr */,
+                       const void *      /* pattern */,
+                       size_t            /* pattern_size */,
+                       size_t            /* size */,
+                       cl_uint           /* num_events_in_wait_list */,
+                       const cl_event *  /* event_wait_list */,
+                       cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMapARM(cl_command_queue  /* command_queue */,
+                   cl_bool           /* blocking_map */,
+                   cl_map_flags      /* flags */,
+                   void *            /* svm_ptr */,
+                   size_t            /* size */,
+                   cl_uint           /* num_events_in_wait_list */,
+                   const cl_event *  /* event_wait_list */,
+                   cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmapARM(cl_command_queue  /* command_queue */,
+                     void *            /* svm_ptr */,
+                     cl_uint           /* num_events_in_wait_list */,
+                     const cl_event *  /* event_wait_list */,
+                     cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointerARM(cl_kernel    /* kernel */,
+                            cl_uint      /* arg_index */,
+                            const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfoARM(cl_kernel            /* kernel */,
+                       cl_kernel_exec_info_arm  /* param_name */,
+                       size_t               /* param_value_size */,
+                       const void *         /* param_value */) CL_EXT_SUFFIX__VERSION_1_2;
+
+#endif /* CL_VERSION_1_2 */
+
+/******************************************
+ * cl_arm_shared_virtual_memory extension *
+ ******************************************/
+
+#ifdef CL_VERSION_1_2
+
+/* Used by clGetDeviceInfo */
+#define CL_DEVICE_SVM_CAPABILITIES_ARM                  0x40B6
+
+/* Used by clGetMemObjectInfo */
+#define CL_MEM_USES_SVM_POINTER_ARM                     0x40B7
+
+/* Used by clSetKernelExecInfoARM: */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                0x40B8
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM   0x40B9
+
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_SVM_FREE_ARM                         0x40BA
+#define CL_COMMAND_SVM_MEMCPY_ARM                       0x40BB
+#define CL_COMMAND_SVM_MEMFILL_ARM                      0x40BC
+#define CL_COMMAND_SVM_MAP_ARM                          0x40BD
+#define CL_COMMAND_SVM_UNMAP_ARM                        0x40BE
+
+/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS_ARM                       (1 << 3)
+
+/* Flag values used by clSVMAllocARM: */
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                (1 << 10)
+#define CL_MEM_SVM_ATOMICS_ARM                          (1 << 11)
+
+typedef cl_bitfield cl_svm_mem_flags_arm;
+typedef cl_uint     cl_kernel_exec_info_arm;
+typedef cl_bitfield cl_device_svm_capabilities_arm;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAllocARM(cl_context       /* context */,
+              cl_svm_mem_flags_arm /* flags */,
+              size_t           /* size */,
+              cl_uint          /* alignment */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFreeARM(cl_context        /* context */,
+             void *            /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFreeARM(cl_command_queue  /* command_queue */,
+                    cl_uint           /* num_svm_pointers */,
+                    void *[]          /* svm_pointers[] */,
+                    void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+                                                           cl_uint          /* num_svm_pointers */,
+                                                           void *[]         /* svm_pointers[] */,
+                                                           void *           /* user_data */),
+                    void *            /* user_data */,
+                    cl_uint           /* num_events_in_wait_list */,
+                    const cl_event *  /* event_wait_list */,
+                    cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpyARM(cl_command_queue  /* command_queue */,
+                      cl_bool           /* blocking_copy */,
+                      void *            /* dst_ptr */,
+                      const void *      /* src_ptr */,
+                      size_t            /* size */,
+                      cl_uint           /* num_events_in_wait_list */,
+                      const cl_event *  /* event_wait_list */,
+                      cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFillARM(cl_command_queue  /* command_queue */,
+                       void *            /* svm_ptr */,
+                       const void *      /* pattern */,
+                       size_t            /* pattern_size */,
+                       size_t            /* size */,
+                       cl_uint           /* num_events_in_wait_list */,
+                       const cl_event *  /* event_wait_list */,
+                       cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMapARM(cl_command_queue  /* command_queue */,
+                   cl_bool           /* blocking_map */,
+                   cl_map_flags      /* flags */,
+                   void *            /* svm_ptr */,
+                   size_t            /* size */,
+                   cl_uint           /* num_events_in_wait_list */,
+                   const cl_event *  /* event_wait_list */,
+                   cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmapARM(cl_command_queue  /* command_queue */,
+                     void *            /* svm_ptr */,
+                     cl_uint           /* num_events_in_wait_list */,
+                     const cl_event *  /* event_wait_list */,
+                     cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointerARM(cl_kernel    /* kernel */,
+                            cl_uint      /* arg_index */,
+                            const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfoARM(cl_kernel            /* kernel */,
+                       cl_kernel_exec_info_arm  /* param_name */,
+                       size_t               /* param_value_size */,
+                       const void *         /* param_value */) CL_EXT_SUFFIX__VERSION_1_2;
+
+#endif /* CL_VERSION_1_2 */
+
+/**********************************
+ * cl_arm_import_memory extension *
+ **********************************/
+
+#ifdef CL_VERSION_1_0
+
+typedef intptr_t cl_import_properties_arm;
+
+/* Default and valid property name for cl_arm_import_memory */
+#define CL_IMPORT_TYPE_ARM                        0x40B2
+
+/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_HOST_ARM                   0x40B3
+
+/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_DMA_BUF_ARM                0x40B4
+
+/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_SECURE_ARM                 0x40B5
+
+/* This extension adds a new function that allows for direct memory import into
+ * OpenCL via the clImportMemoryARM function.
+ *
+ * Memory imported through this interface will be mapped into the device's page
+ * tables directly, providing zero copy access. It will never fall back to copy
+ * operations and aliased buffers.
+ *
+ * Types of memory supported for import are specified as additional extension
+ * strings.
+ *
+ * This extension produces cl_mem allocations which are compatible with all other
+ * users of cl_mem in the standard API.
+ *
+ * This extension maps pages with the same properties as the normal buffer creation
+ * function clCreateBuffer.
+ */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clImportMemoryARM( cl_context context,
+                   cl_mem_flags flags,
+                   const cl_import_properties_arm *properties,
+                   void *memory,
+                   size_t size,
+                   cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
+
+
+#endif /* CL_VERSION_1_0 */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/CL/cl_gl.h b/src/CL/cl_gl.h
index af2036c..945daa8 100644
--- a/src/CL/cl_gl.h
+++ b/src/CL/cl_gl.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -104,7 +109,7 @@ clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
                           cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
 
 
-// Deprecated OpenCL 1.1 APIs
+/* Deprecated OpenCL 1.1 APIs */
 extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
 clCreateFromGLTexture2D(cl_context      /* context */,
                         cl_mem_flags    /* flags */,
diff --git a/src/CL/cl_gl_ext.h b/src/CL/cl_gl_ext.h
index 77d5353..e3c14c6 100644
--- a/src/CL/cl_gl_ext.h
+++ b/src/CL/cl_gl_ext.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
diff --git a/src/CL/cl_platform.h b/src/CL/cl_platform.h
index 7f6f5e8..7efc98d 100644
--- a/src/CL/cl_platform.h
+++ b/src/CL/cl_platform.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -45,6 +50,14 @@ extern "C" {
     #define CL_CALLBACK
 #endif
 
+/*
+ * Deprecation flags refer to the last version of the header in which the
+ * feature was not deprecated.
+ *
+ * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
+ * deprecation but is deprecated in versions later than 1.1.
+ */
+
 #ifdef __APPLE__
     #define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))
     #define CL_API_SUFFIX__VERSION_1_0                  AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
@@ -53,7 +66,7 @@ extern "C" {
     #define GCL_API_SUFFIX__VERSION_1_1                 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
     #define CL_EXT_SUFFIX__VERSION_1_1                  CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
     #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED       CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
-    
+
     #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
         #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
         #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
@@ -67,53 +80,80 @@ extern "C" {
         #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
         #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
     #endif
+
+    #define CL_API_SUFFIX__VERSION_2_0
+    #define CL_EXT_SUFFIX__VERSION_2_0
+    #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+    #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+
 #else
-    #define CL_EXTENSION_WEAK_LINK  
+    #define CL_EXTENSION_WEAK_LINK
     #define CL_API_SUFFIX__VERSION_1_0
     #define CL_EXT_SUFFIX__VERSION_1_0
     #define CL_API_SUFFIX__VERSION_1_1
     #define CL_EXT_SUFFIX__VERSION_1_1
     #define CL_API_SUFFIX__VERSION_1_2
     #define CL_EXT_SUFFIX__VERSION_1_2
-    
+    #define CL_API_SUFFIX__VERSION_2_0
+    #define CL_EXT_SUFFIX__VERSION_2_0
+
     #ifdef __GNUC__
         #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
             #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
-            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
         #else
             #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
-            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
         #endif
-    
+
         #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
-            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED    
-            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
         #else
             #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
-            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
         #endif
-    #elif _WIN32
+
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated))
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #endif /* CL_USE_DEPRECATED_OPENCL_1_2_APIS */
+    #elif defined(_WIN32)
         #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
-            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED    
-            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
         #else
-            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED 
-            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)     
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)
         #endif
-    
+
         #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
             #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
         #else
-            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED 
-            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)     
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)
+        #endif
+
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated)
         #endif
     #else
         #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
         #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
-    
+
         #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
         #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+
+        #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
     #endif
 #endif
 
@@ -159,7 +199,18 @@ typedef double                  cl_double;
 #define CL_FLT_RADIX        2
 #define CL_FLT_MAX          340282346638528859811704183484516925440.0f
 #define CL_FLT_MIN          1.175494350822287507969e-38f
-#define CL_FLT_EPSILON      0x1.0p-23f
+#define CL_FLT_EPSILON      1.1920928955078125e-7f
+
+#define CL_HALF_DIG          3
+#define CL_HALF_MANT_DIG     11
+#define CL_HALF_MAX_10_EXP   +4
+#define CL_HALF_MAX_EXP      +16
+#define CL_HALF_MIN_10_EXP   -4
+#define CL_HALF_MIN_EXP      -13
+#define CL_HALF_RADIX        2
+#define CL_HALF_MAX          65504.0f
+#define CL_HALF_MIN          6.103515625e-05f
+#define CL_HALF_EPSILON      9.765625e-04f
 
 #define CL_DBL_DIG          15
 #define CL_DBL_MANT_DIG     53
@@ -248,9 +299,20 @@ typedef double          cl_double   __attribute__((aligned(8)));
 #define CL_FLT_MIN_10_EXP   -37
 #define CL_FLT_MIN_EXP      -125
 #define CL_FLT_RADIX        2
-#define CL_FLT_MAX          0x1.fffffep127f
-#define CL_FLT_MIN          0x1.0p-126f
-#define CL_FLT_EPSILON      0x1.0p-23f
+#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN          1.175494350822287507969e-38f
+#define CL_FLT_EPSILON      1.1920928955078125e-7f
+
+#define CL_HALF_DIG          3
+#define CL_HALF_MANT_DIG     11
+#define CL_HALF_MAX_10_EXP   +4
+#define CL_HALF_MAX_EXP      +16
+#define CL_HALF_MIN_10_EXP   -4
+#define CL_HALF_MIN_EXP      -13
+#define CL_HALF_RADIX        2
+#define CL_HALF_MAX          65504.0f
+#define CL_HALF_MIN          6.103515625e-05f
+#define CL_HALF_EPSILON      9.765625e-04f
 
 #define CL_DBL_DIG          15
 #define CL_DBL_MANT_DIG     53
@@ -259,9 +321,9 @@ typedef double          cl_double   __attribute__((aligned(8)));
 #define CL_DBL_MIN_10_EXP   -307
 #define CL_DBL_MIN_EXP      -1021
 #define CL_DBL_RADIX        2
-#define CL_DBL_MAX          0x1.fffffffffffffp1023
-#define CL_DBL_MIN          0x1.0p-1022
-#define CL_DBL_EPSILON      0x1.0p-52
+#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN          2.225073858507201383090e-308
+#define CL_DBL_EPSILON      2.220446049250313080847e-16
 
 #define  CL_M_E             2.718281828459045090796
 #define  CL_M_LOG2E         1.442695040888963387005
@@ -299,7 +361,7 @@ typedef double          cl_double   __attribute__((aligned(8)));
    #define CL_HUGE_VALF     ((cl_float) 1e50)
    #define CL_HUGE_VAL      ((cl_double) 1e500)
    float nanf( const char * );
-   #define CL_NAN           nanf( "" )  
+   #define CL_NAN           nanf( "" )
 #endif
 #define CL_MAXFLOAT         CL_FLT_MAX
 #define CL_INFINITY         CL_HUGE_VALF
@@ -314,17 +376,17 @@ typedef int          cl_GLint;
 typedef unsigned int cl_GLenum;
 
 /*
- * Vector types 
+ * Vector types
  *
- *  Note:   OpenCL requires that all types be naturally aligned. 
+ *  Note:   OpenCL requires that all types be naturally aligned.
  *          This means that vector types must be naturally aligned.
  *          For example, a vector of four floats must be aligned to
- *          a 16 byte boundary (calculated as 4 * the natural 4-byte 
+ *          a 16 byte boundary (calculated as 4 * the natural 4-byte
  *          alignment of the float).  The alignment qualifiers here
  *          will only function properly if your compiler supports them
  *          and if you don't actively work to defeat them.  For example,
  *          in order for a cl_float4 to be 16 byte aligned in a struct,
- *          the start of the struct must itself be 16-byte aligned. 
+ *          the start of the struct must itself be 16-byte aligned.
  *
  *          Maintaining proper alignment is the user's responsibility.
  */
@@ -438,7 +500,7 @@ typedef unsigned int cl_GLenum;
     #if defined( __MINGW64__ )
         #include <intrin.h>
     #else
-        #include <immintrin.h> 
+        #include <immintrin.h>
     #endif
     #if defined( __GNUC__ )
         typedef cl_float    __cl_float8     __attribute__((vector_size(32)));
@@ -455,15 +517,17 @@ typedef unsigned int cl_GLenum;
 #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
 #define  __CL_HAS_ANON_STRUCT__ 1
 #define  __CL_ANON_STRUCT__ __extension__
-#elif defined( _WIN32) && (_MSC_VER >= 1500)
+#elif defined( _WIN32) && defined(_MSC_VER)
+    #if _MSC_VER >= 1500
    /* Microsoft Developer Studio 2008 supports anonymous structs, but
     * complains by default. */
-#define  __CL_HAS_ANON_STRUCT__ 1
-#define  __CL_ANON_STRUCT__
+    #define  __CL_HAS_ANON_STRUCT__ 1
+    #define  __CL_ANON_STRUCT__
    /* Disable warning C4201: nonstandard extension used : nameless
     * struct/union */
-#pragma warning( push )
-#pragma warning( disable : 4201 )
+    #pragma warning( push )
+    #pragma warning( disable : 4201 )
+    #endif
 #else
 #define  __CL_HAS_ANON_STRUCT__ 0
 #define  __CL_ANON_STRUCT__
@@ -502,7 +566,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; };
 #endif
-#if defined( __CL_CHAR2__) 
+#if defined( __CL_CHAR2__)
     __cl_char2     v2;
 #endif
 }cl_char2;
@@ -515,10 +579,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
 #endif
-#if defined( __CL_CHAR2__) 
+#if defined( __CL_CHAR2__)
     __cl_char2     v2[2];
 #endif
-#if defined( __CL_CHAR4__) 
+#if defined( __CL_CHAR4__)
     __cl_char4     v4;
 #endif
 }cl_char4;
@@ -534,10 +598,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
 #endif
-#if defined( __CL_CHAR2__) 
+#if defined( __CL_CHAR2__)
     __cl_char2     v2[4];
 #endif
-#if defined( __CL_CHAR4__) 
+#if defined( __CL_CHAR4__)
     __cl_char4     v4[2];
 #endif
 #if defined( __CL_CHAR8__ )
@@ -553,10 +617,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
 #endif
-#if defined( __CL_CHAR2__) 
+#if defined( __CL_CHAR2__)
     __cl_char2     v2[8];
 #endif
-#if defined( __CL_CHAR4__) 
+#if defined( __CL_CHAR4__)
     __cl_char4     v4[4];
 #endif
 #if defined( __CL_CHAR8__ )
@@ -577,7 +641,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };
 #endif
-#if defined( __cl_uchar2__) 
+#if defined( __CL_UCHAR2__)  /* same guard cl_uchar4/8/16 test for their v2 members */
     __cl_uchar2     v2;
 #endif
 }cl_uchar2;
@@ -590,10 +654,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
 #endif
-#if defined( __CL_UCHAR2__) 
+#if defined( __CL_UCHAR2__)
     __cl_uchar2     v2[2];
 #endif
-#if defined( __CL_UCHAR4__) 
+#if defined( __CL_UCHAR4__)
     __cl_uchar4     v4;
 #endif
 }cl_uchar4;
@@ -609,10 +673,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
 #endif
-#if defined( __CL_UCHAR2__) 
+#if defined( __CL_UCHAR2__)
     __cl_uchar2     v2[4];
 #endif
-#if defined( __CL_UCHAR4__) 
+#if defined( __CL_UCHAR4__)
     __cl_uchar4     v4[2];
 #endif
 #if defined( __CL_UCHAR8__ )
@@ -628,10 +692,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
 #endif
-#if defined( __CL_UCHAR2__) 
+#if defined( __CL_UCHAR2__)
     __cl_uchar2     v2[8];
 #endif
-#if defined( __CL_UCHAR4__) 
+#if defined( __CL_UCHAR4__)
     __cl_uchar4     v4[4];
 #endif
 #if defined( __CL_UCHAR8__ )
@@ -652,7 +716,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };
 #endif
-#if defined( __CL_SHORT2__) 
+#if defined( __CL_SHORT2__)
     __cl_short2     v2;
 #endif
 }cl_short2;
@@ -665,10 +729,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
 #endif
-#if defined( __CL_SHORT2__) 
+#if defined( __CL_SHORT2__)
     __cl_short2     v2[2];
 #endif
-#if defined( __CL_SHORT4__) 
+#if defined( __CL_SHORT4__)
     __cl_short4     v4;
 #endif
 }cl_short4;
@@ -684,10 +748,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
 #endif
-#if defined( __CL_SHORT2__) 
+#if defined( __CL_SHORT2__)
     __cl_short2     v2[4];
 #endif
-#if defined( __CL_SHORT4__) 
+#if defined( __CL_SHORT4__)
     __cl_short4     v4[2];
 #endif
 #if defined( __CL_SHORT8__ )
@@ -703,10 +767,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
 #endif
-#if defined( __CL_SHORT2__) 
+#if defined( __CL_SHORT2__)
     __cl_short2     v2[8];
 #endif
-#if defined( __CL_SHORT4__) 
+#if defined( __CL_SHORT4__)
     __cl_short4     v4[4];
 #endif
 #if defined( __CL_SHORT8__ )
@@ -727,7 +791,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };
 #endif
-#if defined( __CL_USHORT2__) 
+#if defined( __CL_USHORT2__)
     __cl_ushort2     v2;
 #endif
 }cl_ushort2;
@@ -740,10 +804,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
 #endif
-#if defined( __CL_USHORT2__) 
+#if defined( __CL_USHORT2__)
     __cl_ushort2     v2[2];
 #endif
-#if defined( __CL_USHORT4__) 
+#if defined( __CL_USHORT4__)
     __cl_ushort4     v4;
 #endif
 }cl_ushort4;
@@ -759,10 +823,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
 #endif
-#if defined( __CL_USHORT2__) 
+#if defined( __CL_USHORT2__)
     __cl_ushort2     v2[4];
 #endif
-#if defined( __CL_USHORT4__) 
+#if defined( __CL_USHORT4__)
     __cl_ushort4     v4[2];
 #endif
 #if defined( __CL_USHORT8__ )
@@ -778,10 +842,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
 #endif
-#if defined( __CL_USHORT2__) 
+#if defined( __CL_USHORT2__)
     __cl_ushort2     v2[8];
 #endif
-#if defined( __CL_USHORT4__) 
+#if defined( __CL_USHORT4__)
     __cl_ushort4     v4[4];
 #endif
 #if defined( __CL_USHORT8__ )
@@ -792,6 +856,81 @@ typedef union
 #endif
 }cl_ushort16;
 
+
+/* ---- cl_halfn ---- */
+typedef union
+{
+    cl_half  CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1; };
+    __CL_ANON_STRUCT__ struct{ cl_half  lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2;
+#endif
+}cl_half2;
+
+typedef union
+{
+    cl_half  CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3; };
+    __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2[2];
+#endif
+#if defined( __CL_HALF4__)
+    __cl_half4     v4;
+#endif
+}cl_half4;
+
+/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */
+typedef  cl_half4  cl_half3;
+
+typedef union
+{
+    cl_half   CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7; };
+    __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2[4];
+#endif
+#if defined( __CL_HALF4__)
+    __cl_half4     v4[2];
+#endif
+#if defined( __CL_HALF8__ )
+    __cl_half8     v8;
+#endif
+}cl_half8;
+
+typedef union
+{
+    cl_half  CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+    __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2[8];
+#endif
+#if defined( __CL_HALF4__)
+    __cl_half4     v4[4];
+#endif
+#if defined( __CL_HALF8__ )
+    __cl_half8     v8[2];
+#endif
+#if defined( __CL_HALF16__ )
+    __cl_half16    v16;
+#endif
+}cl_half16;
+
 /* ---- cl_intn ---- */
 typedef union
 {
@@ -801,7 +940,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };
 #endif
-#if defined( __CL_INT2__) 
+#if defined( __CL_INT2__)
     __cl_int2     v2;
 #endif
 }cl_int2;
@@ -814,10 +953,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
 #endif
-#if defined( __CL_INT2__) 
+#if defined( __CL_INT2__)
     __cl_int2     v2[2];
 #endif
-#if defined( __CL_INT4__) 
+#if defined( __CL_INT4__)
     __cl_int4     v4;
 #endif
 }cl_int4;
@@ -833,10 +972,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
 #endif
-#if defined( __CL_INT2__) 
+#if defined( __CL_INT2__)
     __cl_int2     v2[4];
 #endif
-#if defined( __CL_INT4__) 
+#if defined( __CL_INT4__)
     __cl_int4     v4[2];
 #endif
 #if defined( __CL_INT8__ )
@@ -852,10 +991,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
 #endif
-#if defined( __CL_INT2__) 
+#if defined( __CL_INT2__)
     __cl_int2     v2[8];
 #endif
-#if defined( __CL_INT4__) 
+#if defined( __CL_INT4__)
     __cl_int4     v4[4];
 #endif
 #if defined( __CL_INT8__ )
@@ -876,7 +1015,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; };
 #endif
-#if defined( __CL_UINT2__) 
+#if defined( __CL_UINT2__)
     __cl_uint2     v2;
 #endif
 }cl_uint2;
@@ -889,10 +1028,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
 #endif
-#if defined( __CL_UINT2__) 
+#if defined( __CL_UINT2__)
     __cl_uint2     v2[2];
 #endif
-#if defined( __CL_UINT4__) 
+#if defined( __CL_UINT4__)
     __cl_uint4     v4;
 #endif
 }cl_uint4;
@@ -908,10 +1047,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
 #endif
-#if defined( __CL_UINT2__) 
+#if defined( __CL_UINT2__)
     __cl_uint2     v2[4];
 #endif
-#if defined( __CL_UINT4__) 
+#if defined( __CL_UINT4__)
     __cl_uint4     v4[2];
 #endif
 #if defined( __CL_UINT8__ )
@@ -927,10 +1066,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
 #endif
-#if defined( __CL_UINT2__) 
+#if defined( __CL_UINT2__)
     __cl_uint2     v2[8];
 #endif
-#if defined( __CL_UINT4__) 
+#if defined( __CL_UINT4__)
     __cl_uint4     v4[4];
 #endif
 #if defined( __CL_UINT8__ )
@@ -950,7 +1089,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; };
 #endif
-#if defined( __CL_LONG2__) 
+#if defined( __CL_LONG2__)
     __cl_long2     v2;
 #endif
 }cl_long2;
@@ -963,10 +1102,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
 #endif
-#if defined( __CL_LONG2__) 
+#if defined( __CL_LONG2__)
     __cl_long2     v2[2];
 #endif
-#if defined( __CL_LONG4__) 
+#if defined( __CL_LONG4__)
     __cl_long4     v4;
 #endif
 }cl_long4;
@@ -982,10 +1121,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
 #endif
-#if defined( __CL_LONG2__) 
+#if defined( __CL_LONG2__)
     __cl_long2     v2[4];
 #endif
-#if defined( __CL_LONG4__) 
+#if defined( __CL_LONG4__)
     __cl_long4     v4[2];
 #endif
 #if defined( __CL_LONG8__ )
@@ -1001,10 +1140,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
 #endif
-#if defined( __CL_LONG2__) 
+#if defined( __CL_LONG2__)
     __cl_long2     v2[8];
 #endif
-#if defined( __CL_LONG4__) 
+#if defined( __CL_LONG4__)
     __cl_long4     v4[4];
 #endif
 #if defined( __CL_LONG8__ )
@@ -1025,7 +1164,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; };
 #endif
-#if defined( __CL_ULONG2__) 
+#if defined( __CL_ULONG2__)
     __cl_ulong2     v2;
 #endif
 }cl_ulong2;
@@ -1038,10 +1177,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
 #endif
-#if defined( __CL_ULONG2__) 
+#if defined( __CL_ULONG2__)
     __cl_ulong2     v2[2];
 #endif
-#if defined( __CL_ULONG4__) 
+#if defined( __CL_ULONG4__)
     __cl_ulong4     v4;
 #endif
 }cl_ulong4;
@@ -1057,10 +1196,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
 #endif
-#if defined( __CL_ULONG2__) 
+#if defined( __CL_ULONG2__)
     __cl_ulong2     v2[4];
 #endif
-#if defined( __CL_ULONG4__) 
+#if defined( __CL_ULONG4__)
     __cl_ulong4     v4[2];
 #endif
 #if defined( __CL_ULONG8__ )
@@ -1076,10 +1215,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
 #endif
-#if defined( __CL_ULONG2__) 
+#if defined( __CL_ULONG2__)
     __cl_ulong2     v2[8];
 #endif
-#if defined( __CL_ULONG4__) 
+#if defined( __CL_ULONG4__)
     __cl_ulong4     v4[4];
 #endif
 #if defined( __CL_ULONG8__ )
@@ -1101,7 +1240,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; };
 #endif
-#if defined( __CL_FLOAT2__) 
+#if defined( __CL_FLOAT2__)
     __cl_float2     v2;
 #endif
 }cl_float2;
@@ -1114,10 +1253,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; };
 #endif
-#if defined( __CL_FLOAT2__) 
+#if defined( __CL_FLOAT2__)
     __cl_float2     v2[2];
 #endif
-#if defined( __CL_FLOAT4__) 
+#if defined( __CL_FLOAT4__)
     __cl_float4     v4;
 #endif
 }cl_float4;
@@ -1133,10 +1272,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; };
 #endif
-#if defined( __CL_FLOAT2__) 
+#if defined( __CL_FLOAT2__)
     __cl_float2     v2[4];
 #endif
-#if defined( __CL_FLOAT4__) 
+#if defined( __CL_FLOAT4__)
     __cl_float4     v4[2];
 #endif
 #if defined( __CL_FLOAT8__ )
@@ -1152,10 +1291,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
 #endif
-#if defined( __CL_FLOAT2__) 
+#if defined( __CL_FLOAT2__)
     __cl_float2     v2[8];
 #endif
-#if defined( __CL_FLOAT4__) 
+#if defined( __CL_FLOAT4__)
     __cl_float4     v4[4];
 #endif
 #if defined( __CL_FLOAT8__ )
@@ -1176,7 +1315,7 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
    __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
 #endif
-#if defined( __CL_DOUBLE2__) 
+#if defined( __CL_DOUBLE2__)
     __cl_double2     v2;
 #endif
 }cl_double2;
@@ -1189,10 +1328,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; };
    __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
 #endif
-#if defined( __CL_DOUBLE2__) 
+#if defined( __CL_DOUBLE2__)
     __cl_double2     v2[2];
 #endif
-#if defined( __CL_DOUBLE4__) 
+#if defined( __CL_DOUBLE4__)
     __cl_double4     v4;
 #endif
 }cl_double4;
@@ -1208,10 +1347,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
    __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
 #endif
-#if defined( __CL_DOUBLE2__) 
+#if defined( __CL_DOUBLE2__)
     __cl_double2     v2[4];
 #endif
-#if defined( __CL_DOUBLE4__) 
+#if defined( __CL_DOUBLE4__)
     __cl_double4     v4[2];
 #endif
 #if defined( __CL_DOUBLE8__ )
@@ -1227,10 +1366,10 @@ typedef union
    __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
    __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
 #endif
-#if defined( __CL_DOUBLE2__) 
+#if defined( __CL_DOUBLE2__)
     __cl_double2     v2[8];
 #endif
-#if defined( __CL_DOUBLE4__) 
+#if defined( __CL_DOUBLE4__)
     __cl_double4     v4[4];
 #endif
 #if defined( __CL_DOUBLE8__ )
@@ -1241,9 +1380,9 @@ typedef union
 #endif
 }cl_double16;
 
-/* Macro to facilitate debugging 
+/* Macro to facilitate debugging
  * Usage:
- *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
+ *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
  *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO \"
  *   Each line thereafter of OpenCL C source must end with: \n\
  *   The last line ends in ";
@@ -1258,21 +1397,23 @@ typedef union
  *   }                                               \n\
  *   ";
  *
- * This should correctly set up the line, (column) and file information for your source 
+ * This should correctly set up the line, (column) and file information for your source
  * string so you can do source level debugging.
  */
 #define  __CL_STRINGIFY( _x )               # _x
 #define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )
-#define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" 
-  
+#define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
+
 #ifdef __cplusplus
 }
 #endif
 
 #undef __CL_HAS_ANON_STRUCT__
 #undef __CL_ANON_STRUCT__
-#if defined( _WIN32) && (_MSC_VER >= 1500)
-#pragma warning( pop )
+#if defined( _WIN32) && defined(_MSC_VER)
+    #if _MSC_VER >=1500
+    #pragma warning( pop )
+    #endif
 #endif
 
 #endif  /* __CL_PLATFORM_H  */
diff --git a/src/CL/opencl.h b/src/CL/opencl.h
index 3f00524..9855cd7 100644
--- a/src/CL/opencl.h
+++ b/src/CL/opencl.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
  *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
diff --git a/src/core/Context.cpp b/src/core/Context.cpp
index e6fc415..0c71116 100644
--- a/src/core/Context.cpp
+++ b/src/core/Context.cpp
@@ -479,18 +479,14 @@ Context::Message& Context::Message::operator<<(
     }
     else
     {
-#if LLVM_VERSION > 36
       llvm::DILocation *loc = (llvm::DILocation*)md;
       unsigned lineNumber = loc->getLine();
+      unsigned columnNumber = loc->getColumn();
       llvm::StringRef filename = loc->getFilename();
-#else
-      llvm::DILocation loc((llvm::MDLocation*)md);
-      unsigned lineNumber = loc.getLineNumber();
-      llvm::StringRef filename = loc.getFilename();
-#endif
 
       *this << "At line " << dec << lineNumber
-           << " of " << filename.str() << ":" << endl;
+            << " (column " << columnNumber << ")"
+            << " of " << filename.str() << ":" << endl;
 
       // Get source line
       const Program *program = m_kernelInvocation->getKernel()->getProgram();
diff --git a/src/core/Kernel.cpp b/src/core/Kernel.cpp
index 2ca8393..3d7f6b3 100644
--- a/src/core/Kernel.cpp
+++ b/src/core/Kernel.cpp
@@ -18,7 +18,6 @@
 
 #include "Kernel.h"
 #include "Program.h"
-#include "Memory.h"
 
 using namespace oclgrind;
 using namespace std;
@@ -45,8 +44,9 @@ Kernel::Kernel(const Program *program,
 
       break;
     }
+    case AddrSpaceGlobal:
     case AddrSpaceConstant:
-      m_constants.push_back(&*itr);
+      m_values[&*itr] = program->getProgramScopeVar(&*itr).clone();
       break;
     case AddrSpaceLocal:
     {
@@ -92,8 +92,6 @@ Kernel::Kernel(const Kernel& kernel)
  : m_program(kernel.m_program)
 {
   m_function = kernel.m_function;
-  m_constants = kernel.m_constants;
-  m_constantBuffers = kernel.m_constantBuffers;
   m_name = kernel.m_name;
   m_metadata = kernel.m_metadata;
 
@@ -125,51 +123,6 @@ bool Kernel::allArgumentsSet() const
   return true;
 }
 
-void Kernel::allocateConstants(Memory *memory)
-{
-  list<const llvm::GlobalVariable*>::const_iterator itr;
-  for (itr = m_constants.begin(); itr != m_constants.end(); itr++)
-  {
-    const llvm::Constant *initializer = (*itr)->getInitializer();
-    const llvm::Type *type = initializer->getType();
-
-    // Deallocate existing pointer
-    if (m_values.count(*itr))
-    {
-      delete[] m_values[*itr].data;
-    }
-
-    // Get initializer data
-    unsigned size = getTypeSize(type);
-    unsigned char *data = new unsigned char[size];
-    getConstantData(data, (const llvm::Constant*)initializer);
-
-    // Allocate buffer
-    TypedValue address = {
-      sizeof(size_t),
-      1,
-      new unsigned char[sizeof(size_t)]
-    };
-    size_t ptr = memory->allocateBuffer(size, 0, data);
-    address.setPointer(ptr);
-
-    m_values[*itr] = address;
-    m_constantBuffers.push_back(ptr);
-
-    delete[] data;
-  }
-}
-
-void Kernel::deallocateConstants(Memory *memory)
-{
-  list<size_t>::const_iterator itr;
-  for (itr = m_constantBuffers.begin(); itr != m_constantBuffers.end(); itr++)
-  {
-    memory->deallocateBuffer(*itr);
-  }
-  m_constantBuffers.clear();
-}
-
 const llvm::Argument* Kernel::getArgument(unsigned int index) const
 {
   assert(index < getNumArguments());
@@ -242,7 +195,10 @@ unsigned int Kernel::getArgumentAddressQualifier(unsigned int index) const
 const llvm::Metadata* Kernel::getArgumentMetadata(string name,
                                                   unsigned int index) const
 {
-#if LLVM_VERSION < 39
+  llvm::MDNode *node = m_function->getMetadata(name);
+  if (node)
+    return node->getOperand(index);
+
   if (!m_metadata)
   {
     return NULL;
@@ -263,12 +219,6 @@ const llvm::Metadata* Kernel::getArgumentMetadata(string name,
     }
   }
   return NULL;
-#else
-  llvm::MDNode *node = m_function->getMetadata(name);
-  if (!node)
-    return NULL;
-  return node->getOperand(index);
-#endif
 }
 
 const llvm::StringRef Kernel::getArgumentName(unsigned int index) const
@@ -288,13 +238,11 @@ const llvm::StringRef Kernel::getArgumentTypeName(unsigned int index) const
   }
 
   llvm::StringRef name = llvm::dyn_cast<llvm::MDString>(md)->getString();
-#if LLVM_VERSION >= 39
   size_t imgStart = name.find(" image");
   if (imgStart != llvm::StringRef::npos)
   {
     name = name.substr(imgStart+1);
   }
-#endif
   return name;
 }
 
@@ -309,6 +257,11 @@ unsigned int Kernel::getArgumentTypeQualifier(unsigned int index) const
     return -1;
   }
 
+  // Ignore type qualifiers for non-pointer arguments
+  const llvm::Argument *arg = getArgument(index);
+  if (!arg->getType()->isPointerTy() || arg->hasByValAttr())
+    return CL_KERNEL_ARG_TYPE_NONE;
+
   // Get qualifiers
   const llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(md);
   istringstream iss(str->getString().str());
@@ -453,7 +406,27 @@ void Kernel::setArgument(unsigned int index, TypedValue value)
     delete[] m_values[argument].data;
   }
 
-  m_values[argument] = value.clone();
+#if LLVM_VERSION >= 40
+  if (getArgumentTypeName(index).str() == "sampler_t")
+  {
+    // Get an llvm::ConstantInt that represents the sampler value
+    llvm::Type *i32 = llvm::Type::getInt32Ty(m_program->getLLVMContext());
+    llvm::Constant *samplerValue = llvm::ConstantInt::get(i32, value.getSInt());
+
+    // A sampler argument is a pointer to the llvm::ConstantInt value
+    TypedValue sampler;
+    sampler.size = sizeof(size_t);
+    sampler.num = 1;
+    sampler.data = new unsigned char[sizeof(size_t)];
+    sampler.setPointer((size_t)samplerValue);
+
+    m_values[argument] = sampler;
+  }
+  else
+#endif
+  {
+    m_values[argument] = value.clone();
+  }
 }
 
 TypedValueMap::const_iterator Kernel::values_begin() const
diff --git a/src/core/Kernel.h b/src/core/Kernel.h
index 34755a1..49a6f2a 100644
--- a/src/core/Kernel.h
+++ b/src/core/Kernel.h
@@ -37,8 +37,6 @@ namespace oclgrind
     TypedValueMap::const_iterator values_begin() const;
     TypedValueMap::const_iterator values_end() const;
     bool allArgumentsSet() const;
-    void allocateConstants(Memory *memory);
-    void deallocateConstants(Memory *memory);
     unsigned int getArgumentAccessQualifier(unsigned int index) const;
     unsigned int getArgumentAddressQualifier(unsigned int index) const;
     const llvm::StringRef getArgumentName(unsigned int index) const;
@@ -57,8 +55,6 @@ namespace oclgrind
   private:
     const Program *m_program;
     const llvm::Function *m_function;
-    std::list<const llvm::GlobalVariable*> m_constants;
-    std::list<size_t> m_constantBuffers;
     const llvm::MDNode *m_metadata;
     std::string m_name;
 
diff --git a/src/core/KernelInvocation.cpp b/src/core/KernelInvocation.cpp
index a54f865..13589ba 100644
--- a/src/core/KernelInvocation.cpp
+++ b/src/core/KernelInvocation.cpp
@@ -16,6 +16,7 @@
 #include "Kernel.h"
 #include "KernelInvocation.h"
 #include "Memory.h"
+#include "Program.h"
 #include "WorkGroup.h"
 #include "WorkItem.h"
 
@@ -24,6 +25,7 @@ using namespace std;
 
 struct
 {
+  int id;
   WorkGroup *workGroup;
   WorkItem  *workItem;
 } static THREAD_LOCAL workerState;
@@ -45,23 +47,16 @@ KernelInvocation::KernelInvocation(const Context *context, const Kernel *kernel,
   m_numGroups.x = m_globalSize.x/m_localSize.x;
   m_numGroups.y = m_globalSize.y/m_localSize.y;
   m_numGroups.z = m_globalSize.z/m_localSize.z;
-
-  // Check for user overriding number of threads
-  m_numWorkers = 0;
-  const char *numThreads = getenv("OCLGRIND_NUM_THREADS");
-  if (numThreads)
-  {
-    char *next;
-    m_numWorkers = strtoul(numThreads, &next, 10);
-    if (strlen(next))
-    {
-      cerr << "Oclgrind: Invalid value for OCLGRIND_NUM_THREADS" << endl;
-    }
-  }
-  else
+  if (!m_kernel->getProgram()->requiresUniformWorkGroups())
   {
-    m_numWorkers = thread::hardware_concurrency();
+    m_numGroups.x += m_globalSize.x % m_localSize.x ? 1 : 0;
+    m_numGroups.y += m_globalSize.y % m_localSize.y ? 1 : 0;
+    m_numGroups.z += m_globalSize.z % m_localSize.z ? 1 : 0;
   }
+
+  // Check for user overriding number of threads
+  m_numWorkers = getEnvInt("OCLGRIND_NUM_THREADS",
+                           thread::hardware_concurrency(), false);
   if (!m_numWorkers || !m_context->isThreadSafe())
     m_numWorkers = 1;
 
@@ -151,23 +146,6 @@ void KernelInvocation::run(const Context *context, Kernel *kernel,
                            Size3 globalSize,
                            Size3 localSize)
 {
-  try
-  {
-    // Allocate and initialise constant memory
-    kernel->allocateConstants(context->getGlobalMemory());
-  }
-  catch (FatalError& err)
-  {
-    ostringstream info;
-    info << "OCLGRIND FATAL ERROR "
-         << "(" << err.getFile() << ":" << err.getLine() << ")"
-         << endl << err.what()
-         << endl << "When allocating kernel constants for '"
-         << kernel->getName() << "'";
-    context->logError(info.str().c_str());
-    return;
-  }
-
   // Create kernel invocation
   KernelInvocation *ki = new KernelInvocation(context, kernel, workDim,
                                               globalOffset,
@@ -180,9 +158,6 @@ void KernelInvocation::run(const Context *context, Kernel *kernel,
   context->notifyKernelEnd(ki);
 
   delete ki;
-
-  // Deallocate constant memory
-  kernel->deallocateConstants(context->getGlobalMemory());
 }
 
 void KernelInvocation::run()
@@ -194,7 +169,7 @@ void KernelInvocation::run()
   vector<thread> threads;
   for (unsigned i = 0; i < m_numWorkers; i++)
   {
-    threads.push_back(thread(&KernelInvocation::runWorker, this));
+    threads.push_back(thread(&KernelInvocation::runWorker, this, i));
   }
 
   // Wait for workers to complete
@@ -204,10 +179,16 @@ void KernelInvocation::run()
   }
 }
 
-void KernelInvocation::runWorker()
+int KernelInvocation::getWorkerID() const
+{
+  return workerState.id; // THREAD_LOCAL state; id is stored by runWorker(id)
+}
+
+void KernelInvocation::runWorker(int id)
 {
   workerState.workGroup = NULL;
   workerState.workItem = NULL;
+  workerState.id = id;
   try
   {
     while (true)
@@ -227,7 +208,17 @@ void KernelInvocation::runWorker()
           // No more work to do
           break;
 
-        workerState.workGroup = new WorkGroup(this, m_workGroups[index]);
+        Size3 wgid   = m_workGroups[index];
+        Size3 wgsize = m_localSize;
+
+        // Handle remainder work-groups
+        for (unsigned i = 0; i < 3; i++)
+        {
+          if (wgsize[i]*(wgid[i]+1) > m_globalSize[i])
+            wgsize[i] = m_globalSize[i] % wgsize[i];
+        }
+
+        workerState.workGroup = new WorkGroup(this, wgid, wgsize);
         m_context->notifyWorkGroupBegin(workerState.workGroup);
       }
 
diff --git a/src/core/KernelInvocation.h b/src/core/KernelInvocation.h
index edca291..9f512ba 100644
--- a/src/core/KernelInvocation.h
+++ b/src/core/KernelInvocation.h
@@ -35,6 +35,8 @@ namespace oclgrind
     size_t getWorkDim() const;
     bool switchWorkItem(const Size3 gid);
 
+    int getWorkerID() const;
+
   private:
     KernelInvocation(const Context *context, const Kernel *kernel,
                      unsigned int workDim,
@@ -58,7 +60,7 @@ namespace oclgrind
     std::list<WorkGroup*> m_runningGroups;
 
     // Worker threads
-    void runWorker();
+    void runWorker(int id);
     unsigned m_numWorkers;
   };
 }
diff --git a/src/core/Memory.cpp b/src/core/Memory.cpp
index 289badc..17e6311 100644
--- a/src/core/Memory.cpp
+++ b/src/core/Memory.cpp
@@ -91,13 +91,19 @@ size_t Memory::allocateBuffer(size_t size, cl_mem_flags flags,
   return address;
 }
 
-uint32_t Memory::atomic(AtomicOp op, size_t address, uint32_t value)
+template uint64_t Memory::atomic(AtomicOp op, size_t address, uint64_t value);
+template int64_t Memory::atomic(AtomicOp op, size_t address, int64_t value);
+template uint32_t Memory::atomic(AtomicOp op, size_t address, uint32_t value);
+template int32_t Memory::atomic(AtomicOp op, size_t address, int32_t value);
+
+template<typename T>
+T Memory::atomic(AtomicOp op, size_t address, T value)
 {
-  m_context->notifyMemoryAtomicLoad(this, op, address, 4);
-  m_context->notifyMemoryAtomicStore(this, op, address, 4);
+  m_context->notifyMemoryAtomicLoad(this, op, address, sizeof(T));
+  m_context->notifyMemoryAtomicStore(this, op, address, sizeof(T));
 
   // Bounds check
-  if (!isAddressValid(address, 4))
+  if (!isAddressValid(address, sizeof(T)))
   {
     return 0;
   }
@@ -105,12 +111,12 @@ uint32_t Memory::atomic(AtomicOp op, size_t address, uint32_t value)
   // Get buffer
   size_t offset = extractOffset(address);
   Buffer *buffer = m_memory[extractBuffer(address)];
-  uint32_t *ptr = (uint32_t*)(buffer->data + offset);
+  T *ptr = (T*)(buffer->data + offset);
 
   if (m_addressSpace == AddrSpaceGlobal)
     ATOMIC_MUTEX(offset).lock();
 
-  uint32_t old = *ptr;
+  T old = *ptr;
   switch(op)
   {
   case AtomicAdd:
@@ -154,12 +160,16 @@ uint32_t Memory::atomic(AtomicOp op, size_t address, uint32_t value)
   return old;
 }
 
-uint32_t Memory::atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value)
+template uint32_t Memory::atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value);
+template uint64_t Memory::atomicCmpxchg(size_t address, uint64_t cmp, uint64_t value);
+
+template<typename T>
+T Memory::atomicCmpxchg(size_t address, T cmp, T value)
 {
-  m_context->notifyMemoryAtomicLoad(this, AtomicCmpXchg, address, 4);
+  m_context->notifyMemoryAtomicLoad(this, AtomicCmpXchg, address, sizeof(T));
 
   // Bounds check
-  if (!isAddressValid(address, 4))
+  if (!isAddressValid(address, sizeof(T)))
   {
     return 0;
   }
@@ -167,18 +177,18 @@ uint32_t Memory::atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value)
   // Get buffer
   size_t offset = extractOffset(address);
   Buffer *buffer = m_memory[extractBuffer(address)];
-  uint32_t *ptr = (uint32_t*)(buffer->data + offset);
+  T *ptr = (T *)(buffer->data + offset);
 
   if (m_addressSpace == AddrSpaceGlobal)
     ATOMIC_MUTEX(offset).lock();
 
   // Perform cmpxchg
-  uint32_t old = *ptr;
+  T old = *ptr;
   if (old == cmp)
   {
     *ptr = value;
 
-    m_context->notifyMemoryAtomicStore(this, AtomicCmpXchg, address, 4);
+    m_context->notifyMemoryAtomicStore(this, AtomicCmpXchg, address, sizeof(T));
   }
 
   if (m_addressSpace == AddrSpaceGlobal)
diff --git a/src/core/Memory.h b/src/core/Memory.h
index 71f0c45..a416c0b 100644
--- a/src/core/Memory.h
+++ b/src/core/Memory.h
@@ -28,8 +28,8 @@ namespace oclgrind
 
     size_t allocateBuffer(size_t size, cl_mem_flags flags=0,
                           const uint8_t *initData = NULL);
-    uint32_t atomic(AtomicOp op, size_t address, uint32_t value = 0);
-    uint32_t atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value);
+    template<typename T> T atomic(AtomicOp op, size_t address, T value = 0);
+    template<typename T> T atomicCmpxchg(size_t address, T cmp, T value);
     void clear();
     size_t createHostBuffer(size_t size, void *ptr, cl_mem_flags flags=0);
     bool copy(size_t dest, size_t src, size_t size);
diff --git a/src/core/Program.cpp b/src/core/Program.cpp
index 6480ed7..c0b6e71 100644
--- a/src/core/Program.cpp
+++ b/src/core/Program.cpp
@@ -17,7 +17,12 @@
 #include <dlfcn.h>
 #endif
 
+#if LLVM_VERSION < 40
 #include "llvm/Bitcode/ReaderWriter.h"
+#else
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#endif
 #include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
@@ -35,6 +40,7 @@
 
 #include "Context.h"
 #include "Kernel.h"
+#include "Memory.h"
 #include "Program.h"
 #include "WorkItem.h"
 
@@ -50,8 +56,8 @@
 #endif
 
 #define REMAP_INPUT "input.cl"
-#define CLC_H_PATH REMAP_DIR"clc.h"
-extern const char CLC_H_DATA[];
+#define OPENCL_C_H_PATH REMAP_DIR"opencl-c.h"
+extern const char OPENCL_C_H_DATA[];
 
 const char *EXTENSIONS[] =
 {
@@ -61,6 +67,8 @@ const char *EXTENSIONS[] =
   "cl_khr_global_int32_extended_atomics",
   "cl_khr_local_int32_base_atomics",
   "cl_khr_local_int32_extended_atomics",
+  "cl_khr_int64_base_atomics",
+  "cl_khr_int64_extended_atomics",
   "cl_khr_byte_addressable_store",
 };
 
@@ -74,6 +82,9 @@ Program::Program(const Context *context, llvm::Module *module)
   m_buildOptions = "";
   m_buildStatus = CL_BUILD_SUCCESS;
   m_uid = generateUID();
+  m_totalProgramScopeVarSize = 0;
+
+  allocateProgramScopeVars();
 }
 
 Program::Program(const Context *context, const string& source)
@@ -84,6 +95,7 @@ Program::Program(const Context *context, const string& source)
   m_buildOptions = "";
   m_buildStatus = CL_BUILD_NONE;
   m_uid = 0;
+  m_totalProgramScopeVarSize = 0;
 
   // Split source into individual lines
   m_sourceLines.clear();
@@ -101,6 +113,122 @@ Program::Program(const Context *context, const string& source)
 Program::~Program()
 {
   clearInterpreterCache();
+  deallocateProgramScopeVars();
+}
+
+// Allocates a global-memory buffer for every module-level variable in the
+// global or constant address space and records a pointer to it in
+// m_programScopeVars, then copies each variable's initializer into its buffer.
+// A FatalError raised while initializing is printed to cerr, not rethrown.
+void Program::allocateProgramScopeVars()
+{
+  deallocateProgramScopeVars();
+
+  Memory *globalMemory = m_context->getGlobalMemory();
+
+  // Create the pointer values for each global variable
+  llvm::Module::const_global_iterator itr;
+  for (itr = m_module->global_begin(); itr != m_module->global_end(); itr++)
+  {
+    unsigned addrspace = itr->getType()->getPointerAddressSpace();
+    if (addrspace != AddrSpaceGlobal && addrspace != AddrSpaceConstant)
+      continue;
+
+    // Allocate global variable
+    const llvm::Type *type = itr->getType()->getPointerElementType();
+    size_t size = getTypeSize(type);
+    size_t ptr = globalMemory->allocateBuffer(size);
+    m_totalProgramScopeVarSize += size;
+
+    // Create pointer value
+    TypedValue ptrValue =
+      {sizeof(size_t), 1, new uint8_t[sizeof(size_t)]};
+    ptrValue.setPointer(ptr);
+    m_programScopeVars[&*itr] = ptrValue;
+  }
+
+  try
+  {
+    // Initialize global variables
+    for (auto &entry : m_programScopeVars)
+    {
+      auto var = llvm::cast<llvm::GlobalVariable>(entry.first);
+      if (!var->hasInitializer()) // getInitializer() asserts on declarations
+        continue;
+      const llvm::Constant *initializer = var->getInitializer();
+
+      size_t varptr = entry.second.getPointer();
+      if (initializer->getType()->getTypeID() == llvm::Type::PointerTyID)
+      {
+        size_t ptr = resolveConstantPointer(initializer, m_programScopeVars);
+        globalMemory->store((uint8_t*)&ptr, varptr, sizeof(size_t));
+      }
+      else
+      {
+        size_t size = getTypeSize(initializer->getType());
+        uint8_t *data = new uint8_t[size];
+        getConstantData(data, initializer);
+        globalMemory->store(data, varptr, size);
+        delete[] data;
+      }
+    }
+  }
+  catch (FatalError& err)
+  {
+    cerr << endl << "OCLGRIND FATAL ERROR "
+         << "(" << err.getFile() << ":" << err.getLine() << ")"
+         << endl << err.what()
+         << endl << "When initializing program scope global variables"
+         << endl;
+  }
+}
+
+// Extracts the next space-delimited token from input, editing the buffer in
+// place: leading spaces are skipped, double quotes are dropped (spaces inside
+// them are kept), "\ " becomes a space, and the token is NUL-terminated at the
+// start of input. *next is set to where the following token search resumes.
+void split_token(char *input, char **next)
+{
+  char *output = input; // write cursor: token is rebuilt from the buffer start
+
+  // Strip leading spaces
+  while (*input == ' ')
+    input++;
+
+  // Loop until end of string
+  bool quoted = false;
+  while (*input != '\0')
+  {
+    // Stop at space, unless we're in quotes
+    if (*input == ' ' && !quoted)
+      break;
+
+    if (*input == '"')
+    {
+      // Enter/exit quoted region, don't emit quote
+      quoted = !quoted;
+    }
+    else
+    {
+      // An escaped space is emitted as a plain space: skip the backslash
+      if (*input == '\\' && *(input+1) == ' ')
+        input++;
+
+      // Copy character to output string
+      *output = *input;
+      output++;
+    }
+
+    input++;
+  }
+
+  // Point *next just past the delimiting space, or at the NUL if input ended
+  *next = input;
+  if (**next != '\0')
+    (*next)++;
+
+  // Terminate the token; done last because output may sit on the delimiter
+  *output = '\0';
+}
 
 bool Program::build(const char *options, list<Header> headers)
@@ -116,6 +244,9 @@ bool Program::build(const char *options, list<Header> headers)
   if (m_source.empty() && m_module)
   {
     m_buildStatus = CL_BUILD_SUCCESS;
+
+    allocateProgramScopeVars();
+
     return true;
   }
 
@@ -130,33 +261,48 @@ bool Program::build(const char *options, list<Header> headers)
 
   // Set compiler arguments
   vector<const char*> args;
-  args.push_back("-cl-std=CL1.2");
   args.push_back("-cl-kernel-arg-info");
+  args.push_back("-D__IMAGE_SUPPORT__=1");
+  args.push_back("-D__OPENCL_VERSION__=120");
   args.push_back("-fno-builtin");
-#if LLVM_VERSION >= 38
+  args.push_back("-fgnu89-inline");
   args.push_back("-debug-info-kind=standalone");
-#else
-  args.push_back("-g");
-#endif
+  args.push_back("-dwarf-column-info");
   args.push_back("-triple");
   if (sizeof(size_t) == 4)
     args.push_back("spir-unknown-unknown");
   else
     args.push_back("spir64-unknown-unknown");
 
+#if ! IS_BIG_ENDIAN
+  args.push_back("-D__ENDIAN_LITTLE__=1");
+#endif
+
+#if LLVM_VERSION < 40
   // Define extensions
   for (unsigned i = 0; i < sizeof(EXTENSIONS)/sizeof(const char*); i++)
   {
     args.push_back("-D");
     args.push_back(EXTENSIONS[i]);
   }
+#else
+  // Disable all extensions
+  std::string cl_ext("-cl-ext=-all");
+  // Explicitly enable supported extensions
+  for (unsigned i = 0; i < sizeof(EXTENSIONS)/sizeof(const char*); i++)
+  {
+    cl_ext += ",+" + std::string(EXTENSIONS[i]);
+  }
+  args.push_back(cl_ext.c_str());
+#endif
 
   // Disable Clang's optimizations.
   // We will manually run optimization passes and legalize the IR later.
   args.push_back("-O0");
 
   bool optimize = true;
-  bool cl12     = true;
+  const char *clstd = NULL;
+  m_requiresUniformWorkGroups = false;
 
   // Disable optimizations by default if in interactive mode
   if (checkEnv("OCLGRIND_INTERACTIVE"))
@@ -171,8 +317,17 @@ bool Program::build(const char *options, list<Header> headers)
     extraOptions = "";
   char *tmpOptions = new char[strlen(mainOptions) + strlen(extraOptions) + 2];
   sprintf(tmpOptions, "%s %s", mainOptions, extraOptions);
-  for (char *opt = strtok(tmpOptions, " "); opt; opt = strtok(NULL, " "))
+  char *opt = tmpOptions;
+  char *next = NULL;
+  while (strlen(opt) > 0)
   {
+    // Split token up to next unquoted space
+    if (next)
+      opt = next;
+    split_token(opt, &next);
+    if (!strlen(opt))
+      break;
+
     // Ignore options that break PCH
     if (strcmp(opt, "-cl-fast-relaxed-math") != 0 &&
         strcmp(opt, "-cl-finite-math-only") != 0 &&
@@ -191,20 +346,21 @@ bool Program::build(const char *options, list<Header> headers)
         continue;
       }
 
-#if LLVM_VERSION >= 37
       // Clang no longer supports -cl-no-signed-zeros
       if (strcmp(opt, "-cl-no-signed-zeros") == 0)
         continue;
-#endif
+
+      // Check for -cl-uniform-work-group-size flag
+      if (strcmp(opt, "-cl-uniform-work-group-size") == 0)
+      {
+        m_requiresUniformWorkGroups = true;
+        continue;
+      }
 
       // Check for -cl-std flag
       if (strncmp(opt, "-cl-std=", 8) == 0)
       {
-        if (strcmp(opt+8, "CL1.2") != 0)
-        {
-          cl12 = false;
-          args.push_back(opt);
-        }
+        clstd = opt;
         continue;
       }
 
@@ -212,15 +368,21 @@ bool Program::build(const char *options, list<Header> headers)
     }
   }
 
-  if (cl12)
+  if (!clstd)
   {
-    args.push_back("-cl-std=CL1.2");
+    clstd = "-cl-std=CL1.2";
   }
+  args.push_back(clstd);
+
+  // If compiling for OpenCL 1.X, require uniform work-groups
+  if (strncmp(clstd, "-cl-std=CL1.", 12) == 0)
+    m_requiresUniformWorkGroups = true;
 
   // Pre-compiled header
   char *pchdir = NULL;
   char *pch    = NULL;
-  if (!checkEnv("OCLGRIND_DISABLE_PCH") && cl12)
+  if (!checkEnv("OCLGRIND_DISABLE_PCH") &&
+      (!strcmp(clstd, "-cl-std=CL1.2") || !strcmp(clstd, "-cl-std=CL2.0")))
   {
     const char *pchdirOverride = getenv("OCLGRIND_PCH_DIR");
     if (pchdirOverride)
@@ -266,8 +428,9 @@ bool Program::build(const char *options, list<Header> headers)
     if (pchdir)
     {
       // Select precompiled header
-      pch = new char[strlen(pchdir) + 20];
-      sprintf(pch, "%s/clc%d.pch", pchdir, (sizeof(size_t) == 4 ? 32 : 64));
+      pch = new char[strlen(pchdir) + 24];
+      sprintf(pch, "%s/opencl-c-%s-%d.pch",
+              pchdir, clstd+10, (sizeof(size_t) == 4 ? 32 : 64));
 
       // Check if precompiled header exists
       ifstream pchfile(pch);
@@ -297,9 +460,9 @@ bool Program::build(const char *options, list<Header> headers)
   }
   else
   {
-    // Fall back to embedded clc.h
+    // Fall back to embedded opencl-c.h
     args.push_back("-include");
-    args.push_back(CLC_H_PATH);
+    args.push_back(OPENCL_C_H_PATH);
   }
 
   // Append input file to arguments (remapped later)
@@ -318,7 +481,12 @@ bool Program::build(const char *options, list<Header> headers)
   compiler.createDiagnostics(diagConsumer, false);
 
   // Create compiler invocation
+#if LLVM_VERSION < 40
   clang::CompilerInvocation *invocation = new clang::CompilerInvocation;
+#else
+  std::shared_ptr<clang::CompilerInvocation> invocation(
+      new clang::CompilerInvocation);
+#endif
   clang::CompilerInvocation::CreateFromArgs(*invocation,
                                             &args[0], &args[0] + args.size(),
                                             compiler.getDiagnostics());
@@ -336,9 +504,10 @@ bool Program::build(const char *options, list<Header> headers)
                                                    buffer.release());
   }
 
-  // Remap clc.h
-  buffer = llvm::MemoryBuffer::getMemBuffer(CLC_H_DATA, "", false);
-  compiler.getPreprocessorOpts().addRemappedFile(CLC_H_PATH, buffer.release());
+  // Remap opencl-c.h
+  buffer = llvm::MemoryBuffer::getMemBuffer(OPENCL_C_H_DATA, "", false);
+  compiler.getPreprocessorOpts().addRemappedFile(
+    OPENCL_C_H_PATH, buffer.release());
 
   // Remap input file
   buffer = llvm::MemoryBuffer::getMemBuffer(m_source, "", false);
@@ -363,10 +532,6 @@ bool Program::build(const char *options, list<Header> headers)
       // Initialize pass managers
       llvm::legacy::PassManager modulePasses;
       llvm::legacy::FunctionPassManager functionPasses(m_module.get());
-#if LLVM_VERSION < 37
-      modulePasses.add(new llvm::DataLayoutPass());
-      functionPasses.add(new llvm::DataLayoutPass());
-#endif
 
       // Populate pass managers with -Oz
       llvm::PassManagerBuilder builder;
@@ -386,6 +551,8 @@ bool Program::build(const char *options, list<Header> headers)
 
     removeLValueLoads();
 
+    allocateProgramScopeVars();
+
     m_buildStatus = CL_BUILD_SUCCESS;
   }
   else
@@ -429,7 +596,11 @@ bool Program::build(const char *options, list<Header> headers)
 
       // Dump bitcode
       llvm::raw_fd_ostream bc(tempBC, err, llvm::sys::fs::F_None);
+#if LLVM_VERSION < 70
       llvm::WriteBitcodeToFile(m_module.get(), bc);
+#else
+      llvm::WriteBitcodeToFile(*m_module, bc);
+#endif
       bc.close();
     }
 
@@ -469,10 +640,10 @@ Program* Program::createFromBitcode(const Context *context,
   }
 
   // Parse bitcode into IR module
-#if LLVM_VERSION < 37
-  llvm::ErrorOr<llvm::Module*> module =
-#else
+#if LLVM_VERSION < 40
   llvm::ErrorOr<unique_ptr<llvm::Module>> module =
+#else
+  llvm::Expected<unique_ptr<llvm::Module>> module =
 #endif
     parseBitcodeFile(buffer->getMemBufferRef(), *context->getLLVMContext());
   if (!module)
@@ -480,11 +651,7 @@ Program* Program::createFromBitcode(const Context *context,
     return NULL;
   }
 
-#if LLVM_VERSION < 37
-  return new Program(context, module.get());
-#else
   return new Program(context, module.get().release());
-#endif
 }
 
 Program* Program::createFromBitcodeFile(const Context *context,
@@ -499,10 +666,10 @@ Program* Program::createFromBitcodeFile(const Context *context,
   }
 
   // Parse bitcode into IR module
-#if LLVM_VERSION < 37
-  llvm::ErrorOr<llvm::Module*> module =
-#else
+#if LLVM_VERSION < 40
   llvm::ErrorOr<unique_ptr<llvm::Module>> module =
+#else
+  llvm::Expected<unique_ptr<llvm::Module>> module =
 #endif
     parseBitcodeFile(buffer->get()->getMemBufferRef(),
                      *context->getLLVMContext());
@@ -511,11 +678,7 @@ Program* Program::createFromBitcodeFile(const Context *context,
     return NULL;
   }
 
-#if LLVM_VERSION < 37
-  return new Program(context, module.get());
-#else
   return new Program(context, module.get().release());
-#endif
 }
 
 Program* Program::createFromPrograms(const Context *context,
@@ -523,20 +686,16 @@ Program* Program::createFromPrograms(const Context *context,
 {
   llvm::Module *module = new llvm::Module("oclgrind_linked",
                                           *context->getLLVMContext());
-#if LLVM_VERSION < 38
-  llvm::Linker linker(module);
-#else
   llvm::Linker linker(*module);
-#endif
 
   // Link modules
   list<const Program*>::iterator itr;
   for (itr = programs.begin(); itr != programs.end(); itr++)
   {
-#if LLVM_VERSION < 38
-    llvm::Module *m = llvm::CloneModule((*itr)->m_module.get());
-#else
+#if LLVM_VERSION < 70
     unique_ptr<llvm::Module> m = llvm::CloneModule((*itr)->m_module.get());
+#else
+    unique_ptr<llvm::Module> m = llvm::CloneModule(*(*itr)->m_module);
 #endif
     if (linker.linkInModule(std::move(m)))
     {
@@ -554,37 +713,7 @@ Kernel* Program::createKernel(const string name)
 
   // Iterate over functions in module to find kernel
   llvm::Function *function = NULL;
-#if LLVM_VERSION < 37
-  // Query the SPIR kernel list
-  llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
-  // No kernels in module
-  if (!tuple)
-    return NULL;
-
-  for (unsigned i = 0; i < tuple->getNumOperands(); ++i)
-  {
-    llvm::MDNode* kernel = tuple->getOperand(i);
-
-    llvm::ConstantAsMetadata *cam =
-      llvm::dyn_cast<llvm::ConstantAsMetadata>(kernel->getOperand(0).get());
-    if (!cam)
-      continue;
-
-    llvm::Function *kernelFunction =
-      llvm::dyn_cast<llvm::Function>(cam->getValue());
 
-    // Shouldn't really happen - this would mean an invalid Module as input
-    if (!kernelFunction)
-      continue;
-
-    // Is this the kernel we want?
-    if (kernelFunction->getName() == name)
-    {
-      function = kernelFunction;
-      break;
-    }
-  }
-#else
   for (auto F = m_module->begin(); F != m_module->end(); F++)
   {
     if (F->getCallingConv() == llvm::CallingConv::SPIR_KERNEL &&
@@ -594,7 +723,6 @@ Kernel* Program::createKernel(const string name)
       break;
     }
   }
-#endif
 
   if (function == NULL)
   {
@@ -623,6 +751,19 @@ Kernel* Program::createKernel(const string name)
   }
 }
 
+void Program::deallocateProgramScopeVars()
+{
+  for (auto psv  = m_programScopeVars.begin();
+            psv != m_programScopeVars.end();
+            psv++)
+  {
+    m_context->getGlobalMemory()->deallocateBuffer(psv->second.getPointer());
+    delete[] psv->second.data;
+  }
+  m_programScopeVars.clear();
+  m_totalProgramScopeVarSize = 0;
+}
+
 void Program::getBinary(unsigned char *binary) const
 {
   if (!m_module)
@@ -630,7 +771,11 @@ void Program::getBinary(unsigned char *binary) const
 
   std::string str;
   llvm::raw_string_ostream stream(str);
+#if LLVM_VERSION < 70
   llvm::WriteBitcodeToFile(m_module.get(), stream);
+#else
+  llvm::WriteBitcodeToFile(*m_module, stream);
+#endif
   stream.str();
 
   memcpy(binary, str.c_str(), str.length());
@@ -645,7 +790,11 @@ size_t Program::getBinarySize() const
 
   std::string str;
   llvm::raw_string_ostream stream(str);
+#if LLVM_VERSION < 70
   llvm::WriteBitcodeToFile(m_module.get(), stream);
+#else
+  llvm::WriteBitcodeToFile(*m_module, stream);
+#endif
   stream.str();
   return str.length();
 }
@@ -685,33 +834,6 @@ const InterpreterCache* Program::getInterpreterCache(
 list<string> Program::getKernelNames() const
 {
   list<string> names;
-
-#if LLVM_VERSION < 37
-  // Query the SPIR kernel list
-  llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
-
-  if (tuple)
-  {
-    for (unsigned i = 0; i < tuple->getNumOperands(); ++i)
-    {
-      llvm::MDNode* kernel = tuple->getOperand(i);
-
-      llvm::ConstantAsMetadata *cam =
-      llvm::dyn_cast<llvm::ConstantAsMetadata>(kernel->getOperand(0).get());
-      if (!cam)
-        continue;
-
-      llvm::Function *kernelFunction =
-        llvm::dyn_cast<llvm::Function>(cam->getValue());
-
-      // Shouldn't really happen - this would mean an invalid Module as input
-      if (!kernelFunction)
-        continue;
-
-      names.push_back(kernelFunction->getName());
-    }
-  }
-#else
   for (auto F = m_module->begin(); F != m_module->end(); F++)
   {
     if (F->getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
@@ -719,27 +841,19 @@ list<string> Program::getKernelNames() const
       names.push_back(F->getName());
     }
   }
-#endif
-
   return names;
 }
 
+llvm::LLVMContext& Program::getLLVMContext() const
+{
+  return m_module->getContext();
+}
+
 unsigned int Program::getNumKernels() const
 {
   assert(m_module);
 
-#if LLVM_VERSION < 37
-  // Extract kernels from metadata
-  llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
-
-  // No kernels in module
-  if (!tuple)
-    return 0;
-
-  return tuple->getNumOperands();
-#else
   unsigned int num = 0;
-
   for (auto F = m_module->begin(); F != m_module->end(); F++)
   {
     if (F->getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
@@ -747,9 +861,12 @@ unsigned int Program::getNumKernels() const
       num++;
     }
   }
-
   return num;
-#endif
+}
+
+const TypedValue& Program::getProgramScopeVar(const llvm::Value *variable) const
+{
+  return m_programScopeVars.at(variable);
 }
 
 const string& Program::getSource() const
@@ -770,6 +887,11 @@ size_t Program::getNumSourceLines() const
   return m_sourceLines.size();
 }
 
+size_t Program::getTotalProgramScopeVarSize() const
+{
+  return m_totalProgramScopeVarSize;
+}
+
 unsigned long Program::getUID() const
 {
   return m_uid;
@@ -825,6 +947,11 @@ void Program::removeLValueLoads()
   }
 }
 
+bool Program::requiresUniformWorkGroups() const
+{
+  return m_requiresUniformWorkGroups;
+}
+
 void Program::scalarizeAggregateStore(llvm::StoreInst *store)
 {
   llvm::IntegerType *gepIndexType = (sizeof(size_t)==8) ?
@@ -852,9 +979,7 @@ void Program::scalarizeAggregateStore(llvm::StoreInst *store)
       }
       indices.push_back(index);
       scalarPtr = llvm::GetElementPtrInst::Create(
-#if LLVM_VERSION > 36
         gep->getPointerOperandType()->getPointerElementType(),
-#endif
         gep->getPointerOperand(), indices);
     }
     else
@@ -864,9 +989,7 @@ void Program::scalarizeAggregateStore(llvm::StoreInst *store)
       indices.push_back(llvm::ConstantInt::getSigned(gepIndexType, 0));
       indices.push_back(index);
       scalarPtr = llvm::GetElementPtrInst::Create(
-#if LLVM_VERSION > 36
         vectorPtr->getType()->getPointerElementType(),
-#endif
         vectorPtr, indices);
     }
     scalarPtr->setDebugLoc(store->getDebugLoc());
@@ -984,9 +1107,7 @@ void Program::scalarizeAggregateStore(llvm::StoreInst *store)
         }
         gepIndices.push_back(llvm::ConstantInt::getSigned(gepIndexType, index));
         scalarPtr = llvm::GetElementPtrInst::Create(
-#if LLVM_VERSION > 36
           gep->getPointerOperandType()->getPointerElementType(),
-#endif
           gep->getPointerOperand(), gepIndices);
       }
       else
@@ -996,9 +1117,7 @@ void Program::scalarizeAggregateStore(llvm::StoreInst *store)
         gepIndices.push_back(llvm::ConstantInt::getSigned(gepIndexType, 0));
         gepIndices.push_back(llvm::ConstantInt::getSigned(gepIndexType, index));
         scalarPtr = llvm::GetElementPtrInst::Create(
-#if LLVM_VERSION > 36
           vectorPtr->getType()->getPointerElementType(),
-#endif
           vectorPtr, gepIndices);
       }
       scalarPtr->setDebugLoc(store->getDebugLoc());
diff --git a/src/core/Program.h b/src/core/Program.h
index 8b901c9..cdd1c9d 100644
--- a/src/core/Program.h
+++ b/src/core/Program.h
@@ -11,6 +11,7 @@
 namespace llvm
 {
   class Function;
+  class LLVMContext;
   class Module;
   class StoreInst;
 }
@@ -50,11 +51,15 @@ namespace oclgrind
     const InterpreterCache* getInterpreterCache(
       const llvm::Function *kernel) const;
     std::list<std::string> getKernelNames() const;
+    llvm::LLVMContext& getLLVMContext() const;
     unsigned int getNumKernels() const;
     const std::string& getSource() const;
     const char* getSourceLine(size_t lineNumber) const;
     size_t getNumSourceLines() const;
+    const TypedValue& getProgramScopeVar(const llvm::Value *var) const;
+    size_t getTotalProgramScopeVarSize() const;
     unsigned long getUID() const;
+    bool requiresUniformWorkGroups() const;
 
   private:
     Program(const Context *context, llvm::Module *module);
@@ -67,9 +72,16 @@ namespace oclgrind
     const Context *m_context;
     std::vector<std::string> m_sourceLines;
 
+    bool m_requiresUniformWorkGroups;
+
+    TypedValueMap m_programScopeVars;
+    size_t m_totalProgramScopeVarSize;
+
     unsigned long m_uid;
     unsigned long generateUID() const;
 
+    void allocateProgramScopeVars();
+    void deallocateProgramScopeVars();
     void pruneDeadCode(llvm::Instruction*);
     void removeLValueLoads();
     void scalarizeAggregateStore(llvm::StoreInst *store);
diff --git a/src/core/WorkGroup.cpp b/src/core/WorkGroup.cpp
index 2c891c6..3542dae 100644
--- a/src/core/WorkGroup.cpp
+++ b/src/core/WorkGroup.cpp
@@ -23,10 +23,16 @@ using namespace oclgrind;
 using namespace std;
 
 WorkGroup::WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid)
+  : WorkGroup(kernelInvocation, wgid, kernelInvocation->getLocalSize())
+{
+}
+
+WorkGroup::WorkGroup(const KernelInvocation *kernelInvocation,
+                     Size3 wgid, Size3 size)
  : m_context(kernelInvocation->getContext())
 {
-  m_groupID = wgid;
-  m_groupSize = kernelInvocation->getLocalSize();
+  m_groupID   = wgid;
+  m_groupSize = size;
 
   m_groupIndex = (m_groupID.x +
                  (m_groupID.y +
diff --git a/src/core/WorkGroup.h b/src/core/WorkGroup.h
index 73cb2b7..b3469a7 100644
--- a/src/core/WorkGroup.h
+++ b/src/core/WorkGroup.h
@@ -57,6 +57,7 @@ namespace oclgrind
 
   public:
     WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid);
+    WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid, Size3 size);
     virtual ~WorkGroup();
 
     size_t async_copy(
diff --git a/src/core/WorkItem.cpp b/src/core/WorkItem.cpp
index 4441a33..19bf30d 100644
--- a/src/core/WorkItem.cpp
+++ b/src/core/WorkItem.cpp
@@ -52,7 +52,7 @@ WorkItem::WorkItem(const KernelInvocation *kernelInvocation,
 
   // Compute global ID
   Size3 groupID = workGroup->getGroupID();
-  Size3 groupSize = workGroup->getGroupSize();
+  Size3 groupSize = kernelInvocation->getLocalSize();
   Size3 globalOffset = kernelInvocation->getGlobalOffset();
   m_globalID.x = lid.x + groupID.x*groupSize.x + globalOffset.x;
   m_globalID.y = lid.y + groupID.y*groupSize.y + globalOffset.y;
@@ -476,18 +476,48 @@ const unsigned char* WorkItem::getValueData(const llvm::Value *value) const
   return getValue(value).data;
 }
 
-const llvm::Value* WorkItem::getVariable(std::string name) const
+const WorkGroup* WorkItem::getWorkGroup() const
 {
+  return m_workGroup;
+}
+
+bool WorkItem::hasValue(const llvm::Value *key) const
+{
+  return m_cache->hasValue(key);
+}
+
+void WorkItem::printExpression(string expr) const
+{
+  // Split base variable name from rest of expression
+  size_t split;
+  string basename;
+  if ((split = expr.find_first_of(".-[")) != string::npos)
+  {
+    basename = expr.substr(0, split);
+    expr = expr.substr(split);
+  }
+  else
+  {
+    basename = expr;
+    expr = "";
+  }
+
+  const llvm::Value *baseValue = NULL;
+  const llvm::DIVariable *divar = NULL;
+
   // Check private variables
   VariableMap::const_iterator itr;
-  itr = m_variables.find(name);
+  itr = m_variables.find(basename);
   if (itr != m_variables.end())
-    return itr->second;
+  {
+    baseValue = itr->second.first;
+    divar = itr->second.second;
+  }
 
   // Check global variables
   string globalName = m_position->currBlock->getParent()->getName();
   globalName += ".";
-  globalName += name;
+  globalName += basename;
   const llvm::Module *module =
     m_kernelInvocation->getKernel()->getFunction()->getParent();
   for (auto global = module->global_begin();
@@ -495,61 +525,196 @@ const llvm::Value* WorkItem::getVariable(std::string name) const
             global++)
   {
     if (global->getName() == globalName)
-      return &*global;
+    {
+      baseValue = &*global;
+
+#if LLVM_VERSION >= 40
+      llvm::SmallVector<llvm::DIGlobalVariableExpression*, 3> GVEs;
+      global->getDebugInfo(GVEs);
+      if (GVEs.size() == 0)
+#endif
+      {
+        cout << "global variable debug information not found";
+        return;
+      }
+#if LLVM_VERSION >= 40
+      // TODO: Does it matter which GVE we pick?
+      divar = llvm::dyn_cast<llvm::DIGlobalVariable>(GVEs[0]->getRawVariable());
+#endif
+    }
   }
 
-  return NULL;
-}
+  // Check that we found the target variable
+  if (!baseValue)
+  {
+    cout << "not found";
+    return;
+  }
 
-const WorkGroup* WorkItem::getWorkGroup() const
-{
-  return m_workGroup;
-}
+  // Get variable data and type
+  TypedValue result = getOperand(baseValue);
+  unsigned char *data = result.data;
+  const llvm::Type *type = baseValue->getType();
+  const llvm::Metadata *mdtype = divar->getRawType();
 
-bool WorkItem::hasValue(const llvm::Value *key) const
-{
-  return m_cache->hasValue(key);
-}
-
-bool WorkItem::printValue(const llvm::Value *value) const
-{
-  if (!hasValue(value))
+  // Auto-dereference global variables and allocas
+  if (baseValue->getValueID() == llvm::Value::GlobalVariableVal ||
+      ((const llvm::Instruction*)baseValue)->getOpcode()
+         == llvm::Instruction::Alloca)
   {
-    return false;
+    size_t address = result.getPointer();
+    Memory *memory = getMemory(type->getPointerAddressSpace());
+    data = (unsigned char*)memory->getPointer(address);
+    type = type->getPointerElementType();
   }
 
-  printTypedData(value->getType(), getValue(value).data);
+  // Handle rest of print expression
+  while (!expr.empty())
+  {
+    bool member = false;
+    bool dereference = false;
+    size_t subscript = 0;
 
-  return true;
+    // Handle special characters
+    if (expr[0] == '.')
+    {
+      expr = expr.substr(1);
+      member = true;
+    }
+    else if (!expr.compare(0, 2, "->"))
+    {
+      expr = expr.substr(2);
+      dereference = true;
+      member = true;
+    }
+    else if (expr[0] == '[')
+    {
+      // Find end of subscript
+      size_t end = expr.find(']');
+      if (end == string::npos)
+      {
+        cout << "missing ']'" << endl;
+        return;
+      }
+
+      // Parse index value
+      stringstream ss(expr.substr(1, end-1));
+      ss >> subscript;
+      if (!ss.eof())
+      {
+        cout << "invalid subscript index" << endl;
+        return;
+      }
+
+      expr = expr.substr(end+1);
+      dereference = true;
+    }
+    else
+    {
+      cout << "invalid print expression";
+      return;
+    }
+
+    // Dereference a pointer if user requested
+    if (dereference)
+    {
+      auto ptrtype = llvm::dyn_cast<llvm::DIDerivedType>(mdtype);
+      if (!ptrtype || ptrtype->getTag() != llvm::dwarf::DW_TAG_pointer_type)
+      {
+        cout << "not a pointer type";
+        return;
+      }
+
+      // Get pointer value
+      size_t address = *(size_t*)data;
+      Memory *memory = getMemory(type->getPointerAddressSpace());
+
+      // Check address is valid
+      auto elemType = type->getPointerElementType();
+      size_t elemSize = getTypeSize(elemType);
+      if (!memory->isAddressValid(address + subscript * elemSize, elemSize))
+      {
+        cout << "invalid memory address";
+        return;
+      }
+
+      // Get pointer to data and add offset
+      data = (unsigned char*)memory->getPointer(address);
+      data += subscript * elemSize;
+
+      // Update types
+      mdtype = ptrtype->getRawBaseType();
+      type = elemType;
+    }
+
+    // Deal with structure elements
+    if (member)
+    {
+      // Split at next special character
+      size_t split;
+      string element;
+      if ((split = expr.find_first_of(".-[")) != string::npos)
+      {
+        element = expr.substr(0, split);
+        expr = expr.substr(split);
+      }
+      else
+      {
+        element = expr;
+        expr = "";
+      }
+
+      // Deal with typedef
+      auto ditype = llvm::dyn_cast<llvm::DIType>(mdtype);
+      if (ditype->getTag() == llvm::dwarf::DW_TAG_typedef)
+      {
+        mdtype = llvm::dyn_cast<llvm::DIDerivedType>(ditype)->getRawBaseType();
+      }
+
+      // Ensure we have a composite type
+      auto composite_type = llvm::dyn_cast<llvm::DICompositeType>(mdtype);
+      if (!composite_type)
+      {
+        cout << "not a composite type";
+        return;
+      }
+
+      // Find element with matching name
+      bool found = false;
+      auto elements = composite_type->getElements();
+      unsigned numElements = elements->getNumOperands();
+      for (unsigned i = 0; i < numElements; i++)
+      {
+        auto elem =
+          llvm::dyn_cast<llvm::DIDerivedType>(elements->getOperand(i));
+        if (elem->getName() == element)
+        {
+          // Increment data pointer by offset and update type
+          type = type->getStructElementType(i);
+          mdtype = elem->getRawBaseType();
+          data = data + elem->getOffsetInBits()/8;
+          found = true;
+        }
+      }
+      if (!found)
+      {
+        cout << "no member named '" << element << "' found";
+        return;
+      }
+    }
+  }
+
+  printTypedData(type, data);
 }
 
-bool WorkItem::printVariable(string name) const
+bool WorkItem::printValue(const llvm::Value *value) const
 {
-  // Find variable
-  const llvm::Value *value = getVariable(name);
-  if (!value)
+  if (!hasValue(value))
   {
     return false;
   }
 
-  // Get variable value
-  TypedValue result = getOperand(value);
-  const llvm::Type *type = value->getType();
-
-  if (value->getValueID() == llvm::Value::GlobalVariableVal ||
-      ((const llvm::Instruction*)value)->getOpcode()
-         == llvm::Instruction::Alloca)
-  {
-    // If value is alloca or global variable, look-up data at address
-    size_t address = result.getPointer();
-    Memory *memory = getMemory(value->getType()->getPointerAddressSpace());
-    unsigned char *data = (unsigned char*)memory->getPointer(address);
-    printTypedData(value->getType()->getPointerElementType(), data);
-  }
-  else
-  {
-    printTypedData(type, result.data);
-  }
+  printTypedData(value->getType(), getValue(value).data);
 
   return true;
 }
@@ -644,22 +809,7 @@ INSTRUCTION(ashr)
 
 INSTRUCTION(bitcast)
 {
-  const llvm::Value *op = instruction->getOperand(0);
-
-  // Check for address space casts
-  if (instruction->getType()->isPointerTy())
-  {
-    unsigned srcAddrSpace = op->getType()->getPointerAddressSpace();
-    unsigned dstAddrSpace = instruction->getType()->getPointerAddressSpace();
-    if (srcAddrSpace != dstAddrSpace)
-    {
-      FATAL_ERROR("Invalid pointer cast from %s to %s address spaces",
-                  getAddressSpaceName(srcAddrSpace),
-                  getAddressSpaceName(dstAddrSpace));
-    }
-  }
-
-  TypedValue operand = getOperand(op);
+  TypedValue operand = getOperand(instruction->getOperand(0));
   memcpy(result.data, operand.data, result.size*result.num);
 }
 
@@ -970,50 +1120,18 @@ INSTRUCTION(gep)
     (const llvm::GetElementPtrInst*)instruction;
 
   // Get base address
-  const llvm::Value *base = gepInst->getPointerOperand();
-  size_t address = getOperand(base).getPointer();
+  size_t base = getOperand(gepInst->getPointerOperand()).getPointer();
   const llvm::Type *ptrType = gepInst->getPointerOperandType();
 
-  // Iterate over indices
+  // Get indices
+  std::vector<int64_t> offsets;
   llvm::User::const_op_iterator opItr;
   for (opItr = gepInst->idx_begin(); opItr != gepInst->idx_end(); opItr++)
   {
-    int64_t offset = getOperand(opItr->get()).getSInt();
-
-    if (ptrType->isPointerTy())
-    {
-      // Get pointer element size
-      const llvm::Type *elemType = ptrType->getPointerElementType();
-      address += offset*getTypeSize(elemType);
-      ptrType = elemType;
-    }
-    else if (ptrType->isArrayTy())
-    {
-      // Get array element size
-      const llvm::Type *elemType = ptrType->getArrayElementType();
-      address += offset*getTypeSize(elemType);
-      ptrType = elemType;
-    }
-    else if (ptrType->isVectorTy())
-    {
-      // Get vector element size
-      const llvm::Type *elemType = ptrType->getVectorElementType();
-      address += offset*getTypeSize(elemType);
-      ptrType = elemType;
-    }
-    else if (ptrType->isStructTy())
-    {
-      address +=
-        getStructMemberOffset((const llvm::StructType*)ptrType, offset);
-      ptrType = ptrType->getStructElementType(offset);
-    }
-    else
-    {
-      FATAL_ERROR("Unsupported GEP base type: %d", ptrType->getTypeID());
-    }
+    offsets.push_back(getOperand(opItr->get()).getSInt());
   }
 
-  result.setPointer(address);
+  result.setPointer(resolveGEP(base, ptrType, offsets));
 }
 
 INSTRUCTION(icmp)
@@ -1135,7 +1253,7 @@ INSTRUCTION(itrunc)
   TypedValue op = getOperand(instruction->getOperand(0));
   for (unsigned i = 0; i < result.num; i++)
   {
-    memcpy(result.data+i*result.size, op.data+i*op.size, result.size);
+    result.setUInt(op.getUInt(i), i);
   }
 }
 
@@ -1394,9 +1512,19 @@ INSTRUCTION(swtch)
   const llvm::SwitchInst *swtch = (const llvm::SwitchInst*)instruction;
   const llvm::Value *cond = swtch->getCondition();
   uint64_t val = getOperand(cond).getUInt();
-  const llvm::ConstantInt *cval =
-    (const llvm::ConstantInt*)llvm::ConstantInt::get(cond->getType(), val);
-  m_position->nextBlock = swtch->findCaseValue(cval).getCaseSuccessor();
+
+  // Look for case matching condition value
+  for (auto C : swtch->cases())
+  {
+    if (C.getCaseValue()->getZExtValue() == val)
+    {
+      m_position->nextBlock = C.getCaseSuccessor();
+      return;
+    }
+  }
+
+  // No matching cases - use default
+  m_position->nextBlock = swtch->getDefaultDest();
 }
 
 INSTRUCTION(udiv)
@@ -1533,7 +1661,11 @@ InterpreterCache::~InterpreterCache()
   for (constExprItr  = m_constExpressions.begin();
        constExprItr != m_constExpressions.end(); constExprItr++)
   {
+#if LLVM_VERSION < 50
     delete constExprItr->second;
+#else
+    constExprItr->second->deleteValue();
+#endif
   }
 }
 
diff --git a/src/core/WorkItem.h b/src/core/WorkItem.h
index 738df37..9da269b 100644
--- a/src/core/WorkItem.h
+++ b/src/core/WorkItem.h
@@ -13,7 +13,7 @@ namespace llvm
   class BasicBlock;
   class CallInst;
   class ConstExpr;
-  class DbgValueInst;
+  class DILocalVariable;
   class Function;
   class Module;
 }
@@ -76,7 +76,7 @@ namespace oclgrind
     typedef std::unordered_map<const llvm::Value*, unsigned> ValueMap;
     typedef std::unordered_map<const llvm::Function*, Builtin> BuiltinMap;
     typedef std::unordered_map<const llvm::Value*, TypedValue> ConstantMap;
-    typedef std::unordered_map<const llvm::Value*, const llvm::Instruction*>
+    typedef std::unordered_map<const llvm::Value*, llvm::Instruction*>
       ConstExprMap;
 
     BuiltinMap m_builtins;
@@ -113,10 +113,9 @@ namespace oclgrind
     Memory* getPrivateMemory() const;
     State getState() const;
     const unsigned char* getValueData(const llvm::Value *value) const;
-    const llvm::Value* getVariable(std::string name) const;
     const WorkGroup* getWorkGroup() const;
+    void printExpression(std::string expr) const;
     bool printValue(const llvm::Value *value) const;
-    bool printVariable(std::string name) const;
     State step();
 
     // SPIR instructions
@@ -173,7 +172,9 @@ namespace oclgrind
 #undef INSTRUCTION
 
   private:
-    typedef std::map<std::string, const llvm::Value*> VariableMap;
+    typedef std::map<std::string,
+                     std::pair<const llvm::Value*,
+                               const llvm::DILocalVariable*>> VariableMap;
 
     size_t m_globalIndex;
     Size3 m_globalID;
diff --git a/src/core/WorkItemBuiltins.cpp b/src/core/WorkItemBuiltins.cpp
index 0bd7837..ddde4a2 100644
--- a/src/core/WorkItemBuiltins.cpp
+++ b/src/core/WorkItemBuiltins.cpp
@@ -18,9 +18,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
-#if LLVM_VERSION > 36
 #include "llvm/IR/DebugInfoMetadata.h"
-#endif
 
 #include "CL/cl.h"
 #include "Context.h"
@@ -33,18 +31,6 @@
 using namespace oclgrind;
 using namespace std;
 
-#define CLK_NORMALIZED_COORDS_TRUE 0x0001
-
-#define CLK_ADDRESS_NONE 0x0000
-#define CLK_ADDRESS_CLAMP_TO_EDGE 0x0002
-#define CLK_ADDRESS_CLAMP 0x0004
-#define CLK_ADDRESS_REPEAT 0x0006
-#define CLK_ADDRESS_MIRRORED_REPEAT 0x0008
-#define CLK_ADDRESS_MASK 0x000E
-
-#define CLK_FILTER_NEAREST 0x0010
-#define CLK_FILTER_LINEAR 0x0020
-
 #ifndef M_PI
 #define M_PI 3.1415926535897932384626433832795
 #endif
@@ -275,162 +261,89 @@ namespace oclgrind
     //////////////////////
     // Atomic Functions //
     //////////////////////
+    static bool _is_signed_type(char c)
+    {
+      const string signed_vals("casilxn"); //CXXNameMangler
+      return signed_vals.find(c) != string::npos;
+    }
+
+    DEFINE_BUILTIN(atomic_op)
+    {
+      const static map<string, AtomicOp> name_to_op = {
+        { "atomic_add", AtomicAdd },
+        { "atom_add", AtomicAdd },
+        { "atomic_and", AtomicAnd },
+        { "atom_and", AtomicAnd },
+        { "atom_cmpxchg", AtomicCmpXchg },
+        { "atomic_cmpxchg", AtomicCmpXchg },
+        { "atom_dec", AtomicDec },
+        { "atomic_dec", AtomicDec },
+        { "atom_inc", AtomicInc },
+        { "atomic_inc", AtomicInc },
+        { "atom_max", AtomicMax },
+        { "atomic_max", AtomicMax },
+        { "atom_min", AtomicMin },
+        { "atomic_min", AtomicMin },
+        { "atom_or", AtomicOr },
+        { "atomic_or", AtomicOr },
+        { "atom_sub", AtomicSub },
+        { "atomic_sub", AtomicSub },
+        { "atom_xchg", AtomicXchg },
+        { "atomic_xchg", AtomicXchg },
+        { "atom_xor", AtomicXor },
+        { "atomic_xor", AtomicXor },
+      };
 
-    DEFINE_BUILTIN(atomic_add)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
-
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_add");
-      }
-      uint32_t old = memory->atomic(AtomicAdd, address, UARG(1));
-      result.setUInt(old);
-    }
-
-    DEFINE_BUILTIN(atomic_and)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
-
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_and");
-      }
-      uint32_t old = memory->atomic(AtomicAnd, address, UARG(1));
-      result.setUInt(old);
-    }
-
-    DEFINE_BUILTIN(atomic_cmpxchg)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
-
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_cmpxchg");
-      }
-      uint32_t old = memory->atomicCmpxchg(address, UARG(1), UARG(2));
-      result.setUInt(old);
-    }
-
-    DEFINE_BUILTIN(atomic_dec)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
-
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_dec");
-      }
-      uint32_t old = memory->atomic(AtomicDec, address);
-      result.setUInt(old);
-    }
-
-    DEFINE_BUILTIN(atomic_inc)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
-
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_dec");
-      }
-      uint32_t old = memory->atomic(AtomicInc, address);
-      result.setUInt(old);
-    }
-
-    DEFINE_BUILTIN(atomic_max)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
-
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_max");
-      }
-      uint32_t old = memory->atomic(AtomicMax, address, UARG(1));
-      result.setUInt(old);
-    }
-
-    DEFINE_BUILTIN(atomic_min)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
-
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_min");
-      }
-      uint32_t old = memory->atomic(AtomicMin, address, UARG(1));
-      result.setUInt(old);
-    }
-
-    DEFINE_BUILTIN(atomic_or)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
-
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_or");
-      }
-      uint32_t old = memory->atomic(AtomicOr, address, UARG(1));
-      result.setUInt(old);
-    }
-
-    DEFINE_BUILTIN(atomic_sub)
-    {
       Memory *memory =
         workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
 
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_sub");
-      }
-      uint32_t old = memory->atomic(AtomicSub, address, UARG(1));
-      result.setUInt(old);
-    }
-
-    DEFINE_BUILTIN(atomic_xchg)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+      const bool is_64bit(ARG(0)->getType()->getPointerElementType()->getScalarSizeInBits() == 64);
+      const bool is_signed_type(_is_signed_type(overload.back()));
+      const auto op(name_to_op.at(fnName));
 
       size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_xchg");
+      // Verify the address is 4/8-byte aligned
+      if ((address & ((is_64bit ? 8 : 4) - 1)) != 0) {
+        workItem->m_context->logError(("Unaligned address on " + fnName).c_str());
       }
-      uint32_t old = memory->atomic(AtomicXchg, address, UARG(1));
-      result.setUInt(old);
-    }
 
-    DEFINE_BUILTIN(atomic_xor)
-    {
-      Memory *memory =
-        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
-
-      size_t address = PARG(0);
-      // Verify the address is 4-byte aligned
-      if ((address & 0x3) != 0) {
-        workItem->m_context->logError("Unaligned address on atomic_xor");
+      uint64_t old;
+      if (op == AtomicCmpXchg) {
+        if (is_64bit) {
+          old = memory->atomicCmpxchg<uint64_t>(address, UARG(1), UARG(2));
+        } else {
+          old = memory->atomicCmpxchg<uint32_t>(address, UARG(1), UARG(2));
+        }
+      } else if (op == AtomicInc || op == AtomicDec) {
+        if (is_64bit) {
+          old = memory->atomic<uint64_t>(op, address);
+        } else {
+          old = memory->atomic<uint32_t>(op, address);
+        }
+      } else if (op == AtomicMax || op == AtomicMin) {
+        if (is_64bit) {
+          if (is_signed_type) {
+            old = memory->atomic<int64_t>(op, address, SARG(1));
+          } else {
+            old = memory->atomic<uint64_t>(op, address, UARG(1));
+          }
+        } else {
+          if (is_signed_type) {
+            old = memory->atomic<int32_t>(op, address, SARG(1));
+          } else {
+            old = memory->atomic<uint32_t>(op, address, UARG(1));
+          }
+        }
+      } else {
+        if (is_64bit) {
+          old = memory->atomic<uint64_t>(op, address, UARG(1));
+        } else {
+          old = memory->atomic<uint32_t>(op, address, UARG(1));
+        }
       }
-      uint32_t old = memory->atomic(AtomicXor, address, UARG(1));
       result.setUInt(old);
     }
 
-
     //////////////////////
     // Common Functions //
     //////////////////////
@@ -1289,7 +1202,7 @@ namespace oclgrind
 
     DEFINE_BUILTIN(translate_sampler_initializer)
     {
-      // A sampler initializer is just a pointer to its ConstantInt object
+      // A sampler initializer is just a pointer to its ConstantInt value
       result.setPointer((size_t)ARG(0));
     }
 
@@ -2357,6 +2270,34 @@ namespace oclgrind
       }
     }
 
+    DEFINE_BUILTIN(fmax_builtin)
+    {
+      TypedValue a = workItem->getOperand(ARG(0));
+      TypedValue b = workItem->getOperand(ARG(1));
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double _b = b.num > 1 ? b.getFloat(i) : b.getFloat();
+        if (result.size == 4)
+          result.setFloat(fmaxf(a.getFloat(i), _b), i);
+        else
+          result.setFloat(fmax(a.getFloat(i), _b), i);
+      }
+    }
+
+    DEFINE_BUILTIN(fmin_builtin)
+    {
+      TypedValue a = workItem->getOperand(ARG(0));
+      TypedValue b = workItem->getOperand(ARG(1));
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double _b = b.num > 1 ? b.getFloat(i) : b.getFloat();
+        if (result.size == 4)
+          result.setFloat(fminf(a.getFloat(i), _b), i);
+        else
+          result.setFloat(fmin(a.getFloat(i), _b), i);
+      }
+    }
+
     static double _maxmag_(double x, double y)
     {
       double _x = fabs(x);
@@ -2448,7 +2389,11 @@ namespace oclgrind
     {
       for (unsigned i = 0; i < result.num; i++)
       {
-        result.setSInt(ilogb(FARGV(0, i)), i);
+        double x = FARGV(0, i);
+        if (std::isnan(x))
+          result.setSInt(INT_MAX, i);
+        else
+          result.setSInt(ilogb(x), i);
       }
     }
 
@@ -2819,7 +2764,7 @@ namespace oclgrind
     // Synchronization Functions //
     ///////////////////////////////
 
-    DEFINE_BUILTIN(barrier)
+    DEFINE_BUILTIN(work_group_barrier)
     {
       workItem->m_state = WorkItem::BARRIER;
       workItem->m_workGroup->notifyBarrier(workItem, callInst, UARG(0));
@@ -2995,6 +2940,14 @@ namespace oclgrind
       result.setUInt(r);
     }
 
+    DEFINE_BUILTIN(get_enqueued_local_size)
+    {
+      uint64_t dim = UARG(0);
+      size_t r = dim < 3 ?
+        workItem->m_kernelInvocation->getLocalSize()[dim] : 0;
+      result.setUInt(r);
+    }
+
     DEFINE_BUILTIN(get_num_groups)
     {
       uint64_t dim = UARG(0);
@@ -3011,11 +2964,37 @@ namespace oclgrind
       result.setUInt(workItem->m_kernelInvocation->getWorkDim());
     }
 
+    DEFINE_BUILTIN(get_global_linear_id)
+    {
+      Size3 globalID = workItem->m_globalID;
+      Size3 globalSize = workItem->m_kernelInvocation->getGlobalSize();
+      Size3 globalOffset = workItem->m_kernelInvocation->getGlobalOffset();
+      size_t r =
+        ((globalID.z - globalOffset.z)  * globalSize.y +
+         (globalID.y - globalOffset.y)) * globalSize.x +
+          globalID.x - globalOffset.x;
+      result.setUInt(r);
+    }
+
+    DEFINE_BUILTIN(get_local_linear_id)
+    {
+      Size3 localID = workItem->m_localID;
+      Size3 localSize = workItem->m_workGroup->getGroupSize();
+      size_t r =
+        (localID.z * localSize.y + localID.y) * localSize.x + localID.x;
+      result.setUInt(r);
+    }
 
     /////////////////////
     // Other Functions //
     /////////////////////
 
+    DEFINE_BUILTIN(astype)
+    {
+      TypedValue src = workItem->getOperand(ARG(0));
+      memcpy(result.data, src.data, src.size*src.num);
+    }
+
     static void setConvertRoundingMode(const string& name, int def)
     {
       size_t rpos = name.find("_rt");
@@ -3115,6 +3094,7 @@ namespace oclgrind
           case 'd':
           case 'f':
             f = FARGV(0, i);
+            break;
           default:
             FATAL_ERROR("Unsupported argument type: %c",
                         getOverloadArgType(overload));
@@ -3452,24 +3432,8 @@ namespace oclgrind
         (const llvm::DbgDeclareInst*)callInst;
       const llvm::Value *addr = dbgInst->getAddress();
 
-#if LLVM_VERSION > 36
-     const llvm::DILocalVariable *var = dbgInst->getVariable();
-     workItem->m_variables[var->getName()] = addr;
-#else
-      const llvm::MDNode *var = dbgInst->getVariable();
-      llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(var->getOperand(0));
-      if (str)
-      {
-        // TODO: There must be a better way of getting the variable name...
-        unsigned length = str->getLength();
-        const char *name = str->getString().str().c_str();
-        if (length > strlen(name) + 1)
-        {
-          name += strlen(name) + 1;
-          workItem->m_variables[name] = addr;
-        }
-      }
-#endif
+      const llvm::DILocalVariable *var = dbgInst->getVariable();
+      workItem->m_variables[var->getName()] = {addr, var};
     }
 
     DEFINE_BUILTIN(llvm_dbg_value)
@@ -3480,24 +3444,8 @@ namespace oclgrind
       // TODO: Use offset?
       //uint64_t offset = dbgInst->getOffset();
 
-#if LLVM_VERSION > 36
       const llvm::DILocalVariable *var = dbgInst->getVariable();
-      workItem->m_variables[var->getName()] = value;
-#else
-      const llvm::MDNode *var = dbgInst->getVariable();
-      llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(var->getOperand(0));
-      if (str)
-      {
-        // TODO: There must be a better way of getting the variable name...
-        unsigned length = str->getLength();
-        const char *name = str->getString().str().c_str();
-        if (length > strlen(name) + 1)
-        {
-          name += strlen(name) + 1;
-          workItem->m_variables[name] = value;
-        }
-      }
-#endif
+      workItem->m_variables[var->getName()] = {value, var};
     }
 
     DEFINE_BUILTIN(llvm_lifetime_start)
@@ -3573,28 +3521,28 @@ namespace oclgrind
     ADD_BUILTIN("prefetch", prefetch, NULL);
 
     // Atomic Functions
-    ADD_BUILTIN("atom_add", atomic_add, NULL);
-    ADD_BUILTIN("atomic_add", atomic_add, NULL);
-    ADD_BUILTIN("atom_and", atomic_and, NULL);
-    ADD_BUILTIN("atomic_and", atomic_and, NULL);
-    ADD_BUILTIN("atom_cmpxchg", atomic_cmpxchg, NULL);
-    ADD_BUILTIN("atomic_cmpxchg", atomic_cmpxchg, NULL);
-    ADD_BUILTIN("atom_dec", atomic_dec, NULL);
-    ADD_BUILTIN("atomic_dec", atomic_dec, NULL);
-    ADD_BUILTIN("atom_inc", atomic_inc, NULL);
-    ADD_BUILTIN("atomic_inc", atomic_inc, NULL);
-    ADD_BUILTIN("atom_max", atomic_max, NULL);
-    ADD_BUILTIN("atomic_max", atomic_max, NULL);
-    ADD_BUILTIN("atom_min", atomic_min, NULL);
-    ADD_BUILTIN("atomic_min", atomic_min, NULL);
-    ADD_BUILTIN("atom_or", atomic_or, NULL);
-    ADD_BUILTIN("atomic_or", atomic_or, NULL);
-    ADD_BUILTIN("atom_sub", atomic_sub, NULL);
-    ADD_BUILTIN("atomic_sub", atomic_sub, NULL);
-    ADD_BUILTIN("atom_xchg", atomic_xchg, NULL);
-    ADD_BUILTIN("atomic_xchg", atomic_xchg, NULL);
-    ADD_BUILTIN("atom_xor", atomic_xor, NULL);
-    ADD_BUILTIN("atomic_xor", atomic_xor, NULL);
+    ADD_BUILTIN("atom_add", atomic_op, NULL);
+    ADD_BUILTIN("atomic_add", atomic_op, NULL);
+    ADD_BUILTIN("atom_and", atomic_op, NULL);
+    ADD_BUILTIN("atomic_and", atomic_op, NULL);
+    ADD_BUILTIN("atom_cmpxchg", atomic_op, NULL);
+    ADD_BUILTIN("atomic_cmpxchg", atomic_op, NULL);
+    ADD_BUILTIN("atom_dec", atomic_op, NULL);
+    ADD_BUILTIN("atomic_dec", atomic_op, NULL);
+    ADD_BUILTIN("atom_inc", atomic_op, NULL);
+    ADD_BUILTIN("atomic_inc", atomic_op, NULL);
+    ADD_BUILTIN("atom_max", atomic_op, NULL);
+    ADD_BUILTIN("atomic_max", atomic_op, NULL);
+    ADD_BUILTIN("atom_min", atomic_op, NULL);
+    ADD_BUILTIN("atomic_min", atomic_op, NULL);
+    ADD_BUILTIN("atom_or", atomic_op, NULL);
+    ADD_BUILTIN("atomic_or", atomic_op, NULL);
+    ADD_BUILTIN("atom_sub", atomic_op, NULL);
+    ADD_BUILTIN("atomic_sub", atomic_op, NULL);
+    ADD_BUILTIN("atom_xchg", atomic_op, NULL);
+    ADD_BUILTIN("atomic_xchg", atomic_op, NULL);
+    ADD_BUILTIN("atom_xor", atomic_op, NULL);
+    ADD_BUILTIN("atomic_xor", atomic_op, NULL);
 
     // Common Functions
     ADD_BUILTIN("clamp", clamp, NULL);
@@ -3682,8 +3630,8 @@ namespace oclgrind
     ADD_BUILTIN("fdim", f2arg, F2ARG(fdim));
     ADD_BUILTIN("floor", f1arg, F1ARG(floor));
     ADD_BUILTIN("fma", fma_builtin, NULL);
-    ADD_BUILTIN("fmax", f2arg, F2ARG(fmax));
-    ADD_BUILTIN("fmin", f2arg, F2ARG(fmin));
+    ADD_BUILTIN("fmax", fmax_builtin, NULL);
+    ADD_BUILTIN("fmin", fmin_builtin, NULL);
     ADD_BUILTIN("fmod", f2arg, F2ARG(fmod));
     ADD_BUILTIN("fract", fract, NULL);
     ADD_BUILTIN("frexp", frexp_builtin, NULL);
@@ -3779,7 +3727,8 @@ namespace oclgrind
     ADD_BUILTIN("signbit", rel1arg, _signbit_);
 
     // Synchronization Functions
-    ADD_BUILTIN("barrier", barrier, NULL);
+    ADD_BUILTIN("barrier", work_group_barrier, NULL);
+    ADD_BUILTIN("work_group_barrier", work_group_barrier, NULL);
     ADD_BUILTIN("mem_fence", mem_fence, NULL);
     ADD_BUILTIN("read_mem_fence", mem_fence, NULL);
     ADD_BUILTIN("write_mem_fence", mem_fence, NULL);
@@ -3801,8 +3750,12 @@ namespace oclgrind
     ADD_BUILTIN("get_local_size", get_local_size, NULL);
     ADD_BUILTIN("get_num_groups", get_num_groups, NULL);
     ADD_BUILTIN("get_work_dim", get_work_dim, NULL);
+    ADD_BUILTIN("get_global_linear_id", get_global_linear_id, NULL);
+    ADD_BUILTIN("get_local_linear_id", get_local_linear_id, NULL);
+    ADD_BUILTIN("get_enqueued_local_size", get_enqueued_local_size, NULL);
 
     // Other Functions
+    ADD_PREFIX_BUILTIN("as_",            astype, NULL);
     ADD_PREFIX_BUILTIN("convert_half",   convert_half, NULL);
     ADD_PREFIX_BUILTIN("convert_float",  convert_float, NULL);
     ADD_PREFIX_BUILTIN("convert_double", convert_float, NULL);
diff --git a/src/core/clc.h b/src/core/clc.h
deleted file mode 100644
index 71fe306..0000000
--- a/src/core/clc.h
+++ /dev/null
@@ -1,1041 +0,0 @@
-// clc.h (Oclgrind)
-// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
-// University of Bristol. All rights reserved.
-//
-// This program is provided under a three-clause BSD license. For full
-// license terms please see the LICENSE file distributed with this
-// source code.
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-typedef unsigned char uchar;
-typedef unsigned short ushort;
-typedef unsigned int uint;
-typedef unsigned long ulong;
-
-#if defined(__SPIR32__)
-  typedef uint size_t;
-  typedef int ptrdiff_t;
-#else
-  typedef ulong size_t;
-  typedef long ptrdiff_t;
-#endif
-typedef size_t uintptr_t;
-typedef ptrdiff_t intptr_t;
-
-#define event_t size_t
-
-#define TYPEDEF_VECTOR(type)                                \
-  typedef __attribute__((ext_vector_type(2))) type type##2; \
-  typedef __attribute__((ext_vector_type(3))) type type##3; \
-  typedef __attribute__((ext_vector_type(4))) type type##4; \
-  typedef __attribute__((ext_vector_type(8))) type type##8; \
-  typedef __attribute__((ext_vector_type(16))) type type##16;
-TYPEDEF_VECTOR(char);
-TYPEDEF_VECTOR(uchar);
-TYPEDEF_VECTOR(short);
-TYPEDEF_VECTOR(ushort);
-TYPEDEF_VECTOR(int);
-TYPEDEF_VECTOR(uint);
-TYPEDEF_VECTOR(long);
-TYPEDEF_VECTOR(ulong);
-TYPEDEF_VECTOR(float);
-TYPEDEF_VECTOR(double);
-
-#define __ENDIAN_LITTLE__ 1
-#define __OPENCL_VERSION__ 120
-#define __OPENCL_C_VERSION__ 120
-#define __IMAGE_SUPPORT__ 1
-#define __kernel_exec(X, typen) __kernel                        \
-  __attribute__((work_group_size_hint(X, 1, 1)))                \
-  __attribute__((vec_type_hint(typen)))
-
-#define CHAR_BIT    8
-#define SCHAR_MAX 127
-#define SCHAR_MIN (-128)
-#define UCHAR_MAX 255
-#define CHAR_MAX  SCHAR_MAX
-#define CHAR_MIN  SCHAR_MIN
-#define USHRT_MAX 65535
-#define SHRT_MAX  32767
-#define SHRT_MIN  (-32768)
-#define UINT_MAX  0xffffffff
-#define INT_MAX   2147483647
-#define INT_MIN   (-2147483647-1)
-#define ULONG_MAX 0xffffffffffffffffUL
-#define LONG_MAX  ((long)0x7fffffffffffffffL)
-#define LONG_MIN  ((long)(-0x7fffffffffffffffL-1))
-
-#define FLT_DIG         6
-#define FLT_MANT_DIG    24
-#define FLT_MAX_10_EXP  +38
-#define FLT_MAX_EXP     +128
-#define FLT_MIN_10_EXP  -37
-#define FLT_MIN_EXP     -125
-#define FLT_RADIX       2
-#define FLT_MAX         0x1.fffffep127f
-#define FLT_MIN         0x1.0p-126f
-#define FLT_EPSILON     0x1.0p-23f
-
-#define DBL_DIG         15
-#define DBL_MANT_DIG    53
-#define DBL_MAX_10_EXP  +308
-#define DBL_MAX_EXP     +1024
-#define DBL_MIN_10_EXP  -307
-#define DBL_MIN_EXP     -1021
-#define DBL_RADIX       2
-#define DBL_MAX         0x1.fffffffffffffp1023
-#define DBL_MIN         0x1.0p-1022
-#define DBL_EPSILON     0x1.0p-52
-
-#define FP_ILOGB0       INT_MIN
-#define FP_ILOGBNAN     INT_MIN
-
-#define M_E_F         2.71828182845904523536028747135266250f
-#define M_LOG2E_F     1.44269504088896340735992468100189214f
-#define M_LOG10E_F    0.434294481903251827651128918916605082f
-#define M_LN2_F       0.693147180559945309417232121458176568f
-#define M_LN10_F      2.3025850929940456840179914546843642f
-#define M_PI_F        3.14159265358979323846264338327950288f
-#define M_PI_2_F      1.57079632679489661923132169163975144f
-#define M_PI_4_F      0.785398163397448309615660845819875721f
-#define M_1_PI_F      0.318309886183790671537767526745028724f
-#define M_2_PI_F      0.636619772367581343075535053490057448f
-#define M_2_SQRTPI_F  1.12837916709551257389615890312154517f
-#define M_SQRT2_F     1.41421356237309504880168872420969808f
-#define M_SQRT1_2_F   0.707106781186547524400844362104849039f
-
-#define M_E         2.71828182845904523536028747135266250
-#define M_LOG2E     1.44269504088896340735992468100189214
-#define M_LOG10E    0.434294481903251827651128918916605082
-#define M_LN2       0.693147180559945309417232121458176568
-#define M_LN10      2.30258509299404568401799145468436421
-#define M_PI        3.14159265358979323846264338327950288
-#define M_PI_2      1.57079632679489661923132169163975144
-#define M_PI_4      0.785398163397448309615660845819875721
-#define M_1_PI      0.318309886183790671537767526745028724
-#define M_2_PI      0.636619772367581343075535053490057448
-#define M_2_SQRTPI  1.12837916709551257389615890312154517
-#define M_SQRT2     1.41421356237309504880168872420969808
-#define M_SQRT1_2   0.707106781186547524400844362104849039
-
-#define MAXFLOAT ((float)3.40282346638528860e+38)
-#define HUGE_VALF __builtin_huge_valf()
-#define HUGE_VAL __builtin_huge_val()
-#define INFINITY __builtin_inff()
-#define NAN __builtin_nanf(0)
-
-#define CLK_SNORM_INT8 0x10D0
-#define CLK_SNORM_INT16 0x10D1
-#define CLK_UNORM_INT8 0x10D2
-#define CLK_UNORM_INT16 0x10D3
-#define CLK_UNORM_SHORT_565 0x10D4
-#define CLK_UNORM_SHORT_555 0x10D5
-#define CLK_UNORM_INT_101010 0x10D6
-#define CLK_SIGNED_INT8 0x10D7
-#define CLK_SIGNED_INT16 0x10D8
-#define CLK_SIGNED_INT32 0x10D9
-#define CLK_UNSIGNED_INT8 0x10DA
-#define CLK_UNSIGNED_INT16 0x10DB
-#define CLK_UNSIGNED_INT32 0x10DC
-#define CLK_HALF_FLOAT 0x10DD
-#define CLK_FLOAT 0x10DE
-#define CLK_UNORM_INT24 0x10DF
-
-#define CLK_R 0x10B0
-#define CLK_A 0x10B1
-#define CLK_RG 0x10B2
-#define CLK_RA 0x10B3
-#define CLK_RGB 0x10B4
-#define CLK_RGBA 0x10B5
-#define CLK_BGRA 0x10B6
-#define CLK_ARGB 0x10B7
-#define CLK_INTENSITY 0x10B8
-#define CLK_LUMINANCE 0x10B9
-#define CLK_Rx 0x10BA
-#define CLK_RGx 0x10BB
-#define CLK_RGBx 0x10BC
-#define CLK_DEPTH 0x10BD
-#define CLK_DEPTH_STENCIL 0x10BE
-
-#define CLK_NORMALIZED_COORDS_FALSE 0x0000
-#define CLK_NORMALIZED_COORDS_TRUE 0x0001
-
-#define CLK_ADDRESS_NONE 0x0000
-#define CLK_ADDRESS_CLAMP_TO_EDGE 0x0002
-#define CLK_ADDRESS_CLAMP 0x0004
-#define CLK_ADDRESS_REPEAT 0x0006
-#define CLK_ADDRESS_MIRRORED_REPEAT 0x0008
-
-#define CLK_FILTER_NEAREST 0x0010
-#define CLK_FILTER_LINEAR 0x0020
-
-#define __OVERLOAD__ __attribute__((__overloadable__))
-
-#define BUILTIN_1ARG(rtype, type0, name)  \
-  rtype __OVERLOAD__ name(type0 a);       \
-  rtype##2 __OVERLOAD__ name(type0##2 a); \
-  rtype##3 __OVERLOAD__ name(type0##3 a); \
-  rtype##4 __OVERLOAD__ name(type0##4 a); \
-  rtype##8 __OVERLOAD__ name(type0##8 a); \
-  rtype##16 __OVERLOAD__ name(type0##16 a);
-#define BUILTIN_2ARG(rtype, type0, type1, name)       \
-  rtype __OVERLOAD__ name(type0 a, type1 b);          \
-  rtype##2 __OVERLOAD__ name(type0##2 a, type1##2 b); \
-  rtype##3 __OVERLOAD__ name(type0##3 a, type1##3 b); \
-  rtype##4 __OVERLOAD__ name(type0##4 a, type1##4 b); \
-  rtype##8 __OVERLOAD__ name(type0##8 a, type1##8 b); \
-  rtype##16 __OVERLOAD__ name(type0##16 a, type1##16 b);
-#define BUILTIN_3ARG(rtype, type0, type1, type2, name)            \
-  rtype __OVERLOAD__ name(type0 a, type1 b, type2 c);             \
-  rtype##2 __OVERLOAD__ name(type0##2 a, type1##2 b, type2##2 c); \
-  rtype##3 __OVERLOAD__ name(type0##3 a, type1##3 b, type2##3 c); \
-  rtype##4 __OVERLOAD__ name(type0##4 a, type1##4 b, type2##4 c); \
-  rtype##8 __OVERLOAD__ name(type0##8 a, type1##8 b, type2##8 c); \
-  rtype##16 __OVERLOAD__ name(type0##16 a, type1##16 b, type2##16 c);
-
-#define BUILTIN_1ARG_INTEGERS(name)  \
-  BUILTIN_1ARG(char, char, name)     \
-  BUILTIN_1ARG(uchar, uchar, name)   \
-  BUILTIN_1ARG(short, short, name)   \
-  BUILTIN_1ARG(ushort, ushort, name) \
-  BUILTIN_1ARG(int, int, name)       \
-  BUILTIN_1ARG(uint, uint, name)     \
-  BUILTIN_1ARG(long, long, name)     \
-  BUILTIN_1ARG(ulong, ulong, name);
-#define BUILTIN_2ARG_INTEGERS(name)          \
-  BUILTIN_2ARG(char, char, char, name)       \
-  BUILTIN_2ARG(uchar, uchar, uchar, name)    \
-  BUILTIN_2ARG(short, short, short, name)    \
-  BUILTIN_2ARG(ushort, ushort, ushort, name) \
-  BUILTIN_2ARG(int, int, int, name)          \
-  BUILTIN_2ARG(uint, uint, uint, name)       \
-  BUILTIN_2ARG(long, long, long, name)       \
-  BUILTIN_2ARG(ulong, ulong, ulong, name);
-#define BUILTIN_3ARG_INTEGERS(name)                  \
-  BUILTIN_3ARG(char, char, char, char, name)         \
-  BUILTIN_3ARG(uchar, uchar, uchar, uchar, name)     \
-  BUILTIN_3ARG(short, short, short, short, name)     \
-  BUILTIN_3ARG(ushort, ushort, ushort, ushort, name) \
-  BUILTIN_3ARG(int, int, int, int, name)             \
-  BUILTIN_3ARG(uint, uint, uint, uint, name)         \
-  BUILTIN_3ARG(long, long, long, long, name)         \
-  BUILTIN_3ARG(ulong, ulong, ulong, ulong, name);
-
-#define BUILTIN_1ARG_FLOATS(name)  \
-  BUILTIN_1ARG(float, float, name) \
-  BUILTIN_1ARG(double, double, name);
-#define BUILTIN_2ARG_FLOATS(name)         \
-  BUILTIN_2ARG(float, float, float, name) \
-  BUILTIN_2ARG(double, double, double, name);
-#define BUILTIN_3ARG_FLOATS(name)                \
-  BUILTIN_3ARG(float, float, float, float, name) \
-  BUILTIN_3ARG(double, double, double, double, name);
-
-
-///////////////////////////////////////
-// Async Copy and Prefetch Functions //
-///////////////////////////////////////
-
-#define ASYNC_COPY_TYPE(type)                                                                                       \
-  event_t __OVERLOAD__ async_work_group_copy(__local type*, const __global type*, size_t, event_t);                 \
-  event_t __OVERLOAD__ async_work_group_copy(__global type*, const __local type*, size_t, event_t);                 \
-  event_t __OVERLOAD__ async_work_group_strided_copy(__local type*, const __global type*, size_t, size_t, event_t); \
-  event_t __OVERLOAD__ async_work_group_strided_copy(__global type*, const __local type*, size_t, size_t, event_t);
-#define ASYNC_COPY(type)   \
-  ASYNC_COPY_TYPE(type)    \
-  ASYNC_COPY_TYPE(type##2) \
-  ASYNC_COPY_TYPE(type##3) \
-  ASYNC_COPY_TYPE(type##4) \
-  ASYNC_COPY_TYPE(type##8) \
-  ASYNC_COPY_TYPE(type##16);
-ASYNC_COPY(char);
-ASYNC_COPY(uchar);
-ASYNC_COPY(short);
-ASYNC_COPY(ushort);
-ASYNC_COPY(int);
-ASYNC_COPY(uint);
-ASYNC_COPY(long);
-ASYNC_COPY(ulong);
-ASYNC_COPY(float);
-ASYNC_COPY(double);
-
-void wait_group_events(int, event_t*);
-
-#define PREFETCH(type)                                         \
-  void __OVERLOAD__ prefetch(const __global type*, size_t);    \
-  void __OVERLOAD__ prefetch(const __global type##2*, size_t); \
-  void __OVERLOAD__ prefetch(const __global type##3*, size_t); \
-  void __OVERLOAD__ prefetch(const __global type##4*, size_t); \
-  void __OVERLOAD__ prefetch(const __global type##8*, size_t); \
-  void __OVERLOAD__ prefetch(const __global type##16*, size_t);
-PREFETCH(char);
-PREFETCH(uchar);
-PREFETCH(short);
-PREFETCH(ushort);
-PREFETCH(int);
-PREFETCH(uint);
-PREFETCH(long);
-PREFETCH(ulong);
-PREFETCH(float);
-PREFETCH(double);
-
-
-//////////////////////
-// Atomic Functions //
-//////////////////////
-
-#define ATOMIC_0ARG_DEF(name, type)                  \
-  type __OVERLOAD__ name(volatile __global type *p); \
-  type __OVERLOAD__ name(volatile __local type *p);
-#define ATOMIC_0ARG(name)               \
-  ATOMIC_0ARG_DEF(atom_##name, int);    \
-  ATOMIC_0ARG_DEF(atom_##name, uint);   \
-  ATOMIC_0ARG_DEF(atomic_##name, int);  \
-  ATOMIC_0ARG_DEF(atomic_##name, uint);
-
-#define ATOMIC_1ARG_DEF(name, type)                            \
-  type __OVERLOAD__ name(volatile __global type *p, type val); \
-  type __OVERLOAD__ name(volatile __local type *p, type val);
-#define ATOMIC_1ARG(name)               \
-  ATOMIC_1ARG_DEF(atom_##name, int);    \
-  ATOMIC_1ARG_DEF(atom_##name, uint);   \
-  ATOMIC_1ARG_DEF(atomic_##name, int);  \
-  ATOMIC_1ARG_DEF(atomic_##name, uint);
-
-ATOMIC_1ARG(add);
-ATOMIC_1ARG(and);
-ATOMIC_0ARG(dec);
-ATOMIC_0ARG(inc);
-ATOMIC_1ARG(max);
-ATOMIC_1ARG(min);
-ATOMIC_1ARG(or);
-ATOMIC_1ARG(sub);
-ATOMIC_1ARG(xchg);
-ATOMIC_1ARG_DEF(atom_xchg, float);
-ATOMIC_1ARG_DEF(atomic_xchg, float);
-ATOMIC_1ARG(xor);
-
-int __OVERLOAD__ atom_cmpxchg(volatile __global int *p, int cmp, int val);
-int __OVERLOAD__ atom_cmpxchg(volatile __local int *p, int cmp, int val);
-uint __OVERLOAD__ atom_cmpxchg(volatile __global uint *p, uint cmp, uint val);
-uint __OVERLOAD__ atom_cmpxchg(volatile __local uint *p, uint cmp, uint val);
-int __OVERLOAD__ atomic_cmpxchg(volatile __global int *p, int cmp, int val);
-int __OVERLOAD__ atomic_cmpxchg(volatile __local int *p, int cmp, int val);
-uint __OVERLOAD__ atomic_cmpxchg(volatile __global uint *p, uint cmp, uint val);
-uint __OVERLOAD__ atomic_cmpxchg(volatile __local uint *p, uint cmp, uint val);
-
-
-//////////////////////
-// Common Functions //
-//////////////////////
-
-#define ABS(type)                 \
-  u##type __OVERLOAD__ abs(type); \
-  u##type __OVERLOAD__ abs(u##type);
-#define ABS_DIFF(type)                       \
-  u##type __OVERLOAD__ abs_diff(type, type); \
-  u##type __OVERLOAD__ abs_diff(u##type, u##type);
-#define ABS_BOTH(type) \
-  ABS(type);           \
-  ABS_DIFF(type);
-#define ABS_ALL(type) \
-  ABS_BOTH(type);     \
-  ABS_BOTH(type##2);  \
-  ABS_BOTH(type##3);  \
-  ABS_BOTH(type##4);  \
-  ABS_BOTH(type##8);  \
-  ABS_BOTH(type##16);
-
-ABS_ALL(char);
-ABS_ALL(short);
-ABS_ALL(int);
-ABS_ALL(long);
-BUILTIN_3ARG_FLOATS(clamp);
-BUILTIN_1ARG_FLOATS(degrees);
-BUILTIN_2ARG_FLOATS(max);
-BUILTIN_2ARG_FLOATS(min);
-BUILTIN_3ARG_FLOATS(mix);
-BUILTIN_1ARG_FLOATS(radians);
-BUILTIN_1ARG_FLOATS(sign);
-BUILTIN_3ARG_FLOATS(smoothstep);
-BUILTIN_2ARG_FLOATS(step);
-
-#define COMMON_SCALAR(type, n)                          \
-  type##n __OVERLOAD__ clamp(type##n, type, type);      \
-  type##n __OVERLOAD__ max(type##n, type);              \
-  type##n __OVERLOAD__ min(type##n, type);              \
-  type##n __OVERLOAD__ mix(type##n, type##n, type);     \
-  type##n __OVERLOAD__ smoothstep(type, type, type##n); \
-  type##n __OVERLOAD__ step(type, type##n);
-COMMON_SCALAR(float, 2);
-COMMON_SCALAR(float, 3);
-COMMON_SCALAR(float, 4);
-COMMON_SCALAR(float, 8);
-COMMON_SCALAR(float, 16);
-COMMON_SCALAR(double, 2);
-COMMON_SCALAR(double, 3);
-COMMON_SCALAR(double, 4);
-COMMON_SCALAR(double, 8);
-COMMON_SCALAR(double, 16);
-
-
-/////////////////////////
-// Geometric Functions //
-/////////////////////////
-
-#define GEOM_1ARG(type, name)     \
- type __OVERLOAD__ name(type);    \
- type __OVERLOAD__ name(type##2); \
- type __OVERLOAD__ name(type##3); \
- type __OVERLOAD__ name(type##4); \
- type __OVERLOAD__ name(type##8); \
- type __OVERLOAD__ name(type##16);
-#define GEOM_2ARG(type, name)              \
- type __OVERLOAD__ name(type, type);       \
- type __OVERLOAD__ name(type##2, type##2); \
- type __OVERLOAD__ name(type##3, type##3); \
- type __OVERLOAD__ name(type##4, type##4); \
- type __OVERLOAD__ name(type##8, type##8); \
- type __OVERLOAD__ name(type##16, type##16);
-
-float4 __OVERLOAD__ cross(float4, float4);
-float3 __OVERLOAD__ cross(float3, float3);
-double4 __OVERLOAD__ cross(double4, double4);
-double3 __OVERLOAD__ cross(double3, double3);
-GEOM_2ARG(float, dot);
-GEOM_2ARG(double, dot);
-GEOM_2ARG(float, distance);
-GEOM_2ARG(double, distance);
-GEOM_1ARG(float, length);
-GEOM_1ARG(double, length);
-BUILTIN_1ARG_FLOATS(normalize);
-GEOM_2ARG(float, fast_distance);
-GEOM_2ARG(double, fast_distance);
-GEOM_1ARG(float, fast_length);
-GEOM_1ARG(double, fast_length);
-BUILTIN_1ARG_FLOATS(fast_normalize);
-
-
-/////////////////////
-// Image Functions //
-/////////////////////
-
-#define IMAGE_QUERY(ret, name, type) \
-  ret __OVERLOAD__ name(read_only type image); \
-  ret __OVERLOAD__ name(write_only type image)
-
-IMAGE_QUERY(size_t, get_image_array_size, image1d_array_t);
-IMAGE_QUERY(size_t, get_image_array_size, image2d_array_t);
-
-IMAGE_QUERY(int, get_image_channel_data_type, image1d_t);
-IMAGE_QUERY(int, get_image_channel_data_type, image1d_buffer_t);
-IMAGE_QUERY(int, get_image_channel_data_type, image1d_array_t);
-IMAGE_QUERY(int, get_image_channel_data_type, image2d_t);
-IMAGE_QUERY(int, get_image_channel_data_type, image2d_array_t);
-IMAGE_QUERY(int, get_image_channel_data_type, image3d_t);
-
-IMAGE_QUERY(int, get_image_channel_order, image1d_t);
-IMAGE_QUERY(int, get_image_channel_order, image1d_buffer_t);
-IMAGE_QUERY(int, get_image_channel_order, image1d_array_t);
-IMAGE_QUERY(int, get_image_channel_order, image2d_t);
-IMAGE_QUERY(int, get_image_channel_order, image2d_array_t);
-IMAGE_QUERY(int, get_image_channel_order, image3d_t);
-
-IMAGE_QUERY(int2, get_image_dim, image2d_t);
-IMAGE_QUERY(int2, get_image_dim, image2d_array_t);
-IMAGE_QUERY(int4, get_image_dim, image3d_t);
-
-IMAGE_QUERY(int, get_image_depth, image3d_t);
-IMAGE_QUERY(int, get_image_height, image2d_t);
-IMAGE_QUERY(int, get_image_height, image2d_array_t);
-IMAGE_QUERY(int, get_image_height, image3d_t);
-IMAGE_QUERY(int, get_image_width, image1d_t);
-IMAGE_QUERY(int, get_image_width, image1d_buffer_t);
-IMAGE_QUERY(int, get_image_width, image1d_array_t);
-IMAGE_QUERY(int, get_image_width, image2d_t);
-IMAGE_QUERY(int, get_image_width, image2d_array_t);
-IMAGE_QUERY(int, get_image_width, image3d_t);
-
-float4 __OVERLOAD__ read_imagef(image1d_t, int);
-float4 __OVERLOAD__ read_imagef(image1d_buffer_t, int);
-float4 __OVERLOAD__ read_imagef(image1d_array_t, int2);
-float4 __OVERLOAD__ read_imagef(image2d_t, int2);
-float4 __OVERLOAD__ read_imagef(image2d_array_t, int4);
-float4 __OVERLOAD__ read_imagef(image3d_t, int4);
-
-float4 __OVERLOAD__ read_imagef(image1d_t, sampler_t, int);
-float4 __OVERLOAD__ read_imagef(image1d_t, sampler_t, float);
-float4 __OVERLOAD__ read_imagef(image1d_array_t, sampler_t, int2);
-float4 __OVERLOAD__ read_imagef(image1d_array_t, sampler_t, float2);
-float4 __OVERLOAD__ read_imagef(image2d_t, sampler_t, int2);
-float4 __OVERLOAD__ read_imagef(image2d_t, sampler_t, float2);
-float4 __OVERLOAD__ read_imagef(image2d_array_t, sampler_t, int4);
-float4 __OVERLOAD__ read_imagef(image2d_array_t, sampler_t, float4);
-float4 __OVERLOAD__ read_imagef(image3d_t, sampler_t, int4);
-float4 __OVERLOAD__ read_imagef(image3d_t, sampler_t, float4);
-
-int4 __OVERLOAD__ read_imagei(image1d_t, int);
-int4 __OVERLOAD__ read_imagei(image1d_buffer_t, int);
-int4 __OVERLOAD__ read_imagei(image1d_array_t, int2);
-int4 __OVERLOAD__ read_imagei(image2d_t, int2);
-int4 __OVERLOAD__ read_imagei(image2d_array_t, int4);
-int4 __OVERLOAD__ read_imagei(image3d_t, int4);
-
-int4 __OVERLOAD__ read_imagei(image1d_t, sampler_t, int);
-int4 __OVERLOAD__ read_imagei(image1d_t, sampler_t, float);
-int4 __OVERLOAD__ read_imagei(image1d_array_t, sampler_t, int2);
-int4 __OVERLOAD__ read_imagei(image1d_array_t, sampler_t, float2);
-int4 __OVERLOAD__ read_imagei(image2d_t, sampler_t, int2);
-int4 __OVERLOAD__ read_imagei(image2d_t, sampler_t, float2);
-int4 __OVERLOAD__ read_imagei(image2d_array_t, sampler_t, int4);
-int4 __OVERLOAD__ read_imagei(image2d_array_t, sampler_t, float4);
-int4 __OVERLOAD__ read_imagei(image3d_t, sampler_t, int4);
-int4 __OVERLOAD__ read_imagei(image3d_t, sampler_t, float4);
-
-uint4 __OVERLOAD__ read_imageui(image1d_t, int);
-uint4 __OVERLOAD__ read_imageui(image1d_buffer_t, int);
-uint4 __OVERLOAD__ read_imageui(image1d_array_t, int2);
-uint4 __OVERLOAD__ read_imageui(image2d_t, int2);
-uint4 __OVERLOAD__ read_imageui(image2d_array_t, int4);
-uint4 __OVERLOAD__ read_imageui(image3d_t, int4);
-
-uint4 __OVERLOAD__ read_imageui(image1d_t, sampler_t, int);
-uint4 __OVERLOAD__ read_imageui(image1d_t, sampler_t, float);
-uint4 __OVERLOAD__ read_imageui(image1d_array_t, sampler_t, int2);
-uint4 __OVERLOAD__ read_imageui(image1d_array_t, sampler_t, float2);
-uint4 __OVERLOAD__ read_imageui(image2d_t, sampler_t, int2);
-uint4 __OVERLOAD__ read_imageui(image2d_t, sampler_t, float2);
-uint4 __OVERLOAD__ read_imageui(image2d_array_t, sampler_t, int4);
-uint4 __OVERLOAD__ read_imageui(image2d_array_t, sampler_t, float4);
-uint4 __OVERLOAD__ read_imageui(image3d_t, sampler_t, int4);
-uint4 __OVERLOAD__ read_imageui(image3d_t, sampler_t, float4);
-
-void __OVERLOAD__ write_imagef(write_only image1d_t, int, float4);
-void __OVERLOAD__ write_imagef(write_only image1d_array_t, int2, float4);
-void __OVERLOAD__ write_imagef(write_only image2d_t, int2, float4);
-void __OVERLOAD__ write_imagef(write_only image2d_array_t, int4, float4);
-void __OVERLOAD__ write_imagef(write_only image3d_t, int4, float4);
-void __OVERLOAD__ write_imagei(write_only image1d_t, int, int4);
-void __OVERLOAD__ write_imagei(write_only image1d_array_t, int2, int4);
-void __OVERLOAD__ write_imagei(write_only image2d_t, int2, int4);
-void __OVERLOAD__ write_imagei(write_only image2d_array_t, int4, int4);
-void __OVERLOAD__ write_imagei(write_only image3d_t, int4, int4);
-void __OVERLOAD__ write_imageui(write_only image1d_t, int, uint4);
-void __OVERLOAD__ write_imageui(write_only image1d_array_t, int2, uint4);
-void __OVERLOAD__ write_imageui(write_only image2d_t, int2, uint4);
-void __OVERLOAD__ write_imageui(write_only image2d_array_t, int4, uint4);
-void __OVERLOAD__ write_imageui(write_only image3d_t, int4, uint4);
-
-
-///////////////////////
-// Integer Functions //
-///////////////////////
-
-BUILTIN_2ARG_INTEGERS(add_sat);
-BUILTIN_3ARG_INTEGERS(clamp);
-BUILTIN_1ARG_INTEGERS(clz);
-BUILTIN_2ARG_INTEGERS(hadd);
-BUILTIN_3ARG(int, int, int, int, mad24);
-BUILTIN_3ARG(uint, uint, uint, uint, mad24);
-BUILTIN_3ARG_INTEGERS(mad_hi);
-BUILTIN_3ARG_INTEGERS(mad_sat);
-BUILTIN_2ARG_INTEGERS(max);
-BUILTIN_2ARG_INTEGERS(min);
-BUILTIN_2ARG(int, int, int, mul24);
-BUILTIN_2ARG(uint, uint, uint, mul24);
-BUILTIN_2ARG_INTEGERS(mul_hi);
-BUILTIN_1ARG_INTEGERS(popcount);
-BUILTIN_2ARG_INTEGERS(rhadd);
-BUILTIN_2ARG_INTEGERS(rotate);
-BUILTIN_2ARG_INTEGERS(sub_sat);
-#define UPSAMPLE_SIZES(out, in1, in2)            \
-  out     __OVERLOAD__ upsample(in1, in2);       \
-  out##2  __OVERLOAD__ upsample(in1##2, in2##2); \
-  out##3  __OVERLOAD__ upsample(in1##3, in2##3); \
-  out##4  __OVERLOAD__ upsample(in1##4, in2##4); \
-  out##8  __OVERLOAD__ upsample(in1##8, in2##8); \
-  out##16 __OVERLOAD__ upsample(in1##16, in2##16);
-#define UPSAMPLE(out, in)      \
-  UPSAMPLE_SIZES(out, in, u##in); \
-  UPSAMPLE_SIZES(u##out, u##in, u##in);
-UPSAMPLE(short, char);
-UPSAMPLE(int, short);
-UPSAMPLE(long, int);
-
-
-////////////////////
-// Math Functions //
-////////////////////
-
-#define BUILTIN_2TYPE_PTR(type1, type2, name)     \
- type1 __OVERLOAD__ name(type1, __global type2*); \
- type1 __OVERLOAD__ name(type1, __local type2*);  \
- type1 __OVERLOAD__ name(type1, __private type2*);
-#define BUILTIN_PTR_ARG(type1, type2, name)  \
- BUILTIN_2TYPE_PTR(type1, type2, name)       \
- BUILTIN_2TYPE_PTR(type1##2, type2##2, name) \
- BUILTIN_2TYPE_PTR(type1##3, type2##3, name) \
- BUILTIN_2TYPE_PTR(type1##4, type2##4, name) \
- BUILTIN_2TYPE_PTR(type1##8, type2##8, name) \
- BUILTIN_2TYPE_PTR(type1##16, type2##16, name);
-#define REMQUO(type, addrspace)                                   \
-  type __OVERLOAD__ remquo(type, type, addrspace int*);           \
-  type##2 __OVERLOAD__ remquo(type##2, type##2, addrspace int2*); \
-  type##3 __OVERLOAD__ remquo(type##3, type##3, addrspace int3*); \
-  type##4 __OVERLOAD__ remquo(type##4, type##4, addrspace int4*); \
-  type##8 __OVERLOAD__ remquo(type##8, type##8, addrspace int8*); \
-  type##16 __OVERLOAD__ remquo(type##16, type##16, addrspace int16*);
-
-BUILTIN_1ARG_FLOATS(acos);
-BUILTIN_1ARG_FLOATS(acosh);
-BUILTIN_1ARG_FLOATS(acospi);
-BUILTIN_1ARG_FLOATS(asin);
-BUILTIN_1ARG_FLOATS(asinh);
-BUILTIN_1ARG_FLOATS(asinpi);
-BUILTIN_1ARG_FLOATS(atan);
-BUILTIN_2ARG_FLOATS(atan2);
-BUILTIN_1ARG_FLOATS(atanh);
-BUILTIN_1ARG_FLOATS(atanpi);
-BUILTIN_2ARG_FLOATS(atan2pi);
-BUILTIN_1ARG_FLOATS(cbrt);
-BUILTIN_1ARG_FLOATS(ceil);
-BUILTIN_2ARG_FLOATS(copysign);
-BUILTIN_1ARG_FLOATS(cos);
-BUILTIN_1ARG_FLOATS(cosh);
-BUILTIN_1ARG_FLOATS(cospi);
-BUILTIN_1ARG_FLOATS(erfc);
-BUILTIN_1ARG_FLOATS(erf);
-BUILTIN_1ARG_FLOATS(exp);
-BUILTIN_1ARG_FLOATS(exp2);
-BUILTIN_1ARG_FLOATS(exp10);
-BUILTIN_1ARG_FLOATS(expm1);
-BUILTIN_1ARG_FLOATS(fabs);
-BUILTIN_2ARG_FLOATS(fdim);
-BUILTIN_1ARG_FLOATS(floor);
-BUILTIN_3ARG_FLOATS(fma);
-BUILTIN_2ARG_FLOATS(fmax);
-BUILTIN_2ARG_FLOATS(fmin);
-BUILTIN_2ARG_FLOATS(fmod);
-BUILTIN_PTR_ARG(float, float, fract);
-BUILTIN_PTR_ARG(double, double, fract);
-BUILTIN_PTR_ARG(float, int, frexp);
-BUILTIN_PTR_ARG(double, int, frexp);
-BUILTIN_2ARG_FLOATS(hypot);
-BUILTIN_1ARG(int, float, ilogb);
-BUILTIN_1ARG(int, double, ilogb);
-BUILTIN_2ARG(float, float, int, ldexp);
-BUILTIN_2ARG(double, double, int, ldexp);
-BUILTIN_1ARG_FLOATS(lgamma);
-BUILTIN_PTR_ARG(float, int, lgamma_r);
-BUILTIN_PTR_ARG(double, int, lgamma_r);
-BUILTIN_1ARG_FLOATS(log);
-BUILTIN_1ARG_FLOATS(log2);
-BUILTIN_1ARG_FLOATS(log10);
-BUILTIN_1ARG_FLOATS(log1p);
-BUILTIN_1ARG_FLOATS(logb);
-BUILTIN_3ARG_FLOATS(mad);
-BUILTIN_2ARG_FLOATS(maxmag);
-BUILTIN_2ARG_FLOATS(minmag);
-BUILTIN_PTR_ARG(float, float, modf);
-BUILTIN_PTR_ARG(double, double, modf);
-BUILTIN_1ARG(float, uint, nan);
-BUILTIN_1ARG(double, ulong, nan);
-BUILTIN_2ARG_FLOATS(nextafter);
-BUILTIN_2ARG_FLOATS(pow);
-BUILTIN_2ARG(float, float, int, pown);
-BUILTIN_2ARG(double, double, int, pown);
-BUILTIN_2ARG_FLOATS(powr);
-BUILTIN_2ARG_FLOATS(remainder);
-REMQUO(float, global);
-REMQUO(float, local);
-REMQUO(float, private);
-REMQUO(double, global);
-REMQUO(double, local);
-REMQUO(double, private);
-BUILTIN_1ARG_FLOATS(rint);
-BUILTIN_2ARG(float, float, int, rootn);
-BUILTIN_2ARG(double, double, int, rootn);
-BUILTIN_1ARG_FLOATS(round);
-BUILTIN_1ARG_FLOATS(rsqrt);
-BUILTIN_1ARG_FLOATS(sin);
-BUILTIN_1ARG_FLOATS(sinpi);
-BUILTIN_1ARG_FLOATS(sinh);
-BUILTIN_PTR_ARG(float, float, sincos);
-BUILTIN_PTR_ARG(double, double, sincos);
-BUILTIN_1ARG_FLOATS(sqrt);
-BUILTIN_1ARG_FLOATS(tan);
-BUILTIN_1ARG_FLOATS(tanh);
-BUILTIN_1ARG_FLOATS(tanpi);
-BUILTIN_1ARG_FLOATS(tgamma);
-BUILTIN_1ARG_FLOATS(trunc);
-
-// Native math functions
-BUILTIN_1ARG_FLOATS(half_cos);
-BUILTIN_1ARG_FLOATS(native_cos);
-BUILTIN_2ARG_FLOATS(half_divide);
-BUILTIN_2ARG_FLOATS(native_divide);
-BUILTIN_1ARG_FLOATS(half_exp);
-BUILTIN_1ARG_FLOATS(native_exp);
-BUILTIN_1ARG_FLOATS(half_exp2);
-BUILTIN_1ARG_FLOATS(native_exp2);
-BUILTIN_1ARG_FLOATS(half_exp10);
-BUILTIN_1ARG_FLOATS(native_exp10);
-BUILTIN_1ARG_FLOATS(half_log);
-BUILTIN_1ARG_FLOATS(native_log);
-BUILTIN_1ARG_FLOATS(half_log2);
-BUILTIN_1ARG_FLOATS(native_log2);
-BUILTIN_1ARG_FLOATS(half_log10);
-BUILTIN_1ARG_FLOATS(native_log10);
-BUILTIN_2ARG_FLOATS(half_powr);
-BUILTIN_2ARG_FLOATS(native_powr);
-BUILTIN_1ARG_FLOATS(half_recip);
-BUILTIN_1ARG_FLOATS(native_recip);
-BUILTIN_1ARG_FLOATS(half_rsqrt);
-BUILTIN_1ARG_FLOATS(native_rsqrt);
-BUILTIN_1ARG_FLOATS(half_sin);
-BUILTIN_1ARG_FLOATS(native_sin);
-BUILTIN_1ARG_FLOATS(half_sqrt);
-BUILTIN_1ARG_FLOATS(native_sqrt);
-BUILTIN_1ARG_FLOATS(half_tan);
-BUILTIN_1ARG_FLOATS(native_tan);
-
-
-
-////////////////////////////
-// Misc. Vector Functions //
-////////////////////////////
-
-#define SHUFFLE_TYPE(ret, type, mask)         \
-  ret __OVERLOAD__ shuffle(type, mask);       \
-  ret##2 __OVERLOAD__ shuffle(type, mask##2); \
-  ret##3 __OVERLOAD__ shuffle(type, mask##3); \
-  ret##4 __OVERLOAD__ shuffle(type, mask##4); \
-  ret##8 __OVERLOAD__ shuffle(type, mask##8); \
-  ret##16 __OVERLOAD__ shuffle(type, mask##16);
-#define SHUFFLE(type, mask)          \
-  SHUFFLE_TYPE(type, type, mask);    \
-  SHUFFLE_TYPE(type, type##2, mask); \
-  SHUFFLE_TYPE(type, type##3, mask); \
-  SHUFFLE_TYPE(type, type##4, mask); \
-  SHUFFLE_TYPE(type, type##8, mask); \
-  SHUFFLE_TYPE(type, type##16, mask);
-SHUFFLE(char, uchar);
-SHUFFLE(uchar, uchar);
-SHUFFLE(short, ushort);
-SHUFFLE(ushort, ushort);
-SHUFFLE(int, uint);
-SHUFFLE(uint, uint);
-SHUFFLE(long, ulong);
-SHUFFLE(ulong, ulong);
-SHUFFLE(float, uint);
-SHUFFLE(double, ulong);
-
-#define SHUFFLE2_TYPE(ret, type, mask)               \
-  ret __OVERLOAD__ shuffle2(type, type, mask);       \
-  ret##2 __OVERLOAD__ shuffle2(type, type, mask##2); \
-  ret##3 __OVERLOAD__ shuffle2(type, type, mask##3); \
-  ret##4 __OVERLOAD__ shuffle2(type, type, mask##4); \
-  ret##8 __OVERLOAD__ shuffle2(type, type, mask##8); \
-  ret##16 __OVERLOAD__ shuffle2(type, type, mask##16);
-#define SHUFFLE2(type, mask)          \
-  SHUFFLE2_TYPE(type, type, mask);    \
-  SHUFFLE2_TYPE(type, type##2, mask); \
-  SHUFFLE2_TYPE(type, type##3, mask); \
-  SHUFFLE2_TYPE(type, type##4, mask); \
-  SHUFFLE2_TYPE(type, type##8, mask); \
-  SHUFFLE2_TYPE(type, type##16, mask);
-SHUFFLE2(char, uchar);
-SHUFFLE2(uchar, uchar);
-SHUFFLE2(short, ushort);
-SHUFFLE2(ushort, ushort);
-SHUFFLE2(int, uint);
-SHUFFLE2(uint, uint);
-SHUFFLE2(long, ulong);
-SHUFFLE2(ulong, ulong);
-SHUFFLE2(float, uint);
-SHUFFLE2(double, ulong);
-
-
-//////////////////////////
-// Relational Functions //
-//////////////////////////
-
-#define BUILTIN_ANYALL(name, type) \
-  int __OVERLOAD__ name(type);     \
-  int __OVERLOAD__ name(type##2);  \
-  int __OVERLOAD__ name(type##3);  \
-  int __OVERLOAD__ name(type##4);  \
-  int __OVERLOAD__ name(type##8);  \
-  int __OVERLOAD__ name(type##16);
-#define REL_1ARG(name)            \
-  BUILTIN_1ARG(int, float, name); \
-  BUILTIN_1ARG(long, double, name);
-#define REL_2ARG(name)                   \
-  BUILTIN_2ARG(int, float, float, name); \
-  BUILTIN_2ARG(long, double, double, name);
-BUILTIN_ANYALL(all, char);
-BUILTIN_ANYALL(all, short);
-BUILTIN_ANYALL(all, int);
-BUILTIN_ANYALL(all, long);
-BUILTIN_ANYALL(any, char);
-BUILTIN_ANYALL(any, short);
-BUILTIN_ANYALL(any, int);
-BUILTIN_ANYALL(any, long);
-BUILTIN_3ARG_FLOATS(bitselect);
-BUILTIN_3ARG_INTEGERS(bitselect);
-REL_2ARG(isequal);
-REL_2ARG(isnotequal);
-REL_2ARG(isgreater);
-REL_2ARG(isgreaterequal);
-REL_2ARG(isless);
-REL_2ARG(islessequal);
-REL_2ARG(islessgreater);
-REL_1ARG(isfinite);
-REL_1ARG(isinf);
-REL_1ARG(isnan);
-REL_1ARG(isnormal);
-REL_2ARG(isordered);
-REL_2ARG(isunordered);
-REL_1ARG(signbit);
-
-#define SELECT_TYPE(type, ctype)               \
-  type __OVERLOAD__ select(type, type, ctype); \
-  type __OVERLOAD__ select(type, type, u##ctype);
-#define SELECT(type, ctype)      \
-  SELECT_TYPE(type, ctype)       \
-  SELECT_TYPE(type##2, ctype##2) \
-  SELECT_TYPE(type##3, ctype##3) \
-  SELECT_TYPE(type##4, ctype##4) \
-  SELECT_TYPE(type##8, ctype##8) \
-  SELECT_TYPE(type##16, ctype##16);
-SELECT(char, char);
-SELECT(uchar, char);
-SELECT(short, short);
-SELECT(ushort, short);
-SELECT(int, int);
-SELECT(uint, int);
-SELECT(long, long);
-SELECT(ulong, long);
-SELECT(float, int);
-SELECT(double, long);
-
-
-///////////////////////////////
-// Synchronization Functions //
-///////////////////////////////
-
-typedef uint cl_mem_fence_flags;
-#define CLK_LOCAL_MEM_FENCE  (1<<0)
-#define CLK_GLOBAL_MEM_FENCE (1<<1)
-
-void barrier(cl_mem_fence_flags);
-void mem_fence(cl_mem_fence_flags);
-void read_mem_fence(cl_mem_fence_flags);
-void write_mem_fence(cl_mem_fence_flags);
-
-
-//////////////////////////////////////////
-// Vector Data Load and Store Functions //
-//////////////////////////////////////////
-
-#define VLOAD_ADDRSPACE(type, width)                                    \
-  type##width __OVERLOAD__ vload##width(size_t, const __private type*); \
-  type##width __OVERLOAD__ vload##width(size_t, const __local type*);   \
-  type##width __OVERLOAD__ vload##width(size_t, const __global type*);  \
-  type##width __OVERLOAD__ vload##width(size_t, const __constant type*);
-
-#define VSTORE_ADDRSPACE(type, width)                                   \
-  void __OVERLOAD__ vstore##width(type##width, size_t, __local type*);  \
-  void __OVERLOAD__ vstore##width(type##width, size_t, __global type*); \
-  void __OVERLOAD__ vstore##width(type##width, size_t, __private type*);
-
-#define V_ADDRSPACE(macro, type) \
-  macro(type, 2)                 \
-  macro(type, 3)                 \
-  macro(type, 4)                 \
-  macro(type, 8)                 \
-  macro(type, 16);
-
-#define VLOADSTORE(type)              \
-  V_ADDRSPACE(VLOAD_ADDRSPACE, type); \
-  V_ADDRSPACE(VSTORE_ADDRSPACE, type);
-
-VLOADSTORE(char);
-VLOADSTORE(uchar);
-VLOADSTORE(short);
-VLOADSTORE(ushort);
-VLOADSTORE(int);
-VLOADSTORE(uint);
-VLOADSTORE(long);
-VLOADSTORE(ulong);
-VLOADSTORE(float);
-VLOADSTORE(double);
-
-#define VLOAD_HALF_WIDTH(n)                                            \
-  float##n __OVERLOAD__ vload_half##n(size_t, const __private half*);  \
-  float##n __OVERLOAD__ vloada_half##n(size_t, const __private half*); \
-  float##n __OVERLOAD__ vload_half##n(size_t, const __local half*);    \
-  float##n __OVERLOAD__ vloada_half##n(size_t, const __local half*);   \
-  float##n __OVERLOAD__ vload_half##n(size_t, const __global half*);   \
-  float##n __OVERLOAD__ vloada_half##n(size_t, const __global half*);  \
-  float##n __OVERLOAD__ vload_half##n(size_t, const __constant half*); \
-  float##n __OVERLOAD__ vloada_half##n(size_t, const __constant half*);
-#define VSTORE_HALF_ADDRSPACE(func, type)                      \
-  void __OVERLOAD__ func(type, size_t, const __private half*); \
-  void __OVERLOAD__ func(type, size_t, const __local half*);   \
-  void __OVERLOAD__ func(type, size_t, const __global half*);  \
-  void __OVERLOAD__ func(type, size_t, const __constant half*);
-#define VSTORE_HALF_ROUND(func, type)      \
-  VSTORE_HALF_ADDRSPACE(func, type);       \
-  VSTORE_HALF_ADDRSPACE(func##_rte, type); \
-  VSTORE_HALF_ADDRSPACE(func##_rtz, type); \
-  VSTORE_HALF_ADDRSPACE(func##_rtp, type); \
-  VSTORE_HALF_ADDRSPACE(func##_rtn, type);
-#define VSTORE_HALF_WIDTH(n)                    \
-  VSTORE_HALF_ROUND(vstore_half##n, float##n);  \
-  VSTORE_HALF_ROUND(vstorea_half##n, float##n); \
-  VSTORE_HALF_ROUND(vstore_half##n, double##n); \
-  VSTORE_HALF_ROUND(vstorea_half##n, double##n);
-#define VLOADSTORE_HALF_WIDTH(n) \
-  VLOAD_HALF_WIDTH(n);           \
-  VSTORE_HALF_WIDTH(n);
-VLOADSTORE_HALF_WIDTH();
-VLOADSTORE_HALF_WIDTH(2);
-VLOADSTORE_HALF_WIDTH(3);
-VLOADSTORE_HALF_WIDTH(4);
-VLOADSTORE_HALF_WIDTH(8);
-VLOADSTORE_HALF_WIDTH(16);
-
-
-/////////////////////////
-// Work-Item Functions //
-/////////////////////////
-
-size_t get_global_id(uint dim);
-size_t get_global_size(uint dim);
-size_t get_global_offset(uint dim);
-size_t get_group_id(uint dim);
-size_t get_local_id(uint dim);
-size_t get_local_size(uint dim);
-size_t get_num_groups(uint dim);
-uint get_work_dim(void);
-
-
-
-/////////////////////
-// Other Functions //
-/////////////////////
-
-int printf(__constant char * restrict, ...);
-
-
-/////////////////
-// Conversions //
-/////////////////
-
-#define as_char( _x )  __builtin_astype( _x, char )
-#define as_char2( _x )  __builtin_astype( _x, char2 )
-#define as_char3( _x )  __builtin_astype( _x, char3 )
-#define as_char4( _x )  __builtin_astype( _x, char4 )
-#define as_char8( _x )  __builtin_astype( _x, char8 )
-#define as_char16( _x )  __builtin_astype( _x, char16 )
-#define as_uchar( _x )  __builtin_astype( _x, uchar )
-#define as_uchar2( _x )  __builtin_astype( _x, uchar2 )
-#define as_uchar3( _x )  __builtin_astype( _x, uchar3 )
-#define as_uchar4( _x )  __builtin_astype( _x, uchar4 )
-#define as_uchar8( _x )  __builtin_astype( _x, uchar8 )
-#define as_uchar16( _x )  __builtin_astype( _x, uchar16 )
-#define as_short( _x )  __builtin_astype( _x, short )
-#define as_short2( _x )  __builtin_astype( _x, short2 )
-#define as_short3( _x )  __builtin_astype( _x, short3 )
-#define as_short4( _x )  __builtin_astype( _x, short4 )
-#define as_short8( _x )  __builtin_astype( _x, short8 )
-#define as_short16( _x )  __builtin_astype( _x, short16 )
-#define as_ushort( _x )  __builtin_astype( _x, ushort )
-#define as_ushort2( _x )  __builtin_astype( _x, ushort2 )
-#define as_ushort3( _x )  __builtin_astype( _x, ushort3 )
-#define as_ushort4( _x )  __builtin_astype( _x, ushort4 )
-#define as_ushort8( _x )  __builtin_astype( _x, ushort8 )
-#define as_ushort16( _x )  __builtin_astype( _x, ushort16 )
-#define as_int( _x )  __builtin_astype( _x, int )
-#define as_int2( _x )  __builtin_astype( _x, int2 )
-#define as_int3( _x )  __builtin_astype( _x, int3 )
-#define as_int4( _x )  __builtin_astype( _x, int4 )
-#define as_int8( _x )  __builtin_astype( _x, int8 )
-#define as_int16( _x )  __builtin_astype( _x, int16 )
-#define as_uint( _x )  __builtin_astype( _x, uint )
-#define as_uint2( _x )  __builtin_astype( _x, uint2 )
-#define as_uint3( _x )  __builtin_astype( _x, uint3 )
-#define as_uint4( _x )  __builtin_astype( _x, uint4 )
-#define as_uint8( _x )  __builtin_astype( _x, uint8 )
-#define as_uint16( _x )  __builtin_astype( _x, uint16 )
-#define as_long( _x )  __builtin_astype( _x, long )
-#define as_long2( _x )  __builtin_astype( _x, long2 )
-#define as_long3( _x )  __builtin_astype( _x, long3 )
-#define as_long4( _x )  __builtin_astype( _x, long4 )
-#define as_long8( _x )  __builtin_astype( _x, long8 )
-#define as_long16( _x )  __builtin_astype( _x, long16 )
-#define as_ulong( _x )  __builtin_astype( _x, ulong )
-#define as_ulong2( _x )  __builtin_astype( _x, ulong2 )
-#define as_ulong3( _x )  __builtin_astype( _x, ulong3 )
-#define as_ulong4( _x )  __builtin_astype( _x, ulong4 )
-#define as_ulong8( _x )  __builtin_astype( _x, ulong8 )
-#define as_ulong16( _x )  __builtin_astype( _x, ulong16 )
-#define as_float( _x )  __builtin_astype( _x, float )
-#define as_float2( _x )  __builtin_astype( _x, float2 )
-#define as_float3( _x )  __builtin_astype( _x, float3 )
-#define as_float4( _x )  __builtin_astype( _x, float4 )
-#define as_float8( _x )  __builtin_astype( _x, float8 )
-#define as_float16( _x )  __builtin_astype( _x, float16 )
-#define as_double( _x )  __builtin_astype( _x, double )
-#define as_double2( _x )  __builtin_astype( _x, double2 )
-#define as_double3( _x )  __builtin_astype( _x, double3 )
-#define as_double4( _x )  __builtin_astype( _x, double4 )
-#define as_double8( _x )  __builtin_astype( _x, double8 )
-#define as_double16( _x )  __builtin_astype( _x, double16 )
-#define as_size_t( _x ) __builtin_astype( _x, size_t )
-#define as_ptrdiff_t( _x ) __builtin_astype( _x, ptrdiff_t )
-#define as_uintptr_t( _x ) __builtin_astype( _x, uintptr_t )
-#define as_intptr_t( _x ) __builtin_astype( _x, intptr_t )
-
-#define CONVERT_TYPE_SIZE(out, in)              \
-  out __OVERLOAD__ convert_##out(in);           \
-  out __OVERLOAD__ convert_##out##_rte(in);     \
-  out __OVERLOAD__ convert_##out##_rtz(in);     \
-  out __OVERLOAD__ convert_##out##_rtp(in);     \
-  out __OVERLOAD__ convert_##out##_rtn(in);     \
-  out __OVERLOAD__ convert_##out##_sat(in);     \
-  out __OVERLOAD__ convert_##out##_sat_rte(in); \
-  out __OVERLOAD__ convert_##out##_sat_rtz(in); \
-  out __OVERLOAD__ convert_##out##_sat_rtp(in); \
-  out __OVERLOAD__ convert_##out##_sat_rtn(in);
-#define CONVERT_TYPE(out, in)             \
-  CONVERT_TYPE_SIZE(out, in);             \
-  CONVERT_TYPE_SIZE(out##2, in##2);       \
-  CONVERT_TYPE_SIZE(out##3, in##3);       \
-  CONVERT_TYPE_SIZE(out##4, in##4);       \
-  CONVERT_TYPE_SIZE(out##8, in##8);       \
-  CONVERT_TYPE_SIZE(out##16, in##16);
-#define CONVERT(out)         \
-  CONVERT_TYPE(out, char);   \
-  CONVERT_TYPE(out, uchar);  \
-  CONVERT_TYPE(out, short);  \
-  CONVERT_TYPE(out, ushort); \
-  CONVERT_TYPE(out, int);    \
-  CONVERT_TYPE(out, uint);   \
-  CONVERT_TYPE(out, long);   \
-  CONVERT_TYPE(out, ulong);  \
-  CONVERT_TYPE(out, float);  \
-  CONVERT_TYPE(out, double);
-
-CONVERT(char);
-CONVERT(uchar);
-CONVERT(short);
-CONVERT(ushort);
-CONVERT(int);
-CONVERT(uint);
-CONVERT(long);
-CONVERT(ulong);
-CONVERT(float);
-CONVERT(double);
diff --git a/src/core/common.cpp b/src/core/common.cpp
index a4975c7..ea92196 100644
--- a/src/core/common.cpp
+++ b/src/core/common.cpp
@@ -288,6 +288,23 @@ namespace oclgrind
     return (value && !strcmp(value, "1"));
   }
 
+  unsigned getEnvInt(const char *var, int def, bool allowZero)
+  {
+    const char *value = getenv(var);
+    if (!value)
+      return def;
+    // Reject "", negative (strtoul negates "-N") and > UINT_MAX values too
+    char *next;
+    unsigned long result = strtoul(value, &next, 10);
+    if (next == value || *next || strchr(value, '-') || result == ULONG_MAX ||
+        result > UINT_MAX || (!allowZero && !result))
+    {
+      cerr << endl << "Oclgrind: Invalid value for " << var << endl;
+      abort();
+    }
+    return (unsigned)result;
+  }
+
   void dumpInstruction(ostream& out, const llvm::Instruction *instruction)
   {
     llvm::raw_os_ostream stream(out);
@@ -404,8 +421,7 @@ namespace oclgrind
     }
   }
 
-  const llvm::Instruction* getConstExprAsInstruction(
-    const llvm::ConstantExpr *expr)
+  llvm::Instruction* getConstExprAsInstruction(const llvm::ConstantExpr *expr)
   {
     // Get operands
     vector<llvm::Value*> valueOperands(expr->op_begin(), expr->op_end());
@@ -452,13 +468,8 @@ namespace oclgrind
       }
       else
       {
-#if LLVM_VERSION > 36
         return llvm::GetElementPtrInst::Create(nullptr,
                                                operands[0], operands.slice(1));
-#else
-        return llvm::GetElementPtrInst::Create(operands[0], operands.slice(1));
-#endif
-
       }
     case llvm::Instruction::ICmp:
     case llvm::Instruction::FCmp:
@@ -466,6 +477,8 @@ namespace oclgrind
         (llvm::Instruction::OtherOps)opcode,
         (llvm::CmpInst::Predicate)expr->getPredicate(),
         operands[0], operands[1]);
+    case llvm::Instruction::AddrSpaceCast:
+      FATAL_ERROR("Unsupported constant expression: addrspacecast");
     default:
       assert(expr->getNumOperands() == 2 && "Must be binary operator?");
 
@@ -706,6 +719,7 @@ namespace oclgrind
       cout << *(double*)data;
       break;
     case llvm::Type::IntegerTyID:
+      cout << dec;
       switch (size)
       {
       case 1:
@@ -766,6 +780,93 @@ namespace oclgrind
     }
   }
 
+  size_t resolveConstantPointer(const llvm::Value *ptr, TypedValueMap& values)
+  {
+    if (values.count(ptr))
+    {
+      // In the value map - just return the pointer
+      return values.at(ptr).getPointer();
+    }
+    else if (auto gep = llvm::dyn_cast<llvm::GEPOperator>(ptr))
+    {
+      // Get base address
+      size_t base = resolveConstantPointer(gep->getPointerOperand(), values);
+      const llvm::Type *ptrType = gep->getPointerOperandType();
+
+      // Get indices, each of which must be a constant integer
+      std::vector<int64_t> offsets;
+      for (auto opItr = gep->idx_begin(); opItr != gep->idx_end(); opItr++)
+      {
+        auto idx = llvm::dyn_cast<llvm::ConstantInt>(opItr->get());
+        if (!idx) { FATAL_ERROR("Unsupported non-constant GEP index"); }
+        offsets.push_back(idx->getSExtValue());
+      }
+
+      return resolveGEP(base, ptrType, offsets);
+    }
+    else if (auto bc = llvm::dyn_cast<llvm::BitCastOperator>(ptr))
+    {
+      // bitcast - no change to the source pointer
+      return resolveConstantPointer(bc->getOperand(0), values);
+    }
+    else if (ptr->getValueID() == llvm::Value::ConstantPointerNullVal)
+    {
+      return 0;
+    }
+    else
+    {
+      FATAL_ERROR("Unsupported constant pointer type: %d", ptr->getValueID());
+    }
+
+    return 0;
+  }
+
+  size_t resolveGEP(size_t base, const llvm::Type *ptrType,
+                    std::vector<int64_t>& offsets)
+  {
+    // Start at the base address and apply each index in turn, descending
+    // into the indexed type after every step
+    size_t address = base;
+    for (int64_t offset : offsets)
+    {
+      // Pointer, array and vector indices are scaled by the element size;
+      // a struct index is translated via getStructMemberOffset instead
+      if (ptrType->isPointerTy())
+      {
+        // Get pointer element size
+        const llvm::Type *elemType = ptrType->getPointerElementType();
+        address += offset*getTypeSize(elemType);
+        ptrType = elemType;
+      }
+      else if (ptrType->isArrayTy())
+      {
+        // Get array element size
+        const llvm::Type *elemType = ptrType->getArrayElementType();
+        address += offset*getTypeSize(elemType);
+        ptrType = elemType;
+      }
+      else if (ptrType->isVectorTy())
+      {
+        // Get vector element size
+        const llvm::Type *elemType = ptrType->getVectorElementType();
+        address += offset*getTypeSize(elemType);
+        ptrType = elemType;
+      }
+      else if (ptrType->isStructTy())
+      {
+        address +=
+          getStructMemberOffset((const llvm::StructType*)ptrType, offset);
+        ptrType = ptrType->getStructElementType(offset);
+      }
+      else
+      {
+        FATAL_ERROR("Unsupported GEP base type: %d", ptrType->getTypeID());
+      }
+    }
+
+    return address;
+  }
+
   FatalError::FatalError(const string& msg, const string& file, size_t line)
     : std::runtime_error(msg)
   {
@@ -808,6 +909,9 @@ namespace oclgrind
 
   uint8_t* MemoryPool::alloc(size_t size)
   {
+    if (size == 0)
+      return NULL;
+
     // Check if requested size larger than block size
     if (size > m_blockSize)
     {
@@ -817,6 +921,22 @@ namespace oclgrind
       return buffer;
     }
 
+    // Round up size to nearest power of two for alignment
+    // Taken from here:
+    //   http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+    unsigned align = size;
+    align--;
+    align |= align >> 1;
+    align |= align >> 2;
+    align |= align >> 4;
+    align |= align >> 8;
+    align |= align >> 16;
+    align++;
+
+    // Align offset to size of requested allocation
+    if (m_offset & (align-1))
+      m_offset += (align - (m_offset & (align-1)));
+
     // Check if enough space in current block
     if (m_offset + size > m_blockSize)
     {
diff --git a/src/core/common.h b/src/core/common.h
index b015cab..9217fc3 100644
--- a/src/core/common.h
+++ b/src/core/common.h
@@ -28,9 +28,6 @@
 #include <unordered_map>
 #include <vector>
 
-#define BIG_SEPARATOR   "================================"
-#define SMALL_SEPARATOR "--------------------------------"
-
 #if defined(_WIN32) && !defined(__MINGW32__)
 #define snprintf _snprintf
 #undef ERROR
@@ -46,6 +43,18 @@
 #define THREAD_LOCAL thread_local
 #endif
 
+#define CLK_NORMALIZED_COORDS_TRUE 0x0001
+
+#define CLK_ADDRESS_NONE 0x0000
+#define CLK_ADDRESS_CLAMP_TO_EDGE 0x0002
+#define CLK_ADDRESS_CLAMP 0x0004
+#define CLK_ADDRESS_REPEAT 0x0006
+#define CLK_ADDRESS_MIRRORED_REPEAT 0x0008
+#define CLK_ADDRESS_MASK 0x000E
+
+#define CLK_FILTER_NEAREST 0x0010
+#define CLK_FILTER_LINEAR 0x0020
+
 namespace llvm
 {
   class Constant;
@@ -148,6 +157,9 @@ namespace oclgrind
   // Check if an environment variable is set to 1
   bool checkEnv(const char *var);
 
+  // Get an environment variable as an integer
+  unsigned getEnvInt(const char *var, int def=0, bool allowZero=true);
+
   // Output an instruction in human-readable format
   void dumpInstruction(std::ostream& out, const llvm::Instruction *instruction);
 
@@ -158,8 +170,7 @@ namespace oclgrind
   void getConstantData(unsigned char *data, const llvm::Constant *constant);
 
   // Creates an instruction from a constant expression
-  const llvm::Instruction* getConstExprAsInstruction(
-    const llvm::ConstantExpr *expr);
+  llvm::Instruction* getConstExprAsInstruction(const llvm::ConstantExpr *expr);
 
   // Get the ConstantInt object for a Metadata node
   const llvm::ConstantInt* getMDAsConstInt(const llvm::Metadata *md);
@@ -188,6 +199,13 @@ namespace oclgrind
   // Print data in a human readable format (according to its type)
   void printTypedData(const llvm::Type *type, const unsigned char *data);
 
+  // Resolve a constant pointer, using a set of known constant values
+  size_t resolveConstantPointer(const llvm::Value *ptr, TypedValueMap& values);
+
+  // Resolve a GEP from a base address and list of offsets
+  size_t resolveGEP(size_t base, const llvm::Type *ptrType,
+                    std::vector<int64_t>& offsets);
+
   // Exception class for raising fatal errors
   class FatalError : std::runtime_error
   {
diff --git a/src/core/gen_clc_h.cmake b/src/core/gen_clc_h.cmake
deleted file mode 100644
index 43b4fa5..0000000
--- a/src/core/gen_clc_h.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-set(OUTPUT src/core/clc_h.cpp)
-
-file(WRITE ${OUTPUT} "extern const char CLC_H_DATA[] = \n\"")
-
-file(READ ${SOURCE_FILE} CLC_H)
-string(REGEX REPLACE "\\\\" "\\\\\\\\" CLC_H "${CLC_H}")
-string(REGEX REPLACE "\"" "\\\\\"" CLC_H "${CLC_H}")
-string(REGEX REPLACE "\n" "\\\\n\"\n\"" CLC_H "${CLC_H}")
-file(APPEND ${OUTPUT} "${CLC_H}")
-
-file(APPEND ${OUTPUT} "\";")
diff --git a/src/core/gen_clc_h.sh b/src/core/gen_clc_h.sh
deleted file mode 100755
index e9ce2b1..0000000
--- a/src/core/gen_clc_h.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-if [ $# -ne 2 ]
-then
-  echo "Usage: gen_clc_h.sh INPUT OUTPUT"
-  exit 1
-fi
-
-IN=$1
-OUT=$2
-
-echo "extern const char CLC_H_DATA[] =" >$OUT
-sed -e 's/\\/\\\\/g;s/"/\\"/g;s/^/"/;s/$/\\n"/' $IN >>$OUT
-if [ $? -ne 0 ]
-then
-  exit 1
-fi
-echo ";" >>$OUT
diff --git a/src/core/gen_opencl-c.h.cmake b/src/core/gen_opencl-c.h.cmake
new file mode 100644
index 0000000..af515ff
--- /dev/null
+++ b/src/core/gen_opencl-c.h.cmake
@@ -0,0 +1,14 @@
+set(OUTPUT src/core/opencl-c.h.cpp)
+
+# Load opencl-c.h
+file(READ ${SOURCE_FILE} OPENCL_C_H)
+
+# Replace each character with a C character literal, escaping as necessary
+string(REGEX REPLACE "(.)" "'\\1', " CONTENT "${OPENCL_C_H}")
+string(REGEX REPLACE "\n'" "\\\\n'\n" CONTENT "${CONTENT}")
+string(REGEX REPLACE "\\\\'" "\\\\\\\\'" CONTENT "${CONTENT}")
+
+# Write character array
+file(WRITE ${OUTPUT} "extern const char OPENCL_C_H_DATA[] = {\n")
+file(APPEND ${OUTPUT} "${CONTENT}")
+file(APPEND ${OUTPUT} "'\\0'};\n")
diff --git a/src/kernel/Simulation.cpp b/src/kernel/Simulation.cpp
index efa20a2..bbf37f7 100644
--- a/src/kernel/Simulation.cpp
+++ b/src/kernel/Simulation.cpp
@@ -149,15 +149,6 @@ bool Simulation::load(const char *filename)
     get(m_wgsize.y);
     get(m_wgsize.z);
 
-    // Ensure work-group size exactly divides NDRange
-    if (m_ndrange.x % m_wgsize.x ||
-        m_ndrange.y % m_wgsize.y ||
-        m_ndrange.z % m_wgsize.z)
-    {
-      cerr << "Work group size must divide NDRange exactly." << endl;
-      return false;
-    }
-
     // Open program file
     ifstream progFile;
     progFile.open(progFileName.c_str(), ios_base::in | ios_base::binary);
@@ -212,9 +203,15 @@ bool Simulation::load(const char *filename)
       return false;
     }
 
-    // Clear global memory
-    Memory *globalMemory = m_context->getGlobalMemory();
-    globalMemory->clear();
+    // Ensure work-group size exactly divides NDRange if necessary
+    if (m_program->requiresUniformWorkGroups() &&
+        (m_ndrange.x % m_wgsize.x ||
+         m_ndrange.y % m_wgsize.y ||
+         m_ndrange.z % m_wgsize.z))
+    {
+      cerr << "Work group size must divide NDRange exactly." << endl;
+      return false;
+    }
 
     // Parse kernel arguments
     m_dumpArguments.clear();
@@ -748,7 +745,7 @@ void Simulation::run(bool dumpGlobalMemory)
 
     switch (itr->type)
     {
-      DUMP_TYPE(TYPE_CHAR, char);
+      DUMP_TYPE(TYPE_CHAR, int8_t);
       DUMP_TYPE(TYPE_UCHAR, uint8_t);
       DUMP_TYPE(TYPE_SHORT, int16_t);
       DUMP_TYPE(TYPE_USHORT, uint16_t);
diff --git a/src/kernel/oclgrind-kernel.cpp b/src/kernel/oclgrind-kernel.cpp
index 1ce599d..86dd386 100644
--- a/src/kernel/oclgrind-kernel.cpp
+++ b/src/kernel/oclgrind-kernel.cpp
@@ -57,6 +57,15 @@ static bool parseArguments(int argc, char *argv[])
       }
       setEnvironment("OCLGRIND_BUILD_OPTIONS", argv[i]);
     }
+    else if (!strcmp(argv[i], "--constant-mem-size"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --constant-mem-size" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_CONSTANT_MEM_SIZE", argv[i]);
+    }
     else if (!strcmp(argv[i], "--data-races"))
     {
       setEnvironment("OCLGRIND_DATA_RACES", "1");
@@ -73,6 +82,15 @@ static bool parseArguments(int argc, char *argv[])
     {
       outputGlobalMemory = true;
     }
+    else if (!strcmp(argv[i], "--global-mem-size"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --global-mem-size" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_GLOBAL_MEM_SIZE", argv[i]);
+    }
     else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help"))
     {
       printUsage();
@@ -86,6 +104,15 @@ static bool parseArguments(int argc, char *argv[])
     {
       setEnvironment("OCLGRIND_INTERACTIVE", "1");
     }
+    else if (!strcmp(argv[i], "--local-mem-size"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --local-mem-size" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_LOCAL_MEM_SIZE", argv[i]);
+    }
     else if (!strcmp(argv[i], "--log"))
     {
       if (++i >= argc)
@@ -104,6 +131,15 @@ static bool parseArguments(int argc, char *argv[])
       }
       setEnvironment("OCLGRIND_MAX_ERRORS", argv[i]);
     }
+    else if (!strcmp(argv[i], "--max-wgsize"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --max-wgsize" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_MAX_WGSIZE", argv[i]);
+    }
     else if (!strcmp(argv[i], "--num-threads"))
     {
       if (++i >= argc)
@@ -148,7 +184,7 @@ static bool parseArguments(int argc, char *argv[])
       cout << endl;
       cout << "Oclgrind " PACKAGE_VERSION << endl;
       cout << endl;
-      cout << "Copyright (c) 2013-2016" << endl;
+      cout << "Copyright (c) 2013-2018" << endl;
       cout << "James Price and Simon McIntosh-Smith, University of Bristol"
            << endl;
       cout << "https://github.com/jrprice/Oclgrind" << endl;
@@ -190,40 +226,48 @@ static void printUsage()
     << "       oclgrind-kernel [--help | --version]" << endl
     << endl
     << "Options:" << endl
-    << "     --build-options  OPTIONS  "
-             "Additional options to pass to the OpenCL compiler" << endl
-    << "     --data-races              "
-             "Enable data-race detection" << endl
-    << "     --disable-pch             "
-             "Don't use precompiled headers" << endl
-    << "     --dump-spir               "
-             "Dump SPIR to /tmp/oclgrind_*.{ll,bc}" << endl
-    << "  -g --global-mem              "
-             "Output global memory at exit" << endl
-    << "  -h --help                    "
-             "Display usage information" << endl
-    << "     --inst-counts             "
-             "Output histograms of instructions executed" << endl
-    << "  -i --interactive             "
-             "Enable interactive mode" << endl
-    << "     --log            LOGFILE  "
-             "Redirect log/error messages to a file" << endl
-    << "     --max-errors     NUM      "
-             "Limit the number of error/warning messages" << endl
-    << "     --num-threads    NUM      "
-             "Set the number of worker threads to use" << endl
-    << "     --pch-dir        DIR      "
-             "Override directory containing precompiled headers" << endl
-    << "     --plugins        PLUGINS  "
-             "Load colon separated list of plugin libraries" << endl
-    << "  -q --quick                   "
-             "Only run first and last work-group" << endl
-    << "     --uniform-writes          "
-             "Don't suppress uniform write-write data-races" << endl
-    << "     --uninitialized           "
-             "Report usage of uninitialized values" << endl
-    << "  -v --version                 "
-             "Display version information" << endl
+    << "  --build-options     OPTIONS  "
+          "Additional options to pass to the OpenCL compiler" << endl
+    << "  --constant-mem-size BYTES    "
+          "Change the constant memory size of the device" << endl
+    << "  --data-races                 "
+          "Enable data-race detection" << endl
+    << "  --disable-pch                "
+          "Don't use precompiled headers" << endl
+    << "  --dump-spir                  "
+          "Dump SPIR to /tmp/oclgrind_*.{ll,bc}" << endl
+    << "  --global-mem [-g]            "
+          "Output global memory at exit" << endl
+    << "  --global-mem-size   BYTES    "
+          "Change the global memory size of the device" << endl
+    << "  --help [-h]                  "
+          "Display usage information" << endl
+    << "  --inst-counts                "
+          "Output histograms of instructions executed" << endl
+    << "  --interactive [-i]           "
+          "Enable interactive mode" << endl
+    << "  --local-mem-size    BYTES    "
+          "Change the local memory size of the device" << endl
+    << "  --log               LOGFILE  "
+          "Redirect log/error messages to a file" << endl
+    << "  --max-errors        NUM      "
+          "Limit the number of error/warning messages" << endl
+    << "  --max-wgsize        WGSIZE   "
+          "Change the maximum work-group size of the device" << endl
+    << "  --num-threads       NUM      "
+          "Set the number of worker threads to use" << endl
+    << "  --pch-dir           DIR      "
+          "Override directory containing precompiled headers" << endl
+    << "  --plugins           PLUGINS  "
+          "Load colon separated list of plugin libraries" << endl
+    << "  --quick [-q]                 "
+          "Only run first and last work-group" << endl
+    << "  --uniform-writes             "
+          "Don't suppress uniform write-write data-races" << endl
+    << "  --uninitialized              "
+          "Report usage of uninitialized values" << endl
+    << "  --version [-v]               "
+          "Display version information" << endl
     << endl
     << "For more information, please visit the Oclgrind wiki page:" << endl
     << "-> https://github.com/jrprice/Oclgrind/wiki" << endl
diff --git a/src/plugins/InstructionCounter.cpp b/src/plugins/InstructionCounter.cpp
index ed977ec..de61c80 100644
--- a/src/plugins/InstructionCounter.cpp
+++ b/src/plugins/InstructionCounter.cpp
@@ -27,16 +27,24 @@ using namespace std;
 #define COUNTED_STORE_BASE (COUNTED_LOAD_BASE + 8)
 #define COUNTED_CALL_BASE  (COUNTED_STORE_BASE + 8)
 
+THREAD_LOCAL InstructionCounter::WorkerState
+  InstructionCounter::m_state = {NULL};
+
 static bool compareNamedCount(pair<string,size_t> a, pair<string,size_t> b)
 {
-  return a.second > b.second;
+  if (a.second > b.second)
+    return true;
+  else if (a.second < b.second)
+    return false;
+  else
+    return a.first < b.first;
 }
 
 string InstructionCounter::getOpcodeName(unsigned opcode) const
 {
   if (opcode >= COUNTED_CALL_BASE)
   {
-    // Get functon name
+    // Get function name
     unsigned index = opcode - COUNTED_CALL_BASE;
     assert(index < m_functions.size());
     return "call " + m_functions[index]->getName().str() + "()";
@@ -92,7 +100,7 @@ void InstructionCounter::instructionExecuted(
 
     // Count total number of bytes loaded/stored
     unsigned bytes = getTypeSize(type->getPointerElementType());
-    m_memopBytes[opcode-COUNTED_LOAD_BASE] += bytes;
+    (*m_state.memopBytes)[opcode-COUNTED_LOAD_BASE] += bytes;
   }
   else if (opcode == llvm::Instruction::Call)
   {
@@ -102,35 +110,29 @@ void InstructionCounter::instructionExecuted(
     if (function)
     {
       vector<const llvm::Function*>::iterator itr =
-        find(m_functions.begin(), m_functions.end(), function);
-      if (itr == m_functions.end())
+        find(m_state.functions->begin(), m_state.functions->end(), function);
+      if (itr == m_state.functions->end())
       {
-        opcode = COUNTED_CALL_BASE + m_functions.size();
-        m_functions.push_back(function);
+        opcode = COUNTED_CALL_BASE + m_state.functions->size();
+        m_state.functions->push_back(function);
       }
       else
       {
-        opcode = COUNTED_CALL_BASE + (itr - m_functions.begin());
+        opcode = COUNTED_CALL_BASE + (itr - m_state.functions->begin());
       }
     }
   }
 
-  if (opcode >= m_instructionCounts.size())
+  if (opcode >= m_state.instCounts->size())
   {
-    m_instructionCounts.resize(opcode+1);
+    m_state.instCounts->resize(opcode+1);
   }
-  m_instructionCounts[opcode]++;
-}
-
-bool InstructionCounter::isThreadSafe() const
-{
-  return false;
+  (*m_state.instCounts)[opcode]++;
 }
 
 void InstructionCounter::kernelBegin(const KernelInvocation *kernelInvocation)
 {
   m_instructionCounts.clear();
-  m_instructionCounts.resize(COUNTED_CALL_BASE);
 
   m_memopBytes.clear();
   m_memopBytes.resize(16);
@@ -182,3 +184,65 @@ void InstructionCounter::kernelEnd(const KernelInvocation *kernelInvocation)
   // Restore locale
   cout.imbue(previousLocale);
 }
+
+void InstructionCounter::workGroupBegin(const WorkGroup *workGroup)
+{
+  // Create worker state if haven't already
+  if (!m_state.instCounts)
+  {
+    m_state.instCounts = new vector<size_t>;
+    m_state.memopBytes = new vector<size_t>;
+    m_state.functions = new vector<const llvm::Function*>;
+  }
+
+  m_state.instCounts->clear();
+  m_state.instCounts->resize(COUNTED_CALL_BASE);
+
+  m_state.memopBytes->clear();
+  m_state.memopBytes->resize(16);
+
+  m_state.functions->clear();
+}
+
+void InstructionCounter::workGroupComplete(const WorkGroup *workGroup)
+{
+  lock_guard<mutex> lock(m_mtx);
+
+  if (m_state.instCounts->size() > m_instructionCounts.size())
+    m_instructionCounts.resize(m_state.instCounts->size());
+
+  // Merge instruction counts into global list
+  for (unsigned i = 0; i < m_state.instCounts->size(); i++)
+  {
+    if (m_state.instCounts->at(i) == 0)
+      continue;
+
+    // Merge functions into global list
+    unsigned opcode = i;
+    if (i >= COUNTED_CALL_BASE)
+    {
+      const llvm::Function *func = m_state.functions->at(i - COUNTED_CALL_BASE);
+      vector<const llvm::Function*>::iterator itr =
+        find(m_functions.begin(), m_functions.end(), func);
+      if (itr == m_functions.end())
+      {
+        opcode = COUNTED_CALL_BASE + m_functions.size();
+        m_functions.push_back(func);
+      }
+      else
+      {
+        opcode = COUNTED_CALL_BASE + (itr - m_functions.begin());
+      }
+    }
+
+    // A function first seen by this worker is appended to m_functions, so its
+    // global opcode can lie past the end of m_instructionCounts: grow first.
+    if (opcode >= m_instructionCounts.size())
+      m_instructionCounts.resize(opcode+1);
+    m_instructionCounts[opcode] += m_state.instCounts->at(i);
+  }
+
+  // Merge memory transfer sizes into global list
+  for (unsigned i = 0; i < m_state.memopBytes->size(); i++)
+    m_memopBytes[i] += m_state.memopBytes->at(i);
+}
diff --git a/src/plugins/InstructionCounter.h b/src/plugins/InstructionCounter.h
index e6f3646..3722c64 100644
--- a/src/plugins/InstructionCounter.h
+++ b/src/plugins/InstructionCounter.h
@@ -8,6 +8,8 @@
 
 #include "core/Plugin.h"
 
+#include <mutex>
+
 namespace llvm
 {
   class Function;
@@ -25,14 +27,24 @@ namespace oclgrind
                                      const TypedValue& result) override;
     virtual void kernelBegin(const KernelInvocation *kernelInvocation) override;
     virtual void kernelEnd(const KernelInvocation *kernelInvocation) override;
-
-    virtual bool isThreadSafe() const override;
+    virtual void workGroupBegin(const WorkGroup *workGroup) override;
+    virtual void workGroupComplete(const WorkGroup *workGroup) override;
 
   private:
     std::vector<size_t> m_instructionCounts;
     std::vector<size_t> m_memopBytes;
     std::vector<const llvm::Function*> m_functions;
 
+    struct WorkerState
+    {
+      std::vector<size_t> *instCounts;
+      std::vector<size_t> *memopBytes;
+      std::vector<const llvm::Function*> *functions;
+    };
+    static THREAD_LOCAL WorkerState m_state;
+
+    std::mutex m_mtx;
+
     std::string getOpcodeName(unsigned opcode) const;
   };
 }
diff --git a/src/plugins/InteractiveDebugger.cpp b/src/plugins/InteractiveDebugger.cpp
index 012e009..79ca54e 100644
--- a/src/plugins/InteractiveDebugger.cpp
+++ b/src/plugins/InteractiveDebugger.cpp
@@ -14,6 +14,11 @@
 
 #if !defined(_WIN32) || defined(__MINGW32__)
 #include <signal.h>
+#include <unistd.h>
+#else
+#include <io.h>
+#define isatty _isatty
+#define STDIN_FILENO _fileno(stdin)
 #endif
 
 #if HAVE_READLINE
@@ -109,32 +114,40 @@ void InteractiveDebugger::instructionExecuted(
   m_continue     = false;
   m_next         = false;
 
+  bool interactive = isatty(STDIN_FILENO);
   while (true)
   {
     // Prompt for command
     bool eof = false;
     string cmd;
   #if HAVE_READLINE
-    char *line = readline("(oclgrind) ");
-    if (line)
+    if (interactive)
     {
-      cmd = line;
-      free(line);
+      char *line = readline("(oclgrind) ");
+      if (line)
+      {
+        cmd = line;
+        free(line);
+      }
+      else
+      {
+        eof = true;
+      }
     }
     else
+  #endif
     {
-      eof = true;
+      if (interactive)
+        cout << "(oclgrind) " << flush;
+      getline(cin, cmd);
+      eof = cin.eof();
     }
-  #else
-    cout << "(oclgrind) " << flush;
-    getline(cin, cmd);
-    eof = cin.eof();
-  #endif
 
     // Quit on EOF
     if (eof)
     {
-      cout << "(quit)" << endl;
+      if (interactive)
+        cout << "(quit)" << endl;
       quit(vector<string>());
       return;
     }
@@ -153,7 +166,8 @@ void InteractiveDebugger::instructionExecuted(
     }
 
   #if HAVE_READLINE
-    add_history(cmd.c_str());
+    if (interactive)
+      add_history(cmd.c_str());
   #endif
 
     // Find command in map and execute
@@ -225,13 +239,7 @@ size_t InteractiveDebugger::getLineNumber(
   llvm::MDNode *md = instruction->getMetadata("dbg");
   if (md)
   {
-#if LLVM_VERSION > 36
-    llvm::DILocation *loc = (llvm::DILocation*)md;
-    return loc->getLine();
-#else
-    llvm::DILocation loc((llvm::MDLocation*)md);
-    return loc.getLineNumber();
-#endif
+    return ((llvm::DILocation*)md)->getLine();
   }
   return 0;
 }
@@ -832,126 +840,15 @@ bool InteractiveDebugger::print(vector<string> args)
   for (unsigned i = 1; i < args.size(); i++)
   {
     cout << args[i] << " = ";
-
-    // Check for subscript operator
-    size_t start = args[i].find("[");
-    if (start != string::npos)
+    try
     {
-      // Find end of subscript
-      size_t end = args[i].find(']');
-      if (end == string::npos)
-      {
-        cout << "missing ']'" << endl;
-        return false;
-      }
-      if (end != args[i].length() - 1)
-      {
-        cout << "invalid variable" << endl;
-        return false;
-      }
-
-      // Parse index value
-      size_t index = 0;
-      string var = args[i].substr(0, start);
-      stringstream ss(args[i].substr(start+1, end-start-1));
-      ss >> index;
-      if (!ss.eof())
-      {
-        cout << "invalid index" << endl;
-        return false;
-      }
-
-      // Get variable value and type
-      const llvm::Value *ptr = workItem->getVariable(var);
-      if (!ptr)
-      {
-        cout << "not found" << endl;
-        return false;
-      }
-
-      const llvm::Type *ptrType = ptr->getType();
-      unsigned addrSpace = ptrType->getPointerAddressSpace();
-
-      // Check for alloca instruction, in which case look at allocated type
-      bool alloca = false;
-      if (ptr->getValueID() == llvm::Value::GlobalVariableVal)
-      {
-        ptrType = ptrType->getPointerElementType();
-      }
-      if (ptr->getValueID() >= llvm::Value::InstructionVal &&
-          ((llvm::Instruction*)ptr)->getOpcode() == llvm::Instruction::Alloca)
-      {
-        ptrType = ((const llvm::AllocaInst*)ptr)->getAllocatedType();
-        if (ptrType->isPointerTy())
-          addrSpace = ptrType->getPointerAddressSpace();
-        alloca = true;
-      }
-
-      // Ensure type is a pointer
-      if (!ptrType->isPointerTy() && !ptrType->isArrayTy())
-      {
-        cout << "not a pointer" << endl;
-        return false;
-      }
-
-      // Get base address
-      size_t base = *(size_t*)workItem->getValueData(ptr);
-      if (alloca)
-      {
-        // Load base address from private memory
-        workItem->getPrivateMemory()->load((unsigned char*)&base,
-                                           base, sizeof(size_t));
-      }
-
-      // Get target memory object
-      Memory *memory = NULL;
-      switch (addrSpace)
-      {
-      case AddrSpacePrivate:
-        memory = workItem->getPrivateMemory();
-        break;
-      case AddrSpaceGlobal:
-      case AddrSpaceConstant:
-        memory = m_context->getGlobalMemory();
-        break;
-      case AddrSpaceLocal:
-        memory = m_kernelInvocation->getCurrentWorkGroup()->getLocalMemory();
-        break;
-      default:
-        cout << "invalid address space" << endl;
-        return false;
-      }
-
-      // Get element type
-      const llvm::Type *elemType = ptrType->getPointerElementType();
-      unsigned elemSize = getTypeSize(elemType);
-
-      // Load data
-      if (!memory->isAddressValid(base + index*elemSize, elemSize))
-      {
-        cout << "invalid memory address" << endl;
-      }
-      else
-      {
-        // Print data
-        void *data = (void*)memory->getPointer(base+index*elemSize);
-        printTypedData(elemType, (unsigned char*)data);
-        cout << endl;
-      }
+      workItem->printExpression(args[i]);
     }
-    else
+    catch (FatalError err)
     {
-      try
-      {
-        if (!workItem->printVariable(args[i]))
-          cout << "not found";
-      }
-      catch (FatalError err)
-      {
-        cout << "not found";
-      }
-      cout << endl;
+      cout << "fatal error: " << err.what();
     }
+    cout << endl;
   }
 
   return false;
diff --git a/src/plugins/Logger.cpp b/src/plugins/Logger.cpp
index acc8896..ba49233 100644
--- a/src/plugins/Logger.cpp
+++ b/src/plugins/Logger.cpp
@@ -38,17 +38,7 @@ Logger::Logger(const Context *context)
     }
   }
 
-  m_maxErrors = DEFAULT_MAX_ERRORS;
-  const char *maxErrors = getenv("OCLGRIND_MAX_ERRORS");
-  if (maxErrors)
-  {
-    char *next;
-    m_maxErrors = strtoul(maxErrors, &next, 10);
-    if (strlen(next))
-    {
-      cerr << "Oclgrind: Invalid value for OCLGRIND_MAX_ERRORS" << endl;
-    }
-  }
+  m_maxErrors = getEnvInt("OCLGRIND_MAX_ERRORS", DEFAULT_MAX_ERRORS);
 }
 
 Logger::~Logger()
diff --git a/src/plugins/RaceDetector.cpp b/src/plugins/RaceDetector.cpp
index 8c38907..770373b 100644
--- a/src/plugins/RaceDetector.cpp
+++ b/src/plugins/RaceDetector.cpp
@@ -375,7 +375,7 @@ void RaceDetector::registerAccess(const Memory *memory,
     index = STATE(workGroup).wiLocal.size() - 1;
   }
 
-  AccessMap& accesess = (addrSpace == AddrSpaceGlobal) ?
+  AccessMap& accesses = (addrSpace == AddrSpaceGlobal) ?
     STATE(workGroup).wiGlobal[index] :
     STATE(workGroup).wiLocal[index];
 
@@ -384,7 +384,7 @@ void RaceDetector::registerAccess(const Memory *memory,
     if (storeData)
       access.setStoreData(storeData[i]);
 
-    insert(accesess[address+i], access);
+    insert(accesses[address+i], access);
   }
 }
 
diff --git a/src/runtime/icd.h b/src/runtime/icd.h
index 6a2d207..e35ec87 100644
--- a/src/runtime/icd.h
+++ b/src/runtime/icd.h
@@ -148,6 +148,10 @@ struct _cl_platform_id
 struct _cl_device_id
 {
   void **dispatch;
+  size_t globalMemSize;
+  size_t constantMemSize;
+  size_t localMemSize;
+  size_t maxWGSize;
 };
 
 struct _cl_context
diff --git a/src/runtime/oclgrind.cpp b/src/runtime/oclgrind.cpp
index e547bb2..0d357c8 100644
--- a/src/runtime/oclgrind.cpp
+++ b/src/runtime/oclgrind.cpp
@@ -212,6 +212,15 @@ static bool parseArguments(int argc, char *argv[])
     {
       setEnvironment("OCLGRIND_CHECK_API", "1");
     }
+    else if (!strcmp(argv[i], "--constant-mem-size"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --constant-mem-size" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_CONSTANT_MEM_SIZE", argv[i]);
+    }
     else if (!strcmp(argv[i], "--data-races"))
     {
       setEnvironment("OCLGRIND_DATA_RACES", "1");
@@ -224,6 +233,15 @@ static bool parseArguments(int argc, char *argv[])
     {
       setEnvironment("OCLGRIND_DUMP_SPIR", "1");
     }
+    else if (!strcmp(argv[i], "--global-mem-size"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --global-mem-size" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_GLOBAL_MEM_SIZE", argv[i]);
+    }
     else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help"))
     {
       printUsage();
@@ -237,6 +255,15 @@ static bool parseArguments(int argc, char *argv[])
     {
       setEnvironment("OCLGRIND_INTERACTIVE", "1");
     }
+    else if (!strcmp(argv[i], "--local-mem-size"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --local-mem-size" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_LOCAL_MEM_SIZE", argv[i]);
+    }
     else if (!strcmp(argv[i], "--log"))
     {
       if (++i >= argc)
@@ -255,6 +282,15 @@ static bool parseArguments(int argc, char *argv[])
       }
       setEnvironment("OCLGRIND_MAX_ERRORS", argv[i]);
     }
+    else if (!strcmp(argv[i], "--max-wgsize"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --max-wgsize" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_MAX_WGSIZE", argv[i]);
+    }
     else if (!strcmp(argv[i], "--num-threads"))
     {
       if (++i >= argc)
@@ -299,7 +335,7 @@ static bool parseArguments(int argc, char *argv[])
       cout << endl;
       cout << "Oclgrind " PACKAGE_VERSION << endl;
       cout << endl;
-      cout << "Copyright (c) 2013-2016" << endl;
+      cout << "Copyright (c) 2013-2018" << endl;
       cout << "James Price and Simon McIntosh-Smith, University of Bristol"
            << endl;
       cout << "https://github.com/jrprice/Oclgrind" << endl;
@@ -346,6 +382,22 @@ static bool parseArguments(int argc, char *argv[])
   return true;
 }
 
+static void stripLastComponent(string& path)
+{
+  size_t slash;
+#if defined(_WIN32) && !defined(__MINGW32__)
+  if ((slash = path.find_last_of('\\')) == string::npos)
+#else
+  if ((slash = path.find_last_of('/')) == string::npos)
+#endif
+  {
+    cerr << "[Oclgrind] Failed to get path to library directory" << endl;
+    exit(1);
+  }
+
+  path.resize(slash);
+}
+
 static string getLibDirPath()
 {
   string libdir;
@@ -376,23 +428,17 @@ static string getLibDirPath()
   libdir = path;
 #endif
 
-  // Remove executable filename and containing directory
-  size_t slash;
-  for (int i = 0; i < 2; i++)
-  {
-#if defined(_WIN32) && !defined(__MINGW32__)
-    if ((slash = libdir.find_last_of('\\')) == string::npos)
-#else
-    if ((slash = libdir.find_last_of('/')) == string::npos)
-#endif
-      cerr << "[Oclgrind] Failed to get path to library directory" << endl;
+  // Remove executable filename
+  stripLastComponent(libdir);
 
-    libdir.resize(slash);
+  const char *testing = getenv("OCLGRIND_TESTING");
+  if (!testing)
+  {
+    // Remove containing directory and append library directory
+    stripLastComponent(libdir);
+    libdir += "/lib" LIBDIR_SUFFIX;
   }
 
-  // Append library directory
-  libdir += "/lib" LIBDIR_SUFFIX;
-
   return libdir;
 }
 
@@ -403,40 +449,48 @@ static void printUsage()
     << "       oclgrind [--help | --version]" << endl
     << endl
     << "Options:" << endl
-    << "     --build-options  OPTIONS  "
-             "Additional options to pass to the OpenCL compiler" << endl
-    << "     --check-api               "
-             "Report errors on API calls" << endl
-    << "     --data-races              "
-             "Enable data-race detection" << endl
-    << "     --disable-pch             "
-             "Don't use precompiled headers" << endl
-    << "     --dump-spir               "
-             "Dump SPIR to /tmp/oclgrind_*.{ll,bc}" << endl
-    << "  -h --help                    "
-             "Display usage information" << endl
-    << "     --inst-counts             "
-             "Output histograms of instructions executed" << endl
-    << "  -i --interactive             "
-             "Enable interactive mode" << endl
-    << "     --log            LOGFILE  "
-             "Redirect log/error messages to a file" << endl
-    << "     --max-errors     NUM      "
-             "Limit the number of error/warning messages" << endl
-    << "     --num-threads    NUM      "
-             "Set the number of worker threads to use" << endl
-    << "     --pch-dir        DIR      "
-             "Override directory containing precompiled headers" << endl
-    << "     --plugins        PLUGINS  "
-             "Load colon separated list of plugin libraries" << endl
-    << "  -q --quick                   "
-             "Only run first and last work-group" << endl
-    << "     --uniform-writes          "
-             "Don't suppress uniform write-write data-races" << endl
-    << "     --uninitialized           "
-             "Report usage of uninitialized values" << endl
-    << "  -v --version                 "
-             "Display version information" << endl
+    << "  --build-options     OPTIONS  "
+          "Additional options to pass to the OpenCL compiler" << endl
+    << "  --check-api                  "
+          "Report errors on API calls"  << endl
+    << "  --constant-mem-size BYTES    "
+          "Change the constant memory size of the device" << endl
+    << "  --data-races                 "
+          "Enable data-race detection" << endl
+    << "  --disable-pch                "
+          "Don't use precompiled headers" << endl
+    << "  --dump-spir                  "
+          "Dump SPIR to /tmp/oclgrind_*.{ll,bc}" << endl
+    << "  --global-mem-size   BYTES    "
+          "Change the global memory size of the device" << endl
+    << "  --help [-h]                  "
+          "Display usage information" << endl
+    << "  --inst-counts                "
+          "Output histograms of instructions executed" << endl
+    << "  --interactive [-i]           "
+          "Enable interactive mode" << endl
+    << "  --local-mem-size    BYTES    "
+          "Change the local memory size of the device" << endl
+    << "  --log               LOGFILE  "
+          "Redirect log/error messages to a file" << endl
+    << "  --max-errors        NUM      "
+          "Limit the number of error/warning messages" << endl
+    << "  --max-wgsize        WGSIZE   "
+          "Change the maximum work-group size of the device" << endl
+    << "  --num-threads       NUM      "
+          "Set the number of worker threads to use" << endl
+    << "  --pch-dir           DIR      "
+          "Override directory containing precompiled headers" << endl
+    << "  --plugins           PLUGINS  "
+          "Load colon separated list of plugin libraries" << endl
+    << "  --quick [-q]                 "
+          "Only run first and last work-group" << endl
+    << "  --uniform-writes             "
+          "Don't suppress uniform write-write data-races" << endl
+    << "  --uninitialized              "
+          "Report usage of uninitialized values" << endl
+    << "  --version [-v]               "
+          "Display version information" << endl
     << endl
     << "For more information, please visit the Oclgrind wiki page:" << endl
     << "-> https://github.com/jrprice/Oclgrind/wiki" << endl
diff --git a/src/runtime/runtime.cpp b/src/runtime/runtime.cpp
index 55759c5..98d2c21 100644
--- a/src/runtime/runtime.cpp
+++ b/src/runtime/runtime.cpp
@@ -26,10 +26,10 @@
 
 using namespace std;
 
-#define MAX_GLOBAL_MEM_SIZE      (128 * 1048576)
-#define MAX_CONSTANT_BUFFER_SIZE (1048576)
-#define MAX_LOCAL_MEM_SIZE       (32768)
-#define MAX_WI_SIZE              (1024)
+#define DEFAULT_GLOBAL_MEM_SIZE   (128 * 1048576)
+#define DEFAULT_CONSTANT_MEM_SIZE (65536)
+#define DEFAULT_LOCAL_MEM_SIZE    (32768)
+#define DEFAULT_MAX_WGSIZE        (1024)
 
 #define PLATFORM_NAME       "Oclgrind"
 #define PLATFORM_VENDOR     "University of Bristol"
@@ -53,6 +53,8 @@ using namespace std;
   cl_khr_global_int32_extended_atomics \
   cl_khr_local_int32_base_atomics      \
   cl_khr_local_int32_extended_atomics  \
+  cl_khr_int64_base_atomics            \
+  cl_khr_int64_extended_atomics        \
   cl_khr_byte_addressable_store        \
   cl_khr_fp64"
 #define DEVICE_TYPE (CL_DEVICE_TYPE_CPU | \
@@ -154,8 +156,7 @@ namespace
     string error = oss.str();
 
     // Output message to stderr if required
-    const char *checkAPI = getenv("OCLGRIND_CHECK_API");
-    if (checkAPI && strcmp(checkAPI, "1") == 0)
+    if (oclgrind::checkEnv("OCLGRIND_CHECK_API"))
     {
       cerr << error << endl;
     }
@@ -216,6 +217,11 @@ clIcdGetPlatformIDsKHR
   cl_uint *num_platforms
 )
 {
+  if (platforms && num_entries < 1)
+  {
+    ReturnError(NULL, CL_INVALID_VALUE);
+  }
+
   if (!m_platform)
   {
     m_platform = new _cl_platform_id;
@@ -223,9 +229,20 @@ clIcdGetPlatformIDsKHR
 
     m_device = new _cl_device_id;
     m_device->dispatch = m_dispatchTable;
+    m_device->globalMemSize =
+      oclgrind::getEnvInt("OCLGRIND_GLOBAL_MEM_SIZE",
+                          DEFAULT_GLOBAL_MEM_SIZE, false);
+    m_device->constantMemSize =
+      oclgrind::getEnvInt("OCLGRIND_CONSTANT_MEM_SIZE",
+                          DEFAULT_CONSTANT_MEM_SIZE, false);
+    m_device->localMemSize =
+      oclgrind::getEnvInt("OCLGRIND_LOCAL_MEM_SIZE",
+                          DEFAULT_LOCAL_MEM_SIZE, false);
+    m_device->maxWGSize =
+      oclgrind::getEnvInt("OCLGRIND_MAX_WGSIZE", DEFAULT_MAX_WGSIZE, false);
   }
 
-  if (num_entries > 0)
+  if (platforms)
   {
     platforms[0] = m_platform;
   }
@@ -242,14 +259,6 @@ clIcdGetPlatformIDsKHR
 // OpenCL Runtime API Definitions //
 ////////////////////////////////////
 
-#ifndef CL_USE_DEPRECATED_OPENCL_1_0_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
-#endif
-
-#ifndef CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#endif
-
 CL_API_ENTRY void* CL_API_CALL
 clGetExtensionFunctionAddress
 (
@@ -407,6 +416,7 @@ clGetDeviceInfo
     cl_platform_id clplatid;
     cl_device_partition_property cldevpartprop;
     cl_device_affinity_domain cldevaffdom;
+    cl_device_svm_capabilities svm;
   } result_data;
   // The result is actually a string that needs copying
   const char* str = 0;
@@ -431,13 +441,13 @@ clGetDeviceInfo
     break;
   case CL_DEVICE_MAX_WORK_GROUP_SIZE:
     result_size = sizeof(size_t);
-    result_data.sizet = MAX_WI_SIZE;
+    result_data.sizet = m_device->maxWGSize;
     break;
   case CL_DEVICE_MAX_WORK_ITEM_SIZES:
     result_size = 3*sizeof(size_t);
-    result_data.sizet3[0] = MAX_WI_SIZE;
-    result_data.sizet3[1] = MAX_WI_SIZE;
-    result_data.sizet3[2] = MAX_WI_SIZE;
+    result_data.sizet3[0] = m_device->maxWGSize;
+    result_data.sizet3[1] = m_device->maxWGSize;
+    result_data.sizet3[2] = m_device->maxWGSize;
     break;
   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
@@ -462,11 +472,15 @@ clGetDeviceInfo
     break;
   case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
     result_size = sizeof(cl_uint);
-    result_data.cluint = 8;
+    result_data.cluint = 64;
+    break;
+  case CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 64;
     break;
   case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
     result_size = sizeof(cl_ulong);
-    result_data.clulong = MAX_GLOBAL_MEM_SIZE;
+    result_data.clulong = m_device->globalMemSize;
     break;
   case CL_DEVICE_IMAGE2D_MAX_WIDTH:
   case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
@@ -489,7 +503,27 @@ clGetDeviceInfo
     break;
   case CL_DEVICE_MAX_SAMPLERS:
     result_size = sizeof(cl_uint);
-    result_data.sizet = 16;
+    result_data.cluint = 16;
+    break;
+  case CL_DEVICE_IMAGE_PITCH_ALIGNMENT:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+  case CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 4;
+    break;
+  case CL_DEVICE_MAX_PIPE_ARGS:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 16;
+    break;
+  case CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+  case CL_DEVICE_PIPE_MAX_PACKET_SIZE:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1024;
     break;
   case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
     result_size = sizeof(cl_uint);
@@ -518,23 +552,31 @@ clGetDeviceInfo
     break;
   case CL_DEVICE_GLOBAL_MEM_SIZE:
     result_size = sizeof(cl_ulong);
-    result_data.clulong = MAX_GLOBAL_MEM_SIZE;
+    result_data.clulong = device->globalMemSize;
     break;
   case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
     result_size = sizeof(cl_ulong);
-    result_data.clulong = MAX_CONSTANT_BUFFER_SIZE;
+    result_data.clulong = device->constantMemSize;
     break;
   case CL_DEVICE_MAX_CONSTANT_ARGS:
     result_size = sizeof(cl_uint);
     result_data.cluint = 1024;
     break;
+  case CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE:
+    result_size = sizeof(size_t);
+    result_data.sizet = 64 * 1024;
+    break;
+  case CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE:
+    result_size = sizeof(size_t);
+    result_data.sizet = device->globalMemSize;
+    break;
   case CL_DEVICE_LOCAL_MEM_TYPE:
     result_size = sizeof(cl_device_local_mem_type);
     result_data.devicelocalmemtype = CL_LOCAL;
     break;
   case CL_DEVICE_LOCAL_MEM_SIZE:
     result_size = sizeof(cl_ulong);
-    result_data.clulong = MAX_LOCAL_MEM_SIZE;
+    result_data.clulong = device->localMemSize;
     break;
   case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
     result_size = sizeof(cl_bool);
@@ -546,7 +588,11 @@ clGetDeviceInfo
     break;
   case CL_DEVICE_ENDIAN_LITTLE:
     result_size = sizeof(cl_bool);
+#if IS_BIG_ENDIAN
+    result_data.clbool = CL_FALSE;
+#else
     result_data.clbool = CL_TRUE;
+#endif
     break;
   case CL_DEVICE_AVAILABLE:
     result_size = sizeof(cl_bool);
@@ -560,10 +606,31 @@ clGetDeviceInfo
     result_size = sizeof(cl_device_exec_capabilities);
     result_data.cldevexeccap =  CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL;
     break;
-  case CL_DEVICE_QUEUE_PROPERTIES:
+  case CL_DEVICE_QUEUE_ON_HOST_PROPERTIES:
     result_size = sizeof(cl_command_queue_properties);
     result_data.clcmdqprop = CL_QUEUE_PROFILING_ENABLE;
     break;
+  case CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES:
+    result_size = sizeof(cl_command_queue_properties);
+    result_data.clcmdqprop =
+      CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
+    break;
+  case CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 16 * 1024;
+    break;
+  case CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 256 * 1024;
+    break;
+  case CL_DEVICE_MAX_ON_DEVICE_QUEUES:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+  case CL_DEVICE_MAX_ON_DEVICE_EVENTS:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1024;
+    break;
   case CL_DEVICE_NAME:
     result_size = sizeof(DEVICE_NAME);
     str = DEVICE_NAME;
@@ -669,6 +736,22 @@ clGetDeviceInfo
     result_size = sizeof(size_t);
     result_data.sizet = 1024;
     break;
+  case CL_DEVICE_SVM_CAPABILITIES:
+    result_size = sizeof(cl_device_svm_capabilities);
+    result_data.svm = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER;
+    break;
+  case CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
+  case CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
+  case CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
   case CL_DEVICE_SPIR_VERSIONS:
     result_size = sizeof(DEVICE_SPIR_VERSIONS);
     str = DEVICE_SPIR_VERSIONS;
@@ -1494,7 +1577,7 @@ clCreateImage2D
     0,
     0,
     0,
-    NULL
+    {NULL}
   };
   return clCreateImage(context, flags,
                        image_format, &desc,
@@ -1527,7 +1610,7 @@ clCreateImage3D
     image_slice_pitch,
     0,
     0,
-    NULL
+    {NULL}
   };
   return clCreateImage(context, flags,
                        image_format, &desc,
@@ -1944,7 +2027,7 @@ clCreateSampler
 
   if (normalized_coords)
   {
-    bitfield |= 0x0001;
+    bitfield |= CLK_NORMALIZED_COORDS_TRUE;
   }
 
   switch (addressing_mode)
@@ -1952,16 +2035,16 @@ clCreateSampler
     case CL_ADDRESS_NONE:
       break;
     case CL_ADDRESS_CLAMP_TO_EDGE:
-      bitfield |= 0x0002;
+      bitfield |= CLK_ADDRESS_CLAMP_TO_EDGE;
       break;
     case CL_ADDRESS_CLAMP:
-      bitfield |= 0x0004;
+      bitfield |= CLK_ADDRESS_CLAMP;
       break;
     case CL_ADDRESS_REPEAT:
-      bitfield |= 0x0006;
+      bitfield |= CLK_ADDRESS_REPEAT;
       break;
     case CL_ADDRESS_MIRRORED_REPEAT:
-      bitfield |= 0x0008;
+      bitfield |= CLK_ADDRESS_MIRRORED_REPEAT;
       break;
     default:
       SetErrorArg(context, CL_INVALID_VALUE, addressing_mode);
@@ -1971,10 +2054,10 @@ clCreateSampler
   switch (filter_mode)
   {
     case CL_FILTER_NEAREST:
-      bitfield |= 0x0010;
+      bitfield |= CLK_FILTER_NEAREST;
       break;
     case CL_FILTER_LINEAR:
-      bitfield |= 0x0020;
+      bitfield |= CLK_FILTER_LINEAR;
       break;
     default:
       SetErrorArg(context, CL_INVALID_VALUE, filter_mode);
@@ -2618,6 +2701,7 @@ clGetProgramBuildInfo
   {
     cl_build_status status;
     cl_program_binary_type type;
+    size_t sizet;
   } result_data;
   const char* str = 0;
 
@@ -2639,6 +2723,10 @@ clGetProgramBuildInfo
     result_size = sizeof(cl_program_binary_type);
     result_data.type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
     break;
+  case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE:
+    result_size = sizeof(size_t);
+    result_data.sizet = program->program->getTotalProgramScopeVarSize();
+    break;
   default:
     ReturnErrorArg(program->context, CL_INVALID_VALUE, param_name);
   }
@@ -2803,7 +2891,11 @@ clSetKernelArg
   const void *  arg_value
 ) CL_API_SUFFIX__VERSION_1_0
 {
-  // Check parameters
+  // Check parameters are valid
+  if (!kernel)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+  }
   if (arg_index >= kernel->kernel->getNumArguments())
   {
     ReturnErrorInfo(kernel->program->context, CL_INVALID_ARG_INDEX,
@@ -2843,7 +2935,7 @@ clSetKernelArg
     }
     break;
   case CL_KERNEL_ARG_ADDRESS_LOCAL:
-    delete value.data;
+    delete[] value.data;
     value.data = NULL;
     break;
   case CL_KERNEL_ARG_ADDRESS_GLOBAL:
@@ -3080,7 +3172,7 @@ clGetKernelWorkGroupInfo
                     "CL_KERNEL_GLOBAL_SIZE only valid on custom devices");
   case CL_KERNEL_WORK_GROUP_SIZE:
     result_size = sizeof(size_t);
-    result_data.sizet = MAX_WI_SIZE;
+    result_data.sizet = m_device->maxWGSize;
     break;
   case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
     result_size = sizeof(size_t[3]);
@@ -4804,6 +4896,7 @@ clEnqueueNDRangeKernel
 
   // Check global and local sizes are valid
   size_t reqdWorkGroupSize[3];
+  size_t totalWGSize = 1;
   kernel->kernel->getRequiredWorkGroupSize(reqdWorkGroupSize);
   for (unsigned i = 0; i < work_dim; i++)
   {
@@ -4812,13 +4905,24 @@ clEnqueueNDRangeKernel
       ReturnErrorInfo(command_queue->context, CL_INVALID_GLOBAL_WORK_SIZE,
                       "global_work_size[" << i << "] = 0");
     }
-    if (local_work_size && global_work_size[i] % local_work_size[i])
+    if (kernel->kernel->getProgram()->requiresUniformWorkGroups() &&
+        local_work_size && global_work_size[i] % local_work_size[i])
     {
       ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_GROUP_SIZE,
                       "local_work_size[" << i << "]=" << local_work_size[i] <<
                       " does not divide global_work_size[" << i << "]=" <<
                       global_work_size[i]);
     }
+    if (local_work_size)
+    {
+      if (local_work_size[i] > m_device->maxWGSize)
+      {
+        ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_ITEM_SIZE,
+                        "local_work_size[" << i << "]=" << local_work_size[i] <<
+                        " exceeds device maximum of " << m_device->maxWGSize);
+      }
+      totalWGSize *= local_work_size[i];
+    }
     if (local_work_size && reqdWorkGroupSize[i] &&
         local_work_size[i] != reqdWorkGroupSize[i])
     {
@@ -4828,6 +4932,12 @@ clEnqueueNDRangeKernel
                       reqdWorkGroupSize[i])
     }
   }
+  if (totalWGSize > m_device->maxWGSize)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_GROUP_SIZE,
+                    "total work-group size (" << totalWGSize << ")"
+                    " exceeds device maximum of " << m_device->maxWGSize);
+  }
 
   // Ensure all arguments have been set
   if (!kernel->kernel->allArgumentsSet())
@@ -4836,6 +4946,31 @@ clEnqueueNDRangeKernel
                     "Not all kernel arguments set");
   }
 
+  // Check that local memory requirement is within device maximum
+  size_t totalLocal = kernel->kernel->getLocalMemorySize();
+  if (totalLocal > m_device->localMemSize)
+  {
+    ReturnErrorInfo(command_queue->context, CL_OUT_OF_RESOURCES,
+                    "total local memory size (" << totalLocal << ")"
+                    " exceeds device maximum of " << m_device->localMemSize);
+  }
+
+  // Check that constant memory requirement is within device maximum
+  size_t totalConstant = 0;
+  std::map<cl_uint,cl_mem>::iterator arg;
+  for (arg = kernel->memArgs.begin(); arg != kernel->memArgs.end(); arg++)
+  {
+    if (kernel->kernel->getArgumentAddressQualifier(arg->first) ==
+        CL_KERNEL_ARG_ADDRESS_CONSTANT)
+      totalConstant += arg->second->size;
+  }
+  if (totalConstant > m_device->constantMemSize)
+  {
+    ReturnErrorInfo(command_queue->context, CL_OUT_OF_RESOURCES,
+                    "total constant memory size (" << totalConstant << ")"
+                    " exceeds device maximum of " << m_device->constantMemSize);
+  }
+
   // Set-up offsets and sizes
   oclgrind::Queue::KernelCommand *cmd = new oclgrind::Queue::KernelCommand();
   cmd->kernel = new oclgrind::Kernel(*kernel->kernel);
@@ -5446,6 +5581,340 @@ clEnqueueReleaseDX9MediaSurfacesKHR
 
 #endif // DX extension functions
 
+
+/////////////////////
+// OpenCL 2.0 APIs //
+/////////////////////
+
+CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueueWithProperties
+(
+  cl_context                  context,
+  cl_device_id                device,
+  const cl_queue_properties * properties,
+  cl_int *                    errcode_ret
+) CL_API_SUFFIX__VERSION_2_0
+{ // Creates an in-order host command-queue on m_device; NULL on any error
+  // Validate handles: a context is required and only m_device is accepted
+  if (!context)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  if (device != m_device)
+  {
+    SetErrorArg(context, CL_INVALID_DEVICE, device);
+    return NULL;
+  }
+
+  // Walk the zero-terminated list of (name, value) property pairs
+  cl_command_queue_properties props = 0;
+  unsigned i = 0;
+  while (properties && properties[i])
+  {
+    switch (properties[i++]) // after i++, properties[i] is the value
+    {
+    case CL_QUEUE_PROPERTIES:
+      if (properties[i] & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
+      {
+        SetErrorInfo(context, CL_INVALID_QUEUE_PROPERTIES,
+                     "Out-of-order command queues not supported");
+        return NULL;
+      }
+      if (properties[i] &
+          (CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT))
+      {
+        SetErrorInfo(context, CL_INVALID_QUEUE_PROPERTIES,
+                     "On device queues not implemented");
+        return NULL;
+      }
+      props = properties[i];
+      break;
+    case CL_QUEUE_SIZE:
+      SetErrorInfo(context, CL_INVALID_VALUE, "CL_QUEUE_SIZE not implemented");
+      return NULL;
+    default: // unknown property name: report the offending argument by name
+      SetErrorArg(context, CL_INVALID_VALUE, properties);
+      return NULL;
+    }
+    i++; // step past the value to the next property name
+  }
+
+  // Create command-queue object
+  cl_command_queue queue;
+  queue = new _cl_command_queue;
+  queue->queue = new oclgrind::Queue(context->context);
+  queue->dispatch = m_dispatchTable;
+  queue->properties = props;
+  queue->context = context;
+  queue->refCount = 1;
+
+  clRetainContext(context); // the queue holds a reference to its context
+
+  SetError(context, CL_SUCCESS);
+  return queue;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreatePipe
+(
+  cl_context                 context,
+  cl_mem_flags               flags,
+  cl_uint                    pipe_packet_size,
+  cl_uint                    pipe_max_packets,
+  const cl_pipe_properties * properties,
+  cl_int *                   errcode_ret
+) CL_API_SUFFIX__VERSION_2_0
+{ // Stub: pipes are unimplemented; flags CL_INVALID_OPERATION and returns NULL
+  SetErrorInfo(context, CL_INVALID_OPERATION, "Unimplemented OpenCL 2.0 API");
+  return NULL;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetPipeInfo
+(
+  cl_mem       pipe,
+  cl_pipe_info param_name,
+  size_t       param_value_size,
+  void *       param_value,
+  size_t *     param_value_size_ret
+) CL_API_SUFFIX__VERSION_2_0
+{ // Stub: no argument is inspected; the error is raised with a NULL context
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "Unimplemented OpenCL 2.0 API");
+}
+
+CL_API_ENTRY void * CL_API_CALL
+clSVMAlloc
+(
+  cl_context       context,
+  cl_svm_mem_flags flags,
+  size_t           size,
+  cl_uint          alignment
+) CL_API_SUFFIX__VERSION_2_0
+{ // SVM allocation is unsupported: the failure is reported and NULL returned
+  const char *reason = "Unimplemented OpenCL 2.0 API";
+  notifyAPIError(context, CL_INVALID_OPERATION, __func__, reason);
+  return NULL;
+}
+
+CL_API_ENTRY void CL_API_CALL
+clSVMFree
+(
+  cl_context context,
+  void *     svm_pointer
+) CL_API_SUFFIX__VERSION_2_0
+{ // SVM is unsupported: svm_pointer is ignored and only the error is reported
+  const char *reason = "Unimplemented OpenCL 2.0 API";
+  notifyAPIError(context, CL_INVALID_OPERATION, __func__, reason);
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFree
+(
+  cl_command_queue command_queue,
+  cl_uint num_svm_pointers,
+  void* svm_pointers[],
+  void (CL_CALLBACK* pfn_free_func)(
+      cl_command_queue queue,
+      cl_uint num_svm_pointers,
+      void* svm_pointers[],
+      void* user_data),
+  void* user_data,
+  cl_uint num_events_in_wait_list,
+  const cl_event* event_wait_list,
+  cl_event* event
+) CL_API_SUFFIX__VERSION_2_0
+{ // Stub: SVM is unimplemented. A NULL queue must not be dereferenced here.
+  ReturnErrorInfo(command_queue ? command_queue->context : NULL,
+                  CL_INVALID_OPERATION, "Unimplemented OpenCL 2.0 API");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpy
+(
+  cl_command_queue  command_queue,
+  cl_bool           blocking_copy,
+  void *            dst_ptr,
+  const void *      src_ptr,
+  size_t            size,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+) CL_API_SUFFIX__VERSION_2_0
+{ // Stub: SVM is unimplemented. A NULL queue must not be dereferenced here.
+  ReturnErrorInfo(command_queue ? command_queue->context : NULL,
+                  CL_INVALID_OPERATION, "Unimplemented OpenCL 2.0 API");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFill
+(
+  cl_command_queue command_queue,
+  void *           svm_ptr,
+  const void *     pattern,
+  size_t           pattern_size,
+  size_t           size,
+  cl_uint          num_events_in_wait_list,
+  const cl_event * event_wait_list,
+  cl_event *       event
+) CL_API_SUFFIX__VERSION_2_0
+{ // Stub: SVM is unimplemented. A NULL queue must not be dereferenced here.
+  ReturnErrorInfo(command_queue ? command_queue->context : NULL,
+                  CL_INVALID_OPERATION, "Unimplemented OpenCL 2.0 API");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMap
+(
+  cl_command_queue  command_queue,
+  cl_bool           blocking_map,
+  cl_map_flags      flags,
+  void *            svm_ptr,
+  size_t            size,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+) CL_API_SUFFIX__VERSION_2_0
+{ // Stub: SVM is unimplemented. A NULL queue must not be dereferenced here.
+  ReturnErrorInfo(command_queue ? command_queue->context : NULL,
+                  CL_INVALID_OPERATION, "Unimplemented OpenCL 2.0 API");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmap
+(
+  cl_command_queue command_queue,
+  void *           svm_ptr,
+  cl_uint          num_events_in_wait_list,
+  const cl_event * event_wait_list,
+  cl_event *       event
+) CL_API_SUFFIX__VERSION_2_0
+{ // Stub: SVM is unimplemented. A NULL queue must not be dereferenced here.
+  ReturnErrorInfo(command_queue ? command_queue->context : NULL,
+                  CL_INVALID_OPERATION, "Unimplemented OpenCL 2.0 API");
+}
+
+CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSamplerWithProperties
+(
+  cl_context                     context,
+  const cl_sampler_properties *  sampler_properties,
+  cl_int *                       errcode_ret
+) CL_API_SUFFIX__VERSION_2_0
+{ // Builds a sampler from an optional property list; returns NULL on error
+  // A context is mandatory
+  if (!context)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+
+  cl_bool             normalized_coords = CL_TRUE;
+  cl_addressing_mode  addressing_mode   = CL_ADDRESS_CLAMP;
+  cl_filter_mode      filter_mode       = CL_FILTER_NEAREST;
+
+  // Override the defaults above from the zero-terminated (name, value) list
+  unsigned i = 0;
+  while (sampler_properties && sampler_properties[i])
+  {
+    switch (sampler_properties[i++]) // after i++, [i] indexes the value
+    {
+    case CL_SAMPLER_NORMALIZED_COORDS:
+      normalized_coords = sampler_properties[i];
+      break;
+    case CL_SAMPLER_ADDRESSING_MODE:
+      addressing_mode = sampler_properties[i];
+      break;
+    case CL_SAMPLER_FILTER_MODE:
+      filter_mode = sampler_properties[i];
+      break;
+    default: // unknown property name: report the offending argument by name
+      SetErrorArg(context, CL_INVALID_VALUE, sampler_properties);
+      return NULL;
+    }
+    i++; // step past the value to the next property name
+  }
+
+  // Encode the settings as the CLK_* bitfield stored in sampler->sampler
+  uint32_t bitfield = 0;
+
+  if (normalized_coords)
+  {
+    bitfield |= CLK_NORMALIZED_COORDS_TRUE;
+  }
+
+  switch (addressing_mode)
+  {
+    case CL_ADDRESS_NONE:
+      break;
+    case CL_ADDRESS_CLAMP_TO_EDGE:
+      bitfield |= CLK_ADDRESS_CLAMP_TO_EDGE;
+      break;
+    case CL_ADDRESS_CLAMP:
+      bitfield |= CLK_ADDRESS_CLAMP;
+      break;
+    case CL_ADDRESS_REPEAT:
+      bitfield |= CLK_ADDRESS_REPEAT;
+      break;
+    case CL_ADDRESS_MIRRORED_REPEAT:
+      bitfield |= CLK_ADDRESS_MIRRORED_REPEAT;
+      break;
+    default:
+      SetErrorArg(context, CL_INVALID_VALUE, sampler_properties);
+      return NULL;
+  }
+
+  switch (filter_mode)
+  {
+    case CL_FILTER_NEAREST:
+      bitfield |= CLK_FILTER_NEAREST;
+      break;
+    case CL_FILTER_LINEAR:
+      bitfield |= CLK_FILTER_LINEAR;
+      break;
+    default:
+      SetErrorArg(context, CL_INVALID_VALUE, sampler_properties);
+      return NULL;
+  }
+
+  // Create sampler. NOTE(review): no refCount init or context retain - confirm
+  cl_sampler sampler = new _cl_sampler;
+  sampler->dispatch = m_dispatchTable;
+  sampler->context = context;
+  sampler->normCoords = normalized_coords;
+  sampler->addressMode = addressing_mode;
+  sampler->filterMode = filter_mode;
+  sampler->sampler = bitfield;
+
+  SetError(context, CL_SUCCESS);
+  return sampler;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointer
+(
+  cl_kernel    kernel,
+  cl_uint      arg_index,
+  const void * arg_value
+) CL_API_SUFFIX__VERSION_2_0
+{ // Stub: SVM is unimplemented. A NULL kernel must not be dereferenced here.
+  ReturnErrorInfo(kernel ? kernel->program->context : NULL,
+                  CL_INVALID_OPERATION, "Unimplemented OpenCL 2.0 API");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfo
+(
+  cl_kernel            kernel,
+  cl_kernel_exec_info  param_name,
+  size_t               param_value_size,
+  const void *         param_value
+) CL_API_SUFFIX__VERSION_2_0
+{ // Stub: unimplemented. A NULL kernel must not be dereferenced here.
+  ReturnErrorInfo(kernel ? kernel->program->context : NULL,
+                  CL_INVALID_OPERATION, "Unimplemented OpenCL 2.0 API");
+}
+
 ////////////////////
 // Dispatch Table //
 ////////////////////
@@ -5604,6 +6073,32 @@ void *m_dispatchTable[] =
   DISPATCH_TABLE_ENTRY(NULL),
   DISPATCH_TABLE_ENTRY(NULL),
 #endif
+
+  // cl_khr_egl_image
+  DISPATCH_TABLE_ENTRY(NULL),
+  DISPATCH_TABLE_ENTRY(NULL),
+  DISPATCH_TABLE_ENTRY(NULL),
+
+  // cl_khr_egl_event
+  DISPATCH_TABLE_ENTRY(NULL),
+
+  // OpenCL 2.0
+  DISPATCH_TABLE_ENTRY(clCreateCommandQueueWithProperties),
+  DISPATCH_TABLE_ENTRY(clCreatePipe),
+  DISPATCH_TABLE_ENTRY(clGetPipeInfo),
+  DISPATCH_TABLE_ENTRY(clSVMAlloc),
+  DISPATCH_TABLE_ENTRY(clSVMFree),
+  DISPATCH_TABLE_ENTRY(clEnqueueSVMFree),
+  DISPATCH_TABLE_ENTRY(clEnqueueSVMMemcpy),
+  DISPATCH_TABLE_ENTRY(clEnqueueSVMMemFill),
+  DISPATCH_TABLE_ENTRY(clEnqueueSVMMap),
+  DISPATCH_TABLE_ENTRY(clEnqueueSVMUnmap),
+  DISPATCH_TABLE_ENTRY(clCreateSamplerWithProperties),
+  DISPATCH_TABLE_ENTRY(clSetKernelArgSVMPointer),
+  DISPATCH_TABLE_ENTRY(clSetKernelExecInfo),
+
+  // cl_khr_sub_groups
+  DISPATCH_TABLE_ENTRY(NULL),
 };
 
 #if defined(_WIN32) && !defined(OCLGRIND_ICD)
diff --git a/tests/apps/CMakeLists.txt b/tests/apps/CMakeLists.txt
index b8fb14a..2c1abb5 100644
--- a/tests/apps/CMakeLists.txt
+++ b/tests/apps/CMakeLists.txt
@@ -30,13 +30,14 @@ foreach(test
     NAME app_${test}
     COMMAND
     ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/run_test.py
-    $<TARGET_FILE:oclgrind-kernel>
+    $<TARGET_FILE:oclgrind-exe>
     $<TARGET_FILE:${test}>)
 
   set_tests_properties(app_${test} PROPERTIES DEPENDS ${test})
 
   # Set PCH directory
-  set_tests_properties(app_${test} PROPERTIES
-      ENVIRONMENT "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
+  set(ENV "OCLGRIND_TESTING=1")
+  list(APPEND ENV "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
+  set_tests_properties(app_${test} PROPERTIES ENVIRONMENT "${ENV}")
 
 endforeach(${test})
diff --git a/tests/apps/image/image.c b/tests/apps/image/image.c
index 101dd4f..fd1d014 100644
--- a/tests/apps/image/image.c
+++ b/tests/apps/image/image.c
@@ -50,7 +50,7 @@ int main()
         }
     }
 
-    Context cl = createContext(KERNEL_SOURCE);
+    Context cl = createContext(KERNEL_SOURCE, "");
     kernel = clCreateKernel(cl.program, "image_copy", &err);
     checkError(err, "creating kernel");
 
diff --git a/tests/apps/vecadd/vecadd.c b/tests/apps/vecadd/vecadd.c
index 49f59e5..17f2206 100644
--- a/tests/apps/vecadd/vecadd.c
+++ b/tests/apps/vecadd/vecadd.c
@@ -44,7 +44,7 @@ int main(int argc, char *argv[])
     exit(1);
   }
 
-  Context cl = createContext(KERNEL_SOURCE);
+  Context cl = createContext(KERNEL_SOURCE, "");
 
   kernel = clCreateKernel(cl.program, "vecadd", &err);
   checkError(err, "creating kernel");
diff --git a/tests/common/common.c b/tests/common/common.c
index 1923048..db2058c 100644
--- a/tests/common/common.c
+++ b/tests/common/common.c
@@ -12,7 +12,7 @@ void checkError(cl_int err, const char *operation)
   }
 }
 
-Context createContext(const char *source)
+Context createContext(const char *source, const char *options)
 {
   Context cl;
   cl_int err;
@@ -42,7 +42,7 @@ Context createContext(const char *source)
   cl.program = clCreateProgramWithSource(cl.context, 1, &source, NULL, &err);
   checkError(err, "creating program");
 
-  err = clBuildProgram(cl.program, 1, &cl.device, "", NULL, NULL);
+  err = clBuildProgram(cl.program, 1, &cl.device, options, NULL, NULL);
   if (err == CL_BUILD_PROGRAM_FAILURE)
   {
     size_t sz;
diff --git a/tests/common/common.h b/tests/common/common.h
index 92b44db..8229da0 100644
--- a/tests/common/common.h
+++ b/tests/common/common.h
@@ -11,5 +11,5 @@ typedef struct
 
 void checkError(cl_int err, const char *operation);
 
-Context createContext(const char *source);
+Context createContext(const char *source, const char *options);
 void    releaseContext(Context cl);
diff --git a/tests/kernels/CMakeLists.txt b/tests/kernels/CMakeLists.txt
index 017acb2..50deaba 100644
--- a/tests/kernels/CMakeLists.txt
+++ b/tests/kernels/CMakeLists.txt
@@ -22,6 +22,30 @@ endforeach(${test})
 set_tests_properties(${KERNEL_TESTS} PROPERTIES
     ENVIRONMENT "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
 
+# LLVM bug, exposed in 4.0, fixed in 5.0, backported to 4.0.1
+if ("${LLVM_VERSION_MAJOR}" STREQUAL "4" AND
+    "${LLVM_VERSION_PATCH}" STREQUAL "0")
+  set(XFAIL
+      ${XFAIL}
+      bugs/byval_function_argument
+      bugs/const_gep_expr_pointee_type
+      bugs/multidim_array_in_struct
+      memcheck/static_array_padded_struct
+      misc/array
+      uninitialized/private_array_initializer_list
+  )
+endif()
+
+# Can't seem to get DIGlobalVariable in LLVM 3.9; compare numerically, not as text
+if ("${LLVM_VERSION_MAJOR}" VERSION_LESS "4")
+  set(XFAIL ${XFAIL} interactive/struct_member)
+endif()
+
+# LLVM bug, exposed in 4.0, fixed in 5.0, backported to 4.0.1
+if ("${LLVM_VERSION_MAJOR}" STREQUAL "4" AND
+    "${LLVM_VERSION_PATCH}" STREQUAL "0")
+  set(XFAIL ${XFAIL} bugs/llvm_memcpyopt_bug)
+endif()
+
 # Expected failures
-set_tests_properties(
-  PROPERTIES WILL_FAIL TRUE)
+set_tests_properties(${XFAIL} PROPERTIES WILL_FAIL TRUE)
diff --git a/tests/kernels/TESTS b/tests/kernels/TESTS
index fd95269..31251d2 100644
--- a/tests/kernels/TESTS
+++ b/tests/kernels/TESTS
@@ -16,9 +16,11 @@ atomics/atomic_global_fence_race
 atomics/atomic_increment
 atomics/atomic_intergroup_race
 atomics/atomic_local_fence
+atomics/atomic_minmax_signed
 atomics/atomic_race_after
 atomics/atomic_race_before
 atomics/atomic_same_workitem
+atomics/atom_add
 barrier/barrier_different_instructions
 barrier/barrier_divergence
 bugs/byval_function_argument
@@ -27,6 +29,7 @@ bugs/false_warning_vector_argument
 bugs/gvn_arbitrary_integers
 bugs/kernel_struct_argument
 bugs/llvm_bswap
+bugs/llvm_memcpyopt_bug
 bugs/many_alloca
 bugs/multidim_array_in_struct
 bugs/null_argument
@@ -46,6 +49,7 @@ data-race/local_only_fence
 data-race/local_read_write_race
 data-race/local_write_write_race
 data-race/uniform_write_race
+interactive/struct_member
 memcheck/async_copy_out_of_bounds
 memcheck/atomic_out_of_bounds
 memcheck/casted_static_array
@@ -58,9 +62,13 @@ memcheck/static_array_padded_struct
 memcheck/write_out_of_bounds
 memcheck/write_read_only_memory
 misc/array
+misc/global_variables
 misc/lvalue_loads
+misc/non_uniform_work_groups
+misc/printf
 misc/program_scope_constant_array
 misc/reduce
+misc/switch_case
 misc/vecadd
 misc/vector_argument
 uninitialized/padded_nested_struct_memcpy
@@ -80,4 +88,4 @@ uninitialized/uninitialized_private_array
 wait_event/wait_event_chained
 wait_event/wait_event_divergent
 wait_event/wait_event_duplicates
-wait_event/wait_event_invalid
\ No newline at end of file
+wait_event/wait_event_invalid
diff --git a/tests/kernels/atomics/atom_add.cl b/tests/kernels/atomics/atom_add.cl
new file mode 100644
index 0000000..317fdd5
--- /dev/null
+++ b/tests/kernels/atomics/atom_add.cl
@@ -0,0 +1,6 @@
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+
+kernel void _atom_add(global ulong *data)
+{ // Each work-item atomically adds UINT_MAX to the 64-bit value at data[0].
+  atom_add(data, (ulong)UINT_MAX);
+}
diff --git a/tests/kernels/atomics/atom_add.ref b/tests/kernels/atomics/atom_add.ref
new file mode 100644
index 0000000..b067357
--- /dev/null
+++ b/tests/kernels/atomics/atom_add.ref
@@ -0,0 +1,2 @@
+EXACT Argument 'data': 8 bytes
+EXACT   data[0] = 17179869180
diff --git a/tests/kernels/atomics/atom_add.sim b/tests/kernels/atomics/atom_add.sim
new file mode 100644
index 0000000..87a9f6b
--- /dev/null
+++ b/tests/kernels/atomics/atom_add.sim
@@ -0,0 +1,6 @@
+atom_add.cl
+_atom_add
+4 1 1
+1 1 1
+
+<size=8 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_cmpxchg_read_race.cl b/tests/kernels/atomics/atomic_cmpxchg_read_race.cl
index 9be3a88..2f6655e 100644
--- a/tests/kernels/atomics/atomic_cmpxchg_read_race.cl
+++ b/tests/kernels/atomics/atomic_cmpxchg_read_race.cl
@@ -7,6 +7,6 @@ kernel void atomic_cmpxchg_read_race(global int *data)
   }
   else
   {
-    atomic_cmpxchg(data, 0, i);
+    atomic_cmpxchg(data, 0, 0x01000001);
   }
 }
diff --git a/tests/kernels/atomics/atomic_cmpxchg_read_race.ref b/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
index d01adc0..43c056c 100644
--- a/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
+++ b/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
@@ -2,4 +2,4 @@ ERROR Read-write data race at global memory
 ERROR Write-write data race at global memory
 
 EXACT Argument 'data': 4 bytes
-EXACT   data[0] = 1
+MATCH   data[0] =
diff --git a/tests/kernels/atomics/atomic_minmax_signed.cl b/tests/kernels/atomics/atomic_minmax_signed.cl
new file mode 100644
index 0000000..3c2e4be
--- /dev/null
+++ b/tests/kernels/atomics/atomic_minmax_signed.cl
@@ -0,0 +1,16 @@
+kernel void atomic_minmax_signed(global int *data)
+{
+  atomic_min(data+0, -8);
+  atomic_min(data+1, -6);
+  atomic_min(data+2, 3);
+  atomic_min(data+3, -3);
+  atomic_min(data+4, 6);
+  atomic_min(data+5, 8);
+
+  atomic_max(data+6, -8);
+  atomic_max(data+7, -6);
+  atomic_max(data+8, 3);
+  atomic_max(data+9, -3);
+  atomic_max(data+10, 6);
+  atomic_max(data+11, 8);
+}
diff --git a/tests/kernels/atomics/atomic_minmax_signed.ref b/tests/kernels/atomics/atomic_minmax_signed.ref
new file mode 100644
index 0000000..3dc9987
--- /dev/null
+++ b/tests/kernels/atomics/atomic_minmax_signed.ref
@@ -0,0 +1,13 @@
+EXACT Argument 'data': 48 bytes
+EXACT   data[0] = -8
+EXACT   data[1] = -7
+EXACT   data[2] = -7
+EXACT   data[3] = -3
+EXACT   data[4] = 6
+EXACT   data[5] = 7
+EXACT   data[6] = -7
+EXACT   data[7] = -6
+EXACT   data[8] = 3
+EXACT   data[9] = 7
+EXACT   data[10] = 7
+EXACT   data[11] = 8
diff --git a/tests/kernels/atomics/atomic_minmax_signed.sim b/tests/kernels/atomics/atomic_minmax_signed.sim
new file mode 100644
index 0000000..28f294c
--- /dev/null
+++ b/tests/kernels/atomics/atomic_minmax_signed.sim
@@ -0,0 +1,18 @@
+atomic_minmax_signed.cl
+atomic_minmax_signed
+1 1 1
+1 1 1
+
+<size=48 int dump>
+-7
+-7
+-7
+7
+7
+7
+-7
+-7
+-7
+7
+7
+7
diff --git a/tests/kernels/atomics/atomic_race_after.cl b/tests/kernels/atomics/atomic_race_after.cl
index d168053..91659bc 100644
--- a/tests/kernels/atomics/atomic_race_after.cl
+++ b/tests/kernels/atomics/atomic_race_after.cl
@@ -1,8 +1,8 @@
-kernel void atomic_race_after(global int *data)
+kernel void atomic_race_after(global int *data, global int *output)
 {
   atomic_inc(data);
   if (get_global_id(0) == get_global_size(0)-1)
   {
-    (*data)++;
+    *output = *data;
   }
 }
diff --git a/tests/kernels/atomics/atomic_race_after.ref b/tests/kernels/atomics/atomic_race_after.ref
index 8c97680..19b6b2f 100644
--- a/tests/kernels/atomics/atomic_race_after.ref
+++ b/tests/kernels/atomics/atomic_race_after.ref
@@ -1,6 +1,4 @@
 ERROR Read-write data race at global memory
-ERROR Read-write data race at global memory
-ERROR Write-write data race at global memory
 
-EXACT Argument 'data': 4 bytes
-EXACT   data[0] = 5
+EXACT Argument 'output': 4 bytes
+MATCH   output[0] =
diff --git a/tests/kernels/atomics/atomic_race_after.sim b/tests/kernels/atomics/atomic_race_after.sim
index d182089..65cda65 100644
--- a/tests/kernels/atomics/atomic_race_after.sim
+++ b/tests/kernels/atomics/atomic_race_after.sim
@@ -3,4 +3,5 @@ atomic_race_after
 4 1 1
 4 1 1
 
+<size=4 fill=0>
 <size=4 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_race_before.cl b/tests/kernels/atomics/atomic_race_before.cl
index 53db050..a31b335 100644
--- a/tests/kernels/atomics/atomic_race_before.cl
+++ b/tests/kernels/atomics/atomic_race_before.cl
@@ -4,5 +4,5 @@ kernel void atomic_race_before(global int *data)
   {
     *data = 0;
   }
-  atomic_inc(data);
+  atomic_dec(data);
 }
diff --git a/tests/kernels/atomics/atomic_race_before.ref b/tests/kernels/atomics/atomic_race_before.ref
index 65484a7..8eda7fd 100644
--- a/tests/kernels/atomics/atomic_race_before.ref
+++ b/tests/kernels/atomics/atomic_race_before.ref
@@ -6,4 +6,4 @@ ERROR Read-write data race at global memory address
 ERROR Write-write data race at global memory address
 
 EXACT Argument 'data': 4 bytes
-EXACT   data[0] = 4
+EXACT   data[0] = -4
diff --git a/tests/kernels/bugs/const_gep_expr_pointee_type.cl b/tests/kernels/bugs/const_gep_expr_pointee_type.cl
index 63d9780..9145370 100644
--- a/tests/kernels/bugs/const_gep_expr_pointee_type.cl
+++ b/tests/kernels/bugs/const_gep_expr_pointee_type.cl
@@ -1,3 +1,5 @@
+#pragma clang diagnostic ignored "-Wunused-value"
+
 struct S0 {
   int d;
   long b;
diff --git a/tests/kernels/bugs/llvm_memcpyopt_bug.cl b/tests/kernels/bugs/llvm_memcpyopt_bug.cl
new file mode 100644
index 0000000..e8028bb
--- /dev/null
+++ b/tests/kernels/bugs/llvm_memcpyopt_bug.cl
@@ -0,0 +1,15 @@
+typedef struct {
+  int a;
+  int b;
+  int c;
+} S;
+
+S foo(S a) {
+  return a;
+}
+
+kernel void llvm_memcpyopt_bug(global S *out)
+{
+  S a = {7,7,7};
+  out[0] = foo(a);
+}
diff --git a/tests/kernels/bugs/llvm_memcpyopt_bug.ref b/tests/kernels/bugs/llvm_memcpyopt_bug.ref
new file mode 100644
index 0000000..2be7db8
--- /dev/null
+++ b/tests/kernels/bugs/llvm_memcpyopt_bug.ref
@@ -0,0 +1,5 @@
+
+EXACT Argument 'out': 12 bytes
+EXACT   out[0] = 7
+EXACT   out[1] = 7
+EXACT   out[2] = 7
diff --git a/tests/kernels/bugs/llvm_memcpyopt_bug.sim b/tests/kernels/bugs/llvm_memcpyopt_bug.sim
new file mode 100644
index 0000000..02aef6a
--- /dev/null
+++ b/tests/kernels/bugs/llvm_memcpyopt_bug.sim
@@ -0,0 +1,6 @@
+llvm_memcpyopt_bug.cl
+llvm_memcpyopt_bug
+1 1 1
+1 1 1
+
+<size=12 int fill=0 dump>
diff --git a/tests/kernels/interactive/struct_member.cl b/tests/kernels/interactive/struct_member.cl
new file mode 100644
index 0000000..a4b3459
--- /dev/null
+++ b/tests/kernels/interactive/struct_member.cl
@@ -0,0 +1,19 @@
+struct S
+{
+  int a;
+  float b;
+};
+
+kernel void struct_member(global int *i, global float *f, global struct S *out)
+{
+  struct S s;
+  local struct S t;
+  s.a = *i;
+  s.b = *f;
+  t = s;
+  t.a += 1;
+  t.b += 0.1f;
+  *out = t;
+  out->a += 2;
+  out->b += 0.2f;
+}
diff --git a/tests/kernels/interactive/struct_member.inp b/tests/kernels/interactive/struct_member.inp
new file mode 100644
index 0000000..91b4678
--- /dev/null
+++ b/tests/kernels/interactive/struct_member.inp
@@ -0,0 +1,21 @@
+step
+step
+step
+step
+print s
+print s.a
+print s.b
+print s.c
+print s.
+step
+step
+print t.a
+print t.b
+step
+step
+step
+step
+print out[0].a
+print out[0].b
+print out->a
+print out->b
diff --git a/tests/kernels/interactive/struct_member.ref b/tests/kernels/interactive/struct_member.ref
new file mode 100644
index 0000000..4f915fd
--- /dev/null
+++ b/tests/kernels/interactive/struct_member.ref
@@ -0,0 +1,31 @@
+MATCH 7	kernel void struct_member(global int *i, global float *f, global struct S *out)
+MATCH 9	  struct S s;
+MATCH 11	  s.a = *i;
+MATCH 12	  s.b = *f;
+MATCH 13	  t = s;
+MATCH s = (raw) 0x0700000000002842
+MATCH s.a = 7
+MATCH s.b = 42
+MATCH s.c = no member named 'c' found
+MATCH s. = no member named '' found
+MATCH 14	  t.a += 1;
+MATCH 15	  t.b += 0.1f;
+MATCH t.a = 8
+MATCH t.b = 42
+MATCH 16	  *out = t;
+MATCH 17	  out->a += 2;
+MATCH 18	  out->b += 0.2f;
+MATCH out[0].a = 10
+MATCH out[0].b = 42.3
+MATCH out->a = 10
+MATCH out->b = 42.3
+
+MATCH Argument 'out': 8 bytes
+MATCH   out[0] = 0x0A
+MATCH   out[1] = 0x00
+MATCH   out[2] = 0x00
+MATCH   out[3] = 0x00
+MATCH   out[4] = 0x33
+MATCH   out[5] = 0x33
+MATCH   out[6] = 0x29
+MATCH   out[7] = 0x42
diff --git a/tests/kernels/interactive/struct_member.sim b/tests/kernels/interactive/struct_member.sim
new file mode 100644
index 0000000..76cd2db
--- /dev/null
+++ b/tests/kernels/interactive/struct_member.sim
@@ -0,0 +1,9 @@
+# ARGS: -i
+struct_member.cl
+struct_member
+1 1 1
+1 1 1
+
+<size=4 int fill=7>
+<size=4 float fill=42>
+<size=8 hex char fill=0 dump>
diff --git a/tests/kernels/misc/global_variables.cl b/tests/kernels/misc/global_variables.cl
new file mode 100644
index 0000000..e946e14
--- /dev/null
+++ b/tests/kernels/misc/global_variables.cl
@@ -0,0 +1,14 @@
+global int g_arr[] = {7, 42};
+constant int c_arr[] = {-3, 56};
+global int *p_g_int = &g_arr[1];
+constant int *p_c_int = &c_arr[1];
+
+kernel void global_variables(global int *output)
+{
+  output[0] = g_arr[0];
+  output[1] = g_arr[1];
+  output[2] = c_arr[0];
+  output[3] = c_arr[1];
+  output[4] = *p_g_int;
+  output[5] = *p_c_int;
+}
diff --git a/tests/kernels/misc/global_variables.ref b/tests/kernels/misc/global_variables.ref
new file mode 100644
index 0000000..3d5512c
--- /dev/null
+++ b/tests/kernels/misc/global_variables.ref
@@ -0,0 +1,7 @@
+EXACT Argument 'output': 24 bytes
+EXACT   output[0] = 7
+EXACT   output[1] = 42
+EXACT   output[2] = -3
+EXACT   output[3] = 56
+EXACT   output[4] = 42
+EXACT   output[5] = 56
diff --git a/tests/kernels/misc/global_variables.sim b/tests/kernels/misc/global_variables.sim
new file mode 100644
index 0000000..259d34c
--- /dev/null
+++ b/tests/kernels/misc/global_variables.sim
@@ -0,0 +1,8 @@
+# ARGS: --build-options -cl-std=CL2.0
+global_variables.cl
+global_variables
+1 1 1
+1 1 1
+
+<size=24 int fill=0 dump>
+
diff --git a/tests/kernels/misc/non_uniform_work_groups.cl b/tests/kernels/misc/non_uniform_work_groups.cl
new file mode 100644
index 0000000..4b1da84
--- /dev/null
+++ b/tests/kernels/misc/non_uniform_work_groups.cl
@@ -0,0 +1,16 @@
+kernel void non_uniform_work_groups(global int *output)
+{
+  int i = get_global_linear_id();
+  output[i] = get_local_linear_id();
+
+  int end = get_global_size(0) * get_global_size(1) * get_global_size(2);
+  if (i == end-1)
+  {
+    output[end]   = get_local_size(0);
+    output[end+1] = get_local_size(1);
+    output[end+2] = get_local_size(2);
+    output[end+3] = get_enqueued_local_size(0);
+    output[end+4] = get_enqueued_local_size(1);
+    output[end+5] = get_enqueued_local_size(2);
+  }
+}
diff --git a/tests/kernels/misc/non_uniform_work_groups.ref b/tests/kernels/misc/non_uniform_work_groups.ref
new file mode 100644
index 0000000..c5d912b
--- /dev/null
+++ b/tests/kernels/misc/non_uniform_work_groups.ref
@@ -0,0 +1,34 @@
+EXACT Argument 'output': 132 bytes
+EXACT   output[0] = 0
+EXACT   output[1] = 1
+EXACT   output[2] = 0
+EXACT   output[3] = 2
+EXACT   output[4] = 3
+EXACT   output[5] = 1
+EXACT   output[6] = 0
+EXACT   output[7] = 1
+EXACT   output[8] = 0
+EXACT   output[9] = 4
+EXACT   output[10] = 5
+EXACT   output[11] = 2
+EXACT   output[12] = 6
+EXACT   output[13] = 7
+EXACT   output[14] = 3
+EXACT   output[15] = 2
+EXACT   output[16] = 3
+EXACT   output[17] = 1
+EXACT   output[18] = 0
+EXACT   output[19] = 1
+EXACT   output[20] = 0
+EXACT   output[21] = 2
+EXACT   output[22] = 3
+EXACT   output[23] = 1
+EXACT   output[24] = 0
+EXACT   output[25] = 1
+EXACT   output[26] = 0
+EXACT   output[27] = 1
+EXACT   output[28] = 1
+EXACT   output[29] = 1
+EXACT   output[30] = 2
+EXACT   output[31] = 2
+EXACT   output[32] = 2
diff --git a/tests/kernels/misc/non_uniform_work_groups.sim b/tests/kernels/misc/non_uniform_work_groups.sim
new file mode 100644
index 0000000..f83dc92
--- /dev/null
+++ b/tests/kernels/misc/non_uniform_work_groups.sim
@@ -0,0 +1,7 @@
+# ARGS: --build-options -cl-std=CL2.0
+non_uniform_work_groups.cl
+non_uniform_work_groups
+3 3 3
+2 2 2
+
+<size=132 fill=0 dump>
diff --git a/tests/kernels/misc/printf.cl b/tests/kernels/misc/printf.cl
new file mode 100644
index 0000000..0431d92
--- /dev/null
+++ b/tests/kernels/misc/printf.cl
@@ -0,0 +1,9 @@
+kernel void printf_kernel(
+  const int i,
+  const float f,
+  const float4 f4v)
+{
+  printf("int = %d\n", i);
+  printf("float = %.1f\n", f);
+  printf("float4 = %.2v4f\n", f4v);
+}
diff --git a/tests/kernels/misc/printf.ref b/tests/kernels/misc/printf.ref
new file mode 100644
index 0000000..fc25e5e
--- /dev/null
+++ b/tests/kernels/misc/printf.ref
@@ -0,0 +1,3 @@
+MATCH int = 7
+MATCH float = 42.0
+MATCH float4 = 42.12,-3.70,100001.01,-0.00
diff --git a/tests/kernels/misc/printf.sim b/tests/kernels/misc/printf.sim
new file mode 100644
index 0000000..df59b1a
--- /dev/null
+++ b/tests/kernels/misc/printf.sim
@@ -0,0 +1,12 @@
+printf.cl
+printf_kernel
+1 1 1
+1 1 1
+
+<size=4 fill=7>
+<size=4 fill=42>
+<size=16>
+42.123
+-3.7
+100001.01
+-0.0
diff --git a/tests/kernels/misc/reduce.cl b/tests/kernels/misc/reduce.cl
index 28f53ca..59f3e25 100644
--- a/tests/kernels/misc/reduce.cl
+++ b/tests/kernels/misc/reduce.cl
@@ -1,18 +1,22 @@
-kernel void reduce(unsigned int n,
-                   global unsigned int *data,
-                   global unsigned int *result,
-                   local unsigned int *localData)
+kernel void reduce(uint n,
+                   global uint *data,
+                   global uint *result,
+                   local uint *localData)
 {
-  unsigned int lid = get_local_id(0);
-  unsigned int lsz = get_local_size(0);
-  unsigned int sum = 0;
-  for (unsigned int i = lid; i < n; i+=lsz)
+  uint gid = get_global_id(0);
+  uint lid = get_local_id(0);
+  uint gsz = get_global_size(0);
+  uint lsz = get_local_size(0);
+  uint grp = get_group_id(0);
+
+  uint sum = 0;
+  for (uint i = gid; i < n; i+=gsz)
   {
     sum += data[i];
   }
 
   localData[lid] = sum;
-  for (unsigned int offset = lsz/2; offset > 0; offset/=2)
+  for (uint offset = lsz/2; offset > 0; offset/=2)
   {
     barrier(CLK_LOCAL_MEM_FENCE);
     if (lid < offset)
@@ -23,6 +27,6 @@ kernel void reduce(unsigned int n,
 
   if (lid == 0)
   {
-    *result = localData[lid];
+    result[grp] = localData[lid];
   }
 }
diff --git a/tests/kernels/misc/reduce.ref b/tests/kernels/misc/reduce.ref
index 3ebb5ea..2f33a38 100644
--- a/tests/kernels/misc/reduce.ref
+++ b/tests/kernels/misc/reduce.ref
@@ -1,2 +1,17 @@
-EXACT Argument 'result': 4 bytes
-EXACT   result[0] = 120
+EXACT Argument 'result': 64 bytes
+EXACT   result[0] = 1560
+EXACT   result[1] = 1624
+EXACT   result[2] = 1688
+EXACT   result[3] = 1752
+EXACT   result[4] = 1816
+EXACT   result[5] = 1880
+EXACT   result[6] = 1944
+EXACT   result[7] = 2008
+EXACT   result[8] = 2072
+EXACT   result[9] = 2136
+EXACT   result[10] = 2200
+EXACT   result[11] = 2264
+EXACT   result[12] = 2328
+EXACT   result[13] = 2392
+EXACT   result[14] = 2456
+EXACT   result[15] = 2520
diff --git a/tests/kernels/misc/reduce.sim b/tests/kernels/misc/reduce.sim
index 927a2e0..568fb57 100644
--- a/tests/kernels/misc/reduce.sim
+++ b/tests/kernels/misc/reduce.sim
@@ -1,11 +1,11 @@
 reduce.cl
 reduce
-4 1 1
+64 1 1
 4 1 1
 
 <size=4>
-16
+256
 
-<size=64 range=0:1:15>
-<size=4 fill=0 dump>
+<size=1024 range=0:1:255>
+<size=64 fill=0 dump>
 <size=16>
diff --git a/tests/kernels/misc/switch_case.cl b/tests/kernels/misc/switch_case.cl
new file mode 100644
index 0000000..57900af
--- /dev/null
+++ b/tests/kernels/misc/switch_case.cl
@@ -0,0 +1,25 @@
+kernel void switch_case(global int *input, global int *output)
+{
+  int i = get_global_id(0);
+  int in = input[i];
+  int out;
+  switch (in)
+  {
+  case 0:
+    out = -7;
+    break;
+  case 1:
+    out = i;
+    break;
+  case 2:
+  case 3:
+  case 4:
+    out = in + i;
+    break;
+  default:
+    out = 42;
+    break;
+  }
+
+  output[i] = out;
+}
diff --git a/tests/kernels/misc/switch_case.ref b/tests/kernels/misc/switch_case.ref
new file mode 100644
index 0000000..5efc51f
--- /dev/null
+++ b/tests/kernels/misc/switch_case.ref
@@ -0,0 +1,9 @@
+EXACT Argument 'output': 32 bytes
+EXACT   output[0] = 2
+EXACT   output[1] = 4
+EXACT   output[2] = 4
+EXACT   output[3] = -7
+EXACT   output[4] = 4
+EXACT   output[5] = 42
+EXACT   output[6] = 42
+EXACT   output[7] = 11
diff --git a/tests/kernels/misc/switch_case.sim b/tests/kernels/misc/switch_case.sim
new file mode 100644
index 0000000..7af4a7f
--- /dev/null
+++ b/tests/kernels/misc/switch_case.sim
@@ -0,0 +1,16 @@
+switch_case.cl
+switch_case
+8 1 1
+1 1 1
+
+<size=32>
+2
+3
+2
+0
+1
+5
+-1
+4
+
+<size=32 dump fill=0>
diff --git a/tests/kernels/uninitialized/padded_nested_struct_memcpy.cl b/tests/kernels/uninitialized/padded_nested_struct_memcpy.cl
index 20656ea..b14b3c2 100644
--- a/tests/kernels/uninitialized/padded_nested_struct_memcpy.cl
+++ b/tests/kernels/uninitialized/padded_nested_struct_memcpy.cl
@@ -17,10 +17,10 @@ kernel void padded_nested_struct_memcpy(global struct S *output)
 {
   struct S s;
   s.a = 1;
-  s.b = 2;
+  s.b = 0x02000002;
   s.c = 3;
   s.d.a = 4;
-  s.d.b = 5;
+  s.d.b = 0x05000005;
   s.d.c = 6;
 
   *output = s;
diff --git a/tests/kernels/uninitialized/padded_nested_struct_memcpy.ref b/tests/kernels/uninitialized/padded_nested_struct_memcpy.ref
index a55c060..3fcd122 100644
--- a/tests/kernels/uninitialized/padded_nested_struct_memcpy.ref
+++ b/tests/kernels/uninitialized/padded_nested_struct_memcpy.ref
@@ -6,7 +6,7 @@ MATCH   output[3] =
 EXACT   output[4] = 2
 EXACT   output[5] = 0
 EXACT   output[6] = 0
-EXACT   output[7] = 0
+EXACT   output[7] = 2
 EXACT   output[8] = 3
 MATCH   output[9] = 
 MATCH   output[10] = 
@@ -18,7 +18,7 @@ MATCH   output[15] =
 EXACT   output[16] = 5
 EXACT   output[17] = 0
 EXACT   output[18] = 0
-EXACT   output[19] = 0
+EXACT   output[19] = 5
 EXACT   output[20] = 6
 MATCH   output[21] = 
 MATCH   output[22] = 
diff --git a/tests/kernels/uninitialized/padded_struct_alloca_fp.cl b/tests/kernels/uninitialized/padded_struct_alloca_fp.cl
index 7f878a8..ed1fced 100644
--- a/tests/kernels/uninitialized/padded_struct_alloca_fp.cl
+++ b/tests/kernels/uninitialized/padded_struct_alloca_fp.cl
@@ -9,7 +9,7 @@ kernel void padded_struct_alloca_fp(global struct S *output)
 {
   struct S s;
   s.a = 42;
-  s.b = -7;
+  s.b = 0xF9FFFFF9;
   s.c = 127;
 
   *output = s;
diff --git a/tests/kernels/uninitialized/padded_struct_alloca_fp.ref b/tests/kernels/uninitialized/padded_struct_alloca_fp.ref
index 609ed8f..d9196f5 100644
--- a/tests/kernels/uninitialized/padded_struct_alloca_fp.ref
+++ b/tests/kernels/uninitialized/padded_struct_alloca_fp.ref
@@ -1,4 +1,13 @@
 EXACT Argument 'output': 12 bytes
 EXACT   output[0] = 42
-EXACT   output[1] = -7
-EXACT   output[2] = 127
+EXACT   output[1] = 0
+EXACT   output[2] = 0
+EXACT   output[3] = 0
+EXACT   output[4] = -7
+EXACT   output[5] = -1
+EXACT   output[6] = -1
+EXACT   output[7] = -7
+EXACT   output[8] = 127
+EXACT   output[9] = 0
+EXACT   output[10] = 0
+EXACT   output[11] = 0
diff --git a/tests/kernels/uninitialized/padded_struct_alloca_fp.sim b/tests/kernels/uninitialized/padded_struct_alloca_fp.sim
index 74f1b0a..4881a61 100644
--- a/tests/kernels/uninitialized/padded_struct_alloca_fp.sim
+++ b/tests/kernels/uninitialized/padded_struct_alloca_fp.sim
@@ -3,4 +3,4 @@ padded_struct_alloca_fp
 1 1 1
 1 1 1
 
-<size=12 int fill=0 dump>
+<size=12 char fill=0 dump>
diff --git a/tests/kernels/uninitialized/padded_struct_memcpy_fp.cl b/tests/kernels/uninitialized/padded_struct_memcpy_fp.cl
index f1a449b..ef6225c 100644
--- a/tests/kernels/uninitialized/padded_struct_memcpy_fp.cl
+++ b/tests/kernels/uninitialized/padded_struct_memcpy_fp.cl
@@ -12,7 +12,7 @@ kernel void padded_struct_memcpy_fp(local struct S *scratch,
 
   struct S s;
   s.a = 42;
-  s.b = -7;
+  s.b = 0xF9FFFFF9;
   s.c = 127;
 
   if (lid == 0)
diff --git a/tests/kernels/uninitialized/padded_struct_memcpy_fp.ref b/tests/kernels/uninitialized/padded_struct_memcpy_fp.ref
index 609ed8f..d9196f5 100644
--- a/tests/kernels/uninitialized/padded_struct_memcpy_fp.ref
+++ b/tests/kernels/uninitialized/padded_struct_memcpy_fp.ref
@@ -1,4 +1,13 @@
 EXACT Argument 'output': 12 bytes
 EXACT   output[0] = 42
-EXACT   output[1] = -7
-EXACT   output[2] = 127
+EXACT   output[1] = 0
+EXACT   output[2] = 0
+EXACT   output[3] = 0
+EXACT   output[4] = -7
+EXACT   output[5] = -1
+EXACT   output[6] = -1
+EXACT   output[7] = -7
+EXACT   output[8] = 127
+EXACT   output[9] = 0
+EXACT   output[10] = 0
+EXACT   output[11] = 0
diff --git a/tests/kernels/uninitialized/padded_struct_memcpy_fp.sim b/tests/kernels/uninitialized/padded_struct_memcpy_fp.sim
index 8ed2ae9..e5cb20c 100644
--- a/tests/kernels/uninitialized/padded_struct_memcpy_fp.sim
+++ b/tests/kernels/uninitialized/padded_struct_memcpy_fp.sim
@@ -5,4 +5,4 @@ padded_struct_memcpy_fp
 
 <size=12 char>
 
-<size=12 int fill=0 dump>
+<size=12 char fill=0 dump>
diff --git a/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.cl b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.cl
index 4558ff6..e881310 100644
--- a/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.cl
+++ b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.cl
@@ -15,6 +15,6 @@ struct S
 
 kernel void uninitialized_padded_nested_struct_memcpy(local int *scratch, global struct S *output)
 {
-  struct S s = {1, 2, 3, {4, *scratch, 5}};
+  struct S s = {1, 0x02000002, 3, {4, *scratch, 5}};
   *output = s;
 }
diff --git a/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.ref b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.ref
index ad64661..f232fda 100644
--- a/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.ref
+++ b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.ref
@@ -8,7 +8,7 @@ MATCH   output[3] =
 EXACT   output[4] = 2
 EXACT   output[5] = 0
 EXACT   output[6] = 0
-EXACT   output[7] = 0
+EXACT   output[7] = 2
 EXACT   output[8] = 3
 MATCH   output[9] = 
 MATCH   output[10] = 
diff --git a/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.ref b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.ref
index 3fffd1d..ccd00ce 100644
--- a/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.ref
+++ b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.ref
@@ -2,5 +2,14 @@ ERROR Uninitialized value
 
 EXACT Argument 'output': 12 bytes
 EXACT   output[0] = 1
-MATCH   output[1] = 
-EXACT   output[2] = 2
+EXACT   output[1] = 0
+EXACT   output[2] = 0
+EXACT   output[3] = 0
+MATCH   output[4] =
+MATCH   output[5] =
+MATCH   output[6] =
+MATCH   output[7] =
+EXACT   output[8] = 2
+EXACT   output[9] = 0
+EXACT   output[10] = 0
+EXACT   output[11] = 0
diff --git a/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.sim b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.sim
index cdf5827..07f5d9e 100644
--- a/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.sim
+++ b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.sim
@@ -5,4 +5,4 @@ uninitialized_padded_struct_memcpy
 
 <size=8>
 
-<size=12 int fill=0 dump>
+<size=12 char fill=0 dump>
diff --git a/tests/kernels/wait_event/wait_event_invalid.cl b/tests/kernels/wait_event/wait_event_invalid.cl
index 239530e..2fc63fb 100644
--- a/tests/kernels/wait_event/wait_event_invalid.cl
+++ b/tests/kernels/wait_event/wait_event_invalid.cl
@@ -1,5 +1,5 @@
 kernel void wait_event_invalid(global int *data)
 {
-  event_t event = 42;
+  event_t event = 0;
   wait_group_events(1, &event);
 }
diff --git a/tests/run_test.py b/tests/run_test.py
index 330ca7e..0db819f 100644
--- a/tests/run_test.py
+++ b/tests/run_test.py
@@ -14,14 +14,14 @@ import sys
 
 # Check arguments
 if len(sys.argv) != 3:
-  print('Usage: python run_test.py OCLGRIND-KERNEL TEST_EXE|TEST.sim')
+  print('Usage: python run_test.py OCLGRIND-EXE TEST_EXE|TEST.sim')
   sys.exit(1)
 if not os.path.isfile(sys.argv[2]):
   print('Test file not found')
   sys.exit(1)
 
 # Construct paths to test inputs/outputs
-oclgrind_kernel = sys.argv[1]
+oclgrind_exe    = sys.argv[1]
 test_full_path  = sys.argv[2]
 test_dir        = os.path.dirname(os.path.realpath(test_full_path))
 test_file       = os.path.basename(test_full_path)
@@ -29,6 +29,7 @@ test_name       = os.path.splitext(test_file)[0]
 current_dir     = os.getcwd()
 
 if test_file.endswith('.sim'):
+  test_inp = test_full_path[:-4] + '.inp'
   test_ref = test_full_path[:-4] + '.ref'
 else:
   if test_full_path[0] == '/':
@@ -36,6 +37,8 @@ else:
   else:
     rel_path = test_full_path
 
+  test_inp = os.path.dirname(os.path.abspath(__file__)) + os.path.sep \
+    + rel_path + '.inp'
   test_ref = os.path.dirname(os.path.abspath(__file__)) + os.path.sep \
     + rel_path + '.ref'
 
@@ -69,15 +72,30 @@ def run(output_suffix):
       raise
 
   out = open(test_out, 'w')
+  try:
+      inp = open(test_inp, 'r')
+  except:
+      inp = None
 
   # Run test
   if test_file.endswith('.sim'):
     os.chdir(test_dir)
-    retval = subprocess.call([oclgrind_kernel, test_file],
-                             stdout=out, stderr=out)
+
+    cmd = [oclgrind_exe]
+
+    # Add any additional arguments specified in the test file
+    first_line = open(test_file).readline()[:-1]
+    if first_line[:7] == '# ARGS:':
+        cmd.extend(first_line[8:].split(' '))
+
+    cmd.append(test_file)
+
+    retval = subprocess.call(cmd, stdout=out, stderr=out, stdin=inp)
+
     os.chdir(current_dir)
   else:
-    retval = subprocess.call([test_full_path], stdout=out, stderr=out)
+    retval = subprocess.call([oclgrind_exe,test_full_path],
+                             stdout=out, stderr=out, stdin=inp)
 
   out.close()
   if retval != 0:
@@ -101,7 +119,12 @@ def run(output_suffix):
       text = line[6:]
 
       # Find next non-blank line in output file
-      while not len(out[oi]):
+      while True:
+        if oi >= len(out):
+            print('Unexpected end of output when matching ' + line)
+            fail()
+        if len(out[oi]):
+            break
         oi += 1
 
       if type == 'ERROR':
@@ -131,6 +154,14 @@ def run(output_suffix):
         print('Invalid match type in reference file')
         fail()
 
+    # Check there are no more lines in output
+    while oi < len(out):
+      if len(out[oi]) > 0:
+          print('Unexpected output after all matches completed (line %d):' % oi)
+          print(out[oi])
+          fail()
+      oi += 1
+
 print('Running test with optimisations')
 run('')
 print('PASSED')
diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt
index c700f96..7aaf435 100644
--- a/tests/runtime/CMakeLists.txt
+++ b/tests/runtime/CMakeLists.txt
@@ -11,9 +11,13 @@ include_directories(../common)
 
 # Add runtime tests
 foreach(test
-  map_buffer)
+  build_program
+  map_buffer
+  sampler)
 
   add_executable(${test} ${test}.c ${COMMON_SOURCES})
+  target_compile_definitions(${test} PRIVATE
+                             "-DROOT_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
   target_link_libraries(${test} oclgrind-rt)
 
   # Generate test binaries in same dir as Oclgrind libraries on Windows
@@ -28,13 +32,14 @@ foreach(test
     NAME rt_${test}
     COMMAND
     ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/run_test.py
-    $<TARGET_FILE:oclgrind-kernel>
+    $<TARGET_FILE:oclgrind-exe>
     $<TARGET_FILE:${test}>)
 
   set_tests_properties(rt_${test} PROPERTIES DEPENDS ${test})
 
   # Set PCH directory
-  set_tests_properties(rt_${test} PROPERTIES
-      ENVIRONMENT "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
+  set(ENV "OCLGRIND_TESTING=1")
+  list(APPEND ENV "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
+  set_tests_properties(rt_${test} PROPERTIES ENVIRONMENT "${ENV}")
 
 endforeach(${test})
diff --git a/tests/runtime/build_program.c b/tests/runtime/build_program.c
new file mode 100644
index 0000000..9c97cb6
--- /dev/null
+++ b/tests/runtime/build_program.c
@@ -0,0 +1,73 @@
+#include "common.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define TOL 1e-8
+#define MAX_ERRORS 8
+#define N 16
+
+const char *SOURCE_INCLUDE =
+"#include \"header.h\"                    \n"
+"kernel void test_kernel(global int *out) \n"
+"{                                        \n"
+"  *out = VALUE;                          \n"
+"}                                        \n"
+;
+
+const char *SOURCE_MACRO =
+"#define _STR(ARG) #ARG                   \n"
+"#define STR(ARG) _STR(ARG)               \n"
+"kernel void test_kernel(global int *out) \n"
+"{                                        \n"
+"  printf(\"MSG = %s\\n\", STR(MSG));     \n"
+"}                                        \n"
+;
+
+void run(const char *source, const char *options)
+{
+  cl_int err;
+  cl_kernel kernel;
+  cl_mem d_out;
+
+  Context cl = createContext(source, options);
+
+  kernel = clCreateKernel(cl.program, "test_kernel", &err);
+  checkError(err, "creating kernel");
+
+  d_out = clCreateBuffer(cl.context, CL_MEM_WRITE_ONLY, 4, NULL, &err);
+  checkError(err, "creating d_out");
+
+  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_out);
+  checkError(err, "setting kernel argument");
+
+  size_t global[1] = {1};
+  err = clEnqueueNDRangeKernel(cl.queue, kernel,
+                               1, NULL, global, NULL, 0, NULL, NULL);
+  checkError(err, "enqueuing kernel");
+
+  err = clFinish(cl.queue);
+  checkError(err, "running kernel");
+
+  int *h_out = clEnqueueMapBuffer(cl.queue, d_out, CL_TRUE, CL_MAP_READ,
+                                  0, 4, 0, NULL, NULL, &err);
+  checkError(err, "mapping buffer for reading");
+
+  printf("out = %d\n", *h_out);
+
+  err = clEnqueueUnmapMemObject(cl.queue, d_out, h_out, 0, NULL, NULL);
+  checkError(err, "unmapping buffer");
+
+  clReleaseMemObject(d_out);
+  clReleaseKernel(kernel);
+  releaseContext(cl);
+}
+
+int main(int argc, char *argv[])
+{
+  run(SOURCE_INCLUDE, "-I " ROOT_DIR "/inc/nospace");
+  run(SOURCE_INCLUDE, "-I \"" ROOT_DIR "/inc/with space\"");
+  run(SOURCE_MACRO, "-D MSG=hello");
+  run(SOURCE_MACRO, "-D MSG=foo\\ and\\ bar");
+  return 0;
+}
diff --git a/tests/runtime/build_program.ref b/tests/runtime/build_program.ref
new file mode 100644
index 0000000..05bf36e
--- /dev/null
+++ b/tests/runtime/build_program.ref
@@ -0,0 +1,6 @@
+EXACT out = 42
+EXACT out = -7
+EXACT MSG = hello
+MATCH out =
+EXACT MSG = foo and bar
+MATCH out =
diff --git a/tests/runtime/inc/nospace/header.h b/tests/runtime/inc/nospace/header.h
new file mode 100644
index 0000000..4350790
--- /dev/null
+++ b/tests/runtime/inc/nospace/header.h
@@ -0,0 +1 @@
+#define VALUE 42
diff --git a/tests/runtime/inc/with space/header.h b/tests/runtime/inc/with space/header.h
new file mode 100644
index 0000000..045a3a8
--- /dev/null
+++ b/tests/runtime/inc/with space/header.h	
@@ -0,0 +1 @@
+#define VALUE -7
diff --git a/tests/runtime/map_buffer.c b/tests/runtime/map_buffer.c
index 059b261..3d904c9 100644
--- a/tests/runtime/map_buffer.c
+++ b/tests/runtime/map_buffer.c
@@ -274,7 +274,7 @@ int main(int argc, char *argv[])
     N = atoi(argv[1]);
   }
 
-  Context cl = createContext(KERNEL_SOURCE);
+  Context cl = createContext(KERNEL_SOURCE, "");
 
   kernel = clCreateKernel(cl.program, "vecadd", &err);
   checkError(err, "creating kernel");
diff --git a/tests/runtime/sampler.c b/tests/runtime/sampler.c
new file mode 100644
index 0000000..53c7f25
--- /dev/null
+++ b/tests/runtime/sampler.c
@@ -0,0 +1,138 @@
+#include "common.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define TOL 1e-8     // not referenced anywhere in this file
+#define MAX_ERRORS 8 // maximum number of mismatches checkResults() prints to stderr
+#define N 16         // image width and height, in pixels
+
+const char *KERNEL_SOURCE = // OpenCL C kernel: output(x,y) = input(x,y) + input(x-1,y), both read via the sampler
+"kernel void test_sampler(read_only  image2d_t input,         \n"
+"                         write_only image2d_t output,        \n"
+"                                    sampler_t sampler)       \n"
+"{                                                            \n"
+"  int x = get_global_id(0);                                  \n"
+"  int y = get_global_id(1);                                  \n"
+"  float4 pixel = read_imagef(input, sampler, (int2)(x,y));   \n"
+"  float4 left  = read_imagef(input, sampler, (int2)(x-1,y)); \n"
+"  write_imagef(output, (int2)(x,y), pixel+left);             \n"
+"}                                                            \n"
+;
+
+unsigned checkResults(uint8_t *input, uint8_t *output); // defined after main(); returns the number of mismatching channel values
+
+int main(int argc, char *argv[])
+{
+  // Sampler test: the kernel writes pixel(x,y) + pixel(x-1,y) read through a
+  // clamp-to-edge sampler; returns 0 if the host-side check passes, else 1.
+  cl_int err;
+  Context cl = createContext(KERNEL_SOURCE, "");
+
+  cl_kernel kernel = clCreateKernel(cl.program, "test_sampler", &err);
+  checkError(err, "creating kernel");
+
+  cl_image_format format = {.image_channel_order = CL_RGBA,
+                            .image_channel_data_type = CL_UNORM_INT8};
+  cl_image_desc desc = {.image_type = CL_MEM_OBJECT_IMAGE2D,
+                        .image_width = N, .image_height = N};
+
+  // Create images
+  cl_mem d_input = clCreateImage(cl.context, CL_MEM_READ_ONLY,
+                                 &format, &desc, NULL, &err);
+  checkError(err, "creating d_input image");
+  cl_mem d_output = clCreateImage(cl.context, CL_MEM_WRITE_ONLY,
+                                  &format, &desc, NULL, &err);
+  checkError(err, "creating d_output image");
+
+  // Initialise data
+  size_t dataSize = N*N*4;
+  uint8_t *h_input = malloc(dataSize);
+  uint8_t *h_output = malloc(dataSize);
+  if (!h_input || !h_output)
+  {
+    fprintf(stderr, "Failed to allocate host buffers\n");
+    return 1;
+  }
+  srand(0);
+  for (size_t i = 0; i < dataSize; i++)
+  {
+    // Values stay in [0,127], so the sum of two of them fits in 8 bits
+    h_input[i] = rand() % 256 / 2;
+    h_output[i] = 0;
+  }
+
+  size_t origin[] = {0, 0, 0};
+  size_t region[] = {N, N, 1};
+  err = clEnqueueWriteImage(cl.queue, d_input, CL_TRUE, origin, region, 0, 0,
+                            h_input, 0, NULL, NULL);
+  checkError(err, "writing image data");
+
+  // Create sampler
+  cl_sampler sampler = clCreateSampler(cl.context, CL_FALSE,
+                                       CL_ADDRESS_CLAMP_TO_EDGE,
+                                       CL_FILTER_NEAREST, &err);
+  checkError(err, "creating sampler");
+
+  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_input);
+  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_output);
+  err |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
+  checkError(err, "setting kernel args");
+
+  size_t global[2] = {N, N};
+  err = clEnqueueNDRangeKernel(cl.queue, kernel,
+                               2, NULL, global, NULL, 0, NULL, NULL);
+  checkError(err, "enqueuing kernel");
+
+  err = clFinish(cl.queue);
+  checkError(err, "running kernel");
+
+  err = clEnqueueReadImage(cl.queue, d_output, CL_TRUE, origin, region, 0, 0,
+                           h_output, 0, NULL, NULL);
+  checkError(err, "reading image data");
+
+  unsigned errors = checkResults(h_input, h_output);
+
+  free(h_input);
+  free(h_output);
+  clReleaseSampler(sampler);
+  clReleaseMemObject(d_input);
+  clReleaseMemObject(d_output);
+  clReleaseKernel(kernel);
+  releaseContext(cl);
+
+  return (errors != 0);
+}
+
+unsigned checkResults(uint8_t *input, uint8_t *output)
+{
+  // Compares every output channel with input(x,y) + input(max(x-1,0),y), the
+  // host-side model of the kernel's two reads; returns the mismatch count.
+  unsigned errors = 0;
+  for (int y = 0; y < N; y++)
+  {
+    for (int x = 0; x < N; x++)
+    {
+      int xleft = x ? x - 1 : 0;
+      for (int c = 0; c < 4; c++)
+      {
+        int i = (x + y*N)*4 + c;
+        int ref = input[i] + input[(xleft + y*N)*4 + c];
+        if (output[i] != ref)
+        {
+          if (errors < MAX_ERRORS)
+          {
+            fprintf(stderr, "%2d,%2d,%2d: %d != %d\n", x, y, c, output[i], ref);
+          }
+          errors++;
+        }
+      }
+    }
+  }
+
+  printf("%u errors detected\n", errors);
+  return errors;
+}
diff --git a/tests/runtime/sampler.ref b/tests/runtime/sampler.ref
new file mode 100644
index 0000000..501b124
--- /dev/null
+++ b/tests/runtime/sampler.ref
@@ -0,0 +1 @@
+EXACT 0 errors detected

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/oclgrind.git



More information about the Pkg-opencl-commits mailing list