[oclgrind] 02/10: New upstream version 16.10

James Price jprice-guest at moszumanska.debian.org
Sun Oct 23 20:01:40 UTC 2016


This is an automated email from the git hooks/post-receive script.

jprice-guest pushed a commit to branch master
in repository oclgrind.

commit e20cc9099c31783fdf1ef04ec33092e591ae9c76
Author: James Price <j.price at bristol.ac.uk>
Date:   Sun Oct 23 19:46:53 2016 +0100

    New upstream version 16.10
---
 .gitignore                                         |    1 +
 .travis-deps.sh                                    |   26 +
 .travis.yml                                        |   26 +
 CMakeLists.txt                                     |  203 +-
 LICENSE                                            |    2 +-
 Makefile.am                                        |  106 +-
 NEWS                                               |   17 +
 README                                             |  138 -
 README.md                                          |  196 ++
 configure.ac                                       |   58 +-
 src/core/Context.cpp                               |   39 +-
 src/core/Context.h                                 |   17 +-
 src/core/Kernel.cpp                                |  259 +-
 src/core/Kernel.h                                  |   19 +-
 src/core/KernelInvocation.cpp                      |   18 +-
 src/core/KernelInvocation.h                        |    2 +-
 src/core/Memory.cpp                                |  127 +-
 src/core/Memory.h                                  |   34 +-
 src/core/Plugin.cpp                                |    2 +-
 src/core/Plugin.h                                  |    9 +-
 src/core/Program.cpp                               |  441 ++-
 src/core/Program.h                                 |    8 +-
 src/core/Queue.cpp                                 |   21 +-
 src/core/Queue.h                                   |   28 +-
 src/core/WorkGroup.cpp                             |   24 +-
 src/core/WorkGroup.h                               |   14 +-
 src/core/WorkItem.cpp                              |  244 +-
 src/core/WorkItem.h                                |   32 +-
 src/core/WorkItemBuiltins.cpp                      |  439 ++-
 src/core/clc.h                                     |  102 +-
 src/core/common.cpp                                |  195 +-
 src/core/common.h                                  |  128 +-
 src/core/half.cpp                                  |  259 ++
 src/core/half.h                                    |  165 +-
 src/install/INSTALL.darwin                         |   10 +-
 src/install/INSTALL.linux                          |   10 +-
 src/install/INSTALL.windows                        |   14 +-
 src/install/cpack-description                      |   11 +
 src/install/install.bat                            |   13 +
 src/kernel/Simulation.cpp                          |   43 +-
 src/kernel/Simulation.h                            |    7 +-
 src/kernel/oclgrind-kernel.cpp                     |   13 +-
 src/plugins/InstructionCounter.cpp                 |    2 +-
 src/plugins/InstructionCounter.h                   |    2 +-
 src/plugins/InteractiveDebugger.cpp                |   26 +-
 src/plugins/InteractiveDebugger.h                  |    2 +-
 src/plugins/Logger.cpp                             |    2 +-
 src/plugins/Logger.h                               |    2 +-
 src/plugins/MemCheck.cpp                           |  134 +-
 src/plugins/MemCheck.h                             |   27 +-
 src/plugins/RaceDetector.cpp                       |  582 ++--
 src/plugins/RaceDetector.h                         |  130 +-
 src/plugins/Uninitialized.cpp                      | 2811 ++++++++++++++++++++
 src/plugins/Uninitialized.h                        |  314 +++
 src/runtime/async_queue.cpp                        |    2 +-
 src/runtime/async_queue.h                          |    2 +-
 src/runtime/icd.h                                  |    2 +-
 src/runtime/oclgrind                               |  145 -
 src/runtime/oclgrind.cpp                           |  483 ++++
 src/runtime/runtime.cpp                            |  253 +-
 src/runtime/runtime.def                            |    2 +
 tests/Makefile.am                                  |   50 +
 tests/apps/CMakeLists.txt                          |   17 +-
 tests/apps/image/image.c                           |  133 +
 tests/apps/vecadd/vecadd.c                         |  102 +-
 tests/common/common.c                              |   66 +
 tests/common/common.h                              |   15 +
 tests/kernels/CMakeLists.txt                       |   27 +
 tests/kernels/TESTS                                |   27 +
 tests/kernels/alignment/packed.ref                 |    6 +-
 tests/kernels/alignment/packed.sim                 |    2 +-
 tests/kernels/alignment/unaligned.ref              |    7 +-
 tests/kernels/async_copy/async_copy.ref            |   12 +-
 tests/kernels/async_copy/async_copy_divergent.ref  |   13 +-
 .../kernels/async_copy/async_copy_global_race.ref  |   15 +-
 tests/kernels/async_copy/async_copy_local_race.ref |   15 +-
 tests/kernels/async_copy/async_copy_loop.ref       |   12 +-
 .../async_copy/async_copy_loop_divergent.ref       |   13 +-
 tests/kernels/async_copy/async_copy_single_wi.ref  |   13 +-
 tests/kernels/async_copy/async_copy_unwaited.ref   |   13 +-
 .../kernels/atomics/atomic_cmpxchg_false_race.ref  |   14 +-
 tests/kernels/atomics/atomic_cmpxchg_read_race.ref |    8 +-
 .../kernels/atomics/atomic_cmpxchg_write_race.ref  |    7 +-
 tests/kernels/atomics/atomic_global_fence.ref      |    8 +-
 tests/kernels/atomics/atomic_global_fence_race.ref |   10 +-
 tests/kernels/atomics/atomic_increment.ref         |    6 +-
 tests/kernels/atomics/atomic_intergroup_race.ref   |    8 +-
 tests/kernels/atomics/atomic_local_fence.ref       |    8 +-
 tests/kernels/atomics/atomic_race_after.ref        |    9 +-
 tests/kernels/atomics/atomic_race_before.ref       |   12 +-
 tests/kernels/atomics/atomic_same_workitem.ref     |   12 +-
 .../barrier/barrier_different_instructions.ref     |   15 +-
 tests/kernels/barrier/barrier_divergence.ref       |   13 +-
 tests/kernels/bugs/byval_function_argument.cl      |   19 +
 tests/kernels/bugs/byval_function_argument.ref     |    3 +
 tests/kernels/bugs/byval_function_argument.sim     |    6 +
 tests/kernels/bugs/const_gep_expr_pointee_type.cl  |    9 +
 tests/kernels/bugs/const_gep_expr_pointee_type.sim |    4 +
 .../kernels/bugs/false_warning_vector_argument.cl  |    8 +
 .../kernels/bugs/false_warning_vector_argument.ref |    9 +
 .../kernels/bugs/false_warning_vector_argument.sim |    7 +
 tests/kernels/bugs/gvn_arbitrary_integers.ref      |   10 +-
 tests/kernels/bugs/kernel_struct_argument.ref      |    6 +-
 tests/kernels/bugs/llvm_bswap.cl                   |    8 +
 tests/kernels/bugs/llvm_bswap.ref                  |    7 +
 tests/kernels/bugs/llvm_bswap.sim                  |   12 +
 tests/kernels/bugs/many_alloca.ref                 |    6 +-
 tests/kernels/bugs/multidim_array_in_struct.ref    |    6 +-
 tests/kernels/bugs/null_argument.ref               |    6 +-
 tests/kernels/bugs/rhadd_overflow.cl               |    4 +
 tests/kernels/bugs/rhadd_overflow.ref              |    3 +
 tests/kernels/bugs/rhadd_overflow.sim              |    6 +
 tests/kernels/bugs/sroa_addrspace_cast.ref         |    6 +-
 tests/kernels/bugs/write_vector_write_only_fp.cl   |    5 +
 tests/kernels/bugs/write_vector_write_only_fp.ref  |   17 +
 tests/kernels/bugs/write_vector_write_only_fp.sim  |    6 +
 tests/kernels/data-race/broadcast.ref              |   12 +-
 tests/kernels/data-race/global_fence.ref           |   12 +-
 tests/kernels/data-race/global_only_fence.ref      |   15 +-
 tests/kernels/data-race/global_read_write_race.ref |   14 +-
 tests/kernels/data-race/global_read_write_race.sim |    2 +-
 .../kernels/data-race/global_write_write_race.ref  |    9 +-
 tests/kernels/data-race/increment.ref              |   12 +-
 tests/kernels/data-race/intergroup_hidden_race.ref |    9 +-
 tests/kernels/data-race/intergroup_race.ref        |   11 +-
 tests/kernels/data-race/intergroup_race.sim        |    4 +-
 tests/kernels/data-race/intragroup_hidden_race.ref |    9 +-
 tests/kernels/data-race/local_only_fence.ref       |   24 +-
 tests/kernels/data-race/local_read_write_race.cl   |    3 +
 tests/kernels/data-race/local_read_write_race.ref  |    9 +-
 tests/kernels/data-race/local_write_write_race.ref |   15 +-
 tests/kernels/data-race/uniform_write_race.ref     |    6 +-
 .../kernels/memcheck/async_copy_out_of_bounds.ref  |   13 +-
 tests/kernels/memcheck/atomic_out_of_bounds.ref    |   15 +-
 tests/kernels/memcheck/casted_static_array.cl      |   31 +
 tests/kernels/memcheck/casted_static_array.ref     |    7 +
 tests/kernels/memcheck/casted_static_array.sim     |    6 +
 tests/kernels/memcheck/dereference_null.ref        |    8 +-
 tests/kernels/memcheck/fake_out_of_bounds.cl       |   12 +
 tests/kernels/memcheck/fake_out_of_bounds.ref      |    2 +
 tests/kernels/memcheck/fake_out_of_bounds.sim      |    6 +
 tests/kernels/memcheck/read_out_of_bounds.ref      |   17 +-
 tests/kernels/memcheck/read_write_only_memory.ref  |   16 +-
 tests/kernels/memcheck/static_array.cl             |   13 +
 tests/kernels/memcheck/static_array.ref            |   10 +
 tests/kernels/memcheck/static_array.sim            |    6 +
 .../kernels/memcheck/static_array_padded_struct.cl |   12 +
 .../memcheck/static_array_padded_struct.ref        |    8 +
 .../memcheck/static_array_padded_struct.sim        |    6 +
 tests/kernels/memcheck/write_out_of_bounds.ref     |   13 +-
 tests/kernels/memcheck/write_read_only_memory.ref  |   16 +-
 tests/kernels/misc/array.ref                       |  260 +-
 tests/kernels/misc/lvalue_loads.cl                 |  119 +
 tests/kernels/misc/lvalue_loads.ref                |   75 +
 tests/kernels/misc/lvalue_loads.sim                |   29 +
 tests/kernels/misc/program_scope_constant_array.cl |    7 +
 .../kernels/misc/program_scope_constant_array.ref  |    5 +
 .../kernels/misc/program_scope_constant_array.sim  |    6 +
 tests/kernels/misc/reduce.ref                      |    6 +-
 tests/kernels/misc/vecadd.ref                      | 2052 +++++++-------
 tests/kernels/misc/vector_argument.cl              |    4 +
 tests/kernels/misc/vector_argument.ref             |    5 +
 tests/kernels/misc/vector_argument.sim             |    9 +
 tests/kernels/run_kernel_test.py                   |   93 -
 .../uninitialized/padded_nested_struct_memcpy.cl   |   27 +
 .../uninitialized/padded_nested_struct_memcpy.ref  |   25 +
 .../uninitialized/padded_nested_struct_memcpy.sim  |    6 +
 .../uninitialized/padded_struct_alloca_fp.cl       |   16 +
 .../uninitialized/padded_struct_alloca_fp.ref      |    4 +
 .../uninitialized/padded_struct_alloca_fp.sim      |    6 +
 .../uninitialized/padded_struct_memcpy_fp.cl       |   27 +
 .../uninitialized/padded_struct_memcpy_fp.ref      |    4 +
 .../uninitialized/padded_struct_memcpy_fp.sim      |    8 +
 .../uninitialized/partially_uninitialized_fract.cl |    6 +
 .../partially_uninitialized_fract.ref              |   12 +
 .../partially_uninitialized_fract.sim              |    6 +
 .../private_array_initializer_list.cl              |    9 +
 .../private_array_initializer_list.ref             |    5 +
 .../private_array_initializer_list.sim             |    6 +
 .../kernels/uninitialized/uninitialized_address.cl |    9 +
 .../uninitialized/uninitialized_address.ref        |    5 +
 .../uninitialized/uninitialized_address.sim        |    6 +
 .../uninitialized/uninitialized_global_buffer.cl   |    5 +
 .../uninitialized/uninitialized_global_buffer.ref  |    4 +
 .../uninitialized/uninitialized_global_buffer.sim  |    8 +
 .../uninitialized/uninitialized_local_array.cl     |   11 +
 .../uninitialized/uninitialized_local_array.ref    |   19 +
 .../uninitialized/uninitialized_local_array.sim    |    6 +
 .../uninitialized/uninitialized_local_ptr.cl       |    9 +
 .../uninitialized/uninitialized_local_ptr.ref      |   19 +
 .../uninitialized/uninitialized_local_ptr.sim      |    8 +
 .../uninitialized/uninitialized_local_variable.cl  |    7 +
 .../uninitialized/uninitialized_local_variable.ref |    4 +
 .../uninitialized/uninitialized_local_variable.sim |    6 +
 .../uninitialized_packed_struct_memcpy.cl          |   12 +
 .../uninitialized_packed_struct_memcpy.ref         |    9 +
 .../uninitialized_packed_struct_memcpy.sim         |    8 +
 .../uninitialized_padded_nested_struct_memcpy.cl   |   20 +
 .../uninitialized_padded_nested_struct_memcpy.ref  |   27 +
 .../uninitialized_padded_nested_struct_memcpy.sim  |    8 +
 .../uninitialized_padded_struct_memcpy.cl          |   12 +
 .../uninitialized_padded_struct_memcpy.ref         |    6 +
 .../uninitialized_padded_struct_memcpy.sim         |    8 +
 .../uninitialized/uninitialized_private_array.cl   |   16 +
 .../uninitialized/uninitialized_private_array.ref  |    7 +
 .../uninitialized/uninitialized_private_array.sim  |   14 +
 tests/kernels/wait_event/wait_event_chained.ref    |   12 +-
 tests/kernels/wait_event/wait_event_divergent.cl   |    3 +
 tests/kernels/wait_event/wait_event_divergent.ref  |   10 +-
 tests/kernels/wait_event/wait_event_duplicates.ref |   12 +-
 tests/kernels/wait_event/wait_event_invalid.ref    |   13 +-
 tests/run_test.py                                  |  145 +
 tests/{apps => runtime}/CMakeLists.txt             |   25 +-
 tests/runtime/map_buffer.c                         |  327 +++
 tests/runtime/map_buffer.ref                       |    4 +
 215 files changed, 10120 insertions(+), 3228 deletions(-)

diff --git a/.gitignore b/.gitignore
index 14830ae..34ed68f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,6 +49,7 @@ tests/apps/vecadd/vecadd
 # Misc
 oclgrind-*.tar.gz
 oclgrind-*.zip
+.clang_complete
 .DS_Store
 *.kdev4
 *.sublime-*
diff --git a/.travis-deps.sh b/.travis-deps.sh
new file mode 100644
index 0000000..628d1f8
--- /dev/null
+++ b/.travis-deps.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+if [ "$TRAVIS_OS_NAME" == "linux" ]
+then
+    # Add repositories
+    sudo add-apt-repository -y 'deb http://apt.llvm.org/trusty/ llvm-toolchain-trusty-3.9 main'
+    wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
+    sudo apt-get update -qq
+
+    # Remove existing LLVM (-y: CI has no terminal to answer the confirmation prompt)
+    sudo apt-get remove -y llvm
+
+    # Install Clang + LLVM
+    sudo apt-get install -y llvm-3.9-dev libclang-3.9-dev clang-3.9
+    sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-3.9 20
+    sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-3.9 20
+    sudo rm -f /usr/local/clang-3.5.0/bin/clang
+    sudo rm -f /usr/local/clang-3.5.0/bin/clang++
+
+    # Other dependencies
+    sudo apt-get install -y libedit-dev
+elif [ "$TRAVIS_OS_NAME" == "osx" ]
+then
+    brew update
+    brew install -v llvm --with-clang
+fi
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..8a94414
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,26 @@
+os:  # Travis CI configuration: operating systems to build on
+  - linux
+  - osx
+
+sudo: required  # .travis-deps.sh installs packages via sudo apt-get
+dist: trusty  # same release as the llvm-toolchain-trusty apt source added by .travis-deps.sh
+osx_image: xcode7.3
+
+language: cpp
+compiler:
+  - gcc
+  - clang
+
+matrix:
+  exclude:  # drop the osx + gcc combination from the os x compiler matrix
+    - os: osx
+      compiler: gcc
+
+before_install:  # install LLVM/Clang and the other build dependencies
+  - bash ./.travis-deps.sh
+
+script:  # configure per OS, build, run tests. NOTE(review): the osx LLVM_DIR pins Homebrew's llvm 3.8.1 Cellar path - confirm it matches what brew installs
+  - if [ "${TRAVIS_OS_NAME}" = "linux" ]; then cmake . -DLLVM_DIR=/usr/lib/llvm-3.9/lib/cmake/llvm ; fi
+  - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then cmake . -DLLVM_DIR=/usr/local/Cellar/llvm/3.8.1/share/llvm/cmake ; fi
+  - make -j 2
+  - ctest --output-on-failure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a35af1e..4838fd5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 # CMakeLists.txt (Oclgrind)
-# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 # University of Bristol. All rights reserved.
 #
 # This program is provided under a three-clause BSD license. For full
@@ -8,23 +8,26 @@
 
 cmake_minimum_required(VERSION 2.8.12)
 project(Oclgrind)
-set(Oclgrind_VERSION_MAJOR 15)
-set(Oclgrind_VERSION_MINOR 5)
+set(Oclgrind_VERSION_MAJOR 16)
+set(Oclgrind_VERSION_MINOR 10)
 
 include(CheckIncludeFiles)
+include(CheckIncludeFileCXX)
 include(CheckLibraryExists)
 
 # Enable C99 for GCC (required for tests)
 if (CMAKE_COMPILER_IS_GNUCC)
-  set(CMAKE_C_FLAGS "-std=c99")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
 endif()
 
 # Enable rpath on OS X
 set(CMAKE_MACOSX_RPATH 1)
 
-# Enable C++11 for Clang/GCC
 if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
-  set(CMAKE_CXX_FLAGS "-std=c++11")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing")
 endif()
 
 # Disable min/max macros on Windows
@@ -34,7 +37,9 @@ endif()
 
 # Suppress warnings from OpenCL runtime API headers
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-attributes -Wno-gcc-compat -Wno-availability")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-attributes")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-gcc-compat")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-availability")
 endif()
 
 
@@ -53,12 +58,65 @@ set(LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
 add_definitions(${LLVM_DEFINITIONS})
 include_directories(${LLVM_INCLUDE_DIRS})
 link_directories(${LLVM_LIBRARY_DIRS})
-set(CLANG ${LLVM_TOOLS_BINARY_DIR}/clang)
 
 # Get LLVM libraries for linking
 llvm_map_components_to_libnames(LLVM_LIBS
   bitreader bitwriter core instrumentation ipo irreader
-  linker mcparser objcarcopts option)
+  linker lto mcparser objcarcopts option target)
+
+if (NOT (${LLVM_PACKAGE_VERSION} VERSION_LESS "3.9"))
+  llvm_map_components_to_libnames(LLVM_COVERAGE coverage)
+  list(APPEND LLVM_LIBS ${LLVM_COVERAGE})
+endif()
+
+if (NOT (${LLVM_PACKAGE_VERSION} VERSION_LESS "4.0"))
+  llvm_map_components_to_libnames(LLVM_COROUTINES coroutines)
+  list(APPEND LLVM_LIBS ${LLVM_COROUTINES})
+endif()
+
+
+# Allow user to set path to Clang installation via CLANG_ROOT (blank or empty means unset)
+set (CLANG_ROOT " " CACHE PATH "Root of Clang installation")
+if (NOT "${CLANG_ROOT}" MATCHES "^ *$")
+  include_directories("${CLANG_ROOT}/include")
+  link_directories("${CLANG_ROOT}/lib")
+  set(CMAKE_REQUIRED_INCLUDES
+      "${CMAKE_REQUIRED_INCLUDES};${CLANG_ROOT}/include")
+endif()
+
+set(CMAKE_REQUIRED_INCLUDES
+    "${CMAKE_REQUIRED_INCLUDES};${LLVM_INCLUDE_DIRS}")
+set(CMAKE_REQUIRED_DEFINITIONS
+    "${CMAKE_REQUIRED_DEFINITIONS};${LLVM_DEFINITIONS}")
+
+# Check for Clang headers
+unset(CLANG_HEADER CACHE)
+find_path(CLANG_HEADER "clang/CodeGen/CodeGenAction.h"
+          PATHS "${CLANG_ROOT}/include" "${LLVM_INCLUDE_DIRS}"
+          NO_DEFAULT_PATH)
+find_path(CLANG_HEADER "clang/CodeGen/CodeGenAction.h")
+if ("${CLANG_HEADER}" STREQUAL "CLANG_HEADER-NOTFOUND")
+  message(FATAL_ERROR "Clang headers not found (set CLANG_ROOT)")
+endif()
+
+# Check for Clang libraries
+unset(CLANG_LIB CACHE)
+find_library(CLANG_LIB "clangFrontend"
+             PATHS "${CLANG_ROOT}/lib" "${LLVM_LIBRARY_DIRS}"
+             NO_DEFAULT_PATH)
+find_library(CLANG_LIB "clangFrontend")
+if ("${CLANG_LIB}" STREQUAL "CLANG_LIB-NOTFOUND")
+  message(FATAL_ERROR "Clang libraries not found (set CLANG_ROOT)")
+endif()
+
+# Check for clang
+find_program(CLANG clang
+             PATHS "${CLANG_ROOT}/bin" "${LLVM_TOOLS_BINARY_DIR}"
+             NO_DEFAULT_PATH)
+find_program(CLANG clang)
+if ("${CLANG}" STREQUAL "CLANG-NOTFOUND")
+  message(FATAL_ERROR "Could not find clang binary")
+endif()
 
 
 # Check for GNU readline library
@@ -73,8 +131,10 @@ if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
 
   check_include_files("stdio.h;readline/readline.h" HAVE_READLINE_H)
   check_include_files("stdio.h;readline/history.h" HAVE_HISTORY_H)
-  check_library_exists(readline readline "${READLINE_DIR}/lib" HAVE_READLINE_LIB)
-  check_library_exists(readline add_history "${READLINE_DIR}/lib" HAVE_HISTORY_LIB)
+  check_library_exists(readline readline
+                       "${READLINE_DIR}/lib" HAVE_READLINE_LIB)
+  check_library_exists(readline add_history
+                       "${READLINE_DIR}/lib" HAVE_HISTORY_LIB)
   if (HAVE_READLINE_H AND HAVE_HISTORY_H AND
       HAVE_READLINE_LIB AND HAVE_HISTORY_LIB)
     set(HAVE_READLINE 1)
@@ -88,6 +148,15 @@ else()
  set(HAVE_READLINE 0)
 endif()
 
+# Check for library directory suffixes
+set(_LIBDIR_SUFFIX "")
+get_property(USING_LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS)
+if (USING_LIB64 AND NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
+  set(_LIBDIR_SUFFIX "64")
+endif()
+set(LIBDIR_SUFFIX "${_LIBDIR_SUFFIX}"
+    CACHE STRING "Suffix for installed library directory")
+
 # Generate stringified clc.h
 add_custom_command(
   OUTPUT src/core/clc_h.cpp
@@ -120,6 +189,7 @@ add_library(oclgrind ${CORE_LIB_TYPE}
   src/core/clc_h.cpp
   src/core/common.cpp
   src/core/Context.cpp
+  src/core/half.cpp
   src/core/Kernel.cpp
   src/core/KernelInvocation.cpp
   src/core/Memory.cpp
@@ -138,12 +208,18 @@ add_library(oclgrind ${CORE_LIB_TYPE}
   src/plugins/MemCheck.h
   src/plugins/MemCheck.cpp
   src/plugins/RaceDetector.h
-  src/plugins/RaceDetector.cpp)
-target_link_libraries(oclgrind ${CORE_EXTRA_LIBS}
-  clangAnalysis clangAST clangBasic clangCodeGen clangDriver clangEdit
-  clangFrontend clangLex clangParse clangSema clangSerialization
+  src/plugins/RaceDetector.cpp
+  src/plugins/Uninitialized.h
+  src/plugins/Uninitialized.cpp)
+target_link_libraries(oclgrind PRIVATE ${CORE_EXTRA_LIBS}
+  clangFrontend clangSerialization clangDriver clangCodeGen
+  clangParse clangSema clangAnalysis clangEdit clangAST clangLex clangBasic
   ${LLVM_LIBS})
 
+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+  target_link_libraries(oclgrind PRIVATE Version)
+endif()
+
 # Sources for OpenCL runtime API frontend
 set(RUNTIME_SOURCES
   src/runtime/async_queue.h
@@ -153,21 +229,26 @@ set(RUNTIME_SOURCES
 
 # Add ICD exports on Windows
 if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
-  list(APPEND RUNTIME_SOURCES src/runtime/icd.def)
+  set(DLL_EXPORTS src/runtime/icd.def)
 endif()
 
-add_library(oclgrind-rt-icd SHARED ${RUNTIME_SOURCES})
+add_library(oclgrind-rt-icd SHARED ${RUNTIME_SOURCES} ${DLL_EXPORTS})
 set_target_properties(oclgrind-rt-icd PROPERTIES COMPILE_FLAGS -DOCLGRIND_ICD)
 target_link_libraries(oclgrind-rt-icd ${CMAKE_DL_LIBS} oclgrind)
 
 # Add runtime exports on Windows
 if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
-  list(APPEND RUNTIME_SOURCES src/runtime/runtime.def)
+  set(DLL_EXPORTS src/runtime/runtime.def)
 endif()
 
-add_library(oclgrind-rt SHARED ${RUNTIME_SOURCES})
+add_library(oclgrind-rt SHARED ${RUNTIME_SOURCES} ${DLL_EXPORTS})
 target_link_libraries(oclgrind-rt ${CMAKE_DL_LIBS} oclgrind)
 
+add_executable(oclgrind-exe src/runtime/oclgrind.cpp)
+set_target_properties(oclgrind-exe PROPERTIES OUTPUT_NAME oclgrind)
+target_compile_definitions(oclgrind-exe PRIVATE
+                           "-DLIBDIR_SUFFIX=\"${LIBDIR_SUFFIX}\"")
+
 add_executable(oclgrind-kernel
   src/kernel/oclgrind-kernel.cpp
   src/kernel/Simulation.h
@@ -190,15 +271,19 @@ add_custom_command(
   DEPENDS src/core/clc.h)
 
 # Generate precompiled headers for clc.h
+set(CLC_SYSROOT "${CMAKE_BINARY_DIR}/include/oclgrind/")
+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+  string(REPLACE "/" "\\" CLC_SYSROOT "${CLC_SYSROOT}")
+endif()
 add_custom_command(
   OUTPUT include/oclgrind/clc32.pch
   POST_BUILD
   COMMAND
     ${CLANG}
-    -cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin
+    -cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin
     -emit-pch -triple spir-unknown-unknown
-    -relocatable-pch -isysroot ${CMAKE_BINARY_DIR}/include/oclgrind/
-    ${CMAKE_BINARY_DIR}/include/oclgrind/clc.h
+    -relocatable-pch -isysroot "${CLC_SYSROOT}"
+    include/oclgrind/clc.h
     -o include/oclgrind/clc32.pch
   DEPENDS include/oclgrind/clc.h
 )
@@ -207,10 +292,10 @@ add_custom_command(
   POST_BUILD
   COMMAND
     ${CLANG}
-    -cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin
+    -cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin
     -emit-pch -triple spir64-unknown-unknown
-    -relocatable-pch -isysroot ${CMAKE_BINARY_DIR}/include/oclgrind/
-    ${CMAKE_BINARY_DIR}/include/oclgrind/clc.h
+    -relocatable-pch -isysroot "${CLC_SYSROOT}"
+    include/oclgrind/clc.h
     -o include/oclgrind/clc64.pch
   DEPENDS include/oclgrind/clc.h
 )
@@ -220,31 +305,20 @@ add_custom_command(
 configure_file("cmake_config.h.in" "config.h")
 
 
-# Install oclgrind script if not on Windows
+# Generate ICD loader if not on Windows
 if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
-  file(READ src/runtime/oclgrind OCLGRIND_SCRIPT)
-  string(REGEX REPLACE
-    "__VERSION__" "${Oclgrind_VERSION_MAJOR}.${Oclgrind_VERSION_MINOR}"
-    OCLGRIND_SCRIPT "${OCLGRIND_SCRIPT}")
-  file(WRITE ${CMAKE_BINARY_DIR}/oclgrind "${OCLGRIND_SCRIPT}")
-
-  # Generate ICD loader
-  get_property(OCLGRIND_RT_FILENAME TARGET oclgrind-rt-icd PROPERTY LOCATION)
-  file(WRITE ${CMAKE_BINARY_DIR}/oclgrind.icd "${OCLGRIND_RT_FILENAME}\n")
-
-  install(PROGRAMS
-    ${CMAKE_BINARY_DIR}/oclgrind
-    DESTINATION bin)
+  file(GENERATE OUTPUT ${CMAKE_BINARY_DIR}/oclgrind.icd
+       CONTENT "$<TARGET_FILE:oclgrind-rt-icd>\n")
 endif()
 
 install(TARGETS
-  oclgrind-kernel
+  oclgrind-exe oclgrind-kernel
   DESTINATION bin)
 install(TARGETS
   oclgrind oclgrind-rt oclgrind-rt-icd
-  DESTINATION lib)
+  DESTINATION "lib${LIBDIR_SUFFIX}")
 install(FILES
-  ${CORE_HEADERS} ${CMAKE_BINARY_DIR}/config.h ${CLC_HEADERS} LICENSE
+  ${CORE_HEADERS} ${CLC_HEADERS}
   DESTINATION include/oclgrind)
 if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
   install(FILES
@@ -269,31 +343,28 @@ enable_testing()
 find_package(PythonInterp)
 if (PYTHONINTERP_FOUND)
 
-  # Add kernel tests
-  file(READ tests/kernels/TESTS KERNEL_TESTS)
-  string(REPLACE "\n" ";" KERNEL_TESTS ${KERNEL_TESTS})
-  foreach(test ${KERNEL_TESTS})
-    add_test(
-      NAME ${test}
-      COMMAND
-      ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/kernels/run_kernel_test.py
-      $<TARGET_FILE:oclgrind-kernel>
-      ${CMAKE_SOURCE_DIR}/tests/kernels/${test}.sim)
-  endforeach(${test})
-
-  # Set PCH directory
-  set_tests_properties(${KERNEL_TESTS} PROPERTIES
-      ENVIRONMENT "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
-
-  # Expected failures
-  set_tests_properties(
-    atomics/atomic_intergroup_race
-    data-race/intragroup_hidden_race
-    PROPERTIES WILL_FAIL TRUE)
+  # Add test directories
+  add_subdirectory(tests/apps)
+  add_subdirectory(tests/kernels)
+  add_subdirectory(tests/runtime)
 
 else()
-  message(WARNING "Kernel tests will not be run (Python required)")
+  message(WARNING "Tests will not be run (Python required)")
 endif()
 
-# Add app tests
-add_subdirectory(tests/apps)
+
+# CPack config
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "OpenCL device simulator")
+set(CPACK_PACKAGE_DESCRIPTION_FILE
+    "${CMAKE_SOURCE_DIR}/src/install/cpack-description")
+set(CPACK_PACKAGE_VENDOR "University of Bristol")
+set(CPACK_PACKAGE_VERSION_MAJOR ${Oclgrind_VERSION_MAJOR})
+set(CPACK_PACKAGE_VERSION_MINOR ${Oclgrind_VERSION_MINOR})
+set(CPACK_PACKAGE_VERSION "${Oclgrind_VERSION_MAJOR}.${Oclgrind_VERSION_MINOR}")
+set(CPACK_PACKAGE_VERSION_PATCH "0")
+
+# CPack RPM config
+set(CPACK_RPM_PACKAGE_GROUP "Development/Tools")
+set(CPACK_RPM_PACKAGE_LICENSE "BSD")
+
+include(CPack)
diff --git a/LICENSE b/LICENSE
index f91a2f2..bec9311 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 University of Bristol. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/Makefile.am b/Makefile.am
index 8fcd00f..144a027 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,5 +1,5 @@
 # Makefile.am (Oclgrind)
-# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 # University of Bristol. All rights reserved.
 #
 # This program is provided under a three-clause BSD license. For full
@@ -9,8 +9,9 @@
 AUTOMAKE_OPTIONS = subdir-objects
 ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4
 
-AM_CFLAGS   = -std=c99
-AM_CPPFLAGS = -I$(top_srcdir)/src/ -Wall
+SUBDIRS = . tests
+
+AM_CPPFLAGS = -I$(top_srcdir)/src/ -Wall -fno-rtti
 
 # Suppress warnings from OpenCL runtime API headers
 if USING_CLANG
@@ -19,23 +20,35 @@ endif USING_CLANG
 
 lib_LTLIBRARIES = liboclgrind.la liboclgrind-rt.la liboclgrind-rt-icd.la
 
-LLVM_LIBS = `$(llvm_config) --system-libs --libs bitreader bitwriter	\
- core instrumentation ipo irreader linker mcparser objcarcopts option`
+
+LLVM_COMPONENTS = bitreader bitwriter core instrumentation ipo	\
+irreader linker mcparser objcarcopts option target
+
+if LLVM_39_OR_NEWER
+LLVM_COMPONENTS += coverage
+endif LLVM_39_OR_NEWER
+
+if LLVM_40_OR_NEWER
+LLVM_COMPONENTS += coroutines
+endif LLVM_40_OR_NEWER
+
+LLVM_LIBS = `$(llvm_config) --system-libs --libs $(LLVM_COMPONENTS)`
 
 liboclgrind_la_SOURCES = src/core/common.h src/core/common.cpp		\
  src/core/Context.h src/core/Context.cpp src/core/half.h		\
- src/core/Kernel.h src/core/Kernel.cpp src/core/KernelInvocation.h	\
- src/core/KernelInvocation.cpp src/core/Memory.h src/core/Memory.cpp	\
- src/core/Plugin.h src/core/Plugin.cpp src/core/Program.h		\
- src/core/Program.cpp src/core/Queue.h src/core/Queue.cpp		\
- src/core/WorkItem.h src/core/WorkItem.cpp				\
- src/core/WorkItemBuiltins.cpp src/core/WorkGroup.h			\
- src/core/WorkGroup.cpp src/plugins/InstructionCounter.h		\
- src/plugins/InstructionCounter.cpp src/plugins/InteractiveDebugger.h	\
- src/plugins/InteractiveDebugger.cpp src/plugins/Logger.h		\
- src/plugins/Logger.cpp src/plugins/MemCheck.h				\
+ src/core/half.cpp src/core/Kernel.h src/core/Kernel.cpp		\
+ src/core/KernelInvocation.h src/core/KernelInvocation.cpp		\
+ src/core/Memory.h src/core/Memory.cpp src/core/Plugin.h		\
+ src/core/Plugin.cpp src/core/Program.h src/core/Program.cpp		\
+ src/core/Queue.h src/core/Queue.cpp src/core/WorkItem.h		\
+ src/core/WorkItem.cpp src/core/WorkItemBuiltins.cpp			\
+ src/core/WorkGroup.h src/core/WorkGroup.cpp				\
+ src/plugins/InstructionCounter.h src/plugins/InstructionCounter.cpp	\
+ src/plugins/InteractiveDebugger.h src/plugins/InteractiveDebugger.cpp	\
+ src/plugins/Logger.h src/plugins/Logger.cpp src/plugins/MemCheck.h	\
  src/plugins/MemCheck.cpp src/plugins/RaceDetector.h			\
- src/plugins/RaceDetector.cpp
+ src/plugins/RaceDetector.cpp src/plugins/Uninitialized.h \
+ src/plugins/Uninitialized.cpp
 nodist_liboclgrind_la_SOURCES = src/core/clc_h.cpp config.h
 liboclgrind_la_LDFLAGS = -lclangFrontend -lclangDriver		\
 -lclangSerialization -lclangCodeGen -lclangParse -lclangSema	\
@@ -45,7 +58,7 @@ oclgrind_includedir = $(includedir)/oclgrind
 oclgrind_include_HEADERS = src/core/common.h src/core/Context.h	\
  src/core/half.h src/core/Kernel.h src/core/KernelInvocation.h	\
  src/core/Memory.h src/core/Plugin.h src/core/Program.h		\
- src/core/Queue.h src/core/WorkItem.h src/core/WorkGroup.h config.h LICENSE
+ src/core/Queue.h src/core/WorkItem.h src/core/WorkGroup.h
 src/core/clc_h.cpp: src/core/gen_clc_h.sh	src/core/clc.h
 	$(top_srcdir)/src/core/gen_clc_h.sh $(top_srcdir)/src/core/clc.h $@
 
@@ -71,16 +84,15 @@ liboclgrind_rt_icd_la_SOURCES = $(RUNTIME_SOURCES)
 liboclgrind_rt_icd_la_LIBADD = liboclgrind.la
 liboclgrind_rt_icd_la_LDFLAGS = -shared
 
-bin_PROGRAMS = oclgrind-kernel
+bin_PROGRAMS = oclgrind oclgrind-kernel
+# oclgrind launcher: quote "" for the shell so LIBDIR_SUFFIX expands to an empty C string literal
+oclgrind_SOURCES = src/runtime/oclgrind.cpp
+oclgrind_CPPFLAGS = -DLIBDIR_SUFFIX='""'
+
 oclgrind_kernel_SOURCES = src/kernel/oclgrind-kernel.cpp	\
  src/kernel/Simulation.h src/kernel/Simulation.cpp
 oclgrind_kernel_LDADD = liboclgrind.la
 
-bin_SCRIPTS = oclgrind
-oclgrind: $(top_srcdir)/src/runtime/oclgrind
-	cat $(top_srcdir)/src/runtime/oclgrind \
-	| $(SED) 's|__VERSION__|'$(VERSION)'|g' \
-	>$@
 noinst_SCRIPTS = oclgrind.icd \
  src/include/oclgrind/clc.h \
  src/include/oclgrind/clc32.pch \
@@ -95,53 +107,27 @@ src/include/oclgrind/clc.h: $(top_srcdir)/src/core/clc.h
 
 src/include/oclgrind/clc32.pch: src/include/oclgrind/clc.h
 	$(clang) \
-		-cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin \
+		-cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin \
 		-emit-pch -triple spir-unknown-unknown \
 		-relocatable-pch \
-                -isysroot $(abs_builddir)/src/include/oclgrind \
+		-isysroot $(abs_builddir)/src/include/oclgrind \
 		$< -o $@
 src/include/oclgrind/clc64.pch: src/include/oclgrind/clc.h
 	$(clang) \
-		-cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin \
+		-cc1 -x cl -cl-std=CL1.2 -O0 -fno-builtin \
 		-emit-pch -triple spir64-unknown-unknown \
 		-relocatable-pch \
-                -isysroot $(abs_builddir)/src/include/oclgrind \
+		-isysroot $(abs_builddir)/src/include/oclgrind \
 		$< -o $@
 
-check_PROGRAMS = tests/apps/vecadd/vecadd
-tests_apps_vecadd_vecadd_LDADD = liboclgrind-rt.la
-TESTS = $(check_PROGRAMS)
-
-TEST_EXTENSIONS = .sim
-SIM_LOG_COMPILER = $(PYTHON)				\
-  $(top_srcdir)/tests/kernels/run_kernel_test.py	\
-  ${abs_top_builddir}/oclgrind-kernel
-AM_TESTS_ENVIRONMENT = \
-  export AM_TESTS=1; \
-  export OCLGRIND_PCH_DIR=$(abs_builddir)/src/include/oclgrind;
-
-if HAVE_PYTHON
-TESTS += $(KERNEL_TESTS)
-XFAIL_TESTS =							\
-	tests/kernels/atomics/atomic_intergroup_race.sim 	\
-	tests/kernels/data-race/intragroup_hidden_race.sim
-else
-check-local:
-	@echo
-	@echo "WARNING: Kernel tests skipped (Python required)."
-	@echo
-endif
-
-EXTRA_DIST = NEWS src/core/gen_clc_h.sh src/core/clc.h			\
- src/runtime/oclgrind src/CL/cl.h src/CL/cl_gl.h src/CL/cl_platform.h	\
- src/CL/cl_ext.h src/CL/cl_gl_ext.h src/CL/cl_egl.h src/CL/cl_d3d10.h	\
+EXTRA_DIST = NEWS src/core/gen_clc_h.sh src/core/clc.h src/CL/cl.h	\
+ src/CL/cl_gl.h src/CL/cl_platform.h src/CL/cl_ext.h			\
+ src/CL/cl_gl_ext.h src/CL/cl_egl.h src/CL/cl_d3d10.h			\
  src/CL/cl_d3d11.h src/CL/cl_dx9_media_sharing.h src/CL/opencl.h	\
- CMakeLists.txt tests/apps/CMakeLists.txt cmake_config.h.in		\
+ CMakeLists.txt tests/apps/CMakeLists.txt tests/kernels/CMakeLists.txt	\
+ tests/runtime/CMakeLists.txt cmake_config.h.in				\
  src/core/gen_clc_h.cmake src/runtime/icd.def src/runtime/runtime.def	\
  src/install/INSTALL.darwin src/install/INSTALL.linux			\
  src/install/INSTALL.windows src/install/install.bat			\
- src/install/uninstall.bat src/install/oclgrind-icd.reg			\
- tests/kernels/run_kernel_test.py tests/kernels/TESTS			\
- $(KERNEL_TEST_INPUTS)
-CLEANFILES = src/core/clc_h.cpp $(bin_SCRIPTS) $(noinst_SCRIPTS)	\
- $(KERNEL_TEST_OUTPUTS)
+ src/install/uninstall.bat src/install/oclgrind-icd.reg
+CLEANFILES = src/core/clc_h.cpp $(noinst_SCRIPTS)
diff --git a/NEWS b/NEWS
index 16766ab..c9c996b 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,23 @@
 For more information, please visit the Oclgrind Wiki:
 https://github.com/jrprice/Oclgrind/wiki
 
+
+Oclgrind 16.10
+==============
+This release incorporates the following changes:
+
+- Added plugin to detect use of uninitialized values (from Moritz Pflanzer)
+- Added memoryMap and memoryUnmap plugin callbacks
+- Added support for LLVM 3.7, 3.8, and 3.9
+- Added oclgrind.exe command on Windows
+- Report invalid uses of mapped buffers inside kernels
+- Report invalid indices when accessing statically sized arrays
+- Improved coverage of race detection plugin
+- Fixed memcheck false-positive when writing to a write-only vector array
+- Oclgrind will now appear with device type (CPU | GPU | ACCELERATOR | DEFAULT)
+- Various minor bug fixes
+
+
 Oclgrind 15.5
 =============
 This release updates to LLVM 3.6, which improves the OpenCL C compiler
diff --git a/README b/README
deleted file mode 100644
index 6a65f57..0000000
--- a/README
+++ /dev/null
@@ -1,138 +0,0 @@
-========
-Oclgrind
-========
-
-About
------
-This project implements a virtual OpenCL device simulator, including
-an OpenCL runtime with ICD support. The goal is to provide a platform
-for creating tools to aid OpenCL development. In particular, this
-project currently implements utilities for debugging memory access
-errors, detecting data-races and barrier divergence, collecting
-instruction histograms, and for interactive OpenCL kernel debugging.
-The simulator is built on an interpreter for LLVM IR. This project is
-being developed by James Price and Simon McIntosh-Smith at the
-University of Bristol.
-
-Binary releases can be found on the GitHub releases page:
-
-  https://github.com/jrprice/Oclgrind/releases
-
-
-Building
---------
-To build this project, you will require the LLVM and Clang 3.6
-development libraries and headers. With some modifications, it may
-also be possible to use other (recent) versions of LLVM. If building
-LLVM from source, it is recommended to enable optimizations to improve
-the performance of Oclgrind (configure with --enable-optimized, or set
-CMAKE_BUILD_TYPE to RelWithDebInfo).
-
-You will also need to use a compiler that supports C++11.
-
-
-Building on Linux and OS X
---------------------------
-If you are building directly from the GitHub repository, you need to
-run 'autoreconf -i' to generate the necessary build files. This is not
-required if you are using a released source package.
-
-Run ./configure to generate the Makefile, optionally using
---prefix=PATH to specify the target installation directory. If you
-don't have the LLVM/Clang includes and libraries on your search path,
-you can specify the location of your LLVM installation using the
---with-llvm=PATH option. For example:
-
-./configure --prefix=$PWD/build/ --with-llvm=PATH/TO/LLVM/INSTALL
-
-This path should be the directory in which LLVM is installed (e.g. the
-path specified to --prefix or CMAKE_INSTALL_PATH when LLVM was built).
-
-Next, build and install with make:
-
-make
-make check
-make install
-
-If installing to a non-default location, you should add the bin/
-directory to the PATH environment variable in order to make use of the
-oclgrind command. If you wish to use Oclgrind via the OpenCL ICD
-(optional), then you should create an ICD loading point by copying the
-oclgrind.icd file from the build directory to /etc/OpenCL/vendors/.
-
-Building on Windows
--------------------
-A CMake build system is provided for building Oclgrind on Windows. At
-present, this only works with Visual Studio 2013 (or newer), and
-Windows 7.
-
-When configuring the CMake build, you may be prompted to supply a
-value for the LLVM_DIR parameter. This should be set to the directory
-containing your LLVM installations's LLVMConfig.cmake file, (for
-example C:\Program Files\LLVM\share\llvm\cmake\).
-
-If you wish to use Oclgrind via the OpenCL ICD (optional), then you
-should also create an ICD loading point. To do this, you should add a
-REG_DWORD value to the Windows Registry under one or both of the
-registry keys below, with the name set to the absolute path of the
-oclgrind-rt-icd.dll library and the value set to 0.
-
-Key for 32-bit machines or 64-bit apps on a 64-bit machine:
-HKEY_LOCAL_MACHINE\SOFTWARE\Khronos\OpenCL\Vendors
-
-Key for 32-bit apps on a 64-bit machine:
-HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Khronos\OpenCL\Vendors
-
-
-Usage
------
-The recommended method of running an application with Oclgrind is to
-use the oclgrind command, for example:
-
-oclgrind ./application
-
-This command will make it such the only OpenCL platform and device
-available to your application is Oclgrind. If you need more control
-over platform selection then installing an ICD loading point for
-Oclgrind will cause it to appear when an application calls
-clGetPlatformIDs(), alongside any other OpenCL platforms installed on
-your system.
-
-If it encounters any invalid memory accesses, Oclgrind will
-report the details to stderr, for example:
-
-> Invalid write of size 4 at global memory address 0x1000000000040
->     Kernel:  vecadd
->     Entity:  Global(16,0,0) Local(0,0,0) Group(16,0,0)
->     store i32 %tmp9, i32 addrspace(1)* %tmp15, align 4
->     At line 4 of input.cl
->       c[i] = a[i] + b[i]
-
-Since it is interpreting an abstract intermediate representation and
-bounds-checking each memory access, Oclgrind will run quite slowly
-(typically a couple of orders of magnitude slower than a regular CPU
-implementation). Therefore, it is recommended to run your application
-with a small problem if possible.
-
-To enable an interactive, GDB-style debugging session, supply the -i
-flag to the oclgrind command, or export the environment variable
-OCLGRIND_INTERACTIVE=1. This will cause Oclgrind to automatically
-break at the beginning of each kernel invocation, and upon
-encountering an invalid memory access. Type 'help' for details of
-available commands.
-
-For more detailed information about using Oclgrind please visit the
-GitHub Wiki:
-
-    https://github.com/jrprice/Oclgrind/wiki/
-
-
-Contact
--------
-If you encounter any issues or have any questions, please use the
-GitHub issues page:
-
-    https://github.com/jrprice/Oclgrind/issues
-
-You can also contact the primary developer via email:
-James Price <j.price at bristol.ac.uk>
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..986c1bd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,196 @@
+========
+Oclgrind
+========
+
+About
+-----
+This project implements a virtual OpenCL device simulator, including
+an OpenCL runtime with ICD support. The goal is to provide a platform
+for creating tools to aid OpenCL development. In particular, this
+project currently implements utilities for debugging memory access
+errors, detecting data-races and barrier divergence, collecting
+instruction histograms, and for interactive OpenCL kernel debugging.
+The simulator is built on an interpreter for LLVM IR. This project is
+being developed by James Price and Simon McIntosh-Smith at the
+University of Bristol.
+
+Binary releases can be found on the GitHub releases page:
+
+  https://github.com/jrprice/Oclgrind/releases
+
+
+Build dependencies
+------------------
+To build this project, you will need LLVM and Clang 3.6 (or newer)
+development libraries and headers. If you build LLVM from source, it
+is recommended to enable optimizations to significantly improve the
+performance of Oclgrind (set `CMAKE_BUILD_TYPE` to `Release` or
+`RelWithDebInfo`).
+
+You will need to use a compiler that supports C++11. Python should
+also be available in order to run the test suite.
+
+
+Building on Linux and OS X (CMake)
+----------------------------------
+The recommended method of building Oclgrind is via CMake.
+
+When configuring the CMake build, you may be prompted to supply a
+value for the `LLVM_DIR` parameter (this shouldn't be necessary if
+LLVM is installed in a standard system location). This should be set
+to the directory containing your LLVM installation's
+`LLVMConfig.cmake` file (typically either
+`${LLVM_ROOT}/lib/cmake/llvm` or `${LLVM_ROOT}/share/llvm/cmake/`).
+If Clang is installed separately to LLVM, then you may also be
+prompted to supply a path for the `CLANG_ROOT` parameter, which should
+be the root of your Clang installation (containing the `bin/`, `lib/`
+and `include/` directories).
+
+A typical CMake command-line might look like this:
+
+    cmake ${OCLGRIND_SOURCE} \
+          -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+          -DCMAKE_INSTALL_PREFIX=${INSTALL_ROOT} \
+          -DLLVM_DIR=${LLVM_ROOT}/lib/cmake/llvm
+
+where `${OCLGRIND_SOURCE}` is the path to the root directory
+containing the Oclgrind source code, `${LLVM_ROOT}` is the path to the
+LLVM installation, and `${INSTALL_ROOT}` is the desired installation
+root directory (this can be omitted if installing to system
+directories).
+
+Next, build and install with make:
+
+    make
+    make test
+    make install
+
+If installing to a non-system location, you should add the `bin/`
+directory to the `PATH` environment variable in order to make use of
+the `oclgrind` command. If you wish to use Oclgrind via the OpenCL ICD
+loader (optional), then you should create an ICD loading point by
+copying the `oclgrind.icd` file from the build directory to
+`/etc/OpenCL/vendors/`.
+
+
+Building on Linux and OS X (autotools)
+--------------------------------------
+An autotools build system is also provided as an alternative to
+CMake. This will likely be removed in a future version of Oclgrind.
+
+If you are building directly from the GitHub repository, you will need
+to run `autoreconf -i` to generate the necessary build files. This is
+not required if you are using a released source package.
+
+Run `configure` to generate the Makefile, optionally using
+`--prefix=PATH` to specify the target installation directory. If you
+don't have the LLVM includes and libraries on your search path, you
+can specify the location of your LLVM installation using the
+`--with-llvm=PATH` option. For example:
+
+    ./configure --prefix=$PWD/build/ --with-llvm=PATH/TO/LLVM/INSTALL
+
+This path should be the directory in which LLVM is installed (e.g. the
+path specified to `--prefix` or `CMAKE_INSTALL_PREFIX` when LLVM was
+built). If the Clang includes and libraries are not on your search
+path or in the same location as LLVM, you can use the
+`--with-clang=PATH` option to specify its root directory.
+
+Next, build and install with make:
+
+    make
+    make check
+    make install
+
+If installing to a non-default location, you should add the `bin/`
+directory to the `PATH` environment variable in order to make use of
+the `oclgrind` command. If you wish to use Oclgrind via the OpenCL ICD
+loader (optional), then you should create an ICD loading point by
+copying the `oclgrind.icd` file from the build directory to
+`/etc/OpenCL/vendors/`.
+
+
+Building on Windows
+-------------------
+Building Oclgrind on Windows requires Visual Studio 2013 (or newer),
+and Windows 7 (or newer). Compiling against recent versions of LLVM
+may require Visual Studio 2015.
+
+When configuring the CMake build, you may be prompted to supply a
+value for the `LLVM_DIR` parameter. This should be set to the
+directory containing your LLVM installation's `LLVMConfig.cmake` file
+(for example `C:\Program Files\LLVM\lib\cmake\llvm`). If Clang is
+installed separately to LLVM, then you may also be prompted to supply
+a path in the `CLANG_ROOT` parameter, which should be the root of your
+Clang installation (containing the `bin/`, `lib/` and `include/`
+directories).
+
+You should add the `bin` directory of the Oclgrind installation to the
+`PATH` environment variable in order to make use of the `oclgrind`
+command. If you wish to use Oclgrind via the OpenCL ICD loader
+(optional), then you should also create an ICD loading point. To do
+this, you should add a `REG_DWORD` value to the Windows Registry under
+one or both of the registry keys below, with the name set to the
+absolute path of the `oclgrind-rt-icd.dll` library and the value set
+to 0.
+
+Key for 32-bit machines or 64-bit apps on a 64-bit machine:
+`HKEY_LOCAL_MACHINE\SOFTWARE\Khronos\OpenCL\Vendors`
+
+Key for 32-bit apps on a 64-bit machine:
+`HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Khronos\OpenCL\Vendors`
+
+
+Usage
+-----
+The recommended method of running an application with Oclgrind is to
+use the `oclgrind` command, for example:
+
+    oclgrind ./application
+
+This command will make it such that the only OpenCL platform and device
+available to your application is Oclgrind. If you need more control
+over platform selection then installing an ICD loading point for
+Oclgrind will cause it to appear when an application calls
+`clGetPlatformIDs()`, alongside any other OpenCL platforms installed
+on your system.
+
+If it encounters any invalid memory accesses, Oclgrind will
+report the details to stderr, for example:
+
+    Invalid write of size 4 at global memory address 0x1000000000040
+        Kernel:  vecadd
+        Entity:  Global(16,0,0) Local(0,0,0) Group(16,0,0)
+        store i32 %tmp9, i32 addrspace(1)* %tmp15, align 4
+        At line 4 of input.cl
+          c[i] = a[i] + b[i]
+
+Since it is interpreting an abstract intermediate representation and
+bounds-checking each memory access, Oclgrind will run quite slowly
+(typically a couple of orders of magnitude slower than a regular CPU
+implementation). Therefore, it is recommended to run your application
+with a small problem if possible.
+
+To enable an interactive, GDB-style debugging session, supply the `-i`
+flag to the oclgrind command, or export the environment variable
+`OCLGRIND_INTERACTIVE=1`. This will cause Oclgrind to automatically
+break at the beginning of each kernel invocation, and upon
+encountering an invalid memory access. Type `help` for details of
+available commands.
+
+For more detailed information about using Oclgrind please visit the
+GitHub Wiki:
+
+  https://github.com/jrprice/Oclgrind/wiki/
+
+
+Contact
+-------
+If you encounter any issues or have any questions, please use the
+GitHub issues page:
+
+  https://github.com/jrprice/Oclgrind/issues
+
+You can also contact the primary developer via email:
+
+  James Price `<j.price at bristol.ac.uk>`
diff --git a/configure.ac b/configure.ac
index 4b4c793..3da5e46 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,12 +1,13 @@
 # configure.ac (Oclgrind)
-# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 # University of Bristol. All rights reserved.
 #
 # This program is provided under a three-clause BSD license. For full
 # license terms please see the LICENSE file distributed with this
 # source code.
 
-AC_INIT([Oclgrind], [15.5], , [oclgrind], [https://github.com/jrprice/Oclgrind])
+AC_INIT([Oclgrind], [16.10], ,
+        [oclgrind], [https://github.com/jrprice/Oclgrind])
 AC_PREREQ([2.63])
 AC_CONFIG_SRCDIR([src/])
 AM_INIT_AUTOMAKE([foreign 1.12])
@@ -14,7 +15,7 @@ AC_LANG(C++)
 AC_PROG_CXX
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_HEADERS([config.h])
-AC_CONFIG_FILES([Makefile])
+AC_CONFIG_FILES([Makefile tests/Makefile])
 
 LT_INIT
 
@@ -28,18 +29,16 @@ oclgrind_extra_libs=
 AX_CHECK_COMPILE_FLAG([-std=c++11], [],
                       [AC_MSG_ERROR([C++11 support is required])])
 CXXFLAGS="$CXXFLAGS -std=c++11"
-CPPFLAGS="$CPPFLAGS -std=c++11"
 
-# --with-llvm option to specify root of LLVM/Clang installation
+
+# --with-llvm option to specify root of LLVM installation
 AC_ARG_WITH(
         llvm,
         [AS_HELP_STRING([--with-llvm],
-                       [directory containing LLVM/Clang installation])],
-        [AC_SUBST(clang, $withval/bin/clang)
-         AC_SUBST(llvm_config, $withval/bin/llvm-config)])
+                       [directory containing LLVM installation])],
+        [export PATH="$withval/bin:$PATH"])
 
-# Find LLVM/Clang binaries (assume on PATH if --with-llvm not used)
-AC_CHECK_PROG(clang, [clang], `which clang`)
+# Get path to llvm-config
 AC_CHECK_PROG(llvm_config, [llvm-config], `which llvm-config`)
 if test -z $llvm_config; then
   AC_MSG_ERROR([llvm-config not found (use --with-llvm=)])
@@ -53,6 +52,8 @@ AC_MSG_RESULT($llvm_full_version)
 if test $llvm_version -lt 36; then
   AC_MSG_ERROR([LLVM version must be >= 3.6])
 fi
+AM_CONDITIONAL([LLVM_39_OR_NEWER], [test $llvm_version -ge 39])
+AM_CONDITIONAL([LLVM_40_OR_NEWER], [test $llvm_version -ge 40])
 AC_DEFINE_UNQUOTED([LLVM_VERSION],
                    [$llvm_version],
                    [Version of LLVM we are building against])
@@ -61,17 +62,33 @@ AC_DEFINE_UNQUOTED([LLVM_VERSION],
 CPPFLAGS="$CPPFLAGS `$llvm_config --cppflags`"
 LDFLAGS="$LDFLAGS `$llvm_config --ldflags`"
 
-# Check for LLVM/Clang headers/libraries
+
+# --with-clang option to specify root of Clang installation
+AC_ARG_WITH(
+        clang,
+        [AS_HELP_STRING([--with-clang],
+                       [directory containing Clang installation])],
+        [export PATH="$withval/bin:$PATH"
+         CPPFLAGS="$CPPFLAGS -I$withval/include/"
+         LDFLAGS="$LDFLAGS -L$withval/lib/"])
+
+CPPFLAGS_old="$CPPFLAGS"
+CPPFLAGS="$CPPFLAGS -std=c++11"
+
+# Check for Clang binaries, headers and libraries
+AC_CHECK_PROG(clang, [clang], `which clang`)
 AC_CHECK_HEADERS(
-        [llvm/IR/Instruction.h clang/CodeGen/CodeGenAction.h],
+        [clang/CodeGen/CodeGenAction.h],
         [:],
-        [AC_MSG_ERROR([LLVM/Clang includes not found (use --with-llvm=)])])
-
+        [AC_MSG_ERROR([Clang headers not found (use --with-clang=)])])
 AC_CHECK_LIB(
         [clangFrontend],
         [main],
         [:],
-        [AC_MSG_ERROR([Clang library not found (use --with-llvm)])])
+        [AC_MSG_ERROR([Clang libraries not found (use --with-clang)])])
+
+CPPFLAGS="$CPPFLAGS_old"
+
 
 # GNU readline library (for interactive debugger)
 AC_ARG_WITH(
@@ -118,17 +135,14 @@ AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :])
 # Kernel tests
 KERNEL_TESTS=""
 KERNEL_TEST_INPUTS=""
-KERNEL_TEST_OUTPUTS=""
 m4_foreach([name], m4_split(m4_include(tests/kernels/TESTS), m4_newline),
 [
-    KERNEL_TESTS="$KERNEL_TESTS tests/kernels/"name".sim"
-    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS tests/kernels/"name".sim"
-    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS tests/kernels/"name".cl"
-    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS tests/kernels/"name".ref"
-    KERNEL_TEST_OUTPUTS="$KERNEL_TEST_OUTPUTS tests/kernels/"name".out"
+    KERNEL_TESTS="$KERNEL_TESTS kernels/"name".sim"
+    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS kernels/"name".sim"
+    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS kernels/"name".cl"
+    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS kernels/"name".ref"
 ])
 AC_SUBST(KERNEL_TESTS, $KERNEL_TESTS)
 AC_SUBST(KERNEL_TEST_INPUTS, $KERNEL_TEST_INPUTS)
-AC_SUBST(KERNEL_TEST_OUTPUTS, $KERNEL_TEST_OUTPUTS)
 
 AC_OUTPUT
diff --git a/src/core/Context.cpp b/src/core/Context.cpp
index 6a8b4ff..e6fc415 100644
--- a/src/core/Context.cpp
+++ b/src/core/Context.cpp
@@ -1,11 +1,12 @@
 // Context.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
 // license terms please see the LICENSE file distributed with this
 // source code.
 
+#include "config.h"
 #include "common.h"
 
 #if defined(_WIN32) && !defined(__MINGW32__)
@@ -17,6 +18,7 @@
 
 #include <mutex>
 
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Instruction.h"
 
@@ -33,13 +35,17 @@
 #include "plugins/Logger.h"
 #include "plugins/MemCheck.h"
 #include "plugins/RaceDetector.h"
+#include "plugins/Uninitialized.h"
 
 using namespace oclgrind;
 using namespace std;
 
 Context::Context()
 {
-  m_globalMemory = new Memory(AddrSpaceGlobal, this);
+  m_llvmContext = new llvm::LLVMContext;
+
+  m_globalMemory = new Memory(AddrSpaceGlobal, sizeof(size_t)==8 ? 16 : 8,
+                              this);
   m_kernelInvocation = NULL;
 
   loadPlugins();
@@ -47,6 +53,7 @@ Context::Context()
 
 Context::~Context()
 {
+  delete m_llvmContext;
   delete m_globalMemory;
 
   unloadPlugins();
@@ -67,6 +74,11 @@ Memory* Context::getGlobalMemory() const
   return m_globalMemory;
 }
 
+llvm::LLVMContext* Context::getLLVMContext() const
+{
+  return m_llvmContext;
+}
+
 void Context::loadPlugins()
 {
   // Create core plugins
@@ -79,6 +91,9 @@ void Context::loadPlugins()
   if (checkEnv("OCLGRIND_DATA_RACES"))
     m_plugins.push_back(make_pair(new RaceDetector(this), true));
 
+  if (checkEnv("OCLGRIND_UNINITIALIZED"))
+    m_plugins.push_back(make_pair(new Uninitialized(this), true));
+
   if (checkEnv("OCLGRIND_INTERACTIVE"))
     m_plugins.push_back(make_pair(new InteractiveDebugger(this), true));
 
@@ -92,7 +107,7 @@ void Context::loadPlugins()
     while(std::getline(ss, libpath, ':'))
     {
 #if defined(_WIN32) && !defined(__MINGW32__)
-      HMODULE library = LoadLibrary(libpath.c_str());
+      HMODULE library = LoadLibraryA(libpath.c_str());
       if (!library)
       {
         cerr << "Loading Oclgrind plugin failed (LoadLibrary): "
@@ -221,9 +236,10 @@ void Context::notifyKernelEnd(const KernelInvocation *kernelInvocation) const
 }
 
 void Context::notifyMemoryAllocated(const Memory *memory, size_t address,
-                                    size_t size, cl_mem_flags flags) const
+                                    size_t size, cl_mem_flags flags,
+                                    const uint8_t *initData) const
 {
-  NOTIFY(memoryAllocated, memory, address, size, flags);
+  NOTIFY(memoryAllocated, memory, address, size, flags, initData);
 }
 
 void Context::notifyMemoryAtomicLoad(const Memory *memory, AtomicOp op,
@@ -274,6 +290,13 @@ void Context::notifyMemoryLoad(const Memory *memory, size_t address,
   }
 }
 
+void Context::notifyMemoryMap(const Memory *memory, size_t address,
+                              size_t offset, size_t size,
+                              cl_map_flags flags) const
+{
+  NOTIFY(memoryMap, memory, address, offset, size, flags);
+}
+
 void Context::notifyMemoryStore(const Memory *memory, size_t address,
                                 size_t size, const uint8_t *storeData) const
 {
@@ -301,6 +324,12 @@ void Context::notifyMessage(MessageType type, const char *message) const
   NOTIFY(log, type, message);
 }
 
+void Context::notifyMemoryUnmap(const Memory *memory, size_t address,
+                                const void *ptr) const
+{
+  NOTIFY(memoryUnmap, memory, address, ptr);
+}
+
 void Context::notifyWorkGroupBarrier(const WorkGroup *workGroup,
                                      uint32_t flags) const
 {
diff --git a/src/core/Context.h b/src/core/Context.h
index 41be6c7..131caab 100644
--- a/src/core/Context.h
+++ b/src/core/Context.h
@@ -1,5 +1,5 @@
 // Context.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -8,6 +8,11 @@
 
 #include "common.h"
 
+namespace llvm
+{
+  class LLVMContext;
+}
+
 namespace oclgrind
 {
   class KernelInvocation;
@@ -26,6 +31,7 @@ namespace oclgrind
     virtual ~Context();
 
     Memory* getGlobalMemory() const;
+    llvm::LLVMContext* getLLVMContext() const;
     bool isThreadSafe() const;
     void logError(const char* error) const;
 
@@ -36,7 +42,8 @@ namespace oclgrind
     void notifyKernelBegin(const KernelInvocation *kernelInvocation) const;
     void notifyKernelEnd(const KernelInvocation *kernelInvocation) const;
     void notifyMemoryAllocated(const Memory *memory, size_t address,
-                               size_t size, cl_mem_flags flags) const;
+                               size_t size, cl_mem_flags flags,
+                               const uint8_t *initData) const;
     void notifyMemoryAtomicLoad(const Memory *memory, AtomicOp op,
                                 size_t address, size_t size) const;
     void notifyMemoryAtomicStore(const Memory *memory, AtomicOp op,
@@ -44,9 +51,13 @@ namespace oclgrind
     void notifyMemoryDeallocated(const Memory *memory, size_t address) const;
     void notifyMemoryLoad(const Memory *memory, size_t address,
                           size_t size) const;
+    void notifyMemoryMap(const Memory *memory, size_t address,
+                         size_t offset, size_t size, cl_map_flags flags) const;
     void notifyMemoryStore(const Memory *memory, size_t address, size_t size,
                            const uint8_t *storeData) const;
     void notifyMessage(MessageType type, const char *message) const;
+    void notifyMemoryUnmap(const Memory *memory, size_t address,
+                           const void *ptr) const;
     void notifyWorkGroupBarrier(const WorkGroup *workGroup,
                                 uint32_t flags) const;
     void notifyWorkGroupBegin(const WorkGroup *workGroup) const;
@@ -68,6 +79,8 @@ namespace oclgrind
     void loadPlugins();
     void unloadPlugins();
 
+    llvm::LLVMContext *m_llvmContext;
+
   public:
     class Message
     {
diff --git a/src/core/Kernel.cpp b/src/core/Kernel.cpp
index ab2741e..2ca8393 100644
--- a/src/core/Kernel.cpp
+++ b/src/core/Kernel.cpp
@@ -1,12 +1,14 @@
 // Kernel.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
 // license terms please see the LICENSE file distributed with this
 // source code.
 
+#include "config.h"
 #include "common.h"
+
 #include <sstream>
 
 #include "llvm/IR/Constants.h"
@@ -25,9 +27,6 @@ Kernel::Kernel(const Program *program,
                const llvm::Function *function, const llvm::Module *module)
  : m_program(program), m_function(function), m_name(function->getName())
 {
-  m_localMemory = new Memory(AddrSpaceLocal, program->getContext());
-  m_privateMemory = new Memory(AddrSpacePrivate, program->getContext());
-
   // Set-up global variables
   llvm::Module::const_global_iterator itr;
   for (itr = module->global_begin(); itr != module->global_end(); itr++)
@@ -37,41 +36,25 @@ Kernel::Kernel(const Program *program,
     {
     case AddrSpacePrivate:
     {
+      // Get initializer data
       const llvm::Constant *init = itr->getInitializer();
-
-      // Allocate private memory for variable
       unsigned size = getTypeSize(init->getType());
-      size_t address = m_privateMemory->allocateBuffer(size);
-
-      // Initialize variable
-      void *ptr = m_privateMemory->getPointer(address);
-      getConstantData((unsigned char*)ptr, init);
-
-      TypedValue value =
-      {
-        sizeof(size_t),
-        1,
-        new unsigned char[sizeof(size_t)]
-      };
-      value.setPointer(address);
-      m_arguments[itr] = value;
+      TypedValue value = {size, 1, new uint8_t[size]};
+      getConstantData(value.data, init);
+      m_values[&*itr] = value;
 
       break;
     }
     case AddrSpaceConstant:
-      m_constants.push_back(itr);
+      m_constants.push_back(&*itr);
       break;
     case AddrSpaceLocal:
     {
-      // Allocate buffer
-      unsigned size = getTypeSize(itr->getInitializer()->getType());
-      TypedValue v = {
-        sizeof(size_t),
-        1,
-        new unsigned char[sizeof(size_t)]
+      // Get size of allocation
+      TypedValue allocSize = {
+        getTypeSize(itr->getInitializer()->getType()), 1, NULL
       };
-      v.setPointer(m_localMemory->allocateBuffer(size));
-      m_arguments[itr] = v;
+      m_values[&*itr] = allocSize;
 
       break;
     }
@@ -111,26 +94,19 @@ Kernel::Kernel(const Kernel& kernel)
   m_function = kernel.m_function;
   m_constants = kernel.m_constants;
   m_constantBuffers = kernel.m_constantBuffers;
-  m_localMemory = kernel.m_localMemory->clone();
-  m_privateMemory = kernel.m_privateMemory->clone();
   m_name = kernel.m_name;
   m_metadata = kernel.m_metadata;
 
-  TypedValueMap::const_iterator itr;
-  for (itr = kernel.m_arguments.begin();
-       itr != kernel.m_arguments.end(); itr++)
+  for (auto itr = kernel.m_values.begin(); itr != kernel.m_values.end(); itr++)
   {
-    m_arguments[itr->first] = itr->second.clone();
+    m_values[itr->first] = itr->second.clone();
   }
 }
 
 Kernel::~Kernel()
 {
-  delete m_localMemory;
-  delete m_privateMemory;
-
   TypedValueMap::iterator itr;
-  for (itr = m_arguments.begin(); itr != m_arguments.end(); itr++)
+  for (itr = m_values.begin(); itr != m_values.end(); itr++)
   {
     delete[] itr->second.data;
   }
@@ -141,7 +117,7 @@ bool Kernel::allArgumentsSet() const
   llvm::Function::const_arg_iterator itr;
   for (itr = m_function->arg_begin(); itr != m_function->arg_end(); itr++)
   {
-    if (!m_arguments.count(itr))
+    if (!m_values.count(&*itr))
     {
       return false;
     }
@@ -157,22 +133,29 @@ void Kernel::allocateConstants(Memory *memory)
     const llvm::Constant *initializer = (*itr)->getInitializer();
     const llvm::Type *type = initializer->getType();
 
-    // Allocate buffer
+    // Deallocate existing pointer
+    if (m_values.count(*itr))
+    {
+      delete[] m_values[*itr].data;
+    }
+
+    // Get initializer data
     unsigned size = getTypeSize(type);
-    TypedValue v = {
+    unsigned char *data = new unsigned char[size];
+    getConstantData(data, (const llvm::Constant*)initializer);
+
+    // Allocate buffer
+    TypedValue address = {
       sizeof(size_t),
       1,
       new unsigned char[sizeof(size_t)]
     };
-    size_t address = memory->allocateBuffer(size);
-    v.setPointer(address);
-    m_constantBuffers.push_back(address);
-    m_arguments[*itr] = v;
+    size_t ptr = memory->allocateBuffer(size, 0, data);
+    address.setPointer(ptr);
+
+    m_values[*itr] = address;
+    m_constantBuffers.push_back(ptr);
 
-    // Initialise buffer contents
-    unsigned char *data = new unsigned char[size];
-    getConstantData(data, (const llvm::Constant*)initializer);
-    memory->store(data, address, size);
     delete[] data;
   }
 }
@@ -196,23 +179,23 @@ const llvm::Argument* Kernel::getArgument(unsigned int index) const
   {
     argItr++;
   }
-  return argItr;
+  return &*argItr;
 }
 
 unsigned int Kernel::getArgumentAccessQualifier(unsigned int index) const
 {
   assert(index < getNumArguments());
 
-  // Get metadata node
-  const llvm::MDNode *node = getArgumentMetadata("kernel_arg_access_qual");
-  if (!node)
+  // Get metadata
+  const llvm::Metadata *md =
+    getArgumentMetadata("kernel_arg_access_qual", index);
+  if (!md)
   {
     return -1;
   }
 
   // Get qualifier string
-  llvm::MDString *str
-    = llvm::dyn_cast<llvm::MDString>(node->getOperand(index+1));
+  const llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(md);
   string access = str->getString();
   if (access == "read_only")
   {
@@ -233,15 +216,15 @@ unsigned int Kernel::getArgumentAddressQualifier(unsigned int index) const
 {
   assert(index < getNumArguments());
 
-  // Get metadata node
-  const llvm::MDNode *node = getArgumentMetadata("kernel_arg_addr_space");
-  if (!node)
+  // Get metadata
+  const llvm::Metadata *md =
+    getArgumentMetadata("kernel_arg_addr_space", index);
+  if (!md)
   {
     return -1;
   }
 
-  // Get address space
-  switch(getMDOpAsConstInt(node->getOperand(index+1))->getZExtValue())
+  switch(getMDAsConstInt(md)->getZExtValue())
   {
     case AddrSpacePrivate:
       return CL_KERNEL_ARG_ADDRESS_PRIVATE;
@@ -256,8 +239,10 @@ unsigned int Kernel::getArgumentAddressQualifier(unsigned int index) const
   }
 }
 
-const llvm::MDNode* Kernel::getArgumentMetadata(string name) const
+const llvm::Metadata* Kernel::getArgumentMetadata(string name,
+                                                  unsigned int index) const
 {
+#if LLVM_VERSION < 39
   if (!m_metadata)
   {
     return NULL;
@@ -273,11 +258,17 @@ const llvm::MDNode* Kernel::getArgumentMetadata(string name) const
       if (node->getNumOperands() > 0 &&
           ((llvm::MDString*)(node->getOperand(0).get()))->getString() == name)
       {
-        return node;
+        return node->getOperand(index+1).get();
       }
     }
   }
   return NULL;
+#else
+  llvm::MDNode *node = m_function->getMetadata(name);
+  if (!node)
+    return NULL;
+  return node->getOperand(index);
+#endif
 }
 
 const llvm::StringRef Kernel::getArgumentName(unsigned int index) const
@@ -289,30 +280,37 @@ const llvm::StringRef Kernel::getArgumentTypeName(unsigned int index) const
 {
   assert(index < getNumArguments());
 
-  // Get metadata node
-  const llvm::MDNode *node = getArgumentMetadata("kernel_arg_type");
-  if (!node)
+  // Get metadata
+  const llvm::Metadata *md = getArgumentMetadata("kernel_arg_type", index);
+  if (!md)
   {
     return "";
   }
 
-  return llvm::dyn_cast<llvm::MDString>(node->getOperand(index+1))->getString();
+  llvm::StringRef name = llvm::dyn_cast<llvm::MDString>(md)->getString();
+#if LLVM_VERSION >= 39
+  size_t imgStart = name.find(" image");
+  if (imgStart != llvm::StringRef::npos)
+  {
+    name = name.substr(imgStart+1);
+  }
+#endif
+  return name;
 }
 
 unsigned int Kernel::getArgumentTypeQualifier(unsigned int index) const
 {
   assert(index < getNumArguments());
 
-  // Get metadata node
-  const llvm::MDNode *node = getArgumentMetadata("kernel_arg_type_qual");
-  if (!node)
+  // Get metadata
+  const llvm::Metadata *md = getArgumentMetadata("kernel_arg_type_qual", index);
+  if (!md)
   {
     return -1;
   }
 
   // Get qualifiers
-  llvm::MDString *str =
-    llvm::dyn_cast<llvm::MDString>(node->getOperand(index+1));
+  const llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(md);
   istringstream iss(str->getString().str());
 
   unsigned int result = CL_KERNEL_ARG_TYPE_NONE;
@@ -368,12 +366,9 @@ string Kernel::getAttributes() const
           name == "work_group_size_hint")
       {
         attributes << name << "("
-                   <<
-          getMDOpAsConstInt(val->getOperand(1))->getZExtValue()
-                   << "," <<
-          getMDOpAsConstInt(val->getOperand(2))->getZExtValue()
-                   << "," <<
-          getMDOpAsConstInt(val->getOperand(3))->getZExtValue()
+                   <<        getMDAsConstInt(val->getOperand(1))->getZExtValue()
+                   << "," << getMDAsConstInt(val->getOperand(2))->getZExtValue()
+                   << "," << getMDAsConstInt(val->getOperand(3))->getZExtValue()
                    << ") ";
       }
       else if (name == "vec_type_hint")
@@ -406,14 +401,18 @@ const llvm::Function* Kernel::getFunction() const
   return m_function;
 }
 
-const Memory* Kernel::getLocalMemory() const
-{
-  return m_localMemory;
-}
-
 size_t Kernel::getLocalMemorySize() const
 {
-  return m_localMemory->getTotalAllocated();
+  size_t sz = 0;
+  for (auto value = m_values.begin(); value != m_values.end(); value++)
+  {
+    const llvm::Type *type = value->first->getType();
+    if (type->isPointerTy() && type->getPointerAddressSpace() == AddrSpaceLocal)
+    {
+      sz += value->second.size;
+    }
+  }
+  return sz;
 }
 
 const std::string& Kernel::getName() const
@@ -426,11 +425,6 @@ unsigned int Kernel::getNumArguments() const
   return m_function->arg_size();
 }
 
-const Memory* Kernel::getPrivateMemory() const
-{
-  return m_privateMemory;
-}
-
 const Program* Kernel::getProgram() const
 {
   return m_program;
@@ -439,22 +433,11 @@ const Program* Kernel::getProgram() const
 void Kernel::getRequiredWorkGroupSize(size_t reqdWorkGroupSize[3]) const
 {
   memset(reqdWorkGroupSize, 0, 3*sizeof(size_t));
-  for (unsigned i = 0; i < m_metadata->getNumOperands(); i++)
+  for (int j = 0; j < 3; j++)
   {
-    const llvm::MDOperand& op = m_metadata->getOperand(i);
-    if (llvm::MDNode *val = llvm::dyn_cast<llvm::MDNode>(op.get()))
-    {
-      llvm::MDString *str =
-        llvm::dyn_cast<llvm::MDString>(val->getOperand(0).get());
-      if (str->getString() == "reqd_work_group_size")
-      {
-        for (int j = 0; j < 3; j++)
-        {
-          reqdWorkGroupSize[j] =
-            getMDOpAsConstInt(val->getOperand(j+1))->getZExtValue();
-        }
-      }
-    }
+    const llvm::Metadata *md = getArgumentMetadata("reqd_work_group_size", j);
+    if (md)
+      reqdWorkGroupSize[j] = getMDAsConstInt(md)->getZExtValue();
   }
 }
 
@@ -463,72 +446,22 @@ void Kernel::setArgument(unsigned int index, TypedValue value)
   assert(index < m_function->arg_size());
 
   const llvm::Value *argument = getArgument(index);
-  unsigned int type = getArgumentAddressQualifier(index);
-  if (type == CL_KERNEL_ARG_ADDRESS_LOCAL)
-  {
-    // Deallocate existing argument
-    if (m_arguments.count(argument))
-    {
-      m_localMemory->deallocateBuffer(m_arguments[argument].getPointer());
-      delete[] m_arguments[argument].data;
-    }
 
-    // Allocate local memory buffer
-    TypedValue v = {
-      sizeof(size_t),
-      1,
-      new unsigned char[sizeof(size_t)]
-    };
-    v.setPointer(m_localMemory->allocateBuffer(value.size));
-    m_arguments[argument] = v;
-  }
-  else
+  // Deallocate existing argument
+  if (m_values.count(argument))
   {
-    if (((const llvm::Argument*)argument)->hasByValAttr())
-    {
-      // Deallocate existing argument
-      if (m_arguments.count(argument))
-      {
-        m_privateMemory->deallocateBuffer(m_arguments[argument].getPointer());
-        delete[] m_arguments[argument].data;
-      }
-
-      TypedValue address =
-      {
-        sizeof(size_t),
-        1,
-        new unsigned char[sizeof(size_t)]
-      };
-      size_t size = value.size*value.num;
-      address.setPointer(m_privateMemory->allocateBuffer(size));
-      m_privateMemory->store(value.data, address.getPointer(), size);
-      m_arguments[argument] = address;
-    }
-    else
-    {
-      // Deallocate existing argument
-      if (m_arguments.count(argument))
-      {
-        delete[] m_arguments[argument].data;
-      }
-
-      const llvm::Type *type = argument->getType();
-      if (type->isVectorTy())
-      {
-        value.num = type->getVectorNumElements();
-        value.size = getTypeSize(type->getVectorElementType());
-      }
-      m_arguments[argument] = value.clone();
-    }
+    delete[] m_values[argument].data;
   }
+
+  m_values[argument] = value.clone();
 }
 
-TypedValueMap::const_iterator Kernel::args_begin() const
+TypedValueMap::const_iterator Kernel::values_begin() const
 {
-  return m_arguments.begin();
+  return m_values.begin();
 }
 
-TypedValueMap::const_iterator Kernel::args_end() const
+TypedValueMap::const_iterator Kernel::values_end() const
 {
-  return m_arguments.end();
+  return m_values.end();
 }
diff --git a/src/core/Kernel.h b/src/core/Kernel.h
index cf94e90..34755a1 100644
--- a/src/core/Kernel.h
+++ b/src/core/Kernel.h
@@ -1,5 +1,5 @@
 // Kernel.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -17,6 +17,7 @@ namespace llvm
   class Function;
   class GlobalVariable;
   class MDNode;
+  class Metadata;
   class Module;
 }
 
@@ -33,24 +34,22 @@ namespace oclgrind
     Kernel(const Kernel& kernel);
     virtual ~Kernel();
 
-    TypedValueMap::const_iterator args_begin() const;
-    TypedValueMap::const_iterator args_end() const;
+    TypedValueMap::const_iterator values_begin() const;
+    TypedValueMap::const_iterator values_end() const;
     bool allArgumentsSet() const;
     void allocateConstants(Memory *memory);
     void deallocateConstants(Memory *memory);
-    size_t getArgumentSize(unsigned int index) const;
     unsigned int getArgumentAccessQualifier(unsigned int index) const;
     unsigned int getArgumentAddressQualifier(unsigned int index) const;
     const llvm::StringRef getArgumentName(unsigned int index) const;
+    size_t getArgumentSize(unsigned int index) const;
     const llvm::StringRef getArgumentTypeName(unsigned int index) const;
     unsigned int getArgumentTypeQualifier(unsigned int index) const;
     std::string getAttributes() const;
     const llvm::Function* getFunction() const;
-    const Memory* getLocalMemory() const;
     size_t getLocalMemorySize() const;
     const std::string& getName() const;
     unsigned int getNumArguments() const;
-    const Memory* getPrivateMemory() const;
     const Program* getProgram() const;
     void getRequiredWorkGroupSize(size_t reqdWorkGroupSize[3]) const;
     void setArgument(unsigned int index, TypedValue value);
@@ -58,15 +57,15 @@ namespace oclgrind
   private:
     const Program *m_program;
     const llvm::Function *m_function;
-    TypedValueMap m_arguments;
     std::list<const llvm::GlobalVariable*> m_constants;
     std::list<size_t> m_constantBuffers;
-    Memory *m_localMemory;
     const llvm::MDNode *m_metadata;
     std::string m_name;
-    Memory *m_privateMemory;
+
+    TypedValueMap m_values;
 
     const llvm::Argument* getArgument(unsigned int index) const;
-    const llvm::MDNode* getArgumentMetadata(std::string name) const;
+    const llvm::Metadata* getArgumentMetadata(std::string name,
+                                              unsigned int index) const;
   };
 }
diff --git a/src/core/KernelInvocation.cpp b/src/core/KernelInvocation.cpp
index 3d50031..a54f865 100644
--- a/src/core/KernelInvocation.cpp
+++ b/src/core/KernelInvocation.cpp
@@ -1,5 +1,5 @@
 // KernelInvocation.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -22,14 +22,6 @@
 using namespace oclgrind;
 using namespace std;
 
-// TODO: Remove this when thread_local fixed on OS X
-#ifdef __APPLE__
-#define THREAD_LOCAL __thread
-#elif defined(_WIN32) && !defined(__MINGW32__)
-#define THREAD_LOCAL __declspec(thread)
-#else
-#define THREAD_LOCAL thread_local
-#endif
 struct
 {
   WorkGroup *workGroup;
@@ -80,7 +72,8 @@ KernelInvocation::KernelInvocation(const Context *context, const Kernel *kernel,
     Size3 firstGroup(0, 0, 0);
     Size3 lastGroup(m_numGroups.x-1, m_numGroups.y-1, m_numGroups.z-1);
     m_workGroups.push_back(firstGroup);
-    m_workGroups.push_back(lastGroup);
+    if (lastGroup != firstGroup)
+      m_workGroups.push_back(lastGroup);
   }
   else
   {
@@ -166,7 +159,7 @@ void KernelInvocation::run(const Context *context, Kernel *kernel,
   catch (FatalError& err)
   {
     ostringstream info;
-    info << endl << "OCLGRIND FATAL ERROR "
+    info << "OCLGRIND FATAL ERROR "
          << "(" << err.getFile() << ":" << err.getLine() << ")"
          << endl << err.what()
          << endl << "When allocating kernel constants for '"
@@ -272,7 +265,7 @@ void KernelInvocation::runWorker()
   catch (FatalError& err)
   {
     ostringstream info;
-    info << endl << "OCLGRIND FATAL ERROR "
+    info << "OCLGRIND FATAL ERROR "
          << "(" << err.getFile() << ":" << err.getLine() << ")"
          << endl << err.what();
     m_context->logError(info.str().c_str());
@@ -324,6 +317,7 @@ bool KernelInvocation::switchWorkItem(const Size3 gid)
      if (group == *pItr)
      {
        workerState.workGroup = new WorkGroup(this, group);
+       m_context->notifyWorkGroupBegin(workerState.workGroup);
        found = true;
 
        // Re-order list of groups accordingly
diff --git a/src/core/KernelInvocation.h b/src/core/KernelInvocation.h
index 4f02447..edca291 100644
--- a/src/core/KernelInvocation.h
+++ b/src/core/KernelInvocation.h
@@ -1,5 +1,5 @@
 // KernelInvocation.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/core/Memory.cpp b/src/core/Memory.cpp
index cd33bc4..289badc 100644
--- a/src/core/Memory.cpp
+++ b/src/core/Memory.cpp
@@ -1,5 +1,5 @@
 // Memory.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -7,6 +7,7 @@
 // source code.
 
 #include "common.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstring>
@@ -26,11 +27,16 @@ mutex atomicMutex[NUM_ATOMIC_MUTEXES];
 #define ATOMIC_MUTEX(offset) \
   atomicMutex[(((offset)>>2) & (NUM_ATOMIC_MUTEXES-1))]
 
-Memory::Memory(unsigned int addrSpace, const Context *context)
+Memory::Memory(unsigned addrSpace, unsigned bufferBits, const Context *context)
 {
   m_context = context;
   m_addressSpace = addrSpace;
 
+  m_numBitsBuffer = bufferBits;
+  m_numBitsAddress = ((sizeof(size_t)<<3) - m_numBitsBuffer);
+  m_maxNumBuffers = ((size_t)1 << m_numBitsBuffer) - 1; // 0 reserved for NULL
+  m_maxBufferSize = ((size_t)1 << m_numBitsAddress);
+
   clear();
 }
 
@@ -39,17 +45,18 @@ Memory::~Memory()
   clear();
 }
 
-size_t Memory::allocateBuffer(size_t size, cl_mem_flags flags)
+size_t Memory::allocateBuffer(size_t size, cl_mem_flags flags,
+                              const uint8_t *initData)
 {
   // Check requested size doesn't exceed maximum
-  if (size > MAX_BUFFER_SIZE)
+  if (size > m_maxBufferSize)
   {
     return 0;
   }
 
   // Find first unallocated buffer slot
   unsigned b = getNextBuffer();
-  if (b >= MAX_NUM_BUFFERS)
+  if (b >= m_maxNumBuffers)
   {
     return 0;
   }
@@ -60,9 +67,6 @@ size_t Memory::allocateBuffer(size_t size, cl_mem_flags flags)
   buffer->flags  = flags;
   buffer->data   = new unsigned char[size];
 
-  // Initialize contents to 0
-  memset(buffer->data, 0, size);
-
   if (b >= m_memory.size())
   {
     m_memory.push_back(buffer);
@@ -74,9 +78,15 @@ size_t Memory::allocateBuffer(size_t size, cl_mem_flags flags)
 
   m_totalAllocated += size;
 
-  size_t address = ((size_t)b) << NUM_ADDRESS_BITS;
+  // Initialize contents of buffer
+  if (initData)
+    memcpy(buffer->data, initData, size);
+  else
+    memset(buffer->data, 0, size);
+
+  size_t address = ((size_t)b) << m_numBitsAddress;
 
-  m_context->notifyMemoryAllocated(this, address, size, flags);
+  m_context->notifyMemoryAllocated(this, address, size, flags, initData);
 
   return address;
 }
@@ -93,8 +103,8 @@ uint32_t Memory::atomic(AtomicOp op, size_t address, uint32_t value)
   }
 
   // Get buffer
-  size_t offset = EXTRACT_OFFSET(address);
-  Buffer *buffer = m_memory[EXTRACT_BUFFER(address)];
+  size_t offset = extractOffset(address);
+  Buffer *buffer = m_memory[extractBuffer(address)];
   uint32_t *ptr = (uint32_t*)(buffer->data + offset);
 
   if (m_addressSpace == AddrSpaceGlobal)
@@ -155,8 +165,8 @@ uint32_t Memory::atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value)
   }
 
   // Get buffer
-  size_t offset = EXTRACT_OFFSET(address);
-  Buffer *buffer = m_memory[EXTRACT_BUFFER(address)];
+  size_t offset = extractOffset(address);
+  Buffer *buffer = m_memory[extractBuffer(address)];
   uint32_t *ptr = (uint32_t*)(buffer->data + offset);
 
   if (m_addressSpace == AddrSpaceGlobal)
@@ -190,7 +200,7 @@ void Memory::clear()
       }
       delete *itr;
 
-      size_t address = (itr-m_memory.begin())<<NUM_ADDRESS_BITS;
+      size_t address = (itr-m_memory.begin())<<m_numBitsAddress;
       m_context->notifyMemoryDeallocated(this, address);
     }
   }
@@ -200,46 +210,17 @@ void Memory::clear()
   m_totalAllocated = 0;
 }
 
-Memory* Memory::clone() const
-{
-  Memory *mem = new Memory(m_addressSpace, m_context);
-
-  // Clone buffers
-  mem->m_memory.resize(m_memory.size());
-  mem->m_memory[0] = NULL;
-  for (unsigned i = 1; i < m_memory.size(); i++)
-  {
-    Buffer *src = m_memory[i];
-    Buffer *dst = new Buffer;
-    dst->size   = src->size;
-    dst->flags  = src->flags,
-    dst->data   =
-      (src->flags&CL_MEM_USE_HOST_PTR) ?
-        src->data : new unsigned char[src->size],
-    memcpy(dst->data, src->data, src->size);
-    mem->m_memory[i] = dst;
-    m_context->notifyMemoryAllocated(mem, ((size_t)i<<NUM_ADDRESS_BITS),
-                                     src->size, src->flags);
-  }
-
-  // Clone state
-  mem->m_freeBuffers = m_freeBuffers;
-  mem->m_totalAllocated = m_totalAllocated;
-
-  return mem;
-}
-
 size_t Memory::createHostBuffer(size_t size, void *ptr, cl_mem_flags flags)
 {
   // Check requested size doesn't exceed maximum
-  if (size > MAX_BUFFER_SIZE)
+  if (size > m_maxBufferSize)
   {
     return 0;
   }
 
   // Find first unallocated buffer slot
   unsigned b = getNextBuffer();
-  if (b >= MAX_NUM_BUFFERS)
+  if (b >= m_maxNumBuffers)
   {
     return 0;
   }
@@ -261,9 +242,9 @@ size_t Memory::createHostBuffer(size_t size, void *ptr, cl_mem_flags flags)
 
   m_totalAllocated += size;
 
-  size_t address = ((size_t)b) << NUM_ADDRESS_BITS;
+  size_t address = ((size_t)b) << m_numBitsAddress;
 
-  m_context->notifyMemoryAllocated(this, address, size, flags);
+  m_context->notifyMemoryAllocated(this, address, size, flags, (uint8_t*)ptr);
 
   return address;
 }
@@ -277,8 +258,8 @@ bool Memory::copy(size_t dst, size_t src, size_t size)
   {
     return false;
   }
-  size_t src_offset = EXTRACT_OFFSET(src);
-  Buffer *src_buffer = m_memory.at(EXTRACT_BUFFER(src));
+  size_t src_offset = extractOffset(src);
+  Buffer *src_buffer = m_memory.at(extractBuffer(src));
 
 
   m_context->notifyMemoryStore(this, dst, size, src_buffer->data + src_offset);
@@ -288,8 +269,8 @@ bool Memory::copy(size_t dst, size_t src, size_t size)
   {
     return false;
   }
-  size_t dst_offset = EXTRACT_OFFSET(dst);
-  Buffer *dst_buffer = m_memory.at(EXTRACT_BUFFER(dst));
+  size_t dst_offset = extractOffset(dst);
+  Buffer *dst_buffer = m_memory.at(extractBuffer(dst));
 
 
   // Copy data
@@ -302,7 +283,7 @@ bool Memory::copy(size_t dst, size_t src, size_t size)
 
 void Memory::deallocateBuffer(size_t address)
 {
-  unsigned buffer = EXTRACT_BUFFER(address);
+  unsigned buffer = extractBuffer(address);
   assert(buffer < m_memory.size() && m_memory[buffer]);
 
   if (!(m_memory[buffer]->flags & CL_MEM_USE_HOST_PTR))
@@ -323,7 +304,7 @@ void Memory::dump() const
 {
   for (unsigned b = 1; b < m_memory.size(); b++)
   {
-    if (!m_memory[b]->data)
+    if (!m_memory[b] || !m_memory[b]->data)
     {
       continue;
     }
@@ -334,7 +315,7 @@ void Memory::dump() const
       {
         cout << endl << hex << uppercase
              << setw(16) << setfill(' ') << right
-             << ((((size_t)b)<<NUM_ADDRESS_BITS) | i) << ":";
+             << ((((size_t)b)<<m_numBitsAddress) | i) << ":";
       }
       cout << " " << hex << uppercase << setw(2) << setfill('0')
            << (int)m_memory[b]->data[i];
@@ -343,6 +324,16 @@ void Memory::dump() const
   cout << endl;
 }
 
+size_t Memory::extractBuffer(size_t address) const
+{
+  return (address >> m_numBitsAddress);
+}
+
+size_t Memory::extractOffset(size_t address) const
+{
+  return (address & (((size_t)-1) >> m_numBitsBuffer));
+}
+
 unsigned int Memory::getAddressSpace() const
 {
   return m_addressSpace;
@@ -350,7 +341,7 @@ unsigned int Memory::getAddressSpace() const
 
 const Memory::Buffer* Memory::getBuffer(size_t address) const
 {
-  size_t buf = EXTRACT_BUFFER(address);
+  size_t buf = extractBuffer(address);
   if (buf == 0 || buf >= m_memory.size() || !m_memory[buf]->data)
   {
     return NULL;
@@ -361,7 +352,7 @@ const Memory::Buffer* Memory::getBuffer(size_t address) const
 
 size_t Memory::getMaxAllocSize()
 {
-  return MAX_BUFFER_SIZE;
+  return m_maxBufferSize;
 }
 
 unsigned Memory::getNextBuffer()
@@ -380,7 +371,7 @@ unsigned Memory::getNextBuffer()
 
 void* Memory::getPointer(size_t address) const
 {
-  size_t buffer = EXTRACT_BUFFER(address);
+  size_t buffer = extractBuffer(address);
 
   // Bounds check
   if (!isAddressValid(address))
@@ -388,7 +379,7 @@ void* Memory::getPointer(size_t address) const
     return NULL;
   }
 
-  return m_memory[buffer]->data + EXTRACT_OFFSET(address);
+  return m_memory[buffer]->data + extractOffset(address);
 }
 
 size_t Memory::getTotalAllocated() const
@@ -398,8 +389,8 @@ size_t Memory::getTotalAllocated() const
 
 bool Memory::isAddressValid(size_t address, size_t size) const
 {
-  size_t buffer = EXTRACT_BUFFER(address);
-  size_t offset = EXTRACT_OFFSET(address);
+  size_t buffer = extractBuffer(address);
+  size_t offset = extractOffset(address);
   if (buffer == 0 ||
       buffer >= m_memory.size() ||
       !m_memory[buffer] ||
@@ -421,8 +412,8 @@ bool Memory::load(unsigned char *dest, size_t address, size_t size) const
   }
 
   // Get buffer
-  size_t offset = EXTRACT_OFFSET(address);
-  Buffer *src = m_memory[EXTRACT_BUFFER(address)];
+  size_t offset = extractOffset(address);
+  Buffer *src = m_memory[extractBuffer(address)];
 
   // Load data
   memcpy(dest, src->data + offset, size);
@@ -430,9 +421,9 @@ bool Memory::load(unsigned char *dest, size_t address, size_t size) const
   return true;
 }
 
-unsigned char* Memory::mapBuffer(size_t address, size_t offset, size_t size)
+void* Memory::mapBuffer(size_t address, size_t offset, size_t size)
 {
-  size_t buffer = EXTRACT_BUFFER(address);
+  size_t buffer = extractBuffer(address);
 
   // Bounds check
   if (!isAddressValid(address, size))
@@ -440,7 +431,7 @@ unsigned char* Memory::mapBuffer(size_t address, size_t offset, size_t size)
     return NULL;
   }
 
-  return m_memory[buffer]->data + offset + EXTRACT_OFFSET(address);
+  return m_memory[buffer]->data + offset + extractOffset(address);
 }
 
 bool Memory::store(const unsigned char *source, size_t address, size_t size)
@@ -454,8 +445,8 @@ bool Memory::store(const unsigned char *source, size_t address, size_t size)
   }
 
   // Get buffer
-  size_t offset = EXTRACT_OFFSET(address);
-  Buffer *dst = m_memory[EXTRACT_BUFFER(address)];
+  size_t offset = extractOffset(address);
+  Buffer *dst = m_memory[extractBuffer(address)];
 
   // Store data
   memcpy(dst->data + offset, source, size);
diff --git a/src/core/Memory.h b/src/core/Memory.h
index 42eb63a..71f0c45 100644
--- a/src/core/Memory.h
+++ b/src/core/Memory.h
@@ -1,5 +1,5 @@
 // Memory.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -8,16 +8,6 @@
 
 #include "common.h"
 
-#define NUM_BUFFER_BITS ( (sizeof(size_t)==4) ? 8 : 16)
-#define MAX_NUM_BUFFERS ((size_t)1 << NUM_BUFFER_BITS)
-#define NUM_ADDRESS_BITS ((sizeof(size_t)<<3) - NUM_BUFFER_BITS)
-#define MAX_BUFFER_SIZE ((size_t)1 << NUM_ADDRESS_BITS)
-
-#define EXTRACT_BUFFER(address) \
-  (address >> NUM_ADDRESS_BITS)
-#define EXTRACT_OFFSET(address) \
-  (address & (((size_t)-1) >> NUM_BUFFER_BITS))
-
 namespace oclgrind
 {
   class Context;
@@ -25,22 +15,22 @@ namespace oclgrind
   class Memory
   {
   public:
-    typedef struct
+    struct Buffer
     {
       size_t size;
       cl_mem_flags flags;
       unsigned char *data;
-    } Buffer;
+    };
 
   public:
-    Memory(unsigned int addrSpace, const Context *context);
+    Memory(unsigned addrSpace, unsigned bufferBits, const Context *context);
     virtual ~Memory();
 
-    size_t allocateBuffer(size_t size, cl_mem_flags flags=0);
+    size_t allocateBuffer(size_t size, cl_mem_flags flags=0,
+                          const uint8_t *initData = NULL);
     uint32_t atomic(AtomicOp op, size_t address, uint32_t value = 0);
     uint32_t atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value);
     void clear();
-    Memory *clone() const;
     size_t createHostBuffer(size_t size, void *ptr, cl_mem_flags flags=0);
     bool copy(size_t dest, size_t src, size_t size);
     void deallocateBuffer(size_t address);
@@ -51,10 +41,13 @@ namespace oclgrind
     size_t getTotalAllocated() const;
     bool isAddressValid(size_t address, size_t size=1) const;
     bool load(unsigned char *dst, size_t address, size_t size=1) const;
-    unsigned char* mapBuffer(size_t address, size_t offset, size_t size);
+    void* mapBuffer(size_t address, size_t offset, size_t size);
     bool store(const unsigned char *source, size_t address, size_t size=1);
 
-    static size_t getMaxAllocSize();
+    size_t extractBuffer(size_t address) const;
+    size_t extractOffset(size_t address) const;
+
+    size_t getMaxAllocSize();
 
   private:
     const Context *m_context;
@@ -63,6 +56,11 @@ namespace oclgrind
     unsigned int m_addressSpace;
     size_t m_totalAllocated;
 
+    unsigned m_numBitsBuffer;
+    unsigned m_numBitsAddress;
+    size_t m_maxNumBuffers;
+    size_t m_maxBufferSize;
+
     unsigned getNextBuffer();
   };
 }
diff --git a/src/core/Plugin.cpp b/src/core/Plugin.cpp
index 8880f2d..bab80db 100644
--- a/src/core/Plugin.cpp
+++ b/src/core/Plugin.cpp
@@ -1,5 +1,5 @@
 // Plugin.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/core/Plugin.h b/src/core/Plugin.h
index d4a8ea7..df7c863 100644
--- a/src/core/Plugin.h
+++ b/src/core/Plugin.h
@@ -1,5 +1,5 @@
 // Plugin.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -37,7 +37,8 @@ namespace oclgrind
     virtual void kernelEnd(const KernelInvocation *kernelInvocation){}
     virtual void log(MessageType type, const char *message){}
     virtual void memoryAllocated(const Memory *memory, size_t address,
-                                 size_t size, cl_mem_flags flags){}
+                                 size_t size, cl_mem_flags flags,
+                                 const uint8_t *initData){}
     virtual void memoryAtomicLoad(const Memory *memory,
                                   const WorkItem *workItem,
                                   AtomicOp op, size_t address, size_t size){}
@@ -49,12 +50,16 @@ namespace oclgrind
                             size_t address, size_t size){}
     virtual void memoryLoad(const Memory *memory, const WorkGroup *workGroup,
                             size_t address, size_t size){}
+    virtual void memoryMap(const Memory *memory, size_t address,
+                           size_t offset, size_t size, cl_map_flags flags){}
     virtual void memoryStore(const Memory *memory, const WorkItem *workItem,
                              size_t address, size_t size,
                              const uint8_t *storeData){}
     virtual void memoryStore(const Memory *memory, const WorkGroup *workGroup,
                              size_t address, size_t size,
                              const uint8_t *storeData){}
+    virtual void memoryUnmap(const Memory *memory, size_t address,
+                             const void *ptr){}
     virtual void workGroupBarrier(const WorkGroup *workGroup, uint32_t flags){}
     virtual void workGroupBegin(const WorkGroup *workGroup){}
     virtual void workGroupComplete(const WorkGroup *workGroup){}
diff --git a/src/core/Program.cpp b/src/core/Program.cpp
index 31fdc5b..6480ed7 100644
--- a/src/core/Program.cpp
+++ b/src/core/Program.cpp
@@ -1,12 +1,14 @@
 // Program.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
 // license terms please see the LICENSE file distributed with this
 // source code.
 
+#include "config.h"
 #include "common.h"
+
 #include <fstream>
 
 #if defined(_WIN32) && !defined(__MINGW32__)
@@ -29,7 +31,9 @@
 #include "clang/CodeGen/CodeGenAction.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "clang/Lex/PreprocessorOptions.h"
 
+#include "Context.h"
 #include "Kernel.h"
 #include "Program.h"
 #include "WorkItem.h"
@@ -129,7 +133,11 @@ bool Program::build(const char *options, list<Header> headers)
   args.push_back("-cl-std=CL1.2");
   args.push_back("-cl-kernel-arg-info");
   args.push_back("-fno-builtin");
+#if LLVM_VERSION >= 38
+  args.push_back("-debug-info-kind=standalone");
+#else
   args.push_back("-g");
+#endif
   args.push_back("-triple");
   if (sizeof(size_t) == 4)
     args.push_back("spir-unknown-unknown");
@@ -150,6 +158,10 @@ bool Program::build(const char *options, list<Header> headers)
   bool optimize = true;
   bool cl12     = true;
 
+  // Disable optimizations by default if in interactive mode
+  if (checkEnv("OCLGRIND_INTERACTIVE"))
+    optimize = false;
+
   // Add OpenCL build options
   const char *mainOptions = options;
   const char *extraOptions = getenv("OCLGRIND_BUILD_OPTIONS");
@@ -164,7 +176,8 @@ bool Program::build(const char *options, list<Header> headers)
     // Ignore options that break PCH
     if (strcmp(opt, "-cl-fast-relaxed-math") != 0 &&
         strcmp(opt, "-cl-finite-math-only") != 0 &&
-        strcmp(opt, "-cl-single-precision-constant") != 0)
+        strcmp(opt, "-cl-single-precision-constant") &&
+        strcmp(opt, "-cl-unsafe-math-optimizations") != 0)
     {
       // Check for optimization flags
       if (strcmp(opt, "-O0") == 0 || strcmp(opt, "-cl-opt-disable") == 0)
@@ -178,6 +191,12 @@ bool Program::build(const char *options, list<Header> headers)
         continue;
       }
 
+#if LLVM_VERSION >= 37
+      // Clang no longer supports -cl-no-signed-zeros
+      if (strcmp(opt, "-cl-no-signed-zeros") == 0)
+        continue;
+#endif
+
       // Check for -cl-std flag
       if (strncmp(opt, "-cl-std=", 8) == 0)
       {
@@ -214,11 +233,11 @@ bool Program::build(const char *options, list<Header> headers)
 #if defined(_WIN32) && !defined(__MINGW32__)
       char libpath[4096];
       HMODULE dll;
-      if (GetModuleHandleEx(
+      if (GetModuleHandleExA(
             GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
             GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
             (LPCSTR)&Program::createFromBitcode, &dll) &&
-          GetModuleFileName(dll, libpath, sizeof(libpath)))
+          GetModuleFileNameA(dll, libpath, sizeof(libpath)))
       {
 #else
       Dl_info dlinfo;
@@ -274,6 +293,7 @@ bool Program::build(const char *options, list<Header> headers)
 
     args.push_back("-include-pch");
     args.push_back(pch);
+    args.push_back("-fno-validate-pch");
   }
   else
   {
@@ -325,8 +345,7 @@ bool Program::build(const char *options, list<Header> headers)
   compiler.getPreprocessorOpts().addRemappedFile(REMAP_INPUT, buffer.release());
 
   // Compile
-  llvm::LLVMContext& context = llvm::getGlobalContext();
-  clang::EmitLLVMOnlyAction action(&context);
+  clang::EmitLLVMOnlyAction action(m_context->getLLVMContext());
   if (compiler.ExecuteAction(action))
   {
     // Retrieve module
@@ -338,32 +357,34 @@ bool Program::build(const char *options, list<Header> headers)
       stripDebugIntrinsics();
     }
 
-    // Initialize pass managers
-    llvm::legacy::PassManager modulePasses;
-    llvm::legacy::FunctionPassManager functionPasses(m_module.get());
-#if LLVM_VERSION < 37
-    modulePasses.add(new llvm::DataLayoutPass());
-    functionPasses.add(new llvm::DataLayoutPass());
-#endif
-
     // Run optimizations on module
     if (optimize)
     {
+      // Initialize pass managers
+      llvm::legacy::PassManager modulePasses;
+      llvm::legacy::FunctionPassManager functionPasses(m_module.get());
+#if LLVM_VERSION < 37
+      modulePasses.add(new llvm::DataLayoutPass());
+      functionPasses.add(new llvm::DataLayoutPass());
+#endif
+
       // Populate pass managers with -Oz
       llvm::PassManagerBuilder builder;
       builder.OptLevel = 2;
       builder.SizeLevel = 2;
       builder.populateModulePassManager(modulePasses);
       builder.populateFunctionPassManager(functionPasses);
+
+      // Run passes
+      functionPasses.doInitialization();
+      llvm::Module::iterator fItr;
+      for (fItr = m_module->begin(); fItr != m_module->end(); fItr++)
+        functionPasses.run(*fItr);
+      functionPasses.doFinalization();
+      modulePasses.run(*m_module);
     }
 
-    // Run passes
-    functionPasses.doInitialization();
-    llvm::Module::iterator fItr;
-    for (fItr = m_module->begin(); fItr != m_module->end(); fItr++)
-      functionPasses.run(*fItr);
-    functionPasses.doFinalization();
-    modulePasses.run(*m_module);
+    removeLValueLoads();
 
     m_buildStatus = CL_BUILD_SUCCESS;
   }
@@ -448,14 +469,22 @@ Program* Program::createFromBitcode(const Context *context,
   }
 
   // Parse bitcode into IR module
+#if LLVM_VERSION < 37
   llvm::ErrorOr<llvm::Module*> module =
-    parseBitcodeFile(buffer->getMemBufferRef(), llvm::getGlobalContext());
+#else
+  llvm::ErrorOr<unique_ptr<llvm::Module>> module =
+#endif
+    parseBitcodeFile(buffer->getMemBufferRef(), *context->getLLVMContext());
   if (!module)
   {
     return NULL;
   }
 
+#if LLVM_VERSION < 37
   return new Program(context, module.get());
+#else
+  return new Program(context, module.get().release());
+#endif
 }
 
 Program* Program::createFromBitcodeFile(const Context *context,
@@ -470,35 +499,52 @@ Program* Program::createFromBitcodeFile(const Context *context,
   }
 
   // Parse bitcode into IR module
+#if LLVM_VERSION < 37
   llvm::ErrorOr<llvm::Module*> module =
+#else
+  llvm::ErrorOr<unique_ptr<llvm::Module>> module =
+#endif
     parseBitcodeFile(buffer->get()->getMemBufferRef(),
-                     llvm::getGlobalContext());
+                     *context->getLLVMContext());
   if (!module)
   {
     return NULL;
   }
 
+#if LLVM_VERSION < 37
   return new Program(context, module.get());
+#else
+  return new Program(context, module.get().release());
+#endif
 }
 
 Program* Program::createFromPrograms(const Context *context,
                                      list<const Program*> programs)
 {
   llvm::Module *module = new llvm::Module("oclgrind_linked",
-                                          llvm::getGlobalContext());
+                                          *context->getLLVMContext());
+#if LLVM_VERSION < 38
   llvm::Linker linker(module);
+#else
+  llvm::Linker linker(*module);
+#endif
 
   // Link modules
   list<const Program*>::iterator itr;
   for (itr = programs.begin(); itr != programs.end(); itr++)
   {
-    if (linker.linkInModule(CloneModule((*itr)->m_module.get())))
+#if LLVM_VERSION < 38
+    llvm::Module *m = llvm::CloneModule((*itr)->m_module.get());
+#else
+    unique_ptr<llvm::Module> m = llvm::CloneModule((*itr)->m_module.get());
+#endif
+    if (linker.linkInModule(std::move(m)))
     {
       return NULL;
     }
   }
 
-  return new Program(context, linker.getModule());
+  return new Program(context, module);
 }
 
 Kernel* Program::createKernel(const string name)
@@ -508,7 +554,7 @@ Kernel* Program::createKernel(const string name)
 
   // Iterate over functions in module to find kernel
   llvm::Function *function = NULL;
-
+#if LLVM_VERSION < 37
   // Query the SPIR kernel list
   llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
   // No kernels in module
@@ -538,6 +584,17 @@ Kernel* Program::createKernel(const string name)
       break;
     }
   }
+#else
+  for (auto F = m_module->begin(); F != m_module->end(); F++)
+  {
+    if (F->getCallingConv() == llvm::CallingConv::SPIR_KERNEL &&
+        F->getName() == name)
+    {
+      function = &*F;
+      break;
+    }
+  }
+#endif
 
   if (function == NULL)
   {
@@ -566,20 +623,17 @@ Kernel* Program::createKernel(const string name)
   }
 }
 
-unsigned char* Program::getBinary() const
+void Program::getBinary(unsigned char *binary) const
 {
   if (!m_module)
-  {
-    return NULL;
-  }
+    return;
 
   std::string str;
   llvm::raw_string_ostream stream(str);
   llvm::WriteBitcodeToFile(m_module.get(), stream);
   stream.str();
-  unsigned char *bitcode = new unsigned char[str.length()];
-  memcpy(bitcode, str.c_str(), str.length());
-  return bitcode;
+
+  memcpy(binary, str.c_str(), str.length());
 }
 
 size_t Program::getBinarySize() const
@@ -632,6 +686,7 @@ list<string> Program::getKernelNames() const
 {
   list<string> names;
 
+#if LLVM_VERSION < 37
   // Query the SPIR kernel list
   llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
 
@@ -656,6 +711,15 @@ list<string> Program::getKernelNames() const
       names.push_back(kernelFunction->getName());
     }
   }
+#else
+  for (auto F = m_module->begin(); F != m_module->end(); F++)
+  {
+    if (F->getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
+    {
+      names.push_back(F->getName());
+    }
+  }
+#endif
 
   return names;
 }
@@ -664,6 +728,7 @@ unsigned int Program::getNumKernels() const
 {
   assert(m_module);
 
+#if LLVM_VERSION < 37
   // Extract kernels from metadata
   llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
 
@@ -672,6 +737,19 @@ unsigned int Program::getNumKernels() const
     return 0;
 
   return tuple->getNumOperands();
+#else
+  unsigned int num = 0;
+
+  for (auto F = m_module->begin(); F != m_module->end(); F++)
+  {
+    if (F->getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
+    {
+      num++;
+    }
+  }
+
+  return num;
+#endif
 }
 
 const string& Program::getSource() const
@@ -697,13 +775,303 @@ unsigned long Program::getUID() const
   return m_uid;
 }
 
+void Program::pruneDeadCode(llvm::Instruction *instruction)
+{
+  // Remove instructions that have no uses
+  if (instruction->getNumUses() == 0)
+  {
+    // Get list of operands
+    set<llvm::Value*> operands;
+    {
+      llvm::Instruction::op_iterator op;
+      for (op = instruction->op_begin(); op != instruction->op_end(); op++)
+      {
+        operands.insert(*op);
+      }
+    }
+
+    // Remove instruction
+    instruction->eraseFromParent();
+
+    // Prune operands
+    set<llvm::Value*>::iterator op;
+    for (op = operands.begin(); op != operands.end(); op++)
+    {
+      if (auto inst = llvm::dyn_cast<llvm::Instruction>(*op))
+        pruneDeadCode(inst);
+    }
+  }
+}
+
+void Program::removeLValueLoads()
+{
+  // Get list of aggregate store instructions
+  set<llvm::StoreInst*> aggStores;
+  for (llvm::Module::iterator F = m_module->begin(); F != m_module->end(); F++)
+  {
+    llvm::Function *f = &*F;
+    for (llvm::inst_iterator I = inst_begin(f), E = inst_end(f); I != E; I++)
+    {
+      if (auto store = llvm::dyn_cast<llvm::StoreInst>(&*I))
+        aggStores.insert(store);
+    }
+  }
+
+  // Replace aggregate modify-write sequences with direct scalar writes
+  set<llvm::StoreInst*>::iterator itr;
+  for (itr = aggStores.begin(); itr != aggStores.end(); itr++)
+  {
+    scalarizeAggregateStore(*itr);
+  }
+}
+
+void Program::scalarizeAggregateStore(llvm::StoreInst *store)
+{
+  llvm::IntegerType *gepIndexType = (sizeof(size_t)==8) ?
+      llvm::Type::getInt64Ty(m_module.get()->getContext()) :
+      llvm::Type::getInt32Ty(m_module.get()->getContext());
+
+  llvm::Value *storeValue = store->getValueOperand();
+  llvm::Value *vectorPtr  = store->getPointerOperand();
+
+  if (auto insert = llvm::dyn_cast<llvm::InsertElementInst>(storeValue))
+  {
+    llvm::Value *vector = insert->getOperand(0);
+    llvm::Value *value  = insert->getOperand(1);
+    llvm::Value *index  = insert->getOperand(2);
+
+    // Create GEP for scalar value
+    llvm::GetElementPtrInst *scalarPtr = NULL;
+    if (auto gep = llvm::dyn_cast<llvm::GetElementPtrInst>(vectorPtr))
+    {
+      // Create GEP from existing GEP
+      std::vector<llvm::Value*> indices;
+      for (auto idx = gep->idx_begin(); idx != gep->idx_end(); idx++)
+      {
+        indices.push_back(*idx);
+      }
+      indices.push_back(index);
+      scalarPtr = llvm::GetElementPtrInst::Create(
+#if LLVM_VERSION > 36
+        gep->getPointerOperandType()->getPointerElementType(),
+#endif
+        gep->getPointerOperand(), indices);
+    }
+    else
+    {
+      // Create GEP from non-GEP pointer
+      std::vector<llvm::Value*> indices;
+      indices.push_back(llvm::ConstantInt::getSigned(gepIndexType, 0));
+      indices.push_back(index);
+      scalarPtr = llvm::GetElementPtrInst::Create(
+#if LLVM_VERSION > 36
+        vectorPtr->getType()->getPointerElementType(),
+#endif
+        vectorPtr, indices);
+    }
+    scalarPtr->setDebugLoc(store->getDebugLoc());
+    scalarPtr->insertAfter(store);
+
+    // Create direct scalar store
+    llvm::StoreInst *scalarStore = new llvm::StoreInst(
+      value, scalarPtr, store->isVolatile(),
+      getTypeAlignment(value->getType()));
+    scalarStore->setDebugLoc(store->getDebugLoc());
+    scalarStore->insertAfter(scalarPtr);
+
+    // Check if the input to the insertelement instruction came from something
+    // other than a load to the same address as the store
+    llvm::LoadInst *load = llvm::dyn_cast<llvm::LoadInst>(vector);
+    if (!(load && load->getPointerOperand() == store->getPointerOperand()))
+    {
+      // Replace value in store with the input to the insertelement instruction
+      llvm::StoreInst *_store = new llvm::StoreInst(
+        vector, store->getPointerOperand(),
+        store->isVolatile(), store->getAlignment());
+      _store->setDebugLoc(store->getDebugLoc());
+      _store->insertAfter(store);
+
+      // Repeat process with new store
+      if (_store)
+        scalarizeAggregateStore(_store);
+    }
+
+    // Remove vector store and any dead code
+    store->eraseFromParent();
+    pruneDeadCode(insert);
+  }
+  else if (auto shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(storeValue))
+  {
+    llvm::Value *v1      = shuffle->getOperand(0);
+    llvm::Value *v2      = shuffle->getOperand(1);
+    llvm::Constant *mask = shuffle->getMask();
+    unsigned maskSize    = mask->getType()->getVectorNumElements();
+
+    // Check if shuffle sources came from a load with same address as the store
+    llvm::LoadInst *load;
+    bool v1SourceIsDest = false, v2SourceIsDest = false;
+    if ((load = llvm::dyn_cast<llvm::LoadInst>(v1)) &&
+         load->getPointerOperand() == vectorPtr)
+      v1SourceIsDest = true;
+    if ((load = llvm::dyn_cast<llvm::LoadInst>(v2)) &&
+         load->getPointerOperand() == vectorPtr)
+      v2SourceIsDest = true;
+
+    // Get mask indices that don't correspond to the destination vector
+    stack<unsigned> indices;
+    for (unsigned i = 0; i < maskSize; i++)
+    {
+      int idx = shuffle->getMaskValue(i);
+
+      // Skip undef indices
+      if (idx == -1)
+        continue;
+
+      // Check if source is the store destination
+      bool sourceIsDest =
+        ((unsigned)idx < v1->getType()->getVectorNumElements() ?
+          v1SourceIsDest : v2SourceIsDest);
+
+      // If destination is used in non-identity position, leave shuffle as is
+      if (sourceIsDest && (unsigned)idx != i)
+        return;
+
+      // Add non-destination index
+      if (!sourceIsDest)
+        indices.push(i);
+    }
+
+    // Check if destination is actually used as a source in the mask
+    if (indices.size() == maskSize)
+    {
+      // Check for any unused loads with the same address as the store
+      // These would usually be caught by DCE, but if optimisations are
+      // disabled we need to prune these manually
+      list<llvm::LoadInst*> lvalueloads;
+      for (auto user  = vectorPtr->user_begin();
+                user != vectorPtr->user_end() ;
+                user++)
+      {
+        if (auto load = llvm::dyn_cast<llvm::LoadInst>(*user))
+        {
+          if (load->getNumUses() == 0)
+            lvalueloads.push_back(load);
+        }
+      }
+      for (auto load = lvalueloads.begin(); load != lvalueloads.end(); load++)
+      {
+        (*load)->eraseFromParent();
+      }
+
+      return;
+    }
+
+    // Create a scalar store for each shuffle index
+    while (!indices.empty())
+    {
+      unsigned index = indices.top();
+      indices.pop();
+
+      // Create GEP for scalar value
+      llvm::GetElementPtrInst *scalarPtr = NULL;
+      if (auto gep = llvm::dyn_cast<llvm::GetElementPtrInst>(vectorPtr))
+      {
+        // Create GEP from existing GEP
+        std::vector<llvm::Value*> gepIndices;
+        for (auto idx = gep->idx_begin(); idx != gep->idx_end(); idx++)
+        {
+          gepIndices.push_back(*idx);
+        }
+        gepIndices.push_back(llvm::ConstantInt::getSigned(gepIndexType, index));
+        scalarPtr = llvm::GetElementPtrInst::Create(
+#if LLVM_VERSION > 36
+          gep->getPointerOperandType()->getPointerElementType(),
+#endif
+          gep->getPointerOperand(), gepIndices);
+      }
+      else
+      {
+        // Create GEP from non-GEP pointer
+        std::vector<llvm::Value*> gepIndices;
+        gepIndices.push_back(llvm::ConstantInt::getSigned(gepIndexType, 0));
+        gepIndices.push_back(llvm::ConstantInt::getSigned(gepIndexType, index));
+        scalarPtr = llvm::GetElementPtrInst::Create(
+#if LLVM_VERSION > 36
+          vectorPtr->getType()->getPointerElementType(),
+#endif
+          vectorPtr, gepIndices);
+      }
+      scalarPtr->setDebugLoc(store->getDebugLoc());
+      scalarPtr->insertAfter(store);
+
+      // Get source vector and index
+      unsigned idx   = shuffle->getMaskValue(index);
+      unsigned v1num = v1->getType()->getVectorNumElements();
+      llvm::Value *src = v1;
+      if (idx >= v1num)
+      {
+        idx -= v1num;
+        src = v2;
+      }
+
+      // Create direct scalar store
+      if (auto cnst = llvm::dyn_cast<llvm::ConstantVector>(src))
+      {
+        // If source is a constant, extract scalar constant
+        src = cnst->getAggregateElement(idx);
+
+        llvm::StoreInst *scalarStore = new llvm::StoreInst(
+          src, scalarPtr, store->isVolatile(),
+          getTypeAlignment(src->getType()));
+        scalarStore->setDebugLoc(store->getDebugLoc());
+        scalarStore->insertAfter(scalarPtr);
+      }
+      else
+      {
+        // If extracting from a shuffle, trace back to last non-shuffle
+        while (auto shfl = llvm::dyn_cast<llvm::ShuffleVectorInst>(src))
+        {
+          llvm::Value *v1 = shfl->getOperand(0);
+          llvm::Value *v2 = shfl->getOperand(1);
+          unsigned v1num  = v1->getType()->getVectorNumElements();
+
+          // Get source vector and index
+          idx = shfl->getMaskValue(idx);
+          src = v1;
+          if (idx >= v1num)
+          {
+            idx -= v1num;
+            src = v2;
+          }
+        }
+
+        llvm::ExtractElementInst *extract = llvm::ExtractElementInst::Create(
+          src, llvm::ConstantInt::getSigned(gepIndexType, idx));
+        extract->setDebugLoc(shuffle->getDebugLoc());
+        extract->insertAfter(scalarPtr);
+
+        llvm::StoreInst *scalarStore = new llvm::StoreInst(
+          extract, scalarPtr, store->isVolatile(),
+          getTypeAlignment(extract->getType()));
+        scalarStore->setDebugLoc(store->getDebugLoc());
+        scalarStore->insertAfter(extract);
+      }
+    }
+
+    // Prune old store and any dead code
+    store->eraseFromParent();
+    pruneDeadCode(shuffle);
+  }
+}
+
 void Program::stripDebugIntrinsics()
 {
   // Get list of llvm.dbg intrinsics
   set<llvm::Instruction*> intrinsics;
   for (llvm::Module::iterator F = m_module->begin(); F != m_module->end(); F++)
   {
-    for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; I++)
+    llvm::Function *f = &*F;
+    for (llvm::inst_iterator I = inst_begin(f), E = inst_end(f); I != E; I++)
     {
       if (I->getOpcode() == llvm::Instruction::Call)
       {
@@ -722,7 +1090,6 @@ void Program::stripDebugIntrinsics()
   set<llvm::Instruction*>::iterator itr;
   for (itr = intrinsics.begin(); itr != intrinsics.end(); itr++)
   {
-    (*itr)->removeFromParent();
-    delete *itr;
+    (*itr)->eraseFromParent();
   }
 }
diff --git a/src/core/Program.h b/src/core/Program.h
index f888746..8b901c9 100644
--- a/src/core/Program.h
+++ b/src/core/Program.h
@@ -1,5 +1,5 @@
 // Program.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -12,6 +12,7 @@ namespace llvm
 {
   class Function;
   class Module;
+  class StoreInst;
 }
 
 namespace oclgrind
@@ -42,7 +43,7 @@ namespace oclgrind
     Kernel* createKernel(const std::string name);
     const std::string& getBuildLog() const;
     const std::string& getBuildOptions() const;
-    unsigned char* getBinary() const;
+    void getBinary(unsigned char *binary) const;
     size_t getBinarySize() const;
     unsigned int getBuildStatus() const;
     const Context *getContext() const;
@@ -69,6 +70,9 @@ namespace oclgrind
     unsigned long m_uid;
     unsigned long generateUID() const;
 
+    void pruneDeadCode(llvm::Instruction*);
+    void removeLValueLoads();
+    void scalarizeAggregateStore(llvm::StoreInst *store);
     void stripDebugIntrinsics();
 
     typedef std::map<const llvm::Function*, InterpreterCache*>
diff --git a/src/core/Queue.cpp b/src/core/Queue.cpp
index e9e082f..a3510e6 100644
--- a/src/core/Queue.cpp
+++ b/src/core/Queue.cpp
@@ -1,5 +1,5 @@
 // Queue.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -7,6 +7,7 @@
 // source code.
 
 #include "common.h"
+
 #include <cassert>
 
 #include "Context.h"
@@ -114,6 +115,12 @@ void Queue::executeKernel(KernelCommand *cmd)
                         cmd->localSize);
 }
 
+void Queue::executeMap(MapCommand *cmd)
+{
+  m_context->notifyMemoryMap(m_context->getGlobalMemory(),
+                             cmd->address, cmd->offset, cmd->size, cmd->flags);
+}
+
 void Queue::executeNativeKernel(NativeKernelCommand *cmd)
 {
   // Run kernel
@@ -147,6 +154,12 @@ void Queue::executeReadBufferRect(BufferRectCommand *cmd)
   }
 }
 
+void Queue::executeUnmap(UnmapCommand *cmd)
+{
+  m_context->notifyMemoryUnmap(m_context->getGlobalMemory(),
+                               cmd->address, cmd->ptr);
+}
+
 void Queue::executeWriteBuffer(BufferCommand *cmd)
 {
   m_context->getGlobalMemory()->store(cmd->ptr, cmd->address, cmd->size);
@@ -238,9 +251,15 @@ Queue::Command* Queue::update()
   case KERNEL:
     executeKernel((KernelCommand*)cmd);
     break;
+  case MAP:
+    executeMap((MapCommand*)cmd);
+    break;
   case NATIVE_KERNEL:
     executeNativeKernel((NativeKernelCommand*)cmd);
     break;
+  case UNMAP:
+    executeUnmap((UnmapCommand*)cmd);
+    break;
   case WRITE:
     executeWriteBuffer((BufferCommand*)cmd);
     break;
diff --git a/src/core/Queue.h b/src/core/Queue.h
index 7736d47..7879dbd 100644
--- a/src/core/Queue.h
+++ b/src/core/Queue.h
@@ -1,5 +1,5 @@
 // Queue.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -25,7 +25,8 @@ namespace oclgrind
   {
   public:
     enum CommandType {EMPTY, COPY, COPY_RECT, FILL_BUFFER, FILL_IMAGE, KERNEL,
-                      NATIVE_KERNEL, READ, READ_RECT, WRITE, WRITE_RECT};
+                      MAP, NATIVE_KERNEL, READ, READ_RECT, UNMAP, WRITE,
+                      WRITE_RECT};
     struct Command
     {
       CommandType type;
@@ -155,6 +156,27 @@ namespace oclgrind
         }
       }
     };
+    struct MapCommand : Command
+    {
+      void *ptr;
+      size_t address;
+      size_t offset;
+      size_t size;
+      cl_map_flags flags;
+      MapCommand()
+      {
+        type = MAP;
+      }
+    };
+    struct UnmapCommand : Command
+    {
+      const void *ptr;
+      size_t address;
+      UnmapCommand()
+      {
+        type = UNMAP;
+      }
+    };
 
   public:
     Queue(const Context *context);
@@ -167,9 +189,11 @@ namespace oclgrind
     void executeFillBuffer(FillBufferCommand *cmd);
     void executeFillImage(FillImageCommand *cmd);
     void executeKernel(KernelCommand *cmd);
+    void executeMap(MapCommand *cmd);
     void executeNativeKernel(NativeKernelCommand *cmd);
     void executeReadBuffer(BufferCommand *cmd);
     void executeReadBufferRect(BufferRectCommand *cmd);
+    void executeUnmap(UnmapCommand *cmd);
     void executeWriteBuffer(BufferCommand *cmd);
     void executeWriteBufferRect(BufferRectCommand *cmd);
 
diff --git a/src/core/WorkGroup.cpp b/src/core/WorkGroup.cpp
index 23daf9d..2c891c6 100644
--- a/src/core/WorkGroup.cpp
+++ b/src/core/WorkGroup.cpp
@@ -1,5 +1,5 @@
 // WorkGroup.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -7,6 +7,7 @@
 // source code.
 
 #include "common.h"
+
 #include <sstream>
 
 #include "llvm/IR/Module.h"
@@ -33,7 +34,20 @@ WorkGroup::WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid)
                   kernelInvocation->getNumGroups().x));
 
   // Allocate local memory
-  m_localMemory = kernelInvocation->getKernel()->getLocalMemory()->clone();
+  m_localMemory = new Memory(AddrSpaceLocal, sizeof(size_t)==8 ? 16 : 8,
+                             m_context);
+  const Kernel *kernel = kernelInvocation->getKernel();
+  for (auto value = kernel->values_begin();
+            value != kernel->values_end();
+            value++)
+  {
+    const llvm::Type *type = value->first->getType();
+    if (type->isPointerTy() && type->getPointerAddressSpace() == AddrSpaceLocal)
+    {
+      size_t ptr = m_localMemory->allocateBuffer(value->second.size);
+      m_localAddresses[value->first] = ptr;
+    }
+  }
 
   // Initialise work-items
   for (size_t k = 0; k < m_groupSize.z; k++)
@@ -46,7 +60,6 @@ WorkGroup::WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid)
                                           Size3(i, j, k));
         m_workItems.push_back(workItem);
         m_running.insert(workItem);
-        m_context->notifyWorkItemBegin(workItem);
       }
     }
   }
@@ -287,6 +300,11 @@ Memory* WorkGroup::getLocalMemory() const
   return m_localMemory;
 }
 
+size_t WorkGroup::getLocalMemoryAddress(const llvm::Value *value) const
+{
+  return m_localAddresses.at(value);
+}
+
 WorkItem* WorkGroup::getNextWorkItem() const
 {
   if (m_running.empty())
diff --git a/src/core/WorkGroup.h b/src/core/WorkGroup.h
index 88319cf..73cb2b7 100644
--- a/src/core/WorkGroup.h
+++ b/src/core/WorkGroup.h
@@ -1,5 +1,5 @@
 // WorkGroup.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -32,7 +32,7 @@ namespace oclgrind
     };
     std::set<WorkItem*, WorkItemCmp> m_running;
 
-    typedef struct
+    struct AsyncCopy
     {
       const llvm::Instruction *instruction;
       AsyncCopyType type;
@@ -44,16 +44,16 @@ namespace oclgrind
       size_t destStride;
 
       size_t event;
-    } AsyncCopy;
+    };
 
-    typedef struct
+    struct Barrier
     {
       const llvm::Instruction *instruction;
       std::set<WorkItem*, WorkItemCmp> workItems;
 
       uint64_t fence;
       std::list<size_t> events;
-    } Barrier;
+    };
 
   public:
     WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid);
@@ -76,6 +76,7 @@ namespace oclgrind
     size_t getGroupIndex() const;
     Size3 getGroupSize() const;
     Memory* getLocalMemory() const;
+    size_t getLocalMemoryAddress(const llvm::Value *value) const;
     WorkItem *getNextWorkItem() const;
     WorkItem *getWorkItem(Size3 localID) const;
     bool hasBarrier() const;
@@ -89,7 +90,10 @@ namespace oclgrind
     Size3 m_groupID;
     Size3 m_groupSize;
     const Context *m_context;
+
     Memory *m_localMemory;
+    std::map<const llvm::Value*,size_t> m_localAddresses;
+
     std::vector<WorkItem*> m_workItems;
 
     Barrier *m_barrier;
diff --git a/src/core/WorkItem.cpp b/src/core/WorkItem.cpp
index 9d37ade..4441a33 100644
--- a/src/core/WorkItem.cpp
+++ b/src/core/WorkItem.cpp
@@ -1,13 +1,16 @@
 // WorkItem.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.`
 //
 // This program is provided under a three-clause BSD license. For full
 // license terms please see the LICENSE file distributed with this
 // source code.
 
+#include "config.h"
 #include "common.h"
 
+#include <math.h>
+
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/InstrTypes.h"
@@ -30,9 +33,10 @@ using namespace std;
 
 struct WorkItem::Position
 {
-  llvm::Function::const_iterator       prevBlock;
-  llvm::Function::const_iterator       currBlock;
-  llvm::Function::const_iterator       nextBlock;
+  bool hasBegun;
+  const llvm::BasicBlock *             prevBlock;
+  const llvm::BasicBlock *             currBlock;
+  const llvm::BasicBlock *             nextBlock;
   llvm::BasicBlock::const_iterator     currInst;
   std::stack<const llvm::Instruction*> callStack;
   std::stack< std::list<size_t> >      allocations;
@@ -67,21 +71,48 @@ WorkItem::WorkItem(const KernelInvocation *kernelInvocation,
   // Set initial number of values to store based on cache
   m_values.resize(m_cache->getNumValues());
 
-  m_privateMemory = kernel->getPrivateMemory()->clone();
+  m_privateMemory = new Memory(AddrSpacePrivate, sizeof(size_t)==8 ? 32 : 16,
+                               m_context);
 
-  // Initialise kernel arguments
-  TypedValueMap::const_iterator argItr;
-  for (argItr = kernel->args_begin(); argItr != kernel->args_end(); argItr++)
+  // Initialise kernel arguments and global variables
+  for (auto value  = kernel->values_begin();
+            value != kernel->values_end();
+            value++)
   {
-    setValue(argItr->first, m_pool.clone(argItr->second));
+    pair<unsigned,unsigned> size = getValueSize(value->first);
+    TypedValue v = {
+      size.first,
+      size.second,
+      m_pool.alloc(size.first*size.second)
+    };
+
+    const llvm::Type *type = value->first->getType();
+    if (type->isPointerTy() &&
+        type->getPointerAddressSpace() == AddrSpacePrivate)
+    {
+      size_t sz = value->second.size*value->second.num;
+      v.setPointer(m_privateMemory->allocateBuffer(sz, 0, value->second.data));
+    }
+    else if (type->isPointerTy() &&
+             type->getPointerAddressSpace() == AddrSpaceLocal)
+    {
+      v.setPointer(m_workGroup->getLocalMemoryAddress(value->first));
+    }
+    else
+    {
+      memcpy(v.data, value->second.data, v.size*v.num);
+    }
+
+    setValue(value->first, v);
   }
 
   // Initialize interpreter state
   m_state    = READY;
   m_position = new Position;
+  m_position->hasBegun = false;
   m_position->prevBlock = NULL;
   m_position->nextBlock = NULL;
-  m_position->currBlock = kernel->getFunction()->begin();
+  m_position->currBlock = &*kernel->getFunction()->begin();
   m_position->currInst = m_position->currBlock->begin();
 }
 
@@ -298,19 +329,19 @@ void WorkItem::execute(const llvm::Instruction *instruction)
   m_context->notifyInstructionExecuted(this, instruction, result);
 }
 
-TypedValue WorkItem::getValue(const llvm::Value *key) const
+const stack<const llvm::Instruction*>& WorkItem::getCallStack() const
 {
-  return m_values[m_cache->getValueID(key)];
+  return m_position->callStack;
 }
 
-const stack<const llvm::Instruction*>& WorkItem::getCallStack() const
+const llvm::BasicBlock* WorkItem::getCurrentBlock() const
 {
-  return m_position->callStack;
+  return m_position->currBlock;
 }
 
 const llvm::Instruction* WorkItem::getCurrentInstruction() const
 {
-  return m_position->currInst;
+  return &*m_position->currInst;
 }
 
 Size3 WorkItem::getGlobalID() const
@@ -416,6 +447,11 @@ TypedValue WorkItem::getOperand(const llvm::Value *operand) const
   assert(false);
 }
 
+const llvm::BasicBlock* WorkItem::getPreviousBlock() const
+{
+  return m_position->prevBlock;
+}
+
 Memory* WorkItem::getPrivateMemory() const
 {
   return m_privateMemory;
@@ -426,6 +462,11 @@ WorkItem::State WorkItem::getState() const
   return m_state;
 }
 
+TypedValue WorkItem::getValue(const llvm::Value *key) const
+{
+  return m_values[m_cache->getValueID(key)];
+}
+
 const unsigned char* WorkItem::getValueData(const llvm::Value *value) const
 {
   if (!hasValue(value))
@@ -437,13 +478,27 @@ const unsigned char* WorkItem::getValueData(const llvm::Value *value) const
 
 const llvm::Value* WorkItem::getVariable(std::string name) const
 {
+  // Check private variables
   VariableMap::const_iterator itr;
   itr = m_variables.find(name);
-  if (itr == m_variables.end())
+  if (itr != m_variables.end())
+    return itr->second;
+
+  // Check global variables
+  string globalName = m_position->currBlock->getParent()->getName();
+  globalName += ".";
+  globalName += name;
+  const llvm::Module *module =
+    m_kernelInvocation->getKernel()->getFunction()->getParent();
+  for (auto global = module->global_begin();
+            global != module->global_end();
+            global++)
   {
-    return NULL;
+    if (global->getName() == globalName)
+      return &*global;
   }
-  return itr->second;
+
+  return NULL;
 }
 
 const WorkGroup* WorkItem::getWorkGroup() const
@@ -478,18 +533,18 @@ bool WorkItem::printVariable(string name) const
   }
 
   // Get variable value
-  TypedValue result = getValue(value);
+  TypedValue result = getOperand(value);
   const llvm::Type *type = value->getType();
 
-  if (((const llvm::Instruction*)value)->getOpcode()
-       == llvm::Instruction::Alloca)
+  if (value->getValueID() == llvm::Value::GlobalVariableVal ||
+      ((const llvm::Instruction*)value)->getOpcode()
+         == llvm::Instruction::Alloca)
   {
-    // If value is alloca result, look-up data at address
-    const llvm::Type *elemType = value->getType()->getPointerElementType();
+    // If value is alloca or global variable, look-up data at address
     size_t address = result.getPointer();
-
-    unsigned char *data = (unsigned char*)m_privateMemory->getPointer(address);
-    printTypedData(elemType, data);
+    Memory *memory = getMemory(value->getType()->getPointerAddressSpace());
+    unsigned char *data = (unsigned char*)memory->getPointer(address);
+    printTypedData(value->getType()->getPointerElementType(), data);
   }
   else
   {
@@ -508,8 +563,14 @@ WorkItem::State WorkItem::step()
 {
   assert(m_state == READY);
 
+  if (!m_position->hasBegun)
+  {
+    m_position->hasBegun = true;
+    m_context->notifyWorkItemBegin(this);
+  }
+
   // Execute the next instruction
-  execute(m_position->currInst);
+  execute(&*m_position->currInst);
 
   // Check if we've reached the end of the block
   if (++m_position->currInst == m_position->currBlock->end() ||
@@ -525,6 +586,9 @@ WorkItem::State WorkItem::step()
     }
   }
 
+  if (m_state == FINISHED)
+    m_context->notifyWorkItemComplete(this);
+
   return m_state;
 }
 
@@ -663,9 +727,9 @@ INSTRUCTION(call)
   // Check if function has definition
   if (!function->isDeclaration())
   {
-    m_position->callStack.push(m_position->currInst);
+    m_position->callStack.push(&*m_position->currInst);
     m_position->allocations.push(list<size_t>());
-    m_position->nextBlock = function->begin();
+    m_position->nextBlock = &*function->begin();
 
     // Set function arguments
     llvm::Function::const_arg_iterator argItr;
@@ -673,7 +737,30 @@ INSTRUCTION(call)
          argItr != function->arg_end(); argItr++)
     {
       const llvm::Value *arg = callInst->getArgOperand(argItr->getArgNo());
-      setValue(argItr, m_pool.clone(getOperand(arg)));
+      TypedValue value = getOperand(arg);
+
+      if (argItr->hasByValAttr())
+      {
+        // Make new copy of value in private memory
+        void *data = m_privateMemory->getPointer(value.getPointer());
+        size_t size = getTypeSize(argItr->getType()->getPointerElementType());
+        size_t ptr  = m_privateMemory->allocateBuffer(size, 0, (uint8_t*)data);
+        m_position->allocations.top().push_back(ptr);
+
+        // Pass new allocation to function
+        TypedValue address =
+        {
+          sizeof(size_t),
+          1,
+          m_pool.alloc(sizeof(size_t))
+        };
+        address.setPointer(ptr);
+        setValue(&*argItr, address);
+      }
+      else
+      {
+        setValue(&*argItr, m_pool.clone(value));
+      }
     }
 
     return;
@@ -780,20 +867,19 @@ INSTRUCTION(fcmp)
       r = a <= b;
       break;
     case llvm::CmpInst::FCMP_FALSE:
+    case llvm::CmpInst::FCMP_UNO:
       r = false;
       break;
     case llvm::CmpInst::FCMP_TRUE:
-      r = true;
-      break;
     case llvm::CmpInst::FCMP_ORD:
-    case llvm::CmpInst::FCMP_UNO:
+      r = true;
       break;
     default:
       FATAL_ERROR("Unsupported FCmp predicate: %d", pred);
     }
 
     // Deal with NaN operands
-    if (::isnan(a) || ::isnan(b))
+    if (std::isnan(a) || std::isnan(b))
     {
       r = !llvm::CmpInst::isOrdered(pred);
     }
@@ -1057,10 +1143,14 @@ INSTRUCTION(load)
 {
   const llvm::LoadInst *loadInst = (const llvm::LoadInst*)instruction;
   unsigned addressSpace = loadInst->getPointerAddressSpace();
-  size_t address = getOperand(loadInst->getPointerOperand()).getPointer();
+  const llvm::Value *opPtr = loadInst->getPointerOperand();
+  size_t address = getOperand(opPtr).getPointer();
 
   // Check address is correctly aligned
-  if (address & (loadInst->getAlignment()-1))
+  unsigned alignment = loadInst->getAlignment();
+  if (!alignment)
+    alignment = getTypeAlignment(opPtr->getType()->getPointerElementType());
+  if (address & (alignment-1))
   {
     m_context->logError("Invalid memory load - source pointer is "
                         "not aligned to the pointed type");
@@ -1116,7 +1206,8 @@ INSTRUCTION(ret)
 
   if (!m_position->callStack.empty())
   {
-    m_position->currInst = m_position->callStack.top();
+    m_position->currInst =
+      llvm::BasicBlock::const_iterator(m_position->callStack.top());
     m_position->currBlock = m_position->currInst->getParent();
     m_position->callStack.pop();
 
@@ -1124,7 +1215,7 @@ INSTRUCTION(ret)
     const llvm::Value *returnVal = retInst->getReturnValue();
     if (returnVal)
     {
-      setValue(m_position->currInst, m_pool.clone(getOperand(returnVal)));
+      setValue(&*m_position->currInst, m_pool.clone(getOperand(returnVal)));
     }
 
     // Clear stack allocations
@@ -1141,7 +1232,6 @@ INSTRUCTION(ret)
     m_position->nextBlock = NULL;
     m_state = FINISHED;
     m_workGroup->notifyFinished(this);
-    m_context->notifyWorkItemComplete(this);
   }
 }
 
@@ -1270,10 +1360,14 @@ INSTRUCTION(store)
 {
   const llvm::StoreInst *storeInst = (const llvm::StoreInst*)instruction;
   unsigned addressSpace = storeInst->getPointerAddressSpace();
-  size_t address = getOperand(storeInst->getPointerOperand()).getPointer();
+  const llvm::Value *opPtr = storeInst->getPointerOperand();
+  size_t address = getOperand(opPtr).getPointer();
 
   // Check address is correctly aligned
-  if (address & (storeInst->getAlignment()-1))
+  unsigned alignment = storeInst->getAlignment();
+  if (!alignment)
+    alignment = getTypeAlignment(opPtr->getType()->getPointerElementType());
+  if (address & (alignment-1))
   {
     m_context->logError("Invalid memory store - source pointer is "
                         "not aligned to the pointed type");
@@ -1322,7 +1416,11 @@ INSTRUCTION(uitofp)
   TypedValue op = getOperand(instruction->getOperand(0));
   for (unsigned i = 0; i < result.num; i++)
   {
-    result.setFloat(op.getUInt(i), i);
+    uint64_t in = op.getUInt(i);
+    if (result.size == 4)
+      result.setFloat(in ? (float)in : 0.f, i);
+    else
+      result.setFloat(in ? (double)in : 0.0, i);
   }
 }
 
@@ -1365,7 +1463,7 @@ InterpreterCache::InterpreterCache(llvm::Function *kernel)
   llvm::Module::const_global_iterator G;
   for (G = module->global_begin(); G != module->global_end(); G++)
   {
-    addValueID(G);
+    addValueID(&*G);
   }
 
 
@@ -1385,7 +1483,7 @@ InterpreterCache::InterpreterCache(llvm::Function *kernel)
     llvm::Function::arg_iterator A;
     for (A = function->arg_begin(); A != function->arg_end(); A++)
     {
-      addValueID(A);
+      addValueID(&*A);
     }
 
     // Iterate through instructions in function
@@ -1573,8 +1671,6 @@ bool InterpreterCache::hasValue(const llvm::Value *value) const
 
 void InterpreterCache::addOperand(const llvm::Value *operand)
 {
-  addValueID(operand);
-
   // Resolve constants
   if (operand->getValueID() == llvm::Value::UndefValueVal            ||
       operand->getValueID() == llvm::Value::ConstantAggregateZeroVal ||
@@ -1595,8 +1691,7 @@ void InterpreterCache::addOperand(const llvm::Value *operand)
     const llvm::ConstantExpr *expr = (const llvm::ConstantExpr*)operand;
     if (!m_constExpressions.count(expr))
     {
-      for (llvm::User::const_op_iterator O = expr->op_begin();
-           O != expr->op_end(); O++)
+      for (auto O = expr->op_begin(); O != expr->op_end(); O++)
       {
         addOperand(*O);
       }
@@ -1604,57 +1699,8 @@ void InterpreterCache::addOperand(const llvm::Value *operand)
       // TODO: Resolve actual value?
     }
   }
-}
-
-
-//////////////////////////
-// WorkItem::MemoryPool //
-//////////////////////////
-
-WorkItem::MemoryPool::MemoryPool(size_t blockSize) : m_blockSize(blockSize)
-{
-  // Force first allocation to create new block
-  m_offset = m_blockSize;
-}
-
-WorkItem::MemoryPool::~MemoryPool()
-{
-  list<unsigned char*>::iterator itr;
-  for (itr = m_blocks.begin(); itr != m_blocks.end(); itr++)
-  {
-    delete[] *itr;
-  }
-}
-
-unsigned char* WorkItem::MemoryPool::alloc(size_t size)
-{
-  // Check if requested size larger than block size
-  if (size > m_blockSize)
-  {
-    // Oversized buffers allocated separately from main pool
-    unsigned char *buffer = new unsigned char[size];
-    m_blocks.push_back(buffer);
-    return buffer;
-  }
-
-  // Check if enough space in current block
-  if (m_offset + size > m_blockSize)
+  else
   {
-    // Allocate new block
-    m_blocks.push_front(new unsigned char[m_blockSize]);
-    m_offset = 0;
+    addValueID(operand);
   }
-  unsigned char *buffer = m_blocks.front() + m_offset;
-  m_offset += size;
-  return buffer;
-}
-
-TypedValue WorkItem::MemoryPool::clone(const TypedValue& source)
-{
-  TypedValue dest;
-  dest.size = source.size;
-  dest.num = source.num;
-  dest.data = alloc(dest.size*dest.num);
-  memcpy(dest.data, source.data, dest.size*dest.num);
-  return dest;
 }
diff --git a/src/core/WorkItem.h b/src/core/WorkItem.h
index ae8380c..738df37 100644
--- a/src/core/WorkItem.h
+++ b/src/core/WorkItem.h
@@ -1,5 +1,5 @@
 // WorkItem.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -10,6 +10,7 @@
 
 namespace llvm
 {
+  class BasicBlock;
   class CallInst;
   class ConstExpr;
   class DbgValueInst;
@@ -28,17 +29,17 @@ namespace oclgrind
   class WorkItemBuiltins;
 
   // Data structures for builtin functions
-  typedef struct _BuiltinFunction
+  struct BuiltinFunction
   {
     void (*func)(WorkItem*, const llvm::CallInst*,
                  const std::string&, const std::string&, TypedValue&, void*);
     void *op;
-    _BuiltinFunction(){};
-    _BuiltinFunction(void (*f)(WorkItem*, const llvm::CallInst*,
+    BuiltinFunction(){};
+    BuiltinFunction(void (*f)(WorkItem*, const llvm::CallInst*,
                      const std::string&, const std::string&, TypedValue&,
                      void*),
                      void *o) : func(f), op(o) {};
-  } BuiltinFunction;
+  };
   typedef std::unordered_map<std::string,BuiltinFunction> BuiltinFunctionMap;
   typedef std::list< std::pair<std::string, BuiltinFunction> >
     BuiltinFunctionPrefixList;
@@ -50,11 +51,11 @@ namespace oclgrind
   class InterpreterCache
   {
   public:
-    typedef struct
+    struct Builtin
     {
       BuiltinFunction function;
       std::string name, overload;
-    } Builtin;
+    };
 
     InterpreterCache(llvm::Function *kernel);
     ~InterpreterCache();
@@ -93,20 +94,6 @@ namespace oclgrind
   public:
     enum State {READY, BARRIER, FINISHED};
 
-  private:
-    class MemoryPool
-    {
-    public:
-      MemoryPool(size_t blockSize = 1024);
-      ~MemoryPool();
-      unsigned char* alloc(size_t size);
-      TypedValue clone(const TypedValue& source);
-    private:
-      size_t m_blockSize;
-      size_t m_offset;
-      std::list<unsigned char *> m_blocks;
-    } mutable m_pool;
-
   public:
     WorkItem(const KernelInvocation *kernelInvocation,
              WorkGroup *workGroup, Size3 lid);
@@ -116,11 +103,13 @@ namespace oclgrind
     void dispatch(const llvm::Instruction *instruction, TypedValue& result);
     void execute(const llvm::Instruction *instruction);
     const std::stack<const llvm::Instruction*>& getCallStack() const;
+    const llvm::BasicBlock* getCurrentBlock() const;
     const llvm::Instruction* getCurrentInstruction() const;
     Size3 getGlobalID() const;
     size_t getGlobalIndex() const;
     Size3 getLocalID() const;
     TypedValue getOperand(const llvm::Value *operand) const;
+    const llvm::BasicBlock* getPreviousBlock() const;
     Memory* getPrivateMemory() const;
     State getState() const;
     const unsigned char* getValueData(const llvm::Value *value) const;
@@ -195,6 +184,7 @@ namespace oclgrind
     const KernelInvocation *m_kernelInvocation;
     Memory *m_privateMemory;
     WorkGroup *m_workGroup;
+    mutable MemoryPool m_pool;
 
     State m_state;
     struct Position;
diff --git a/src/core/WorkItemBuiltins.cpp b/src/core/WorkItemBuiltins.cpp
index cce6da6..0bd7837 100644
--- a/src/core/WorkItemBuiltins.cpp
+++ b/src/core/WorkItemBuiltins.cpp
@@ -1,14 +1,18 @@
 // WorkItemBuiltins.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
 // license terms please see the LICENSE file distributed with this
 // source code.
 
+#include "config.h"
 #include "common.h"
+
 #include <algorithm>
+#include <float.h>
 #include <fenv.h>
+#include <math.h>
 #include <mutex>
 
 #include "llvm/IR/Instructions.h"
@@ -258,8 +262,8 @@ namespace oclgrind
         address += sizeof(size_t);
       }
       workItem->m_state = WorkItem::BARRIER;
-      workItem->m_workGroup->notifyBarrier(workItem, callInst,
-                                           CLK_LOCAL_MEM_FENCE, events);
+      workItem->m_workGroup->notifyBarrier(
+        workItem, callInst, CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, events);
     }
 
     DEFINE_BUILTIN(prefetch)
@@ -450,7 +454,7 @@ namespace oclgrind
 
     static double _sign_(double x)
     {
-      if (::isnan(x))  return  0.0;
+      if (std::isnan(x)) return 0.0;
       if (x  >  0.0) return  1.0;
       if (x == -0.0) return -0.0;
       if (x ==  0.0) return  0.0;
@@ -646,6 +650,39 @@ namespace oclgrind
       result.setFloat(r);
     }
 
+    static double geometric_length(double *values, unsigned num)
+    {
+      double lengthSq = 0.0;
+      for (unsigned i = 0; i < num; i++)
+      {
+        lengthSq += values[i] * values[i];
+      }
+
+      // Check for overflow/underflow
+      double rescale = 1.0;
+      if (lengthSq == INFINITY)
+      {
+        rescale = ldexp(1.0, -512);
+      }
+      else if (lengthSq < num*DBL_MIN/DBL_EPSILON)
+      {
+        rescale = ldexp(1.0, 640);
+      }
+
+      if (rescale != 1.0)
+      {
+        // Re-do calculations with a rescaling multiplier
+        lengthSq = 0.0;
+        for (unsigned i = 0; i < num; i++)
+        {
+          double f = values[i] * rescale;
+          lengthSq += f*f;
+        }
+      }
+
+      return sqrt(lengthSq) * (1.0/rescale);
+    }
+
     DEFINE_BUILTIN(distance)
     {
       unsigned num = 1;
@@ -654,13 +691,12 @@ namespace oclgrind
         num = ARG(0)->getType()->getVectorNumElements();
       }
 
-      double distSq = 0.0;
+      double values[4];
       for (unsigned i = 0; i < num; i++)
       {
-        double diff = FARGV(0,i) - FARGV(1,i);
-        distSq += diff*diff;
+        values[i] = FARGV(0, i) - FARGV(1, i);
       }
-      result.setFloat(sqrt(distSq));
+      result.setFloat(geometric_length(values, num));
     }
 
     DEFINE_BUILTIN(length)
@@ -671,26 +707,79 @@ namespace oclgrind
         num = ARG(0)->getType()->getVectorNumElements();
       }
 
-      double lengthSq = 0.0;
+      double values[4];
       for (unsigned i = 0; i < num; i++)
       {
-        lengthSq += FARGV(0, i) * FARGV(0, i);
+        values[i] = FARGV(0, i);
       }
-      result.setFloat(sqrt(lengthSq));
+      result.setFloat(geometric_length(values, num));
     }
 
     DEFINE_BUILTIN(normalize)
     {
+      double values[4];
       double lengthSq = 0.0;
       for (unsigned i = 0; i < result.num; i++)
       {
-        lengthSq += FARGV(0, i) * FARGV(0, i);
+        values[i] = FARGV(0, i);
+        lengthSq += values[i] * values[i];
       }
-      double length = sqrt(lengthSq);
 
+      if (lengthSq == INFINITY)
+      {
+        // Re-do calculations with a rescaling multiplier
+        lengthSq = 0.0;
+        double rescale = ldexp(1.0, -512);
+        for (unsigned i = 0; i < result.num; i++)
+        {
+          values[i] = values[i] * rescale;
+          lengthSq += values[i] * values[i];
+        }
+
+        if (lengthSq == INFINITY)
+        {
+          // Infinities in input, set all other values to 0
+          lengthSq = 0.0;
+          for (unsigned i = 0; i < result.num; i++)
+          {
+            if (std::isinf(values[i]))
+            {
+              values[i] = copysign(1.0, FARGV(0, i));
+              lengthSq += 1.0;
+            }
+            else
+            {
+              values[i] = copysign(0.0, FARGV(0, i));
+            }
+          }
+        }
+      }
+      else if (lengthSq < result.num*DBL_MIN/DBL_EPSILON)
+      {
+        // Re-do calculations with a rescaling multiplier
+        lengthSq = 0.0;
+        double rescale = ldexp(1.0, 640);
+        for (unsigned i = 0; i < result.num; i++)
+        {
+          values[i] = values[i] * rescale;
+          lengthSq += values[i] * values[i];
+        }
+
+        if (lengthSq == 0.0)
+        {
+          // Zeros in input, copy vector unchanged
+          for (unsigned i = 0; i < result.num; i++)
+          {
+            result.setFloat(FARGV(0, i), i);
+          }
+          return;
+        }
+      }
+
+      double length = sqrt(lengthSq);
       for (unsigned i = 0; i < result.num; i++)
       {
-        result.setFloat(FARGV(0, i)/length, i);
+        result.setFloat(values[i]/length, i);
       }
     }
 
@@ -992,7 +1081,7 @@ namespace oclgrind
       }
 
       // Remap channels
-      float ret;
+      float ret = 0.f;
       int channel = getInputChannel(image->format, c, &ret);
       if (channel < 0)
       {
@@ -1066,7 +1155,7 @@ namespace oclgrind
       }
 
       // Remap channels
-      float ret;
+      float ret = 0.f;
       int channel = getInputChannel(image->format, c, &ret);
       if (channel < 0)
       {
@@ -1131,7 +1220,7 @@ namespace oclgrind
       }
 
       // Remap channels
-      float ret;
+      float ret = 0.f;
       int channel = getInputChannel(image->format, c, &ret);
       if (channel < 0)
       {
@@ -1198,6 +1287,12 @@ namespace oclgrind
             +    a  *    b  *    c  * v111;
     }
 
+    DEFINE_BUILTIN(translate_sampler_initializer)
+    {
+      // A sampler initializer is just a pointer to its ConstantInt object
+      result.setPointer((size_t)ARG(0));
+    }
+
     DEFINE_BUILTIN(read_imagef)
     {
       const Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
@@ -1208,7 +1303,11 @@ namespace oclgrind
       // Check for sampler version
       if (callInst->getNumArgOperands() > 2)
       {
+#if LLVM_VERSION < 40
         sampler = UARG(1);
+#else
+        sampler = ((llvm::ConstantInt*)PARG(1))->getZExtValue();
+#endif
         coordIndex = 2;
       }
 
@@ -1326,7 +1425,11 @@ namespace oclgrind
       // Check for sampler version
       if (callInst->getNumArgOperands() > 2)
       {
+#if LLVM_VERSION < 40
         sampler = UARG(1);
+#else
+        sampler = ((llvm::ConstantInt*)PARG(1))->getZExtValue();
+#endif
         coordIndex = 2;
       }
 
@@ -1399,7 +1502,11 @@ namespace oclgrind
       // Check for sampler version
       if (callInst->getNumArgOperands() > 2)
       {
+#if LLVM_VERSION < 40
         sampler = UARG(1);
+#else
+        sampler = ((llvm::ConstantInt*)PARG(1))->getZExtValue();
+#endif
         coordIndex = 2;
       }
 
@@ -1888,8 +1995,8 @@ namespace oclgrind
           {
             uint64_t a = UARGV(0, i);
             uint64_t b = UARGV(1, i);
-            uint64_t c = (a > UINT64_MAX-b) ? (1L<<63) : 0;
-            result.setUInt(((a + b) >> 1) | c, i);
+            uint64_t c = (a & b) & 1;
+            result.setUInt((a>>1) + (b>>1) + c, i);
             break;
           }
           case 'c':
@@ -2128,8 +2235,8 @@ namespace oclgrind
           {
             uint64_t a = UARGV(0, i);
             uint64_t b = UARGV(1, i);
-            uint64_t c = (a > UINT64_MAX-(b+1)) ? (1L<<63) : 0;
-            result.setUInt(((a + b + 1) >> 1) | c, i);
+            uint64_t c = (a | b) & 1;
+            result.setUInt((a>>1) + (b>>1) + c, i);
             break;
           }
           case 'c':
@@ -2239,9 +2346,15 @@ namespace oclgrind
     static double _sinpi_(double x){ return (sin(x * M_PI)); }
     static double _tanpi_(double x){ return (tan(x * M_PI)); }
 
-    static double _fma_(double a, double b, double c)
+    DEFINE_BUILTIN(fma_builtin)
     {
-      return a*b + c;
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        if (result.size == 4)
+          result.setFloat(fmaf(FARGV(0, i), FARGV(1, i), FARGV(2, i)), i);
+        else
+          result.setFloat(fma(FARGV(0, i), FARGV(1, i), FARGV(2, i)), i);
+      }
     }
 
     static double _maxmag_(double x, double y)
@@ -2289,12 +2402,25 @@ namespace oclgrind
       for (unsigned i = 0; i < result.num; i++)
       {
         double x = FARGV(0, i);
-        double fl = floor(x);
-#if defined(_WIN32) && !defined(__MINGW32__)
-        double r = fmin(x - fl, nextafter(1, 0));
-#else
-        double r = fmin(x - fl, 0x1.fffffep-1f);
-#endif
+        double fl, r;
+        if (std::isnan(x))
+        {
+          r = nan("");
+          fl = nan("");
+        }
+        else
+        {
+          if (result.size == 4)
+          {
+            fl = floorf(x);
+            r = fmin(x - fl, nextafterf(1, 0));
+          }
+          else
+          {
+            fl = floor(x);
+            r = fmin(x - fl, nextafter(1, 0));
+          }
+        }
 
         size_t offset = i*result.size;
         result.setFloat(fl, i);
@@ -2359,7 +2485,7 @@ namespace oclgrind
       {
         double x = FARGV(0, i);
         double integral = trunc(x);
-        double fractional = copysign(::isinf(x) ? 0.0 : x - integral, x);
+        double fractional = copysign(std::isinf(x) ? 0.0 : x - integral, x);
 
         size_t offset = i*result.size;
         result.setFloat(integral, i);
@@ -2397,6 +2523,59 @@ namespace oclgrind
       }
     }
 
+    DEFINE_BUILTIN(powr)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double x = FARGV(0, i);
+        double y = FARGV(1, i);
+
+        double r;
+        if (x < 0.0)
+        {
+          r = nan("");
+        }
+        else if (std::isnan(x) || std::isnan(y))
+        {
+          r = nan("");
+        }
+        else if (x == 1.0)
+        {
+          if (std::isinf(y))
+            r = nan("");
+          else
+            r = 1.0;
+        }
+        else if (y == 0.0)
+        {
+          if (x == 0.0 || x == INFINITY)
+            r = nan("");
+          else
+            r = 1.0;
+        }
+        else if (x == 0.0)
+        {
+          if (y < 0.0)
+            r = INFINITY;
+          else
+            r = 0.0;
+        }
+        else if (x == INFINITY)
+        {
+          if (y < 0.0)
+            r = 0.0;
+          else
+            r = INFINITY;
+        }
+        else
+        {
+          r = pow(x, y);
+        }
+
+        result.setFloat(r, i);
+      }
+    }
+
     DEFINE_BUILTIN(remquo_builtin)
     {
       Memory *memory =
@@ -2419,9 +2598,43 @@ namespace oclgrind
     {
       for (unsigned i = 0; i < result.num; i++)
       {
-        double x = FARGV(0, i);
-        int y = SARGV(1, i);
-        result.setFloat(pow(x, (double)(1.0/y)), i);
+        long double x = FARGV(0, i);
+        int n = SARGV(1, i);
+
+        long double r;
+        if (n == 0)
+        {
+          r = nan("");
+        }
+        else if (x == 0)
+        {
+          if (n < 0)
+          {
+            if (n&1)
+              r = copysign(INFINITY, x);
+            else
+              r = INFINITY;
+          }
+          else
+          {
+            if (n&1)
+              r = x;
+            else
+              r = 0.0;
+          }
+        }
+        else if (x < 0 && !(n&1))
+        {
+          r = nan("");
+        }
+        else
+        {
+          r = pow(fabs(x), 1.0L/n);
+          if (x < 0 && n&1)
+            r = -r;
+        }
+
+        result.setFloat(r, i);
       }
     }
 
@@ -2488,8 +2701,8 @@ namespace oclgrind
     static int64_t _isle_(double x, double y){ return islessequal(x, y); }
     static int64_t _islg_(double x, double y){ return islessgreater(x, y); }
     static int64_t _isfin_(double x){ return isfinite(x); }
-    static int64_t _isinf_(double x){ return ::isinf(x); }
-    static int64_t _isnan_(double x){ return ::isnan(x); }
+    static int64_t _isinf_(double x){ return std::isinf(x); }
+    static int64_t _isnan_(double x){ return std::isnan(x); }
     static int64_t _isnorm_(double x){ return isnormal(x); }
     static int64_t _isord_(double x, double y){ return !isunordered(x, y); }
     static int64_t _isuord_(double x, double y){ return isunordered(x, y); }
@@ -2697,11 +2910,13 @@ namespace oclgrind
       uint64_t offset = UARG(1);
 
       // Convert to halfs
-      unsigned char *data = workItem->getOperand(value).data;
-      size_t num = size / sizeof(float);
-      size = num*sizeof(cl_half);
-      uint16_t *halfData = (uint16_t*)workItem->m_pool.alloc(2*num);
-      HalfRoundMode rmode = Half_RTE; //  The Oclgrind device's round mode
+      TypedValue op = workItem->getOperand(value);
+      unsigned char *data = op.data;
+      size = op.num*sizeof(cl_half);
+      uint16_t *halfData = (uint16_t*)workItem->m_pool.alloc(2*op.num);
+
+      // Parse rounding mode (RTE is the default)
+      HalfRoundMode rmode = Half_RTE;
       if (fnName.find("_rtz") != std::string::npos)
         rmode = Half_RTZ;
       else if (fnName.find("_rtn") != std::string::npos)
@@ -2709,19 +2924,22 @@ namespace oclgrind
       else if (fnName.find("_rtp") != std::string::npos)
         rmode = Half_RTP;
 
-      for (unsigned i = 0; i < num; i++)
+      for (unsigned i = 0; i < op.num; i++)
       {
-        halfData[i] = floatToHalf(((float*)data)[i], rmode);
+        if (op.size == 4)
+          halfData[i] = floatToHalf(((float*)data)[i], rmode);
+        else
+          halfData[i] = doubleToHalf(((double*)data)[i], rmode);
       }
 
       size_t address;
-      if (fnName.compare(0, 7, "vstorea") == 0 && num == 3)
+      if (fnName.compare(0, 7, "vstorea") == 0 && op.num == 3)
       {
         address = base + offset*sizeof(cl_half)*4;
       }
       else
       {
-        address = base + offset*sizeof(cl_half)*num;
+        address = base + offset*sizeof(cl_half)*op.num;
       }
 
       workItem->getMemory(addressSpace)->store((unsigned char*)halfData,
@@ -2798,8 +3016,41 @@ namespace oclgrind
     // Other Functions //
     /////////////////////
 
+    static void setConvertRoundingMode(const string& name, int def)
+    {
+      size_t rpos = name.find("_rt");
+      if (rpos != string::npos)
+      {
+        switch (name[rpos+3])
+        {
+        case 'e':
+          fesetround(FE_TONEAREST);
+          break;
+        case 'z':
+          fesetround(FE_TOWARDZERO);
+          break;
+        case 'p':
+          fesetround(FE_UPWARD);
+          break;
+        case 'n':
+          fesetround(FE_DOWNWARD);
+          break;
+        default:
+          FATAL_ERROR("Unsupported rounding mode: %c", name[rpos+3]);
+        }
+      }
+      else
+      {
+        fesetround(def);
+      }
+    }
+
     DEFINE_BUILTIN(convert_float)
     {
+      // Use rounding mode
+      const int origRnd = fegetround();
+      setConvertRoundingMode(fnName, FE_TONEAREST);
+
       for (unsigned i = 0; i < result.num; i++)
       {
         switch (getOverloadArgType(overload))
@@ -2808,13 +3059,19 @@ namespace oclgrind
           case 't':
           case 'j':
           case 'm':
-            result.setFloat((float)UARGV(0, i), i);
+          {
+            uint64_t in = UARGV(0, i);
+            if (result.size == 4)
+              result.setFloat(in ? (float)in : 0.f, i);
+            else
+              result.setFloat(in ? (double)in : 0.0, i);
             break;
+          }
           case 'c':
           case 's':
           case 'i':
           case 'l':
-            result.setFloat((float)SARGV(0, i), i);
+            result.setFloat(SARGV(0, i), i);
             break;
           case 'f':
           case 'd':
@@ -2825,6 +3082,7 @@ namespace oclgrind
                         getOverloadArgType(overload));
         }
       }
+      fesetround(origRnd);
     }
 
     DEFINE_BUILTIN(convert_half)
@@ -2865,44 +3123,32 @@ namespace oclgrind
       }
     }
 
-    static void setConvertRoundingMode(const string& name)
-    {
-      size_t rpos = name.find("_rt");
-      if (rpos != string::npos)
-      {
-        switch (name[rpos+3])
-        {
-        case 'e':
-          fesetround(FE_TONEAREST);
-          break;
-        case 'z':
-          fesetround(FE_TOWARDZERO);
-          break;
-        case 'p':
-          fesetround(FE_UPWARD);
-          break;
-        case 'n':
-          fesetround(FE_DOWNWARD);
-          break;
-        default:
-          FATAL_ERROR("Unsupported rounding mode: %c", name[rpos=3]);
-        }
-      }
-      else
-      {
-        fesetround(FE_TOWARDZERO);
-      }
-    }
-
     DEFINE_BUILTIN(convert_uint)
     {
       // Check for saturation modifier
       bool sat = fnName.find("_sat") != string::npos;
-      uint64_t max = (1UL<<(result.size*8)) - 1;
+      uint64_t max;
+      switch (result.size)
+      {
+      case 1:
+        max = UINT8_MAX;
+        break;
+      case 2:
+        max = UINT16_MAX;
+        break;
+      case 4:
+        max = UINT32_MAX;
+        break;
+      case 8:
+        max = UINT64_MAX;
+        break;
+      default:
+        FATAL_ERROR("Unsupported integer size %d", result.size);
+      }
 
       // Use rounding mode
       const int origRnd = fegetround();
-      setConvertRoundingMode(fnName);
+      setConvertRoundingMode(fnName, FE_TOWARDZERO);
 
       for (unsigned i = 0; i < result.num; i++)
       {
@@ -2943,7 +3189,8 @@ namespace oclgrind
           case 'd':
             if (sat)
             {
-              r = rint(_clamp_(FARGV(0, i), 0.0, (double)max));
+              r = rint(_clamp_((long double)FARGV(0, i),
+                                0.0L, (long double)max));
             }
             else
             {
@@ -2983,11 +3230,13 @@ namespace oclgrind
         min = INT64_MIN;
         max = INT64_MAX;
         break;
+      default:
+        FATAL_ERROR("Unsupported integer size %d", result.size);
       }
 
       // Use rounding mode
       const int origRnd = fegetround();
-      setConvertRoundingMode(fnName);
+      setConvertRoundingMode(fnName, FE_TOWARDZERO);
 
       for (unsigned i = 0; i < result.num; i++)
       {
@@ -3018,7 +3267,8 @@ namespace oclgrind
           case 'd':
             if (sat)
             {
-              r = rint(_clamp_(FARGV(0, i), (double)min, (double)max));
+              r = rint(_clamp_((long double)FARGV(0, i),
+                               (long double)min, (long double)max));
             }
             else
             {
@@ -3185,6 +3435,17 @@ namespace oclgrind
     // LLVM Intrinsics //
     /////////////////////
 
+    DEFINE_BUILTIN(llvm_bswap)
+    {
+      uint64_t r = 0;
+      uint64_t value = UARG(0);
+      for (unsigned i = 0; i < result.size; i++)
+      {
+        r |= ((value>>(i*8)) & 0xFF) << ((result.size - i - 1)*8);
+      }
+      result.setUInt(r);
+    }
+
     DEFINE_BUILTIN(llvm_dbg_declare)
     {
       const llvm::DbgDeclareInst *dbgInst =
@@ -3371,6 +3632,10 @@ namespace oclgrind
     ADD_BUILTIN("write_imagef", write_imagef, NULL);
     ADD_BUILTIN("write_imagei", write_imagei, NULL);
     ADD_BUILTIN("write_imageui", write_imageui, NULL);
+#if LLVM_VERSION >= 40
+    ADD_BUILTIN("__translate_sampler_initializer",
+                translate_sampler_initializer, NULL);
+#endif
 
     // Integer Functions
     ADD_BUILTIN("abs", abs_builtin, NULL);
@@ -3416,7 +3681,7 @@ namespace oclgrind
     ADD_BUILTIN("fabs", f1arg, F1ARG(fabs));
     ADD_BUILTIN("fdim", f2arg, F2ARG(fdim));
     ADD_BUILTIN("floor", f1arg, F1ARG(floor));
-    ADD_BUILTIN("fma", f3arg, F3ARG(_fma_));
+    ADD_BUILTIN("fma", fma_builtin, NULL);
     ADD_BUILTIN("fmax", f2arg, F2ARG(fmax));
     ADD_BUILTIN("fmin", f2arg, F2ARG(fmin));
     ADD_BUILTIN("fmod", f2arg, F2ARG(fmod));
@@ -3432,7 +3697,7 @@ namespace oclgrind
     ADD_BUILTIN("log10", f1arg, F1ARG(log10));
     ADD_BUILTIN("log1p", f1arg, F1ARG(log1p));
     ADD_BUILTIN("logb", f1arg, F1ARG(logb));
-    ADD_BUILTIN("mad", f3arg, F3ARG(_fma_));
+    ADD_BUILTIN("mad", fma_builtin, NULL);
     ADD_BUILTIN("maxmag", f2arg, _maxmag_);
     ADD_BUILTIN("minmag", f2arg, _minmag_);
     ADD_BUILTIN("modf", modf_builtin, NULL);
@@ -3441,7 +3706,7 @@ namespace oclgrind
     ADD_BUILTIN("nextafter", nextafter_builtin, NULL);
     ADD_BUILTIN("pow", f2arg, F2ARG(pow));
     ADD_BUILTIN("pown", pown, NULL);
-    ADD_BUILTIN("powr", f2arg, F2ARG(pow));
+    ADD_BUILTIN("powr", powr, NULL);
     ADD_BUILTIN("remainder", f2arg, F2ARG(remainder));
     ADD_BUILTIN("remquo", remquo_builtin, NULL);
     ADD_BUILTIN("rint", f1arg, F1ARG(rint));
@@ -3476,8 +3741,8 @@ namespace oclgrind
     ADD_BUILTIN("native_log2", f1arg, F1ARG(log2));
     ADD_BUILTIN("half_log10", f1arg, F1ARG(log10));
     ADD_BUILTIN("native_log10", f1arg, F1ARG(log10));
-    ADD_BUILTIN("half_powr", f2arg, F2ARG(pow));
-    ADD_BUILTIN("native_powr", f2arg, F2ARG(pow));
+    ADD_BUILTIN("half_powr", powr, NULL);
+    ADD_BUILTIN("native_powr", powr, NULL);
     ADD_BUILTIN("half_recip", f1arg, _frecip_);
     ADD_BUILTIN("native_recip", f1arg, _frecip_);
     ADD_BUILTIN("half_rsqrt", f1arg, _rsqrt_);
@@ -3546,14 +3811,16 @@ namespace oclgrind
     ADD_BUILTIN("printf", printf_builtin, NULL);
 
     // LLVM Intrinsics
+    ADD_PREFIX_BUILTIN("llvm.bswap.", llvm_bswap, NULL);
     ADD_BUILTIN("llvm.dbg.declare", llvm_dbg_declare, NULL);
     ADD_BUILTIN("llvm.dbg.value", llvm_dbg_value, NULL);
+    ADD_PREFIX_BUILTIN("llvm.fabs.f", f1arg, F1ARG(fabs));
     ADD_BUILTIN("llvm.lifetime.start", llvm_lifetime_start, NULL);
     ADD_BUILTIN("llvm.lifetime.end", llvm_lifetime_end, NULL);
     ADD_PREFIX_BUILTIN("llvm.memcpy", llvm_memcpy, NULL);
     ADD_PREFIX_BUILTIN("llvm.memmove", llvm_memcpy, NULL);
     ADD_PREFIX_BUILTIN("llvm.memset", llvm_memset, NULL);
-    ADD_PREFIX_BUILTIN("llvm.fmuladd", f3arg, F3ARG(_fma_));
+    ADD_PREFIX_BUILTIN("llvm.fmuladd", fma_builtin, NULL);
     ADD_BUILTIN("llvm.trap", llvm_trap, NULL);
 
     return builtins;
diff --git a/src/core/clc.h b/src/core/clc.h
index 320ddce..71fe306 100644
--- a/src/core/clc.h
+++ b/src/core/clc.h
@@ -1,5 +1,5 @@
 // clc.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -421,37 +421,41 @@ BUILTIN_1ARG_FLOATS(fast_normalize);
 // Image Functions //
 /////////////////////
 
-size_t __OVERLOAD__ get_image_array_size(image1d_array_t image);
-size_t __OVERLOAD__ get_image_array_size(image2d_array_t image);
-
-int __OVERLOAD__ get_image_channel_data_type(image1d_t image);
-int __OVERLOAD__ get_image_channel_data_type(image1d_buffer_t image);
-int __OVERLOAD__ get_image_channel_data_type(image1d_array_t image);
-int __OVERLOAD__ get_image_channel_data_type(image2d_t image);
-int __OVERLOAD__ get_image_channel_data_type(image2d_array_t image);
-int __OVERLOAD__ get_image_channel_data_type(image3d_t image);
-
-int __OVERLOAD__ get_image_channel_order(image1d_t image);
-int __OVERLOAD__ get_image_channel_order(image1d_buffer_t image);
-int __OVERLOAD__ get_image_channel_order(image1d_array_t image);
-int __OVERLOAD__ get_image_channel_order(image2d_t image);
-int __OVERLOAD__ get_image_channel_order(image2d_array_t image);
-int __OVERLOAD__ get_image_channel_order(image3d_t image);
-
-int2 __OVERLOAD__ get_image_dim(image2d_t image);
-int2 __OVERLOAD__ get_image_dim(image2d_array_t image);
-int4 __OVERLOAD__ get_image_dim(image3d_t image);
-
-int __OVERLOAD__ get_image_depth(image3d_t image);
-int __OVERLOAD__ get_image_height(image2d_t image);
-int __OVERLOAD__ get_image_height(image2d_array_t image);
-int __OVERLOAD__ get_image_height(image3d_t image);
-int __OVERLOAD__ get_image_width(image1d_t image);
-int __OVERLOAD__ get_image_width(image1d_buffer_t image);
-int __OVERLOAD__ get_image_width(image1d_array_t image);
-int __OVERLOAD__ get_image_width(image2d_t image);
-int __OVERLOAD__ get_image_width(image2d_array_t image);
-int __OVERLOAD__ get_image_width(image3d_t image);
+#define IMAGE_QUERY(ret, name, type) \
+  ret __OVERLOAD__ name(read_only type image); \
+  ret __OVERLOAD__ name(write_only type image)
+
+IMAGE_QUERY(size_t, get_image_array_size, image1d_array_t);
+IMAGE_QUERY(size_t, get_image_array_size, image2d_array_t);
+
+IMAGE_QUERY(int, get_image_channel_data_type, image1d_t);
+IMAGE_QUERY(int, get_image_channel_data_type, image1d_buffer_t);
+IMAGE_QUERY(int, get_image_channel_data_type, image1d_array_t);
+IMAGE_QUERY(int, get_image_channel_data_type, image2d_t);
+IMAGE_QUERY(int, get_image_channel_data_type, image2d_array_t);
+IMAGE_QUERY(int, get_image_channel_data_type, image3d_t);
+
+IMAGE_QUERY(int, get_image_channel_order, image1d_t);
+IMAGE_QUERY(int, get_image_channel_order, image1d_buffer_t);
+IMAGE_QUERY(int, get_image_channel_order, image1d_array_t);
+IMAGE_QUERY(int, get_image_channel_order, image2d_t);
+IMAGE_QUERY(int, get_image_channel_order, image2d_array_t);
+IMAGE_QUERY(int, get_image_channel_order, image3d_t);
+
+IMAGE_QUERY(int2, get_image_dim, image2d_t);
+IMAGE_QUERY(int2, get_image_dim, image2d_array_t);
+IMAGE_QUERY(int4, get_image_dim, image3d_t);
+
+IMAGE_QUERY(int, get_image_depth, image3d_t);
+IMAGE_QUERY(int, get_image_height, image2d_t);
+IMAGE_QUERY(int, get_image_height, image2d_array_t);
+IMAGE_QUERY(int, get_image_height, image3d_t);
+IMAGE_QUERY(int, get_image_width, image1d_t);
+IMAGE_QUERY(int, get_image_width, image1d_buffer_t);
+IMAGE_QUERY(int, get_image_width, image1d_array_t);
+IMAGE_QUERY(int, get_image_width, image2d_t);
+IMAGE_QUERY(int, get_image_width, image2d_array_t);
+IMAGE_QUERY(int, get_image_width, image3d_t);
 
 float4 __OVERLOAD__ read_imagef(image1d_t, int);
 float4 __OVERLOAD__ read_imagef(image1d_buffer_t, int);
@@ -507,21 +511,21 @@ uint4 __OVERLOAD__ read_imageui(image2d_array_t, sampler_t, float4);
 uint4 __OVERLOAD__ read_imageui(image3d_t, sampler_t, int4);
 uint4 __OVERLOAD__ read_imageui(image3d_t, sampler_t, float4);
 
-void __OVERLOAD__ write_imagef(image1d_t, int, float4);
-void __OVERLOAD__ write_imagef(image1d_array_t, int2, float4);
-void __OVERLOAD__ write_imagef(image2d_t, int2, float4);
-void __OVERLOAD__ write_imagef(image2d_array_t, int4, float4);
-void __OVERLOAD__ write_imagef(image3d_t, int4, float4);
-void __OVERLOAD__ write_imagei(image1d_t, int, int4);
-void __OVERLOAD__ write_imagei(image1d_array_t, int2, int4);
-void __OVERLOAD__ write_imagei(image2d_t, int2, int4);
-void __OVERLOAD__ write_imagei(image2d_array_t, int4, int4);
-void __OVERLOAD__ write_imagei(image3d_t, int4, int4);
-void __OVERLOAD__ write_imageui(image1d_t, int, uint4);
-void __OVERLOAD__ write_imageui(image1d_array_t, int2, uint4);
-void __OVERLOAD__ write_imageui(image2d_t, int2, uint4);
-void __OVERLOAD__ write_imageui(image2d_array_t, int4, uint4);
-void __OVERLOAD__ write_imageui(image3d_t, int4, uint4);
+void __OVERLOAD__ write_imagef(write_only image1d_t, int, float4);
+void __OVERLOAD__ write_imagef(write_only image1d_array_t, int2, float4);
+void __OVERLOAD__ write_imagef(write_only image2d_t, int2, float4);
+void __OVERLOAD__ write_imagef(write_only image2d_array_t, int4, float4);
+void __OVERLOAD__ write_imagef(write_only image3d_t, int4, float4);
+void __OVERLOAD__ write_imagei(write_only image1d_t, int, int4);
+void __OVERLOAD__ write_imagei(write_only image1d_array_t, int2, int4);
+void __OVERLOAD__ write_imagei(write_only image2d_t, int2, int4);
+void __OVERLOAD__ write_imagei(write_only image2d_array_t, int4, int4);
+void __OVERLOAD__ write_imagei(write_only image3d_t, int4, int4);
+void __OVERLOAD__ write_imageui(write_only image1d_t, int, uint4);
+void __OVERLOAD__ write_imageui(write_only image1d_array_t, int2, uint4);
+void __OVERLOAD__ write_imageui(write_only image2d_t, int2, uint4);
+void __OVERLOAD__ write_imageui(write_only image2d_array_t, int4, uint4);
+void __OVERLOAD__ write_imageui(write_only image3d_t, int4, uint4);
 
 
 ///////////////////////
@@ -890,7 +894,9 @@ VLOADSTORE(double);
   VSTORE_HALF_ADDRSPACE(func##_rtn, type);
 #define VSTORE_HALF_WIDTH(n)                    \
   VSTORE_HALF_ROUND(vstore_half##n, float##n);  \
-  VSTORE_HALF_ROUND(vstorea_half##n, float##n);
+  VSTORE_HALF_ROUND(vstorea_half##n, float##n); \
+  VSTORE_HALF_ROUND(vstore_half##n, double##n); \
+  VSTORE_HALF_ROUND(vstorea_half##n, double##n);
 #define VLOADSTORE_HALF_WIDTH(n) \
   VLOAD_HALF_WIDTH(n);           \
   VSTORE_HALF_WIDTH(n);
diff --git a/src/core/common.cpp b/src/core/common.cpp
index 3f849fa..a4975c7 100644
--- a/src/core/common.cpp
+++ b/src/core/common.cpp
@@ -1,11 +1,12 @@
 // common.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
 // license terms please see the LICENSE file distributed with this
 // source code.
 
+#include "config.h"
 #include "common.h"
 
 #if defined(_WIN32) && !defined(__MINGW32__)
@@ -24,19 +25,19 @@ using namespace std;
 
 namespace oclgrind
 {
-  _Size3_::_Size3_()
+  Size3::Size3()
   {
     x = y = z = 0;
   }
 
-  _Size3_::_Size3_(size_t _x, size_t _y, size_t _z)
+  Size3::Size3(size_t _x, size_t _y, size_t _z)
   {
     x = _x;
     y = _y;
     z = _z;
   }
 
-  _Size3_::_Size3_(size_t linear, _Size3_ dimensions)
+  Size3::Size3(size_t linear, Size3 dimensions)
   {
     x = linear % dimensions.x;
     y = (linear / dimensions.x) % dimensions.y;
@@ -55,6 +56,7 @@ namespace oclgrind
       return z;
     default:
       assert(false && "Size3 index out of range");
+      abort();
     }
   }
 
@@ -70,6 +72,7 @@ namespace oclgrind
       return z;
     default:
       assert(false && "Size3 index out of range");
+      abort();
     }
   }
 
@@ -78,6 +81,11 @@ namespace oclgrind
     return x == rhs.x && y == rhs.y && z == rhs.z;
   }
 
+  bool Size3::operator!=(const Size3& rhs) const
+  {
+    return x != rhs.x || y != rhs.y || z != rhs.z;
+  }
+
   ostream& operator<<(ostream& stream, const Size3& size)
   {
     stream << dec    << "("
@@ -209,13 +217,68 @@ namespace oclgrind
     }
   }
 
+  ostream& operator<<(ostream& stream, const TypedValue& tv)
+  {
+    if(tv.data)
+    {
+      if(tv.num > 1)
+      {
+        stream << "(";
+      }
+
+      for(unsigned n = 0; n < tv.num; ++n)
+      {
+        for(int i = tv.size - 1; i >= 0; --i)
+        {
+          stream << hex << uppercase << setw(2) << setfill('0')
+                 << (int)*(tv.data + tv.size * n + i);
+        }
+
+        if(n != tv.num - 1)
+        {
+          stream << ",";
+        }
+      }
+
+      if(tv.num > 1)
+      {
+        stream << ")";
+      }
+    }
+    else
+    {
+      stream << "NULL";
+    }
+
+    return stream;
+  }
+
+  bool TypedValue::operator==(const TypedValue& rhs) const
+  {
+    return (size == rhs.size) && (num == rhs.num) &&
+           (memcmp(data, rhs.data, size*num) == 0);
+  }
+
+  bool TypedValue::operator!=(const TypedValue& rhs) const
+  {
+    return (size != rhs.size) || (num != rhs.num) ||
+           (memcmp(data, rhs.data, size*num) != 0);
+  }
+
   TypedValue TypedValue::clone() const
   {
     TypedValue result;
     result.size = size;
     result.num  = num;
-    result.data = new unsigned char[size*num];
-    memcpy(result.data, data, size*num);
+    if (data)
+    {
+      result.data = new unsigned char[size*num];
+      memcpy(result.data, data, size*num);
+    }
+    else
+    {
+      result.data = NULL;
+    }
     return result;
   }
 
@@ -260,10 +323,27 @@ namespace oclgrind
     switch (type->getTypeID())
     {
     case llvm::Type::IntegerTyID:
-      memcpy(data,
-             ((llvm::ConstantInt*)constant)->getValue().getRawData(),
-             size);
+    {
+      uint64_t ui = ((llvm::ConstantInt*)constant)->getZExtValue();
+      switch (size)
+      {
+      case 1:
+        *((uint8_t*)data) = ui;
+        break;
+      case 2:
+        *((uint16_t*)data) = ui;
+        break;
+      case 4:
+        *((uint32_t*)data) = ui;
+        break;
+      case 8:
+        *((uint64_t*)data) = ui;
+        break;
+      default:
+        FATAL_ERROR("Unsupported constant int size: %u bytes", size);
+      }
       break;
+    }
     case llvm::Type::FloatTyID:
     {
       *(float*)data =
@@ -328,13 +408,8 @@ namespace oclgrind
     const llvm::ConstantExpr *expr)
   {
     // Get operands
-    unsigned numOperands = expr->getNumOperands();
-    llvm::Value **valueOperands = new llvm::Value*[numOperands];
-    for (unsigned i = 0; i < numOperands; i++)
-    {
-      valueOperands[i] = expr->getOperand(i);
-    }
-    llvm::ArrayRef<llvm::Value*> operands(valueOperands, numOperands);
+    vector<llvm::Value*> valueOperands(expr->op_begin(), expr->op_end());
+    llvm::ArrayRef<llvm::Value*> operands(valueOperands);
 
     // Create instruction
     unsigned opcode = expr->getOpcode();
@@ -378,7 +453,7 @@ namespace oclgrind
       else
       {
 #if LLVM_VERSION > 36
-        return llvm::GetElementPtrInst::Create(expr->getType(),
+        return llvm::GetElementPtrInst::Create(nullptr,
                                                operands[0], operands.slice(1));
 #else
         return llvm::GetElementPtrInst::Create(operands[0], operands.slice(1));
@@ -387,9 +462,10 @@ namespace oclgrind
       }
     case llvm::Instruction::ICmp:
     case llvm::Instruction::FCmp:
-      return llvm::CmpInst::Create((llvm::Instruction::OtherOps)opcode,
-                                   expr->getPredicate(),
-                                   operands[0], operands[1]);
+      return llvm::CmpInst::Create(
+        (llvm::Instruction::OtherOps)opcode,
+        (llvm::CmpInst::Predicate)expr->getPredicate(),
+        operands[0], operands[1]);
     default:
       assert(expr->getNumOperands() == 2 && "Must be binary operator?");
 
@@ -425,10 +501,9 @@ namespace oclgrind
     }
   }
 
-  const llvm::ConstantInt* getMDOpAsConstInt(const llvm::MDOperand& op)
+  const llvm::ConstantInt* getMDAsConstInt(const llvm::Metadata *md)
   {
-    llvm::Metadata *md = op.get();
-    llvm::ConstantAsMetadata *cam =
+    const llvm::ConstantAsMetadata *cam =
       llvm::dyn_cast<llvm::ConstantAsMetadata>(md);
     if (!cam)
       return NULL;
@@ -461,7 +536,7 @@ namespace oclgrind
     }
 
     // Unreachable
-    assert(false);
+    abort();
   }
 
   unsigned getTypeSize(const llvm::Type *type)
@@ -517,16 +592,9 @@ namespace oclgrind
     }
     else
     {
-      // For some reason, getScalarSizeInBits is not const
-      llvm::Type* nonConstTy = const_cast<llvm::Type*>(type);
-
       // Round up for types that have a bit size not multiple of 8
       // like "bool".
-      unsigned ret = nonConstTy->getScalarSizeInBits() / 8;
-      if (nonConstTy->getScalarSizeInBits() % 8)
-        ret++;
-
-      return ret;
+      return (type->getScalarSizeInBits() + 7) >> 3;
     }
   }
 
@@ -584,7 +652,7 @@ namespace oclgrind
       numElements = 1;
     }
 
-    unsigned elemSize = bits >> 3;
+    unsigned elemSize = (bits+7) >> 3;
 
     // Special case for pointer types
     if (type->isPointerTy())
@@ -675,6 +743,20 @@ namespace oclgrind
     case llvm::Type::PointerTyID:
       cout << "0x" << hex << *(size_t*)data;
       break;
+    case llvm::Type::ArrayTyID:
+    {
+      const llvm::Type *elemType = type->getArrayElementType();
+      unsigned elemSize = getTypeSize(elemType);
+      cout << "{";
+      for (unsigned i = 0; i < type->getArrayNumElements(); i++)
+      {
+        if (i > 0)
+          cout << ",";
+        printTypedData(elemType, data+i*elemSize);
+      }
+      cout << "}";
+      break;
+    }
     default:
       cout << "(raw) 0x" << hex << uppercase << setfill('0');
       for (unsigned i = 0; i < size; i++)
@@ -709,4 +791,51 @@ namespace oclgrind
   {
     return runtime_error::what();
   }
+
+  MemoryPool::MemoryPool(size_t blockSize) : m_blockSize(blockSize)
+  {
+    // Force first allocation to create new block
+    m_offset = m_blockSize;
+  }
+
+  MemoryPool::~MemoryPool()
+  {
+    for (auto itr = m_blocks.begin(); itr != m_blocks.end(); itr++)
+    {
+      delete[] *itr;
+    }
+  }
+
+  uint8_t* MemoryPool::alloc(size_t size)
+  {
+    // Requests larger than a whole block can never be served from a block
+    if (size > m_blockSize)
+    {
+      // Dedicated buffer, kept at the back so front() stays the current block
+      unsigned char *buffer = new unsigned char[size];
+      m_blocks.push_back(buffer);
+      return buffer;
+    }
+    // Regular path: carve 'size' bytes from the front block (no padding added)
+    // Start a fresh block if the current one cannot hold the request
+    if (m_offset + size > m_blockSize)
+    {
+      // New block goes to the front and becomes the current block
+      m_blocks.push_front(new unsigned char[m_blockSize]);
+      m_offset = 0;
+    }
+    uint8_t *buffer = m_blocks.front() + m_offset;
+    m_offset += size;
+    return buffer;
+  }
+
+  TypedValue MemoryPool::clone(const TypedValue& source)
+  {
+    TypedValue dest;
+    dest.size = source.size;
+    dest.num = source.num;
+    dest.data = alloc(dest.size*dest.num);
+    memcpy(dest.data, source.data, dest.size*dest.num);
+    return dest;
+  }
 }
diff --git a/src/core/common.h b/src/core/common.h
index d908ffa..b015cab 100644
--- a/src/core/common.h
+++ b/src/core/common.h
@@ -1,5 +1,5 @@
 // common.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -9,7 +9,6 @@
 #ifndef __common_h_
 #define __common_h_
 
-#include "config.h"
 #include "CL/cl.h"
 #include <cassert>
 #include <cstdio>
@@ -37,13 +36,23 @@
 #undef ERROR
 #endif
 
+#ifdef __APPLE__
+// TODO: Remove this when thread_local fixed on OS X
+#define THREAD_LOCAL __thread
+#elif defined(_WIN32) && !defined(__MINGW32__)
+// TODO: Remove this when thread_local fixed on Windows
+#define THREAD_LOCAL __declspec(thread)
+#else
+#define THREAD_LOCAL thread_local
+#endif
+
 namespace llvm
 {
   class Constant;
   class ConstantExpr;
   class ConstantInt;
   class Instruction;
-  class MDOperand;
+  class Metadata;
   class StructType;
   class Type;
   class Value;
@@ -87,26 +96,32 @@ namespace oclgrind
   };
 
   // 3-dimensional size
-  typedef struct _Size3_
+  struct Size3
   {
     size_t x, y, z;
-    _Size3_();
-    _Size3_(size_t x, size_t y, size_t z);
-    _Size3_(size_t linear, _Size3_ dimensions);
+    Size3();
+    Size3(size_t x, size_t y, size_t z);
+    Size3(size_t linear, Size3 dimensions);
     size_t& operator[](unsigned i);
     const size_t& operator[](unsigned i) const;
-    bool operator==(const _Size3_& rhs) const;
-    friend std::ostream& operator<<(std::ostream& stream, const _Size3_& sz);
-  } Size3;
+    bool operator==(const Size3& rhs) const;
+    bool operator!=(const Size3& rhs) const;
+    friend std::ostream& operator<<(std::ostream& stream, const Size3& sz);
+  };
 
   // Structure for a value with a size/type
-  struct _TypedValue_
+  struct TypedValue
   {
     unsigned size;
     unsigned num;
     unsigned char *data;
 
-    struct _TypedValue_ clone() const;
+    bool operator==(const TypedValue& rhs) const;
+    bool operator!=(const TypedValue& rhs) const;
+
+    friend std::ostream& operator<<(std::ostream& stream, const TypedValue& tv);
+
+    struct TypedValue clone() const;
 
     double   getFloat(unsigned index = 0) const;
     size_t   getPointer(unsigned index = 0) const;
@@ -118,7 +133,6 @@ namespace oclgrind
     void     setUInt(uint64_t value, unsigned index = 0);
 
   };
-  typedef _TypedValue_ TypedValue;
 
   // Private memory map type
   typedef std::map<const llvm::Value*,TypedValue> TypedValueMap;
@@ -147,8 +161,8 @@ namespace oclgrind
   const llvm::Instruction* getConstExprAsInstruction(
     const llvm::ConstantExpr *expr);
 
-  // Get the ConstantInt object for an MDOperand
-  const llvm::ConstantInt* getMDOpAsConstInt(const llvm::MDOperand& op);
+  // Get the ConstantInt object for a Metadata node
+  const llvm::ConstantInt* getMDAsConstInt(const llvm::Metadata *md);
 
   // Get the byte offset of a struct member
   unsigned getStructMemberOffset(const llvm::StructType *type, unsigned index);
@@ -198,6 +212,90 @@ namespace oclgrind
       delete[] str;                                      \
       throw FatalError(msg, __FILE__, __LINE__);         \
     }
+
+  class MemoryPool
+  {
+  public:
+    MemoryPool(size_t blockSize = 1024);
+    ~MemoryPool();
+    uint8_t* alloc(size_t size);
+    TypedValue clone(const TypedValue& source);
+  private:
+    size_t m_blockSize;
+    size_t m_offset;
+    std::list<uint8_t*> m_blocks;
+  };
+
+  // Pool allocator class for STL containers
+  template <class T,size_t BLOCKSIZE>
+  class PoolAllocator
+  {
+    template <typename U,size_t BS> friend class PoolAllocator;
+
+  public:
+    typedef T          value_type;
+    typedef T*         pointer;
+    typedef T&         reference;
+    typedef const T*   const_pointer;
+    typedef const T&   const_reference;
+    typedef size_t     size_type;
+    typedef ptrdiff_t  difference_type;
+
+    template<typename U>
+    struct rebind
+    {
+      typedef PoolAllocator<U,BLOCKSIZE> other;
+    };
+
+    PoolAllocator()
+    {
+      pool.reset(new MemoryPool(BLOCKSIZE));
+    }
+
+    PoolAllocator(const PoolAllocator& p)
+    {
+      this->pool = p.pool;
+    }
+
+    template<typename U>
+    PoolAllocator(const PoolAllocator<U,BLOCKSIZE>& p)
+    {
+      this->pool = p.pool;
+    }
+
+    pointer allocate(size_type n, const_pointer hint=0)
+    {
+      return (pointer)(pool->alloc(n*sizeof(value_type)));
+    }
+
+    void deallocate(pointer p, size_type n){}
+
+    template<class U, class... Args>
+    void construct(U *p, Args&&... args)
+    {
+      new ((void*)p) U(std::forward<Args>(args)...);
+    }
+
+    template<class U>
+    void destroy(U *p)
+    {
+      p->~U();
+    }
+
+    bool operator==(const PoolAllocator& p) const
+    {
+      return this->pool == p.pool;
+    }
+
+    bool operator!=(const PoolAllocator& p) const
+    {
+      return this->pool != p.pool;
+    }
+
+  private:
+    std::shared_ptr<MemoryPool> pool;
+  };
+
 }
 
 #endif // __common_h_
diff --git a/src/core/half.cpp b/src/core/half.cpp
new file mode 100644
index 0000000..c7cf59a
--- /dev/null
+++ b/src/core/half.cpp
@@ -0,0 +1,259 @@
+// half.cpp (Oclgrind)
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "half.h"
+
+namespace oclgrind
+{
+  float halfToFloat(uint16_t half)
+  {
+    uint16_t h_sign, h_exponent, h_mantissa;
+    uint32_t f_sign, f_exponent, f_mantissa;
+    // Widen IEEE half bits to float by rebuilding sign, exponent and mantissa
+    h_sign     = half & 0x8000; // 1000 0000 0000 0000
+    h_exponent = half & 0x7C00; // 0111 1100 0000 0000
+    h_mantissa = half & 0x03FF; // 0000 0011 1111 1111
+
+    f_sign     = ((uint32_t)h_sign) << 16;
+
+    if (h_exponent == 0)
+    {
+      if (h_mantissa == 0)
+      {
+        // Zero
+        f_exponent = 0;
+        f_mantissa = 0;
+      }
+      else
+      {
+        // Denorm - convert to normalized float
+        int e = -1;
+        do
+        {
+          e++;
+          h_mantissa <<= 1;
+        }
+        while((h_mantissa & 0x0400) == 0);
+        // Bit 10 is now the implicit leading one; e counts the extra shifts
+        f_exponent = (-15 + 127 - e) << 23;
+        f_mantissa = ((uint32_t)(h_mantissa & 0x03FF)) << 13;
+      }
+    }
+    else if (h_exponent == 0x7C00)
+    {
+      // Inf or NaN - shift the payload so a quiet NaN stays quiet
+      f_exponent = 0xFF << 23;
+      f_mantissa = ((uint32_t)h_mantissa) << 13;
+    }
+    else
+    {
+      // Normalized
+      f_exponent = (((int32_t)(h_exponent >> 10)) - 15 + 127) << 23;
+      f_mantissa = ((uint32_t)h_mantissa) << 13;
+    }
+    // Reinterpret through a union (as floatToHalf does), not a pointer cast
+    union { uint32_t ui; float f; } UItoF = {f_sign | f_exponent | f_mantissa};
+    return UItoF.f;
+  }
+
+  uint16_t floatToHalf(float sp, HalfRoundMode round)
+  {
+    uint16_t h_sign, h_exponent, h_mantissa;
+    uint32_t f_sign, f_exponent, f_mantissa;
+    // Narrow sp to IEEE half-precision bits (sign | exponent | mantissa)
+    union
+    {
+      float f;
+      uint32_t ui;
+    } FtoUI;
+    FtoUI.f = sp;
+    uint32_t f = FtoUI.ui;
+    f_sign     = f & 0x80000000; // 1000 0000 0000 0000 0000 0000 0000 0000
+    f_exponent = f & 0x7F800000; // 0111 1111 1000 0000 0000 0000 0000 0000
+    f_mantissa = f & 0x007FFFFF; // 0000 0000 0111 1111 1111 1111 1111 1111
+
+    h_sign     = f_sign >> 16;
+    // Note: round is only consulted in the overflow and normalized cases
+    if (f_exponent == 0)
+    {
+      // Zero (float denormals also land here: their exponent field is 0)
+      h_exponent = 0;
+      h_mantissa = 0;
+    }
+    else if (f_exponent == 0x7F800000)
+    {
+      // Inf or NaN
+      h_exponent = 0x7C00;
+      if (f_mantissa)
+        h_mantissa = 0x1FF;
+      else
+        h_mantissa = 0;
+    }
+    else
+    {
+      int e = (((int32_t)(f_exponent >> 23)) - 127 + 15);
+      if (e >= 0x1F)
+      {
+        // Value will overflow
+        h_exponent = 0x7C00;
+        h_mantissa = 0;
+        // -1 makes the final sum wrap to the largest finite half (0x7BFF)
+        if (round == Half_RTZ)
+          h_mantissa = -1;
+        if (round == Half_RTP && h_sign)
+          h_mantissa = -1;
+        if (round == Half_RTN && !h_sign)
+          h_mantissa = -1;
+      }
+      else if (e <= 0)
+      {
+        // Value will underflow
+        h_exponent = 0;
+        if (14 - e > 24)
+        {
+          // Too small - flush to zero
+          h_mantissa = 0;
+        }
+        else
+        {
+          // Convert to denorm: restore the implicit one, shift, round half up
+          f_mantissa |= 0x800000;
+          h_mantissa = (f_mantissa >> (14-e));
+          if ((f_mantissa >> (13 - e)) & 0x1)
+          {
+            h_mantissa += 0x1;
+          }
+        }
+      }
+      else
+      {
+        // Normalized
+        h_exponent = e << 10;
+        h_mantissa = f_mantissa >> 13;
+        // h_mantissa so far is truncated (RTZ); adjust for the other modes
+        if (round == Half_RTE && (f & 0x00001000) != 0)
+        {
+          if ((f & 0x00002FFF) != 0)
+            h_mantissa += 1;
+        }
+        else if (round == Half_RTP)
+        {
+          FtoUI.ui &= 0xFFFFE000;
+          if (FtoUI.f < sp)
+            h_mantissa += 1;
+        }
+        else if (round == Half_RTN)
+        {
+          FtoUI.ui &= 0xFFFFE000;
+          if (sp < FtoUI.f)
+            h_mantissa += 1;
+        }
+      }
+    }
+    // Added rather than OR-ed, so a mantissa carry moves into the exponent
+    return h_sign + h_exponent + h_mantissa;
+  }
+
+  uint16_t doubleToHalf(double dp, HalfRoundMode round)
+  {
+    uint16_t h_sign, h_exponent, h_mantissa;
+    uint64_t d_sign, d_exponent, d_mantissa;
+    // Narrow dp to IEEE half-precision bits (sign | exponent | mantissa)
+    union
+    {
+      double d;
+      uint64_t ui;
+    } DtoUI;
+    DtoUI.d = dp;
+    uint64_t d = DtoUI.ui;
+    d_sign     = d & 0x8000000000000000;
+    d_exponent = d & 0x7FF0000000000000;
+    d_mantissa = d & 0x000FFFFFFFFFFFFF;
+
+    h_sign     = d_sign >> 48;
+    // Note: round is only consulted in the overflow and normalized cases
+    if (d_exponent == 0)
+    {
+      // Zero (double denormals also land here: their exponent field is 0)
+      h_exponent = 0;
+      h_mantissa = 0;
+    }
+    else if (d_exponent == 0x7FF0000000000000)
+    {
+      // Inf or NaN
+      h_exponent = 0x7C00;
+      if (d_mantissa)
+        h_mantissa = 0x1FF;
+      else
+        h_mantissa = 0;
+    }
+    else
+    {
+      int e = (((int64_t)(d_exponent >> 52)) - 1023 + 15);
+      if (e >= 0x1F)
+      {
+        // Value will overflow
+        h_exponent = 0x7C00;
+        h_mantissa = 0;
+        // -1 makes the final sum wrap to the largest finite half (0x7BFF)
+        if (round == Half_RTZ)
+          h_mantissa = -1;
+        if (round == Half_RTP && h_sign)
+          h_mantissa = -1;
+        if (round == Half_RTN && !h_sign)
+          h_mantissa = -1;
+      }
+      else if (e <= 0)
+      {
+        // Value will underflow
+        h_exponent = 0;
+        if (43 - e > 53)
+        {
+          // Too small - the shift would discard all 53 significand bits
+          h_mantissa = 0;
+        }
+        else
+        {
+          // Convert to denorm: 42 bits for double->half width, 1 - e to denorm
+          d_mantissa |= 0x0010000000000000;
+          h_mantissa = (d_mantissa >> (43 - e));
+          if ((d_mantissa >> (42 - e)) & 0x1)
+          {
+            h_mantissa += 0x1;
+          }
+        }
+      }
+      else
+      {
+        // Normalized
+        h_exponent = e << 10;
+        h_mantissa = d_mantissa >> 42;
+        // h_mantissa so far is truncated (RTZ); adjust for the other modes
+        if (round == Half_RTE && (d & 0x20000000000) != 0)
+        {
+          if ((d & 0x5FFFFFFFFFF) != 0)
+            h_mantissa += 1;
+        }
+        else if (round == Half_RTP)
+        {
+          DtoUI.ui &= 0xFFFFFC0000000000;
+          if (DtoUI.d < dp)
+            h_mantissa += 1;
+        }
+        else if (round == Half_RTN)
+        {
+          DtoUI.ui &= 0xFFFFFC0000000000;
+          if (dp < DtoUI.d)
+            h_mantissa += 1;
+        }
+      }
+    }
+    // Added rather than OR-ed, so a mantissa carry moves into the exponent
+    return h_sign + h_exponent + h_mantissa;
+  }
+}
diff --git a/src/core/half.h b/src/core/half.h
index 58afcf1..120fa36 100644
--- a/src/core/half.h
+++ b/src/core/half.h
@@ -1,5 +1,5 @@
 // half.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -8,153 +8,22 @@
 
 #include "common.h"
 
-static float halfToFloat(uint16_t half)
+namespace oclgrind
 {
-  uint16_t h_sign, h_exponent, h_mantissa;
-  uint32_t f_sign, f_exponent, f_mantissa;
-
-  h_sign     = half & 0x8000; // 1000 0000 0000 0000
-  h_exponent = half & 0x7C00; // 0111 1100 0000 0000
-  h_mantissa = half & 0x03FF; // 0000 0011 1111 1111
-
-  f_sign     = ((uint32_t)h_sign) << 16;
-
-  if (h_exponent == 0)
-  {
-    if (h_mantissa == 0)
-    {
-      // Zero
-      f_exponent = 0;
-      f_mantissa = 0;
-    }
-    else
-    {
-      // Denorm - convert to normalized float
-      int e = -1;
-      do
-      {
-        e++;
-        h_mantissa <<= 1;
-      }
-      while((h_mantissa & 0x0400) == 0);
-
-      f_exponent = (-15 + 127 - e) << 23;
-      f_mantissa = ((uint32_t)(h_mantissa & 0x03FF)) << 13;
-    }
-  }
-  else if (h_exponent == 0x7C00)
-  {
-    // Inf or NaN
-    f_exponent = 0xFF << 23;
-    f_mantissa = h_mantissa;
-  }
-  else
-  {
-    // Normalized
-    f_exponent = (((int32_t)(h_exponent >> 10)) - 15 + 127) << 23;
-    f_mantissa = ((uint32_t)h_mantissa) << 13;
-  }
-
-  uint32_t result = f_sign | f_exponent | f_mantissa;
-  return *(float*)&result;
-}
-
-enum HalfRoundMode
-{
-  // Towards negative infinity
-  Half_RTN,
-  // Towards zero
-  Half_RTZ,
-  // Towards positive infinity
-  Half_RTP,
-  // Towards nearest even
-  Half_RTE
-};
-
-static uint16_t floatToHalf(float sp, HalfRoundMode round = Half_RTZ)
-{
-  uint16_t h_sign, h_exponent, h_mantissa;
-  uint32_t f_sign, f_exponent, f_mantissa;
-
-  union
-  {
-    float f;
-    uint32_t ui;
-  } FtoUI;
-  FtoUI.f = sp;
-  uint32_t f = FtoUI.ui;
-  f_sign     = f & 0x80000000; // 1000 0000 0000 0000 0000 0000 0000 0000
-  f_exponent = f & 0x7F800000; // 0111 1111 1000 0000 0000 0000 0000 0000
-  f_mantissa = f & 0x007FFFFF; // 0000 0000 0111 1111 1111 1111 1111 1111
-
-  h_sign     = f_sign >> 16;
-
-  if (f_exponent == 0)
-  {
-    // Zero
-    h_exponent = 0;
-    h_mantissa = 0;
-  }
-  else if (f_exponent == 0x7F800000)
+  enum HalfRoundMode
   {
-    // Inf or NaN
-    h_exponent = 0x7C00;
-    h_mantissa = f_mantissa;
-  }
-  else
-  {
-    int e = (((int32_t)(f_exponent >> 23)) - 127 + 15);
-    if (e >= 0x1F)
-    {
-      // Value will overflow
-      h_exponent = 0x7C00;
-      h_mantissa = 0;
-    }
-    else if (e <= 0)
-    {
-      // Value will underflow
-      h_exponent = 0;
-      if (14 - e > 24)
-      {
-        // Too small - flush to zero
-        h_mantissa = 0;
-      }
-      else
-      {
-        // Convert to denorm
-        f_mantissa |= 0x800000;
-        h_mantissa = (f_mantissa >> (14-e));
-        if ((f_mantissa >> (13 - e)) & 0x1)
-        {
-          h_mantissa += 0x1;
-        }
-      }
-    }
-    else
-    {
-      // Normalized
-      h_exponent = e << 10;
-      h_mantissa = f_mantissa >> 13;
-      // The current f_mantissa is done in RTZ
-      if (round == Half_RTE && (f & 0x00001000) != 0)
-      {
-        if ((f & 0x00002FFF) != 0)
-          h_mantissa += 1;
-      }
-      else if (round == Half_RTP)
-      {
-        FtoUI.ui &= 0xFFFFE000;
-        if (FtoUI.f < sp)
-          h_mantissa += 1;
-      }
-      else if (round == Half_RTN)
-      {
-        FtoUI.ui &= 0xFFFFE000;
-        if (sp < FtoUI.f)
-          h_mantissa += 1;
-      }
-    }
-  }
-
-  return h_sign + h_exponent + h_mantissa;
+    // Towards negative infinity
+    Half_RTN,
+    // Towards zero
+    Half_RTZ,
+    // Towards positive infinity
+    Half_RTP,
+    // Towards nearest even
+    Half_RTE
+  };
+
+  float halfToFloat(uint16_t half);
+
+  uint16_t floatToHalf(float sp, HalfRoundMode round = Half_RTZ);
+  uint16_t doubleToHalf(double dp, HalfRoundMode round = Half_RTZ);
 }
diff --git a/src/install/INSTALL.darwin b/src/install/INSTALL.darwin
index b3292d5..222d9e1 100644
--- a/src/install/INSTALL.darwin
+++ b/src/install/INSTALL.darwin
@@ -1,16 +1,12 @@
 To install Oclgrind, simply copy the bin, lib and include directories
-to (for example) /usr/local/, ensuring that file modification times
-are preserved. The easiest way to do this is with the following
-command:
+to (for example) /usr/local/:
 
-    sudo cp -rp {bin,lib,include} /usr/local
+    sudo cp -r {bin,lib,include} /usr/local
 
 Alternatively, Oclgrind can be used from a non-system directory. To do
 so, add $OCLGRIND_ROOT/bin to your PATH environment variable, and
 $OCLGRIND_ROOT/lib to your DYLD_LIBRARY_PATH environment variable
-(where $OCLGRIND_ROOT is the directory containing this file). If
-copying Oclgrind to a new location, ensure that the -p flag is passed
-to cp, to ensure that file modification times are preserved.
+(where $OCLGRIND_ROOT is the directory containing this file).
 
 Information about using Oclgrind can be found on the GitHub wiki page:
 
diff --git a/src/install/INSTALL.linux b/src/install/INSTALL.linux
index cf81cf9..30dbbe3 100644
--- a/src/install/INSTALL.linux
+++ b/src/install/INSTALL.linux
@@ -1,16 +1,12 @@
 To install Oclgrind, simply copy the bin, lib and include directories
-to (for example) /usr/local/, ensuring that file modification times
-are preserved. The easiest way to do this is with the following
-command:
+to (for example) /usr/local/:
 
-    sudo cp -rp {bin,lib,include} /usr/local
+    sudo cp -r {bin,lib,include} /usr/local
 
 Alternatively, Oclgrind can be used from a non-system directory. To do
 so, add $OCLGRIND_ROOT/bin to your PATH environment variable, and
 $OCLGRIND_ROOT/lib to your LD_LIBRARY_PATH environment variable (where
-$OCLGRIND_ROOT is the directory containing this file). If copying
-Oclgrind to a new location, ensure that the -p flag is passed to cp,
-to ensure that file modification times are preserved.
+$OCLGRIND_ROOT is the directory containing this file).
 
 To use Oclgrind with the OpenCL ICD loader (optional), copy
 oclgrind.icd to /etc/OpenCL/vendors/.
diff --git a/src/install/INSTALL.windows b/src/install/INSTALL.windows
index 2b02636..716e8c2 100644
--- a/src/install/INSTALL.windows
+++ b/src/install/INSTALL.windows
@@ -5,4 +5,16 @@ running 'uninstall.bat' as an Administrator.
 
 Alternatively, Oclgrind can be run from any other directory. You will
 need to manually create OpenCL ICD loading points by editing the
-registry (see oclgrind-icd.reg).
+registry (see oclgrind-icd.reg), and/or add $OCLGRIND_ROOT/bin to your
+PATH environment variable to make use of the oclgrind.exe command.
+
+You may be warned about a missing MSVCP140.dll during the installation
+process, which can cause Oclgrind to fail to run properly. This can be
+fixed by installing the Microsoft Visual C++ Redistributable from
+here:
+
+    https://www.microsoft.com/en-us/download/details.aspx?id=48145
+
+Information about using Oclgrind can be found on the GitHub wiki page:
+
+    http://github.com/jrprice/Oclgrind/wiki
diff --git a/src/install/cpack-description b/src/install/cpack-description
new file mode 100644
index 0000000..3fe78cf
--- /dev/null
+++ b/src/install/cpack-description
@@ -0,0 +1,11 @@
+Oclgrind is an extensible OpenCL device simulator that provides a
+plugin interface to facilitate the creation of tools to aid analysis
+and development of OpenCL programs. Among the tools that Oclgrind
+provides are various debugging aids, such as out-of-bounds memory
+access checking, data-race detection, and an interactive debugger.
+
+Oclgrind implements the OpenCL 1.2 runtime API, which makes simulating
+an existing OpenCL program very straightforward - simply prefix your
+usual application command-line with 'oclgrind'. There is also a simple
+interface for simulating individual kernels in isolation via the
+'oclgrind-kernel' command.
diff --git a/src/install/install.bat b/src/install/install.bat
index cea2457..3fa3d21 100644
--- a/src/install/install.bat
+++ b/src/install/install.bat
@@ -13,6 +13,19 @@ xcopy uninstall.bat "%ROOT%\"           /Y    || goto :error
 
 regedit /S oclgrind-icd.reg                   || goto :error
 
+echo.
+echo Installation completed.
+echo.
+
+if not exist C:\Windows\system32\msvcp140.dll (
+  echo WARNING: MSVCP140.dll not found - Oclgrind may fail to work correctly
+  echo Download the Microsoft Visual C++ Redistributable from here:
+  echo.
+  echo   https://www.microsoft.com/en-us/download/details.aspx?id=48145
+  echo.
+  pause
+)
+
 goto :EOF
 
 
diff --git a/src/kernel/Simulation.cpp b/src/kernel/Simulation.cpp
index 208ed77..efa20a2 100644
--- a/src/kernel/Simulation.cpp
+++ b/src/kernel/Simulation.cpp
@@ -1,12 +1,11 @@
 // Simulation.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
 // license terms please see the LICENSE file distributed with this
 // source code.
 
-#include "config.h"
 #include <cassert>
 #include <cmath>
 #include <iostream>
@@ -59,10 +58,13 @@ void Simulation::dumpArgument(DumpArg& arg)
   for (size_t i = 0; i < num; i++)
   {
     cout << "  " << arg.name << "[" << i << "] = ";
+    if (arg.hex)
+      cout << "0x" << setfill('0') << setw(sizeof(T)*2) << hex;
     if (sizeof(T) == 1)
       cout << (int)data[i];
     else
       cout << data[i];
+    cout << dec;
     cout << endl;
   }
   cout << endl;
@@ -267,6 +269,8 @@ void Simulation::parseArgument(size_t index)
   size_t typeSize = 0;
   bool null = false;
   bool dump = false;
+  bool hex = false;
+  bool noinit = false;
   string fill = "";
   string range = "";
   string name = m_kernel->getArgumentName(index).str();
@@ -357,9 +361,19 @@ void Simulation::parseArgument(size_t index)
     }
     else if (token == "hex")
     {
+      hex = true;
       m_lineBuffer.setf(ios_base::hex);
       m_lineBuffer.unsetf(ios_base::dec | ios_base::oct);
     }
+    else if (token == "noinit")
+    {
+      if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL &&
+          addrSpace != CL_KERNEL_ARG_ADDRESS_CONSTANT)
+      {
+        throw "'noinit' only valid for buffer arguments";
+      }
+      noinit = true;
+    }
     else if (token == "null")
     {
       if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL &&
@@ -429,7 +443,7 @@ void Simulation::parseArgument(size_t index)
   // Ensure size given
   if (null)
   {
-    if (size != -1 || !fill.empty() || !range.empty())
+    if (size != -1 || !fill.empty() || !range.empty() || noinit || dump)
     {
       throw "'null' not valid with other argument descriptors";
     }
@@ -482,10 +496,16 @@ void Simulation::parseArgument(size_t index)
     {
       throw "'dump' only valid for memory objects";
     }
-    if (null)
-    {
-      throw "'dump' not valid with 'null' specifier";
-    }
+  }
+
+  // Ensure only one initializer given
+  unsigned numInitializers = 0;
+  if (noinit) numInitializers++;
+  if (!fill.empty()) numInitializers++;
+  if (!range.empty()) numInitializers++;
+  if (numInitializers > 1)
+  {
+    throw "Multiple initializers present";
   }
 
   // Generate argument data
@@ -506,7 +526,8 @@ void Simulation::parseArgument(size_t index)
   {
     // Parse argument data
     unsigned char *data = new unsigned char[size];
-    if (!fill.empty())
+    if (noinit){}
+    else if (!fill.empty())
     {
       istringstream fillStream(fill);
       fillStream.copyfmt(m_lineBuffer);
@@ -591,7 +612,10 @@ void Simulation::parseArgument(size_t index)
       // Allocate buffer and store content
       Memory *globalMemory = m_context->getGlobalMemory();
       size_t address = globalMemory->allocateBuffer(size, flags);
-      globalMemory->store((unsigned char*)&data[0], address, size);
+      if (!address)
+        throw "Failed to allocate global memory";
+      if (!noinit)
+        globalMemory->store((unsigned char*)&data[0], address, size);
       value.data = new unsigned char[value.size];
       value.setPointer(address);
       delete[] data;
@@ -604,6 +628,7 @@ void Simulation::parseArgument(size_t index)
           size,
           type,
           name,
+          hex
         };
         m_dumpArguments.push_back(dump);
       }
diff --git a/src/kernel/Simulation.h b/src/kernel/Simulation.h
index 19b6e9b..63edb7e 100644
--- a/src/kernel/Simulation.h
+++ b/src/kernel/Simulation.h
@@ -1,5 +1,5 @@
 // Simulation.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -57,13 +57,14 @@ class Simulation
     size_t m_lineNumber;
     std::istringstream m_lineBuffer;
 
-    typedef struct
+    struct DumpArg
     {
       size_t address;
       size_t size;
       ArgDataType type;
       std::string name;
-    } DumpArg;
+      bool hex;
+    };
     std::list<DumpArg> m_dumpArguments;
 
     template<typename T>
diff --git a/src/kernel/oclgrind-kernel.cpp b/src/kernel/oclgrind-kernel.cpp
index 4ac1d31..1ce599d 100644
--- a/src/kernel/oclgrind-kernel.cpp
+++ b/src/kernel/oclgrind-kernel.cpp
@@ -1,5 +1,5 @@
 // main.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -7,6 +7,7 @@
 // source code.
 
 #include "config.h"
+
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
@@ -138,12 +139,16 @@ static bool parseArguments(int argc, char *argv[])
     {
       setEnvironment("OCLGRIND_UNIFORM_WRITES", "1");
     }
+    else if (!strcmp(argv[i], "--uninitialized"))
+    {
+      setEnvironment("OCLGRIND_UNINITIALIZED", "1");
+    }
     else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version"))
     {
       cout << endl;
       cout << "Oclgrind " PACKAGE_VERSION << endl;
       cout << endl;
-      cout << "Copyright (c) 2013-2015" << endl;
+      cout << "Copyright (c) 2013-2016" << endl;
       cout << "James Price and Simon McIntosh-Smith, University of Bristol"
            << endl;
       cout << "https://github.com/jrprice/Oclgrind" << endl;
@@ -210,11 +215,13 @@ static void printUsage()
     << "     --pch-dir        DIR      "
              "Override directory containing precompiled headers" << endl
     << "     --plugins        PLUGINS  "
-             "Load colon seperated list of plugin libraries" << endl
+             "Load colon separated list of plugin libraries" << endl
     << "  -q --quick                   "
              "Only run first and last work-group" << endl
     << "     --uniform-writes          "
              "Don't suppress uniform write-write data-races" << endl
+    << "     --uninitialized           "
+             "Report usage of uninitialized values" << endl
     << "  -v --version                 "
              "Display version information" << endl
     << endl
diff --git a/src/plugins/InstructionCounter.cpp b/src/plugins/InstructionCounter.cpp
index ce680f4..ed977ec 100644
--- a/src/plugins/InstructionCounter.cpp
+++ b/src/plugins/InstructionCounter.cpp
@@ -1,5 +1,5 @@
 // InstructionCounter.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/plugins/InstructionCounter.h b/src/plugins/InstructionCounter.h
index f12c33a..e6f3646 100644
--- a/src/plugins/InstructionCounter.h
+++ b/src/plugins/InstructionCounter.h
@@ -1,5 +1,5 @@
 // InstructionCounter.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/plugins/InteractiveDebugger.cpp b/src/plugins/InteractiveDebugger.cpp
index a088338..012e009 100644
--- a/src/plugins/InteractiveDebugger.cpp
+++ b/src/plugins/InteractiveDebugger.cpp
@@ -1,11 +1,12 @@
 // InteractiveDebugger.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
 // license terms please see the LICENSE file distributed with this
 // source code.
 
+#include "config.h"
 #include "core/common.h"
 
 #include <iterator>
@@ -308,7 +309,7 @@ void InteractiveDebugger::printFunction(
       cout << ", ";
     }
     cout << argItr->getName().str() << "=";
-    m_kernelInvocation->getCurrentWorkItem()->printValue(argItr);
+    m_kernelInvocation->getCurrentWorkItem()->printValue(&*argItr);
   }
 
   cout << ") at line " << dec << getLineNumber(instruction) << endl;
@@ -867,19 +868,27 @@ bool InteractiveDebugger::print(vector<string> args)
         cout << "not found" << endl;
         return false;
       }
+
       const llvm::Type *ptrType = ptr->getType();
+      unsigned addrSpace = ptrType->getPointerAddressSpace();
 
       // Check for alloca instruction, in which case look at allocated type
       bool alloca = false;
+      if (ptr->getValueID() == llvm::Value::GlobalVariableVal)
+      {
+        ptrType = ptrType->getPointerElementType();
+      }
       if (ptr->getValueID() >= llvm::Value::InstructionVal &&
           ((llvm::Instruction*)ptr)->getOpcode() == llvm::Instruction::Alloca)
       {
         ptrType = ((const llvm::AllocaInst*)ptr)->getAllocatedType();
+        if (ptrType->isPointerTy())
+          addrSpace = ptrType->getPointerAddressSpace();
         alloca = true;
       }
 
       // Ensure type is a pointer
-      if (!ptrType->isPointerTy())
+      if (!ptrType->isPointerTy() && !ptrType->isArrayTy())
       {
         cout << "not a pointer" << endl;
         return false;
@@ -891,12 +900,12 @@ bool InteractiveDebugger::print(vector<string> args)
       {
         // Load base address from private memory
         workItem->getPrivateMemory()->load((unsigned char*)&base,
-                                                    base, sizeof(size_t));
+                                           base, sizeof(size_t));
       }
 
       // Get target memory object
       Memory *memory = NULL;
-      switch (ptrType->getPointerAddressSpace())
+      switch (addrSpace)
       {
       case AddrSpacePrivate:
         memory = workItem->getPrivateMemory();
@@ -932,7 +941,12 @@ bool InteractiveDebugger::print(vector<string> args)
     }
     else
     {
-      if (!workItem->printVariable(args[i]))
+      try
+      {
+        if (!workItem->printVariable(args[i]))
+          cout << "not found";
+      }
+      catch (FatalError err)
       {
         cout << "not found";
       }
diff --git a/src/plugins/InteractiveDebugger.h b/src/plugins/InteractiveDebugger.h
index 2b5db65..d6c7775 100644
--- a/src/plugins/InteractiveDebugger.h
+++ b/src/plugins/InteractiveDebugger.h
@@ -1,5 +1,5 @@
 // InteractiveDebugger.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/plugins/Logger.cpp b/src/plugins/Logger.cpp
index 7b73296..acc8896 100644
--- a/src/plugins/Logger.cpp
+++ b/src/plugins/Logger.cpp
@@ -1,5 +1,5 @@
 // Logger.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/plugins/Logger.h b/src/plugins/Logger.h
index 294bc67..62bbcc8 100644
--- a/src/plugins/Logger.h
+++ b/src/plugins/Logger.h
@@ -1,5 +1,5 @@
 // Logger.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/plugins/MemCheck.cpp b/src/plugins/MemCheck.cpp
index fb04e57..c4d7474 100644
--- a/src/plugins/MemCheck.cpp
+++ b/src/plugins/MemCheck.cpp
@@ -1,5 +1,5 @@
 // MemCheck.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -10,6 +10,10 @@
 
 #include "core/Context.h"
 #include "core/Memory.h"
+#include "core/WorkItem.h"
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
 
 #include "MemCheck.h"
 
@@ -21,6 +25,36 @@ MemCheck::MemCheck(const Context *context)
 {
 }
 
+void MemCheck::instructionExecuted(const WorkItem *workItem,
+                                   const llvm::Instruction *instruction,
+                                   const TypedValue& result)
+{
+  // Check static array bounds if load or store is executed
+  const llvm::Value *PtrOp = nullptr;
+
+  if (auto LI = llvm::dyn_cast<llvm::LoadInst>(instruction))
+  {
+    PtrOp = LI->getPointerOperand();
+  }
+  else if (auto SI = llvm::dyn_cast<llvm::StoreInst>(instruction))
+  {
+    PtrOp = SI->getPointerOperand();
+  }
+  else
+  {
+    return;
+  }
+
+  // Walk up chain of GEP instructions leading to this access
+  while (auto GEPI =
+           llvm::dyn_cast<llvm::GetElementPtrInst>(PtrOp->stripPointerCasts()))
+  {
+    checkArrayAccess(workItem, GEPI);
+
+    PtrOp = GEPI->getPointerOperand();
+  }
+}
+
 void MemCheck::memoryAtomicLoad(const Memory *memory,
                                 const WorkItem *workItem,
                                 AtomicOp op, size_t address, size_t size)
@@ -47,6 +81,17 @@ void MemCheck::memoryLoad(const Memory *memory, const WorkGroup *workGroup,
   checkLoad(memory, address, size);
 }
 
+void MemCheck::memoryMap(const Memory *memory, size_t address,
+                         size_t offset, size_t size, cl_map_flags flags)
+{
+  MapRegion map =
+  {
+    address, offset, size, memory->getPointer(address + offset),
+    (flags == CL_MAP_READ ? MapRegion::READ : MapRegion::WRITE)
+  };
+  m_mapRegions.push_back(map);
+}
+
 void MemCheck::memoryStore(const Memory *memory, const WorkItem *workItem,
                            size_t address, size_t size,
                            const uint8_t *storeData)
@@ -61,6 +106,62 @@ void MemCheck::memoryStore(const Memory *memory, const WorkGroup *workGroup,
   checkStore(memory, address, size);
 }
 
+void MemCheck::memoryUnmap(const Memory *memory, size_t address,
+                           const void *ptr)
+{
+  for (auto region = m_mapRegions.begin();
+            region != m_mapRegions.end();
+            region++)
+  {
+    if (region->ptr == ptr)
+    {
+      m_mapRegions.erase(region);
+      return;
+    }
+  }
+}
+
+void MemCheck::checkArrayAccess(const WorkItem *workItem,
+                                const llvm::GetElementPtrInst *GEPI) const
+{
+  // Iterate through GEPI indices
+  const llvm::Type *ptrType = GEPI->getPointerOperandType();
+
+  for (auto opIndex = GEPI->idx_begin(); opIndex != GEPI->idx_end(); opIndex++)
+  {
+    int64_t index = workItem->getOperand(opIndex->get()).getSInt();
+
+    if (ptrType->isArrayTy())
+    {
+      // Check index doesn't exceed size of array
+      uint64_t size = ptrType->getArrayNumElements();
+
+      if ((uint64_t)index >= size)
+      {
+        ostringstream info;
+        info << "Index ("
+             << index << ") exceeds static array size ("
+             << size << ")";
+        m_context->logError(info.str().c_str());
+      }
+
+      ptrType = ptrType->getArrayElementType();
+    }
+    else if (ptrType->isPointerTy())
+    {
+      ptrType = ptrType->getPointerElementType();
+    }
+    else if (ptrType->isVectorTy())
+    {
+      ptrType = ptrType->getVectorElementType();
+    }
+    else if (ptrType->isStructTy())
+    {
+      ptrType = ptrType->getStructElementType(index);
+    }
+  }
+}
+
 void MemCheck::checkLoad(const Memory *memory,
                          size_t address, size_t size) const
 {
@@ -74,6 +175,21 @@ void MemCheck::checkLoad(const Memory *memory,
   {
     m_context->logError("Invalid read from write-only buffer");
   }
+  
+  if (memory->getAddressSpace() == AddrSpaceLocal || memory->getAddressSpace() == AddrSpacePrivate) return;
+
+  // Check if memory location is currently mapped for writing
+  for (auto region = m_mapRegions.begin();
+            region != m_mapRegions.end();
+            region++)
+  {
+    if (region->type == MapRegion::WRITE &&
+        address < region->address + region->size &&
+        address + size >= region->address)
+    {
+      m_context->logError("Invalid read from buffer mapped for writing");
+    }
+  }
 }
 
 void MemCheck::checkStore(const Memory *memory,
@@ -89,6 +205,20 @@ void MemCheck::checkStore(const Memory *memory,
   {
     m_context->logError("Invalid write to read-only buffer");
   }
+
+  if (memory->getAddressSpace() == AddrSpaceLocal || memory->getAddressSpace() == AddrSpacePrivate) return;
+
+  // Check if memory location is currently mapped
+  for (auto region = m_mapRegions.begin();
+            region != m_mapRegions.end();
+            region++)
+  {
+    if (address < region->address + region->size &&
+        address + size >= region->address)
+    {
+      m_context->logError("Invalid write to mapped buffer");
+    }
+  }
 }
 
 void MemCheck::logInvalidAccess(bool read, unsigned addrSpace,
@@ -104,4 +234,4 @@ void MemCheck::logInvalidAccess(bool read, unsigned addrSpace,
       << "Entity: " << msg.CURRENT_ENTITY << endl
       << msg.CURRENT_LOCATION << endl;
   msg.send();
-}
\ No newline at end of file
+}
diff --git a/src/plugins/MemCheck.h b/src/plugins/MemCheck.h
index 9e685bf..7e72d8a 100644
--- a/src/plugins/MemCheck.h
+++ b/src/plugins/MemCheck.h
@@ -1,5 +1,5 @@
 // MemCheck.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -8,6 +8,11 @@
 
 #include "core/Plugin.h"
 
+namespace llvm
+{
+    class GetElementPtrInst;
+}
+
 namespace oclgrind
 {
   class MemCheck : public Plugin
@@ -15,6 +20,9 @@ namespace oclgrind
   public:
     MemCheck(const Context *context);
 
+    virtual void instructionExecuted(const WorkItem *workItem,
+                                     const llvm::Instruction *instruction,
+                                     const TypedValue& result) override;
     virtual void memoryAtomicLoad(const Memory *memory,
                                   const WorkItem *workItem,
                                   AtomicOp op,
@@ -27,17 +35,34 @@ namespace oclgrind
                             size_t address, size_t size) override;
     virtual void memoryLoad(const Memory *memory, const WorkGroup *workGroup,
                             size_t address, size_t size) override;
+    virtual void memoryMap(const Memory *memory, size_t address,
+                           size_t offset, size_t size,
+                           cl_map_flags flags) override;
     virtual void memoryStore(const Memory *memory, const WorkItem *workItem,
                              size_t address, size_t size,
                              const uint8_t *storeData) override;
     virtual void memoryStore(const Memory *memory, const WorkGroup *workGroup,
                              size_t address, size_t size,
                              const uint8_t *storeData) override;
+    virtual void memoryUnmap(const Memory *memory, size_t address,
+                             const void *ptr) override;
 
   private:
+    void checkArrayAccess(const WorkItem *workItem,
+                          const llvm::GetElementPtrInst *GEPI) const;
     void checkLoad(const Memory *memory, size_t address, size_t size) const;
     void checkStore(const Memory *memory, size_t address, size_t size) const;
     void logInvalidAccess(bool read, unsigned addrSpace,
                           size_t address, size_t size) const;
+
+    struct MapRegion
+    {
+      size_t address;
+      size_t offset;
+      size_t size;
+      const void *ptr;
+      enum {READ, WRITE} type;
+    };
+    std::list<MapRegion> m_mapRegions;
   };
 }
diff --git a/src/plugins/RaceDetector.cpp b/src/plugins/RaceDetector.cpp
index 10f417e..8c38907 100644
--- a/src/plugins/RaceDetector.cpp
+++ b/src/plugins/RaceDetector.cpp
@@ -1,5 +1,5 @@
 // RaceDetector.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -19,7 +19,14 @@
 using namespace oclgrind;
 using namespace std;
 
-#define KEY(memory,address) make_pair(memory, EXTRACT_BUFFER(address))
+THREAD_LOCAL RaceDetector::WorkerState RaceDetector::m_state = {NULL};
+
+#define STATE(workgroup) (m_state.groups->at(workgroup))
+
+// Use a bank of mutexes to reduce unnecessary synchronisation
+#define NUM_GLOBAL_MUTEXES 4096 // Must be power of two
+#define GLOBAL_MUTEX(buffer,offset) \
+  m_globalMutexes[buffer][offset & (NUM_GLOBAL_MUTEXES-1)]
 
 RaceDetector::RaceDetector(const Context *context)
  : Plugin(context)
@@ -29,12 +36,6 @@ RaceDetector::RaceDetector(const Context *context)
   m_allowUniformWrites = !checkEnv("OCLGRIND_UNIFORM_WRITES");
 }
 
-bool RaceDetector::isThreadSafe() const
-{
-  // TODO: Improve DRD efficiency for multi-threaded case instead.
-  return false;
-}
-
 void RaceDetector::kernelBegin(const KernelInvocation *kernelInvocation)
 {
   m_kernelInvocation = kernelInvocation;
@@ -42,295 +43,486 @@ void RaceDetector::kernelBegin(const KernelInvocation *kernelInvocation)
 
 void RaceDetector::kernelEnd(const KernelInvocation *kernelInvocation)
 {
-  synchronize(m_context->getGlobalMemory(), false);
+  // Log races
+  for (auto race : kernelRaces)
+    logRace(race);
+  kernelRaces.clear();
+
+  // Clear all global memory accesses
+  for (auto &buffer : m_globalAccesses)
+  {
+    size_t sz = buffer.second.size();
+    buffer.second.clear();
+    buffer.second.resize(sz);
+  }
 
   m_kernelInvocation = NULL;
 }
 
 void RaceDetector::memoryAllocated(const Memory *memory, size_t address,
-                                   size_t size, cl_mem_flags flags)
+                                   size_t size, cl_mem_flags flags,
+                                   const uint8_t *initData)
 {
-  if (memory->getAddressSpace() == AddrSpacePrivate ||
-      memory->getAddressSpace() == AddrSpaceConstant)
-    return;
-
-  m_state[KEY(memory,address)] = make_pair(new State[size], size);
+  size_t buffer = memory->extractBuffer(address);
+  if (memory->getAddressSpace() == AddrSpaceGlobal)
+  {
+    m_globalAccesses[buffer].resize(size);
+    m_globalMutexes[buffer] = new mutex[NUM_GLOBAL_MUTEXES];
+  }
 }
 
 void RaceDetector::memoryAtomicLoad(const Memory *memory,
                                     const WorkItem *workItem,
                                     AtomicOp op, size_t address, size_t size)
 {
-  registerAtomic(memory, workItem, address, size, false);
+  registerAccess(memory, workItem->getWorkGroup(), workItem,
+                 address, size, true);
 }
 
 void RaceDetector::memoryAtomicStore(const Memory *memory,
                                      const WorkItem *workItem,
                                      AtomicOp op, size_t address, size_t size)
 {
-  registerAtomic(memory, workItem, address, size, true);
+  registerAccess(memory, workItem->getWorkGroup(), workItem,
+                 address, size, true,
+                 (const uint8_t*)memory->getPointer(address));
 }
 
 void RaceDetector::memoryDeallocated(const Memory *memory, size_t address)
 {
-  if (memory->getAddressSpace() == AddrSpacePrivate ||
-      memory->getAddressSpace() == AddrSpaceConstant)
-    return;
+  size_t buffer = memory->extractBuffer(address);
+  if (memory->getAddressSpace() == AddrSpaceGlobal)
+  {
+    m_globalAccesses.erase(buffer);
 
-  delete[] m_state[KEY(memory,address)].first;
-  m_state.erase(KEY(memory,address));
+    delete[] m_globalMutexes.at(buffer);
+    m_globalMutexes.erase(buffer);
+  }
 }
 
 void RaceDetector::memoryLoad(const Memory *memory, const WorkItem *workItem,
                               size_t address, size_t size)
 {
-  registerLoadStore(memory, workItem, workItem->getWorkGroup(),
-                    address, size, NULL);
+  registerAccess(memory, workItem->getWorkGroup(), workItem,
+                 address, size, false, NULL);
 }
 
 void RaceDetector::memoryLoad(const Memory *memory, const WorkGroup *workGroup,
                               size_t address, size_t size)
 {
-  registerLoadStore(memory, NULL, workGroup, address, size, NULL);
+  registerAccess(memory, workGroup, NULL, address, size, false);
 }
 
 void RaceDetector::memoryStore(const Memory *memory, const WorkItem *workItem,
                                size_t address, size_t size,
                                const uint8_t *storeData)
 {
-  registerLoadStore(memory, workItem, workItem->getWorkGroup(),
-                    address, size, storeData);
+  registerAccess(memory, workItem->getWorkGroup(), workItem,
+                 address, size, false, storeData);
 }
 
 void RaceDetector::memoryStore(const Memory *memory, const WorkGroup *workGroup,
                                size_t address, size_t size,
                                const uint8_t *storeData)
 {
-  registerLoadStore(memory, NULL, workGroup, address, size, storeData);
+  registerAccess(memory, workGroup, NULL,
+                 address, size, false, storeData);
+}
+
+void RaceDetector::workGroupBarrier(const WorkGroup *workGroup, uint32_t flags)
+{
+  if (flags & CLK_LOCAL_MEM_FENCE)
+  {
+    syncWorkItems(workGroup->getLocalMemory(),
+                  STATE(workGroup), STATE(workGroup).wiLocal);
+  }
+  if (flags & CLK_GLOBAL_MEM_FENCE)
+  {
+    syncWorkItems(m_context->getGlobalMemory(),
+                  STATE(workGroup), STATE(workGroup).wiGlobal);
+  }
+}
+
+void RaceDetector::workGroupBegin(const WorkGroup *workGroup)
+{
+  // Create worker state if it has not been created already
+  if (!m_state.groups)
+  {
+    m_state.groups = new unordered_map<const WorkGroup*,WorkGroupState>;
+  }
+
+  // Initialize work-group state
+  WorkGroupState& state = (*m_state.groups)[workGroup];
+  Size3 wgsize = workGroup->getGroupSize();
+  state.numWorkItems = wgsize.x*wgsize.y*wgsize.z;
+
+  // Re-use pool allocator for all access maps
+  AccessMap tmp(0, AccessMap::hasher(), AccessMap::key_equal(),
+                state.wgGlobal.get_allocator());
+  state.wiGlobal.resize(state.numWorkItems+1, tmp);
+  state.wiLocal.resize(state.numWorkItems+1, tmp);
+}
+
+void RaceDetector::workGroupComplete(const WorkGroup *workGroup)
+{
+  WorkGroupState& state = STATE(workGroup);
+
+  syncWorkItems(workGroup->getLocalMemory(), state, state.wiLocal);
+  syncWorkItems(m_context->getGlobalMemory(), state, state.wiGlobal);
+
+  // Merge global accesses across kernel invocation
+  size_t group = workGroup->getGroupIndex();
+  for (auto &record : state.wgGlobal)
+  {
+    size_t address = record.first;
+    size_t buffer = m_context->getGlobalMemory()->extractBuffer(address);
+    size_t offset = m_context->getGlobalMemory()->extractOffset(address);
+
+    lock_guard<mutex> lock(GLOBAL_MUTEX(buffer, offset));
+
+    AccessRecord& a = record.second;
+    AccessRecord& b = m_globalAccesses.at(buffer)[offset];
+
+    // Check for races with previous accesses
+    if (check(a.load,  b.store) && getAccessWorkGroup(b.store) != group)
+      insertKernelRace({AddrSpaceGlobal, address, a.load, b.store});
+    if (check(a.store, b.load) && getAccessWorkGroup(b.load) != group)
+      insertKernelRace({AddrSpaceGlobal, address, a.store, b.load});
+    if (check(a.store, b.store) && getAccessWorkGroup(b.store) != group)
+      insertKernelRace({AddrSpaceGlobal, address, a.store, b.store});
+
+    // Insert accesses
+    if (a.load.isSet())
+      insert(b, a.load);
+    if (a.store.isSet())
+      insert(b, a.store);
+  }
+  state.wgGlobal.clear();
+
+  // Clean-up work-group state
+  m_state.groups->erase(workGroup);
+  if (m_state.groups->empty())
+  {
+    delete m_state.groups;
+    m_state.groups = NULL;
+  }
+}
+
+bool RaceDetector::check(const MemoryAccess& a,
+                         const MemoryAccess& b) const
+{
+  // Ensure both accesses are valid
+  if (!a.isSet() || !b.isSet())
+    return false;
+
+  // No race if same work-item
+  if (a.isWorkItem() && b.isWorkItem() && (a.getEntity() == b.getEntity()))
+    return false;
+
+  // No race if both operations are atomics
+  if (a.isAtomic() && b.isAtomic())
+    return false;
+
+  // Potential race if at least one store
+  if (a.isStore() || b.isStore())
+  {
+    // Read-write race if one is a load
+    if (a.isLoad() || b.isLoad())
+      return true;
+
+    // Write-write race if not uniform
+    if (!m_allowUniformWrites || (a.getStoreData() != b.getStoreData()))
+      return true;
+  }
+
+  return false;
+}
+
+size_t RaceDetector::getAccessWorkGroup(const MemoryAccess& access) const
+{
+  if (access.isWorkItem())
+  {
+    const Size3& wgsize = m_kernelInvocation->getLocalSize();
+    return access.getEntity() / (wgsize.x*wgsize.y*wgsize.z);
+  }
+  else
+    return access.getEntity();
+}
+
+void RaceDetector::insert(AccessRecord& record,
+                          const MemoryAccess& access) const
+{
+  if (access.isLoad())
+  {
+    if (!record.load.isSet() || record.load.isAtomic())
+      record.load = access;
+  }
+  else if (access.isStore())
+  {
+    if (!record.store.isSet() || record.store.isAtomic())
+      record.store = access;
+  }
 }
 
-void RaceDetector::logRace(DataRaceType type,
-                           unsigned int addrSpace,
-                           size_t address,
-                           size_t lastWorkGroup,
-                           size_t lastWorkItem,
-                           const llvm::Instruction *lastInstruction) const
+void RaceDetector::insertKernelRace(const Race& race)
 {
-  const char *raceType = NULL;
-  switch (type)
+  lock_guard<mutex> lock(kernelRacesMutex);
+  insertRace(kernelRaces, race);
+}
+
+void RaceDetector::insertRace(RaceList& races, const Race& race) const
+{
+  // Check list for duplicates
+  for (auto x = races.begin(); x != races.end(); x++)
   {
-    case ReadWriteRace:
-      raceType = "Read-write";
-      break;
-    case WriteWriteRace:
-      raceType = "Write-write";
-      break;
+    // Check if races are equal modulo address
+    if ((race.a == x->a && race.b == x->b) ||
+        (race.a == x->b && race.b == x->a))
+    {
+      // If they match, keep the one with the lowest address
+      if (race.address < x->address)
+      {
+        races.erase(x);
+        races.push_back(race);
+        return;
+      }
+      else
+        return;
+    }
   }
 
+  races.push_back(race);
+}
+
+void RaceDetector::logRace(const Race& race) const
+{
+  const char *raceType;
+  if (race.a.isLoad() || race.b.isLoad())
+    raceType = "Read-write";
+  else
+    raceType = "Write-write";
+
   Context::Message msg(ERROR, m_context);
   msg << raceType << " data race at "
-      << getAddressSpaceName(addrSpace)
-      << " memory address 0x" << hex << address << endl
+      << getAddressSpaceName(race.addrspace)
+      << " memory address 0x" << hex << race.address << endl
       << msg.INDENT
       << "Kernel: " << msg.CURRENT_KERNEL << endl
       << endl
-      << "First entity:  " << msg.CURRENT_ENTITY << endl
-      << msg.CURRENT_LOCATION << endl
-      << endl
-      << "Second entity: ";
+      << "First entity:  ";
 
-  // Show details of other entity involved in race
-  if (lastWorkItem != -1)
+  if (race.a.isWorkItem())
   {
-    Size3 global(lastWorkItem, m_kernelInvocation->getGlobalSize());
-    Size3 local, group;
-    local.x = global.x % m_kernelInvocation->getLocalSize().x;
-    local.y = global.y % m_kernelInvocation->getLocalSize().y;
-    local.z = global.z % m_kernelInvocation->getLocalSize().z;
-    group.x = global.x / m_kernelInvocation->getLocalSize().x;
-    group.y = global.y / m_kernelInvocation->getLocalSize().y;
-    group.z = global.z / m_kernelInvocation->getLocalSize().z;
+    Size3 wgsize = m_kernelInvocation->getLocalSize();
+    Size3 global(race.a.getEntity(), m_kernelInvocation->getGlobalSize());
+    Size3 local(global.x%wgsize.x, global.y%wgsize.y, global.z%wgsize.z);
+    Size3 group(global.x/wgsize.x, global.y/wgsize.y, global.z/wgsize.z);
     msg << "Global" << global << " Local" << local << " Group" << group;
   }
-  else if (lastWorkGroup != -1)
+  else // entity is a linear work-group index, decoded via group counts
   {
     msg << "Group"
-        << Size3(lastWorkGroup, m_kernelInvocation->getNumGroups());
+        << Size3(race.a.getEntity(), m_kernelInvocation->getNumGroups());
+  }
+
+  msg << endl << race.a.getInstruction() << endl
+      << endl
+      << "Second entity: ";
+
+  // Show details of other entity involved in race
+  if (race.b.isWorkItem())
+  {
+    Size3 wgsize = m_kernelInvocation->getLocalSize();
+    Size3 global(race.b.getEntity(), m_kernelInvocation->getGlobalSize());
+    Size3 local(global.x%wgsize.x, global.y%wgsize.y, global.z%wgsize.z);
+    Size3 group(global.x/wgsize.x, global.y/wgsize.y, global.z/wgsize.z);
+    msg << "Global" << global << " Local" << local << " Group" << group;
   }
   else
   {
-    msg << "(unknown)";
+    msg << "Group" // entity is a linear work-group index
+        << Size3(race.b.getEntity(), m_kernelInvocation->getNumGroups());
   }
-  msg << endl
-      << lastInstruction << endl;
+  msg << endl << race.b.getInstruction() << endl;
   msg.send();
 }
 
-void RaceDetector::registerAtomic(const Memory *memory,
+void RaceDetector::registerAccess(const Memory *memory,
+                                  const WorkGroup *workGroup,
                                   const WorkItem *workItem,
-                                  size_t address, size_t size,
-                                  bool store)
+                                  size_t address, size_t size, bool atomic,
+                                  const uint8_t *storeData)
 {
+  unsigned addrSpace = memory->getAddressSpace();
+  if (addrSpace == AddrSpacePrivate ||
+      addrSpace == AddrSpaceConstant)
+    return;
   if (!memory->isAddressValid(address, size))
     return;
 
-  State *state = m_state[KEY(memory,address)].first + EXTRACT_OFFSET(address);
+  // Construct access
+  MemoryAccess access(workGroup, workItem, storeData != NULL, atomic);
 
-  // Get work-item index
-  size_t workItemIndex = workItem->getGlobalIndex();
+  size_t index;
+  if (workItem)
+  {
+    Size3 wgsize = workGroup->getGroupSize();
+    Size3 lid = workItem->getLocalID();
+    index = lid.x + (lid.y + lid.z*wgsize.y)*wgsize.x;
+  }
+  else
+  {
+    index = STATE(workGroup).wiLocal.size() - 1;
+  }
+
+  AccessMap& accesess = (addrSpace == AddrSpaceGlobal) ?
+    STATE(workGroup).wiGlobal[index] :
+    STATE(workGroup).wiLocal[index];
 
-  bool race = false;
-  for (size_t offset = 0; offset < size; offset++, state++)
+  for (size_t i = 0; i < size; i++)
   {
-    // Check for races with non-atomic operations
-    bool conflict = store ? !state->canAtomicStore : !state->canAtomicLoad;
-    if (!race && conflict && workItemIndex != state->workItem)
-    {
-      logRace(ReadWriteRace,
-              memory->getAddressSpace(),
-              address,
-              state->workItem,
-              state->workGroup,
-              state->instruction);
-      race = true;
-    }
+    if (storeData)
+      access.setStoreData(storeData[i]);
+
+    insert(accesess[address+i], access);
+  }
+}
 
-    // Update state
-    if (store)
-      state->canLoad = false;
-    state->canStore = false;
-    if (!state->wasWorkItem)
+void RaceDetector::syncWorkItems(const Memory *memory,
+                                 WorkGroupState& state,
+                                 vector<AccessMap>& accesses)
+{
+  AccessMap wgAccesses(0, AccessMap::hasher(), AccessMap::key_equal(),
+                       state.wgGlobal.get_allocator());
+
+  for (size_t i = 0; i < state.numWorkItems + 1; i++)
+  {
+    RaceList races;
+    for (auto &record : accesses[i])
     {
-      state->instruction = workItem->getCurrentInstruction();
-      state->workItem = workItemIndex;
-      state->wasWorkItem = true;
+      size_t address = record.first;
+
+      AccessRecord& a = record.second;
+      AccessRecord& b = wgAccesses[address];
+
+      if (check(a.load,  b.store))
+        insertRace(races, {memory->getAddressSpace(),address,a.load,b.store});
+      if (check(a.store, b.load))
+        insertRace(races, {memory->getAddressSpace(),address,a.store,b.load});
+      if (check(a.store, b.store))
+        insertRace(races, {memory->getAddressSpace(),address,a.store,b.store});
+
+      if (a.load.isSet())
+      {
+        insert(b, a.load);
+        if (memory->getAddressSpace() == AddrSpaceGlobal)
+          insert(state.wgGlobal[address], a.load);
+      }
+      if (a.store.isSet())
+      {
+        insert(b, a.store);
+        if (memory->getAddressSpace() == AddrSpaceGlobal)
+          insert(state.wgGlobal[address], a.store);
+      }
     }
+
+    accesses[i].clear();
+
+    // Log races
+    for (auto race : races)
+      logRace(race);
   }
 }
 
-void RaceDetector::registerLoadStore(const Memory *memory,
-                                     const WorkItem *workItem,
-                                     const WorkGroup *workGroup,
-                                     size_t address, size_t size,
-                                     const uint8_t *storeData)
+RaceDetector::MemoryAccess::MemoryAccess()
 {
-  if (!m_kernelInvocation)
-    return;
-  if (memory->getAddressSpace() == AddrSpacePrivate ||
-      memory->getAddressSpace() == AddrSpaceConstant)
-    return;
-  if (!memory->isAddressValid(address, size))
-    return;
+  this->info = 0;
+  this->instruction = NULL;
+}
 
-  bool load = !storeData;
-  bool store = storeData;
+RaceDetector::MemoryAccess::MemoryAccess(const WorkGroup *workGroup,
+                                         const WorkItem *workItem,
+                                         bool store, bool atomic)
+{
+  this->info = 0;
+
+  this->info |= 1 << SET_BIT;
+  this->info |= store << STORE_BIT;
+  this->info |= atomic << ATOMIC_BIT;
 
-  // Get index of work-item and work-group performing access
-  size_t workItemIndex = -1, workGroupIndex = -1;
   if (workItem)
   {
-    workItemIndex = workItem->getGlobalIndex();
+    this->entity = workItem->getGlobalIndex();
+    this->instruction = workItem->getCurrentInstruction();
   }
-  if (workGroup)
+  else
   {
-    workGroupIndex = workGroup->getGroupIndex();
+    this->info |= (1<<WG_BIT);
+    this->entity = workGroup->getGroupIndex();
+    this->instruction = NULL; // TODO?
   }
+}
 
-  bool race = false;
-  size_t base = EXTRACT_OFFSET(address);
-  State *state = m_state[KEY(memory, address)].first + base;
+void RaceDetector::MemoryAccess::clear()
+{
+  this->info = 0;
+  this->instruction = NULL;
+}
 
-  for (size_t offset = 0; offset < size; offset++, state++)
-  {
-    bool conflict = store ? !state->canStore : !state->canLoad;
-    if (m_allowUniformWrites && storeData)
-    {
-      uint8_t *ptr = (uint8_t*)(memory->getPointer(address));
-      conflict &= (ptr[offset] != storeData[offset]);
-    }
+bool RaceDetector::MemoryAccess::isSet() const
+{
+  return this->info & (1<<SET_BIT);
+}
 
-    if (!race && conflict &&
-        (state->wasWorkItem ?                // If state set by work-item,
-         state->workItem != workItemIndex :  // must be same work-item,
-         state->workGroup != workGroupIndex) // otherwise must be same group
-        )
-    {
-      // Report data-race
-      DataRaceType type = load|state->canLoad ? ReadWriteRace : WriteWriteRace;
-      logRace(type, memory->getAddressSpace(),
-              address + offset,
-              state->workItem,
-              state->workGroup,
-              state->instruction);
-      race = true;
-    }
-    else
-    {
-      // Only update WI info if this operation is stronger than previous one
-      bool updateWI = store || (load && state->canStore);
-
-      // Update state
-      if (store)
-        state->canAtomicLoad = false;
-      state->canAtomicStore = false;
-      state->canLoad &= load;
-      state->canStore = false;
-      if (updateWI)
-      {
-        state->workGroup = workGroupIndex;
-        if (workItem)
-        {
-          state->instruction = workItem->getCurrentInstruction();
-          state->workItem = workItemIndex;
-          state->wasWorkItem = true;
-        }
-      }
-    }
-  }
+bool RaceDetector::MemoryAccess::isAtomic() const
+{
+  return this->info & (1<<ATOMIC_BIT);
 }
 
-void RaceDetector::synchronize(const Memory *memory, bool workGroup)
+bool RaceDetector::MemoryAccess::isLoad() const
 {
-  StateMap::iterator itr;
-  for (itr = m_state.begin(); itr != m_state.end(); itr++)
-  {
-    if (itr->first.first != memory)
-      continue;
+  return !isStore();
+}
 
-    pair<State*,size_t> obj = itr->second;
-    for (State *state = obj.first; state < obj.first+obj.second; state++)
-    {
-      // TODO: atomic_intergroup_race test failure
-      state->canAtomicLoad = true;
-      state->canAtomicStore = true;
-      state->workItem = -1;
-      state->wasWorkItem = false;
-      if (!workGroup)
-      {
-        state->workGroup = -1;
-        state->canLoad = true;
-        state->canStore = true;
-      }
-    }
-  }
+bool RaceDetector::MemoryAccess::isStore() const
+{
+  return this->info & (1<<STORE_BIT);
 }
 
-void RaceDetector::workGroupBarrier(const WorkGroup *workGroup, uint32_t flags)
+bool RaceDetector::MemoryAccess::isWorkGroup() const
 {
-  if (flags & CLK_LOCAL_MEM_FENCE)
-    synchronize(workGroup->getLocalMemory(), false);
-  if (flags & CLK_GLOBAL_MEM_FENCE)
-    synchronize(m_context->getGlobalMemory(), true);
+  return this->info & (1<<WG_BIT);
+}
+
+bool RaceDetector::MemoryAccess::isWorkItem() const
+{
+  return !isWorkGroup();
+}
+
+size_t RaceDetector::MemoryAccess::getEntity() const
+{
+  return this->entity;
+}
+
+const llvm::Instruction* RaceDetector::MemoryAccess::getInstruction() const
+{
+  return this->instruction;
+}
+
+uint8_t RaceDetector::MemoryAccess::getStoreData() const
+{
+  return this->storeData;
+}
+
+void RaceDetector::MemoryAccess::setStoreData(uint8_t data)
+{
+  this->storeData = data;
 }
 
-RaceDetector::State::State()
+bool RaceDetector::MemoryAccess::operator==(
+  const RaceDetector::MemoryAccess& other) const
 {
-  instruction = NULL;
-  workItem = -1;
-  workGroup = -1;
-  canAtomicLoad = true;
-  canAtomicStore = true;
-  canLoad = true;
-  canStore = true;
-  wasWorkItem = false;
+  return this->entity == other.entity &&
+         this->instruction == other.instruction &&
+         this->info == other.info;
 }
diff --git a/src/plugins/RaceDetector.h b/src/plugins/RaceDetector.h
index 2442b56..c65cbd3 100644
--- a/src/plugins/RaceDetector.h
+++ b/src/plugins/RaceDetector.h
@@ -1,5 +1,5 @@
 // RaceDetector.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
@@ -8,6 +8,8 @@
 
 #include "core/Plugin.h"
 
+#include <mutex>
+
 namespace oclgrind
 {
   class RaceDetector : public Plugin
@@ -18,7 +20,8 @@ namespace oclgrind
     virtual void kernelBegin(const KernelInvocation *kernelInvocation) override;
     virtual void kernelEnd(const KernelInvocation *kernelInvocation) override;
     virtual void memoryAllocated(const Memory *memory, size_t address,
-                                 size_t size, cl_mem_flags flags) override;
+                                 size_t size, cl_mem_flags flags,
+                                 const uint8_t *initData) override;
     virtual void memoryAtomicLoad(const Memory *memory,
                                   const WorkItem *workItem,
                                   AtomicOp op,
@@ -27,7 +30,8 @@ namespace oclgrind
                                    const WorkItem *workItem,
                                    AtomicOp op,
                                    size_t address, size_t size) override;
-    virtual void memoryDeallocated(const Memory *memory, size_t address);
+    virtual void memoryDeallocated(const Memory *memory,
+                                   size_t address) override;
     virtual void memoryLoad(const Memory *memory, const WorkItem *workItem,
                             size_t address, size_t size) override;
     virtual void memoryLoad(const Memory *memory, const WorkGroup *workGroup,
@@ -40,55 +44,103 @@ namespace oclgrind
                              const uint8_t *storeData) override;
     virtual void workGroupBarrier(const WorkGroup *workGroup,
                                   uint32_t flags) override;
-
-    virtual bool isThreadSafe() const override;
+    virtual void workGroupBegin(const WorkGroup *workGroup) override;
+    virtual void workGroupComplete(const WorkGroup *workGroup) override;
 
   private:
-    struct State
+    struct MemoryAccess
     {
+    private:
+      size_t entity;
       const llvm::Instruction *instruction;
-      size_t workItem;
-      size_t workGroup;
-      bool canAtomicLoad;
-      bool canAtomicStore;
-      bool canLoad;
-      bool canStore;
-      bool wasWorkItem;
-
-      State();
+
+      uint8_t info;
+      static const unsigned SET_BIT     = 0;
+      static const unsigned STORE_BIT   = 1;
+      static const unsigned ATOMIC_BIT  = 2;
+      static const unsigned WG_BIT      = 3;
+      uint8_t storeData;
+
+    public:
+      void clear();
+
+      bool isSet() const;
+
+      bool isAtomic() const;
+      bool isLoad() const;
+      bool isStore() const;
+      bool isWorkGroup() const;
+      bool isWorkItem() const;
+
+      size_t getEntity() const;
+      const llvm::Instruction* getInstruction() const;
+
+      uint8_t getStoreData() const;
+      void    setStoreData(uint8_t);
+
+      MemoryAccess();
+      MemoryAccess(const WorkGroup *workGroup, const WorkItem *workItem,
+                   bool store, bool atomic);
+
+      bool operator==(const MemoryAccess& other) const;
     };
+    struct AccessRecord
+    {
+      MemoryAccess load;
+      MemoryAccess store;
+    };
+    typedef std::vector<MemoryAccess> AccessList;
+    typedef std::unordered_map<
+      size_t,AccessRecord,
+      std::hash<size_t>,
+      std::equal_to<size_t>,
+      PoolAllocator<std::pair<const size_t,AccessRecord>,8192>
+      > AccessMap;
+
+    std::unordered_map<size_t,std::vector<AccessRecord>> m_globalAccesses;
+    std::map< size_t,std::mutex* > m_globalMutexes;
 
-    // Enumeration for types of data-race
-    enum DataRaceType
+    struct WorkGroupState
+    {
+      size_t numWorkItems;
+      std::vector<AccessMap> wiLocal;
+      std::vector<AccessMap> wiGlobal;
+      AccessMap wgGlobal;
+    };
+    struct WorkerState
     {
-      ReadWriteRace,
-      WriteWriteRace
+      std::unordered_map<const WorkGroup*,WorkGroupState> *groups;
     };
+    static THREAD_LOCAL WorkerState m_state;
 
-    typedef std::map<
-                      std::pair<const Memory*, size_t>,
-                      std::pair<State*, size_t>
-                    > StateMap;
-    StateMap m_state;
+    struct Race
+    {
+      unsigned addrspace;
+      size_t address;
+      MemoryAccess a, b;
+    };
+    typedef std::list<Race> RaceList;
 
     bool m_allowUniformWrites;
     const KernelInvocation *m_kernelInvocation;
 
-    void logRace(DataRaceType type,
-                 unsigned int addrSpace,
-                 size_t address,
-                 size_t lastWorkGroup,
-                 size_t lastWorkItem,
-                 const llvm::Instruction *lastInstruction) const;
-    void registerAtomic(const Memory *memory,
+    std::mutex kernelRacesMutex;
+    RaceList kernelRaces;
+
+    size_t getAccessWorkGroup(const MemoryAccess& access) const;
+
+    bool check(const MemoryAccess& a, const MemoryAccess& b) const;
+    void insert(AccessRecord& record, const MemoryAccess& access) const;
+    void insertKernelRace(const Race& race);
+    void insertRace(RaceList& races, const Race& race) const;
+    void logRace(const Race& race) const;
+    void registerAccess(const Memory *memory,
+                        const WorkGroup *workGroup,
                         const WorkItem *workItem,
-                        size_t address, size_t size,
-                        bool store);
-    void registerLoadStore(const Memory *memory,
-                           const WorkItem *workItem,
-                           const WorkGroup *workGroup,
-                           size_t address, size_t size,
-                           const uint8_t *storeData);
-    void synchronize(const Memory *memory, bool workGroup);
+                        size_t address, size_t size, bool atomic,
+                        const uint8_t *storeData = NULL);
+    void syncWorkItems(const Memory *memory,
+                       WorkGroupState& state,
+                       std::vector<AccessMap>& accesses);
   };
 }
diff --git a/src/plugins/Uninitialized.cpp b/src/plugins/Uninitialized.cpp
new file mode 100644
index 0000000..fb9fd77
--- /dev/null
+++ b/src/plugins/Uninitialized.cpp
@@ -0,0 +1,2811 @@
+// Uninitialized.h (Oclgrind)
+// Copyright (c) 2015, Moritz Pflanzer
+// Imperial College London. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/common.h"
+
+#include "core/Context.h"
+#include "core/Memory.h"
+#include "core/WorkItem.h"
+#include "core/WorkGroup.h"
+#include "core/Kernel.h"
+#include "core/KernelInvocation.h"
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Type.h"
+
+#include "Uninitialized.h"
+#include <mutex>
+
+using namespace oclgrind;
+using namespace std;
+
+//void Uninitialized::memoryAllocated(const Memory *memory, size_t address,
+//                                 size_t size, cl_mem_flags flags,
+//                                 const uint8_t *initData)
+//{
+//    cout << "Memory: " << memory << ", address: " << hex << address << dec << ", size: " << size << endl;
+//}
+
+// Multiple mutexes to mitigate risk of unnecessary synchronisation in atomics
+#define NUM_ATOMIC_MUTEXES 64 // Must be power of two
+static std::mutex atomicShadowMutex[NUM_ATOMIC_MUTEXES];
+// Selects a mutex from the pool above: the offset is shifted right by two
+// bits and masked with NUM_ATOMIC_MUTEXES-1, which is why the pool size has
+// to be a power of two.
+#define ATOMIC_MUTEX(offset) \
+  atomicShadowMutex[(((offset)>>2) & (NUM_ATOMIC_MUTEXES-1))]
+
+// Definition of the static ShadowContext::m_workSpace member (declared with
+// the THREAD_LOCAL macro); all pointer fields start as NULL, the last as 0.
+THREAD_LOCAL ShadowContext::WorkSpace ShadowContext::m_workSpace = {NULL, NULL, NULL, 0};
+
+// Builds the plugin for the given context. The shadow context is constructed
+// with 32 when size_t is 8 bytes wide and with 16 otherwise, and its memory
+// pool is created immediately.
+Uninitialized::Uninitialized(const Context *context)
+ : Plugin(context),
+   shadowContext((sizeof(size_t) == 8) ? 32 : 16)
+{
+    shadowContext.createMemoryPool();
+}
+
+// Destroys the shadow context's memory pool that the constructor created.
+Uninitialized::~Uninitialized()
+{
+    shadowContext.destroyMemoryPool();
+}
+
+// Allocates SM.size*SM.num bytes of shadow memory at the given address in
+// the given address space and then stores the shadow value SM there.
+// Nothing is allocated or stored for the constant address space.
+void Uninitialized::allocAndStoreShadowMemory(unsigned addrSpace, size_t address, TypedValue SM,
+        const WorkItem *workItem, const WorkGroup *workGroup, bool unchecked)
+{
+    if(addrSpace != AddrSpaceConstant)
+    {
+        ShadowMemory *shadow = getShadowMemory(addrSpace, workItem, workGroup);
+        const size_t numBytes = SM.size*SM.num;
+
+        shadow->allocate(address, numBytes);
+        storeShadowMemory(addrSpace, address, SM, workItem, workGroup, unchecked);
+    }
+
+    //TODO: Eventually store value for the constant address space
+}
+
+// Returns true when the shadow of every operand of I is clean. At the first
+// operand whose shadow is not clean, logUninitializedCF() is called and false
+// is returned without looking at the remaining operands.
+bool Uninitialized::checkAllOperandsDefined(const WorkItem *workItem, const llvm::Instruction *I)
+{
+    llvm::Instruction::const_op_iterator operand = I->op_begin();
+    const llvm::Instruction::const_op_iterator lastOperand = I->op_end();
+
+    for(; operand != lastOperand; ++operand)
+    {
+        if(ShadowContext::isCleanValue(shadowContext.getValue(workItem, operand->get())))
+        {
+            continue;
+        }
+
+#ifdef DUMP_SHADOW
+        operand->get()->dump();
+        cout << "Shadow value: " << shadowContext.getValue(workItem, operand->get()) << endl;
+#endif
+        logUninitializedCF();
+#ifdef DUMP_SHADOW
+        shadowContext.dump(workItem);
+#endif
+        return false;
+    }
+
+    return true;
+}
+
+// Checks the struct that src points to before it is copied: looks up the
+// shadow memory of the pointer's address space and calls
+// logUninitializedWrite() when isCleanStruct() reports the struct as not
+// clean. Pointers into the constant address space are not checked.
+// NOTE(review): both dyn_cast results are used without a null check, so src
+// is assumed to be a pointer to a struct -- confirm at the call sites.
+void Uninitialized::checkStructMemcpy(const WorkItem *workItem, const llvm::Value *src)
+{
+    const llvm::PointerType *pointerTy = llvm::dyn_cast<llvm::PointerType>(src->getType());
+    const llvm::StructType *pointeeTy = llvm::dyn_cast<llvm::StructType>(pointerTy->getElementType());
+    size_t srcAddr = workItem->getOperand(src).getPointer();
+    unsigned srcAddrSpace = pointerTy->getPointerAddressSpace();
+
+    if(srcAddrSpace == AddrSpaceConstant)
+    {
+        //TODO: Constants should always be clean?!
+        return;
+    }
+
+    ShadowMemory *shadowMemory;
+
+    switch(srcAddrSpace)
+    {
+        case AddrSpacePrivate:
+            shadowMemory = shadowContext.getShadowWorkItem(workItem)->getPrivateMemory();
+            break;
+        case AddrSpaceLocal:
+            shadowMemory = shadowContext.getShadowWorkGroup(workItem->getWorkGroup())->getLocalMemory();
+            break;
+        case AddrSpaceGlobal:
+            shadowMemory = shadowContext.getGlobalMemory();
+            break;
+        default:
+            FATAL_ERROR("Unsupported addressspace %d", srcAddrSpace);
+    }
+
+    if(ShadowContext::isCleanStruct(shadowMemory, srcAddr, pointeeTy))
+    {
+        return;
+    }
+
+    logUninitializedWrite(srcAddrSpace, srcAddr);
+}
+
+// Copies size bytes of shadow memory from src to dst. Implemented as a
+// strided copy of a single element (num = 1, stride = 1).
+void Uninitialized::copyShadowMemory(unsigned dstAddrSpace, size_t dst,
+                                     unsigned srcAddrSpace, size_t src,
+                                     unsigned size,
+                                     const WorkItem *workItem,
+                                     const WorkGroup *workGroup,
+                                     bool unchecked)
+{
+    const size_t numElements = 1;
+    const size_t elementStride = 1;
+
+    copyShadowMemoryStrided(dstAddrSpace, dst, srcAddrSpace, src,
+                            numElements, elementStride, size,
+                            workItem, workGroup, unchecked);
+}
+
+// Copies num elements of size bytes each from the shadow memory at src to the
+// shadow memory at dst. After every element both addresses advance by
+// stride*size bytes. Each element is loaded with loadShadowMemory() into a
+// temporary buffer and written back with storeShadowMemory().
+void Uninitialized::copyShadowMemoryStrided(unsigned dstAddrSpace, size_t dst, unsigned srcAddrSpace, size_t src, size_t num, size_t stride, unsigned size, const WorkItem *workItem, const WorkGroup *workGroup, bool unchecked)
+{
+    // Staging buffer that holds the shadow of one element at a time
+    TypedValue v = {
+        size,
+        1,
+        new unsigned char[size]
+    };
+
+    // The counter has the same type as num: with a narrower counter an element
+    // count above UINT_MAX would wrap and the loop would never terminate.
+    for (size_t i = 0; i < num; i++)
+    {
+        loadShadowMemory(srcAddrSpace, src, v, workItem, workGroup);
+        storeShadowMemory(dstAddrSpace, dst, v, workItem, workGroup, unchecked);
+        src += stride * size;
+        dst += stride * size;
+    }
+
+    delete[] v.data;
+}
+
+// Extracts the plain function name from a mangled name of the form
+// "_Z<length><name>...", e.g. "_Z5fractfPf" yields "fract".
+//
+// The input is returned unchanged when it does not start with "_Z", when no
+// decimal length follows the prefix, or when nothing follows the length.
+std::string Uninitialized::extractUnmangledName(const std::string fullname)
+{
+    if(fullname.compare(0, 2, "_Z") != 0)
+    {
+        return fullname;
+    }
+
+    // Position of the first character after the decimal length prefix.
+    // Kept as size_t so that npos is not mangled by a narrowing conversion.
+    const std::string::size_type start = fullname.find_first_not_of("0123456789", 2);
+
+    if(start == std::string::npos || start == 2)
+    {
+        // Only digits after "_Z", or no length prefix at all
+        return fullname;
+    }
+
+    const int len = atoi(fullname.c_str() + 2);
+
+    if(len <= 0)
+    {
+        return fullname;
+    }
+
+    return fullname.substr(start, (std::string::size_type)len);
+}
+
+// Returns the shadow memory object that backs the given address space:
+//  - private: the private memory of workItem's shadow (workItem required)
+//  - local:   the local memory of the work-group's shadow; when workGroup is
+//             NULL the group is taken from workItem (one of them required)
+//  - global:  the global shadow memory
+// Any other address space (including constant) is a fatal error.
+ShadowMemory* Uninitialized::getShadowMemory(unsigned addrSpace,
+        const WorkItem *workItem, const WorkGroup *workGroup) const
+{
+    if(addrSpace == AddrSpacePrivate)
+    {
+        if(workItem == NULL)
+        {
+            FATAL_ERROR("Work item needed to access private memory!");
+        }
+
+        return shadowContext.getShadowWorkItem(workItem)->getPrivateMemory();
+    }
+
+    if(addrSpace == AddrSpaceLocal)
+    {
+        const WorkGroup *group = workGroup;
+
+        if(group == NULL)
+        {
+            if(workItem == NULL)
+            {
+                FATAL_ERROR("Work item or work group needed to access local memory!");
+            }
+
+            group = workItem->getWorkGroup();
+        }
+
+        return shadowContext.getShadowWorkGroup(group)->getLocalMemory();
+    }
+
+    if(addrSpace == AddrSpaceGlobal)
+    {
+        return shadowContext.getGlobalMemory();
+    }
+
+    FATAL_ERROR("Unsupported addressspace %d", addrSpace);
+}
+
+// Propagates shadow state for a call to an OpenCL builtin function.
+//
+// name is first unmangled with extractUnmangledName() and then matched
+// against the builtins handled below (async copies, wait_group_events,
+// atomics, math functions with pointer outputs, shuffle/select/any,
+// vload/vstore and the image functions). For a handled builtin the shadow of
+// the call result is set via ShadowValues::setValue(), shadow memory is
+// updated for pointer outputs, and a diagnostic is logged when a shadow used
+// as an address, index or condition is not clean.
+//
+//   workItem  work-item executing the call
+//   name      (possibly mangled) name of the called function
+//   CI        the call instruction
+//   result    typed value whose size/num describe the call result
+//
+// Returns true when the builtin was handled here, false otherwise.
+bool Uninitialized::handleBuiltinFunction(const WorkItem *workItem, string name,
+                                                  const llvm::CallInst *CI, const TypedValue result)
+{
+    name = extractUnmangledName(name);
+    ShadowValues *shadowValues = shadowContext.getShadowWorkItem(workItem)->getValues();
+
+    if(name == "async_work_group_copy" ||
+       name == "async_work_group_strided_copy")
+    {
+        int arg = 0;
+
+        // Get src/dest addresses
+        const llvm::Value *dstOp = CI->getArgOperand(arg++);
+        const llvm::Value *srcOp = CI->getArgOperand(arg++);
+        size_t dst = workItem->getOperand(dstOp).getPointer();
+        size_t src = workItem->getOperand(srcOp).getPointer();
+
+        // Get size of copy
+        unsigned elemSize = getTypeSize(dstOp->getType()->getPointerElementType());
+
+        const llvm::Value *numOp = CI->getArgOperand(arg++);
+        uint64_t num = workItem->getOperand(numOp).getUInt();
+        TypedValue numShadow = shadowContext.getValue(workItem, numOp);
+
+        if(!ShadowContext::isCleanValue(numShadow))
+        {
+            logUninitializedIndex();
+        }
+
+        // Get stride
+        size_t stride = 1;
+
+        if(name == "async_work_group_strided_copy")
+        {
+            const llvm::Value *strideOp = CI->getArgOperand(arg++);
+            stride = workItem->getOperand(strideOp).getUInt();
+            TypedValue strideShadow = shadowContext.getValue(workItem, strideOp);
+
+            if(!ShadowContext::isCleanValue(strideShadow))
+            {
+                logUninitializedIndex();
+            }
+        }
+
+        const llvm::Value *eventOp = CI->getArgOperand(arg++);
+        TypedValue eventShadow = shadowContext.getValue(workItem, eventOp);
+
+        // Get type of copy
+        AddressSpace dstAddrSpace = AddrSpaceLocal;
+        AddressSpace srcAddrSpace = AddrSpaceLocal;
+
+        if(dstOp->getType()->getPointerAddressSpace() == AddrSpaceLocal)
+        {
+            srcAddrSpace = AddrSpaceGlobal;
+        }
+        else
+        {
+            dstAddrSpace = AddrSpaceGlobal;
+        }
+
+        copyShadowMemoryStrided(dstAddrSpace, dst, srcAddrSpace, src, num, stride, elemSize, workItem);
+        shadowValues->setValue(CI, eventShadow);
+
+        // Check shadow of src address
+        TypedValue srcShadow = shadowContext.getValue(workItem, srcOp);
+
+        if(!ShadowContext::isCleanValue(srcShadow))
+        {
+            logUninitializedAddress(srcAddrSpace, src, false);
+        }
+
+        // Check shadow of dst address
+        TypedValue dstShadow = shadowContext.getValue(workItem, dstOp);
+
+        if(!ShadowContext::isCleanValue(dstShadow))
+        {
+            logUninitializedAddress(dstAddrSpace, dst);
+        }
+
+        return true;
+    }
+    else if(name == "wait_group_events")
+    {
+        const llvm::Value *Addr = CI->getArgOperand(1);
+        const llvm::Value *Num = CI->getArgOperand(0);
+        uint64_t num = workItem->getOperand(Num).getUInt();
+        size_t address = workItem->getOperand(Addr).getPointer();
+
+        TypedValue numShadow = shadowContext.getValue(workItem, Num);
+        TypedValue eventShadow = {
+            sizeof(size_t),
+            1,
+            new unsigned char[sizeof(size_t)]
+        };
+
+        // Check shadow for the number of events
+        if(!ShadowContext::isCleanValue(numShadow))
+        {
+            logUninitializedCF();
+        }
+
+        // Walk the event list with a separate cursor so that 'address' keeps
+        // pointing at the start of the list for the report below.
+        size_t eventAddress = address;
+
+        for(unsigned i = 0; i < num; ++i)
+        {
+            loadShadowMemory(AddrSpacePrivate, eventAddress, eventShadow, workItem);
+
+            if(!ShadowContext::isCleanValue(eventShadow))
+            {
+                logUninitializedCF();
+                delete[] eventShadow.data;
+                return true;
+            }
+
+            eventAddress += sizeof(size_t);
+        }
+
+        delete[] eventShadow.data;
+
+        // Check shadow of address
+        TypedValue addrShadow = shadowContext.getValue(workItem, Addr);
+
+        if(!ShadowContext::isCleanValue(addrShadow))
+        {
+            logUninitializedAddress(AddrSpacePrivate, address, false);
+        }
+
+        return true;
+    }
+    else if(name.compare(0, 6, "atomic") == 0)
+    {
+        if(name.compare(6, string::npos, "cmpxchg") == 0)
+        {
+            const llvm::Value *Addr = CI->getArgOperand(0);
+            unsigned addrSpace = Addr->getType()->getPointerAddressSpace();
+            size_t address = workItem->getOperand(Addr).getPointer();
+            uint32_t cmp = workItem->getOperand(CI->getArgOperand(1)).getUInt();
+            uint32_t old = workItem->getOperand(CI).getUInt();
+            TypedValue argShadow = shadowContext.getValue(workItem, CI->getArgOperand(2));
+            TypedValue cmpShadow = shadowContext.getValue(workItem, CI->getArgOperand(1));
+            TypedValue oldShadow = {
+                4,
+                1,
+                shadowContext.getMemoryPool()->alloc(4)
+            };
+
+            // Check shadow of the condition
+            if(!ShadowContext::isCleanValue(cmpShadow))
+            {
+                logUninitializedCF();
+            }
+
+            // Perform cmpxchg
+            if(addrSpace == AddrSpaceGlobal)
+            {
+                shadowContext.getGlobalMemory()->lock(address);
+            }
+
+            loadShadowMemory(addrSpace, address, oldShadow, workItem);
+
+            if(old == cmp)
+            {
+                storeShadowMemory(addrSpace, address, argShadow, workItem);
+            }
+
+            if(addrSpace == AddrSpaceGlobal)
+            {
+                shadowContext.getGlobalMemory()->unlock(address);
+            }
+
+            shadowValues->setValue(CI, oldShadow);
+
+            // Check shadow of address
+            TypedValue addrShadow = shadowContext.getValue(workItem, Addr);
+
+            if(!ShadowContext::isCleanValue(addrShadow))
+            {
+                logUninitializedAddress(addrSpace, address);
+            }
+
+            return true;
+        }
+        else
+        {
+            SimpleOrAtomic(workItem, CI);
+            return true;
+        }
+    }
+    else if(name == "fract" ||
+            name == "modf" ||
+            name == "sincos")
+    {
+        const llvm::Value *Addr = CI->getArgOperand(1);
+        unsigned addrSpace = Addr->getType()->getPointerAddressSpace();
+        size_t iptr = workItem->getOperand(Addr).getPointer();
+        TypedValue argShadow = shadowContext.getValue(workItem, CI->getArgOperand(0));
+        TypedValue newElemShadow;
+        TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+        for(unsigned i = 0; i < result.num; ++i)
+        {
+            if(!ShadowContext::isCleanValue(argShadow, i))
+            {
+                newElemShadow = ShadowContext::getPoisonedValue(result.size);
+            }
+            else
+            {
+                newElemShadow = ShadowContext::getCleanValue(result.size);
+            }
+
+            // Each vector element gets its own slot in the result shadow
+            memcpy(newShadow.data + i*result.size, newElemShadow.data, result.size);
+        }
+
+        // The work-item is passed so that private output pointers resolve
+        storeShadowMemory(addrSpace, iptr, newShadow, workItem);
+        shadowValues->setValue(CI, newShadow);
+
+        // Check shadow of address
+        TypedValue addrShadow = shadowContext.getValue(workItem, Addr);
+
+        if(!ShadowContext::isCleanValue(addrShadow))
+        {
+            logUninitializedAddress(addrSpace, iptr);
+        }
+
+        return true;
+    }
+    else if(name == "frexp" ||
+            name == "lgamma_r")
+    {
+        const llvm::Value *Addr = CI->getArgOperand(1);
+        unsigned addrSpace = Addr->getType()->getPointerAddressSpace();
+        size_t iptr = workItem->getOperand(Addr).getPointer();
+        TypedValue argShadow = shadowContext.getValue(workItem, CI->getArgOperand(0));
+        TypedValue newElemShadow;
+        TypedValue newElemIntShadow;
+        TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+        // Shadow of the integer output: one 4-byte element per vector lane,
+        // with a buffer large enough to hold all of them.
+        TypedValue newIntShadow = {
+            4,
+            newShadow.num,
+            shadowContext.getMemoryPool()->alloc(4 * newShadow.num)
+        };
+
+        for(unsigned i = 0; i < result.num; ++i)
+        {
+            if(!ShadowContext::isCleanValue(argShadow, i))
+            {
+                newElemShadow = ShadowContext::getPoisonedValue(result.size);
+                newElemIntShadow = ShadowContext::getPoisonedValue(4);
+            }
+            else
+            {
+                newElemShadow = ShadowContext::getCleanValue(result.size);
+                newElemIntShadow = ShadowContext::getCleanValue(4);
+            }
+
+            memcpy(newIntShadow.data + i*4, newElemIntShadow.data, 4);
+            memcpy(newShadow.data + i*result.size, newElemShadow.data, result.size);
+        }
+
+        storeShadowMemory(addrSpace, iptr, newIntShadow, workItem);
+        shadowValues->setValue(CI, newShadow);
+
+        // Check shadow of address
+        TypedValue addrShadow = shadowContext.getValue(workItem, Addr);
+
+        if(!ShadowContext::isCleanValue(addrShadow))
+        {
+            logUninitializedAddress(addrSpace, iptr);
+        }
+
+        return true;
+    }
+    else if(name == "remquo")
+    {
+        const llvm::Value *Addr = CI->getArgOperand(2);
+        unsigned addrSpace = Addr->getType()->getPointerAddressSpace();
+        size_t iptr = workItem->getOperand(Addr).getPointer();
+        TypedValue arg0Shadow = shadowContext.getValue(workItem, CI->getArgOperand(0));
+        TypedValue arg1Shadow = shadowContext.getValue(workItem, CI->getArgOperand(1));
+        TypedValue newElemShadow;
+        TypedValue newElemIntShadow;
+        TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+        for(unsigned i = 0; i < result.num; ++i)
+        {
+            if(!ShadowContext::isCleanValue(arg0Shadow, i) || !ShadowContext::isCleanValue(arg1Shadow, i))
+            {
+                newElemShadow = ShadowContext::getPoisonedValue(result.size);
+                newElemIntShadow = ShadowContext::getPoisonedValue(4);
+            }
+            else
+            {
+                newElemShadow = ShadowContext::getCleanValue(result.size);
+                newElemIntShadow = ShadowContext::getCleanValue(4);
+            }
+
+            // The quotient is written as 4-byte integers, so a 4-byte shadow
+            // is stored per lane regardless of the size of the result type.
+            storeShadowMemory(addrSpace, iptr + i*4, newElemIntShadow, workItem);
+            memcpy(newShadow.data + i*result.size, newElemShadow.data, result.size);
+        }
+
+        shadowValues->setValue(CI, newShadow);
+
+        // Check shadow of address
+        TypedValue addrShadow = shadowContext.getValue(workItem, Addr);
+
+        if(!ShadowContext::isCleanValue(addrShadow))
+        {
+            logUninitializedAddress(addrSpace, iptr);
+        }
+
+        return true;
+    }
+    else if(name == "shuffle")
+    {
+        TypedValue mask = workItem->getOperand(CI->getArgOperand(1));
+        TypedValue maskShadow = shadowContext.getValue(workItem, CI->getArgOperand(1));
+        TypedValue shadow = shadowContext.getValue(workItem, CI->getArgOperand(0));
+        TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+        for(unsigned i = 0; i < newShadow.num; ++i)
+        {
+            if(!ShadowContext::isCleanValue(maskShadow, i))
+            {
+                TypedValue v = ShadowContext::getPoisonedValue(newShadow.size);
+                memcpy(newShadow.data + i*newShadow.size, v.data, newShadow.size);
+            }
+            else
+            {
+                size_t srcOffset = mask.getUInt(i) * shadow.size;
+                memcpy(newShadow.data + i*newShadow.size, shadow.data + srcOffset, newShadow.size);
+            }
+        }
+
+        shadowValues->setValue(CI, newShadow);
+        return true;
+    }
+    else if(name == "shuffle2")
+    {
+        TypedValue mask = workItem->getOperand(CI->getArgOperand(2));
+        TypedValue maskShadow = shadowContext.getValue(workItem, CI->getArgOperand(2));
+        TypedValue shadow[] = {shadowContext.getValue(workItem, CI->getArgOperand(0)),
+                               shadowContext.getValue(workItem, CI->getArgOperand(1))};
+        TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+        for (unsigned i = 0; i < newShadow.num; ++i)
+        {
+            uint64_t m = 1;
+
+            if(CI->getArgOperand(0)->getType()->isVectorTy())
+            {
+                m = CI->getArgOperand(0)->getType()->getVectorNumElements();
+            }
+
+            // Mask values of m and above select from the second input vector
+            uint64_t src = 0;
+            uint64_t index = mask.getUInt(i);
+
+            if(index >= m)
+            {
+                index -= m;
+                src = 1;
+            }
+
+            if(!ShadowContext::isCleanValue(maskShadow, i))
+            {
+                TypedValue v = ShadowContext::getPoisonedValue(newShadow.size);
+                memcpy(newShadow.data + i*newShadow.size, v.data, newShadow.size);
+            }
+            else
+            {
+                size_t srcOffset = index * shadow[src].size;
+                memcpy(newShadow.data + i*newShadow.size, shadow[src].data + srcOffset, newShadow.size);
+            }
+        }
+
+        shadowValues->setValue(CI, newShadow);
+        return true;
+    }
+    else if(name == "any")
+    {
+        const llvm::Value *argOp = CI->getArgOperand(0);
+        TypedValue shadow = shadowContext.getValue(workItem, argOp);
+
+        unsigned num = 1;
+        if(argOp->getType()->isVectorTy())
+        {
+            num = argOp->getType()->getVectorNumElements();
+        }
+
+        // The result is clean as soon as one element of the argument is clean
+        for(unsigned i = 0; i < num; ++i)
+        {
+            if(ShadowContext::isCleanValue(shadow, i))
+            {
+                shadowValues->setValue(CI, ShadowContext::getCleanValue(result.size));
+                return true;
+            }
+        }
+
+        shadowValues->setValue(CI, ShadowContext::getPoisonedValue(result.size));
+        return true;
+    }
+    else if(name == "select")
+    {
+        TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+        TypedValue shadow[] = {shadowContext.getValue(workItem, CI->getArgOperand(0)),
+                               shadowContext.getValue(workItem, CI->getArgOperand(1))};
+        TypedValue selectShadow = shadowContext.getValue(workItem, CI->getArgOperand(2));
+
+        for(unsigned i = 0; i < newShadow.num; ++i)
+        {
+            // Vectors select on the most significant bit, scalars on non-zero
+            int64_t c = workItem->getOperand(CI->getArgOperand(2)).getSInt(i);
+            uint64_t src = ((newShadow.num > 1) ? c & INT64_MIN : c) ? 1 : 0;
+
+            if(!ShadowContext::isCleanValue(selectShadow, i))
+            {
+                TypedValue v = ShadowContext::getPoisonedValue(newShadow.size);
+                memcpy(newShadow.data + i*newShadow.size, v.data, newShadow.size);
+            }
+            else
+            {
+                size_t srcOffset = i * shadow[src].size;
+                memcpy(newShadow.data + i*newShadow.size, shadow[src].data + srcOffset, newShadow.size);
+            }
+        }
+
+        shadowValues->setValue(CI, newShadow);
+        return true;
+    }
+    else if(name.compare(0, 10, "vload_half") == 0 ||
+            name.compare(0, 11, "vloada_half") == 0)
+    {
+        const llvm::Value *BaseOp = CI->getArgOperand(1);
+        const llvm::Value *OffsetOp = CI->getArgOperand(0);
+        size_t base = workItem->getOperand(BaseOp).getPointer();
+        unsigned int addressSpace = BaseOp->getType()->getPointerAddressSpace();
+        uint64_t offset = workItem->getOperand(OffsetOp).getUInt();
+
+        size_t address;
+
+        if(name.compare(0, 6, "vloada") == 0 && result.num == 3)
+        {
+            address = base + offset * sizeof(cl_half) * 4;
+        }
+        else
+        {
+            address = base + offset * sizeof(cl_half) * result.num;
+        }
+
+        TypedValue halfShadow = {
+            sizeof(cl_half),
+            result.num,
+            shadowContext.getMemoryPool()->alloc(2 * result.num)
+        };
+        TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+        loadShadowMemory(addressSpace, address, halfShadow, workItem);
+
+        TypedValue pv = ShadowContext::getPoisonedValue(newShadow.size);
+        TypedValue cv = ShadowContext::getCleanValue(newShadow.size);
+
+        // Convert to float shadows
+        for(unsigned i = 0; i < newShadow.num; ++i)
+        {
+            if(!ShadowContext::isCleanValue(halfShadow, i))
+            {
+                memcpy(newShadow.data + i*newShadow.size, pv.data, newShadow.size);
+            }
+            else
+            {
+                memcpy(newShadow.data + i*newShadow.size, cv.data, newShadow.size);
+            }
+        }
+
+        shadowValues->setValue(CI, newShadow);
+
+        // Check shadow of address
+        TypedValue baseShadow = shadowContext.getValue(workItem, BaseOp);
+        TypedValue offsetShadow = shadowContext.getValue(workItem, OffsetOp);
+
+        if(!ShadowContext::isCleanValue(baseShadow) ||
+           !ShadowContext::isCleanValue(offsetShadow))
+        {
+            logUninitializedAddress(addressSpace, address, false);
+        }
+
+        return true;
+    }
+    else if(name.compare(0, 11, "vstore_half") == 0 ||
+            name.compare(0, 12, "vstorea_half") == 0)
+    {
+        const llvm::Value *value = CI->getArgOperand(0);
+        unsigned size = getTypeSize(value->getType());
+
+        if(isVector3(value))
+        {
+            // 3-element vectors are same size as 4-element vectors,
+            // but vstore address offset shouldn't use this.
+            size = (size / 4) * 3;
+        }
+
+        const llvm::Value *BaseOp = CI->getArgOperand(2);
+        const llvm::Value *OffsetOp = CI->getArgOperand(1);
+        size_t base = workItem->getOperand(BaseOp).getPointer();
+        unsigned int addressSpace = BaseOp->getType()->getPointerAddressSpace();
+        uint64_t offset = workItem->getOperand(OffsetOp).getUInt();
+
+        // Convert to halfs
+        TypedValue shadow = shadowContext.getValue(workItem, value);
+        unsigned num = size / sizeof(float);
+        size = num * sizeof(cl_half);
+        TypedValue halfShadow = {
+            sizeof(cl_half),
+            num,
+            shadowContext.getMemoryPool()->alloc(2 * num)
+        };
+
+        TypedValue pv = ShadowContext::getPoisonedValue(halfShadow.size);
+        TypedValue cv = ShadowContext::getCleanValue(halfShadow.size);
+
+        for(unsigned i = 0; i < num; i++)
+        {
+            if(!ShadowContext::isCleanValue(shadow, i))
+            {
+                memcpy(halfShadow.data + i*halfShadow.size, pv.data, halfShadow.size);
+            }
+            else
+            {
+                memcpy(halfShadow.data + i*halfShadow.size, cv.data, halfShadow.size);
+            }
+        }
+
+        size_t address;
+        if(name.compare(0, 7, "vstorea") == 0 && num == 3)
+        {
+            address = base + offset * sizeof(cl_half) * 4;
+        }
+        else
+        {
+            address = base + offset * sizeof(cl_half) * num;
+        }
+
+        storeShadowMemory(addressSpace, address, halfShadow, workItem);
+
+        // Check shadow of address
+        TypedValue baseShadow = shadowContext.getValue(workItem, BaseOp);
+        TypedValue offsetShadow = shadowContext.getValue(workItem, OffsetOp);
+
+        if(!ShadowContext::isCleanValue(baseShadow) ||
+           !ShadowContext::isCleanValue(offsetShadow))
+        {
+            logUninitializedAddress(addressSpace, address);
+        }
+        return true;
+    }
+    else if(name.compare(0, 5, "vload") == 0)
+    {
+        TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+        const llvm::Value *BaseOp = CI->getArgOperand(1);
+        const llvm::Value *OffsetOp = CI->getArgOperand(0);
+        unsigned int addressSpace = BaseOp->getType()->getPointerAddressSpace();
+        size_t base = workItem->getOperand(BaseOp).getPointer();
+        uint64_t offset = workItem->getOperand(OffsetOp).getUInt();
+
+        size_t size = newShadow.size*newShadow.num;
+        size_t address = base + offset*size;
+        loadShadowMemory(addressSpace, address, newShadow, workItem);
+
+        shadowValues->setValue(CI, newShadow);
+
+        // Check shadow of address
+        TypedValue baseShadow = shadowContext.getValue(workItem, BaseOp);
+        TypedValue offsetShadow = shadowContext.getValue(workItem, OffsetOp);
+
+        if(!ShadowContext::isCleanValue(baseShadow) ||
+           !ShadowContext::isCleanValue(offsetShadow))
+        {
+            logUninitializedAddress(addressSpace, address, false);
+        }
+
+        return true;
+    }
+    else if(name.compare(0, 6, "vstore") == 0)
+    {
+        const llvm::Value *value = CI->getArgOperand(0);
+        unsigned size = getTypeSize(value->getType());
+
+        if(isVector3(value))
+        {
+            // 3-element vectors are same size as 4-element vectors,
+            // but vstore address offset shouldn't use this.
+            size = (size/4) * 3;
+        }
+
+        const llvm::Value *BaseOp = CI->getArgOperand(2);
+        const llvm::Value *OffsetOp = CI->getArgOperand(1);
+        unsigned int addressSpace = BaseOp->getType()->getPointerAddressSpace();
+        size_t base = workItem->getOperand(BaseOp).getPointer();
+        uint64_t offset = workItem->getOperand(OffsetOp).getUInt();
+
+        size_t address = base + offset*size;
+        TypedValue shadow = shadowContext.getValue(workItem, value);
+        storeShadowMemory(addressSpace, address, shadow, workItem);
+
+        // Check shadow of address
+        TypedValue baseShadow = shadowContext.getValue(workItem, BaseOp);
+        TypedValue offsetShadow = shadowContext.getValue(workItem, OffsetOp);
+
+        if(!ShadowContext::isCleanValue(baseShadow) ||
+           !ShadowContext::isCleanValue(offsetShadow))
+        {
+            logUninitializedAddress(addressSpace, address);
+        }
+
+        return true;
+    }
+    else if(name == "read_imagef" ||
+            name == "read_imagei" ||
+            name == "read_imageui")
+    {
+        Image *image = *(Image**)(workItem->getOperand(CI->getArgOperand(0)).data);
+        TypedValue shadowImage = shadowContext.getValue(workItem, CI->getArgOperand(0));
+        TypedValue newShadow;
+
+        //FIXME: The new shadow should be loaded from memory
+        //and not generated based on the image description
+        //However, this currently requires to duplicate all functionality
+        //in WorkItemBuiltins.cpp for the image function
+        //Has to be changed in combination with the write functions
+        size_t address = image->address;
+
+        if(!ShadowContext::isCleanImage(shadowImage))
+        {
+            newShadow = ShadowContext::getPoisonedValue(result);
+        }
+        else
+        {
+            newShadow = ShadowContext::getCleanValue(result);
+        }
+
+        shadowValues->setValue(CI, newShadow);
+
+        // Check image
+        if(!ShadowContext::isCleanImageAddress(shadowImage))
+        {
+            logUninitializedAddress(AddrSpaceGlobal, address, false);
+        }
+
+        return true;
+    }
+    else if(name == "write_imagef" ||
+            name == "write_imagei" ||
+            name == "write_imageui")
+    {
+        Image *image = *(Image**)(workItem->getOperand(CI->getArgOperand(0)).data);
+        TypedValue shadowImage = shadowContext.getValue(workItem, CI->getArgOperand(0));
+
+        //FIXME: The actual shadow of the image should be stored to memory
+        //However, this currently requires to duplicate all functionality
+        //in WorkItemBuiltins.cpp for the image function
+        //Has to be changed in combination with the read functions
+        size_t address = image->address;
+
+        // Check image
+        if(!ShadowContext::isCleanImageAddress(shadowImage))
+        {
+            logUninitializedAddress(AddrSpaceGlobal, address);
+        }
+
+        return true;
+    }
+    else if(name.compare(0, 10, "get_image_") == 0)
+    {
+        TypedValue shadowImage = shadowContext.getValue(workItem, CI->getArgOperand(0));
+        TypedValue newShadow = {
+            result.size,
+            result.num,
+            shadowContext.getMemoryPool()->alloc(result.size * result.num)
+        };
+
+        // The shadow of the image argument is read as an Image, and the
+        // queried field of that shadow becomes the shadow of the result.
+        if(name == "get_image_array_size")
+        {
+            newShadow.setUInt(((Image*)shadowImage.data)->desc.image_array_size);
+        }
+        else if(name == "get_image_dim")
+        {
+            newShadow.setUInt(((Image*)shadowImage.data)->desc.image_width, 0);
+            newShadow.setUInt(((Image*)shadowImage.data)->desc.image_height, 1);
+
+            if(newShadow.num > 2)
+            {
+                newShadow.setUInt(((Image*)shadowImage.data)->desc.image_depth, 2);
+                newShadow.setUInt(0, 3);
+            }
+        }
+        else if(name == "get_image_depth")
+        {
+            newShadow.setUInt(((Image*)shadowImage.data)->desc.image_depth);
+        }
+        else if(name == "get_image_height")
+        {
+            newShadow.setUInt(((Image*)shadowImage.data)->desc.image_height);
+        }
+        else if(name == "get_image_width")
+        {
+            newShadow.setUInt(((Image*)shadowImage.data)->desc.image_width);
+        }
+        else if(name == "get_image_channel_order")
+        {
+            newShadow.setUInt(((Image*)shadowImage.data)->format.image_channel_order);
+        }
+        else if(name == "get_image_channel_data_type")
+        {
+            newShadow.setUInt(((Image*)shadowImage.data)->format.image_channel_data_type);
+        }
+
+        shadowValues->setValue(CI, newShadow);
+        return true;
+    }
+
+    return false;
+}
+
+// Updates the shadow state for a call to an LLVM intrinsic.
+//
+// workItem: the work-item that executed the intrinsic call
+// I:        the intrinsic call instruction
+//
+// fmuladd and bswap are delegated to SimpleOr. memcpy and memset update shadow
+// memory and report an uninitialized address if the shadow of a pointer
+// operand is not clean. Debug and lifetime markers are ignored. Any other
+// intrinsic is reported through FATAL_ERROR.
+void Uninitialized::handleIntrinsicInstruction(const WorkItem *workItem, const llvm::IntrinsicInst *I)
+{
+    switch (I->getIntrinsicID())
+    {
+        case llvm::Intrinsic::fmuladd:
+        {
+            SimpleOr(workItem, I);
+            break;
+        }
+        case llvm::Intrinsic::memcpy:
+        {
+            // The intrinsic ID guarantees the dynamic type, so use the checked cast
+            const llvm::MemCpyInst *memcpyInst = llvm::cast<llvm::MemCpyInst>(I);
+            const llvm::Value *dstOp = memcpyInst->getDest();
+            const llvm::Value *srcOp = memcpyInst->getSource();
+            size_t dst = workItem->getOperand(dstOp).getPointer();
+            size_t src = workItem->getOperand(srcOp).getPointer();
+            size_t size = workItem->getOperand(memcpyInst->getLength()).getUInt();
+            unsigned dstAddrSpace = memcpyInst->getDestAddressSpace();
+            unsigned srcAddrSpace = memcpyInst->getSourceAddressSpace();
+
+            // The memcpy source is always a pointer; cast<> (unlike dyn_cast<>)
+            // never yields a null pointer that would be dereferenced below
+            const llvm::PointerType *srcPtrTy = llvm::cast<llvm::PointerType>(srcOp->getType());
+
+            if(dstAddrSpace != AddrSpacePrivate && srcPtrTy->getElementType()->isStructTy())
+            {
+                checkStructMemcpy(workItem, srcOp);
+            }
+
+            copyShadowMemory(dstAddrSpace, dst, srcAddrSpace, src, size, workItem, NULL, true);
+
+            // Check shadow of src address
+            TypedValue srcShadow = shadowContext.getValue(workItem, srcOp);
+
+            if(!ShadowContext::isCleanValue(srcShadow))
+            {
+                logUninitializedAddress(srcAddrSpace, src, false);
+            }
+
+            // Check shadow of dst address
+            TypedValue dstShadow = shadowContext.getValue(workItem, dstOp);
+
+            if(!ShadowContext::isCleanValue(dstShadow))
+            {
+                logUninitializedAddress(dstAddrSpace, dst);
+            }
+            break;
+        }
+        case llvm::Intrinsic::bswap:
+        {
+            // TODO: byte-level accuracy
+            SimpleOr(workItem, I);
+            break;
+        }
+        case llvm::Intrinsic::memset:
+        {
+            const llvm::MemSetInst *memsetInst = llvm::cast<llvm::MemSetInst>(I);
+            const llvm::Value *Addr = memsetInst->getDest();
+            size_t dst = workItem->getOperand(Addr).getPointer();
+            unsigned size = workItem->getOperand(memsetInst->getLength()).getUInt();
+            unsigned addrSpace = memsetInst->getDestAddressSpace();
+
+            // Temporary buffer holding the shadow bytes to be written
+            TypedValue shadowValue = {
+                size,
+                1,
+                new unsigned char[size]
+            };
+
+            // Fill the buffer with the shadow of the value operand (argument 1)
+            memset(shadowValue.data, shadowContext.getValue(workItem, memsetInst->getArgOperand(1)).getUInt(), size);
+            storeShadowMemory(addrSpace, dst, shadowValue, workItem, NULL, true);
+
+            delete[] shadowValue.data;
+
+            // Check shadow of address
+            TypedValue addrShadow = shadowContext.getValue(workItem, Addr);
+
+            if(!ShadowContext::isCleanValue(addrShadow))
+            {
+                logUninitializedAddress(addrSpace, dst);
+            }
+            break;
+        }
+        case llvm::Intrinsic::dbg_declare:
+            //Do nothing
+            break;
+        case llvm::Intrinsic::dbg_value:
+            //Do nothing
+            break;
+        case llvm::Intrinsic::lifetime_end:
+            //Do nothing
+            break;
+        case llvm::Intrinsic::lifetime_start:
+            //Do nothing
+            break;
+        default:
+            FATAL_ERROR("Unsupported intrinsic %s", llvm::Intrinsic::getName(I->getIntrinsicID()).data());
+    }
+}
+
+// Called when the host stores data to a memory object. For global memory,
+// the shadow of the written range [address, address + size) is allocated
+// and set to a clean value. The stored bytes themselves are not inspected.
+void Uninitialized::hostMemoryStore(const Memory *memory,
+                             size_t address, size_t size,
+                             const uint8_t *storeData)
+{
+    // Only stores to global memory are tracked here
+    if(memory->getAddressSpace() != AddrSpaceGlobal)
+    {
+        return;
+    }
+
+    TypedValue cleanShadow = ShadowContext::getCleanValue(size);
+    allocAndStoreShadowMemory(AddrSpaceGlobal, address, cleanShadow);
+}
+
+// Updates the shadow state of a work-item after it has executed an instruction.
+//
+// workItem:    the work-item that executed the instruction
+// instruction: the instruction that was executed
+// result:      the value produced by the instruction; used as a template for
+//              the size and element count of the new shadow value
+//
+// Dispatches on the opcode: most arithmetic, comparison and conversion
+// instructions are delegated to VectorOr/SimpleOr, while memory, call, shift,
+// aggregate and vector instructions are handled explicitly below. Opcodes
+// without a case are reported through FATAL_ERROR.
+void Uninitialized::instructionExecuted(const WorkItem *workItem,
+                                        const llvm::Instruction *instruction,
+                                        const TypedValue& result)
+{
+#ifdef DUMP_SHADOW
+    cout << "++++++++++++++++++++++++++++++++++++++++++++" << endl;
+    instruction->dump();
+#endif
+
+    ShadowWorkItem *shadowWorkItem = shadowContext.getShadowWorkItem(workItem);
+    ShadowValues *shadowValues = shadowWorkItem->getValues();
+
+    switch(instruction->getOpcode())
+    {
+        case llvm::Instruction::Add:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::Alloca:
+        {
+            const llvm::AllocaInst *allocaInst = ((const llvm::AllocaInst*)instruction);
+
+            size_t address = result.getPointer();
+
+            // The pointer returned by alloca is clean...
+            shadowValues->setValue(instruction, ShadowContext::getCleanValue(instruction));
+
+            // ...but the private memory it points to starts out poisoned
+            TypedValue v = ShadowContext::getPoisonedValue(allocaInst->getAllocatedType());
+            allocAndStoreShadowMemory(AddrSpacePrivate, address, v, workItem);
+            break;
+        }
+        case llvm::Instruction::And:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::AShr:
+        {
+            TypedValue S0 = shadowContext.getValue(workItem, instruction->getOperand(0));
+            TypedValue S1 = shadowContext.getValue(workItem, instruction->getOperand(1));
+
+            // An unclean shift amount poisons the whole result
+            if(!ShadowContext::isCleanValue(S1))
+            {
+                shadowValues->setValue(instruction, ShadowContext::getPoisonedValue(instruction));
+            }
+            else
+            {
+                // Otherwise shift the shadow of operand 0 by the actual shift amount
+                TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+                TypedValue Shift = workItem->getOperand(instruction->getOperand(1));
+                uint64_t shiftMask = (S0.num > 1 ? S0.size : max((size_t)S0.size, sizeof(uint32_t))) * 8 - 1;
+
+                for (unsigned i = 0; i < S0.num; i++)
+                {
+                    newShadow.setUInt(S0.getSInt(i) >> (Shift.getUInt(i) & shiftMask), i);
+                }
+
+                shadowValues->setValue(instruction, newShadow);
+            }
+
+            break;
+        }
+        case llvm::Instruction::BitCast:
+        {
+            TypedValue shadow = shadowContext.getValue(workItem, instruction->getOperand(0));
+            TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+            // Shadow bytes are copied unchanged into a value of the result's shape
+            memcpy(newShadow.data, shadow.data, newShadow.size*newShadow.num);
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        case llvm::Instruction::Br:
+        {
+            checkAllOperandsDefined(workItem, instruction);
+#ifdef DUMP_SHADOW
+            // Insert pseudo value to keep numbering
+            shadowValues->setValue(instruction, ShadowContext::getCleanValue(3));
+#endif
+            break;
+        }
+        case llvm::Instruction::Call:
+        {
+            const llvm::CallInst *callInst = ((const llvm::CallInst*)instruction);
+
+            // For inline asm, do the usual thing: check argument shadow and mark all
+            // outputs as clean. Note that any side effects of the inline asm that are
+            // not immediately visible in its constraints are not handled.
+            //
+            // This has to happen before the callee is resolved: the called value of
+            // an inline asm call is an llvm::InlineAsm, which is not an llvm::User,
+            // so reading its operand 0 (as done for indirect calls) is invalid.
+            if (callInst->isInlineAsm())
+            {
+                checkAllOperandsDefined(workItem, instruction);
+                shadowValues->setValue(instruction, ShadowContext::getCleanValue(instruction));
+                break;
+            }
+
+            const llvm::Function *function = callInst->getCalledFunction();
+
+            // Check for indirect function calls
+            if (!function)
+            {
+                // Resolve indirect function pointer
+                const llvm::Value *func = callInst->getCalledValue();
+                const llvm::Value *funcPtr = ((const llvm::User*)func)->getOperand(0);
+                function = (const llvm::Function*)funcPtr;
+            }
+
+            if(const llvm::IntrinsicInst *II = llvm::dyn_cast<const llvm::IntrinsicInst>(instruction))
+            {
+                handleIntrinsicInstruction(workItem, II);
+                break;
+            }
+
+            if(function->isDeclaration())
+            {
+                if(!handleBuiltinFunction(workItem, function->getName().str(), callInst, result))
+                {
+                    // Handle external function calls
+                    checkAllOperandsDefined(workItem, instruction);
+
+                    if(callInst->getType()->isSized())
+                    {
+                        // Set return value only if function is non-void
+                        shadowValues->setValue(instruction, ShadowContext::getCleanValue(instruction));
+                    }
+                }
+                break;
+            }
+
+            assert(!function->isVarArg() && "Variadic functions are not supported!");
+
+            assert(!llvm::isa<const llvm::IntrinsicInst>(instruction) && "intrinsics are handled elsewhere");
+
+            // Fresh values for function
+            ShadowFrame *values = shadowValues->createCleanShadowFrame();
+
+            llvm::Function::const_arg_iterator argItr;
+            for (argItr = function->arg_begin(); argItr != function->arg_end(); argItr++)
+            {
+                const llvm::Value *Val = callInst->getArgOperand(argItr->getArgNo());
+
+                if (!Val->getType()->isSized())
+                {
+                    continue;
+                }
+
+                if(argItr->hasByValAttr())
+                {
+                    assert(Val->getType()->isPointerTy() && "ByVal argument is not a pointer!");
+                    // Make new copy of shadow in private memory
+                    size_t origShadowAddress = workItem->getOperand(Val).getPointer();
+                    size_t newShadowAddress = workItem->getOperand(&*argItr).getPointer();
+                    ShadowMemory *mem = shadowWorkItem->getPrivateMemory();
+                    unsigned char *origShadowData = (unsigned char*)mem->getPointer(origShadowAddress);
+                    size_t size = getTypeSize(argItr->getType()->getPointerElementType());
+
+                    // Set new shadow memory
+                    TypedValue v = ShadowContext::getCleanValue(size);
+                    memcpy(v.data, origShadowData, size);
+                    allocAndStoreShadowMemory(AddrSpacePrivate, newShadowAddress, v, workItem);
+                    values->setValue(&*argItr, ShadowContext::getCleanValue(&*argItr));
+                }
+                else
+                {
+                    // Pass the shadow of the actual argument on to the formal argument
+                    TypedValue newShadow = shadowContext.getMemoryPool()->clone(shadowContext.getValue(workItem, Val));
+                    values->setValue(&*argItr, newShadow);
+                }
+            }
+
+            // Now, get the shadow for the RetVal.
+            if(callInst->getType()->isSized())
+            {
+                values->setCall(callInst);
+            }
+
+            shadowValues->pushFrame(values);
+
+            break;
+        }
+        case llvm::Instruction::ExtractElement:
+        {
+            const llvm::ExtractElementInst *extractInst = ((const llvm::ExtractElementInst*)instruction);
+
+            TypedValue indexShadow = shadowContext.getValue(workItem, extractInst->getIndexOperand());
+
+            if(!ShadowContext::isCleanValue(indexShadow))
+            {
+                logUninitializedIndex();
+            }
+
+            TypedValue vectorShadow = shadowContext.getValue(workItem, extractInst->getVectorOperand());
+            TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+            // Copy the shadow of the selected element
+            unsigned index = workItem->getOperand(extractInst->getIndexOperand()).getUInt();
+            memcpy(newShadow.data, vectorShadow.data + newShadow.size*index, newShadow.size);
+
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        case llvm::Instruction::ExtractValue:
+        {
+            const llvm::ExtractValueInst *extractInst = ((const llvm::ExtractValueInst*)instruction);
+
+            const llvm::Value *Agg = extractInst->getAggregateOperand();
+            TypedValue ResShadow = shadowContext.getMemoryPool()->clone(result);
+
+            llvm::ArrayRef<unsigned int> indices = extractInst->getIndices();
+
+            // Compute offset for target value
+            int offset = 0;
+            const llvm::Type *type = Agg->getType();
+            for (unsigned i = 0; i < indices.size(); i++)
+            {
+                if (type->isArrayTy())
+                {
+                    type = type->getArrayElementType();
+                    offset += getTypeSize(type) * indices[i];
+                }
+                else if (type->isStructTy())
+                {
+                    offset += getStructMemberOffset((const llvm::StructType*)type, indices[i]);
+                    type = type->getStructElementType(indices[i]);
+                }
+                else
+                {
+                    FATAL_ERROR("Unsupported aggregate type: %d", type->getTypeID())
+                }
+            }
+
+            // Copy target value to result
+            memcpy(ResShadow.data, shadowContext.getValue(workItem, Agg).data + offset, getTypeSize(type));
+
+            shadowValues->setValue(instruction, ResShadow);
+            break;
+        }
+        case llvm::Instruction::FAdd:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::FCmp:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::FDiv:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::FMul:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::FPExt:
+        {
+            SimpleOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::FPToSI:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::FPToUI:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::FPTrunc:
+        {
+            SimpleOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::FRem:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::FSub:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::GetElementPtr:
+        {
+            SimpleOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::ICmp:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::InsertElement:
+        {
+            TypedValue indexShadow = shadowContext.getValue(workItem, instruction->getOperand(2));
+
+            if(!ShadowContext::isCleanValue(indexShadow))
+            {
+                logUninitializedIndex();
+            }
+
+            TypedValue vectorShadow = shadowContext.getValue(workItem, instruction->getOperand(0));
+            TypedValue elementShadow = shadowContext.getValue(workItem, instruction->getOperand(1));
+            TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+            // Start from the vector's shadow, then overwrite the inserted element
+            unsigned index = workItem->getOperand(instruction->getOperand(2)).getUInt();
+            memcpy(newShadow.data, vectorShadow.data, newShadow.size*newShadow.num);
+            memcpy(newShadow.data + index*newShadow.size, elementShadow.data, newShadow.size);
+
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        case llvm::Instruction::InsertValue:
+        {
+            const llvm::InsertValueInst *insertInst = (const llvm::InsertValueInst*)instruction;
+
+            TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+            // Load original aggregate data
+            const llvm::Value *agg = insertInst->getAggregateOperand();
+            memcpy(newShadow.data, shadowContext.getValue(workItem, agg).data, newShadow.size*newShadow.num);
+
+            // Compute offset for inserted value
+            int offset = 0;
+            llvm::ArrayRef<unsigned int> indices = insertInst->getIndices();
+            const llvm::Type *type = agg->getType();
+            for (unsigned i = 0; i < indices.size(); i++)
+            {
+                if (type->isArrayTy())
+                {
+                    type = type->getArrayElementType();
+                    offset += getTypeSize(type) * indices[i];
+                }
+                else if (type->isStructTy())
+                {
+                    offset += getStructMemberOffset((const llvm::StructType*)type, indices[i]);
+                    type = type->getStructElementType(indices[i]);
+                }
+                else
+                {
+                    FATAL_ERROR("Unsupported aggregate type: %d", type->getTypeID())
+                }
+            }
+
+            // Copy inserted value into result
+            const llvm::Value *value = insertInst->getInsertedValueOperand();
+            memcpy(newShadow.data + offset, shadowContext.getValue(workItem, value).data, getTypeSize(value->getType()));
+
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        case llvm::Instruction::IntToPtr:
+        {
+            TypedValue shadow = shadowContext.getValue(workItem, instruction->getOperand(0));
+            TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+            for (unsigned i = 0; i < newShadow.num; i++)
+            {
+                newShadow.setPointer(shadow.getUInt(i), i);
+            }
+
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        case llvm::Instruction::Load:
+        {
+            assert(instruction->getType()->isSized() && "Load type must have size");
+            const llvm::LoadInst *loadInst = ((const llvm::LoadInst*)instruction);
+            const llvm::Value *Addr = loadInst->getPointerOperand();
+
+            size_t address = workItem->getOperand(Addr).getPointer();
+            unsigned addrSpace = loadInst->getPointerAddressSpace();
+
+            // The shadow of the loaded value is read from shadow memory
+            TypedValue v = shadowContext.getMemoryPool()->clone(result);
+            loadShadowMemory(addrSpace, address, v, workItem);
+            shadowValues->setValue(instruction, v);
+
+            // Check shadow of address
+            TypedValue addrShadow = shadowContext.getValue(workItem, Addr);
+
+            if(!ShadowContext::isCleanValue(addrShadow))
+            {
+                logUninitializedAddress(addrSpace, address, false);
+            }
+
+//            if (I.isAtomic())
+//                I.setOrdering(addAcquireOrdering(I.getOrdering()));
+
+            break;
+        }
+        case llvm::Instruction::LShr:
+        {
+            TypedValue S0 = shadowContext.getValue(workItem, instruction->getOperand(0));
+            TypedValue S1 = shadowContext.getValue(workItem, instruction->getOperand(1));
+
+            // An unclean shift amount poisons the whole result
+            if(!ShadowContext::isCleanValue(S1))
+            {
+                shadowValues->setValue(instruction, ShadowContext::getPoisonedValue(instruction));
+            }
+            else
+            {
+                TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+                TypedValue Shift = workItem->getOperand(instruction->getOperand(1));
+                uint64_t shiftMask = (S0.num > 1 ? S0.size : max((size_t)S0.size, sizeof(uint32_t))) * 8 - 1;
+
+                for (unsigned i = 0; i < S0.num; i++)
+                {
+                    newShadow.setUInt(S0.getUInt(i) >> (Shift.getUInt(i) & shiftMask), i);
+                }
+
+                shadowValues->setValue(instruction, newShadow);
+            }
+
+            break;
+        }
+        case llvm::Instruction::Mul:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::Or:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::PHI:
+        {
+            // Take the shadow of the value coming from the previously executed block
+            const llvm::PHINode *phiNode = (const llvm::PHINode*)instruction;
+            const llvm::Value *value = phiNode->getIncomingValueForBlock(workItem->getPreviousBlock());
+            TypedValue shadowValue = shadowContext.getValue(workItem, value);
+
+            shadowValues->setValue(instruction, shadowValue);
+            break;
+        }
+        case llvm::Instruction::PtrToInt:
+        {
+            TypedValue shadow = shadowContext.getValue(workItem, instruction->getOperand(0));
+            TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+            for (unsigned i = 0; i < newShadow.num; i++)
+            {
+                newShadow.setUInt(shadow.getPointer(i), i);
+            }
+
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        case llvm::Instruction::Ret:
+        {
+            const llvm::ReturnInst *retInst = ((const llvm::ReturnInst*)instruction);
+            const llvm::Value *RetVal = retInst->getReturnValue();
+
+            if(RetVal)
+            {
+                //Value *ShadowPtr = getValuePtrForRetval(RetVal, IRB);
+                //if (CheckReturnValue) {
+                //    insertShadowCheck(RetVal, &I);
+                //    Value *Shadow = getCleanValue(RetVal);
+                //    IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+                //} else {
+                // Fetch the return shadow and the originating call before the
+                // callee's frame is popped, then attach the shadow to the call
+                TypedValue retValShadow = shadowContext.getMemoryPool()->clone(shadowContext.getValue(workItem, RetVal));
+                const llvm::CallInst *callInst = shadowValues->getCall();
+                shadowValues->popFrame();
+                shadowValues->setValue(callInst, retValShadow);
+                //}
+            }
+            else
+            {
+#ifdef DUMP_SHADOW
+                // Insert pseudo value to keep numbering
+                shadowValues->setValue(instruction, ShadowContext::getCleanValue(3));
+#endif
+                shadowValues->popFrame();
+            }
+
+            break;
+        }
+        case llvm::Instruction::SDiv:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::Select:
+        {
+            const llvm::SelectInst *selectInst = (const llvm::SelectInst*)instruction;
+
+            TypedValue opCondition = workItem->getOperand(selectInst->getCondition());
+            TypedValue conditionShadow = shadowContext.getValue(workItem, selectInst->getCondition());
+            TypedValue newShadow;
+
+            // An unclean condition poisons the whole result
+            if(!ShadowContext::isCleanValue(conditionShadow))
+            {
+                newShadow = ShadowContext::getPoisonedValue(instruction);
+            }
+            else
+            {
+                newShadow = shadowContext.getMemoryPool()->clone(result);
+
+                // Per element, take the shadow of whichever operand was selected
+                for(unsigned i = 0; i < result.num; i++)
+                {
+                    const bool cond = selectInst->getCondition()->getType()->isVectorTy() ?
+                        opCondition.getUInt(i) :
+                        opCondition.getUInt();
+                    const llvm::Value *op = cond ?
+                        selectInst->getTrueValue() :
+                        selectInst->getFalseValue();
+
+                    memcpy(newShadow.data + i*newShadow.size,
+                            shadowContext.getValue(workItem, op).data + i*newShadow.size,
+                            newShadow.size);
+                }
+            }
+
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        case llvm::Instruction::SExt:
+        {
+            const llvm::Value *operand = instruction->getOperand(0);
+            TypedValue shadow = shadowContext.getValue(workItem, operand);
+            TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+            for (unsigned i = 0; i < newShadow.num; i++)
+            {
+                int64_t val = shadow.getSInt(i);
+                // A 1-bit operand extends to all-ones or all-zeros
+                if (operand->getType()->getPrimitiveSizeInBits() == 1)
+                {
+                    val = val ? -1 : 0;
+                }
+                newShadow.setSInt(val, i);
+            }
+
+            shadowValues->setValue(instruction, newShadow);
+
+            break;
+        }
+        case llvm::Instruction::Shl:
+        {
+            TypedValue S0 = shadowContext.getValue(workItem, instruction->getOperand(0));
+            TypedValue S1 = shadowContext.getValue(workItem, instruction->getOperand(1));
+
+            // An unclean shift amount poisons the whole result
+            if(!ShadowContext::isCleanValue(S1))
+            {
+                shadowValues->setValue(instruction, ShadowContext::getPoisonedValue(instruction));
+            }
+            else
+            {
+                TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+                TypedValue Shift = workItem->getOperand(instruction->getOperand(1));
+                uint64_t shiftMask = (S0.num > 1 ? S0.size : max((size_t)S0.size, sizeof(uint32_t))) * 8 - 1;
+
+                for (unsigned i = 0; i < S0.num; i++)
+                {
+                    newShadow.setUInt(S0.getUInt(i) << (Shift.getUInt(i) & shiftMask), i);
+                }
+
+                shadowValues->setValue(instruction, newShadow);
+            }
+
+            break;
+        }
+        case llvm::Instruction::ShuffleVector:
+        {
+            const llvm::ShuffleVectorInst *shuffleInst = (const llvm::ShuffleVectorInst*)instruction;
+            const llvm::Value *v1 = shuffleInst->getOperand(0);
+            const llvm::Value *v2 = shuffleInst->getOperand(1);
+            TypedValue mask = workItem->getOperand(shuffleInst->getMask());
+            TypedValue maskShadow = shadowContext.getValue(workItem, shuffleInst->getMask());
+            TypedValue newShadow = ShadowContext::getCleanValue(result);
+            TypedValue pv = ShadowContext::getPoisonedValue(newShadow.size);
+
+            unsigned num = v1->getType()->getVectorNumElements();
+            for(unsigned i = 0; i < newShadow.num; i++)
+            {
+                if(shuffleInst->getMask()->getAggregateElement(i)->getValueID() == llvm::Value::UndefValueVal || !ShadowContext::isCleanValue(maskShadow, i))
+                {
+                    // Undef value are poisoned
+                    memcpy(newShadow.data + i*newShadow.size, pv.data, newShadow.size);
+                    continue;
+                }
+
+                // Mask indices >= num select from the second input vector
+                const llvm::Value *src = v1;
+                unsigned int index = mask.getUInt(i);
+                if(index >= num)
+                {
+                    index -= num;
+                    src = v2;
+                }
+
+                TypedValue v = shadowContext.getValue(workItem, src);
+                size_t srcOffset = index*newShadow.size;
+                memcpy(newShadow.data + i*newShadow.size, v.data + srcOffset, newShadow.size);
+            }
+
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        case llvm::Instruction::SIToFP:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::SRem:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::Store:
+        {
+            PARANOID_CHECK(workItem, instruction);
+            const llvm::StoreInst *storeInst = ((const llvm::StoreInst*)instruction);
+            const llvm::Value *Val = storeInst->getValueOperand();
+            const llvm::Value *Addr = storeInst->getPointerOperand();
+
+            size_t address = workItem->getOperand(Addr).getPointer();
+            unsigned addrSpace = storeInst->getPointerAddressSpace();
+
+            // Atomic stores always write a clean shadow
+            TypedValue shadowVal = storeInst->isAtomic() ? ShadowContext::getCleanValue(Val) :
+                                                           shadowContext.getValue(workItem, Val);
+            storeShadowMemory(addrSpace, address, shadowVal, workItem);
+
+            // Check shadow of address
+            TypedValue addrShadow = shadowContext.getValue(workItem, Addr);
+
+            if(!ShadowContext::isCleanValue(addrShadow))
+            {
+                logUninitializedAddress(addrSpace, address);
+            }
+            break;
+        }
+        case llvm::Instruction::Sub:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::Switch:
+        {
+            checkAllOperandsDefined(workItem, instruction);
+#ifdef DUMP_SHADOW
+            // Insert pseudo value to keep numbering
+            shadowValues->setValue(instruction, ShadowContext::getCleanValue(3));
+#endif
+            break;
+        }
+        case llvm::Instruction::Trunc:
+        {
+            TypedValue shadow = shadowContext.getValue(workItem, instruction->getOperand(0));
+            TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+            // Keep the first newShadow.size bytes of each source element's shadow
+            for (unsigned i = 0; i < newShadow.num; i++)
+            {
+                memcpy(newShadow.data+i*newShadow.size, shadow.data+i*shadow.size, newShadow.size);
+            }
+
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        case llvm::Instruction::UDiv:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::UIToFP:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::URem:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::Unreachable:
+            FATAL_ERROR("Encountered unreachable instruction");
+        case llvm::Instruction::Xor:
+        {
+            VectorOr(workItem, instruction);
+            break;
+        }
+        case llvm::Instruction::ZExt:
+        {
+            TypedValue shadow = shadowContext.getValue(workItem, instruction->getOperand(0));
+            TypedValue newShadow = shadowContext.getMemoryPool()->clone(result);
+
+            for (unsigned i = 0; i < newShadow.num; i++)
+            {
+                newShadow.setUInt(shadow.getUInt(i), i);
+            }
+
+            shadowValues->setValue(instruction, newShadow);
+            break;
+        }
+        default:
+            FATAL_ERROR("Unsupported instruction: %s", instruction->getOpcodeName());
+    }
+
+#ifdef DUMP_SHADOW
+    if(shadowContext.hasValue(workItem, instruction))
+    {
+        cout << shadowContext.getValue(workItem, instruction) << endl;
+    }
+#endif
+}
+
+// Called at the start of a kernel invocation. Walks every value registered
+// with the kernel and either sets up its shadow immediately (global shadow
+// values / shadow memory) or queues it in m_deferredInit / m_deferredInitGroup
+// for per-work-item / per-work-group initialisation later.
+void Uninitialized::kernelBegin(const KernelInvocation *kernelInvocation)
+{
+    const Kernel *kernel = kernelInvocation->getKernel();
+
+    // Initialise kernel arguments and global variables
+    for (auto value = kernel->values_begin(); value != kernel->values_end(); value++)
+    {
+        const llvm::Type *type = value->first->getType();
+
+        // Unsized types have no shadow to track
+        if(!type->isSized())
+        {
+            continue;
+        }
+
+        if(type->isPointerTy())
+        {
+            switch(type->getPointerAddressSpace())
+            {
+                case AddrSpaceConstant:
+                {
+                    // Constants
+                    // value->second.data == ptr
+                    // value->second.size == ptr size
+                    TypedValue cleanValue = m_pool.clone(ShadowContext::getCleanValue(value->first));
+                    shadowContext.setGlobalValue(value->first, cleanValue);
+                    const llvm::Type *elementTy = type->getPointerElementType();
+                    allocAndStoreShadowMemory(AddrSpaceConstant, value->second.getPointer(),
+                                              ShadowContext::getCleanValue(elementTy));
+                    break;
+                }
+                case AddrSpaceGlobal:
+                {
+                    // Global pointer kernel arguments
+                    // value->second.data == ptr
+                    // value->second.size == ptr size
+                    size_t address = value->second.getPointer();
+
+                    // Only create shadow memory when the real buffer exists
+                    // but no shadow has been allocated for it yet
+                    if(m_context->getGlobalMemory()->isAddressValid(address) &&
+                       !shadowContext.getGlobalMemory()->isAddressValid(address))
+                    {
+                        // Allocate poisoned global memory if there was no host store
+                        size_t size = m_context->getGlobalMemory()->getBuffer(address)->size;
+                        allocAndStoreShadowMemory(AddrSpaceGlobal, address,
+                                                  ShadowContext::getPoisonedValue(size), NULL, NULL, true);
+                    }
+
+                    m_deferredInit.push_back(*value);
+                    break;
+                }
+                case AddrSpaceLocal:
+                {
+                    // Local pointer kernel arguments and local data variables
+                    // value->second.data == NULL
+                    // value->second.size == val size
+                    if(llvm::isa<llvm::Argument>(value->first))
+                    {
+                        // Arguments have a private pointer
+                        m_deferredInit.push_back(*value);
+                    }
+                    else
+                    {
+                        // Variables have a global pointer
+                        TypedValue cleanValue = m_pool.clone(ShadowContext::getCleanValue(value->first));
+                        shadowContext.setGlobalValue(value->first, cleanValue);
+                    }
+
+                    // Both kinds get their local shadow memory set up in workGroupBegin
+                    m_deferredInitGroup.push_back(*value);
+                    break;
+                }
+                case AddrSpacePrivate:
+                {
+                    const llvm::Argument *A = llvm::dyn_cast<llvm::Argument>(value->first);
+
+                    if(A && A->hasByValAttr())
+                    {
+                        // ByVal kernel argument
+                        // value->second.data == val
+                        // value->second.size == val size
+                        m_deferredInit.push_back(*value);
+                    }
+                    else
+                    {
+                        // Private struct/Union definitions with global type
+                        // value->second.data == val
+                        // value->second.size == val size
+                        m_deferredInit.push_back(*value);
+                        TypedValue cleanValue = m_pool.clone(ShadowContext::getCleanValue(value->first));
+                        //TODO: Structs can have poisoned padding bytes. Is this important?
+                        shadowContext.setGlobalValue(value->first, cleanValue);
+                    }
+                    break;
+                }
+                default:
+                    FATAL_ERROR("Unsupported addressspace %d", type->getPointerAddressSpace());
+            }
+        }
+        else
+        {
+            // Non pointer type kernel arguments
+            // value->second.data == val
+            // value->second.size == val size
+            m_deferredInit.push_back(*value);
+        }
+    }
+}
+
+// Called when a kernel invocation finishes: drops the deferred-initialisation
+// lists built by kernelBegin and the global shadow values.
+void Uninitialized::kernelEnd(const KernelInvocation *kernelInvocation)
+{
+    m_deferredInit.clear();
+    m_deferredInitGroup.clear();
+    shadowContext.clearGlobalValues();
+}
+
+// Fills SM.data with SM.size*SM.num bytes of shadow state for `address`.
+// The constant address space is not tracked yet and always reads back as
+// clean (all zero); every other space is read from its ShadowMemory.
+void Uninitialized::loadShadowMemory(unsigned addrSpace, size_t address, TypedValue &SM, const WorkItem *workItem, const WorkGroup *workGroup)
+{
+    if(addrSpace != AddrSpaceConstant)
+    {
+        ShadowMemory *shadowMem = getShadowMemory(addrSpace, workItem, workGroup);
+        shadowMem->load(SM.data, address, SM.size*SM.num);
+
+#ifdef DUMP_SHADOW
+        cout << "Loaded " << hex << SM << " from space " << dec << addrSpace << " at address " << hex << address << endl;
+#endif
+        return;
+    }
+
+    //TODO: Eventually load value
+    memset(SM.data, 0, SM.size*SM.num);
+}
+
+// Emits a WARNING message reporting that an uninitialized address was used
+// for a read (write == false) or write (write == true) at `address` in the
+// given address space, followed by the current kernel/entity/location.
+void Uninitialized::logUninitializedAddress(unsigned int addrSpace, size_t address, bool write) const
+{
+  Context::Message msg(WARNING, m_context);
+  msg << "Uninitialized address used to " << (write ? "write to " : "read from ")
+      << getAddressSpaceName(addrSpace)
+      << " memory address 0x" << hex << address << endl
+      << msg.INDENT
+      << "Kernel: " << msg.CURRENT_KERNEL << endl
+      << "Entity: " << msg.CURRENT_ENTITY << endl
+      << msg.CURRENT_LOCATION << endl;
+  msg.send();
+}
+
+// Emits a WARNING message reporting that control flow depends on an
+// uninitialized value, followed by the current kernel/entity/location.
+void Uninitialized::logUninitializedCF() const
+{
+  Context::Message msg(WARNING, m_context);
+  msg << "Controlflow depends on uninitialized value" << endl
+      << msg.INDENT
+      << "Kernel: " << msg.CURRENT_KERNEL << endl
+      << "Entity: " << msg.CURRENT_ENTITY << endl
+      << msg.CURRENT_LOCATION << endl;
+  msg.send();
+}
+
+// Emits a WARNING message reporting that an instruction depends on an
+// uninitialized index value, followed by the current kernel/entity/location.
+void Uninitialized::logUninitializedIndex() const
+{
+  Context::Message msg(WARNING, m_context);
+  msg << "Instruction depends on an uninitialized index value" << endl
+      << msg.INDENT
+      << "Kernel: " << msg.CURRENT_KERNEL << endl
+      << "Entity: " << msg.CURRENT_ENTITY << endl
+      << msg.CURRENT_LOCATION << endl;
+  msg.send();
+}
+
+// Emits a WARNING message reporting that an uninitialized value was written
+// to `address` in the given address space, followed by the current
+// kernel/entity/location.
+void Uninitialized::logUninitializedWrite(unsigned int addrSpace, size_t address) const
+{
+  Context::Message msg(WARNING, m_context);
+  msg << "Uninitialized value written to "
+      << getAddressSpaceName(addrSpace)
+      << " memory address 0x" << hex << address << endl
+      << msg.INDENT
+      << "Kernel: " << msg.CURRENT_KERNEL << endl
+      << "Entity: " << msg.CURRENT_ENTITY << endl
+      << msg.CURRENT_LOCATION << endl;
+  msg.send();
+}
+
+// Host map callback. When the mapping is not requested with CL_MAP_READ,
+// the mapped range [address+offset, +size) gets fresh clean shadow memory;
+// mappings that include CL_MAP_READ leave the shadow untouched.
+void Uninitialized::memoryMap(const Memory *memory, size_t address,
+                                      size_t offset, size_t size, cl_map_flags flags)
+{
+    if(flags & CL_MAP_READ)
+    {
+        return;
+    }
+
+    allocAndStoreShadowMemory(memory->getAddressSpace(), address + offset,
+            ShadowContext::getCleanValue(size));
+}
+
+// Computes the shadow of instruction I element-wise: starting from a clean
+// value, each operand's shadow is merged in with ShadowContext::shadowOr, so
+// an element of the result is poisoned when that element is poisoned in any
+// operand. The result is stored as the shadow of I for this work-item.
+void Uninitialized::VectorOr(const WorkItem *workItem, const llvm::Instruction *I)
+{
+    PARANOID_CHECK(workItem, I);
+    ShadowValues *values = shadowContext.getShadowWorkItem(workItem)->getValues();
+
+    TypedValue merged = ShadowContext::getCleanValue(I);
+
+    llvm::Instruction::const_op_iterator operand = I->op_begin();
+    while(operand != I->op_end())
+    {
+        ShadowContext::shadowOr(merged, shadowContext.getValue(workItem, operand->get()));
+        ++operand;
+    }
+
+    values->setValue(I, merged);
+}
+
+// Computes the shadow of instruction I as a whole: if any operand's shadow
+// is not clean the result is fully poisoned, otherwise it is clean. Operand
+// inspection stops at the first poisoned operand.
+void Uninitialized::SimpleOr(const WorkItem *workItem, const llvm::Instruction *I)
+{
+    PARANOID_CHECK(workItem, I);
+    ShadowValues *values = shadowContext.getShadowWorkItem(workItem)->getValues();
+
+    bool poisoned = false;
+    for(llvm::Instruction::const_op_iterator operand = I->op_begin();
+        !poisoned && operand != I->op_end(); ++operand)
+    {
+        poisoned = !ShadowContext::isCleanValue(shadowContext.getValue(workItem, operand->get()));
+    }
+
+    if(poisoned)
+    {
+        values->setValue(I, ShadowContext::getPoisonedValue(I));
+    }
+    else
+    {
+        values->setValue(I, ShadowContext::getCleanValue(I));
+    }
+}
+
+// Shadow propagation for an atomic call CI whose first argument is the
+// target address. The old shadow at that address becomes the shadow of the
+// call's result; the new shadow stored back is poisoned if either the old
+// shadow or the shadow of the second argument (when present) is not clean.
+// Finally warns if the address operand itself is uninitialized.
+// NOTE(review): the shadow is handled as a single 4-byte value; confirm this
+// is adequate for every atomic routed through here.
+void Uninitialized::SimpleOrAtomic(const WorkItem *workItem, const llvm::CallInst *CI)
+{
+    const llvm::Value *Addr = CI->getArgOperand(0);
+    unsigned addrSpace = Addr->getType()->getPointerAddressSpace();
+    size_t address = workItem->getOperand(Addr).getPointer();
+
+    // Scratch value receiving the shadow currently stored at `address`
+    TypedValue oldShadow = {
+        4,
+        1,
+        shadowContext.getMemoryPool()->alloc(4)
+    };
+
+    TypedValue newShadow = ShadowContext::getCleanValue(4);
+
+    // Global shadow memory is locked around the load/store pair
+    if(addrSpace == AddrSpaceGlobal)
+    {
+        shadowContext.getGlobalMemory()->lock(address);
+    }
+
+    loadShadowMemory(addrSpace, address, oldShadow, workItem);
+
+    if (!ShadowContext::isCleanValue(oldShadow))
+    {
+        newShadow = ShadowContext::getPoisonedValue(4);
+    }
+
+    if (CI->getNumArgOperands() > 1)
+    {
+        TypedValue argShadow = shadowContext.getValue(workItem, CI->getArgOperand(1));
+        if(!ShadowContext::isCleanValue(argShadow))
+        {
+            newShadow = ShadowContext::getPoisonedValue(4);
+        }
+    }
+
+    storeShadowMemory(addrSpace, address, newShadow, workItem);
+
+    if(addrSpace == AddrSpaceGlobal)
+    {
+        shadowContext.getGlobalMemory()->unlock(address);
+    }
+
+    // The call's result carries the shadow that was in memory before the update
+    ShadowValues *shadowValues = shadowContext.getShadowWorkItem(workItem)->getValues();
+    shadowValues->setValue(CI, oldShadow);
+
+    // Check shadow of address
+    TypedValue addrShadow = shadowContext.getValue(workItem, Addr);
+
+    if(!ShadowContext::isCleanValue(addrShadow))
+    {
+        logUninitializedAddress(addrSpace, address);
+    }
+}
+
+// Writes the shadow value SM (SM.size*SM.num bytes) to `address` in the
+// shadow memory of the given address space. Unless `unchecked` is set, a
+// non-clean shadow written to any space other than private triggers an
+// "uninitialized value written" warning. Stores to the constant address
+// space are not recorded yet.
+void Uninitialized::storeShadowMemory(unsigned addrSpace, size_t address, TypedValue SM, const WorkItem *workItem, const WorkGroup *workGroup, bool unchecked)
+{
+#ifdef DUMP_SHADOW
+    cout << "Store " << hex << SM << " to space " << dec << addrSpace << " at address " << hex << address << endl;
+#endif
+
+    // The cleanliness test is last so it is only evaluated when a warning is possible
+    if(!unchecked && addrSpace != AddrSpacePrivate && !ShadowContext::isCleanValue(SM))
+    {
+#ifdef DUMP_SHADOW
+        shadowContext.dump(workItem);
+#endif
+        logUninitializedWrite(addrSpace, address);
+    }
+
+    if(addrSpace == AddrSpaceConstant)
+    {
+        //TODO: Eventually store value
+        return;
+    }
+
+    ShadowMemory *memory = getShadowMemory(addrSpace, workItem, workGroup);
+    memory->store(SM.data, address, SM.size*SM.num);
+}
+
+// Called when a work-item starts. Creates the work-item's shadow state and
+// then gives every value queued in m_deferredInit (by kernelBegin) a clean
+// shadow; private pointer values additionally get clean private shadow
+// memory at the address the work-item holds for them.
+void Uninitialized::workItemBegin(const WorkItem *workItem)
+{
+    shadowContext.createMemoryPool();
+    shadowContext.allocateWorkItems();
+    ShadowWorkItem *shadowWI = shadowContext.createShadowWorkItem(workItem);
+    ShadowValues *shadowValues = shadowWI->getValues();
+
+    for(auto value : m_deferredInit)
+    {
+        const llvm::Type *type = value.first->getType();
+
+        if(type->isPointerTy())
+        {
+            // Address spaces without a case below are left untouched here
+            switch(type->getPointerAddressSpace())
+            {
+                case AddrSpaceGlobal:
+                {
+                    // Global pointer kernel arguments
+                    // value.second.data == ptr
+                    // value.second.size == ptr size
+                    shadowValues->setValue(value.first, ShadowContext::getCleanValue(type));
+                    break;
+                }
+                case AddrSpaceLocal:
+                {
+                    // Local pointer kernel arguments
+                    // value.second.data == NULL
+                    // value.second.size == val size
+                    shadowValues->setValue(value.first, ShadowContext::getCleanValue(value.first));
+                    break;
+                }
+                case AddrSpacePrivate:
+                {
+                    const llvm::Argument *A = llvm::dyn_cast<llvm::Argument>(value.first);
+
+                    if(A && A->hasByValAttr())
+                    {
+                        // ByVal kernel argument
+                        // value.second.data == val
+                        // value.second.size == val size
+                        size_t address = workItem->getOperand(value.first).getPointer();
+                        TypedValue cleanValue = ShadowContext::getCleanValue(value.second.size);
+                        allocAndStoreShadowMemory(AddrSpacePrivate, address, cleanValue, workItem);
+                        shadowValues->setValue(value.first, ShadowContext::getCleanValue(value.first));
+                    }
+                    else
+                    {
+                        // Private struct/Union definitions with global type
+                        // value.second.data == NULL
+                        // value.second.size == val size
+                        size_t address = workItem->getOperand(value.first).getPointer();
+                        TypedValue cleanValue = ShadowContext::getCleanValue(value.second.size);
+                        allocAndStoreShadowMemory(AddrSpacePrivate, address, cleanValue, workItem);
+                    }
+                    break;
+                }
+            }
+        }
+        else
+        {
+            // Non pointer type kernel arguments
+            // value.second.data == val
+            // value.second.size == val size
+            shadowValues->setValue(value.first, ShadowContext::getCleanValue(value.first));
+        }
+    }
+}
+
+// Called when a work-item finishes: tears down what workItemBegin set up,
+// in reverse order (shadow work-item, work-item map, memory pool reference).
+void Uninitialized::workItemComplete(const WorkItem *workItem)
+{
+    shadowContext.destroyShadowWorkItem(workItem);
+    shadowContext.freeWorkItems();
+    shadowContext.destroyMemoryPool();
+}
+
+// Called when a work-group starts. Creates the work-group's shadow state and
+// allocates poisoned local shadow memory for every value queued in
+// m_deferredInitGroup (local pointer arguments and local variables alike).
+void Uninitialized::workGroupBegin(const WorkGroup *workGroup)
+{
+    shadowContext.createMemoryPool();
+    shadowContext.allocateWorkGroups();
+    shadowContext.createShadowWorkGroup(workGroup);
+
+    for(const auto &entry : m_deferredInitGroup)
+    {
+        // Local data variables
+        // entry.second.data == NULL
+        // entry.second.size == val size
+        const size_t localAddress = workGroup->getLocalMemoryAddress(entry.first);
+
+        //TODO: Local memory clean or poisoned? May need to differentiate
+        //      between kernel argument (?) and variable (poisoned).
+        //      For now both cases start out poisoned.
+        TypedValue poisoned = ShadowContext::getPoisonedValue(entry.second.size);
+
+        allocAndStoreShadowMemory(AddrSpaceLocal, localAddress, poisoned, NULL, workGroup, true);
+    }
+}
+
+// Called when a work-group finishes: tears down what workGroupBegin set up,
+// in reverse order (shadow work-group, work-group map, memory pool reference).
+void Uninitialized::workGroupComplete(const WorkGroup *workGroup)
+{
+    shadowContext.destroyShadowWorkGroup(workGroup);
+    shadowContext.freeWorkGroups();
+    shadowContext.destroyMemoryPool();
+}
+
+// Creates an empty shadow frame. m_call starts out as NULL so that getCall()
+// returns a well-defined value for frames on which setCall() was never
+// invoked (previously the member was left uninitialized).
+ShadowFrame::ShadowFrame() :
+    m_call(NULL), m_values(new UnorderedTypedValueMap())
+{
+#ifdef DUMP_SHADOW
+    // Insertion-ordered list of values, only kept for dump()
+    m_valuesList = new ValuesList();
+#endif
+}
+
+// Releases the containers allocated by the constructor. The TypedValue
+// entries themselves are not freed here.
+ShadowFrame::~ShadowFrame()
+{
+    delete m_values;
+#ifdef DUMP_SHADOW
+    delete m_valuesList;
+#endif
+}
+
+// Prints the shadow of every value in this frame to cout, in the order the
+// values were first set. Only functional when built with DUMP_SHADOW;
+// otherwise prints a notice that dumping is disabled.
+void ShadowFrame::dump() const
+{
+    cout << "==== ShadowMap (private) =======" << endl;
+
+#ifdef DUMP_SHADOW
+    ValuesList::const_iterator itr;
+    // Running number used as a label for unnamed values
+    unsigned num = 1;
+
+    for(itr = m_valuesList->begin(); itr != m_valuesList->end(); ++itr)
+    {
+        if((*itr)->hasName())
+        {
+            cout << "%" << (*itr)->getName().str() << ": " << m_values->at(*itr) << endl;
+        }
+        else
+        {
+            cout << "%" << dec << num++ << ": " << m_values->at(*itr) << endl;
+        }
+    }
+#else
+    cout << endl << "Dump not activated!" << endl;
+#endif
+
+    cout << "=======================" << endl;
+}
+
+// Returns the shadow for V within this frame:
+//  - instructions and arguments: the shadow previously stored with setValue
+//  - undef values: fully poisoned
+//  - constant vectors: assembled element by element from each element's shadow
+//  - anything else: clean
+TypedValue ShadowFrame::getValue(const llvm::Value *V) const
+{
+    if(llvm::isa<llvm::Instruction>(V))
+    {
+        // For instructions the shadow is already stored in the map.
+        assert(m_values->count(V) && "No shadow for instruction value");
+        return m_values->at(V);
+    }
+
+    if(llvm::isa<llvm::UndefValue>(V))
+    {
+        return ShadowContext::getPoisonedValue(V);
+    }
+
+    if(llvm::isa<llvm::Argument>(V))
+    {
+        // For arguments the shadow is already stored in the map.
+        assert(m_values->count(V) && "No shadow for argument value");
+        return m_values->at(V);
+    }
+
+    if(const llvm::ConstantVector *VC = llvm::dyn_cast<llvm::ConstantVector>(V))
+    {
+        TypedValue combined = ShadowContext::getCleanValue(V);
+
+        for(unsigned lane = 0; lane < combined.num; ++lane)
+        {
+            // Copy the lane's shadow into its slot of the vector shadow
+            TypedValue laneShadow = getValue(VC->getAggregateElement(lane));
+            memcpy(combined.data + lane*combined.size, laneShadow.data, combined.size);
+        }
+
+        return combined;
+    }
+
+    // For everything else the shadow is zero.
+    return ShadowContext::getCleanValue(V);
+}
+
+// Stores SV as the shadow of V in this frame, replacing any previous shadow.
+// With DUMP_SHADOW, first-time values are also appended to the ordered list
+// used by dump(), and overwrites are reported on cout.
+void ShadowFrame::setValue(const llvm::Value *V, TypedValue SV)
+{
+#ifdef DUMP_SHADOW
+    if(!m_values->count(V))
+    {
+        m_valuesList->push_back(V);
+    }
+    else
+    {
+        cout << "Shadow for value " << V->getName().str() << " reset!" << endl;
+    }
+#endif
+    (*m_values)[V] = SV;
+}
+
+// Creates the frame stack and pushes one initial, empty shadow frame.
+ShadowValues::ShadowValues() :
+    m_stack(new ShadowValuesStack())
+{
+    pushFrame(createCleanShadowFrame());
+}
+
+// Pops every remaining frame and then frees the stack container.
+// NOTE(review): presumably popFrame() also deletes the popped ShadowFrame;
+// it is defined outside this view, so confirm.
+ShadowValues::~ShadowValues()
+{
+    while(!m_stack->empty())
+    {
+        popFrame();
+    }
+
+    delete m_stack;
+}
+
+// Returns a newly heap-allocated, empty ShadowFrame; the caller takes ownership.
+ShadowFrame* ShadowValues::createCleanShadowFrame()
+{
+    return new ShadowFrame();
+}
+
+// Creates the per-work-item shadow state: a private-address-space shadow
+// memory (using `bufferBits` bits of each address as buffer index) and an
+// empty shadow value stack.
+ShadowWorkItem::ShadowWorkItem(unsigned bufferBits) :
+    m_memory(new ShadowMemory(AddrSpacePrivate, bufferBits)), m_values(new ShadowValues())
+{
+}
+
+// Frees the private shadow memory and the shadow value stack.
+ShadowWorkItem::~ShadowWorkItem()
+{
+    delete m_memory;
+    delete m_values;
+}
+
+// Creates the per-work-group local shadow memory.
+// Note: the `bufferBits` argument is currently ignored; the buffer-index
+// width is hard coded to 16 bits when size_t is 8 bytes and 8 bits otherwise.
+ShadowWorkGroup::ShadowWorkGroup(unsigned bufferBits) :
+    //FIXME: Hard coded values
+    m_memory(new ShadowMemory(AddrSpaceLocal, sizeof(size_t) == 8 ? 16 : 8))
+{
+}
+
+// Frees the local shadow memory.
+ShadowWorkGroup::~ShadowWorkGroup()
+{
+    delete m_memory;
+}
+
+// Creates an empty shadow memory for `addrSpace`. An address is split into a
+// buffer index of `bufferBits` bits (the high bits) and an offset made of the
+// remaining (bits in size_t - bufferBits) low bits.
+ShadowMemory::ShadowMemory(AddressSpace addrSpace, unsigned bufferBits) :
+    m_addrSpace(addrSpace), m_map(), m_numBitsAddress((sizeof(size_t)<<3) - bufferBits), m_numBitsBuffer(bufferBits)
+{
+}
+
+// Releases every shadow buffer still held by this memory.
+ShadowMemory::~ShadowMemory()
+{
+    clear();
+}
+
+// Allocates a shadow buffer of `size` bytes for the buffer that `address`
+// belongs to, replacing any buffer already registered under that index.
+// The new buffer's contents are left uninitialized.
+void ShadowMemory::allocate(size_t address, size_t size)
+{
+    const size_t slot = extractBuffer(address);
+
+    // Drop a previous allocation for the same buffer index first
+    if(m_map.count(slot))
+    {
+        deallocate(address);
+    }
+
+    Buffer *fresh = new Buffer();
+    fresh->data   = new unsigned char[size];
+    fresh->size   = size;
+    fresh->flags  = 0;
+
+    m_map[slot] = fresh;
+}
+
+// Frees every shadow buffer and empties the buffer map.
+// Entries may be NULL (deallocate() can leave a NULL placeholder), so they
+// are skipped instead of dereferenced. The map is emptied afterwards so no
+// dangling pointers remain and a second clear() is harmless.
+void ShadowMemory::clear()
+{
+    for(MemoryMap::iterator mItr = m_map.begin(); mItr != m_map.end(); ++mItr)
+    {
+        if(mItr->second)
+        {
+            delete[] mItr->second->data;
+            delete mItr->second;
+        }
+    }
+
+    m_map.clear();
+}
+
+// Frees the shadow buffer that `address` belongs to and removes its entry
+// from the buffer map. The entry is erased rather than set to NULL, because
+// a NULL placeholder would later be dereferenced by isAddressValid(),
+// getPointer(), dump() and clear(). Deallocating an unknown buffer asserts
+// in debug builds and is a no-op otherwise.
+void ShadowMemory::deallocate(size_t address)
+{
+    size_t index = extractBuffer(address);
+
+    MemoryMap::iterator entry = m_map.find(index);
+    assert(entry != m_map.end() && "Cannot deallocate non existing memory!");
+
+    if(entry == m_map.end())
+    {
+        return;
+    }
+
+    if(entry->second)
+    {
+        delete[] entry->second->data;
+        delete entry->second;
+    }
+
+    m_map.erase(entry);
+}
+
+// Prints the contents of all shadow buffers to cout as hex bytes, four per
+// line, each line prefixed with the address of its first byte.
+// NOTE(review): the walk probes buffer indices starting at 1 and uses `b`
+// both as "buffers printed so far" and as part of the probed index (b+o).
+// This looks like it assumes densely numbered indices 1..N; with gaps an
+// index may be visited twice and later buffers missed. Confirm before
+// relying on the output.
+void ShadowMemory::dump() const
+{
+    cout << "====== ShadowMem (" << getAddressSpaceName(m_addrSpace) << ") ======";
+
+    for(unsigned b = 0, o = 1; b < m_map.size(); o++)
+    {
+        // No buffer at this index: try the next one
+        if(!m_map.count(b+o))
+        {
+            continue;
+        }
+
+        for(unsigned i = 0; i < m_map.at(b+o)->size; i++)
+        {
+            // Start a new line with the address label every four bytes
+            if (i%4 == 0)
+            {
+                cout << endl << hex << uppercase
+                    << setw(16) << setfill(' ') << right
+                    << ((((size_t)b+o)<<m_numBitsAddress) | i) << ":";
+            }
+            cout << " " << hex << uppercase << setw(2) << setfill('0')
+                << (int)m_map.at(b+o)->data[i];
+        }
+
+        ++b;
+        o = 0;
+    }
+    cout << endl;
+
+    cout << "=======================" << endl;
+}
+
+// Returns the buffer index of `address`: the bits above the low
+// m_numBitsAddress offset bits.
+size_t ShadowMemory::extractBuffer(size_t address) const
+{
+    return (address >> m_numBitsAddress);
+}
+
+// Returns the offset of `address` within its buffer: the address with its
+// top m_numBitsBuffer (buffer index) bits masked off.
+size_t ShadowMemory::extractOffset(size_t address) const
+{
+    return (address & (((size_t)-1) >> m_numBitsBuffer));
+}
+
+// Returns a raw pointer to the shadow byte for `address`. The buffer must
+// exist (asserted); no bounds check is performed on the offset.
+void* ShadowMemory::getPointer(size_t address) const
+{
+    const size_t slot = extractBuffer(address);
+
+    assert(m_map.count(slot) && "No shadow memory found!");
+
+    unsigned char *base = m_map.at(slot)->data;
+    return base + extractOffset(address);
+}
+
+// Returns true when the `size` bytes starting at `address` lie entirely
+// inside an allocated shadow buffer.
+// A missing or NULL map entry yields false instead of being dereferenced,
+// and the range check is written so that offset + size cannot overflow.
+bool ShadowMemory::isAddressValid(size_t address, size_t size) const
+{
+    MemoryMap::const_iterator entry = m_map.find(extractBuffer(address));
+
+    if(entry == m_map.end() || !entry->second)
+    {
+        return false;
+    }
+
+    size_t offset = extractOffset(address);
+    size_t bufferSize = entry->second->size;
+
+    return offset <= bufferSize && size <= bufferSize - offset;
+}
+
+// Copies `size` shadow bytes for `address` into `dst`. When the range is not
+// covered by an allocated shadow buffer, `dst` is filled with poison (all
+// bits set) instead.
+// The poison fill writes directly into `dst`: this yields the same bytes as
+// copying from a freshly poisoned pool value, but needs no memory-pool
+// allocation and does not narrow `size` to unsigned.
+void ShadowMemory::load(unsigned char *dst, size_t address, size_t size) const
+{
+    if(!isAddressValid(address, size))
+    {
+        memset(dst, 0xff, size);
+        return;
+    }
+
+    size_t index = extractBuffer(address);
+    size_t offset = extractOffset(address);
+
+    assert(m_map.count(index) && "No shadow memory found!");
+    memcpy(dst, m_map.at(index)->data + offset, size);
+}
+
+// Locks the mutex that ATOMIC_MUTEX selects for the offset part of `address`.
+// NOTE(review): ATOMIC_MUTEX is defined outside this view; presumably it maps
+// offsets onto a fixed set of mutexes. Must be paired with unlock().
+void ShadowMemory::lock(size_t address) const
+{
+    size_t offset = extractOffset(address);
+    ATOMIC_MUTEX(offset).lock();
+}
+
+// Copies `size` shadow bytes from `src` to the shadow of `address`. Stores
+// that do not fall entirely inside an allocated shadow buffer are silently
+// ignored.
+void ShadowMemory::store(const unsigned char *src, size_t address, size_t size)
+{
+    if(!isAddressValid(address, size))
+    {
+        return;
+    }
+
+    const size_t slot = extractBuffer(address);
+    const size_t pos = extractOffset(address);
+
+    assert(m_map.count(slot) && "Cannot store to unallocated memory!");
+    memcpy(m_map.at(slot)->data + pos, src, size);
+}
+
+// Unlocks the mutex that ATOMIC_MUTEX selects for the offset part of
+// `address`; counterpart of lock().
+void ShadowMemory::unlock(size_t address) const
+{
+    size_t offset = extractOffset(address);
+    ATOMIC_MUTEX(offset).unlock();
+}
+
+// Creates the shadow context with an empty global shadow memory and no
+// global shadow values. `bufferBits` is the buffer-index width passed on to
+// the shadow memories created by this context.
+ShadowContext::ShadowContext(unsigned bufferBits) :
+    m_globalMemory(new ShadowMemory(AddrSpaceGlobal, bufferBits)), m_globalValues(), m_numBitsBuffer(bufferBits)
+{
+}
+
+// Frees the global shadow memory.
+ShadowContext::~ShadowContext()
+{
+    delete m_globalMemory;
+}
+
+// Lazily creates the work-item -> shadow map of the current work space;
+// does nothing when the map already exists.
+void ShadowContext::allocateWorkItems()
+{
+    if(m_workSpace.workItems)
+    {
+        return;
+    }
+
+    m_workSpace.workItems = new ShadowItemMap();
+}
+
+// Lazily creates the work-group -> shadow map of the current work space;
+// does nothing when the map already exists.
+void ShadowContext::allocateWorkGroups()
+{
+    if(m_workSpace.workGroups)
+    {
+        return;
+    }
+
+    m_workSpace.workGroups = new ShadowGroupMap();
+}
+
+// Removes every global shadow value registered via setGlobalValue().
+void ShadowContext::clearGlobalValues()
+{
+    m_globalValues.clear();
+}
+
+// Acquires a reference to the work space's memory pool. The pool is created
+// by the first user; each call must be matched by destroyMemoryPool().
+void ShadowContext::createMemoryPool()
+{
+    if(m_workSpace.poolUsers == 0)
+    {
+        m_workSpace.memoryPool = new MemoryPool();
+    }
+
+    ++m_workSpace.poolUsers;
+}
+
+// Creates and registers the shadow state for `workItem` and returns it.
+// The work-item map must already exist (see allocateWorkItems) and must not
+// yet contain an entry for this work-item (asserted).
+ShadowWorkItem* ShadowContext::createShadowWorkItem(const WorkItem *workItem)
+{
+    assert(!m_workSpace.workItems->count(workItem) && "Workitems may only have one shadow");
+    ShadowWorkItem *sWI = new ShadowWorkItem(m_numBitsBuffer);
+    (*m_workSpace.workItems)[workItem] = sWI;
+    return sWI;
+}
+
+// Creates and registers the shadow state for `workGroup` and returns it.
+// The work-group map must already exist (see allocateWorkGroups) and must
+// not yet contain an entry for this work-group (asserted).
+ShadowWorkGroup* ShadowContext::createShadowWorkGroup(const WorkGroup *workGroup)
+{
+    assert(!m_workSpace.workGroups->count(workGroup) && "Workgroups may only have one shadow");
+    ShadowWorkGroup *sWG = new ShadowWorkGroup(m_numBitsBuffer);
+    (*m_workSpace.workGroups)[workGroup] = sWG;
+    return sWG;
+}
+
+// Releases one reference to the work space's memory pool; the pool is
+// deleted when the last user releases it. The pointer is reset to NULL after
+// deletion (as freeWorkItems/freeWorkGroups do for their maps) so that no
+// dangling pointer to the freed pool is left behind.
+void ShadowContext::destroyMemoryPool()
+{
+    --m_workSpace.poolUsers;
+
+    if(m_workSpace.poolUsers == 0)
+    {
+        delete m_workSpace.memoryPool;
+        m_workSpace.memoryPool = NULL;
+    }
+}
+
+// Deletes the shadow state registered for `workItem` and removes its map
+// entry. A shadow must exist for the work-item (asserted).
+void ShadowContext::destroyShadowWorkItem(const WorkItem *workItem)
+{
+    assert(m_workSpace.workItems->count(workItem) && "No shadow for workitem found!");
+    delete (*m_workSpace.workItems)[workItem];
+    m_workSpace.workItems->erase(workItem);
+}
+
+// Deletes the shadow state registered for `workGroup` and removes its map
+// entry. A shadow must exist for the work-group (asserted).
+void ShadowContext::destroyShadowWorkGroup(const WorkGroup *workGroup)
+{
+    assert(m_workSpace.workGroups->count(workGroup) && "No shadow for workgroup found!");
+    delete (*m_workSpace.workGroups)[workGroup];
+    m_workSpace.workGroups->erase(workGroup);
+}
+
+// Debug dump to cout: global shadow values, global shadow memory, the first
+// registered shadow work-group (if any), and then either the shadow of
+// `workItem` or, when `workItem` is NULL, of every registered work-item.
+void ShadowContext::dump(const WorkItem *workItem) const
+{
+    dumpGlobalValues();
+    m_globalMemory->dump();
+    if(m_workSpace.workGroups && m_workSpace.workGroups->size())
+    {
+        // Only the first work-group in the map is dumped
+        m_workSpace.workGroups->begin()->second->dump();
+    }
+    if(m_workSpace.workItems && m_workSpace.workItems->size())
+    {
+        if(workItem)
+        {
+            cout << "Item " << workItem->getGlobalID() << endl;
+            getShadowWorkItem(workItem)->dump();
+        }
+        else
+        {
+            ShadowItemMap::const_iterator itr;
+            for(itr = m_workSpace.workItems->begin(); itr != m_workSpace.workItems->end(); ++itr)
+            {
+                cout << "Item " << itr->first->getGlobalID() << endl;
+                itr->second->dump();
+            }
+        }
+    }
+}
+
+// Prints every global shadow value to cout. Named values are labelled with
+// their name, unnamed ones with a running number starting at 1.
+void ShadowContext::dumpGlobalValues() const
+{
+    cout << "==== ShadowMap (global) =======" << endl;
+
+    unsigned unnamedCounter = 1;
+
+    for(const auto &entry : m_globalValues)
+    {
+        if(!entry.first->hasName())
+        {
+            cout << "%" << dec << unnamedCounter++ << ": " << entry.second << endl;
+        }
+        else
+        {
+            cout << "%" << entry.first->getName().str() << ": " << entry.second << endl;
+        }
+    }
+
+    cout << "=======================" << endl;
+}
+
+// Deletes the work-item map once it exists and has become empty, and resets
+// the pointer to NULL; otherwise does nothing.
+void ShadowContext::freeWorkItems()
+{
+    if(!m_workSpace.workItems || m_workSpace.workItems->size())
+    {
+        return;
+    }
+
+    delete m_workSpace.workItems;
+    m_workSpace.workItems = NULL;
+}
+
+// Deletes the work-group map once it exists and has become empty, and resets
+// the pointer to NULL; otherwise does nothing.
+void ShadowContext::freeWorkGroups()
+{
+    if(!m_workSpace.workGroups || m_workSpace.workGroups->size())
+    {
+        return;
+    }
+
+    delete m_workSpace.workGroups;
+    m_workSpace.workGroups = NULL;
+}
+
+// Returns a clean (all-zero) scalar shadow of `size` bytes, allocated from
+// the work space's memory pool.
+TypedValue ShadowContext::getCleanValue(unsigned size)
+{
+    TypedValue clean;
+    clean.size = size;
+    clean.num  = 1;
+    clean.data = m_workSpace.memoryPool->alloc(size);
+
+    memset(clean.data, 0, size);
+
+    return clean;
+}
+
+// Returns a clean (all-zero) shadow with the same element size and element
+// count as `v`, allocated from the work space's memory pool.
+TypedValue ShadowContext::getCleanValue(TypedValue v)
+{
+    TypedValue clean;
+    clean.size = v.size;
+    clean.num  = v.num;
+    clean.data = m_workSpace.memoryPool->alloc(v.size*v.num);
+
+    memset(clean.data, 0, v.size*v.num);
+
+    return clean;
+}
+
+// Returns a clean (all-zero) shadow shaped like the LLVM value V (element
+// size and count as reported by getValueSize), allocated from the pool.
+TypedValue ShadowContext::getCleanValue(const llvm::Value *V)
+{
+    pair<unsigned,unsigned> shape = getValueSize(V);
+
+    TypedValue clean;
+    clean.size = shape.first;
+    clean.num  = shape.second;
+    clean.data = m_workSpace.memoryPool->alloc(shape.first*shape.second);
+
+    memset(clean.data, 0, clean.size*clean.num);
+
+    return clean;
+}
+
+// Returns a clean (all-zero) single-element shadow whose size is the size of
+// LLVM type Ty (per getTypeSize), allocated from the pool.
+TypedValue ShadowContext::getCleanValue(const llvm::Type *Ty)
+{
+    unsigned bytes = getTypeSize(Ty);
+
+    TypedValue clean;
+    clean.size = bytes;
+    clean.num  = 1;
+    clean.data = m_workSpace.memoryPool->alloc(bytes);
+
+    memset(clean.data, 0, clean.size);
+
+    return clean;
+}
+
+// Returns a poisoned (all bits set) scalar shadow of `size` bytes, allocated
+// from the work space's memory pool.
+TypedValue ShadowContext::getPoisonedValue(unsigned size)
+{
+    TypedValue poisoned;
+    poisoned.size = size;
+    poisoned.num  = 1;
+    poisoned.data = m_workSpace.memoryPool->alloc(size);
+
+    memset(poisoned.data, -1, size);
+
+    return poisoned;
+}
+
+// Returns a poisoned (all bits set) shadow with the same element size and
+// element count as `v`, allocated from the work space's memory pool.
+TypedValue ShadowContext::getPoisonedValue(TypedValue v)
+{
+    TypedValue poisoned;
+    poisoned.size = v.size;
+    poisoned.num  = v.num;
+    poisoned.data = m_workSpace.memoryPool->alloc(v.size*v.num);
+
+    memset(poisoned.data, -1, v.size*v.num);
+
+    return poisoned;
+}
+
+// Returns a poisoned (all bits set) shadow shaped like the LLVM value V
+// (element size and count as reported by getValueSize), allocated from the pool.
+TypedValue ShadowContext::getPoisonedValue(const llvm::Value *V)
+{
+    pair<unsigned,unsigned> shape = getValueSize(V);
+
+    TypedValue poisoned;
+    poisoned.size = shape.first;
+    poisoned.num  = shape.second;
+    poisoned.data = m_workSpace.memoryPool->alloc(shape.first*shape.second);
+
+    memset(poisoned.data, -1, poisoned.size*poisoned.num);
+
+    return poisoned;
+}
+
+// Returns a poisoned (all bits set) single-element shadow whose size is the
+// size of LLVM type Ty (per getTypeSize), allocated from the pool.
+TypedValue ShadowContext::getPoisonedValue(const llvm::Type *Ty)
+{
+    unsigned bytes = getTypeSize(Ty);
+
+    TypedValue poisoned;
+    poisoned.size = bytes;
+    poisoned.num  = 1;
+    poisoned.data = m_workSpace.memoryPool->alloc(bytes);
+
+    memset(poisoned.data, -1, poisoned.size);
+
+    return poisoned;
+}
+
+// Returns the shadow of V as seen by `workItem`: a global shadow value takes
+// precedence; otherwise the lookup is delegated to the work-item's own
+// shadow values.
+TypedValue ShadowContext::getValue(const WorkItem *workItem, const llvm::Value *V) const
+{
+    auto global = m_globalValues.find(V);
+    if(global != m_globalValues.end())
+    {
+        return global->second;
+    }
+
+    return getShadowWorkItem(workItem)->getValues()->getValue(V);
+}
+
+// Returns true when the image shadow's address, description and format
+// parts are all clean. Checks stop at the first part that is not clean.
+bool ShadowContext::isCleanImage(const TypedValue shadowImage)
+{
+    if(!isCleanImageAddress(shadowImage))
+    {
+        return false;
+    }
+
+    if(!isCleanImageDescription(shadowImage))
+    {
+        return false;
+    }
+
+    return isCleanImageFormat(shadowImage);
+}
+
+// Interprets the shadow's data as an Image and returns true when its
+// `address` field is clean (zero).
+bool ShadowContext::isCleanImageAddress(const TypedValue shadowImage)
+{
+    return ShadowContext::isCleanValue(((Image*)shadowImage.data)->address);
+}
+
+// Interprets the shadow's data as an Image and returns true when every
+// checked field of its description is clean (zero). Fields are checked in
+// declaration order and evaluation stops at the first non-clean field.
+bool ShadowContext::isCleanImageDescription(const TypedValue shadowImage)
+{
+    Image *img = (Image*)shadowImage.data;
+    const auto &desc = img->desc;
+
+    //TODO: desc.buffer is currently not checked
+    if(!ShadowContext::isCleanValue(desc.image_type))        return false;
+    if(!ShadowContext::isCleanValue(desc.image_width))       return false;
+    if(!ShadowContext::isCleanValue(desc.image_height))      return false;
+    if(!ShadowContext::isCleanValue(desc.image_depth))       return false;
+    if(!ShadowContext::isCleanValue(desc.image_array_size))  return false;
+    if(!ShadowContext::isCleanValue(desc.image_row_pitch))   return false;
+    if(!ShadowContext::isCleanValue(desc.image_slice_pitch)) return false;
+    if(!ShadowContext::isCleanValue(desc.num_mip_levels))    return false;
+
+    return ShadowContext::isCleanValue(desc.num_samples);
+}
+
+// Interprets the shadow's data as an Image and returns true when both the
+// channel order and the channel data type of its format are clean (zero).
+bool ShadowContext::isCleanImageFormat(const TypedValue shadowImage)
+{
+    Image *img = (Image*)shadowImage.data;
+
+    if(!ShadowContext::isCleanValue(img->format.image_channel_order))
+    {
+        return false;
+    }
+
+    return ShadowContext::isCleanValue(img->format.image_channel_data_type);
+}
+
+// Returns true when the shadow of a struct of type `structTy` stored at
+// `address` in `shadowMemory` is clean.
+// Packed structs are checked as one contiguous block. Non-packed structs are
+// checked member by member at each member's offset (recursing into nested
+// structs), so bytes between members are not inspected.
+bool ShadowContext::isCleanStruct(ShadowMemory *shadowMemory, size_t address, const llvm::StructType *structTy)
+{
+    if(structTy->isPacked())
+    {
+        unsigned total = getTypeSize(structTy);
+
+        TypedValue whole;
+        whole.size = total;
+        whole.num  = 1;
+        whole.data = m_workSpace.memoryPool->alloc(total);
+
+        shadowMemory->load(whole.data, address, total);
+
+        return isCleanValue(whole);
+    }
+
+    for(unsigned member = 0; member < structTy->getStructNumElements(); ++member)
+    {
+        size_t memberOffset = getStructMemberOffset(structTy, member);
+        unsigned memberSize = getTypeSize(structTy->getElementType(member));
+
+        if(const llvm::StructType *nestedTy = llvm::dyn_cast<llvm::StructType>(structTy->getElementType(member)))
+        {
+            // Nested struct: apply the same rules recursively
+            if(!isCleanStruct(shadowMemory, address + memberOffset, nestedTy))
+            {
+                return false;
+            }
+            continue;
+        }
+
+        TypedValue field;
+        field.size = memberSize;
+        field.num  = 1;
+        field.data = m_workSpace.memoryPool->alloc(memberSize);
+
+        shadowMemory->load(field.data, address + memberOffset, memberSize);
+
+        if(!isCleanValue(field))
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// A raw integer shadow is clean exactly when no bit is set.
+bool ShadowContext::isCleanValue(unsigned long v)
+{
+    return !v;
+}
+
+// Returns true when `v` compares equal to a clean shadow of the same shape.
+// Note: this allocates a scratch clean value from the memory pool on every
+// call. NOTE(review): the result relies on TypedValue::operator==, which is
+// defined outside this view; presumably it compares size, num and data bytes.
+bool ShadowContext::isCleanValue(TypedValue v)
+{
+    return (ShadowContext::getCleanValue(v) == v);
+}
+
+// Returns true when element number `offset` of the shadow `v` is clean,
+// i.e. all v.size bytes of that element are zero.
+// The bytes are scanned in place; no scratch buffer is taken from the memory
+// pool, which matters because shadowOr() calls this once per vector element.
+bool ShadowContext::isCleanValue(TypedValue v, unsigned offset)
+{
+    const unsigned char *element = v.data + offset*v.size;
+
+    for(unsigned i = 0; i < v.size; ++i)
+    {
+        if(element[i] != 0)
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Registers SV as the global shadow of V. A value may only be registered
+// once (asserted); in builds without asserts a second call overwrites.
+void ShadowContext::setGlobalValue(const llvm::Value *V, TypedValue SV)
+{
+    assert(!m_globalValues.count(V) && "Values may only have one shadow");
+    m_globalValues[V] = SV;
+}
+
+// Merges shadow v2 into v1 element-wise: every element of v1 whose
+// counterpart in v2 is not clean becomes fully poisoned (all bits set).
+// v1 is passed by value, but the write goes through v1.data, so the
+// caller's buffer is what gets modified. Both shadows must have the same
+// element count (asserted).
+void ShadowContext::shadowOr(TypedValue v1, TypedValue v2)
+{
+    assert(v1.num == v2.num && "Cannot create shadow for vectors of different lengths!");
+
+    for(unsigned int lane = 0; lane < v1.num; ++lane)
+    {
+        if(ShadowContext::isCleanValue(v2, lane))
+        {
+            continue;
+        }
+
+        memset(v1.data + lane * v1.size, 0xff, v1.size);
+    }
+}
diff --git a/src/plugins/Uninitialized.h b/src/plugins/Uninitialized.h
new file mode 100644
index 0000000..eca4957
--- /dev/null
+++ b/src/plugins/Uninitialized.h
@@ -0,0 +1,314 @@
+// Uninitialized.h (Oclgrind)
+// Copyright (c) 2015, Moritz Pflanzer
+// Imperial College London. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/Plugin.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+//#define DUMP_SHADOW
+//#define PARANOID_CHECK(W, I) assert(checkAllOperandsDefined(W, I) && "Not all operands defined")
+//#define PARANOID_CHECK(W, I) checkAllOperandsDefined(W, I)
+#define PARANOID_CHECK(W, I) ((void)0) // no-op; swap in a line above to enable checks
+
+namespace oclgrind
+{
+    typedef std::unordered_map<const llvm::Value*, TypedValue> UnorderedTypedValueMap;
+
+    class ShadowFrame // holds a Value -> TypedValue map and one CallInst pointer
+    {
+        public:
+            ShadowFrame();
+            virtual ~ShadowFrame();
+
+            void dump() const;
+            inline const llvm::CallInst* getCall() const // plain accessor for m_call
+            {
+                return m_call;
+            }
+            TypedValue getValue(const llvm::Value *V) const;
+            inline bool hasValue(const llvm::Value* V) const
+            {
+                return llvm::isa<llvm::Constant>(V) || m_values->count(V); // constants always count as present
+            }
+            inline void setCall(const llvm::CallInst *CI) // stores CI; no ownership is taken here
+            {
+                m_call = CI;
+            }
+            void setValue(const llvm::Value *V, TypedValue SV);
+
+        private:
+            typedef std::list<const llvm::Value*> ValuesList;
+
+            const llvm::CallInst *m_call;
+            UnorderedTypedValueMap *m_values; // the map consulted by hasValue()
+#ifdef DUMP_SHADOW
+            ValuesList *m_valuesList; // member exists only when DUMP_SHADOW is defined
+#endif
+    };
+
+    class ShadowValues // stack of ShadowFrame pointers; accessors forward to the top frame
+    {
+        public:
+            ShadowValues();
+            virtual ~ShadowValues();
+
+            ShadowFrame* createCleanShadowFrame();
+            inline void dump() const // dumps the top frame only
+            {
+                m_stack->top()->dump();
+            }
+            inline const llvm::CallInst* getCall() const
+            {
+                return m_stack->top()->getCall();
+            }
+            inline TypedValue getValue(const llvm::Value *V) const
+            {
+                return m_stack->top()->getValue(V);
+            }
+            inline bool hasValue(const llvm::Value* V) const
+            {
+                return llvm::isa<llvm::Constant>(V) || m_stack->top()->hasValue(V); // only the top frame is searched
+            }
+            inline void popFrame() // removes the top frame and deletes it
+            {
+                ShadowFrame *frame = m_stack->top();
+                m_stack->pop();
+                delete frame;
+            }
+            inline void pushFrame(ShadowFrame *frame) // frame is later deleted by popFrame()
+            {
+                m_stack->push(frame);
+            }
+            inline void setCall(const llvm::CallInst *CI)
+            {
+                m_stack->top()->setCall(CI);
+            }
+            inline void setValue(const llvm::Value *V, TypedValue SV)
+            {
+                m_stack->top()->setValue(V, SV);
+            }
+
+        private:
+            typedef std::stack<ShadowFrame*> ShadowValuesStack;
+
+            ShadowValuesStack *m_stack; // no accessor above checks for an empty stack before top()
+    };
+
+    class ShadowMemory // buffers keyed by size_t in m_map, tagged with one AddressSpace
+    {
+        public:
+            struct Buffer // one entry of m_map
+            {
+                size_t size;
+                cl_mem_flags flags;
+                unsigned char *data;
+            };
+
+            ShadowMemory(AddressSpace addrSpace, unsigned bufferBits);
+            virtual ~ShadowMemory();
+
+            void allocate(size_t address, size_t size);
+            void dump() const;
+            void* getPointer(size_t address) const;
+            bool isAddressValid(size_t address, size_t size=1) const; // size defaults to a single byte
+            void load(unsigned char *dst, size_t address, size_t size=1) const; // size defaults to a single byte
+            void lock(size_t address) const;
+            void store(const unsigned char *src, size_t address, size_t size=1); // size defaults to a single byte
+            void unlock(size_t address) const;
+
+        private:
+            typedef std::unordered_map<size_t, Buffer*> MemoryMap;
+
+            AddressSpace m_addrSpace;
+            MemoryMap m_map;
+            unsigned m_numBitsAddress;
+            unsigned m_numBitsBuffer; // presumably set from the constructor's bufferBits -- TODO confirm
+
+            void clear();
+            void deallocate(size_t address);
+            size_t extractBuffer(size_t address) const;
+            size_t extractOffset(size_t address) const;
+    };
+
+    class ShadowWorkItem // pairs one ShadowMemory with one ShadowValues
+    {
+        public:
+            ShadowWorkItem(unsigned bufferBits);
+            virtual ~ShadowWorkItem();
+
+            inline void dump() const // dumps the values first, then the memory
+            {
+                m_values->dump();
+                m_memory->dump();
+            }
+            inline ShadowMemory* getPrivateMemory() // returns m_memory
+            {
+                return m_memory;
+            }
+            inline ShadowValues* getValues() const
+            {
+                return m_values;
+            }
+
+        private:
+            ShadowMemory *m_memory;
+            ShadowValues *m_values;
+    };
+
+    class ShadowWorkGroup // wraps a single ShadowMemory
+    {
+        public:
+            ShadowWorkGroup(unsigned bufferBits);
+            virtual ~ShadowWorkGroup();
+
+            inline void dump() const // forwards to the memory's dump()
+            {
+                m_memory->dump();
+            }
+            inline ShadowMemory* getLocalMemory() // returns m_memory
+            {
+                return m_memory;
+            }
+
+        private:
+            ShadowMemory *m_memory;
+    };
+
+    class ShadowContext // global shadow memory/values plus the static m_workSpace bookkeeping
+    {
+        public:
+            ShadowContext(unsigned bufferBits);
+            virtual ~ShadowContext();
+
+            void allocateWorkItems();
+            void allocateWorkGroups();
+            void clearGlobalValues();
+            void createMemoryPool();
+            ShadowWorkItem* createShadowWorkItem(const WorkItem *workItem);
+            ShadowWorkGroup* createShadowWorkGroup(const WorkGroup *workGroup);
+            void destroyMemoryPool();
+            void destroyShadowWorkItem(const WorkItem *workItem);
+            void destroyShadowWorkGroup(const WorkGroup *workGroup);
+            void dump(const WorkItem *workItem) const;
+            void dumpGlobalValues() const;
+            void freeWorkItems();
+            void freeWorkGroups();
+            static TypedValue getCleanValue(unsigned size);
+            static TypedValue getCleanValue(TypedValue v);
+            static TypedValue getCleanValue(const llvm::Type *Ty);
+            static TypedValue getCleanValue(const llvm::Value *V);
+            inline ShadowMemory* getGlobalMemory() const
+            {
+                return m_globalMemory;
+            }
+            TypedValue getGlobalValue(const llvm::Value *V) const;
+            MemoryPool* getMemoryPool() const // returns the pool stored in the static m_workSpace
+            {
+                return m_workSpace.memoryPool;
+            }
+            static TypedValue getPoisonedValue(unsigned size);
+            static TypedValue getPoisonedValue(TypedValue v);
+            static TypedValue getPoisonedValue(const llvm::Type *Ty);
+            static TypedValue getPoisonedValue(const llvm::Value *V);
+            inline ShadowWorkItem* getShadowWorkItem(const WorkItem *workItem) const
+            {
+                return m_workSpace.workItems->at(workItem); // std::map::at throws std::out_of_range for an unknown key
+            }
+            inline ShadowWorkGroup* getShadowWorkGroup(const WorkGroup *workGroup) const
+            {
+                return m_workSpace.workGroups->at(workGroup); // std::map::at throws std::out_of_range for an unknown key
+            }
+            TypedValue getValue(const WorkItem *workItem, const llvm::Value *V) const;
+            inline bool hasValue(const WorkItem *workItem, const llvm::Value* V) const // constant, global value, or known to the work-item
+            {
+                return llvm::isa<llvm::Constant>(V) || m_globalValues.count(V) || m_workSpace.workItems->at(workItem)->getValues()->hasValue(V);
+            }
+            static bool isCleanImage(const TypedValue shadowImage);
+            static bool isCleanImageAddress(const TypedValue shadowImage);
+            static bool isCleanImageDescription(const TypedValue shadowImage);
+            static bool isCleanImageFormat(const TypedValue shadowImage);
+            static bool isCleanStruct(ShadowMemory *shadowMemory, size_t address, const llvm::StructType *structTy);
+            static bool isCleanValue(unsigned long v);
+            static bool isCleanValue(TypedValue v);
+            static bool isCleanValue(TypedValue v, unsigned offset); // offset is an element index, scaled by v.size
+            void setGlobalValue(const llvm::Value *V, TypedValue SV);
+            static void shadowOr(TypedValue v1, TypedValue v2);
+
+        private:
+            ShadowMemory *m_globalMemory;
+            UnorderedTypedValueMap m_globalValues;
+            unsigned m_numBitsBuffer;
+            typedef std::map<const WorkItem*, ShadowWorkItem*> ShadowItemMap;
+            typedef std::map<const WorkGroup*, ShadowWorkGroup*> ShadowGroupMap;
+            struct WorkSpace // the maps and the pool are held by pointer
+            {
+                ShadowItemMap *workItems;
+                ShadowGroupMap *workGroups;
+                MemoryPool *memoryPool;
+                unsigned poolUsers;
+            };
+            static THREAD_LOCAL WorkSpace m_workSpace; // static member, so shared by all ShadowContext instances; THREAD_LOCAL is defined elsewhere
+    };
+
+    class Uninitialized : public Plugin // overrides the Plugin callbacks marked 'override' below
+    {
+        public:
+            Uninitialized(const Context *context);
+            virtual ~Uninitialized();
+
+            virtual void hostMemoryStore(const Memory *memory,
+                    size_t address, size_t size,
+                    const uint8_t *storeData) override;
+            virtual void instructionExecuted(const WorkItem *workItem,
+                    const llvm::Instruction *instruction,
+                    const TypedValue& result) override;
+            virtual void kernelBegin(const KernelInvocation *kernelInvocation) override;
+            virtual void kernelEnd(const KernelInvocation *kernelInvocation) override;
+            virtual void memoryMap(const Memory *memory, size_t address,
+                    size_t offset, size_t size, cl_map_flags flags) override;
+            virtual void workItemBegin(const WorkItem *workItem) override;
+            virtual void workItemComplete(const WorkItem *workItem) override;
+            virtual void workGroupBegin(const WorkGroup *workGroup) override;
+            virtual void workGroupComplete(const WorkGroup *workGroup) override;
+            //virtual void memoryAllocated(const Memory *memory, size_t address,
+            //                             size_t size, cl_mem_flags flags,
+            //                             const uint8_t *initData);
+        private:
+            std::list<std::pair<const llvm::Value*, TypedValue> > m_deferredInit;
+            std::list<std::pair<const llvm::Value*, TypedValue> > m_deferredInitGroup;
+            ShadowContext shadowContext; // held by value; note the missing m_ prefix used by the other members
+            MemoryPool m_pool;
+
+            void allocAndStoreShadowMemory(unsigned addrSpace, size_t address, TypedValue SM,
+                                           const WorkItem *workItem = NULL, const WorkGroup *workGroup = NULL, bool unchecked = false);
+            bool checkAllOperandsDefined(const WorkItem *workItem, const llvm::Instruction *I); // used by the commented-out PARANOID_CHECK variants
+            void checkStructMemcpy(const WorkItem *workItem, const llvm::Value *src);
+            void copyShadowMemory(unsigned dstAddrSpace, size_t dst,
+                                  unsigned srcAddrSpace, size_t src, unsigned size,
+                                  const WorkItem *workItem = NULL, const WorkGroup *workGroup = NULL, bool unchecked = false);
+            void copyShadowMemoryStrided(unsigned dstAddrSpace, size_t dst,
+                                         unsigned srcAddrSpace, size_t src,
+                                         size_t num, size_t stride, unsigned size,
+                                         const WorkItem *workItem = NULL, const WorkGroup *workGroup = NULL, bool unchecked = false);
+            static std::string extractUnmangledName(const std::string fullname); // takes its argument by value
+            ShadowMemory* getShadowMemory(unsigned addrSpace, const WorkItem *workItem = NULL, const WorkGroup *workGroup = NULL) const;
+            bool handleBuiltinFunction(const WorkItem *workItem, std::string name, const llvm::CallInst *CI, const TypedValue result);
+            void handleIntrinsicInstruction(const WorkItem *workItem, const llvm::IntrinsicInst *I);
+            void loadShadowMemory(unsigned addrSpace, size_t address, TypedValue &SM,
+                                  const WorkItem *workItem = NULL, const WorkGroup *workGroup = NULL); // SM is an output: passed by non-const reference
+            void logUninitializedAddress(unsigned int addrSpace, size_t address, bool write = true) const;
+            void logUninitializedCF() const;
+            void logUninitializedIndex() const;
+            void logUninitializedWrite(unsigned int addrSpace, size_t address) const;
+            void SimpleOr(const WorkItem *workItem, const llvm::Instruction *I);
+            void SimpleOrAtomic(const WorkItem *workItem, const llvm::CallInst *CI);
+            void storeShadowMemory(unsigned addrSpace, size_t address, TypedValue SM,
+                                   const WorkItem *workItem = NULL, const WorkGroup *workGroup = NULL, bool unchecked = false);
+            void VectorOr(const WorkItem *workItem, const llvm::Instruction *I);
+    };
+}
diff --git a/src/runtime/async_queue.cpp b/src/runtime/async_queue.cpp
index cc5f41c..28789df 100644
--- a/src/runtime/async_queue.cpp
+++ b/src/runtime/async_queue.cpp
@@ -1,5 +1,5 @@
 // async_queue.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/runtime/async_queue.h b/src/runtime/async_queue.h
index 5ff4f4a..39bdb50 100644
--- a/src/runtime/async_queue.h
+++ b/src/runtime/async_queue.h
@@ -1,5 +1,5 @@
 // async_queue.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/runtime/icd.h b/src/runtime/icd.h
index 7059cf9..6a2d207 100644
--- a/src/runtime/icd.h
+++ b/src/runtime/icd.h
@@ -1,5 +1,5 @@
 // icd.h (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
diff --git a/src/runtime/oclgrind b/src/runtime/oclgrind
deleted file mode 100755
index 4925be4..0000000
--- a/src/runtime/oclgrind
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/bin/bash
-# oclgrind (Oclgrind)
-# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
-# University of Bristol. All rights reserved.
-#
-# This program is provided under a three-clause BSD license. For full
-# license terms please see the LICENSE file distributed with this
-# source code.
-
-function usage
-{
-  echo "Usage: "
-  echo "  oclgrind [OPTIONS] COMMAND"
-  echo "  oclgrind [--help | --version]"
-  echo
-  echo "Options:"
-  echo -n "     --build-options  OPTIONS  "
-  echo          "Additional options to pass to the OpenCL compiler"
-  echo -n "     --check-api               "
-  echo          "Reports errors on API calls"
-  echo -n "     --data-races              "
-  echo          "Enable data-race detection"
-  echo -n "     --disable-pch             "
-  echo          "Don't use precompiled headers"
-  echo -n "     --dump-spir               "
-  echo          "Dump SPIR to /tmp/oclgrind_*.{ll,bc}"
-  echo -n "  -h --help                    "
-  echo          "Display usage information"
-  echo -n "     --inst-counts             "
-  echo          "Output histograms of instructions executed"
-  echo -n "  -i --interactive             "
-  echo          "Enable interactive mode"
-  echo -n "     --log            LOGFILE  "
-  echo          "Redirect log/error messages to a file"
-  echo -n "     --max-errors     NUM      "
-  echo          "Limit the number of error/warning messages"
-  echo -n "     --num-threads    NUM      "
-  echo          "Set the number of worker threads to use"
-  echo -n "     --pch-dir        DIR      "
-  echo          "Override directory containing precompiled headers"
-  echo -n "     --plugins        PLUGINS  "
-  echo          "Load colon seperated list of plugin libraries"
-  echo -n "  -q --quick                   "
-  echo          "Only run first and last work-group"
-  echo -n "     --uniform-writes          "
-  echo          "Don't suppress uniform write-write data-races"
-  echo -n "  -v --version                 "
-  echo          "Display version information"
-  echo
-  echo "For more information, please visit the Oclgrind wiki page:"
-  echo "-> https://github.com/jrprice/Oclgrind/wiki"
-  echo
-}
-
-# Parse arguments
-while [ $# -gt 0 -a "${1:0:1}" == "-" ]
-do
-  if [ "$1" == "--build-options" ]
-  then
-    shift
-    export OCLGRIND_BUILD_OPTIONS="$1"
-  elif [ "$1" == "--check-api" ]
-  then
-    export OCLGRIND_CHECK_API=1
-  elif [ "$1" == "--data-races" ]
-  then
-    export OCLGRIND_DATA_RACES=1
-  elif [ "$1" == "--disable-pch" ]
-  then
-    export OCLGRIND_DISABLE_PCH=1
-  elif [ "$1" == "--dump-spir" ]
-  then
-    export OCLGRIND_DUMP_SPIR=1
-  elif [ "$1" == "-h" -o "$1" == "--help" ]
-  then
-    usage
-    exit 0
-  elif [ "$1" == "--inst-counts" ]
-  then
-    export OCLGRIND_INST_COUNTS=1
-  elif [ "$1" == "-i" -o "$1" == "--interactive" ]
-  then
-    export OCLGRIND_INTERACTIVE=1
-  elif [ "$1" == "--log" ]
-  then
-    shift
-    export OCLGRIND_LOG="$1"
-  elif [ "$1" == "--max-errors" ]
-  then
-    shift
-    export OCLGRIND_MAX_ERRORS="$1"
-  elif [ "$1" == "--num-threads" ]
-  then
-    shift
-    export OCLGRIND_NUM_THREADS="$1"
-  elif [ "$1" == "--pch-dir" ]
-  then
-    shift
-    export OCLGRIND_PCH_DIR="$1"
-  elif [ "$1" == "--plugins" ]
-  then
-    shift
-    export OCLGRIND_PLUGINS="$1"
-  elif [ "$1" == "-q" -o "$1" == "--quick" ]
-  then
-    export OCLGRIND_QUICK=1
-  elif [ "$1" == "--uniform-writes" ]
-  then
-    export OCLGRIND_UNIFORM_WRITES=1
-  elif [ "$1" == "-v" -o "$1" == "--version" ]
-  then
-    echo
-    echo "Oclgrind __VERSION__"
-    echo
-    echo "Copyright (c) 2013-2015"
-    echo "James Price and Simon McIntosh-Smith, University of Bristol"
-    echo "https://github.com/jrprice/Oclgrind"
-    echo
-    exit 0
-  else
-    echo "Unrecognized argument '$1'"
-    usage
-    exit 1
-  fi
-  shift
-done
-
-# Ensure target command supplied
-if [ $# -lt 1 ]
-then
-  usage
-  exit 1
-fi
-
-# Inject liboclgrind.{so,dylib} and run command
-LIBDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/../lib"
-if [ "$(uname -s)" == "Darwin" ]
-then
-  DYLD_LIBRARY_PATH=$LIBDIR:$DYLD_LIBRARY_PATH \
-    DYLD_INSERT_LIBRARIES=$LIBDIR/liboclgrind-rt.dylib \
-    DYLD_FORCE_FLAT_NAMESPACE=1 "$@"
-else
-  LD_LIBRARY_PATH=$LIBDIR:$LD_LIBRARY_PATH \
-    LD_PRELOAD=$LIBDIR/liboclgrind-rt.so "$@"
-fi
diff --git a/src/runtime/oclgrind.cpp b/src/runtime/oclgrind.cpp
new file mode 100644
index 0000000..e547bb2
--- /dev/null
+++ b/src/runtime/oclgrind.cpp
@@ -0,0 +1,483 @@
+// oclgrind.cpp (Oclgrind)
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "config.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+#include <windows.h>
+#else
+#include <limits.h>
+#include <unistd.h>
+#ifdef __APPLE__
+#include <mach-o/dyld.h>
+#endif
+#endif
+
+using namespace std;
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+
+static string appCmd;
+static void checkWow64(HANDLE parent, HANDLE child);
+static void die(const char *op);
+
+#else // not Windows
+
+static char **appArgs = NULL;
+#ifdef __APPLE__
+#define LIB_EXTENSION "dylib"
+#define LD_LIBRARY_PATH_ENV "DYLD_LIBRARY_PATH"
+#define LD_PRELOAD_ENV "DYLD_INSERT_LIBRARIES"
+#else
+#define LIB_EXTENSION "so"
+#define LD_LIBRARY_PATH_ENV "LD_LIBRARY_PATH"
+#define LD_PRELOAD_ENV "LD_PRELOAD"
+#endif
+
+#endif
+
+static string getLibDirPath();
+static bool parseArguments(int argc, char *argv[]);
+static void printUsage();
+static void setEnvironment(const char *name, const char *value);
+
+int main(int argc, char *argv[])
+{
+  // Parse arguments (sets OCLGRIND_* variables and records the target command)
+  if (!parseArguments(argc, argv))
+  {
+    return 1;
+  }
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+
+  // Get full path to oclgrind-rt.dll
+  string dllpath = getLibDirPath();
+  dllpath += "\\oclgrind-rt.dll";
+
+
+  PROCESS_INFORMATION pinfo = { 0 };
+  STARTUPINFOA sinfo = { 0 };
+  sinfo.cb = sizeof(sinfo);
+
+  // Create child process in suspended state
+  if (!CreateProcessA(NULL, (LPSTR)appCmd.c_str(), NULL, NULL, FALSE,
+                      CREATE_SUSPENDED, NULL, NULL, &sinfo, &pinfo))
+    die("creating child process");
+
+  // Check that we are running as 64-bit if and only if we need to be
+  checkWow64(GetCurrentProcess(), pinfo.hProcess);
+
+  // Allocate memory for DLL path
+  void *childPath = VirtualAllocEx(pinfo.hProcess, NULL, dllpath.size()+1,
+                                   MEM_COMMIT, PAGE_READWRITE);
+  if (!childPath)
+    die("allocating child memory");
+
+  // Write DLL path to child
+  if (!WriteProcessMemory(pinfo.hProcess, childPath,
+                          (void*)dllpath.c_str(), dllpath.size()+1, NULL))
+    die("writing child memory");
+
+  // Create thread to load DLL in child process
+  HANDLE childThread =
+    CreateRemoteThread(pinfo.hProcess, NULL, 0,
+                       (LPTHREAD_START_ROUTINE)GetProcAddress(
+                         GetModuleHandleA("kernel32.dll"), "LoadLibraryA"),
+                       childPath, 0, NULL);
+  if (!childThread)
+    die("loading DLL in child thread");
+
+  // Wait for child thread to complete
+  if (WaitForSingleObject(childThread, INFINITE) != WAIT_OBJECT_0)
+    die("waiting for load thread");
+
+  CloseHandle(childThread);
+  VirtualFreeEx(pinfo.hProcess, childPath, dllpath.size()+1, MEM_RELEASE);
+
+
+  // Load DLL in this process as well to get function pointers
+  HMODULE dll = LoadLibraryA(dllpath.c_str());
+  if (!dll)
+    die("loading DLL");
+
+  // Get handle to initOclgrind function in DLL
+  HANDLE initFunction = GetProcAddress(dll, "initOclgrind");
+  if (!initFunction)
+    die("getting init function address");
+
+  // Launch init function in child process
+  childThread = CreateRemoteThread(pinfo.hProcess, NULL, 0,
+                                   (LPTHREAD_START_ROUTINE)initFunction,
+                                   NULL, 0, NULL);
+  if (!childThread)
+    die("launching init in child thread");
+
+  // Wait for init to finish
+  if (WaitForSingleObject(childThread, INFINITE) != WAIT_OBJECT_0)
+    die("waiting for init thread");
+
+
+  // Check return value: a zero exit code from the init thread means failure
+  DWORD retval = 0;
+  if (!GetExitCodeThread(childThread, &retval))
+    die("getting init exit code");
+  if (!retval)
+  {
+    cerr << "[Oclgrind] initialization failed: " << retval << endl;
+    exit(1); // retval is 0 here, so exit(retval) would report success
+  }
+
+  CloseHandle(childThread);
+
+  // Resume child process
+  if (ResumeThread(pinfo.hThread) == -1)
+    die("resuming thread");
+
+  // Wait for child process to finish
+  if (WaitForSingleObject(pinfo.hProcess, INFINITE) != WAIT_OBJECT_0)
+    die("waiting for child process failed");
+
+  // Get return code and forward it
+  if (!GetExitCodeProcess(pinfo.hProcess, &retval))
+    die("getting child process exit code");
+
+  return retval;
+
+#else // not Windows
+
+  // Get path to Oclgrind library directory
+  string libdir = getLibDirPath();
+
+  // Construct new LD_LIBRARY_PATH
+  string ldLibraryPath = libdir;
+  const char *oldLdLibraryPath = getenv(LD_LIBRARY_PATH_ENV);
+  if (oldLdLibraryPath)
+  {
+    ldLibraryPath += ":";
+    ldLibraryPath += oldLdLibraryPath;
+  }
+
+  // Add oclgrind-rt library to LD_PRELOAD
+  string ldPreload = libdir;
+  ldPreload += "/liboclgrind-rt.";
+  ldPreload += LIB_EXTENSION;
+  const char *oldLdPreload = getenv(LD_PRELOAD_ENV);
+  if (oldLdPreload)
+  {
+    ldPreload += ":";
+    ldPreload += oldLdPreload;
+  }
+
+  setEnvironment(LD_LIBRARY_PATH_ENV, ldLibraryPath.c_str());
+  setEnvironment(LD_PRELOAD_ENV, ldPreload.c_str());
+#ifdef __APPLE__
+  setEnvironment("DYLD_FORCE_FLAT_NAMESPACE", "1");
+#endif
+
+  // Launch target application
+  if (execvp(appArgs[0], appArgs) == -1)
+  {
+    cerr << "[Oclgrind] Failed to launch target application" << endl;
+    exit(1);
+  }
+
+#endif
+}
+
+static bool parseArguments(int argc, char *argv[])
+{
+  // Maps each leading option to an OCLGRIND_* environment variable and
+  // records the target command. Returns false after reporting a bad option,
+  // a missing option value or a missing command; -h and -v exit(0) directly
+  for (int i = 1; i < argc; i++)
+  {
+    if (!strcmp(argv[i], "--build-options"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --build-options" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_BUILD_OPTIONS", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--check-api"))
+    {
+      setEnvironment("OCLGRIND_CHECK_API", "1");
+    }
+    else if (!strcmp(argv[i], "--data-races"))
+    {
+      setEnvironment("OCLGRIND_DATA_RACES", "1");
+    }
+    else if (!strcmp(argv[i], "--disable-pch"))
+    {
+      setEnvironment("OCLGRIND_DISABLE_PCH", "1");
+    }
+    else if (!strcmp(argv[i], "--dump-spir"))
+    {
+      setEnvironment("OCLGRIND_DUMP_SPIR", "1");
+    }
+    else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help"))
+    {
+      printUsage();
+      exit(0);
+    }
+    else if (!strcmp(argv[i], "--inst-counts"))
+    {
+      setEnvironment("OCLGRIND_INST_COUNTS", "1");
+    }
+    else if (!strcmp(argv[i], "-i") || !strcmp(argv[i], "--interactive"))
+    {
+      setEnvironment("OCLGRIND_INTERACTIVE", "1");
+    }
+    else if (!strcmp(argv[i], "--log"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --log" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_LOG", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--max-errors"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --max-errors" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_MAX_ERRORS", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--num-threads"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --num-threads" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_NUM_THREADS", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--pch-dir"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --pch-dir" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_PCH_DIR", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--plugins"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --plugins" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_PLUGINS", argv[i]);
+    }
+    else if (!strcmp(argv[i], "-q") || !strcmp(argv[i], "--quick"))
+    {
+      setEnvironment("OCLGRIND_QUICK", "1");
+    }
+    else if (!strcmp(argv[i], "--uniform-writes"))
+    {
+      setEnvironment("OCLGRIND_UNIFORM_WRITES", "1");
+    }
+    else if (!strcmp(argv[i], "--uninitialized"))
+    {
+      setEnvironment("OCLGRIND_UNINITIALIZED", "1");
+    }
+    else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version"))
+    {
+      cout << endl;
+      cout << "Oclgrind " PACKAGE_VERSION << endl;
+      cout << endl;
+      cout << "Copyright (c) 2013-2016" << endl;
+      cout << "James Price and Simon McIntosh-Smith, University of Bristol"
+           << endl;
+      cout << "https://github.com/jrprice/Oclgrind" << endl;
+      cout << endl;
+      exit(0);
+    }
+    else if (argv[i][0] == '-')
+    {
+      cerr << "Unrecognised option '" << argv[i] << "'" << endl;
+      return false;
+    }
+    else
+    {
+#if defined(_WIN32) && !defined(__MINGW32__)
+      // Build command-line for target application
+      for (; i < argc; i++)
+      {
+        appCmd += argv[i];
+        appCmd += " ";
+      }
+#else // not Windows
+      // argv is NULL-terminated (argv[argc] == NULL), so its tail from the
+      // first non-option argument onwards already is the vector execvp()
+      // needs; this replaces an unchecked, never-freed malloc'd copy
+      appArgs = &argv[i];
+#endif
+      break;
+    }
+  }
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+  if (appCmd.size() == 0)
+#else
+  if (!appArgs)
+#endif
+  {
+    printUsage();
+    return false;
+  }
+
+  return true;
+}
+
+static string getLibDirPath()
+{
+  // Returns "<dir two levels above this executable>/lib" LIBDIR_SUFFIX
+  string libdir;
+
+  // Get full path to executable
+#if defined(_WIN32) && !defined(__MINGW32__)
+  char path[MAX_PATH];
+  DWORD len = GetModuleFileNameA(GetModuleHandle(NULL), path, MAX_PATH);
+  if (len == 0 || len >= MAX_PATH) // call failed, or the path was truncated
+    die("getting path to Oclgrind installation");
+  libdir = path;
+#else
+  char exepath[PATH_MAX] = {0}; // zero-filled: readlink() does not terminate
+  char path[PATH_MAX];
+#if defined(__APPLE__)
+  uint32_t sz = PATH_MAX;
+  if (_NSGetExecutablePath(exepath, &sz) ||
+#else // not apple
+  if (readlink("/proc/self/exe", exepath, PATH_MAX-1) == -1 ||
+#endif
+      // Resolve symbolic links and normalise path
+      !realpath(exepath, path))
+  {
+    cerr << "[Oclgrind] Unable to get path to Oclgrind installation" << endl;
+    exit(1);
+  }
+  libdir = path;
+#endif
+
+  // Remove executable filename and containing directory
+  size_t slash;
+  for (int i = 0; i < 2; i++)
+  {
+#if defined(_WIN32) && !defined(__MINGW32__)
+    if ((slash = libdir.find_last_of('\\')) == string::npos)
+#else
+    if ((slash = libdir.find_last_of('/')) == string::npos)
+#endif
+    {
+      cerr << "[Oclgrind] Failed to get path to library directory" << endl;
+      exit(1); // resize(npos) below would throw std::length_error
+    }
+    libdir.resize(slash);
+  }
+
+  // Append library directory
+  return libdir + "/lib" LIBDIR_SUFFIX;
+}
+
+static void printUsage() // writes the option summary to cout; keep in step with parseArguments()
+{
+  cout
+    << "Usage: oclgrind [OPTIONS] COMMAND" << endl
+    << "       oclgrind [--help | --version]" << endl
+    << endl
+    << "Options:" << endl
+    << "     --build-options  OPTIONS  "
+             "Additional options to pass to the OpenCL compiler" << endl
+    << "     --check-api               "
+             "Report errors on API calls" << endl
+    << "     --data-races              "
+             "Enable data-race detection" << endl
+    << "     --disable-pch             "
+             "Don't use precompiled headers" << endl
+    << "     --dump-spir               "
+             "Dump SPIR to /tmp/oclgrind_*.{ll,bc}" << endl
+    << "  -h --help                    "
+             "Display usage information" << endl
+    << "     --inst-counts             "
+             "Output histograms of instructions executed" << endl
+    << "  -i --interactive             "
+             "Enable interactive mode" << endl
+    << "     --log            LOGFILE  "
+             "Redirect log/error messages to a file" << endl
+    << "     --max-errors     NUM      "
+             "Limit the number of error/warning messages" << endl
+    << "     --num-threads    NUM      "
+             "Set the number of worker threads to use" << endl
+    << "     --pch-dir        DIR      "
+             "Override directory containing precompiled headers" << endl
+    << "     --plugins        PLUGINS  "
+             "Load colon separated list of plugin libraries" << endl
+    << "  -q --quick                   "
+             "Only run first and last work-group" << endl
+    << "     --uniform-writes          "
+             "Don't suppress uniform write-write data-races" << endl
+    << "     --uninitialized           "
+             "Report usage of uninitialized values" << endl
+    << "  -v --version                 "
+             "Display version information" << endl
+    << endl
+    << "For more information, please visit the Oclgrind wiki page:" << endl
+    << "-> https://github.com/jrprice/Oclgrind/wiki" << endl
+    << endl;
+}
+
+static void setEnvironment(const char *name, const char *value)
+{
+#if !defined(_WIN32) || defined(__MINGW32__)
+  setenv(name, value, 1); // third argument: replace any existing value
+#else
+  _putenv_s(name, value);
+#endif
+}
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+
+void checkWow64(HANDLE parent, HANDLE child)
+{
+  BOOL parentWow64 = FALSE, childWow64 = FALSE; // stay defined if a query fails
+  IsWow64Process(parent, &parentWow64);
+  IsWow64Process(child, &childWow64);
+  if (parentWow64 != childWow64) // exactly one of the two runs under WOW64
+  {
+    const char *bits = childWow64 ? "32" : "64";
+    cerr << "[Oclgrind] target application is " << bits << "-bit" << endl
+         << "Use the " << bits << "-bit version of oclgrind.exe"  << endl;
+    exit(1);
+  }
+}
+
+void die(const char *op)
+{
+  DWORD err = GetLastError(); // read first, before any other call can reset it
+  char buffer[1024] = "(no description available)"; // shown if formatting fails
+  FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+    NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+    buffer, 1024, NULL);
+  cerr << "[Oclgrind] Error while '" << op << "':" << endl
+       << buffer << endl;
+  exit(1);
+}
+
+#endif
diff --git a/src/runtime/runtime.cpp b/src/runtime/runtime.cpp
index 1cf7338..55759c5 100644
--- a/src/runtime/runtime.cpp
+++ b/src/runtime/runtime.cpp
@@ -1,11 +1,13 @@
 // runtime.cpp (Oclgrind)
-// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 // University of Bristol. All rights reserved.
 //
 // This program is provided under a three-clause BSD license. For full
 // license terms please see the LICENSE file distributed with this
 // source code.
 
+#include "config.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstring>
@@ -27,7 +29,7 @@ using namespace std;
 #define MAX_GLOBAL_MEM_SIZE      (128 * 1048576)
 #define MAX_CONSTANT_BUFFER_SIZE (1048576)
 #define MAX_LOCAL_MEM_SIZE       (32768)
-#define MAX_WI_SIZE              (65536)
+#define MAX_WI_SIZE              (1024)
 
 #define PLATFORM_NAME       "Oclgrind"
 #define PLATFORM_VENDOR     "University of Bristol"
@@ -53,6 +55,10 @@ using namespace std;
   cl_khr_local_int32_extended_atomics  \
   cl_khr_byte_addressable_store        \
   cl_khr_fp64"
+#define DEVICE_TYPE (CL_DEVICE_TYPE_CPU | \
+                     CL_DEVICE_TYPE_GPU | \
+                     CL_DEVICE_TYPE_ACCELERATOR | \
+                     CL_DEVICE_TYPE_DEFAULT)
 
 
 namespace
@@ -347,9 +353,7 @@ clGetDeviceIDs
     ReturnError(NULL, CL_INVALID_VALUE);
   }
 
-  if (device_type != CL_DEVICE_TYPE_CPU &&
-      device_type != CL_DEVICE_TYPE_DEFAULT &&
-      device_type != CL_DEVICE_TYPE_ALL)
+  if (!(device_type & DEVICE_TYPE))
   {
     ReturnError(NULL, CL_DEVICE_NOT_FOUND);
   }
@@ -411,7 +415,7 @@ clGetDeviceInfo
   {
   case CL_DEVICE_TYPE:
     result_size = sizeof(cl_device_type);
-    result_data.cldevicetype = CL_DEVICE_TYPE_CPU;
+    result_data.cldevicetype = DEVICE_TYPE;
     break;
   case CL_DEVICE_VENDOR_ID:
     result_size = sizeof(cl_uint);
@@ -682,8 +686,10 @@ clGetDeviceInfo
     }
     else
     {
-      const void* src = str ? (const void*)str : (const void*)&result_data;
-      memcpy(param_value, src, result_size);
+      if (str)
+        memcpy(param_value, str, result_size);
+      else
+        memcpy(param_value, &result_data, result_size);
     }
   }
 
@@ -805,9 +811,7 @@ clCreateContextFromType
                  "pfn_notify NULL but user_data non-NULL");
     return NULL;
   }
-  if (device_type != CL_DEVICE_TYPE_CPU &&
-      device_type != CL_DEVICE_TYPE_DEFAULT &&
-      device_type != CL_DEVICE_TYPE_ALL)
+  if (!(device_type & DEVICE_TYPE))
   {
     SetErrorArg(NULL, CL_DEVICE_NOT_FOUND, device_type);
     return NULL;
@@ -1634,7 +1638,7 @@ clGetSupportedImageFormats
     sizeof(ordersAll)        / sizeof(cl_channel_order),
     sizeof(ordersNormalized) / sizeof(cl_channel_order),
     sizeof(ordersByte)       / sizeof(cl_channel_order),
-    //sizeof(ordersPacked)     / sizeof(cl_channel_order),
+    sizeof(ordersPacked)     / sizeof(cl_channel_order),
   };
 
   // Channel types
@@ -1670,7 +1674,7 @@ clGetSupportedImageFormats
     sizeof(typesAll)        / sizeof(cl_channel_order),
     sizeof(typesNormalized) / sizeof(cl_channel_order),
     sizeof(typesByte)       / sizeof(cl_channel_order),
-    //sizeof(typesPacked)     / sizeof(cl_channel_order),
+    sizeof(typesPacked)     / sizeof(cl_channel_order),
   };
 
   // Calculate total number of formats
@@ -2490,9 +2494,6 @@ clGetProgramInfo
   size_t *         param_value_size_ret
 ) CL_API_SUFFIX__VERSION_1_0
 {
-  size_t result_size = 0;
-  void *result_data = NULL;
-
   // Check program is valid
   if (!program)
   {
@@ -2506,100 +2507,92 @@ clGetProgramInfo
                     "Program not successfully built");
   }
 
+  size_t dummy;
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union
+  {
+    cl_uint cluint;
+    cl_device_id device;
+    cl_context context;
+    size_t sizet;
+  } result_data;
+  const char* str = 0;
+  string kernelNames;
+
   switch (param_name)
   {
   case CL_PROGRAM_REFERENCE_COUNT:
     result_size = sizeof(cl_uint);
-    result_data = malloc(result_size);
-    *(cl_uint*)result_data = program->refCount;
+    result_data.cluint = program->refCount;
     break;
   case CL_PROGRAM_CONTEXT:
     result_size = sizeof(cl_context);
-    result_data = malloc(result_size);
-    *(cl_context*)result_data = program->context;
+    result_data.context = program->context;
     break;
   case CL_PROGRAM_NUM_DEVICES:
     result_size = sizeof(cl_uint);
-    result_data = malloc(result_size);
-    *(cl_uint*)result_data = 1;
+    result_data.cluint = 1;
     break;
   case CL_PROGRAM_DEVICES:
     result_size = sizeof(cl_device_id);
-    result_data = malloc(result_size);
-    *(cl_device_id*)result_data = m_device;
+    result_data.device = m_device;
     break;
   case CL_PROGRAM_SOURCE:
-    result_size = strlen(program->program->getSource().c_str()) + 1;
-    result_data = malloc(result_size);
-    strcpy((char*)result_data, program->program->getSource().c_str());
+    str = program->program->getSource().c_str();
+    result_size = strlen(str) + 1;
     break;
   case CL_PROGRAM_BINARY_SIZES:
     result_size = sizeof(size_t);
-    result_data = malloc(result_size);
-    *(size_t*)result_data = program->program->getBinarySize();
+    result_data.sizet = program->program->getBinarySize();
     break;
   case CL_PROGRAM_BINARIES:
     result_size = sizeof(unsigned char*);
-    result_data = program->program->getBinary();
     break;
   case CL_PROGRAM_NUM_KERNELS:
     result_size = sizeof(size_t);
-    result_data = malloc(result_size);
-    *(size_t*)result_data = program->program->getNumKernels();
+    result_data.sizet = program->program->getNumKernels();
     break;
   case CL_PROGRAM_KERNEL_NAMES:
   {
     list<string> names = program->program->getKernelNames();
-    string ret;
     for (list<string>::iterator itr = names.begin(); itr != names.end(); itr++)
     {
-      ret += *itr;
-      ret += ";";
+      kernelNames += *itr;
+      kernelNames += ";";
     }
-    if (!ret.empty())
+    if (!kernelNames.empty())
     {
-      ret.erase(ret.length()-1);
+      kernelNames.erase(kernelNames.length()-1);
     }
-    result_size = strlen(ret.c_str()) + 1;
-    result_data = malloc(result_size);
-    strcpy((char*)result_data, ret.c_str());
+    str = kernelNames.c_str();
+    result_size = strlen(str) + 1;
     break;
   }
   default:
     ReturnErrorArg(program->context, CL_INVALID_VALUE, param_name);
   }
 
-  cl_int return_value = CL_SUCCESS;
   if (param_value)
   {
-    if (param_name == CL_PROGRAM_BINARIES)
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(NULL, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+    }
+    else if (param_name == CL_PROGRAM_BINARIES)
     {
-      memcpy(((unsigned char**)param_value)[0],
-             result_data, program->program->getBinarySize());
+      program->program->getBinary(((unsigned char**)param_value)[0]);
     }
     else
     {
-      // Check destination is large enough
-      if (param_value_size < result_size)
-      {
-        // TODO: Use API error reporting mechanism
-        return_value = CL_INVALID_VALUE;
-      }
+      if (str)
+        memcpy(param_value, str, result_size);
       else
-      {
-        memcpy(param_value, result_data, result_size);
-      }
+        memcpy(param_value, &result_data, result_size);
     }
   }
 
-  if (param_value_size_ret)
-  {
-    *param_value_size_ret = result_size;
-  }
-
-  free(result_data);
-
-  return return_value;
+  return CL_SUCCESS;
 }
 
 CL_API_ENTRY cl_int CL_API_CALL
@@ -4186,7 +4179,8 @@ clEnqueueFillImage
       ((float*)color)[output] = ((float*)fill_color)[input];
       break;
     case CL_HALF_FLOAT:
-      ((uint16_t*)color)[output] = floatToHalf(((float*)fill_color)[input]);
+      ((uint16_t*)color)[output] =
+        oclgrind::floatToHalf(((float*)fill_color)[input]);
       break;
     case CL_SIGNED_INT8:
       ((int8_t*)color)[output] = ((int32_t*)fill_color)[input];
@@ -4568,7 +4562,11 @@ clEnqueueMapBuffer
   }
 
   // Enqueue command
-  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  oclgrind::Queue::MapCommand *cmd = new oclgrind::Queue::MapCommand();
+  cmd->address = buffer->address;
+  cmd->offset  = offset;
+  cmd->size    = cb;
+  cmd->flags   = map_flags;
   asyncQueueRetain(cmd, buffer);
   asyncEnqueue(command_queue, CL_COMMAND_MAP_BUFFER, cmd,
                num_events_in_wait_list, event_wait_list, event);
@@ -4697,7 +4695,11 @@ clEnqueueMapImage
   }
 
   // Enqueue command
-  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  oclgrind::Queue::MapCommand *cmd = new oclgrind::Queue::MapCommand();
+  cmd->address = image->address;
+  cmd->offset  = offset;
+  cmd->size    = size;
+  cmd->flags   = map_flags;
   asyncQueueRetain(cmd, image);
   asyncEnqueue(command_queue, CL_COMMAND_MAP_IMAGE, cmd,
                num_events_in_wait_list, event_wait_list, event);
@@ -4733,7 +4735,9 @@ clEnqueueUnmapMemObject
   }
 
   // Enqueue command
-  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  oclgrind::Queue::UnmapCommand *cmd = new oclgrind::Queue::UnmapCommand();
+  cmd->address = memobj->address;
+  cmd->ptr     = mapped_ptr;
   asyncQueueRetain(cmd, memobj);
   asyncEnqueue(command_queue, CL_COMMAND_UNMAP_MEM_OBJECT, cmd,
                num_events_in_wait_list, event_wait_list, event);
@@ -4799,6 +4803,8 @@ clEnqueueNDRangeKernel
   }
 
   // Check global and local sizes are valid
+  size_t reqdWorkGroupSize[3];
+  kernel->kernel->getRequiredWorkGroupSize(reqdWorkGroupSize);
   for (unsigned i = 0; i < work_dim; i++)
   {
     if (!global_work_size[i])
@@ -4809,10 +4815,17 @@ clEnqueueNDRangeKernel
     if (local_work_size && global_work_size[i] % local_work_size[i])
     {
       ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_GROUP_SIZE,
-                      "Dimension " << i <<
-                      ": local_work_size (" << local_work_size[i] <<
-                      ") does not divide global_work_size (" <<
-                      global_work_size[i] << ")");
+                      "local_work_size[" << i << "]=" << local_work_size[i] <<
+                      " does not divide global_work_size[" << i << "]=" <<
+                      global_work_size[i]);
+    }
+    if (local_work_size && reqdWorkGroupSize[i] &&
+        local_work_size[i] != reqdWorkGroupSize[i])
+    {
+      ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_GROUP_SIZE,
+                      "local_work_size[" << i << "]=" << local_work_size[i] <<
+                      " does not match reqd_work_group_size[" << i << "]=" <<
+                      reqdWorkGroupSize[i])
     }
   }
 
@@ -5592,3 +5605,105 @@ void *m_dispatchTable[] =
   DISPATCH_TABLE_ENTRY(NULL),
 #endif
 };
+
+#if defined(_WIN32) && !defined(OCLGRIND_ICD)
+
+#include <Psapi.h>
+
+// Patch the host executable's import table so that calls to
+// clGetPlatformIDs are routed to the Oclgrind implementation.
+//
+// This is invoked by oclgrind.exe after this DLL is
+// injected into the child process.
+//
+// Returns false on error; true otherwise (even if no import was found).
+bool initOclgrind()
+{
+  // Get base address of process
+  char *base = (char*)GetModuleHandle(NULL);
+
+  // Get pointer to NT headers
+  PIMAGE_DOS_HEADER dosHeader = (PIMAGE_DOS_HEADER)(base);
+  PIMAGE_NT_HEADERS ntHeaders = (PIMAGE_NT_HEADERS)(base + dosHeader->e_lfanew);
+  if (ntHeaders->Signature != IMAGE_NT_SIGNATURE)
+  {
+    std::cerr << "[Oclgrind] Invalid NT signature: "
+              << ntHeaders->Signature << std::endl;
+    return false;
+  }
+
+  // Get pointer to import directory
+  DWORD importOffset =
+    ntHeaders->OptionalHeader.
+      DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT].VirtualAddress;
+  PIMAGE_IMPORT_DESCRIPTOR importDesc =
+    (PIMAGE_IMPORT_DESCRIPTOR)(base + importOffset);
+
+  // Loop over directory entries
+  while (importDesc->Name)
+  {
+    // Look for OpenCL.dll
+    const char *modname = (const char*)(base + importDesc->Name);
+    if (!stricmp(modname, "opencl.dll"))
+    {
+      // We use the OriginalFirstThunk to match the name,
+      // and then replace the function pointer in FirstThunk
+      PIMAGE_THUNK_DATA origThunk =
+        (PIMAGE_THUNK_DATA)(base + importDesc->OriginalFirstThunk);
+      PIMAGE_THUNK_DATA firstThunk =
+        (PIMAGE_THUNK_DATA)(base + importDesc->FirstThunk);
+
+      // Loop over functions
+      while (origThunk->u1.AddressOfData)
+      {
+        // Skip unnamed functions
+        if (!(origThunk->u1.Ordinal & IMAGE_ORDINAL_FLAG))
+        {
+          // Get function name and check for clGetPlatformIDs
+          PIMAGE_IMPORT_BY_NAME import =
+            (PIMAGE_IMPORT_BY_NAME)(base + origThunk->u1.AddressOfData);
+          if (!stricmp((char*)import->Name, "clGetPlatformIDs"))
+          {
+            // Make page writable; the old protection is saved in mbinfo.Protect
+            MEMORY_BASIC_INFORMATION mbinfo;
+            if (!VirtualQuery(firstThunk, &mbinfo, sizeof(mbinfo)) ||
+                !VirtualProtect(mbinfo.BaseAddress, mbinfo.RegionSize,
+                                PAGE_EXECUTE_READWRITE, &mbinfo.Protect))
+            {
+              std::cerr << "[Oclgrind] Failed to make page writeable: "
+                        << GetLastError() << std::endl;
+              return false;
+            }
+
+            // Replace function pointer with our implementation
+            firstThunk->u1.Function = (ULONG_PTR)clGetPlatformIDs;
+
+            // Restore page protection
+            DWORD zero = 0;
+            if (!VirtualProtect(mbinfo.BaseAddress, mbinfo.RegionSize,
+                                mbinfo.Protect, &zero))
+            {
+              std::cerr << "[Oclgrind] Failed to restore page protection: "
+                        << GetLastError() << std::endl;
+              return false;
+            }
+
+            return true;
+          }
+        }
+
+        origThunk++;
+        firstThunk++;
+      }
+    }
+    importDesc++;
+  }
+
+  // We didn't find the function, so just warn user
+  std::cerr << "[Oclgrind] Warning: unable to patch clGetPlatformIDs"
+            << std::endl;
+
+  return true;
+}
+
+#endif
diff --git a/src/runtime/runtime.def b/src/runtime/runtime.def
index 77992cf..59b94c7 100644
--- a/src/runtime/runtime.def
+++ b/src/runtime/runtime.def
@@ -117,3 +117,5 @@ clEnqueueReleaseD3D11ObjectsKHR
 clGetDeviceIDsFromDX9MediaAdapterKHR
 clEnqueueAcquireDX9MediaSurfacesKHR
 clEnqueueReleaseDX9MediaSurfacesKHR
+
+initOclgrind
\ No newline at end of file
diff --git a/tests/Makefile.am b/tests/Makefile.am
new file mode 100644
index 0000000..fea836f
--- /dev/null
+++ b/tests/Makefile.am
@@ -0,0 +1,50 @@
+# Makefile.am (Oclgrind)
+# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+#
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+
+AUTOMAKE_OPTIONS = subdir-objects
+ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4
+
+AM_CFLAGS = -std=c99 -I$(top_srcdir)/src/ -I${srcdir}/common -Wall
+LDADD     = ../liboclgrind-rt.la libcommon.la
+
+noinst_LTLIBRARIES = libcommon.la
+libcommon_la_SOURCES = common/common.c common/common.h
+
+check_PROGRAMS = \
+  apps/image/image \
+  apps/vecadd/vecadd \
+  runtime/map_buffer
+TESTS = $(check_PROGRAMS)
+
+if HAVE_PYTHON
+# Plain and .sim tests are both launched via run_test.py + oclgrind-kernel
+TEST_EXTENSIONS = .sim
+LOG_COMPILER = $(PYTHON) \
+  $(srcdir)/run_test.py  \
+  ${abs_top_builddir}/oclgrind-kernel
+SIM_LOG_COMPILER = $(PYTHON)        \
+  $(srcdir)/run_test.py  \
+  ${abs_top_builddir}/oclgrind-kernel
+AM_TESTS_ENVIRONMENT = \
+  export OCLGRIND_PCH_DIR=$(abs_top_builddir)/src/include/oclgrind;
+# KERNEL_TESTS is not defined in this file (presumably set by configure)
+TESTS += $(KERNEL_TESTS)
+#XFAIL_TESTS =
+
+clean-local:
+	find . -name '*.out' -exec rm -f {} \;
+
+else
+check-local:
+	@echo
+	@echo "WARNING: Tests will not be run (Python required)."
+	@echo
+endif
+
+EXTRA_DIST = run_test.py kernels/TESTS $(KERNEL_TEST_INPUTS) \
+  runtime/map_buffer.ref
diff --git a/tests/apps/CMakeLists.txt b/tests/apps/CMakeLists.txt
index 0dff241..b8fb14a 100644
--- a/tests/apps/CMakeLists.txt
+++ b/tests/apps/CMakeLists.txt
@@ -1,29 +1,38 @@
 # CMakeLists.txt (Oclgrind)
-# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 # University of Bristol. All rights reserved.
 #
 # This program is provided under a three-clause BSD license. For full
 # license terms please see the LICENSE file distributed with this
 # source code.
 
+set(COMMON_SOURCES ../common/common.c ../common/common.h)
+include_directories(../common)
+
 # Add app tests
 foreach(test
+  image
   vecadd)
 
-  add_executable(${test} ${test}/${test}.c)
+  add_executable(${test} ${test}/${test}.c ${COMMON_SOURCES})
   target_link_libraries(${test} oclgrind-rt)
 
   # Generate test binaries in same dir as Oclgrind libraries on Windows
   if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
-    add_test(app_${test} "${CMAKE_BINARY_DIR}/${test}")
     set_target_properties(${test} PROPERTIES
       RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
   else()
-    add_test(app_${test} "${test}/${test}")
     set_target_properties(${test} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${test}")
     set_target_properties(${test} PROPERTIES LINKER_LANGUAGE CXX)
   endif()
 
+  add_test(
+    NAME app_${test}
+    COMMAND
+    ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/run_test.py
+    $<TARGET_FILE:oclgrind-kernel>
+    $<TARGET_FILE:${test}>)
+
   set_tests_properties(app_${test} PROPERTIES DEPENDS ${test})
 
   # Set PCH directory
diff --git a/tests/apps/image/image.c b/tests/apps/image/image.c
new file mode 100644
index 0000000..101dd4f
--- /dev/null
+++ b/tests/apps/image/image.c
@@ -0,0 +1,133 @@
+#include "common.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#define IMG_SIZE 100
+#define TOL 1e-8
+#define MAX_ERRORS 8
+
+const char *KERNEL_SOURCE =
+"__kernel void image_copy(__read_only image2d_array_t src,  \n"
+"                         __write_only image2d_array_t dst) \n"
+"{                                                          \n"
+"   size_t size = get_image_array_size(src);                \n"
+"   const int x = get_global_id(0);                         \n"
+"   const int y = get_global_id(1);                         \n"
+"   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
+"                             CLK_ADDRESS_CLAMP |           \n"
+"                             CLK_FILTER_NEAREST;           \n"
+"   float4 pixel = read_imagef(src, sampler, (int4)(x, y, 0, 0)); \n"
+"   write_imagef(dst, (int4)(x, y, 0, 0), pixel);                 \n"
+"}                                                          \n"
+;
+
+// Copy a 10x10 CL_RGB/CL_FLOAT image through the image_copy kernel and
+// compare the result with the input; returns non-zero on any mismatch.
+int main(void)
+{
+    cl_int err;
+    cl_kernel kernel;
+    cl_image_format img_fmt;
+    cl_image_desc img_desc;
+    cl_mem src, dst;
+    float *input, *output;
+    size_t width = 10, height = 10;
+    size_t origin[] = {0, 0, 0};
+    size_t region[] = {width, height, 1};
+    size_t GWSize[] = {width, height, 1};
+
+    input = (float*)malloc(IMG_SIZE * 3 * sizeof(float));
+    output = (float*)malloc(IMG_SIZE * 3 * sizeof(float));
+    if(!input || !output)
+    {
+        checkError(CL_OUT_OF_HOST_MEMORY, "allocating host buffers");
+    }
+
+    // Create Input data
+    for(int i = 0; i < IMG_SIZE * 3; ++i)
+    {
+        input[i] = (i % IMG_SIZE) + 1.0;
+    }
+
+    Context cl = createContext(KERNEL_SOURCE);
+    kernel = clCreateKernel(cl.program, "image_copy", &err);
+    checkError(err, "creating kernel");
+
+    img_fmt.image_channel_order = CL_RGB;
+    img_fmt.image_channel_data_type = CL_FLOAT;
+
+    img_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+    img_desc.image_width = width;
+    img_desc.image_height = height;
+    img_desc.image_depth = 0;
+    img_desc.image_array_size = 0;
+    img_desc.image_row_pitch = 0;
+    img_desc.image_slice_pitch = 0;
+    img_desc.num_mip_levels = 0;
+    img_desc.num_samples = 0;
+    img_desc.buffer = NULL;
+
+    src = clCreateImage(cl.context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err);
+    checkError(err, "creating source image");
+
+    dst = clCreateImage(cl.context, CL_MEM_READ_WRITE, &img_fmt, &img_desc, NULL, &err);
+    checkError(err, "creating destination image");
+
+    err = clEnqueueWriteImage(cl.queue, src, CL_TRUE, origin, region, 0, 0, input, 0, NULL, NULL);
+    checkError(err, "enqueuing write image");
+
+    // Bind both images and run one work-item per pixel
+    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src);
+    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst);
+    checkError(err, "setting kernel args");
+
+    err = clEnqueueNDRangeKernel(cl.queue, kernel, 2, NULL, GWSize, NULL, 0, NULL, NULL);
+    checkError(err, "enqueuing kernel");
+
+    err = clFinish(cl.queue);
+    checkError(err, "running kernel");
+
+    err = clEnqueueReadImage(cl.queue, dst, CL_TRUE, origin, region, 0, 0, output, 0, NULL, NULL);
+    checkError(err, "reading image data");
+
+    // Check results
+    unsigned errors = 0;
+    for(int i = 0; i < IMG_SIZE * 3; ++i)
+    {
+        float ref = input[i];
+        float val = output[i];
+
+        if(fabs(ref - val) > TOL)
+        {
+            // Report only the first MAX_ERRORS mismatches, but count them all
+            if(errors < MAX_ERRORS)
+            {
+                fprintf(stderr, "%4d: %.4f != %.4f\n", i, val, ref);
+            }
+            errors++;
+        }
+    }
+
+    free(input);
+    free(output);
+    clReleaseMemObject(src);
+    clReleaseMemObject(dst);
+    clReleaseKernel(kernel);
+    releaseContext(cl);
+
+    return (errors != 0);
+}
+
+//cl_mem image3;
+//
+//image3 = clCreateImage2D(context, CL_MEM_READ_WRITE, &img_fmt, width, height, 0, 0, &err);
+//
+//// copy Image1 to Image3
+//err = clEnqueueCopyImage(command_queue, image1, image3, origin, origin, region, 1, event, &event[3]);
+//err_check(err, "clEnqueueCopyImage");
+
+
+//clReleaseMemObject(image3);
diff --git a/tests/apps/vecadd/vecadd.c b/tests/apps/vecadd/vecadd.c
index 22d55ed..49f59e5 100644
--- a/tests/apps/vecadd/vecadd.c
+++ b/tests/apps/vecadd/vecadd.c
@@ -1,4 +1,5 @@
-#include <CL/cl.h>
+#include "common.h"
+
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -18,16 +19,9 @@ const char *KERNEL_SOURCE =
 "}                                   \n"
 ;
 
-void checkError(cl_int err, const char *operation);
-
 int main(int argc, char *argv[])
 {
   cl_int err;
-  cl_platform_id platform;
-  cl_device_id device;
-  cl_context context;
-  cl_command_queue queue;
-  cl_program program;
   cl_kernel kernel;
   cl_mem d_a, d_b, d_c;
   float *h_a, *h_b, *h_c;
@@ -50,57 +44,9 @@ int main(int argc, char *argv[])
     exit(1);
   }
 
-  // Get list of platforms
-  cl_uint numPlatforms = 0;
-  cl_platform_id platforms[MAX_PLATFORMS];
-  err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &numPlatforms);
-  checkError(err, "getting platforms");
-
-  // Find Oclgrind
-  platform = NULL;
-  for (int i = 0; i < numPlatforms; i++)
-  {
-    char name[256];
-    err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 256, name, NULL);
-    checkError(err, "getting platform name");
-    if (!strcmp(name, "Oclgrind"))
-    {
-      platform = platforms[i];
-      break;
-    }
-  }
-  if (!platform)
-  {
-    fprintf(stderr, "Unable to find Oclgrind platform\n");
-    exit(1);
-  }
-
-  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
-  checkError(err, "getting device");
-
-  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
-  checkError(err, "creating context");
+  Context cl = createContext(KERNEL_SOURCE);
 
-  queue = clCreateCommandQueue(context, device, 0, &err);
-  checkError(err, "creating command queue");
-
-  program = clCreateProgramWithSource(context, 1, &KERNEL_SOURCE, NULL, &err);
-  checkError(err, "creating program");
-
-  err = clBuildProgram(program, 1, &device, "", NULL, NULL);
-  if (err == CL_BUILD_PROGRAM_FAILURE)
-  {
-    size_t sz;
-    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
-                          sizeof(size_t), NULL, &sz);
-    char *buildLog = malloc(++sz);
-    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
-                          sz, buildLog, NULL);
-    fprintf(stderr, "%s\n", buildLog);
-  }
-  checkError(err, "building program");
-
-  kernel = clCreateKernel(program, "vecadd", &err);
+  kernel = clCreateKernel(cl.program, "vecadd", &err);
   checkError(err, "creating kernel");
 
   size_t dataSize = N*sizeof(cl_float);
@@ -110,27 +56,27 @@ int main(int argc, char *argv[])
   h_a = malloc(dataSize);
   h_b = malloc(dataSize);
   h_c = malloc(dataSize);
-  for (int i = 0; i < N; i++)
+  for (unsigned i = 0; i < N; i++)
   {
     h_a[i] = rand()/(float)RAND_MAX;
     h_b[i] = rand()/(float)RAND_MAX;
     h_c[i] = 0;
   }
 
-  d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
+  d_a = clCreateBuffer(cl.context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
   checkError(err, "creating d_a buffer");
-  d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
+  d_b = clCreateBuffer(cl.context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
   checkError(err, "creating d_b buffer");
-  d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err);
+  d_c = clCreateBuffer(cl.context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err);
   checkError(err, "creating d_c buffer");
 
-  err = clEnqueueWriteBuffer(queue, d_a, CL_FALSE,
+  err = clEnqueueWriteBuffer(cl.queue, d_a, CL_FALSE,
                              0, dataSize, h_a, 0, NULL, NULL);
   checkError(err, "writing d_a data");
-  err = clEnqueueWriteBuffer(queue, d_b, CL_FALSE,
+  err = clEnqueueWriteBuffer(cl.queue, d_b, CL_FALSE,
                              0, dataSize, h_b, 0, NULL, NULL);
   checkError(err, "writing d_b data");
-  err = clEnqueueWriteBuffer(queue, d_c, CL_FALSE,
+  err = clEnqueueWriteBuffer(cl.queue, d_c, CL_FALSE,
                              0, dataSize, h_c, 0, NULL, NULL);
   checkError(err, "writing d_c data");
 
@@ -139,20 +85,20 @@ int main(int argc, char *argv[])
   err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
   checkError(err, "setting kernel args");
 
-  err = clEnqueueNDRangeKernel(queue, kernel,
+  err = clEnqueueNDRangeKernel(cl.queue, kernel,
                                1, NULL, &global, NULL, 0, NULL, NULL);
   checkError(err, "enqueuing kernel");
 
-  err = clFinish(queue);
+  err = clFinish(cl.queue);
   checkError(err, "running kernel");
 
-  err = clEnqueueReadBuffer(queue, d_c, CL_TRUE,
+  err = clEnqueueReadBuffer(cl.queue, d_c, CL_TRUE,
                             0, dataSize, h_c, 0, NULL, NULL);
   checkError(err, "reading d_c data");
 
   // Check results
-  int errors = 0;
-  for (int i = 0; i < N; i++)
+  unsigned errors = 0;
+  for (unsigned i = 0; i < N; i++)
   {
     float ref = h_a[i] + h_b[i];
     if (fabs(ref - h_c[i]) > TOL)
@@ -164,7 +110,8 @@ int main(int argc, char *argv[])
       errors++;
     }
   }
-  printf("%d errors detected\n", errors);
+  if (errors)
+    printf("%d errors detected\n", errors);
 
   free(h_a);
   free(h_b);
@@ -173,18 +120,7 @@ int main(int argc, char *argv[])
   clReleaseMemObject(d_b);
   clReleaseMemObject(d_c);
   clReleaseKernel(kernel);
-  clReleaseProgram(program);
-  clReleaseCommandQueue(queue);
-  clReleaseContext(context);
+  releaseContext(cl);
 
   return (errors != 0);
 }
-
-void checkError(cl_int err, const char *operation)
-{
-  if (err != CL_SUCCESS)
-  {
-    fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
-    exit(1);
-  }
-}
diff --git a/tests/common/common.c b/tests/common/common.c
new file mode 100644
index 0000000..1923048
--- /dev/null
+++ b/tests/common/common.c
@@ -0,0 +1,79 @@
+#include "common.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Print the failed operation and its OpenCL error code to stderr and exit
+// with status 1. Does nothing when err is CL_SUCCESS.
+void checkError(cl_int err, const char *operation)
+{
+  if (err != CL_SUCCESS)
+  {
+    fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
+    exit(1);
+  }
+}
+
+// Set up everything a test needs to run kernels from 'source': the first
+// platform (which must be named "Oclgrind"), one device, a context, a
+// command queue and a built program. Any failure terminates the process
+// via checkError(); a build failure prints the build log first.
+Context createContext(const char *source)
+{
+  Context cl;
+  cl_int err;
+
+  err = clGetPlatformIDs(1, &cl.platform, NULL);
+  checkError(err, "getting platform");
+
+  // Check platform is Oclgrind
+  char name[256];
+  err = clGetPlatformInfo(cl.platform, CL_PLATFORM_NAME, 256, name, NULL);
+  checkError(err, "getting platform name");
+  if (strcmp(name, "Oclgrind"))
+  {
+    fprintf(stderr, "Unable to find Oclgrind platform\n");
+    exit(1);
+  }
+
+  err = clGetDeviceIDs(cl.platform, CL_DEVICE_TYPE_ALL, 1, &cl.device, NULL);
+  checkError(err, "getting device");
+
+  cl.context = clCreateContext(NULL, 1, &cl.device, NULL, NULL, &err);
+  checkError(err, "creating context");
+
+  cl.queue = clCreateCommandQueue(cl.context, cl.device, 0, &err);
+  checkError(err, "creating command queue");
+
+  cl.program = clCreateProgramWithSource(cl.context, 1, &source, NULL, &err);
+  checkError(err, "creating program");
+
+  err = clBuildProgram(cl.program, 1, &cl.device, "", NULL, NULL);
+  if (err == CL_BUILD_PROGRAM_FAILURE)
+  {
+    // Query the log length, then fetch and print the log itself
+    size_t sz = 0;
+    clGetProgramBuildInfo(cl.program, cl.device, CL_PROGRAM_BUILD_LOG,
+                          0, NULL, &sz);
+    char *buildLog = calloc(sz + 1, 1);
+    if (buildLog)
+    {
+      clGetProgramBuildInfo(cl.program, cl.device, CL_PROGRAM_BUILD_LOG,
+                            sz, buildLog, NULL);
+      fprintf(stderr, "%s\n", buildLog);
+      free(buildLog);
+    }
+  }
+  checkError(err, "building program");
+
+  return cl;
+}
+
+// Release the program, command queue and context held by 'cl'.
+void releaseContext(Context cl)
+{
+  clReleaseProgram(cl.program);
+  clReleaseCommandQueue(cl.queue);
+  clReleaseContext(cl.context);
+}
diff --git a/tests/common/common.h b/tests/common/common.h
new file mode 100644
index 0000000..92b44db
--- /dev/null
+++ b/tests/common/common.h
@@ -0,0 +1,25 @@
+#ifndef OCLGRIND_TESTS_COMMON_H
+#define OCLGRIND_TESTS_COMMON_H
+
+#include <CL/cl.h>
+
+// OpenCL objects shared by the test applications; filled in by
+// createContext() and released with releaseContext().
+typedef struct
+{
+  cl_platform_id   platform;
+  cl_device_id     device;
+  cl_context       context;
+  cl_command_queue queue;
+  cl_program       program;
+} Context;
+
+// Exit with a message naming 'operation' unless err is CL_SUCCESS.
+void checkError(cl_int err, const char *operation);
+
+// Create a Context whose program is built from the OpenCL C in 'source'.
+Context createContext(const char *source);
+// Release the program, command queue and context held by 'cl'.
+void    releaseContext(Context cl);
+
+#endif // OCLGRIND_TESTS_COMMON_H
diff --git a/tests/kernels/CMakeLists.txt b/tests/kernels/CMakeLists.txt
new file mode 100644
index 0000000..017acb2
--- /dev/null
+++ b/tests/kernels/CMakeLists.txt
@@ -0,0 +1,27 @@
+# CMakeLists.txt (Oclgrind)
+# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+#
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+
+# Register one CTest test per line of the TESTS file
+file(READ TESTS KERNEL_TESTS)
+string(REPLACE "\n" ";" KERNEL_TESTS "${KERNEL_TESTS}")
+foreach(test ${KERNEL_TESTS})
+  add_test(
+    NAME ${test}
+    COMMAND
+    ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/run_test.py
+    $<TARGET_FILE:oclgrind-kernel>
+    ${CMAKE_SOURCE_DIR}/tests/kernels/${test}.sim)
+endforeach()
+
+# Set PCH directory
+set_tests_properties(${KERNEL_TESTS} PROPERTIES
+    ENVIRONMENT "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
+
+# Expected failures (no tests are listed at present)
+set_tests_properties(
+  PROPERTIES WILL_FAIL TRUE)
diff --git a/tests/kernels/TESTS b/tests/kernels/TESTS
index 2ac8723..fd95269 100644
--- a/tests/kernels/TESTS
+++ b/tests/kernels/TESTS
@@ -21,12 +21,18 @@ atomics/atomic_race_before
 atomics/atomic_same_workitem
 barrier/barrier_different_instructions
 barrier/barrier_divergence
+bugs/byval_function_argument
+bugs/const_gep_expr_pointee_type
+bugs/false_warning_vector_argument
 bugs/gvn_arbitrary_integers
 bugs/kernel_struct_argument
+bugs/llvm_bswap
 bugs/many_alloca
 bugs/multidim_array_in_struct
 bugs/null_argument
+bugs/rhadd_overflow
 bugs/sroa_addrspace_cast
+bugs/write_vector_write_only_fp
 data-race/broadcast
 data-race/global_fence
 data-race/global_only_fence
@@ -42,14 +48,35 @@ data-race/local_write_write_race
 data-race/uniform_write_race
 memcheck/async_copy_out_of_bounds
 memcheck/atomic_out_of_bounds
+memcheck/casted_static_array
 memcheck/dereference_null
+memcheck/fake_out_of_bounds
 memcheck/read_out_of_bounds
 memcheck/read_write_only_memory
+memcheck/static_array
+memcheck/static_array_padded_struct
 memcheck/write_out_of_bounds
 memcheck/write_read_only_memory
 misc/array
+misc/lvalue_loads
+misc/program_scope_constant_array
 misc/reduce
 misc/vecadd
+misc/vector_argument
+uninitialized/padded_nested_struct_memcpy
+uninitialized/padded_struct_alloca_fp
+uninitialized/padded_struct_memcpy_fp
+uninitialized/partially_uninitialized_fract
+uninitialized/private_array_initializer_list
+uninitialized/uninitialized_global_buffer
+uninitialized/uninitialized_address
+uninitialized/uninitialized_local_array
+uninitialized/uninitialized_local_ptr
+uninitialized/uninitialized_local_variable
+uninitialized/uninitialized_packed_struct_memcpy
+uninitialized/uninitialized_padded_struct_memcpy
+uninitialized/uninitialized_padded_nested_struct_memcpy
+uninitialized/uninitialized_private_array
 wait_event/wait_event_chained
 wait_event/wait_event_divergent
 wait_event/wait_event_duplicates
diff --git a/tests/kernels/alignment/packed.ref b/tests/kernels/alignment/packed.ref
index df23fc9..a2d80b4 100644
--- a/tests/kernels/alignment/packed.ref
+++ b/tests/kernels/alignment/packed.ref
@@ -1,4 +1,2 @@
-
-Argument 'out': 4 bytes
-  out[0] = 2
-
+EXACT Argument 'out': 4 bytes
+EXACT   out[0] = 33554434
diff --git a/tests/kernels/alignment/packed.sim b/tests/kernels/alignment/packed.sim
index 46e9090..a92b922 100644
--- a/tests/kernels/alignment/packed.sim
+++ b/tests/kernels/alignment/packed.sim
@@ -5,6 +5,6 @@ packed
 
 <size=5 char hex>
 0x01
-0x02 0x00 0x0 0x00
+0x02 0x00 0x00 0x02
 
 <size=4 fill=0 dump>
diff --git a/tests/kernels/alignment/unaligned.ref b/tests/kernels/alignment/unaligned.ref
index 1114e03..d12f6d1 100644
--- a/tests/kernels/alignment/unaligned.ref
+++ b/tests/kernels/alignment/unaligned.ref
@@ -1,5 +1,4 @@
-ERROR EXPECTED
-
-Argument 'out': 4 bytes
-  out[0] = 2752512
+ERROR Invalid memory load - source pointer is not aligned to the pointed type
 
+EXACT Argument 'out': 4 bytes
+EXACT   out[0] = 2752512
diff --git a/tests/kernels/async_copy/async_copy.ref b/tests/kernels/async_copy/async_copy.ref
index cf0b04f..690f35e 100644
--- a/tests/kernels/async_copy/async_copy.ref
+++ b/tests/kernels/async_copy/async_copy.ref
@@ -1,7 +1,5 @@
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 2
-  data[2] = 1
-  data[3] = 0
-
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_divergent.ref b/tests/kernels/async_copy/async_copy_divergent.ref
index 8ce4dbb..9004865 100644
--- a/tests/kernels/async_copy/async_copy_divergent.ref
+++ b/tests/kernels/async_copy/async_copy_divergent.ref
@@ -1,8 +1,7 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 2
-  data[2] = 1
-  data[3] = 0
+ERROR Work-group divergence detected (async copy)
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_global_race.ref b/tests/kernels/async_copy/async_copy_global_race.ref
index 4da13c4..aee3e3d 100644
--- a/tests/kernels/async_copy/async_copy_global_race.ref
+++ b/tests/kernels/async_copy/async_copy_global_race.ref
@@ -1,8 +1,9 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 0
-  data[1] = 1
-  data[2] = 2
-  data[3] = 3
+ERROR Write-write data race at global memory
+ERROR Write-write data race at global memory
+ERROR Write-write data race at global memory
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 0
+EXACT   data[1] = 1
+EXACT   data[2] = 2
+EXACT   data[3] = 3
diff --git a/tests/kernels/async_copy/async_copy_local_race.ref b/tests/kernels/async_copy/async_copy_local_race.ref
index 8ce4dbb..73340db 100644
--- a/tests/kernels/async_copy/async_copy_local_race.ref
+++ b/tests/kernels/async_copy/async_copy_local_race.ref
@@ -1,8 +1,9 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 2
-  data[2] = 1
-  data[3] = 0
+ERROR Write-write data race at local memory
+ERROR Write-write data race at local memory
+ERROR Write-write data race at local memory
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_loop.ref b/tests/kernels/async_copy/async_copy_loop.ref
index cf0b04f..690f35e 100644
--- a/tests/kernels/async_copy/async_copy_loop.ref
+++ b/tests/kernels/async_copy/async_copy_loop.ref
@@ -1,7 +1,5 @@
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 2
-  data[2] = 1
-  data[3] = 0
-
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_loop_divergent.ref b/tests/kernels/async_copy/async_copy_loop_divergent.ref
index 8ce4dbb..9004865 100644
--- a/tests/kernels/async_copy/async_copy_loop_divergent.ref
+++ b/tests/kernels/async_copy/async_copy_loop_divergent.ref
@@ -1,8 +1,7 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 2
-  data[2] = 1
-  data[3] = 0
+ERROR Work-group divergence detected (async copy)
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_single_wi.ref b/tests/kernels/async_copy/async_copy_single_wi.ref
index 8ce4dbb..9004865 100644
--- a/tests/kernels/async_copy/async_copy_single_wi.ref
+++ b/tests/kernels/async_copy/async_copy_single_wi.ref
@@ -1,8 +1,7 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 2
-  data[2] = 1
-  data[3] = 0
+ERROR Work-group divergence detected (async copy)
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_unwaited.ref b/tests/kernels/async_copy/async_copy_unwaited.ref
index 8ce4dbb..b890ad8 100644
--- a/tests/kernels/async_copy/async_copy_unwaited.ref
+++ b/tests/kernels/async_copy/async_copy_unwaited.ref
@@ -1,8 +1,7 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 2
-  data[2] = 1
-  data[3] = 0
+ERROR Work-item finished without waiting for events
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 0
diff --git a/tests/kernels/atomics/atomic_cmpxchg_false_race.ref b/tests/kernels/atomics/atomic_cmpxchg_false_race.ref
index fe14281..4014209 100644
--- a/tests/kernels/atomics/atomic_cmpxchg_false_race.ref
+++ b/tests/kernels/atomics/atomic_cmpxchg_false_race.ref
@@ -1,8 +1,6 @@
-
-Argument 'data': 20 bytes
-  data[0] = 4
-  data[1] = 1
-  data[2] = 2
-  data[3] = 3
-  data[4] = 4
-
+EXACT Argument 'data': 20 bytes
+EXACT   data[0] = 4
+EXACT   data[1] = 1
+EXACT   data[2] = 2
+EXACT   data[3] = 3
+EXACT   data[4] = 4
diff --git a/tests/kernels/atomics/atomic_cmpxchg_read_race.ref b/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
index b398c6c..d01adc0 100644
--- a/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
+++ b/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
@@ -1,5 +1,5 @@
-ERROR EXPECTED
-
-Argument 'data': 4 bytes
-  data[0] = 1
+ERROR Read-write data race at global memory
+ERROR Write-write data race at global memory
 
+EXACT Argument 'data': 4 bytes
+EXACT   data[0] = 1
diff --git a/tests/kernels/atomics/atomic_cmpxchg_write_race.ref b/tests/kernels/atomics/atomic_cmpxchg_write_race.ref
index af96d9b..ca64ee3 100644
--- a/tests/kernels/atomics/atomic_cmpxchg_write_race.ref
+++ b/tests/kernels/atomics/atomic_cmpxchg_write_race.ref
@@ -1,5 +1,4 @@
-ERROR EXPECTED
-
-Argument 'data': 4 bytes
-  data[0] = 42
+ERROR Read-write data race at global memory
 
+EXACT Argument 'data': 4 bytes
+EXACT   data[0] = 42
diff --git a/tests/kernels/atomics/atomic_global_fence.ref b/tests/kernels/atomics/atomic_global_fence.ref
index a7bf48a..dab956f 100644
--- a/tests/kernels/atomics/atomic_global_fence.ref
+++ b/tests/kernels/atomics/atomic_global_fence.ref
@@ -1,5 +1,3 @@
-
-Argument 'data': 8 bytes
-  data[0] = 6
-  data[1] = 22
-
+EXACT Argument 'data': 8 bytes
+EXACT   data[0] = 6
+EXACT   data[1] = 22
diff --git a/tests/kernels/atomics/atomic_global_fence_race.ref b/tests/kernels/atomics/atomic_global_fence_race.ref
index 4920bcf..71c83c2 100644
--- a/tests/kernels/atomics/atomic_global_fence_race.ref
+++ b/tests/kernels/atomics/atomic_global_fence_race.ref
@@ -1,6 +1,6 @@
-ERROR EXPECTED
-
-Argument 'data': 8 bytes
-  data[0] = 6
-  data[1] = 28
+ERROR Read-write data race at global memory
+ERROR Read-write data race at global memory
 
+EXACT Argument 'data': 8 bytes
+MATCH   data[0] =
+MATCH   data[1] =
diff --git a/tests/kernels/atomics/atomic_increment.ref b/tests/kernels/atomics/atomic_increment.ref
index f61189d..fc93c7c 100644
--- a/tests/kernels/atomics/atomic_increment.ref
+++ b/tests/kernels/atomics/atomic_increment.ref
@@ -1,4 +1,2 @@
-
-Argument 'data': 4 bytes
-  data[0] = 4
-
+EXACT Argument 'data': 4 bytes
+EXACT   data[0] = 4
diff --git a/tests/kernels/atomics/atomic_intergroup_race.ref b/tests/kernels/atomics/atomic_intergroup_race.ref
index cab3430..43c056c 100644
--- a/tests/kernels/atomics/atomic_intergroup_race.ref
+++ b/tests/kernels/atomics/atomic_intergroup_race.ref
@@ -1,5 +1,5 @@
-ERROR EXPECTED
-
-Argument 'data': 4 bytes
-  data[0] = 8
+ERROR Read-write data race at global memory
+ERROR Write-write data race at global memory
 
+EXACT Argument 'data': 4 bytes
+MATCH   data[0] =
diff --git a/tests/kernels/atomics/atomic_local_fence.ref b/tests/kernels/atomics/atomic_local_fence.ref
index a7bf48a..dab956f 100644
--- a/tests/kernels/atomics/atomic_local_fence.ref
+++ b/tests/kernels/atomics/atomic_local_fence.ref
@@ -1,5 +1,3 @@
-
-Argument 'data': 8 bytes
-  data[0] = 6
-  data[1] = 22
-
+EXACT Argument 'data': 8 bytes
+EXACT   data[0] = 6
+EXACT   data[1] = 22
diff --git a/tests/kernels/atomics/atomic_race_after.ref b/tests/kernels/atomics/atomic_race_after.ref
index bc902a8..8c97680 100644
--- a/tests/kernels/atomics/atomic_race_after.ref
+++ b/tests/kernels/atomics/atomic_race_after.ref
@@ -1,5 +1,6 @@
-ERROR EXPECTED
-
-Argument 'data': 4 bytes
-  data[0] = 5
+ERROR Read-write data race at global memory
+ERROR Read-write data race at global memory
+ERROR Write-write data race at global memory
 
+EXACT Argument 'data': 4 bytes
+EXACT   data[0] = 5
diff --git a/tests/kernels/atomics/atomic_race_before.ref b/tests/kernels/atomics/atomic_race_before.ref
index 6ecedc3..65484a7 100644
--- a/tests/kernels/atomics/atomic_race_before.ref
+++ b/tests/kernels/atomics/atomic_race_before.ref
@@ -1,5 +1,9 @@
-ERROR EXPECTED
-
-Argument 'data': 4 bytes
-  data[0] = 4
+ERROR Read-write data race at global memory address
+ERROR Write-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Write-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Write-write data race at global memory address
 
+EXACT Argument 'data': 4 bytes
+EXACT   data[0] = 4
diff --git a/tests/kernels/atomics/atomic_same_workitem.ref b/tests/kernels/atomics/atomic_same_workitem.ref
index 3ef3ca7..63eb583 100644
--- a/tests/kernels/atomics/atomic_same_workitem.ref
+++ b/tests/kernels/atomics/atomic_same_workitem.ref
@@ -1,7 +1,5 @@
-
-Argument 'data': 16 bytes
-  data[0] = 1
-  data[1] = 2
-  data[2] = 1
-  data[3] = 2
-
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 1
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 2
diff --git a/tests/kernels/barrier/barrier_different_instructions.ref b/tests/kernels/barrier/barrier_different_instructions.ref
index 3ffaa5a..ab5b1f0 100644
--- a/tests/kernels/barrier/barrier_different_instructions.ref
+++ b/tests/kernels/barrier/barrier_different_instructions.ref
@@ -1,8 +1,9 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 42
-  data[1] = 43
-  data[2] = 44
-  data[3] = 45
+ERROR Work-group divergence detected (barrier)
+ERROR Work-group divergence detected (barrier)
+ERROR Work-group divergence detected (barrier)
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 42
+EXACT   data[1] = 43
+EXACT   data[2] = 44
+EXACT   data[3] = 45
diff --git a/tests/kernels/barrier/barrier_divergence.ref b/tests/kernels/barrier/barrier_divergence.ref
index 4da13c4..028c077 100644
--- a/tests/kernels/barrier/barrier_divergence.ref
+++ b/tests/kernels/barrier/barrier_divergence.ref
@@ -1,8 +1,7 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 0
-  data[1] = 1
-  data[2] = 2
-  data[3] = 3
+ERROR Work-group divergence detected (barrier)
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 0
+EXACT   data[1] = 1
+EXACT   data[2] = 2
+EXACT   data[3] = 3
diff --git a/tests/kernels/bugs/byval_function_argument.cl b/tests/kernels/bugs/byval_function_argument.cl
new file mode 100644
index 0000000..de36492
--- /dev/null
+++ b/tests/kernels/bugs/byval_function_argument.cl
@@ -0,0 +1,19 @@
+union U  // both members are uint, so a and b share the same storage
+{
+  uint a;
+  uint b;
+};
+
+uint func(union U value)  // takes the union by value, not through a pointer
+{
+  uint ret = value.a;
+  value.b = 777;  // written after ret is read; modifies only the callee's copy
+  return ret;
+}
+
+kernel void byval_function_argument(global uint *output)
+{
+  union U u = {42};  // initializes the first member, a
+  output[0] = func(u);  // func returns value.a as read before its write to value.b
+  output[1] = u.b;  // the accompanying .ref expects 42 here, i.e. u unchanged by the call
+}
diff --git a/tests/kernels/bugs/byval_function_argument.ref b/tests/kernels/bugs/byval_function_argument.ref
new file mode 100644
index 0000000..f6eec51
--- /dev/null
+++ b/tests/kernels/bugs/byval_function_argument.ref
@@ -0,0 +1,3 @@
+EXACT Argument 'output': 8 bytes
+EXACT   output[0] = 42
+EXACT   output[1] = 42
diff --git a/tests/kernels/bugs/byval_function_argument.sim b/tests/kernels/bugs/byval_function_argument.sim
new file mode 100644
index 0000000..f7abb44
--- /dev/null
+++ b/tests/kernels/bugs/byval_function_argument.sim
@@ -0,0 +1,6 @@
+byval_function_argument.cl
+byval_function_argument
+1 1 1
+1 1 1
+
+<size=8 fill=0 dump>
diff --git a/tests/kernels/bugs/const_gep_expr_pointee_type.cl b/tests/kernels/bugs/const_gep_expr_pointee_type.cl
new file mode 100644
index 0000000..63d9780
--- /dev/null
+++ b/tests/kernels/bugs/const_gep_expr_pointee_type.cl
@@ -0,0 +1,9 @@
+struct S0 {
+  int d;
+  long b;
+} fn1() {  // fn1 is declared together with struct S0 and returns it by value
+  struct S0 a = {3};  // d = 3; b is implicitly zero-initialized
+  a.d;  // expression statement with no effect; NOTE(review): presumably required to reproduce the original bug - confirm before removing
+  return a;
+  }
+__kernel void entry() { fn1(); }  // discards fn1's result; the kernel takes no arguments
diff --git a/tests/kernels/bugs/const_gep_expr_pointee_type.sim b/tests/kernels/bugs/const_gep_expr_pointee_type.sim
new file mode 100644
index 0000000..59ad90d
--- /dev/null
+++ b/tests/kernels/bugs/const_gep_expr_pointee_type.sim
@@ -0,0 +1,4 @@
+const_gep_expr_pointee_type.cl
+entry
+1 1 1
+1 1 1
diff --git a/tests/kernels/bugs/false_warning_vector_argument.cl b/tests/kernels/bugs/false_warning_vector_argument.cl
new file mode 100644
index 0000000..b05740f
--- /dev/null
+++ b/tests/kernels/bugs/false_warning_vector_argument.cl
@@ -0,0 +1,8 @@
+kernel void false_warning_vector_argument(int16 arg, global int8 *res)
+{
+    int8 v = (int8)(1,2,3,4,5,6,7,8);
+
+    int16 add = arg + v.s0011223344556677;  // swizzle repeats each component of v twice to build an int16
+
+    *res = add.lo;  // stores the lower eight components of the sum
+}
diff --git a/tests/kernels/bugs/false_warning_vector_argument.ref b/tests/kernels/bugs/false_warning_vector_argument.ref
new file mode 100644
index 0000000..ccac9e6
--- /dev/null
+++ b/tests/kernels/bugs/false_warning_vector_argument.ref
@@ -0,0 +1,9 @@
+EXACT Argument 'res': 32 bytes
+EXACT   res[0] = 1
+EXACT   res[1] = 1
+EXACT   res[2] = 2
+EXACT   res[3] = 2
+EXACT   res[4] = 3
+EXACT   res[5] = 3
+EXACT   res[6] = 4
+EXACT   res[7] = 4
diff --git a/tests/kernels/bugs/false_warning_vector_argument.sim b/tests/kernels/bugs/false_warning_vector_argument.sim
new file mode 100644
index 0000000..7a7df14
--- /dev/null
+++ b/tests/kernels/bugs/false_warning_vector_argument.sim
@@ -0,0 +1,7 @@
+false_warning_vector_argument.cl
+false_warning_vector_argument
+1 1 1
+1 1 1
+
+<size=64 fill=0>
+<size=32 fill=0 dump>
diff --git a/tests/kernels/bugs/gvn_arbitrary_integers.ref b/tests/kernels/bugs/gvn_arbitrary_integers.ref
index fafe2ec..217cbcf 100644
--- a/tests/kernels/bugs/gvn_arbitrary_integers.ref
+++ b/tests/kernels/bugs/gvn_arbitrary_integers.ref
@@ -1,6 +1,4 @@
-
-Argument 'dest': 12 bytes
-  dest[0] = 0
-  dest[1] = 0
-  dest[2] = 42
-
+EXACT Argument 'dest': 12 bytes
+EXACT   dest[0] = 0
+EXACT   dest[1] = 0
+EXACT   dest[2] = 42
diff --git a/tests/kernels/bugs/kernel_struct_argument.ref b/tests/kernels/bugs/kernel_struct_argument.ref
index b8c7e51..7a4426d 100644
--- a/tests/kernels/bugs/kernel_struct_argument.ref
+++ b/tests/kernels/bugs/kernel_struct_argument.ref
@@ -1,4 +1,2 @@
-
-Argument 'out': 4 bytes
-  out[0] = 144
-
+EXACT Argument 'out': 4 bytes
+EXACT   out[0] = 144
diff --git a/tests/kernels/bugs/llvm_bswap.cl b/tests/kernels/bugs/llvm_bswap.cl
new file mode 100644
index 0000000..c9636eb
--- /dev/null
+++ b/tests/kernels/bugs/llvm_bswap.cl
@@ -0,0 +1,8 @@
+kernel void test(global uint *input, global uint *output)
+{
+  for (unsigned int i = 0; i < 4; i++)  // processes exactly four 32-bit words
+  {
+    uint word = input[i];
+    output[i] = ((word & 0xff) << 24) | ((word & 0xff00) << 8) | ((word & 0xff0000) >> 8) | ((word & 0xff000000) >> 24);  // reverses the byte order of word with masks and shifts
+  }
+}
diff --git a/tests/kernels/bugs/llvm_bswap.ref b/tests/kernels/bugs/llvm_bswap.ref
new file mode 100644
index 0000000..e217f93
--- /dev/null
+++ b/tests/kernels/bugs/llvm_bswap.ref
@@ -0,0 +1,7 @@
+
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 0x01000000
+EXACT   output[1] = 0x00000010
+EXACT   output[2] = 0x78563412
+EXACT   output[3] = 0x45342312
+
diff --git a/tests/kernels/bugs/llvm_bswap.sim b/tests/kernels/bugs/llvm_bswap.sim
new file mode 100644
index 0000000..de720a0
--- /dev/null
+++ b/tests/kernels/bugs/llvm_bswap.sim
@@ -0,0 +1,12 @@
+llvm_bswap.cl
+test
+1 1 1
+1 1 1
+
+<size=16 hex>
+0x00000001
+0x10000000
+0x12345678
+0x12233445
+
+<size=16 fill=0 hex dump>
diff --git a/tests/kernels/bugs/many_alloca.ref b/tests/kernels/bugs/many_alloca.ref
index 201d55d..a7a484c 100644
--- a/tests/kernels/bugs/many_alloca.ref
+++ b/tests/kernels/bugs/many_alloca.ref
@@ -1,4 +1,2 @@
-
-Argument 'data': 4 bytes
-  data[0] = 100000
-
+EXACT Argument 'data': 4 bytes
+EXACT   data[0] = 100000
diff --git a/tests/kernels/bugs/multidim_array_in_struct.ref b/tests/kernels/bugs/multidim_array_in_struct.ref
index f9606f2..e7fa7eb 100644
--- a/tests/kernels/bugs/multidim_array_in_struct.ref
+++ b/tests/kernels/bugs/multidim_array_in_struct.ref
@@ -1,4 +1,2 @@
-
-Argument 'output': 8 bytes
-  output[0] = 6
-
+EXACT Argument 'output': 8 bytes
+EXACT   output[0] = 6
diff --git a/tests/kernels/bugs/null_argument.ref b/tests/kernels/bugs/null_argument.ref
index dcf81cb..87107f8 100644
--- a/tests/kernels/bugs/null_argument.ref
+++ b/tests/kernels/bugs/null_argument.ref
@@ -1,4 +1,2 @@
-
-Argument 'output': 8 bytes
-  output[0] = 1
-
+EXACT Argument 'output': 8 bytes
+EXACT   output[0] = 1
diff --git a/tests/kernels/bugs/rhadd_overflow.cl b/tests/kernels/bugs/rhadd_overflow.cl
new file mode 100644
index 0000000..42a2960
--- /dev/null
+++ b/tests/kernels/bugs/rhadd_overflow.cl
@@ -0,0 +1,4 @@
+kernel void rhadd_overflow(global ulong *output)
+{
+  output[0] = rhadd(0UL, 0xFFFFFFFFFFFFFFFFUL);  // the accompanying .ref expects 0x8000000000000000; NOTE(review): rhadd should be (x + y + 1) >> 1 without intermediate overflow - confirm against the OpenCL C spec
+}
diff --git a/tests/kernels/bugs/rhadd_overflow.ref b/tests/kernels/bugs/rhadd_overflow.ref
new file mode 100644
index 0000000..3bb215c
--- /dev/null
+++ b/tests/kernels/bugs/rhadd_overflow.ref
@@ -0,0 +1,3 @@
+
+EXACT Argument 'output': 8 bytes
+EXACT   output[0] = 0x8000000000000000
diff --git a/tests/kernels/bugs/rhadd_overflow.sim b/tests/kernels/bugs/rhadd_overflow.sim
new file mode 100644
index 0000000..ddb5e14
--- /dev/null
+++ b/tests/kernels/bugs/rhadd_overflow.sim
@@ -0,0 +1,6 @@
+rhadd_overflow.cl
+rhadd_overflow
+1 1 1
+1 1 1
+
+<size=8 fill=0 hex dump>
diff --git a/tests/kernels/bugs/sroa_addrspace_cast.ref b/tests/kernels/bugs/sroa_addrspace_cast.ref
index 2fff44c..0492407 100644
--- a/tests/kernels/bugs/sroa_addrspace_cast.ref
+++ b/tests/kernels/bugs/sroa_addrspace_cast.ref
@@ -1,4 +1,2 @@
-
-Argument 'output': 4 bytes
-  output[0] = 42.24
-
+EXACT Argument 'output': 4 bytes
+EXACT   output[0] = 42.24
diff --git a/tests/kernels/bugs/write_vector_write_only_fp.cl b/tests/kernels/bugs/write_vector_write_only_fp.cl
new file mode 100644
index 0000000..7426b84
--- /dev/null
+++ b/tests/kernels/bugs/write_vector_write_only_fp.cl
@@ -0,0 +1,5 @@
+kernel void write_vector_write_only_fp(global int4 *output)
+{
+  int i = get_global_id(0);
+  output[i].x = 42;  // writes only the x component; y, z and w of each int4 are not assigned
+}
diff --git a/tests/kernels/bugs/write_vector_write_only_fp.ref b/tests/kernels/bugs/write_vector_write_only_fp.ref
new file mode 100644
index 0000000..dd0ed01
--- /dev/null
+++ b/tests/kernels/bugs/write_vector_write_only_fp.ref
@@ -0,0 +1,17 @@
+EXACT Argument 'output': 64 bytes
+EXACT   output[0] = 42
+EXACT   output[1] = 7
+EXACT   output[2] = 7
+EXACT   output[3] = 7
+EXACT   output[4] = 42
+EXACT   output[5] = 7
+EXACT   output[6] = 7
+EXACT   output[7] = 7
+EXACT   output[8] = 42
+EXACT   output[9] = 7
+EXACT   output[10] = 7
+EXACT   output[11] = 7
+EXACT   output[12] = 42
+EXACT   output[13] = 7
+EXACT   output[14] = 7
+EXACT   output[15] = 7
diff --git a/tests/kernels/bugs/write_vector_write_only_fp.sim b/tests/kernels/bugs/write_vector_write_only_fp.sim
new file mode 100644
index 0000000..296d298
--- /dev/null
+++ b/tests/kernels/bugs/write_vector_write_only_fp.sim
@@ -0,0 +1,6 @@
+write_vector_write_only_fp.cl
+write_vector_write_only_fp
+4 1 1
+1 1 1
+
+<size=64 wo fill=7 dump>
diff --git a/tests/kernels/data-race/broadcast.ref b/tests/kernels/data-race/broadcast.ref
index 69790f7..c5d628b 100644
--- a/tests/kernels/data-race/broadcast.ref
+++ b/tests/kernels/data-race/broadcast.ref
@@ -1,7 +1,5 @@
-
-Argument 'output': 16 bytes
-  output[0] = 42
-  output[1] = 42
-  output[2] = 42
-  output[3] = 42
-
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 42
+EXACT   output[1] = 42
+EXACT   output[2] = 42
+EXACT   output[3] = 42
diff --git a/tests/kernels/data-race/global_fence.ref b/tests/kernels/data-race/global_fence.ref
index 342c29a..caae30a 100644
--- a/tests/kernels/data-race/global_fence.ref
+++ b/tests/kernels/data-race/global_fence.ref
@@ -1,7 +1,5 @@
-
-Argument 'output': 16 bytes
-  output[0] = 6
-  output[1] = 22
-  output[2] = 38
-  output[3] = 54
-
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 6
+EXACT   output[1] = 22
+EXACT   output[2] = 38
+EXACT   output[3] = 54
diff --git a/tests/kernels/data-race/global_only_fence.ref b/tests/kernels/data-race/global_only_fence.ref
index 5b62861..faf17e0 100644
--- a/tests/kernels/data-race/global_only_fence.ref
+++ b/tests/kernels/data-race/global_only_fence.ref
@@ -1,8 +1,9 @@
-ERROR EXPECTED
-
-Argument 'output': 16 bytes
-  output[0] = 6
-  output[1] = 0
-  output[2] = 0
-  output[3] = 0
+ERROR Read-write data race at local memory
+ERROR Read-write data race at local memory
+ERROR Read-write data race at local memory
 
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 6
+EXACT   output[1] = 0
+EXACT   output[2] = 0
+EXACT   output[3] = 0
diff --git a/tests/kernels/data-race/global_read_write_race.ref b/tests/kernels/data-race/global_read_write_race.ref
index 7e1c317..7f5a780 100644
--- a/tests/kernels/data-race/global_read_write_race.ref
+++ b/tests/kernels/data-race/global_read_write_race.ref
@@ -1,8 +1,8 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 0
-  data[1] = 0
-  data[2] = 0
-  data[3] = 0
+ERROR Read-write data race at global memory
+ERROR Read-write data race at global memory
 
+EXACT Argument 'data': 16 bytes
+MATCH   data[0] =
+MATCH   data[1] =
+MATCH   data[2] =
+MATCH   data[3] =
diff --git a/tests/kernels/data-race/global_read_write_race.sim b/tests/kernels/data-race/global_read_write_race.sim
index 11077ab..fd262ec 100644
--- a/tests/kernels/data-race/global_read_write_race.sim
+++ b/tests/kernels/data-race/global_read_write_race.sim
@@ -1,6 +1,6 @@
 global_read_write_race.cl
 global_read_write_race
 4 1 1
-1 1 1
+4 1 1
 
 <size=16 range=0:1:3 dump>
diff --git a/tests/kernels/data-race/global_write_write_race.ref b/tests/kernels/data-race/global_write_write_race.ref
index 0b31b65..475e301 100644
--- a/tests/kernels/data-race/global_write_write_race.ref
+++ b/tests/kernels/data-race/global_write_write_race.ref
@@ -1,5 +1,6 @@
-ERROR EXPECTED
-
-Argument 'data': 4 bytes
-  data[0] = 3
+ERROR Write-write data race at global memory
+ERROR Write-write data race at global memory
+ERROR Write-write data race at global memory
 
+EXACT Argument 'data': 4 bytes
+MATCH   data[0] =
diff --git a/tests/kernels/data-race/increment.ref b/tests/kernels/data-race/increment.ref
index 11a20e6..3c1a851 100644
--- a/tests/kernels/data-race/increment.ref
+++ b/tests/kernels/data-race/increment.ref
@@ -1,7 +1,5 @@
-
-Argument 'data': 16 bytes
-  data[0] = 1
-  data[1] = 2
-  data[2] = 3
-  data[3] = 4
-
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 1
+EXACT   data[1] = 2
+EXACT   data[2] = 3
+EXACT   data[3] = 4
diff --git a/tests/kernels/data-race/intergroup_hidden_race.ref b/tests/kernels/data-race/intergroup_hidden_race.ref
index 9390c4c..e210dc0 100644
--- a/tests/kernels/data-race/intergroup_hidden_race.ref
+++ b/tests/kernels/data-race/intergroup_hidden_race.ref
@@ -1,6 +1,5 @@
-ERROR EXPECTED
-
-Argument 'output': 8 bytes
-  output[0] = 0
-  output[1] = 0
+ERROR Read-write data race at global memory
 
+EXACT Argument 'output': 8 bytes
+MATCH   output[0] =
+MATCH   output[1] =
diff --git a/tests/kernels/data-race/intergroup_race.ref b/tests/kernels/data-race/intergroup_race.ref
index 4da13c4..7078eef 100644
--- a/tests/kernels/data-race/intergroup_race.ref
+++ b/tests/kernels/data-race/intergroup_race.ref
@@ -1,8 +1,5 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 0
-  data[1] = 1
-  data[2] = 2
-  data[3] = 3
+ERROR Read-write data race at global memory
 
+EXACT Argument 'data': 8 bytes
+MATCH   data[0] =
+MATCH   data[1] =
diff --git a/tests/kernels/data-race/intergroup_race.sim b/tests/kernels/data-race/intergroup_race.sim
index 4e60c87..6a10dd8 100644
--- a/tests/kernels/data-race/intergroup_race.sim
+++ b/tests/kernels/data-race/intergroup_race.sim
@@ -1,6 +1,6 @@
 intergroup_race.cl
 intergroup_race
-16 1 1
+8 1 1
 4 1 1
 
-<size=16 fill=0 dump>
+<size=8 fill=0 dump>
diff --git a/tests/kernels/data-race/intragroup_hidden_race.ref b/tests/kernels/data-race/intragroup_hidden_race.ref
index 7ff022b..102070c 100644
--- a/tests/kernels/data-race/intragroup_hidden_race.ref
+++ b/tests/kernels/data-race/intragroup_hidden_race.ref
@@ -1,6 +1,5 @@
-ERROR EXPECTED
-
-Argument 'output': 8 bytes
-  output[0] = 42
-  output[1] = 42
+ERROR Read-write data race at global memory
 
+EXACT Argument 'output': 8 bytes
+EXACT   output[0] = 42
+EXACT   output[1] = 42
diff --git a/tests/kernels/data-race/local_only_fence.ref b/tests/kernels/data-race/local_only_fence.ref
index b6b7f00..8f99004 100644
--- a/tests/kernels/data-race/local_only_fence.ref
+++ b/tests/kernels/data-race/local_only_fence.ref
@@ -1,8 +1,18 @@
-ERROR EXPECTED
-
-Argument 'output': 16 bytes
-  output[0] = 6
-  output[1] = 22
-  output[2] = 38
-  output[3] = 54
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
+ERROR Read-write data race at global memory address
 
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 6
+EXACT   output[1] = 22
+EXACT   output[2] = 38
+EXACT   output[3] = 54
diff --git a/tests/kernels/data-race/local_read_write_race.cl b/tests/kernels/data-race/local_read_write_race.cl
index bcc3ff8..3834fd4 100644
--- a/tests/kernels/data-race/local_read_write_race.cl
+++ b/tests/kernels/data-race/local_read_write_race.cl
@@ -1,6 +1,9 @@
 kernel void local_read_write_race(global int *data, local int *scratch)
 {
   int l = get_local_id(0);
+  scratch[l] = 0;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
   scratch[l] = l;
   if (l == 0)
   {
diff --git a/tests/kernels/data-race/local_read_write_race.ref b/tests/kernels/data-race/local_read_write_race.ref
index 0943b15..f4792d4 100644
--- a/tests/kernels/data-race/local_read_write_race.ref
+++ b/tests/kernels/data-race/local_read_write_race.ref
@@ -1,5 +1,6 @@
-ERROR EXPECETD
-
-Argument 'data': 4 bytes
-  data[0] = 0
+ERROR Read-write data race at local memory
+ERROR Read-write data race at local memory
+ERROR Read-write data race at local memory
 
+EXACT Argument 'data': 4 bytes
+EXACT   data[0] = 0
diff --git a/tests/kernels/data-race/local_write_write_race.ref b/tests/kernels/data-race/local_write_write_race.ref
index 3fe4e95..f094bf0 100644
--- a/tests/kernels/data-race/local_write_write_race.ref
+++ b/tests/kernels/data-race/local_write_write_race.ref
@@ -1,8 +1,9 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 3
-  data[2] = 3
-  data[3] = 3
+ERROR Write-write data race at local memory
+ERROR Write-write data race at local memory
+ERROR Write-write data race at local memory
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 3
+EXACT   data[2] = 3
+EXACT   data[3] = 3
diff --git a/tests/kernels/data-race/uniform_write_race.ref b/tests/kernels/data-race/uniform_write_race.ref
index b688113..3cdbf81 100644
--- a/tests/kernels/data-race/uniform_write_race.ref
+++ b/tests/kernels/data-race/uniform_write_race.ref
@@ -1,4 +1,2 @@
-
-Argument 'data': 4 bytes
-  data[0] = 0
-
+EXACT Argument 'data': 4 bytes
+EXACT   data[0] = 0
diff --git a/tests/kernels/memcheck/async_copy_out_of_bounds.ref b/tests/kernels/memcheck/async_copy_out_of_bounds.ref
index 9a8cb35..82c85c5 100644
--- a/tests/kernels/memcheck/async_copy_out_of_bounds.ref
+++ b/tests/kernels/memcheck/async_copy_out_of_bounds.ref
@@ -1,8 +1,7 @@
-ERROR EXPECTED
-
-Argument 'dst': 16 bytes
-  dst[0] = 0
-  dst[1] = 0
-  dst[2] = 1
-  dst[3] = 2
+ERROR Invalid write of size 4 at global memory
 
+EXACT Argument 'dst': 16 bytes
+EXACT   dst[0] = 0
+EXACT   dst[1] = 0
+EXACT   dst[2] = 1
+EXACT   dst[3] = 2
diff --git a/tests/kernels/memcheck/atomic_out_of_bounds.ref b/tests/kernels/memcheck/atomic_out_of_bounds.ref
index cfcff7d..3a26898 100644
--- a/tests/kernels/memcheck/atomic_out_of_bounds.ref
+++ b/tests/kernels/memcheck/atomic_out_of_bounds.ref
@@ -1,8 +1,9 @@
-ERROR EXPECTED
-
-Argument 'counters': 16 bytes
-  counters[0] = 1
-  counters[1] = 1
-  counters[2] = 1
-  counters[3] = 1
+ERROR Invalid read of size 4 at global memory
+ERROR Invalid write of size 4 at global memory
+ERROR Uninitialized value
 
+EXACT Argument 'counters': 16 bytes
+EXACT   counters[0] = 1
+EXACT   counters[1] = 1
+EXACT   counters[2] = 1
+EXACT   counters[3] = 1
diff --git a/tests/kernels/memcheck/casted_static_array.cl b/tests/kernels/memcheck/casted_static_array.cl
new file mode 100644
index 0000000..2519acb
--- /dev/null
+++ b/tests/kernels/memcheck/casted_static_array.cl
@@ -0,0 +1,31 @@
+void transparent_crc_no_string(ulong *p1, ulong p2) { *p1 += p2; }  // Adds p2 into the running total that p1 points to.
+int get_linear_global_id() {  // Flattens the 3-D global id into one index, dimension 0 varying fastest.
+  return (get_global_id(2) * get_global_size(1) + get_global_id(1)) *
+             get_global_size(0) +
+         get_global_id(0);
+}
+union U5 {  // f0 and f3 share the same storage.
+  short f0;
+  int f3;
+};
+struct S6 {
+  union U5 g_75[5][7][2];  // First extent is 5; the kernel's loop indexes it with i up to 8.
+  union U5 **g_91[78];
+};
+__kernel void casted_static_array(__global ulong *p1) {  // Accumulates g_75[i][0][0].f0 for i = 0..8 and stores the total at this work-item's linear global id.
+  int i, j, k;
+  struct S6 c_864;
+  struct S6 *p_863 = &c_864;
+  union U5 *p_863_6;
+  struct S6 c_865 = {{{{{0xD54EL}}}}, {&p_863_6}};  // Explicit initializers only for g_75[0][0][0] and g_91[0].
+  c_864 = c_865;
+  ulong crc64_context = i = 0;  // Chained assignment: also zeroes the loop counter i.
+  for (; i < 9; i++) {  // i = 5..8 is past g_75's first extent (5); presumably the four errors the .ref expects.
+    j = 0;
+    {
+      k = 0;
+      { transparent_crc_no_string(&crc64_context, p_863->g_75[i][j][k].f0); }
+    }
+  }
+  p1[get_linear_global_id()] = crc64_context;
+}
diff --git a/tests/kernels/memcheck/casted_static_array.ref b/tests/kernels/memcheck/casted_static_array.ref
new file mode 100644
index 0000000..b96430d
--- /dev/null
+++ b/tests/kernels/memcheck/casted_static_array.ref
@@ -0,0 +1,7 @@
+ERROR exceeds static array size
+ERROR exceeds static array size
+ERROR exceeds static array size
+ERROR exceeds static array size
+
+EXACT Argument 'p1': 8 bytes
+MATCH   p1[0] =
diff --git a/tests/kernels/memcheck/casted_static_array.sim b/tests/kernels/memcheck/casted_static_array.sim
new file mode 100644
index 0000000..4edcd72
--- /dev/null
+++ b/tests/kernels/memcheck/casted_static_array.sim
@@ -0,0 +1,6 @@
+casted_static_array.cl
+casted_static_array
+1 1 1
+1 1 1
+
+<size=8 fill=0 dump>
diff --git a/tests/kernels/memcheck/dereference_null.ref b/tests/kernels/memcheck/dereference_null.ref
index 5a01471..3f8d021 100644
--- a/tests/kernels/memcheck/dereference_null.ref
+++ b/tests/kernels/memcheck/dereference_null.ref
@@ -1,5 +1,5 @@
-ERROR EXPECTED
-
-Argument 'output': 4 bytes
-  output[0] = 0
+ERROR Invalid read of size 4 at global memory address 0x0
+ERROR Uninitialized value
 
+EXACT Argument 'output': 4 bytes
+EXACT   output[0] = 0
diff --git a/tests/kernels/memcheck/fake_out_of_bounds.cl b/tests/kernels/memcheck/fake_out_of_bounds.cl
new file mode 100644
index 0000000..90fce03
--- /dev/null
+++ b/tests/kernels/memcheck/fake_out_of_bounds.cl
@@ -0,0 +1,12 @@
+struct S0 {
+    uchar f[1];  // Single-element byte array placed ahead of the wider g elements.
+    ulong g[4];
+};
+
+__kernel void entry(__global ulong *result) {  // Writes t.g[0] (2 per the initializer) to *result.
+    struct S0 s = {{1}, {2,3,4,5}};
+    struct S0 t = s;  // Whole-struct copy; the read below goes through the copy.
+
+    volatile int i = 0;  // NOTE(review): volatile presumably keeps the index from being constant-folded - confirm.
+    *result = t.g[i];  // In bounds (i == 0); the .ref for this test lists no ERROR lines.
+}
diff --git a/tests/kernels/memcheck/fake_out_of_bounds.ref b/tests/kernels/memcheck/fake_out_of_bounds.ref
new file mode 100644
index 0000000..d932a4a
--- /dev/null
+++ b/tests/kernels/memcheck/fake_out_of_bounds.ref
@@ -0,0 +1,2 @@
+EXACT Argument 'result': 8 bytes
+EXACT   result[0] = 2
diff --git a/tests/kernels/memcheck/fake_out_of_bounds.sim b/tests/kernels/memcheck/fake_out_of_bounds.sim
new file mode 100644
index 0000000..23799e4
--- /dev/null
+++ b/tests/kernels/memcheck/fake_out_of_bounds.sim
@@ -0,0 +1,6 @@
+fake_out_of_bounds.cl
+entry
+1 1 1
+1 1 1
+
+<size=8 fill=0 dump>
diff --git a/tests/kernels/memcheck/read_out_of_bounds.ref b/tests/kernels/memcheck/read_out_of_bounds.ref
index 539c267..75a0f05 100644
--- a/tests/kernels/memcheck/read_out_of_bounds.ref
+++ b/tests/kernels/memcheck/read_out_of_bounds.ref
@@ -1,9 +1,10 @@
-ERROR EXPECTED
-
-Argument 'c': 20 bytes
-  c[0] = 0
-  c[1] = 2
-  c[2] = 4
-  c[3] = 6
-  c[4] = 0
+ERROR Invalid read of size 4 at global memory
+ERROR Invalid read of size 4 at global memory
+ERROR Uninitialized value
 
+EXACT Argument 'c': 20 bytes
+EXACT   c[0] = 0
+EXACT   c[1] = 2
+EXACT   c[2] = 4
+EXACT   c[3] = 6
+EXACT   c[4] = 0
diff --git a/tests/kernels/memcheck/read_write_only_memory.ref b/tests/kernels/memcheck/read_write_only_memory.ref
index cb933ab..90270d4 100644
--- a/tests/kernels/memcheck/read_write_only_memory.ref
+++ b/tests/kernels/memcheck/read_write_only_memory.ref
@@ -1,8 +1,10 @@
-ERROR EXPECTED
-
-Argument 'output': 16 bytes
-  output[0] = 0
-  output[1] = 1
-  output[2] = 2
-  output[3] = 3
+ERROR Invalid read from write-only buffer
+ERROR Invalid read from write-only buffer
+ERROR Invalid read from write-only buffer
+ERROR Invalid read from write-only buffer
 
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 0
+EXACT   output[1] = 1
+EXACT   output[2] = 2
+EXACT   output[3] = 3
diff --git a/tests/kernels/memcheck/static_array.cl b/tests/kernels/memcheck/static_array.cl
new file mode 100644
index 0000000..973f86b
--- /dev/null
+++ b/tests/kernels/memcheck/static_array.cl
@@ -0,0 +1,13 @@
+struct S
+{
+  int a;
+  char b[2];  // Two elements only; the kernel indexes this with the work-item's global id.
+};
+
+kernel void static_array(global char *output)  // Stores the global id into s.b[i], then copies s.b[i] to output[i].
+{
+  volatile struct S s = {-1, {42, 7}};
+  int i = get_global_id(0);
+  s.b[i] = i;  // Past the end of b[2] whenever i >= 2 (the .sim launches 4 work-items).
+  output[i] = s.b[i];  // Reads the same index back; again past b for i >= 2.
+}
diff --git a/tests/kernels/memcheck/static_array.ref b/tests/kernels/memcheck/static_array.ref
new file mode 100644
index 0000000..a0a4901
--- /dev/null
+++ b/tests/kernels/memcheck/static_array.ref
@@ -0,0 +1,10 @@
+ERROR exceeds static array size
+ERROR exceeds static array size
+ERROR exceeds static array size
+ERROR exceeds static array size
+
+EXACT Argument 'output': 4 bytes
+EXACT   output[0] = 0
+EXACT   output[1] = 1
+MATCH   output[2] =
+MATCH   output[3] =
diff --git a/tests/kernels/memcheck/static_array.sim b/tests/kernels/memcheck/static_array.sim
new file mode 100644
index 0000000..cbfda15
--- /dev/null
+++ b/tests/kernels/memcheck/static_array.sim
@@ -0,0 +1,6 @@
+static_array.cl
+static_array
+4 1 1
+4 1 1
+
+<size=4 fill=0 dump>
diff --git a/tests/kernels/memcheck/static_array_padded_struct.cl b/tests/kernels/memcheck/static_array_padded_struct.cl
new file mode 100644
index 0000000..30afccc
--- /dev/null
+++ b/tests/kernels/memcheck/static_array_padded_struct.cl
@@ -0,0 +1,12 @@
+struct S
+{
+  int a;
+  char b[2];  // NOTE(review): the file name suggests indices past b are meant to land in trailing struct padding - confirm.
+};
+
+kernel void static_array_padded_struct(global char *output)  // Copies s.b[i] to output[i] for this work-item.
+{
+  struct S s = {-1, {42, 7}};
+  int i = get_global_id(0);
+  output[i] = s.b[i];  // Reads past b[2] for i >= 2; the .ref pins exact values only for output[0] and output[1].
+}
diff --git a/tests/kernels/memcheck/static_array_padded_struct.ref b/tests/kernels/memcheck/static_array_padded_struct.ref
new file mode 100644
index 0000000..d38dbb7
--- /dev/null
+++ b/tests/kernels/memcheck/static_array_padded_struct.ref
@@ -0,0 +1,8 @@
+ERROR exceeds static array size
+ERROR exceeds static array size
+
+EXACT Argument 'output': 4 bytes
+EXACT   output[0] = 42
+EXACT   output[1] = 7
+MATCH   output[2] =
+MATCH   output[3] =
diff --git a/tests/kernels/memcheck/static_array_padded_struct.sim b/tests/kernels/memcheck/static_array_padded_struct.sim
new file mode 100644
index 0000000..a3bd18e
--- /dev/null
+++ b/tests/kernels/memcheck/static_array_padded_struct.sim
@@ -0,0 +1,6 @@
+static_array_padded_struct.cl
+static_array_padded_struct
+4 1 1
+4 1 1
+
+<size=4 fill=0 dump>
diff --git a/tests/kernels/memcheck/write_out_of_bounds.ref b/tests/kernels/memcheck/write_out_of_bounds.ref
index 6412f26..c79217d 100644
--- a/tests/kernels/memcheck/write_out_of_bounds.ref
+++ b/tests/kernels/memcheck/write_out_of_bounds.ref
@@ -1,8 +1,7 @@
-ERROR EXPECTED
-
-Argument 'c': 16 bytes
-  c[0] = 0
-  c[1] = 2
-  c[2] = 4
-  c[3] = 6
+ERROR Invalid write of size 4 at global memory address
 
+EXACT Argument 'c': 16 bytes
+EXACT   c[0] = 0
+EXACT   c[1] = 2
+EXACT   c[2] = 4
+EXACT   c[3] = 6
diff --git a/tests/kernels/memcheck/write_read_only_memory.ref b/tests/kernels/memcheck/write_read_only_memory.ref
index cb933ab..c1b469c 100644
--- a/tests/kernels/memcheck/write_read_only_memory.ref
+++ b/tests/kernels/memcheck/write_read_only_memory.ref
@@ -1,8 +1,10 @@
-ERROR EXPECTED
-
-Argument 'output': 16 bytes
-  output[0] = 0
-  output[1] = 1
-  output[2] = 2
-  output[3] = 3
+ERROR Invalid write to read-only buffer
+ERROR Invalid write to read-only buffer
+ERROR Invalid write to read-only buffer
+ERROR Invalid write to read-only buffer
 
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 0
+EXACT   output[1] = 1
+EXACT   output[2] = 2
+EXACT   output[3] = 3
diff --git a/tests/kernels/misc/array.ref b/tests/kernels/misc/array.ref
index 1a1d2d0..f999ca8 100644
--- a/tests/kernels/misc/array.ref
+++ b/tests/kernels/misc/array.ref
@@ -1,131 +1,129 @@
-
-Argument 'output': 1024 bytes
-  output[0] = 0
-  output[1] = 0
-  output[2] = 0
-  output[3] = 0
-  output[4] = 0
-  output[5] = 0
-  output[6] = 0
-  output[7] = 0
-  output[8] = 0
-  output[9] = 0
-  output[10] = 0
-  output[11] = 0
-  output[12] = 0
-  output[13] = 0
-  output[14] = 0
-  output[15] = 0
-  output[16] = 1
-  output[17] = 1
-  output[18] = 1
-  output[19] = 1
-  output[20] = 1
-  output[21] = 1
-  output[22] = 1
-  output[23] = 1
-  output[24] = 1
-  output[25] = 1
-  output[26] = 1
-  output[27] = 1
-  output[28] = 1
-  output[29] = 1
-  output[30] = 1
-  output[31] = 1
-  output[32] = 2
-  output[33] = 2
-  output[34] = 2
-  output[35] = 2
-  output[36] = 2
-  output[37] = 2
-  output[38] = 2
-  output[39] = 2
-  output[40] = 2
-  output[41] = 2
-  output[42] = 2
-  output[43] = 2
-  output[44] = 2
-  output[45] = 2
-  output[46] = 2
-  output[47] = 2
-  output[48] = 3
-  output[49] = 3
-  output[50] = 3
-  output[51] = 3
-  output[52] = 3
-  output[53] = 3
-  output[54] = 3
-  output[55] = 3
-  output[56] = 3
-  output[57] = 3
-  output[58] = 3
-  output[59] = 3
-  output[60] = 3
-  output[61] = 3
-  output[62] = 3
-  output[63] = 3
-  output[64] = 4
-  output[65] = 4
-  output[66] = 4
-  output[67] = 4
-  output[68] = 4
-  output[69] = 4
-  output[70] = 4
-  output[71] = 4
-  output[72] = 4
-  output[73] = 4
-  output[74] = 4
-  output[75] = 4
-  output[76] = 4
-  output[77] = 4
-  output[78] = 4
-  output[79] = 4
-  output[80] = 5
-  output[81] = 5
-  output[82] = 5
-  output[83] = 5
-  output[84] = 5
-  output[85] = 5
-  output[86] = 5
-  output[87] = 5
-  output[88] = 5
-  output[89] = 5
-  output[90] = 5
-  output[91] = 5
-  output[92] = 5
-  output[93] = 5
-  output[94] = 5
-  output[95] = 5
-  output[96] = 6
-  output[97] = 6
-  output[98] = 6
-  output[99] = 6
-  output[100] = 6
-  output[101] = 6
-  output[102] = 6
-  output[103] = 6
-  output[104] = 6
-  output[105] = 6
-  output[106] = 6
-  output[107] = 6
-  output[108] = 6
-  output[109] = 6
-  output[110] = 6
-  output[111] = 6
-  output[112] = 7
-  output[113] = 7
-  output[114] = 7
-  output[115] = 7
-  output[116] = 7
-  output[117] = 7
-  output[118] = 7
-  output[119] = 7
-  output[120] = 7
-  output[121] = 7
-  output[122] = 7
-  output[123] = 7
-  output[124] = 7
-  output[125] = 7
-  output[126] = 7
-  output[127] = 7
-
+EXACT Argument 'output': 1024 bytes
+EXACT   output[0] = 0
+EXACT   output[1] = 0
+EXACT   output[2] = 0
+EXACT   output[3] = 0
+EXACT   output[4] = 0
+EXACT   output[5] = 0
+EXACT   output[6] = 0
+EXACT   output[7] = 0
+EXACT   output[8] = 0
+EXACT   output[9] = 0
+EXACT   output[10] = 0
+EXACT   output[11] = 0
+EXACT   output[12] = 0
+EXACT   output[13] = 0
+EXACT   output[14] = 0
+EXACT   output[15] = 0
+EXACT   output[16] = 1
+EXACT   output[17] = 1
+EXACT   output[18] = 1
+EXACT   output[19] = 1
+EXACT   output[20] = 1
+EXACT   output[21] = 1
+EXACT   output[22] = 1
+EXACT   output[23] = 1
+EXACT   output[24] = 1
+EXACT   output[25] = 1
+EXACT   output[26] = 1
+EXACT   output[27] = 1
+EXACT   output[28] = 1
+EXACT   output[29] = 1
+EXACT   output[30] = 1
+EXACT   output[31] = 1
+EXACT   output[32] = 2
+EXACT   output[33] = 2
+EXACT   output[34] = 2
+EXACT   output[35] = 2
+EXACT   output[36] = 2
+EXACT   output[37] = 2
+EXACT   output[38] = 2
+EXACT   output[39] = 2
+EXACT   output[40] = 2
+EXACT   output[41] = 2
+EXACT   output[42] = 2
+EXACT   output[43] = 2
+EXACT   output[44] = 2
+EXACT   output[45] = 2
+EXACT   output[46] = 2
+EXACT   output[47] = 2
+EXACT   output[48] = 3
+EXACT   output[49] = 3
+EXACT   output[50] = 3
+EXACT   output[51] = 3
+EXACT   output[52] = 3
+EXACT   output[53] = 3
+EXACT   output[54] = 3
+EXACT   output[55] = 3
+EXACT   output[56] = 3
+EXACT   output[57] = 3
+EXACT   output[58] = 3
+EXACT   output[59] = 3
+EXACT   output[60] = 3
+EXACT   output[61] = 3
+EXACT   output[62] = 3
+EXACT   output[63] = 3
+EXACT   output[64] = 4
+EXACT   output[65] = 4
+EXACT   output[66] = 4
+EXACT   output[67] = 4
+EXACT   output[68] = 4
+EXACT   output[69] = 4
+EXACT   output[70] = 4
+EXACT   output[71] = 4
+EXACT   output[72] = 4
+EXACT   output[73] = 4
+EXACT   output[74] = 4
+EXACT   output[75] = 4
+EXACT   output[76] = 4
+EXACT   output[77] = 4
+EXACT   output[78] = 4
+EXACT   output[79] = 4
+EXACT   output[80] = 5
+EXACT   output[81] = 5
+EXACT   output[82] = 5
+EXACT   output[83] = 5
+EXACT   output[84] = 5
+EXACT   output[85] = 5
+EXACT   output[86] = 5
+EXACT   output[87] = 5
+EXACT   output[88] = 5
+EXACT   output[89] = 5
+EXACT   output[90] = 5
+EXACT   output[91] = 5
+EXACT   output[92] = 5
+EXACT   output[93] = 5
+EXACT   output[94] = 5
+EXACT   output[95] = 5
+EXACT   output[96] = 6
+EXACT   output[97] = 6
+EXACT   output[98] = 6
+EXACT   output[99] = 6
+EXACT   output[100] = 6
+EXACT   output[101] = 6
+EXACT   output[102] = 6
+EXACT   output[103] = 6
+EXACT   output[104] = 6
+EXACT   output[105] = 6
+EXACT   output[106] = 6
+EXACT   output[107] = 6
+EXACT   output[108] = 6
+EXACT   output[109] = 6
+EXACT   output[110] = 6
+EXACT   output[111] = 6
+EXACT   output[112] = 7
+EXACT   output[113] = 7
+EXACT   output[114] = 7
+EXACT   output[115] = 7
+EXACT   output[116] = 7
+EXACT   output[117] = 7
+EXACT   output[118] = 7
+EXACT   output[119] = 7
+EXACT   output[120] = 7
+EXACT   output[121] = 7
+EXACT   output[122] = 7
+EXACT   output[123] = 7
+EXACT   output[124] = 7
+EXACT   output[125] = 7
+EXACT   output[126] = 7
+EXACT   output[127] = 7
diff --git a/tests/kernels/misc/lvalue_loads.cl b/tests/kernels/misc/lvalue_loads.cl
new file mode 100644
index 0000000..eed8331
--- /dev/null
+++ b/tests/kernels/misc/lvalue_loads.cl
@@ -0,0 +1,119 @@
+typedef struct
+{
+  char  a;
+  int   b;
+  int   c;
+  char  d;
+} S;  // NOTE(review): the .sim fills S buffers as four ints (size=16), so a and d presumably each sit in a padded 4-byte slot - confirm.
+
+void va(global float4 *input, global float4 *output)  // Stores a constant into one component of output[i]; input is not read.
+{
+  int i = get_global_id(0);
+  output[i].z = 42.f;
+}
+
+void vb(global float4 *input, global float4 *output)  // Stores constants into two components using separate assignments; input is not read.
+{
+  int i = get_global_id(0);
+  output[i].z = 7.f;
+  output[i].y = 42.f;
+}
+
+void vc(global float4 *input, global float4 *output)  // Stores two constants at once through a swizzled lvalue; input is not read.
+{
+  int i = get_global_id(0);
+  output[i].zy = (float2)(7.f,42.f);  // z receives 7, y receives 42.
+}
+
+void vd(global float4 *input, global float4 *output)  // Copies z into y within the same output element; input is not read.
+{
+  int i = get_global_id(0);
+  output[i].y = output[i].z;
+}
+
+void ve(global float4 *input, global float4 *output)  // Reverses the component order of output[i] in place; input is not read.
+{
+  int i = get_global_id(0);
+  output[i].wzyx = output[i];
+}
+
+void vf(global float4 *input, global float4 *output)  // Swaps y and z of output[i] in place; input is not read.
+{
+  int i = get_global_id(0);
+  output[i].zy = output[i].yz;
+}
+
+void vg(global float4 *input, global float4 *output)  // Stores input[i] into output[i] with the component order reversed.
+{
+  int i = get_global_id(0);
+  output[i].wzyx = input[i];
+}
+
+void vh(global float4 *input, global float4 *output)  // Writes input[i].y to output[i].z and input[i].z to output[i].y.
+{
+  int i = get_global_id(0);
+  output[i].zy = input[i].yz;
+}
+
+void vi(global float4 *input, global float4 *output)  // Edits one component in a private copy, then stores the copy twice; input is not read.
+{
+  int i = get_global_id(0);
+
+  float4 x = output[i];  // Whole-vector load into a private variable.
+  x.z = 42.f;
+  output[i] = x;
+  output[i+1] = x;  // Also overwrites the next element (the .sim sizes this buffer at 32 bytes).
+}
+
+void sa(global S *input, global S *output)  // Stores a constant into field c of output[i]; input is not read.
+{
+  int i = get_global_id(0);
+  output[i].c = 42;
+}
+
+void sb(global S *input, global S *output)  // Copies field b to field c within the same output element; input is not read.
+{
+  int i = get_global_id(0);
+  output[i].c = output[i].b;
+}
+
+void sc(global S *input, global S *output)  // Copies input[i].b into output[i].c.
+{
+  int i = get_global_id(0);
+  output[i].c = input[i].b;
+}
+
+kernel void lvalue_loads(  // Runs each helper once, giving every helper its own output buffer.
+  global float4 *vIn,  // Shared input for all vector helpers.
+  global float4 *vA,
+  global float4 *vB,
+  global float4 *vC,
+  global float4 *vD,
+  global float4 *vE,
+  global float4 *vF,
+  global float4 *vG,
+  global float4 *vH,
+  global float4 *vI,
+
+  global S      *sIn,  // Shared input for all struct helpers.
+  global S      *sA,
+  global S      *sB,
+  global S      *sC,
+
+  global float  *nop  // Never referenced in the kernel body.
+  )
+{
+  va(vIn, vA);
+  vb(vIn, vB);
+  vc(vIn, vC);
+  vd(vIn, vD);
+  ve(vIn, vE);
+  vf(vIn, vF);
+  vg(vIn, vG);
+  vh(vIn, vH);
+  vi(vIn, vI);
+
+  sa(sIn, sA);
+  sb(sIn, sB);
+  sc(sIn, sC);
+}
diff --git a/tests/kernels/misc/lvalue_loads.ref b/tests/kernels/misc/lvalue_loads.ref
new file mode 100644
index 0000000..a2c7852
--- /dev/null
+++ b/tests/kernels/misc/lvalue_loads.ref
@@ -0,0 +1,75 @@
+EXACT Argument 'vA': 16 bytes
+EXACT   vA[0] = 1
+EXACT   vA[1] = 2
+EXACT   vA[2] = 42
+EXACT   vA[3] = 4
+
+EXACT Argument 'vB': 16 bytes
+EXACT   vB[0] = 1
+EXACT   vB[1] = 42
+EXACT   vB[2] = 7
+EXACT   vB[3] = 4
+
+EXACT Argument 'vC': 16 bytes
+EXACT   vC[0] = 1
+EXACT   vC[1] = 42
+EXACT   vC[2] = 7
+EXACT   vC[3] = 4
+
+EXACT Argument 'vD': 16 bytes
+EXACT   vD[0] = 1
+EXACT   vD[1] = 3
+EXACT   vD[2] = 3
+EXACT   vD[3] = 4
+
+EXACT Argument 'vE': 16 bytes
+EXACT   vE[0] = 4
+EXACT   vE[1] = 3
+EXACT   vE[2] = 2
+EXACT   vE[3] = 1
+
+EXACT Argument 'vF': 16 bytes
+EXACT   vF[0] = 1
+EXACT   vF[1] = 3
+EXACT   vF[2] = 2
+EXACT   vF[3] = 4
+
+EXACT Argument 'vG': 16 bytes
+EXACT   vG[0] = 19
+EXACT   vG[1] = 18
+EXACT   vG[2] = 17
+EXACT   vG[3] = 16
+
+EXACT Argument 'vH': 16 bytes
+EXACT   vH[0] = 1
+EXACT   vH[1] = 18
+EXACT   vH[2] = 17
+EXACT   vH[3] = 4
+
+EXACT Argument 'vI': 32 bytes
+EXACT   vI[0] = 1
+EXACT   vI[1] = 2
+EXACT   vI[2] = 42
+EXACT   vI[3] = 4
+EXACT   vI[4] = 1
+EXACT   vI[5] = 2
+EXACT   vI[6] = 42
+EXACT   vI[7] = 4
+
+EXACT Argument 'sA': 16 bytes
+EXACT   sA[0] = 1
+EXACT   sA[1] = 2
+EXACT   sA[2] = 42
+EXACT   sA[3] = 4
+
+EXACT Argument 'sB': 16 bytes
+EXACT   sB[0] = 1
+EXACT   sB[1] = 2
+EXACT   sB[2] = 2
+EXACT   sB[3] = 4
+
+EXACT Argument 'sC': 16 bytes
+EXACT   sC[0] = 1
+EXACT   sC[1] = 2
+EXACT   sC[2] = 17
+EXACT   sC[3] = 4
diff --git a/tests/kernels/misc/lvalue_loads.sim b/tests/kernels/misc/lvalue_loads.sim
new file mode 100644
index 0000000..66823ee
--- /dev/null
+++ b/tests/kernels/misc/lvalue_loads.sim
@@ -0,0 +1,29 @@
+lvalue_loads.cl
+lvalue_loads
+1 1 1
+1 1 1
+
+# Vector input
+<size=16 range=16:1:19>
+
+# Vector outputs
+<size=16 range=1:1:4 dump wo>
+<size=16 range=1:1:4 dump wo>
+<size=16 range=1:1:4 dump wo>
+<size=16 range=1:1:4 dump>
+<size=16 range=1:1:4 dump>
+<size=16 range=1:1:4 dump>
+<size=16 range=1:1:4 dump wo>
+<size=16 range=1:1:4 dump wo>
+<size=32 range=1:1:8 dump>
+
+# Structure input
+<size=16 int range=16:1:19>
+
+# Structure outputs
+<size=16 int range=1:1:4 dump wo>
+<size=16 int range=1:1:4 dump>
+<size=16 int range=1:1:4 dump wo>
+
+# NOP
+<size=0 fill=0>
diff --git a/tests/kernels/misc/program_scope_constant_array.cl b/tests/kernels/misc/program_scope_constant_array.cl
new file mode 100644
index 0000000..786b0b2
--- /dev/null
+++ b/tests/kernels/misc/program_scope_constant_array.cl
@@ -0,0 +1,7 @@
+constant int data[4] = {7, 42, 0, -1};  // Program-scope table in the constant address space, read by the kernel below.
+
+kernel void program_scope_constant_array(global int *output)  // Copies data[i] to output[i] for this work-item.
+{
+  int i = get_global_id(0);
+  output[i] = data[i];  // The .sim launches 4 work-items, matching data's 4 elements.
+}
diff --git a/tests/kernels/misc/program_scope_constant_array.ref b/tests/kernels/misc/program_scope_constant_array.ref
new file mode 100644
index 0000000..ef2df1c
--- /dev/null
+++ b/tests/kernels/misc/program_scope_constant_array.ref
@@ -0,0 +1,5 @@
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 7
+EXACT   output[1] = 42
+EXACT   output[2] = 0
+EXACT   output[3] = -1
diff --git a/tests/kernels/misc/program_scope_constant_array.sim b/tests/kernels/misc/program_scope_constant_array.sim
new file mode 100644
index 0000000..1f6eecd
--- /dev/null
+++ b/tests/kernels/misc/program_scope_constant_array.sim
@@ -0,0 +1,6 @@
+program_scope_constant_array.cl
+program_scope_constant_array
+4 1 1
+1 1 1
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/misc/reduce.ref b/tests/kernels/misc/reduce.ref
index fa92b4e..3ebb5ea 100644
--- a/tests/kernels/misc/reduce.ref
+++ b/tests/kernels/misc/reduce.ref
@@ -1,4 +1,2 @@
-
-Argument 'result': 4 bytes
-  result[0] = 120
-
+EXACT Argument 'result': 4 bytes
+EXACT   result[0] = 120
diff --git a/tests/kernels/misc/vecadd.ref b/tests/kernels/misc/vecadd.ref
index 9fa7b4c..841371f 100644
--- a/tests/kernels/misc/vecadd.ref
+++ b/tests/kernels/misc/vecadd.ref
@@ -1,1027 +1,1025 @@
-
-Argument 'c': 4096 bytes
-  c[0] = 0
-  c[1] = 2
-  c[2] = 4
-  c[3] = 6
-  c[4] = 8
-  c[5] = 10
-  c[6] = 12
-  c[7] = 14
-  c[8] = 16
-  c[9] = 18
-  c[10] = 20
-  c[11] = 22
-  c[12] = 24
-  c[13] = 26
-  c[14] = 28
-  c[15] = 30
-  c[16] = 32
-  c[17] = 34
-  c[18] = 36
-  c[19] = 38
-  c[20] = 40
-  c[21] = 42
-  c[22] = 44
-  c[23] = 46
-  c[24] = 48
-  c[25] = 50
-  c[26] = 52
-  c[27] = 54
-  c[28] = 56
-  c[29] = 58
-  c[30] = 60
-  c[31] = 62
-  c[32] = 64
-  c[33] = 66
-  c[34] = 68
-  c[35] = 70
-  c[36] = 72
-  c[37] = 74
-  c[38] = 76
-  c[39] = 78
-  c[40] = 80
-  c[41] = 82
-  c[42] = 84
-  c[43] = 86
-  c[44] = 88
-  c[45] = 90
-  c[46] = 92
-  c[47] = 94
-  c[48] = 96
-  c[49] = 98
-  c[50] = 100
-  c[51] = 102
-  c[52] = 104
-  c[53] = 106
-  c[54] = 108
-  c[55] = 110
-  c[56] = 112
-  c[57] = 114
-  c[58] = 116
-  c[59] = 118
-  c[60] = 120
-  c[61] = 122
-  c[62] = 124
-  c[63] = 126
-  c[64] = 128
-  c[65] = 130
-  c[66] = 132
-  c[67] = 134
-  c[68] = 136
-  c[69] = 138
-  c[70] = 140
-  c[71] = 142
-  c[72] = 144
-  c[73] = 146
-  c[74] = 148
-  c[75] = 150
-  c[76] = 152
-  c[77] = 154
-  c[78] = 156
-  c[79] = 158
-  c[80] = 160
-  c[81] = 162
-  c[82] = 164
-  c[83] = 166
-  c[84] = 168
-  c[85] = 170
-  c[86] = 172
-  c[87] = 174
-  c[88] = 176
-  c[89] = 178
-  c[90] = 180
-  c[91] = 182
-  c[92] = 184
-  c[93] = 186
-  c[94] = 188
-  c[95] = 190
-  c[96] = 192
-  c[97] = 194
-  c[98] = 196
-  c[99] = 198
-  c[100] = 200
-  c[101] = 202
-  c[102] = 204
-  c[103] = 206
-  c[104] = 208
-  c[105] = 210
-  c[106] = 212
-  c[107] = 214
-  c[108] = 216
-  c[109] = 218
-  c[110] = 220
-  c[111] = 222
-  c[112] = 224
-  c[113] = 226
-  c[114] = 228
-  c[115] = 230
-  c[116] = 232
-  c[117] = 234
-  c[118] = 236
-  c[119] = 238
-  c[120] = 240
-  c[121] = 242
-  c[122] = 244
-  c[123] = 246
-  c[124] = 248
-  c[125] = 250
-  c[126] = 252
-  c[127] = 254
-  c[128] = 256
-  c[129] = 258
-  c[130] = 260
-  c[131] = 262
-  c[132] = 264
-  c[133] = 266
-  c[134] = 268
-  c[135] = 270
-  c[136] = 272
-  c[137] = 274
-  c[138] = 276
-  c[139] = 278
-  c[140] = 280
-  c[141] = 282
-  c[142] = 284
-  c[143] = 286
-  c[144] = 288
-  c[145] = 290
-  c[146] = 292
-  c[147] = 294
-  c[148] = 296
-  c[149] = 298
-  c[150] = 300
-  c[151] = 302
-  c[152] = 304
-  c[153] = 306
-  c[154] = 308
-  c[155] = 310
-  c[156] = 312
-  c[157] = 314
-  c[158] = 316
-  c[159] = 318
-  c[160] = 320
-  c[161] = 322
-  c[162] = 324
-  c[163] = 326
-  c[164] = 328
-  c[165] = 330
-  c[166] = 332
-  c[167] = 334
-  c[168] = 336
-  c[169] = 338
-  c[170] = 340
-  c[171] = 342
-  c[172] = 344
-  c[173] = 346
-  c[174] = 348
-  c[175] = 350
-  c[176] = 352
-  c[177] = 354
-  c[178] = 356
-  c[179] = 358
-  c[180] = 360
-  c[181] = 362
-  c[182] = 364
-  c[183] = 366
-  c[184] = 368
-  c[185] = 370
-  c[186] = 372
-  c[187] = 374
-  c[188] = 376
-  c[189] = 378
-  c[190] = 380
-  c[191] = 382
-  c[192] = 384
-  c[193] = 386
-  c[194] = 388
-  c[195] = 390
-  c[196] = 392
-  c[197] = 394
-  c[198] = 396
-  c[199] = 398
-  c[200] = 400
-  c[201] = 402
-  c[202] = 404
-  c[203] = 406
-  c[204] = 408
-  c[205] = 410
-  c[206] = 412
-  c[207] = 414
-  c[208] = 416
-  c[209] = 418
-  c[210] = 420
-  c[211] = 422
-  c[212] = 424
-  c[213] = 426
-  c[214] = 428
-  c[215] = 430
-  c[216] = 432
-  c[217] = 434
-  c[218] = 436
-  c[219] = 438
-  c[220] = 440
-  c[221] = 442
-  c[222] = 444
-  c[223] = 446
-  c[224] = 448
-  c[225] = 450
-  c[226] = 452
-  c[227] = 454
-  c[228] = 456
-  c[229] = 458
-  c[230] = 460
-  c[231] = 462
-  c[232] = 464
-  c[233] = 466
-  c[234] = 468
-  c[235] = 470
-  c[236] = 472
-  c[237] = 474
-  c[238] = 476
-  c[239] = 478
-  c[240] = 480
-  c[241] = 482
-  c[242] = 484
-  c[243] = 486
-  c[244] = 488
-  c[245] = 490
-  c[246] = 492
-  c[247] = 494
-  c[248] = 496
-  c[249] = 498
-  c[250] = 500
-  c[251] = 502
-  c[252] = 504
-  c[253] = 506
-  c[254] = 508
-  c[255] = 510
-  c[256] = 512
-  c[257] = 514
-  c[258] = 516
-  c[259] = 518
-  c[260] = 520
-  c[261] = 522
-  c[262] = 524
-  c[263] = 526
-  c[264] = 528
-  c[265] = 530
-  c[266] = 532
-  c[267] = 534
-  c[268] = 536
-  c[269] = 538
-  c[270] = 540
-  c[271] = 542
-  c[272] = 544
-  c[273] = 546
-  c[274] = 548
-  c[275] = 550
-  c[276] = 552
-  c[277] = 554
-  c[278] = 556
-  c[279] = 558
-  c[280] = 560
-  c[281] = 562
-  c[282] = 564
-  c[283] = 566
-  c[284] = 568
-  c[285] = 570
-  c[286] = 572
-  c[287] = 574
-  c[288] = 576
-  c[289] = 578
-  c[290] = 580
-  c[291] = 582
-  c[292] = 584
-  c[293] = 586
-  c[294] = 588
-  c[295] = 590
-  c[296] = 592
-  c[297] = 594
-  c[298] = 596
-  c[299] = 598
-  c[300] = 600
-  c[301] = 602
-  c[302] = 604
-  c[303] = 606
-  c[304] = 608
-  c[305] = 610
-  c[306] = 612
-  c[307] = 614
-  c[308] = 616
-  c[309] = 618
-  c[310] = 620
-  c[311] = 622
-  c[312] = 624
-  c[313] = 626
-  c[314] = 628
-  c[315] = 630
-  c[316] = 632
-  c[317] = 634
-  c[318] = 636
-  c[319] = 638
-  c[320] = 640
-  c[321] = 642
-  c[322] = 644
-  c[323] = 646
-  c[324] = 648
-  c[325] = 650
-  c[326] = 652
-  c[327] = 654
-  c[328] = 656
-  c[329] = 658
-  c[330] = 660
-  c[331] = 662
-  c[332] = 664
-  c[333] = 666
-  c[334] = 668
-  c[335] = 670
-  c[336] = 672
-  c[337] = 674
-  c[338] = 676
-  c[339] = 678
-  c[340] = 680
-  c[341] = 682
-  c[342] = 684
-  c[343] = 686
-  c[344] = 688
-  c[345] = 690
-  c[346] = 692
-  c[347] = 694
-  c[348] = 696
-  c[349] = 698
-  c[350] = 700
-  c[351] = 702
-  c[352] = 704
-  c[353] = 706
-  c[354] = 708
-  c[355] = 710
-  c[356] = 712
-  c[357] = 714
-  c[358] = 716
-  c[359] = 718
-  c[360] = 720
-  c[361] = 722
-  c[362] = 724
-  c[363] = 726
-  c[364] = 728
-  c[365] = 730
-  c[366] = 732
-  c[367] = 734
-  c[368] = 736
-  c[369] = 738
-  c[370] = 740
-  c[371] = 742
-  c[372] = 744
-  c[373] = 746
-  c[374] = 748
-  c[375] = 750
-  c[376] = 752
-  c[377] = 754
-  c[378] = 756
-  c[379] = 758
-  c[380] = 760
-  c[381] = 762
-  c[382] = 764
-  c[383] = 766
-  c[384] = 768
-  c[385] = 770
-  c[386] = 772
-  c[387] = 774
-  c[388] = 776
-  c[389] = 778
-  c[390] = 780
-  c[391] = 782
-  c[392] = 784
-  c[393] = 786
-  c[394] = 788
-  c[395] = 790
-  c[396] = 792
-  c[397] = 794
-  c[398] = 796
-  c[399] = 798
-  c[400] = 800
-  c[401] = 802
-  c[402] = 804
-  c[403] = 806
-  c[404] = 808
-  c[405] = 810
-  c[406] = 812
-  c[407] = 814
-  c[408] = 816
-  c[409] = 818
-  c[410] = 820
-  c[411] = 822
-  c[412] = 824
-  c[413] = 826
-  c[414] = 828
-  c[415] = 830
-  c[416] = 832
-  c[417] = 834
-  c[418] = 836
-  c[419] = 838
-  c[420] = 840
-  c[421] = 842
-  c[422] = 844
-  c[423] = 846
-  c[424] = 848
-  c[425] = 850
-  c[426] = 852
-  c[427] = 854
-  c[428] = 856
-  c[429] = 858
-  c[430] = 860
-  c[431] = 862
-  c[432] = 864
-  c[433] = 866
-  c[434] = 868
-  c[435] = 870
-  c[436] = 872
-  c[437] = 874
-  c[438] = 876
-  c[439] = 878
-  c[440] = 880
-  c[441] = 882
-  c[442] = 884
-  c[443] = 886
-  c[444] = 888
-  c[445] = 890
-  c[446] = 892
-  c[447] = 894
-  c[448] = 896
-  c[449] = 898
-  c[450] = 900
-  c[451] = 902
-  c[452] = 904
-  c[453] = 906
-  c[454] = 908
-  c[455] = 910
-  c[456] = 912
-  c[457] = 914
-  c[458] = 916
-  c[459] = 918
-  c[460] = 920
-  c[461] = 922
-  c[462] = 924
-  c[463] = 926
-  c[464] = 928
-  c[465] = 930
-  c[466] = 932
-  c[467] = 934
-  c[468] = 936
-  c[469] = 938
-  c[470] = 940
-  c[471] = 942
-  c[472] = 944
-  c[473] = 946
-  c[474] = 948
-  c[475] = 950
-  c[476] = 952
-  c[477] = 954
-  c[478] = 956
-  c[479] = 958
-  c[480] = 960
-  c[481] = 962
-  c[482] = 964
-  c[483] = 966
-  c[484] = 968
-  c[485] = 970
-  c[486] = 972
-  c[487] = 974
-  c[488] = 976
-  c[489] = 978
-  c[490] = 980
-  c[491] = 982
-  c[492] = 984
-  c[493] = 986
-  c[494] = 988
-  c[495] = 990
-  c[496] = 992
-  c[497] = 994
-  c[498] = 996
-  c[499] = 998
-  c[500] = 1000
-  c[501] = 1002
-  c[502] = 1004
-  c[503] = 1006
-  c[504] = 1008
-  c[505] = 1010
-  c[506] = 1012
-  c[507] = 1014
-  c[508] = 1016
-  c[509] = 1018
-  c[510] = 1020
-  c[511] = 1022
-  c[512] = 1024
-  c[513] = 1026
-  c[514] = 1028
-  c[515] = 1030
-  c[516] = 1032
-  c[517] = 1034
-  c[518] = 1036
-  c[519] = 1038
-  c[520] = 1040
-  c[521] = 1042
-  c[522] = 1044
-  c[523] = 1046
-  c[524] = 1048
-  c[525] = 1050
-  c[526] = 1052
-  c[527] = 1054
-  c[528] = 1056
-  c[529] = 1058
-  c[530] = 1060
-  c[531] = 1062
-  c[532] = 1064
-  c[533] = 1066
-  c[534] = 1068
-  c[535] = 1070
-  c[536] = 1072
-  c[537] = 1074
-  c[538] = 1076
-  c[539] = 1078
-  c[540] = 1080
-  c[541] = 1082
-  c[542] = 1084
-  c[543] = 1086
-  c[544] = 1088
-  c[545] = 1090
-  c[546] = 1092
-  c[547] = 1094
-  c[548] = 1096
-  c[549] = 1098
-  c[550] = 1100
-  c[551] = 1102
-  c[552] = 1104
-  c[553] = 1106
-  c[554] = 1108
-  c[555] = 1110
-  c[556] = 1112
-  c[557] = 1114
-  c[558] = 1116
-  c[559] = 1118
-  c[560] = 1120
-  c[561] = 1122
-  c[562] = 1124
-  c[563] = 1126
-  c[564] = 1128
-  c[565] = 1130
-  c[566] = 1132
-  c[567] = 1134
-  c[568] = 1136
-  c[569] = 1138
-  c[570] = 1140
-  c[571] = 1142
-  c[572] = 1144
-  c[573] = 1146
-  c[574] = 1148
-  c[575] = 1150
-  c[576] = 1152
-  c[577] = 1154
-  c[578] = 1156
-  c[579] = 1158
-  c[580] = 1160
-  c[581] = 1162
-  c[582] = 1164
-  c[583] = 1166
-  c[584] = 1168
-  c[585] = 1170
-  c[586] = 1172
-  c[587] = 1174
-  c[588] = 1176
-  c[589] = 1178
-  c[590] = 1180
-  c[591] = 1182
-  c[592] = 1184
-  c[593] = 1186
-  c[594] = 1188
-  c[595] = 1190
-  c[596] = 1192
-  c[597] = 1194
-  c[598] = 1196
-  c[599] = 1198
-  c[600] = 1200
-  c[601] = 1202
-  c[602] = 1204
-  c[603] = 1206
-  c[604] = 1208
-  c[605] = 1210
-  c[606] = 1212
-  c[607] = 1214
-  c[608] = 1216
-  c[609] = 1218
-  c[610] = 1220
-  c[611] = 1222
-  c[612] = 1224
-  c[613] = 1226
-  c[614] = 1228
-  c[615] = 1230
-  c[616] = 1232
-  c[617] = 1234
-  c[618] = 1236
-  c[619] = 1238
-  c[620] = 1240
-  c[621] = 1242
-  c[622] = 1244
-  c[623] = 1246
-  c[624] = 1248
-  c[625] = 1250
-  c[626] = 1252
-  c[627] = 1254
-  c[628] = 1256
-  c[629] = 1258
-  c[630] = 1260
-  c[631] = 1262
-  c[632] = 1264
-  c[633] = 1266
-  c[634] = 1268
-  c[635] = 1270
-  c[636] = 1272
-  c[637] = 1274
-  c[638] = 1276
-  c[639] = 1278
-  c[640] = 1280
-  c[641] = 1282
-  c[642] = 1284
-  c[643] = 1286
-  c[644] = 1288
-  c[645] = 1290
-  c[646] = 1292
-  c[647] = 1294
-  c[648] = 1296
-  c[649] = 1298
-  c[650] = 1300
-  c[651] = 1302
-  c[652] = 1304
-  c[653] = 1306
-  c[654] = 1308
-  c[655] = 1310
-  c[656] = 1312
-  c[657] = 1314
-  c[658] = 1316
-  c[659] = 1318
-  c[660] = 1320
-  c[661] = 1322
-  c[662] = 1324
-  c[663] = 1326
-  c[664] = 1328
-  c[665] = 1330
-  c[666] = 1332
-  c[667] = 1334
-  c[668] = 1336
-  c[669] = 1338
-  c[670] = 1340
-  c[671] = 1342
-  c[672] = 1344
-  c[673] = 1346
-  c[674] = 1348
-  c[675] = 1350
-  c[676] = 1352
-  c[677] = 1354
-  c[678] = 1356
-  c[679] = 1358
-  c[680] = 1360
-  c[681] = 1362
-  c[682] = 1364
-  c[683] = 1366
-  c[684] = 1368
-  c[685] = 1370
-  c[686] = 1372
-  c[687] = 1374
-  c[688] = 1376
-  c[689] = 1378
-  c[690] = 1380
-  c[691] = 1382
-  c[692] = 1384
-  c[693] = 1386
-  c[694] = 1388
-  c[695] = 1390
-  c[696] = 1392
-  c[697] = 1394
-  c[698] = 1396
-  c[699] = 1398
-  c[700] = 1400
-  c[701] = 1402
-  c[702] = 1404
-  c[703] = 1406
-  c[704] = 1408
-  c[705] = 1410
-  c[706] = 1412
-  c[707] = 1414
-  c[708] = 1416
-  c[709] = 1418
-  c[710] = 1420
-  c[711] = 1422
-  c[712] = 1424
-  c[713] = 1426
-  c[714] = 1428
-  c[715] = 1430
-  c[716] = 1432
-  c[717] = 1434
-  c[718] = 1436
-  c[719] = 1438
-  c[720] = 1440
-  c[721] = 1442
-  c[722] = 1444
-  c[723] = 1446
-  c[724] = 1448
-  c[725] = 1450
-  c[726] = 1452
-  c[727] = 1454
-  c[728] = 1456
-  c[729] = 1458
-  c[730] = 1460
-  c[731] = 1462
-  c[732] = 1464
-  c[733] = 1466
-  c[734] = 1468
-  c[735] = 1470
-  c[736] = 1472
-  c[737] = 1474
-  c[738] = 1476
-  c[739] = 1478
-  c[740] = 1480
-  c[741] = 1482
-  c[742] = 1484
-  c[743] = 1486
-  c[744] = 1488
-  c[745] = 1490
-  c[746] = 1492
-  c[747] = 1494
-  c[748] = 1496
-  c[749] = 1498
-  c[750] = 1500
-  c[751] = 1502
-  c[752] = 1504
-  c[753] = 1506
-  c[754] = 1508
-  c[755] = 1510
-  c[756] = 1512
-  c[757] = 1514
-  c[758] = 1516
-  c[759] = 1518
-  c[760] = 1520
-  c[761] = 1522
-  c[762] = 1524
-  c[763] = 1526
-  c[764] = 1528
-  c[765] = 1530
-  c[766] = 1532
-  c[767] = 1534
-  c[768] = 1536
-  c[769] = 1538
-  c[770] = 1540
-  c[771] = 1542
-  c[772] = 1544
-  c[773] = 1546
-  c[774] = 1548
-  c[775] = 1550
-  c[776] = 1552
-  c[777] = 1554
-  c[778] = 1556
-  c[779] = 1558
-  c[780] = 1560
-  c[781] = 1562
-  c[782] = 1564
-  c[783] = 1566
-  c[784] = 1568
-  c[785] = 1570
-  c[786] = 1572
-  c[787] = 1574
-  c[788] = 1576
-  c[789] = 1578
-  c[790] = 1580
-  c[791] = 1582
-  c[792] = 1584
-  c[793] = 1586
-  c[794] = 1588
-  c[795] = 1590
-  c[796] = 1592
-  c[797] = 1594
-  c[798] = 1596
-  c[799] = 1598
-  c[800] = 1600
-  c[801] = 1602
-  c[802] = 1604
-  c[803] = 1606
-  c[804] = 1608
-  c[805] = 1610
-  c[806] = 1612
-  c[807] = 1614
-  c[808] = 1616
-  c[809] = 1618
-  c[810] = 1620
-  c[811] = 1622
-  c[812] = 1624
-  c[813] = 1626
-  c[814] = 1628
-  c[815] = 1630
-  c[816] = 1632
-  c[817] = 1634
-  c[818] = 1636
-  c[819] = 1638
-  c[820] = 1640
-  c[821] = 1642
-  c[822] = 1644
-  c[823] = 1646
-  c[824] = 1648
-  c[825] = 1650
-  c[826] = 1652
-  c[827] = 1654
-  c[828] = 1656
-  c[829] = 1658
-  c[830] = 1660
-  c[831] = 1662
-  c[832] = 1664
-  c[833] = 1666
-  c[834] = 1668
-  c[835] = 1670
-  c[836] = 1672
-  c[837] = 1674
-  c[838] = 1676
-  c[839] = 1678
-  c[840] = 1680
-  c[841] = 1682
-  c[842] = 1684
-  c[843] = 1686
-  c[844] = 1688
-  c[845] = 1690
-  c[846] = 1692
-  c[847] = 1694
-  c[848] = 1696
-  c[849] = 1698
-  c[850] = 1700
-  c[851] = 1702
-  c[852] = 1704
-  c[853] = 1706
-  c[854] = 1708
-  c[855] = 1710
-  c[856] = 1712
-  c[857] = 1714
-  c[858] = 1716
-  c[859] = 1718
-  c[860] = 1720
-  c[861] = 1722
-  c[862] = 1724
-  c[863] = 1726
-  c[864] = 1728
-  c[865] = 1730
-  c[866] = 1732
-  c[867] = 1734
-  c[868] = 1736
-  c[869] = 1738
-  c[870] = 1740
-  c[871] = 1742
-  c[872] = 1744
-  c[873] = 1746
-  c[874] = 1748
-  c[875] = 1750
-  c[876] = 1752
-  c[877] = 1754
-  c[878] = 1756
-  c[879] = 1758
-  c[880] = 1760
-  c[881] = 1762
-  c[882] = 1764
-  c[883] = 1766
-  c[884] = 1768
-  c[885] = 1770
-  c[886] = 1772
-  c[887] = 1774
-  c[888] = 1776
-  c[889] = 1778
-  c[890] = 1780
-  c[891] = 1782
-  c[892] = 1784
-  c[893] = 1786
-  c[894] = 1788
-  c[895] = 1790
-  c[896] = 1792
-  c[897] = 1794
-  c[898] = 1796
-  c[899] = 1798
-  c[900] = 1800
-  c[901] = 1802
-  c[902] = 1804
-  c[903] = 1806
-  c[904] = 1808
-  c[905] = 1810
-  c[906] = 1812
-  c[907] = 1814
-  c[908] = 1816
-  c[909] = 1818
-  c[910] = 1820
-  c[911] = 1822
-  c[912] = 1824
-  c[913] = 1826
-  c[914] = 1828
-  c[915] = 1830
-  c[916] = 1832
-  c[917] = 1834
-  c[918] = 1836
-  c[919] = 1838
-  c[920] = 1840
-  c[921] = 1842
-  c[922] = 1844
-  c[923] = 1846
-  c[924] = 1848
-  c[925] = 1850
-  c[926] = 1852
-  c[927] = 1854
-  c[928] = 1856
-  c[929] = 1858
-  c[930] = 1860
-  c[931] = 1862
-  c[932] = 1864
-  c[933] = 1866
-  c[934] = 1868
-  c[935] = 1870
-  c[936] = 1872
-  c[937] = 1874
-  c[938] = 1876
-  c[939] = 1878
-  c[940] = 1880
-  c[941] = 1882
-  c[942] = 1884
-  c[943] = 1886
-  c[944] = 1888
-  c[945] = 1890
-  c[946] = 1892
-  c[947] = 1894
-  c[948] = 1896
-  c[949] = 1898
-  c[950] = 1900
-  c[951] = 1902
-  c[952] = 1904
-  c[953] = 1906
-  c[954] = 1908
-  c[955] = 1910
-  c[956] = 1912
-  c[957] = 1914
-  c[958] = 1916
-  c[959] = 1918
-  c[960] = 1920
-  c[961] = 1922
-  c[962] = 1924
-  c[963] = 1926
-  c[964] = 1928
-  c[965] = 1930
-  c[966] = 1932
-  c[967] = 1934
-  c[968] = 1936
-  c[969] = 1938
-  c[970] = 1940
-  c[971] = 1942
-  c[972] = 1944
-  c[973] = 1946
-  c[974] = 1948
-  c[975] = 1950
-  c[976] = 1952
-  c[977] = 1954
-  c[978] = 1956
-  c[979] = 1958
-  c[980] = 1960
-  c[981] = 1962
-  c[982] = 1964
-  c[983] = 1966
-  c[984] = 1968
-  c[985] = 1970
-  c[986] = 1972
-  c[987] = 1974
-  c[988] = 1976
-  c[989] = 1978
-  c[990] = 1980
-  c[991] = 1982
-  c[992] = 1984
-  c[993] = 1986
-  c[994] = 1988
-  c[995] = 1990
-  c[996] = 1992
-  c[997] = 1994
-  c[998] = 1996
-  c[999] = 1998
-  c[1000] = 2000
-  c[1001] = 2002
-  c[1002] = 2004
-  c[1003] = 2006
-  c[1004] = 2008
-  c[1005] = 2010
-  c[1006] = 2012
-  c[1007] = 2014
-  c[1008] = 2016
-  c[1009] = 2018
-  c[1010] = 2020
-  c[1011] = 2022
-  c[1012] = 2024
-  c[1013] = 2026
-  c[1014] = 2028
-  c[1015] = 2030
-  c[1016] = 2032
-  c[1017] = 2034
-  c[1018] = 2036
-  c[1019] = 2038
-  c[1020] = 2040
-  c[1021] = 2042
-  c[1022] = 2044
-  c[1023] = 2046
-
+EXACT Argument 'c': 4096 bytes
+EXACT   c[0] = 0
+EXACT   c[1] = 2
+EXACT   c[2] = 4
+EXACT   c[3] = 6
+EXACT   c[4] = 8
+EXACT   c[5] = 10
+EXACT   c[6] = 12
+EXACT   c[7] = 14
+EXACT   c[8] = 16
+EXACT   c[9] = 18
+EXACT   c[10] = 20
+EXACT   c[11] = 22
+EXACT   c[12] = 24
+EXACT   c[13] = 26
+EXACT   c[14] = 28
+EXACT   c[15] = 30
+EXACT   c[16] = 32
+EXACT   c[17] = 34
+EXACT   c[18] = 36
+EXACT   c[19] = 38
+EXACT   c[20] = 40
+EXACT   c[21] = 42
+EXACT   c[22] = 44
+EXACT   c[23] = 46
+EXACT   c[24] = 48
+EXACT   c[25] = 50
+EXACT   c[26] = 52
+EXACT   c[27] = 54
+EXACT   c[28] = 56
+EXACT   c[29] = 58
+EXACT   c[30] = 60
+EXACT   c[31] = 62
+EXACT   c[32] = 64
+EXACT   c[33] = 66
+EXACT   c[34] = 68
+EXACT   c[35] = 70
+EXACT   c[36] = 72
+EXACT   c[37] = 74
+EXACT   c[38] = 76
+EXACT   c[39] = 78
+EXACT   c[40] = 80
+EXACT   c[41] = 82
+EXACT   c[42] = 84
+EXACT   c[43] = 86
+EXACT   c[44] = 88
+EXACT   c[45] = 90
+EXACT   c[46] = 92
+EXACT   c[47] = 94
+EXACT   c[48] = 96
+EXACT   c[49] = 98
+EXACT   c[50] = 100
+EXACT   c[51] = 102
+EXACT   c[52] = 104
+EXACT   c[53] = 106
+EXACT   c[54] = 108
+EXACT   c[55] = 110
+EXACT   c[56] = 112
+EXACT   c[57] = 114
+EXACT   c[58] = 116
+EXACT   c[59] = 118
+EXACT   c[60] = 120
+EXACT   c[61] = 122
+EXACT   c[62] = 124
+EXACT   c[63] = 126
+EXACT   c[64] = 128
+EXACT   c[65] = 130
+EXACT   c[66] = 132
+EXACT   c[67] = 134
+EXACT   c[68] = 136
+EXACT   c[69] = 138
+EXACT   c[70] = 140
+EXACT   c[71] = 142
+EXACT   c[72] = 144
+EXACT   c[73] = 146
+EXACT   c[74] = 148
+EXACT   c[75] = 150
+EXACT   c[76] = 152
+EXACT   c[77] = 154
+EXACT   c[78] = 156
+EXACT   c[79] = 158
+EXACT   c[80] = 160
+EXACT   c[81] = 162
+EXACT   c[82] = 164
+EXACT   c[83] = 166
+EXACT   c[84] = 168
+EXACT   c[85] = 170
+EXACT   c[86] = 172
+EXACT   c[87] = 174
+EXACT   c[88] = 176
+EXACT   c[89] = 178
+EXACT   c[90] = 180
+EXACT   c[91] = 182
+EXACT   c[92] = 184
+EXACT   c[93] = 186
+EXACT   c[94] = 188
+EXACT   c[95] = 190
+EXACT   c[96] = 192
+EXACT   c[97] = 194
+EXACT   c[98] = 196
+EXACT   c[99] = 198
+EXACT   c[100] = 200
+EXACT   c[101] = 202
+EXACT   c[102] = 204
+EXACT   c[103] = 206
+EXACT   c[104] = 208
+EXACT   c[105] = 210
+EXACT   c[106] = 212
+EXACT   c[107] = 214
+EXACT   c[108] = 216
+EXACT   c[109] = 218
+EXACT   c[110] = 220
+EXACT   c[111] = 222
+EXACT   c[112] = 224
+EXACT   c[113] = 226
+EXACT   c[114] = 228
+EXACT   c[115] = 230
+EXACT   c[116] = 232
+EXACT   c[117] = 234
+EXACT   c[118] = 236
+EXACT   c[119] = 238
+EXACT   c[120] = 240
+EXACT   c[121] = 242
+EXACT   c[122] = 244
+EXACT   c[123] = 246
+EXACT   c[124] = 248
+EXACT   c[125] = 250
+EXACT   c[126] = 252
+EXACT   c[127] = 254
+EXACT   c[128] = 256
+EXACT   c[129] = 258
+EXACT   c[130] = 260
+EXACT   c[131] = 262
+EXACT   c[132] = 264
+EXACT   c[133] = 266
+EXACT   c[134] = 268
+EXACT   c[135] = 270
+EXACT   c[136] = 272
+EXACT   c[137] = 274
+EXACT   c[138] = 276
+EXACT   c[139] = 278
+EXACT   c[140] = 280
+EXACT   c[141] = 282
+EXACT   c[142] = 284
+EXACT   c[143] = 286
+EXACT   c[144] = 288
+EXACT   c[145] = 290
+EXACT   c[146] = 292
+EXACT   c[147] = 294
+EXACT   c[148] = 296
+EXACT   c[149] = 298
+EXACT   c[150] = 300
+EXACT   c[151] = 302
+EXACT   c[152] = 304
+EXACT   c[153] = 306
+EXACT   c[154] = 308
+EXACT   c[155] = 310
+EXACT   c[156] = 312
+EXACT   c[157] = 314
+EXACT   c[158] = 316
+EXACT   c[159] = 318
+EXACT   c[160] = 320
+EXACT   c[161] = 322
+EXACT   c[162] = 324
+EXACT   c[163] = 326
+EXACT   c[164] = 328
+EXACT   c[165] = 330
+EXACT   c[166] = 332
+EXACT   c[167] = 334
+EXACT   c[168] = 336
+EXACT   c[169] = 338
+EXACT   c[170] = 340
+EXACT   c[171] = 342
+EXACT   c[172] = 344
+EXACT   c[173] = 346
+EXACT   c[174] = 348
+EXACT   c[175] = 350
+EXACT   c[176] = 352
+EXACT   c[177] = 354
+EXACT   c[178] = 356
+EXACT   c[179] = 358
+EXACT   c[180] = 360
+EXACT   c[181] = 362
+EXACT   c[182] = 364
+EXACT   c[183] = 366
+EXACT   c[184] = 368
+EXACT   c[185] = 370
+EXACT   c[186] = 372
+EXACT   c[187] = 374
+EXACT   c[188] = 376
+EXACT   c[189] = 378
+EXACT   c[190] = 380
+EXACT   c[191] = 382
+EXACT   c[192] = 384
+EXACT   c[193] = 386
+EXACT   c[194] = 388
+EXACT   c[195] = 390
+EXACT   c[196] = 392
+EXACT   c[197] = 394
+EXACT   c[198] = 396
+EXACT   c[199] = 398
+EXACT   c[200] = 400
+EXACT   c[201] = 402
+EXACT   c[202] = 404
+EXACT   c[203] = 406
+EXACT   c[204] = 408
+EXACT   c[205] = 410
+EXACT   c[206] = 412
+EXACT   c[207] = 414
+EXACT   c[208] = 416
+EXACT   c[209] = 418
+EXACT   c[210] = 420
+EXACT   c[211] = 422
+EXACT   c[212] = 424
+EXACT   c[213] = 426
+EXACT   c[214] = 428
+EXACT   c[215] = 430
+EXACT   c[216] = 432
+EXACT   c[217] = 434
+EXACT   c[218] = 436
+EXACT   c[219] = 438
+EXACT   c[220] = 440
+EXACT   c[221] = 442
+EXACT   c[222] = 444
+EXACT   c[223] = 446
+EXACT   c[224] = 448
+EXACT   c[225] = 450
+EXACT   c[226] = 452
+EXACT   c[227] = 454
+EXACT   c[228] = 456
+EXACT   c[229] = 458
+EXACT   c[230] = 460
+EXACT   c[231] = 462
+EXACT   c[232] = 464
+EXACT   c[233] = 466
+EXACT   c[234] = 468
+EXACT   c[235] = 470
+EXACT   c[236] = 472
+EXACT   c[237] = 474
+EXACT   c[238] = 476
+EXACT   c[239] = 478
+EXACT   c[240] = 480
+EXACT   c[241] = 482
+EXACT   c[242] = 484
+EXACT   c[243] = 486
+EXACT   c[244] = 488
+EXACT   c[245] = 490
+EXACT   c[246] = 492
+EXACT   c[247] = 494
+EXACT   c[248] = 496
+EXACT   c[249] = 498
+EXACT   c[250] = 500
+EXACT   c[251] = 502
+EXACT   c[252] = 504
+EXACT   c[253] = 506
+EXACT   c[254] = 508
+EXACT   c[255] = 510
+EXACT   c[256] = 512
+EXACT   c[257] = 514
+EXACT   c[258] = 516
+EXACT   c[259] = 518
+EXACT   c[260] = 520
+EXACT   c[261] = 522
+EXACT   c[262] = 524
+EXACT   c[263] = 526
+EXACT   c[264] = 528
+EXACT   c[265] = 530
+EXACT   c[266] = 532
+EXACT   c[267] = 534
+EXACT   c[268] = 536
+EXACT   c[269] = 538
+EXACT   c[270] = 540
+EXACT   c[271] = 542
+EXACT   c[272] = 544
+EXACT   c[273] = 546
+EXACT   c[274] = 548
+EXACT   c[275] = 550
+EXACT   c[276] = 552
+EXACT   c[277] = 554
+EXACT   c[278] = 556
+EXACT   c[279] = 558
+EXACT   c[280] = 560
+EXACT   c[281] = 562
+EXACT   c[282] = 564
+EXACT   c[283] = 566
+EXACT   c[284] = 568
+EXACT   c[285] = 570
+EXACT   c[286] = 572
+EXACT   c[287] = 574
+EXACT   c[288] = 576
+EXACT   c[289] = 578
+EXACT   c[290] = 580
+EXACT   c[291] = 582
+EXACT   c[292] = 584
+EXACT   c[293] = 586
+EXACT   c[294] = 588
+EXACT   c[295] = 590
+EXACT   c[296] = 592
+EXACT   c[297] = 594
+EXACT   c[298] = 596
+EXACT   c[299] = 598
+EXACT   c[300] = 600
+EXACT   c[301] = 602
+EXACT   c[302] = 604
+EXACT   c[303] = 606
+EXACT   c[304] = 608
+EXACT   c[305] = 610
+EXACT   c[306] = 612
+EXACT   c[307] = 614
+EXACT   c[308] = 616
+EXACT   c[309] = 618
+EXACT   c[310] = 620
+EXACT   c[311] = 622
+EXACT   c[312] = 624
+EXACT   c[313] = 626
+EXACT   c[314] = 628
+EXACT   c[315] = 630
+EXACT   c[316] = 632
+EXACT   c[317] = 634
+EXACT   c[318] = 636
+EXACT   c[319] = 638
+EXACT   c[320] = 640
+EXACT   c[321] = 642
+EXACT   c[322] = 644
+EXACT   c[323] = 646
+EXACT   c[324] = 648
+EXACT   c[325] = 650
+EXACT   c[326] = 652
+EXACT   c[327] = 654
+EXACT   c[328] = 656
+EXACT   c[329] = 658
+EXACT   c[330] = 660
+EXACT   c[331] = 662
+EXACT   c[332] = 664
+EXACT   c[333] = 666
+EXACT   c[334] = 668
+EXACT   c[335] = 670
+EXACT   c[336] = 672
+EXACT   c[337] = 674
+EXACT   c[338] = 676
+EXACT   c[339] = 678
+EXACT   c[340] = 680
+EXACT   c[341] = 682
+EXACT   c[342] = 684
+EXACT   c[343] = 686
+EXACT   c[344] = 688
+EXACT   c[345] = 690
+EXACT   c[346] = 692
+EXACT   c[347] = 694
+EXACT   c[348] = 696
+EXACT   c[349] = 698
+EXACT   c[350] = 700
+EXACT   c[351] = 702
+EXACT   c[352] = 704
+EXACT   c[353] = 706
+EXACT   c[354] = 708
+EXACT   c[355] = 710
+EXACT   c[356] = 712
+EXACT   c[357] = 714
+EXACT   c[358] = 716
+EXACT   c[359] = 718
+EXACT   c[360] = 720
+EXACT   c[361] = 722
+EXACT   c[362] = 724
+EXACT   c[363] = 726
+EXACT   c[364] = 728
+EXACT   c[365] = 730
+EXACT   c[366] = 732
+EXACT   c[367] = 734
+EXACT   c[368] = 736
+EXACT   c[369] = 738
+EXACT   c[370] = 740
+EXACT   c[371] = 742
+EXACT   c[372] = 744
+EXACT   c[373] = 746
+EXACT   c[374] = 748
+EXACT   c[375] = 750
+EXACT   c[376] = 752
+EXACT   c[377] = 754
+EXACT   c[378] = 756
+EXACT   c[379] = 758
+EXACT   c[380] = 760
+EXACT   c[381] = 762
+EXACT   c[382] = 764
+EXACT   c[383] = 766
+EXACT   c[384] = 768
+EXACT   c[385] = 770
+EXACT   c[386] = 772
+EXACT   c[387] = 774
+EXACT   c[388] = 776
+EXACT   c[389] = 778
+EXACT   c[390] = 780
+EXACT   c[391] = 782
+EXACT   c[392] = 784
+EXACT   c[393] = 786
+EXACT   c[394] = 788
+EXACT   c[395] = 790
+EXACT   c[396] = 792
+EXACT   c[397] = 794
+EXACT   c[398] = 796
+EXACT   c[399] = 798
+EXACT   c[400] = 800
+EXACT   c[401] = 802
+EXACT   c[402] = 804
+EXACT   c[403] = 806
+EXACT   c[404] = 808
+EXACT   c[405] = 810
+EXACT   c[406] = 812
+EXACT   c[407] = 814
+EXACT   c[408] = 816
+EXACT   c[409] = 818
+EXACT   c[410] = 820
+EXACT   c[411] = 822
+EXACT   c[412] = 824
+EXACT   c[413] = 826
+EXACT   c[414] = 828
+EXACT   c[415] = 830
+EXACT   c[416] = 832
+EXACT   c[417] = 834
+EXACT   c[418] = 836
+EXACT   c[419] = 838
+EXACT   c[420] = 840
+EXACT   c[421] = 842
+EXACT   c[422] = 844
+EXACT   c[423] = 846
+EXACT   c[424] = 848
+EXACT   c[425] = 850
+EXACT   c[426] = 852
+EXACT   c[427] = 854
+EXACT   c[428] = 856
+EXACT   c[429] = 858
+EXACT   c[430] = 860
+EXACT   c[431] = 862
+EXACT   c[432] = 864
+EXACT   c[433] = 866
+EXACT   c[434] = 868
+EXACT   c[435] = 870
+EXACT   c[436] = 872
+EXACT   c[437] = 874
+EXACT   c[438] = 876
+EXACT   c[439] = 878
+EXACT   c[440] = 880
+EXACT   c[441] = 882
+EXACT   c[442] = 884
+EXACT   c[443] = 886
+EXACT   c[444] = 888
+EXACT   c[445] = 890
+EXACT   c[446] = 892
+EXACT   c[447] = 894
+EXACT   c[448] = 896
+EXACT   c[449] = 898
+EXACT   c[450] = 900
+EXACT   c[451] = 902
+EXACT   c[452] = 904
+EXACT   c[453] = 906
+EXACT   c[454] = 908
+EXACT   c[455] = 910
+EXACT   c[456] = 912
+EXACT   c[457] = 914
+EXACT   c[458] = 916
+EXACT   c[459] = 918
+EXACT   c[460] = 920
+EXACT   c[461] = 922
+EXACT   c[462] = 924
+EXACT   c[463] = 926
+EXACT   c[464] = 928
+EXACT   c[465] = 930
+EXACT   c[466] = 932
+EXACT   c[467] = 934
+EXACT   c[468] = 936
+EXACT   c[469] = 938
+EXACT   c[470] = 940
+EXACT   c[471] = 942
+EXACT   c[472] = 944
+EXACT   c[473] = 946
+EXACT   c[474] = 948
+EXACT   c[475] = 950
+EXACT   c[476] = 952
+EXACT   c[477] = 954
+EXACT   c[478] = 956
+EXACT   c[479] = 958
+EXACT   c[480] = 960
+EXACT   c[481] = 962
+EXACT   c[482] = 964
+EXACT   c[483] = 966
+EXACT   c[484] = 968
+EXACT   c[485] = 970
+EXACT   c[486] = 972
+EXACT   c[487] = 974
+EXACT   c[488] = 976
+EXACT   c[489] = 978
+EXACT   c[490] = 980
+EXACT   c[491] = 982
+EXACT   c[492] = 984
+EXACT   c[493] = 986
+EXACT   c[494] = 988
+EXACT   c[495] = 990
+EXACT   c[496] = 992
+EXACT   c[497] = 994
+EXACT   c[498] = 996
+EXACT   c[499] = 998
+EXACT   c[500] = 1000
+EXACT   c[501] = 1002
+EXACT   c[502] = 1004
+EXACT   c[503] = 1006
+EXACT   c[504] = 1008
+EXACT   c[505] = 1010
+EXACT   c[506] = 1012
+EXACT   c[507] = 1014
+EXACT   c[508] = 1016
+EXACT   c[509] = 1018
+EXACT   c[510] = 1020
+EXACT   c[511] = 1022
+EXACT   c[512] = 1024
+EXACT   c[513] = 1026
+EXACT   c[514] = 1028
+EXACT   c[515] = 1030
+EXACT   c[516] = 1032
+EXACT   c[517] = 1034
+EXACT   c[518] = 1036
+EXACT   c[519] = 1038
+EXACT   c[520] = 1040
+EXACT   c[521] = 1042
+EXACT   c[522] = 1044
+EXACT   c[523] = 1046
+EXACT   c[524] = 1048
+EXACT   c[525] = 1050
+EXACT   c[526] = 1052
+EXACT   c[527] = 1054
+EXACT   c[528] = 1056
+EXACT   c[529] = 1058
+EXACT   c[530] = 1060
+EXACT   c[531] = 1062
+EXACT   c[532] = 1064
+EXACT   c[533] = 1066
+EXACT   c[534] = 1068
+EXACT   c[535] = 1070
+EXACT   c[536] = 1072
+EXACT   c[537] = 1074
+EXACT   c[538] = 1076
+EXACT   c[539] = 1078
+EXACT   c[540] = 1080
+EXACT   c[541] = 1082
+EXACT   c[542] = 1084
+EXACT   c[543] = 1086
+EXACT   c[544] = 1088
+EXACT   c[545] = 1090
+EXACT   c[546] = 1092
+EXACT   c[547] = 1094
+EXACT   c[548] = 1096
+EXACT   c[549] = 1098
+EXACT   c[550] = 1100
+EXACT   c[551] = 1102
+EXACT   c[552] = 1104
+EXACT   c[553] = 1106
+EXACT   c[554] = 1108
+EXACT   c[555] = 1110
+EXACT   c[556] = 1112
+EXACT   c[557] = 1114
+EXACT   c[558] = 1116
+EXACT   c[559] = 1118
+EXACT   c[560] = 1120
+EXACT   c[561] = 1122
+EXACT   c[562] = 1124
+EXACT   c[563] = 1126
+EXACT   c[564] = 1128
+EXACT   c[565] = 1130
+EXACT   c[566] = 1132
+EXACT   c[567] = 1134
+EXACT   c[568] = 1136
+EXACT   c[569] = 1138
+EXACT   c[570] = 1140
+EXACT   c[571] = 1142
+EXACT   c[572] = 1144
+EXACT   c[573] = 1146
+EXACT   c[574] = 1148
+EXACT   c[575] = 1150
+EXACT   c[576] = 1152
+EXACT   c[577] = 1154
+EXACT   c[578] = 1156
+EXACT   c[579] = 1158
+EXACT   c[580] = 1160
+EXACT   c[581] = 1162
+EXACT   c[582] = 1164
+EXACT   c[583] = 1166
+EXACT   c[584] = 1168
+EXACT   c[585] = 1170
+EXACT   c[586] = 1172
+EXACT   c[587] = 1174
+EXACT   c[588] = 1176
+EXACT   c[589] = 1178
+EXACT   c[590] = 1180
+EXACT   c[591] = 1182
+EXACT   c[592] = 1184
+EXACT   c[593] = 1186
+EXACT   c[594] = 1188
+EXACT   c[595] = 1190
+EXACT   c[596] = 1192
+EXACT   c[597] = 1194
+EXACT   c[598] = 1196
+EXACT   c[599] = 1198
+EXACT   c[600] = 1200
+EXACT   c[601] = 1202
+EXACT   c[602] = 1204
+EXACT   c[603] = 1206
+EXACT   c[604] = 1208
+EXACT   c[605] = 1210
+EXACT   c[606] = 1212
+EXACT   c[607] = 1214
+EXACT   c[608] = 1216
+EXACT   c[609] = 1218
+EXACT   c[610] = 1220
+EXACT   c[611] = 1222
+EXACT   c[612] = 1224
+EXACT   c[613] = 1226
+EXACT   c[614] = 1228
+EXACT   c[615] = 1230
+EXACT   c[616] = 1232
+EXACT   c[617] = 1234
+EXACT   c[618] = 1236
+EXACT   c[619] = 1238
+EXACT   c[620] = 1240
+EXACT   c[621] = 1242
+EXACT   c[622] = 1244
+EXACT   c[623] = 1246
+EXACT   c[624] = 1248
+EXACT   c[625] = 1250
+EXACT   c[626] = 1252
+EXACT   c[627] = 1254
+EXACT   c[628] = 1256
+EXACT   c[629] = 1258
+EXACT   c[630] = 1260
+EXACT   c[631] = 1262
+EXACT   c[632] = 1264
+EXACT   c[633] = 1266
+EXACT   c[634] = 1268
+EXACT   c[635] = 1270
+EXACT   c[636] = 1272
+EXACT   c[637] = 1274
+EXACT   c[638] = 1276
+EXACT   c[639] = 1278
+EXACT   c[640] = 1280
+EXACT   c[641] = 1282
+EXACT   c[642] = 1284
+EXACT   c[643] = 1286
+EXACT   c[644] = 1288
+EXACT   c[645] = 1290
+EXACT   c[646] = 1292
+EXACT   c[647] = 1294
+EXACT   c[648] = 1296
+EXACT   c[649] = 1298
+EXACT   c[650] = 1300
+EXACT   c[651] = 1302
+EXACT   c[652] = 1304
+EXACT   c[653] = 1306
+EXACT   c[654] = 1308
+EXACT   c[655] = 1310
+EXACT   c[656] = 1312
+EXACT   c[657] = 1314
+EXACT   c[658] = 1316
+EXACT   c[659] = 1318
+EXACT   c[660] = 1320
+EXACT   c[661] = 1322
+EXACT   c[662] = 1324
+EXACT   c[663] = 1326
+EXACT   c[664] = 1328
+EXACT   c[665] = 1330
+EXACT   c[666] = 1332
+EXACT   c[667] = 1334
+EXACT   c[668] = 1336
+EXACT   c[669] = 1338
+EXACT   c[670] = 1340
+EXACT   c[671] = 1342
+EXACT   c[672] = 1344
+EXACT   c[673] = 1346
+EXACT   c[674] = 1348
+EXACT   c[675] = 1350
+EXACT   c[676] = 1352
+EXACT   c[677] = 1354
+EXACT   c[678] = 1356
+EXACT   c[679] = 1358
+EXACT   c[680] = 1360
+EXACT   c[681] = 1362
+EXACT   c[682] = 1364
+EXACT   c[683] = 1366
+EXACT   c[684] = 1368
+EXACT   c[685] = 1370
+EXACT   c[686] = 1372
+EXACT   c[687] = 1374
+EXACT   c[688] = 1376
+EXACT   c[689] = 1378
+EXACT   c[690] = 1380
+EXACT   c[691] = 1382
+EXACT   c[692] = 1384
+EXACT   c[693] = 1386
+EXACT   c[694] = 1388
+EXACT   c[695] = 1390
+EXACT   c[696] = 1392
+EXACT   c[697] = 1394
+EXACT   c[698] = 1396
+EXACT   c[699] = 1398
+EXACT   c[700] = 1400
+EXACT   c[701] = 1402
+EXACT   c[702] = 1404
+EXACT   c[703] = 1406
+EXACT   c[704] = 1408
+EXACT   c[705] = 1410
+EXACT   c[706] = 1412
+EXACT   c[707] = 1414
+EXACT   c[708] = 1416
+EXACT   c[709] = 1418
+EXACT   c[710] = 1420
+EXACT   c[711] = 1422
+EXACT   c[712] = 1424
+EXACT   c[713] = 1426
+EXACT   c[714] = 1428
+EXACT   c[715] = 1430
+EXACT   c[716] = 1432
+EXACT   c[717] = 1434
+EXACT   c[718] = 1436
+EXACT   c[719] = 1438
+EXACT   c[720] = 1440
+EXACT   c[721] = 1442
+EXACT   c[722] = 1444
+EXACT   c[723] = 1446
+EXACT   c[724] = 1448
+EXACT   c[725] = 1450
+EXACT   c[726] = 1452
+EXACT   c[727] = 1454
+EXACT   c[728] = 1456
+EXACT   c[729] = 1458
+EXACT   c[730] = 1460
+EXACT   c[731] = 1462
+EXACT   c[732] = 1464
+EXACT   c[733] = 1466
+EXACT   c[734] = 1468
+EXACT   c[735] = 1470
+EXACT   c[736] = 1472
+EXACT   c[737] = 1474
+EXACT   c[738] = 1476
+EXACT   c[739] = 1478
+EXACT   c[740] = 1480
+EXACT   c[741] = 1482
+EXACT   c[742] = 1484
+EXACT   c[743] = 1486
+EXACT   c[744] = 1488
+EXACT   c[745] = 1490
+EXACT   c[746] = 1492
+EXACT   c[747] = 1494
+EXACT   c[748] = 1496
+EXACT   c[749] = 1498
+EXACT   c[750] = 1500
+EXACT   c[751] = 1502
+EXACT   c[752] = 1504
+EXACT   c[753] = 1506
+EXACT   c[754] = 1508
+EXACT   c[755] = 1510
+EXACT   c[756] = 1512
+EXACT   c[757] = 1514
+EXACT   c[758] = 1516
+EXACT   c[759] = 1518
+EXACT   c[760] = 1520
+EXACT   c[761] = 1522
+EXACT   c[762] = 1524
+EXACT   c[763] = 1526
+EXACT   c[764] = 1528
+EXACT   c[765] = 1530
+EXACT   c[766] = 1532
+EXACT   c[767] = 1534
+EXACT   c[768] = 1536
+EXACT   c[769] = 1538
+EXACT   c[770] = 1540
+EXACT   c[771] = 1542
+EXACT   c[772] = 1544
+EXACT   c[773] = 1546
+EXACT   c[774] = 1548
+EXACT   c[775] = 1550
+EXACT   c[776] = 1552
+EXACT   c[777] = 1554
+EXACT   c[778] = 1556
+EXACT   c[779] = 1558
+EXACT   c[780] = 1560
+EXACT   c[781] = 1562
+EXACT   c[782] = 1564
+EXACT   c[783] = 1566
+EXACT   c[784] = 1568
+EXACT   c[785] = 1570
+EXACT   c[786] = 1572
+EXACT   c[787] = 1574
+EXACT   c[788] = 1576
+EXACT   c[789] = 1578
+EXACT   c[790] = 1580
+EXACT   c[791] = 1582
+EXACT   c[792] = 1584
+EXACT   c[793] = 1586
+EXACT   c[794] = 1588
+EXACT   c[795] = 1590
+EXACT   c[796] = 1592
+EXACT   c[797] = 1594
+EXACT   c[798] = 1596
+EXACT   c[799] = 1598
+EXACT   c[800] = 1600
+EXACT   c[801] = 1602
+EXACT   c[802] = 1604
+EXACT   c[803] = 1606
+EXACT   c[804] = 1608
+EXACT   c[805] = 1610
+EXACT   c[806] = 1612
+EXACT   c[807] = 1614
+EXACT   c[808] = 1616
+EXACT   c[809] = 1618
+EXACT   c[810] = 1620
+EXACT   c[811] = 1622
+EXACT   c[812] = 1624
+EXACT   c[813] = 1626
+EXACT   c[814] = 1628
+EXACT   c[815] = 1630
+EXACT   c[816] = 1632
+EXACT   c[817] = 1634
+EXACT   c[818] = 1636
+EXACT   c[819] = 1638
+EXACT   c[820] = 1640
+EXACT   c[821] = 1642
+EXACT   c[822] = 1644
+EXACT   c[823] = 1646
+EXACT   c[824] = 1648
+EXACT   c[825] = 1650
+EXACT   c[826] = 1652
+EXACT   c[827] = 1654
+EXACT   c[828] = 1656
+EXACT   c[829] = 1658
+EXACT   c[830] = 1660
+EXACT   c[831] = 1662
+EXACT   c[832] = 1664
+EXACT   c[833] = 1666
+EXACT   c[834] = 1668
+EXACT   c[835] = 1670
+EXACT   c[836] = 1672
+EXACT   c[837] = 1674
+EXACT   c[838] = 1676
+EXACT   c[839] = 1678
+EXACT   c[840] = 1680
+EXACT   c[841] = 1682
+EXACT   c[842] = 1684
+EXACT   c[843] = 1686
+EXACT   c[844] = 1688
+EXACT   c[845] = 1690
+EXACT   c[846] = 1692
+EXACT   c[847] = 1694
+EXACT   c[848] = 1696
+EXACT   c[849] = 1698
+EXACT   c[850] = 1700
+EXACT   c[851] = 1702
+EXACT   c[852] = 1704
+EXACT   c[853] = 1706
+EXACT   c[854] = 1708
+EXACT   c[855] = 1710
+EXACT   c[856] = 1712
+EXACT   c[857] = 1714
+EXACT   c[858] = 1716
+EXACT   c[859] = 1718
+EXACT   c[860] = 1720
+EXACT   c[861] = 1722
+EXACT   c[862] = 1724
+EXACT   c[863] = 1726
+EXACT   c[864] = 1728
+EXACT   c[865] = 1730
+EXACT   c[866] = 1732
+EXACT   c[867] = 1734
+EXACT   c[868] = 1736
+EXACT   c[869] = 1738
+EXACT   c[870] = 1740
+EXACT   c[871] = 1742
+EXACT   c[872] = 1744
+EXACT   c[873] = 1746
+EXACT   c[874] = 1748
+EXACT   c[875] = 1750
+EXACT   c[876] = 1752
+EXACT   c[877] = 1754
+EXACT   c[878] = 1756
+EXACT   c[879] = 1758
+EXACT   c[880] = 1760
+EXACT   c[881] = 1762
+EXACT   c[882] = 1764
+EXACT   c[883] = 1766
+EXACT   c[884] = 1768
+EXACT   c[885] = 1770
+EXACT   c[886] = 1772
+EXACT   c[887] = 1774
+EXACT   c[888] = 1776
+EXACT   c[889] = 1778
+EXACT   c[890] = 1780
+EXACT   c[891] = 1782
+EXACT   c[892] = 1784
+EXACT   c[893] = 1786
+EXACT   c[894] = 1788
+EXACT   c[895] = 1790
+EXACT   c[896] = 1792
+EXACT   c[897] = 1794
+EXACT   c[898] = 1796
+EXACT   c[899] = 1798
+EXACT   c[900] = 1800
+EXACT   c[901] = 1802
+EXACT   c[902] = 1804
+EXACT   c[903] = 1806
+EXACT   c[904] = 1808
+EXACT   c[905] = 1810
+EXACT   c[906] = 1812
+EXACT   c[907] = 1814
+EXACT   c[908] = 1816
+EXACT   c[909] = 1818
+EXACT   c[910] = 1820
+EXACT   c[911] = 1822
+EXACT   c[912] = 1824
+EXACT   c[913] = 1826
+EXACT   c[914] = 1828
+EXACT   c[915] = 1830
+EXACT   c[916] = 1832
+EXACT   c[917] = 1834
+EXACT   c[918] = 1836
+EXACT   c[919] = 1838
+EXACT   c[920] = 1840
+EXACT   c[921] = 1842
+EXACT   c[922] = 1844
+EXACT   c[923] = 1846
+EXACT   c[924] = 1848
+EXACT   c[925] = 1850
+EXACT   c[926] = 1852
+EXACT   c[927] = 1854
+EXACT   c[928] = 1856
+EXACT   c[929] = 1858
+EXACT   c[930] = 1860
+EXACT   c[931] = 1862
+EXACT   c[932] = 1864
+EXACT   c[933] = 1866
+EXACT   c[934] = 1868
+EXACT   c[935] = 1870
+EXACT   c[936] = 1872
+EXACT   c[937] = 1874
+EXACT   c[938] = 1876
+EXACT   c[939] = 1878
+EXACT   c[940] = 1880
+EXACT   c[941] = 1882
+EXACT   c[942] = 1884
+EXACT   c[943] = 1886
+EXACT   c[944] = 1888
+EXACT   c[945] = 1890
+EXACT   c[946] = 1892
+EXACT   c[947] = 1894
+EXACT   c[948] = 1896
+EXACT   c[949] = 1898
+EXACT   c[950] = 1900
+EXACT   c[951] = 1902
+EXACT   c[952] = 1904
+EXACT   c[953] = 1906
+EXACT   c[954] = 1908
+EXACT   c[955] = 1910
+EXACT   c[956] = 1912
+EXACT   c[957] = 1914
+EXACT   c[958] = 1916
+EXACT   c[959] = 1918
+EXACT   c[960] = 1920
+EXACT   c[961] = 1922
+EXACT   c[962] = 1924
+EXACT   c[963] = 1926
+EXACT   c[964] = 1928
+EXACT   c[965] = 1930
+EXACT   c[966] = 1932
+EXACT   c[967] = 1934
+EXACT   c[968] = 1936
+EXACT   c[969] = 1938
+EXACT   c[970] = 1940
+EXACT   c[971] = 1942
+EXACT   c[972] = 1944
+EXACT   c[973] = 1946
+EXACT   c[974] = 1948
+EXACT   c[975] = 1950
+EXACT   c[976] = 1952
+EXACT   c[977] = 1954
+EXACT   c[978] = 1956
+EXACT   c[979] = 1958
+EXACT   c[980] = 1960
+EXACT   c[981] = 1962
+EXACT   c[982] = 1964
+EXACT   c[983] = 1966
+EXACT   c[984] = 1968
+EXACT   c[985] = 1970
+EXACT   c[986] = 1972
+EXACT   c[987] = 1974
+EXACT   c[988] = 1976
+EXACT   c[989] = 1978
+EXACT   c[990] = 1980
+EXACT   c[991] = 1982
+EXACT   c[992] = 1984
+EXACT   c[993] = 1986
+EXACT   c[994] = 1988
+EXACT   c[995] = 1990
+EXACT   c[996] = 1992
+EXACT   c[997] = 1994
+EXACT   c[998] = 1996
+EXACT   c[999] = 1998
+EXACT   c[1000] = 2000
+EXACT   c[1001] = 2002
+EXACT   c[1002] = 2004
+EXACT   c[1003] = 2006
+EXACT   c[1004] = 2008
+EXACT   c[1005] = 2010
+EXACT   c[1006] = 2012
+EXACT   c[1007] = 2014
+EXACT   c[1008] = 2016
+EXACT   c[1009] = 2018
+EXACT   c[1010] = 2020
+EXACT   c[1011] = 2022
+EXACT   c[1012] = 2024
+EXACT   c[1013] = 2026
+EXACT   c[1014] = 2028
+EXACT   c[1015] = 2030
+EXACT   c[1016] = 2032
+EXACT   c[1017] = 2034
+EXACT   c[1018] = 2036
+EXACT   c[1019] = 2038
+EXACT   c[1020] = 2040
+EXACT   c[1021] = 2042
+EXACT   c[1022] = 2044
+EXACT   c[1023] = 2046
diff --git a/tests/kernels/misc/vector_argument.cl b/tests/kernels/misc/vector_argument.cl
new file mode 100644
index 0000000..9ad03ba
--- /dev/null
+++ b/tests/kernels/misc/vector_argument.cl
@@ -0,0 +1,4 @@
+kernel void vector_argument(int4 vector, global int4 *output)
+{
+  *output = vector + 42;
+}
diff --git a/tests/kernels/misc/vector_argument.ref b/tests/kernels/misc/vector_argument.ref
new file mode 100644
index 0000000..ea2d29f
--- /dev/null
+++ b/tests/kernels/misc/vector_argument.ref
@@ -0,0 +1,5 @@
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 49
+EXACT   output[1] = 84
+EXACT   output[2] = 42
+EXACT   output[3] = 41
diff --git a/tests/kernels/misc/vector_argument.sim b/tests/kernels/misc/vector_argument.sim
new file mode 100644
index 0000000..d54a636
--- /dev/null
+++ b/tests/kernels/misc/vector_argument.sim
@@ -0,0 +1,9 @@
+vector_argument.cl
+vector_argument
+4 1 1
+1 1 1
+
+<size=16>
+7 42 0 -1
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/run_kernel_test.py b/tests/kernels/run_kernel_test.py
deleted file mode 100644
index 9387b9c..0000000
--- a/tests/kernels/run_kernel_test.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# run_kernel_test.py (Oclgrind)
-# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
-# University of Bristol. All rights reserved.
-#
-# This program is provided under a three-clause BSD license. For full
-# license terms please see the LICENSE file distributed with this
-# source code.
-
-import os
-import re
-import subprocess
-import sys
-
-# Check arguments
-if len(sys.argv) != 3:
-  print 'Usage: python run_kernel_test.py EXE SIMFILE'
-  sys.exit(1)
-if not os.path.isfile(sys.argv[2]):
-  print 'Test file not found'
-  sys.exit(1)
-
-# Construct paths to test inputs/outputs
-test_exe    = sys.argv[1]
-test_file   = sys.argv[2]
-test_dir    = os.path.dirname(os.path.realpath(test_file))
-test_file   = os.path.basename(test_file)
-test_name   = os.path.splitext(test_file)[0]
-test_out    = test_name + '.out'
-test_ref    = test_dir + os.path.sep + test_name + '.ref'
-current_dir = os.getcwd()
-
-if os.environ.get('AM_TESTS') == '1':
-  # If running via automake, use build directory for output file
-  test_out = 'tests' + os.path.sep + 'kernels' + os.path.sep + \
-             test_dir.split(os.path.sep)[-1] + os.path.sep + test_out
-else:
-  # Otherwise, use test directory for output file
-  test_out = test_dir + os.path.sep + test_out
-
-# Run oclgrind-kernel
-out = open(test_out, 'w')
-os.chdir(test_dir)
-retval = subprocess.call([test_exe, '--data-races', test_file],
-                         stdout=out, stderr=out)
-out.close()
-if retval != 0:
-  print 'oclgrind-kernel returned non-zero value (' + str(retval) + ')'
-  sys.exit(retval)
-
-# Open output and reference files
-os.chdir(current_dir)
-out = open(test_out).read().splitlines()
-ref = open(test_ref).read().splitlines()
-
-# Scan through file to reach argument data
-oi = 0
-ri = 0
-try:
-  while re.match('Argument \'.*\': [0-9]+ *bytes', out[oi]) == None:
-    oi += 1
-  while re.match('Argument \'.*\': [0-9]+ *bytes', ref[ri]) == None:
-    ri += 1
-except:
-  print 'Error searching for argument data'
-  sys.exit(1)
-
-# Check that an error was produced iff an error was expected
-# An error occured if global memory dump isn't at start of file
-# TODO: Improve this so that more details about the error are checked
-should_error = ri > 1
-if should_error and oi < 2:
-  print 'Error expected, but no error reported'
-  sys.exit(1)
-if not should_error and oi > 1:
-  print 'Error reported, but no error expected'
-  sys.exit(1)
-
-# Check that the global memory dump matches the reference
-# TODO: 32-bit machines will fail this due to memory address comparisons
-match = 1
-while oi < len(out):
-  if out[oi] != ref[ri]:
-    print '[%d:%d] "%s" vs "%s"' % (oi, ri, out[oi], ref[ri])
-    match = 0
-  oi += 1
-  ri += 1
-if not match:
-  print
-  print 'Output didn\'t match reference'
-  sys.exit(1)
-
-# Test passed
-sys.exit(0)
diff --git a/tests/kernels/uninitialized/padded_nested_struct_memcpy.cl b/tests/kernels/uninitialized/padded_nested_struct_memcpy.cl
new file mode 100644
index 0000000..20656ea
--- /dev/null
+++ b/tests/kernels/uninitialized/padded_nested_struct_memcpy.cl
@@ -0,0 +1,27 @@
+struct T
+{
+  char a;
+  int  b;
+  char c;
+};
+
+struct S
+{
+  char a;
+  int  b;
+  char c;
+  struct T d;  // nested struct; its first field lands at byte 12 in the .ref dump
+};
+
+kernel void padded_nested_struct_memcpy(global struct S *output)
+{
+  struct S s;  // private copy; only the six named fields below are written
+  s.a = 1;
+  s.b = 2;
+  s.c = 3;
+  s.d.a = 4;
+  s.d.b = 5;
+  s.d.c = 6;
+
+  *output = s;  // whole-struct copy; the .ref lists no ERROR and leaves the gap bytes unconstrained
+}
diff --git a/tests/kernels/uninitialized/padded_nested_struct_memcpy.ref b/tests/kernels/uninitialized/padded_nested_struct_memcpy.ref
new file mode 100644
index 0000000..a55c060
--- /dev/null
+++ b/tests/kernels/uninitialized/padded_nested_struct_memcpy.ref
@@ -0,0 +1,25 @@
+EXACT Argument 'output': 24 bytes
+EXACT   output[0] = 1
+MATCH   output[1] = 
+MATCH   output[2] = 
+MATCH   output[3] = 
+EXACT   output[4] = 2
+EXACT   output[5] = 0
+EXACT   output[6] = 0
+EXACT   output[7] = 0
+EXACT   output[8] = 3
+MATCH   output[9] = 
+MATCH   output[10] = 
+MATCH   output[11] = 
+EXACT   output[12] = 4
+MATCH   output[13] = 
+MATCH   output[14] = 
+MATCH   output[15] = 
+EXACT   output[16] = 5
+EXACT   output[17] = 0
+EXACT   output[18] = 0
+EXACT   output[19] = 0
+EXACT   output[20] = 6
+MATCH   output[21] = 
+MATCH   output[22] = 
+MATCH   output[23] = 
diff --git a/tests/kernels/uninitialized/padded_nested_struct_memcpy.sim b/tests/kernels/uninitialized/padded_nested_struct_memcpy.sim
new file mode 100644
index 0000000..4c9d374
--- /dev/null
+++ b/tests/kernels/uninitialized/padded_nested_struct_memcpy.sim
@@ -0,0 +1,6 @@
+padded_nested_struct_memcpy.cl
+padded_nested_struct_memcpy
+1 1 1
+1 1 1
+
+<size=24 char fill=0 dump>
diff --git a/tests/kernels/uninitialized/padded_struct_alloca_fp.cl b/tests/kernels/uninitialized/padded_struct_alloca_fp.cl
new file mode 100644
index 0000000..7f878a8
--- /dev/null
+++ b/tests/kernels/uninitialized/padded_struct_alloca_fp.cl
@@ -0,0 +1,16 @@
+struct S
+{
+  char a;
+  int  b;
+  char c;
+};
+
+kernel void padded_struct_alloca_fp(global struct S *output)
+{
+  struct S s;  // every named field is assigned before the copy
+  s.a = 42;
+  s.b = -7;
+  s.c = 127;
+
+  *output = s;  // the .ref lists no ERROR for this copy ("_fp" presumably = false-positive check)
+}
diff --git a/tests/kernels/uninitialized/padded_struct_alloca_fp.ref b/tests/kernels/uninitialized/padded_struct_alloca_fp.ref
new file mode 100644
index 0000000..609ed8f
--- /dev/null
+++ b/tests/kernels/uninitialized/padded_struct_alloca_fp.ref
@@ -0,0 +1,4 @@
+EXACT Argument 'output': 12 bytes
+EXACT   output[0] = 42
+EXACT   output[1] = -7
+EXACT   output[2] = 127
diff --git a/tests/kernels/uninitialized/padded_struct_alloca_fp.sim b/tests/kernels/uninitialized/padded_struct_alloca_fp.sim
new file mode 100644
index 0000000..74f1b0a
--- /dev/null
+++ b/tests/kernels/uninitialized/padded_struct_alloca_fp.sim
@@ -0,0 +1,6 @@
+padded_struct_alloca_fp.cl
+padded_struct_alloca_fp
+1 1 1
+1 1 1
+
+<size=12 int fill=0 dump>
diff --git a/tests/kernels/uninitialized/padded_struct_memcpy_fp.cl b/tests/kernels/uninitialized/padded_struct_memcpy_fp.cl
new file mode 100644
index 0000000..f1a449b
--- /dev/null
+++ b/tests/kernels/uninitialized/padded_struct_memcpy_fp.cl
@@ -0,0 +1,27 @@
+struct S
+{
+  char a;
+  int  b;
+  char c;
+};
+
+kernel void padded_struct_memcpy_fp(local struct S *scratch,
+                                    global struct S *output)
+{
+  int lid = get_local_id(0);
+
+  struct S s;  // every named field is assigned below
+  s.a = 42;
+  s.b = -7;
+  s.c = 127;
+
+  if (lid == 0)  // work-item 0 stores s into local memory
+  {
+    *scratch = s;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);  // orders the local store before work-item 1's read
+  if (lid == 1)  // work-item 1 copies the struct from local to global memory
+  {
+    *output = *scratch;
+  }
+}
diff --git a/tests/kernels/uninitialized/padded_struct_memcpy_fp.ref b/tests/kernels/uninitialized/padded_struct_memcpy_fp.ref
new file mode 100644
index 0000000..609ed8f
--- /dev/null
+++ b/tests/kernels/uninitialized/padded_struct_memcpy_fp.ref
@@ -0,0 +1,4 @@
+EXACT Argument 'output': 12 bytes
+EXACT   output[0] = 42
+EXACT   output[1] = -7
+EXACT   output[2] = 127
diff --git a/tests/kernels/uninitialized/padded_struct_memcpy_fp.sim b/tests/kernels/uninitialized/padded_struct_memcpy_fp.sim
new file mode 100644
index 0000000..8ed2ae9
--- /dev/null
+++ b/tests/kernels/uninitialized/padded_struct_memcpy_fp.sim
@@ -0,0 +1,8 @@
+padded_struct_memcpy_fp.cl
+padded_struct_memcpy_fp
+2 1 1
+2 1 1
+
+<size=12 char>
+
+<size=12 int fill=0 dump>
diff --git a/tests/kernels/uninitialized/partially_uninitialized_fract.cl b/tests/kernels/uninitialized/partially_uninitialized_fract.cl
new file mode 100644
index 0000000..c277e01
--- /dev/null
+++ b/tests/kernels/uninitialized/partially_uninitialized_fract.cl
@@ -0,0 +1,6 @@
+__kernel void partially_uninitialized_fract(__global float4 *output)
+{
+    float4 f;
+    f.xzw = 4.2;  // f.y is never assigned
+    *(output + 1) = fract(f, output);  // fract writes through its pointer argument (*output); its return value goes to *(output + 1)
+}
diff --git a/tests/kernels/uninitialized/partially_uninitialized_fract.ref b/tests/kernels/uninitialized/partially_uninitialized_fract.ref
new file mode 100644
index 0000000..7a74bbb
--- /dev/null
+++ b/tests/kernels/uninitialized/partially_uninitialized_fract.ref
@@ -0,0 +1,12 @@
+ERROR Uninitialized value
+ERROR Uninitialized value
+
+EXACT Argument 'output': 32 bytes
+EXACT   output[0] = 4
+MATCH   output[1] = 
+EXACT   output[2] = 4
+EXACT   output[3] = 4
+EXACT   output[4] = 0.2
+MATCH   output[5] = 
+EXACT   output[6] = 0.2
+EXACT   output[7] = 0.2
diff --git a/tests/kernels/uninitialized/partially_uninitialized_fract.sim b/tests/kernels/uninitialized/partially_uninitialized_fract.sim
new file mode 100644
index 0000000..699fb0e
--- /dev/null
+++ b/tests/kernels/uninitialized/partially_uninitialized_fract.sim
@@ -0,0 +1,6 @@
+partially_uninitialized_fract.cl
+partially_uninitialized_fract
+1 1 1
+1 1 1
+
+<size=32 fill=0 dump>
diff --git a/tests/kernels/uninitialized/private_array_initializer_list.cl b/tests/kernels/uninitialized/private_array_initializer_list.cl
new file mode 100644
index 0000000..20bfdab
--- /dev/null
+++ b/tests/kernels/uninitialized/private_array_initializer_list.cl
@@ -0,0 +1,9 @@
+kernel void private_array_initializer_list(global float *output)
+{
+  float scratch[4] = {7.f, 42.f, -1.f, 0.f};  // all four elements initialised; the .ref lists no ERROR
+
+  for (int i = 0; i < 4; i++)
+  {
+    output[i] = scratch[i];
+  }
+}
diff --git a/tests/kernels/uninitialized/private_array_initializer_list.ref b/tests/kernels/uninitialized/private_array_initializer_list.ref
new file mode 100644
index 0000000..7de7145
--- /dev/null
+++ b/tests/kernels/uninitialized/private_array_initializer_list.ref
@@ -0,0 +1,5 @@
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 7
+EXACT   output[1] = 42
+EXACT   output[2] = -1
+EXACT   output[3] = 0
diff --git a/tests/kernels/uninitialized/private_array_initializer_list.sim b/tests/kernels/uninitialized/private_array_initializer_list.sim
new file mode 100644
index 0000000..b672712
--- /dev/null
+++ b/tests/kernels/uninitialized/private_array_initializer_list.sim
@@ -0,0 +1,6 @@
+private_array_initializer_list.cl
+private_array_initializer_list
+1 1 1
+1 1 1
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/uninitialized/uninitialized_address.cl b/tests/kernels/uninitialized/uninitialized_address.cl
new file mode 100644
index 0000000..5e39a8c
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_address.cl
@@ -0,0 +1,9 @@
+__kernel void uninitialized_address(__global ulong *output)
+{
+  int a[] = {1, 2, 3};
+  volatile int i, j;  // neither index is ever assigned
+
+  a[i] = 4;  // store through an uninitialized index
+
+  output[0] = a[j];  // load through an uninitialized index; the .ref expects two "Uninitialized address" errors
+}
diff --git a/tests/kernels/uninitialized/uninitialized_address.ref b/tests/kernels/uninitialized/uninitialized_address.ref
new file mode 100644
index 0000000..b9045ab
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_address.ref
@@ -0,0 +1,5 @@
+ERROR Uninitialized address
+ERROR Uninitialized address
+
+EXACT Argument 'output': 8 bytes
+MATCH   output[0] = 
diff --git a/tests/kernels/uninitialized/uninitialized_address.sim b/tests/kernels/uninitialized/uninitialized_address.sim
new file mode 100644
index 0000000..6fc429c
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_address.sim
@@ -0,0 +1,6 @@
+uninitialized_address.cl
+uninitialized_address
+1 1 1
+1 1 1
+
+<size=8 fill=0 dump>
diff --git a/tests/kernels/uninitialized/uninitialized_global_buffer.cl b/tests/kernels/uninitialized/uninitialized_global_buffer.cl
new file mode 100644
index 0000000..59038f4
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_global_buffer.cl
@@ -0,0 +1,5 @@
+kernel void uninitialized_global_buffer(global float *input,
+                                        global float *output)
+{
+  output[get_global_id(0)] = *input;  // the .sim declares input as noinit; the .ref expects "Uninitialized value"
+}
diff --git a/tests/kernels/uninitialized/uninitialized_global_buffer.ref b/tests/kernels/uninitialized/uninitialized_global_buffer.ref
new file mode 100644
index 0000000..34caf30
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_global_buffer.ref
@@ -0,0 +1,4 @@
+ERROR Uninitialized value
+
+EXACT Argument 'output': 4 bytes
+EXACT   output[0] = 0
diff --git a/tests/kernels/uninitialized/uninitialized_global_buffer.sim b/tests/kernels/uninitialized/uninitialized_global_buffer.sim
new file mode 100644
index 0000000..184ea3d
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_global_buffer.sim
@@ -0,0 +1,8 @@
+uninitialized_global_buffer.cl
+uninitialized_global_buffer
+1 1 1
+1 1 1
+
+<size=4 noinit>
+
+<size=4 fill=0 dump>
diff --git a/tests/kernels/uninitialized/uninitialized_local_array.cl b/tests/kernels/uninitialized/uninitialized_local_array.cl
new file mode 100644
index 0000000..0c95007
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_local_array.cl
@@ -0,0 +1,11 @@
+kernel void uninitialized_local_array(global float *output)
+{
+  local float scratch[16];
+
+  int i = get_local_id(0);
+  if (i != get_local_size(0)/2)  // the middle work-item skips the store (id 8 with the .sim's 16 work-items)
+  {
+    scratch[i] = i;
+  }
+  output[i] = scratch[i];  // so that work-item reads an element that was never written
+}
diff --git a/tests/kernels/uninitialized/uninitialized_local_array.ref b/tests/kernels/uninitialized/uninitialized_local_array.ref
new file mode 100644
index 0000000..85ba40e
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_local_array.ref
@@ -0,0 +1,19 @@
+ERROR Uninitialized value
+
+EXACT Argument 'output': 64 bytes
+EXACT   output[0] = 0
+EXACT   output[1] = 1
+EXACT   output[2] = 2
+EXACT   output[3] = 3
+EXACT   output[4] = 4
+EXACT   output[5] = 5
+EXACT   output[6] = 6
+EXACT   output[7] = 7
+EXACT   output[8] = 0
+EXACT   output[9] = 9
+EXACT   output[10] = 10
+EXACT   output[11] = 11
+EXACT   output[12] = 12
+EXACT   output[13] = 13
+EXACT   output[14] = 14
+EXACT   output[15] = 15
diff --git a/tests/kernels/uninitialized/uninitialized_local_array.sim b/tests/kernels/uninitialized/uninitialized_local_array.sim
new file mode 100644
index 0000000..3dc20c5
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_local_array.sim
@@ -0,0 +1,6 @@
+uninitialized_local_array.cl
+uninitialized_local_array
+16 1 1
+16 1 1
+
+<size=64 fill=0 dump>
diff --git a/tests/kernels/uninitialized/uninitialized_local_ptr.cl b/tests/kernels/uninitialized/uninitialized_local_ptr.cl
new file mode 100644
index 0000000..76631b8
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_local_ptr.cl
@@ -0,0 +1,9 @@
+kernel void uninitialized_local_ptr(local float *scratch, global float *output)
+{
+  int i = get_local_id(0);
+  if (i != get_local_size(0)/2)  // the middle work-item skips the store (id 8 with the .sim's 16 work-items)
+  {
+    scratch[i] = i;
+  }
+  output[i] = scratch[i];  // so that work-item reads an element that was never written
+}
diff --git a/tests/kernels/uninitialized/uninitialized_local_ptr.ref b/tests/kernels/uninitialized/uninitialized_local_ptr.ref
new file mode 100644
index 0000000..85ba40e
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_local_ptr.ref
@@ -0,0 +1,19 @@
+ERROR Uninitialized value
+
+EXACT Argument 'output': 64 bytes
+EXACT   output[0] = 0
+EXACT   output[1] = 1
+EXACT   output[2] = 2
+EXACT   output[3] = 3
+EXACT   output[4] = 4
+EXACT   output[5] = 5
+EXACT   output[6] = 6
+EXACT   output[7] = 7
+EXACT   output[8] = 0
+EXACT   output[9] = 9
+EXACT   output[10] = 10
+EXACT   output[11] = 11
+EXACT   output[12] = 12
+EXACT   output[13] = 13
+EXACT   output[14] = 14
+EXACT   output[15] = 15
diff --git a/tests/kernels/uninitialized/uninitialized_local_ptr.sim b/tests/kernels/uninitialized/uninitialized_local_ptr.sim
new file mode 100644
index 0000000..47c711b
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_local_ptr.sim
@@ -0,0 +1,8 @@
+uninitialized_local_ptr.cl
+uninitialized_local_ptr
+16 1 1
+16 1 1
+
+<size=64>
+
+<size=64 fill=0 dump>
diff --git a/tests/kernels/uninitialized/uninitialized_local_variable.cl b/tests/kernels/uninitialized/uninitialized_local_variable.cl
new file mode 100644
index 0000000..1cf8685
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_local_variable.cl
@@ -0,0 +1,7 @@
+kernel void uninitialized_local_variable(global int *output)
+{
+  local int x;
+  if (*output > 0)  // not taken when the buffer holds 0, which the .sim's fill=0 sets up
+    x = *output;
+  *output = x;  // x is read without having been written on that path
+}
diff --git a/tests/kernels/uninitialized/uninitialized_local_variable.ref b/tests/kernels/uninitialized/uninitialized_local_variable.ref
new file mode 100644
index 0000000..34caf30
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_local_variable.ref
@@ -0,0 +1,4 @@
+ERROR Uninitialized value
+
+EXACT Argument 'output': 4 bytes
+EXACT   output[0] = 0
diff --git a/tests/kernels/uninitialized/uninitialized_local_variable.sim b/tests/kernels/uninitialized/uninitialized_local_variable.sim
new file mode 100644
index 0000000..df2838e
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_local_variable.sim
@@ -0,0 +1,6 @@
+uninitialized_local_variable.cl
+uninitialized_local_variable
+1 1 1
+1 1 1
+
+<size=4 dump fill=0>
diff --git a/tests/kernels/uninitialized/uninitialized_packed_struct_memcpy.cl b/tests/kernels/uninitialized/uninitialized_packed_struct_memcpy.cl
new file mode 100644
index 0000000..fbc888e
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_packed_struct_memcpy.cl
@@ -0,0 +1,12 @@
+struct __attribute__ ((packed)) S  // dumped as 6 bytes in the .ref
+{
+  char a;
+  int  b __attribute__ ((packed));
+  char c;
+};
+
+kernel void uninitialized_packed_struct_memcpy(local int *scratch, global struct S *output)
+{
+  struct S s = {1, *scratch, 2};  // b is loaded from the local buffer, which this kernel never writes
+  *output = s;  // the .ref expects "Uninitialized value" and leaves bytes 1-4 unconstrained
+}
diff --git a/tests/kernels/uninitialized/uninitialized_packed_struct_memcpy.ref b/tests/kernels/uninitialized/uninitialized_packed_struct_memcpy.ref
new file mode 100644
index 0000000..a0d374f
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_packed_struct_memcpy.ref
@@ -0,0 +1,9 @@
+ERROR Uninitialized value
+
+EXACT Argument 'output': 6 bytes
+EXACT   output[0] = 1
+MATCH   output[1] = 
+MATCH   output[2] = 
+MATCH   output[3] = 
+MATCH   output[4] = 
+EXACT   output[5] = 2
diff --git a/tests/kernels/uninitialized/uninitialized_packed_struct_memcpy.sim b/tests/kernels/uninitialized/uninitialized_packed_struct_memcpy.sim
new file mode 100644
index 0000000..236ae5b
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_packed_struct_memcpy.sim
@@ -0,0 +1,8 @@
+uninitialized_packed_struct_memcpy.cl
+uninitialized_packed_struct_memcpy
+1 1 1
+1 1 1
+
+<size=8>
+
+<size=6 char fill=0 dump>
diff --git a/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.cl b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.cl
new file mode 100644
index 0000000..4558ff6
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.cl
@@ -0,0 +1,20 @@
+struct T
+{
+  char a;
+  int  b;
+  char c;
+};
+
+struct S
+{
+  char a;
+  int  b;
+  char c;
+  struct T d;  // nested struct; its first field lands at byte 12 in the .ref dump
+};
+
+kernel void uninitialized_padded_nested_struct_memcpy(local int *scratch, global struct S *output)
+{
+  struct S s = {1, 2, 3, {4, *scratch, 5}};  // only d.b is loaded from the local buffer, which this kernel never writes
+  *output = s;  // the .ref expects "Uninitialized value" and leaves bytes 16-19 unconstrained
+}
diff --git a/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.ref b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.ref
new file mode 100644
index 0000000..ad64661
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.ref
@@ -0,0 +1,27 @@
+ERROR Uninitialized value
+
+EXACT Argument 'output': 24 bytes
+EXACT   output[0] = 1
+MATCH   output[1] = 
+MATCH   output[2] = 
+MATCH   output[3] = 
+EXACT   output[4] = 2
+EXACT   output[5] = 0
+EXACT   output[6] = 0
+EXACT   output[7] = 0
+EXACT   output[8] = 3
+MATCH   output[9] = 
+MATCH   output[10] = 
+MATCH   output[11] = 
+EXACT   output[12] = 4
+MATCH   output[13] = 
+MATCH   output[14] = 
+MATCH   output[15] = 
+MATCH   output[16] = 
+MATCH   output[17] = 
+MATCH   output[18] = 
+MATCH   output[19] = 
+EXACT   output[20] = 5
+MATCH   output[21] = 
+MATCH   output[22] = 
+MATCH   output[23] = 
diff --git a/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.sim b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.sim
new file mode 100644
index 0000000..fde5c05
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_padded_nested_struct_memcpy.sim
@@ -0,0 +1,8 @@
+uninitialized_padded_nested_struct_memcpy.cl
+uninitialized_padded_nested_struct_memcpy
+1 1 1
+1 1 1
+
+<size=8>
+
+<size=24 char fill=0 dump>
diff --git a/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.cl b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.cl
new file mode 100644
index 0000000..3b96f3a
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.cl
@@ -0,0 +1,12 @@
+struct S
+{
+  char a;
+  int  b;
+  char c;
+};
+
+kernel void uninitialized_padded_struct_memcpy(local int *scratch, global struct S *output)
+{
+  struct S s = {1, *scratch, 2};  // b is loaded from the local buffer, which this kernel never writes
+  *output = s;  // the .ref expects "Uninitialized value" and leaves the second dumped int unconstrained
+}
diff --git a/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.ref b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.ref
new file mode 100644
index 0000000..3fffd1d
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.ref
@@ -0,0 +1,6 @@
+ERROR Uninitialized value
+
+EXACT Argument 'output': 12 bytes
+EXACT   output[0] = 1
+MATCH   output[1] = 
+EXACT   output[2] = 2
diff --git a/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.sim b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.sim
new file mode 100644
index 0000000..cdf5827
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_padded_struct_memcpy.sim
@@ -0,0 +1,8 @@
+uninitialized_padded_struct_memcpy.cl
+uninitialized_padded_struct_memcpy
+1 1 1
+1 1 1
+
+<size=8>
+
+<size=12 int fill=0 dump>
diff --git a/tests/kernels/uninitialized/uninitialized_private_array.cl b/tests/kernels/uninitialized/uninitialized_private_array.cl
new file mode 100644
index 0000000..2f46248
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_private_array.cl
@@ -0,0 +1,16 @@
+kernel void uninitialized_private_array(global uint  *indices,
+                                        global float *input,  // not read by this kernel
+                                        global float *output)
+{
+  float scratch[4];
+
+  for (int i = 0; i < 4; i++)
+  {
+    scratch[indices[i]] = i;  // the .sim supplies indices 0,1,1,3, so scratch[2] is never written
+  }
+
+  for (int i = 0; i < 4; i++)
+  {
+    output[i] = scratch[i];  // reads all four elements, including the unwritten one
+  }
+}
diff --git a/tests/kernels/uninitialized/uninitialized_private_array.ref b/tests/kernels/uninitialized/uninitialized_private_array.ref
new file mode 100644
index 0000000..21fef4f
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_private_array.ref
@@ -0,0 +1,7 @@
+ERROR Uninitialized value
+
+EXACT Argument 'output': 16 bytes
+EXACT   output[0] = 0
+EXACT   output[1] = 2
+EXACT   output[2] = 0
+EXACT   output[3] = 3
diff --git a/tests/kernels/uninitialized/uninitialized_private_array.sim b/tests/kernels/uninitialized/uninitialized_private_array.sim
new file mode 100644
index 0000000..c03f63c
--- /dev/null
+++ b/tests/kernels/uninitialized/uninitialized_private_array.sim
@@ -0,0 +1,14 @@
+uninitialized_private_array.cl
+uninitialized_private_array
+1 1 1
+1 1 1
+
+<size=16>
+0
+1
+1
+3
+
+<size=16 range=1:1:4>
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/wait_event/wait_event_chained.ref b/tests/kernels/wait_event/wait_event_chained.ref
index cf0b04f..690f35e 100644
--- a/tests/kernels/wait_event/wait_event_chained.ref
+++ b/tests/kernels/wait_event/wait_event_chained.ref
@@ -1,7 +1,5 @@
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 2
-  data[2] = 1
-  data[3] = 0
-
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 0
diff --git a/tests/kernels/wait_event/wait_event_divergent.cl b/tests/kernels/wait_event/wait_event_divergent.cl
index d88f3f3..6d56d10 100644
--- a/tests/kernels/wait_event/wait_event_divergent.cl
+++ b/tests/kernels/wait_event/wait_event_divergent.cl
@@ -1,6 +1,9 @@
 kernel void wait_event_divergent(global int *data, local int *scratch)
 {
   int i = get_local_id(0);
+  scratch[i] = 0;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
   event_t events[2];
   events[0] = async_work_group_copy(scratch, data, 1, 0);
   events[1] = async_work_group_copy(scratch+1, data+1, 1, 0);
diff --git a/tests/kernels/wait_event/wait_event_divergent.ref b/tests/kernels/wait_event/wait_event_divergent.ref
index 56f64ac..1f7cc25 100644
--- a/tests/kernels/wait_event/wait_event_divergent.ref
+++ b/tests/kernels/wait_event/wait_event_divergent.ref
@@ -1,6 +1,6 @@
-ERROR EXPECTED
-
-Argument 'data': 8 bytes
-  data[0] = 0
-  data[1] = 0
+ERROR Work-group divergence detected (barrier)
+ERROR Work-item finished without waiting for events
 
+EXACT Argument 'data': 8 bytes
+EXACT   data[0] = 0
+EXACT   data[1] = 0
diff --git a/tests/kernels/wait_event/wait_event_duplicates.ref b/tests/kernels/wait_event/wait_event_duplicates.ref
index cf0b04f..690f35e 100644
--- a/tests/kernels/wait_event/wait_event_duplicates.ref
+++ b/tests/kernels/wait_event/wait_event_duplicates.ref
@@ -1,7 +1,5 @@
-
-Argument 'data': 16 bytes
-  data[0] = 3
-  data[1] = 2
-  data[2] = 1
-  data[3] = 0
-
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 3
+EXACT   data[1] = 2
+EXACT   data[2] = 1
+EXACT   data[3] = 0
diff --git a/tests/kernels/wait_event/wait_event_invalid.ref b/tests/kernels/wait_event/wait_event_invalid.ref
index 4da13c4..1c2467d 100644
--- a/tests/kernels/wait_event/wait_event_invalid.ref
+++ b/tests/kernels/wait_event/wait_event_invalid.ref
@@ -1,8 +1,7 @@
-ERROR EXPECTED
-
-Argument 'data': 16 bytes
-  data[0] = 0
-  data[1] = 1
-  data[2] = 2
-  data[3] = 3
+ERROR Invalid wait event
 
+EXACT Argument 'data': 16 bytes
+EXACT   data[0] = 0
+EXACT   data[1] = 1
+EXACT   data[2] = 2
+EXACT   data[3] = 3
diff --git a/tests/run_test.py b/tests/run_test.py
new file mode 100644
index 0000000..330ca7e
--- /dev/null
+++ b/tests/run_test.py
@@ -0,0 +1,145 @@
+# run_test.py (Oclgrind)
+# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+#
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+
+import errno
+import os
+import re
+import subprocess
+import sys
+
+# Check arguments
+if len(sys.argv) != 3:
+  print('Usage: python run_test.py OCLGRIND-KERNEL TEST_EXE|TEST.sim')
+  sys.exit(1)
+if not os.path.isfile(sys.argv[2]):
+  print('Test file not found')
+  sys.exit(1)
+
+# Construct paths to test inputs/outputs
+oclgrind_kernel = sys.argv[1]
+test_full_path  = sys.argv[2]
+test_dir        = os.path.dirname(os.path.realpath(test_full_path))
+test_file       = os.path.basename(test_full_path)
+test_name       = os.path.splitext(test_file)[0]
+current_dir     = os.getcwd()  # remembered so run() can chdir back after a .sim test
+
+if test_file.endswith('.sim'):
+  test_ref = test_full_path[:-4] + '.ref'  # swap the '.sim' extension for '.ref'
+else:
+  if test_full_path[0] == '/':
+    rel_path = test_full_path[test_full_path.find('/tests/') + 7:]  # 7 == len('/tests/'); NOTE(review): find() gives -1 if the marker is absent, so the slice would start at index 6
+  else:
+    rel_path = test_full_path
+
+  test_ref = os.path.dirname(os.path.abspath(__file__)) + os.path.sep \
+    + rel_path + '.ref'  # reference file is looked up relative to this script's directory
+
+# Enable race detection and uninitialized memory plugins
+os.environ["OCLGRIND_CHECK_API"] = "1"
+os.environ["OCLGRIND_DATA_RACES"] = "1"
+os.environ["OCLGRIND_UNINITIALIZED"] = "1"
+
+def fail(ret=1):  # report failure and terminate the script with exit status ret
+  print('FAILED')
+  sys.exit(ret)
+
+def run(output_suffix):
+  """Run the test once; fail() on non-zero exit or reference mismatch."""
+
+  # Get filename for test output
+  if test_file.endswith('.sim'):
+    test_out = test_dir.split(os.path.sep)[-1] + os.path.sep + \
+               test_name + output_suffix + '.out'
+  else:
+    test_out = test_dir + os.path.sep + \
+               test_name + output_suffix + '.out'
+
+  output_dir = os.path.dirname(test_out)
+  try:
+    os.makedirs(output_dir)
+  except OSError as exc:
+    if exc.errno != errno.EEXIST or not os.path.isdir(output_dir):
+      raise
+
+  # Run test, sending both stdout and stderr to the output file
+  with open(test_out, 'w') as out:
+    if test_file.endswith('.sim'):
+      os.chdir(test_dir)
+      retval = subprocess.call([oclgrind_kernel, test_file],
+                               stdout=out, stderr=out)
+      os.chdir(current_dir)
+    else:
+      retval = subprocess.call([test_full_path], stdout=out, stderr=out)
+
+  if retval != 0:
+    print('Test returned non-zero value (' + str(retval) + ')')
+    fail(retval)
+
+  # Compare output to reference file (if provided)
+  if not os.path.isfile(test_ref):
+    return
+
+  # Read output and reference files
+  with open(test_out) as f:
+    out = f.read().splitlines()
+  with open(test_ref) as f:
+    ref = f.read().splitlines()
+
+  # Check output matches references
+  oi = 0
+  for line in ref:
+    if len(line) == 0:
+      continue
+
+    # Each reference line is a five-letter tag, a space, then the text
+    kind = line.split()[0]
+    text = line[6:]
+
+    # Find next non-blank line in output file
+    while oi < len(out) and not len(out[oi]):
+      oi += 1
+    if oi >= len(out):
+      # Output ended early: fail cleanly instead of raising IndexError
+      print('Expected '  + line)
+      print('Found    end of output')
+      fail()
+
+    if kind in ('ERROR', 'MATCH'):
+      # Output line must contain the reference text
+      matched = text in out[oi]
+    elif kind == 'EXACT':
+      # Output line must equal the reference text exactly
+      matched = text == out[oi]
+    else:
+      print('Invalid match type in reference file')
+      fail()
+
+    if not matched:
+      print('Expected '  + line)
+      print('Found    "' + out[oi] + '"')
+      fail()
+
+    if kind == 'ERROR':
+      # Skip remaining lines of error
+      while oi < len(out) and len(out[oi]):
+        oi += 1
+    else:
+      oi += 1
+
+print('Running test with optimisations')
+run('')
+print('PASSED')  # only reached if run() did not exit through fail()
+
+print('')
+print('Running test without optimisations')
+os.environ["OCLGRIND_BUILD_OPTIONS"] = "-cl-opt-disable"  # set after the first run, so it only affects this second one
+run('_noopt')  # suffix keeps this run's .out file separate from the first run's
+print('PASSED')
+
+# Test passed
+sys.exit(0)
diff --git a/tests/apps/CMakeLists.txt b/tests/runtime/CMakeLists.txt
similarity index 57%
copy from tests/apps/CMakeLists.txt
copy to tests/runtime/CMakeLists.txt
index 0dff241..c700f96 100644
--- a/tests/apps/CMakeLists.txt
+++ b/tests/runtime/CMakeLists.txt
@@ -1,33 +1,40 @@
 # CMakeLists.txt (Oclgrind)
-# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# Copyright (c) 2013-2016, James Price and Simon McIntosh-Smith,
 # University of Bristol. All rights reserved.
 #
 # This program is provided under a three-clause BSD license. For full
 # license terms please see the LICENSE file distributed with this
 # source code.
 
-# Add app tests
+set(COMMON_SOURCES ../common/common.c ../common/common.h)
+include_directories(../common)
+
+# Add runtime tests
 foreach(test
-  vecadd)
+  map_buffer)
 
-  add_executable(${test} ${test}/${test}.c)
+  add_executable(${test} ${test}.c ${COMMON_SOURCES})
   target_link_libraries(${test} oclgrind-rt)
 
   # Generate test binaries in same dir as Oclgrind libraries on Windows
   if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
-    add_test(app_${test} "${CMAKE_BINARY_DIR}/${test}")
     set_target_properties(${test} PROPERTIES
       RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
   else()
-    add_test(app_${test} "${test}/${test}")
-    set_target_properties(${test} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${test}")
     set_target_properties(${test} PROPERTIES LINKER_LANGUAGE CXX)
   endif()
 
-  set_tests_properties(app_${test} PROPERTIES DEPENDS ${test})
+  add_test(
+    NAME rt_${test}
+    COMMAND
+    ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/run_test.py
+    $<TARGET_FILE:oclgrind-kernel>
+    $<TARGET_FILE:${test}>)
+
+  set_tests_properties(rt_${test} PROPERTIES DEPENDS ${test})
 
   # Set PCH directory
-  set_tests_properties(app_${test} PROPERTIES
+  set_tests_properties(rt_${test} PROPERTIES
       ENVIRONMENT "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
 
 endforeach(${test})
diff --git a/tests/runtime/map_buffer.c b/tests/runtime/map_buffer.c
new file mode 100644
index 0000000..059b261
--- /dev/null
+++ b/tests/runtime/map_buffer.c
@@ -0,0 +1,327 @@
+#include "common.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define TOL 1e-8 // largest |expected - actual| that checkResults accepts
+#define MAX_ERRORS 8 // checkResults prints at most this many mismatches
+// OpenCL C source of the kernel under test: element-wise c[i] = a[i] + b[i]
+const char *KERNEL_SOURCE =
+"kernel void vecadd(global float *a, \n"
+"                   global float *b, \n"
+"                   global float *c) \n"
+"{                                   \n"
+"  int i = get_global_id(0);         \n"
+"  c[i] = a[i] + b[i];               \n"
+"}                                   \n"
+;
+// Defined after main(): counts results[i] that are off by more than TOL
+unsigned checkResults(size_t N, float *a, float *b, float *results);
+
+// Run everything as normal: all buffers are unmapped before the kernel runs
+unsigned run1(Context cl, cl_kernel kernel,
+              cl_mem d_a, cl_mem d_b, cl_mem d_c, size_t N)
+{
+  cl_int err;
+  float *h_a, *h_b, *h_c;
+  size_t dataSize = N*sizeof(cl_float);
+
+  // Initialise data; NOTE(review): h_c is written but mapped with CL_MAP_READ
+  srand(0);
+  h_a = clEnqueueMapBuffer(cl.queue, d_a, CL_TRUE,
+                           CL_MAP_WRITE_INVALIDATE_REGION,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_a buffer");
+  h_b = clEnqueueMapBuffer(cl.queue, d_b, CL_TRUE,
+                           CL_MAP_WRITE_INVALIDATE_REGION,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_b buffer");
+  h_c = clEnqueueMapBuffer(cl.queue, d_c, CL_FALSE, CL_MAP_READ, 0, dataSize,
+                           0, NULL, NULL, &err);
+  checkError(err, "mapping d_c buffer");
+  for (unsigned i = 0; i < N; i++)
+  {
+    h_a[i] = rand()/(float)RAND_MAX;
+    h_b[i] = rand()/(float)RAND_MAX;
+    h_c[i] = 0;
+  }
+  // Unmap all three buffers before the kernel is enqueued
+  err = clEnqueueUnmapMemObject(cl.queue, d_a, h_a, 0, NULL, NULL);
+  checkError(err, "unmapping d_a");
+  err = clEnqueueUnmapMemObject(cl.queue, d_b, h_b, 0, NULL, NULL);
+  checkError(err, "unmapping d_b");
+  err = clEnqueueUnmapMemObject(cl.queue, d_c, h_c, 0, NULL, NULL);
+  checkError(err, "unmapping d_c");
+
+  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
+  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
+  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
+  checkError(err, "setting kernel args");
+
+  err = clEnqueueNDRangeKernel(cl.queue, kernel,
+                               1, NULL, &N, NULL, 0, NULL, NULL);
+  checkError(err, "enqueuing kernel");
+  // Map the result for reading; CL_FALSE = non-blocking, clFinish waits below
+  h_c = clEnqueueMapBuffer(cl.queue, d_c, CL_FALSE, CL_MAP_READ, 0, dataSize,
+                           0, NULL, NULL, &err);
+  checkError(err, "mapping d_c buffer");
+
+  err = clFinish(cl.queue);
+  checkError(err, "running kernel");
+  // NOTE(review): h_a/h_b were unmapped above but are still read here - confirm
+  unsigned errors = checkResults(N, h_a, h_b, h_c);
+
+  err = clEnqueueUnmapMemObject(cl.queue, d_c, h_c, 0, NULL, NULL);
+  checkError(err, "unmapping d_c");
+
+  return errors;
+}
+
+// Launch the kernel while d_a and d_b are still mapped for writing
+// Should result in "Invalid read from buffer mapped for writing" error
+unsigned run2(Context cl, cl_kernel kernel,
+              cl_mem d_a, cl_mem d_b, cl_mem d_c, size_t N)
+{
+  cl_int err;
+  float *h_a, *h_b, *h_c;
+  size_t dataSize = N*sizeof(cl_float);
+
+  // Initialise data; NOTE(review): h_c is written but mapped with CL_MAP_READ
+  srand(0);
+  h_a = clEnqueueMapBuffer(cl.queue, d_a, CL_TRUE,
+                           CL_MAP_WRITE_INVALIDATE_REGION,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_a buffer");
+  h_b = clEnqueueMapBuffer(cl.queue, d_b, CL_TRUE,
+                           CL_MAP_WRITE_INVALIDATE_REGION,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_b buffer");
+  h_c = clEnqueueMapBuffer(cl.queue, d_c, CL_FALSE, CL_MAP_READ, 0, dataSize,
+                           0, NULL, NULL, &err);
+  checkError(err, "mapping d_c buffer");
+  for (unsigned i = 0; i < N; i++)
+  {
+    h_a[i] = rand()/(float)RAND_MAX;
+    h_b[i] = rand()/(float)RAND_MAX;
+    h_c[i] = 0;
+  }
+  // Only the output buffer is unmapped; the inputs stay mapped on purpose
+  err = clEnqueueUnmapMemObject(cl.queue, d_c, h_c, 0, NULL, NULL);
+  checkError(err, "unmapping d_c");
+
+  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
+  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
+  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
+  checkError(err, "setting kernel args");
+
+  err = clEnqueueNDRangeKernel(cl.queue, kernel,
+                               1, NULL, &N, NULL, 0, NULL, NULL);
+  checkError(err, "enqueuing kernel");
+  // Map the result for reading; CL_FALSE = non-blocking, clFinish waits below
+  h_c = clEnqueueMapBuffer(cl.queue, d_c, CL_FALSE, CL_MAP_READ, 0, dataSize,
+                           0, NULL, NULL, &err);
+  checkError(err, "mapping d_c buffer");
+
+  err = clFinish(cl.queue);
+  checkError(err, "running kernel");
+
+  unsigned errors = checkResults(N, h_a, h_b, h_c);
+  // The inputs are finally unmapped here, after the kernel has completed
+  err = clEnqueueUnmapMemObject(cl.queue, d_a, h_a, 0, NULL, NULL);
+  checkError(err, "unmapping d_a");
+  err = clEnqueueUnmapMemObject(cl.queue, d_b, h_b, 0, NULL, NULL);
+  checkError(err, "unmapping d_b");
+  err = clEnqueueUnmapMemObject(cl.queue, d_c, h_c, 0, NULL, NULL);
+  checkError(err, "unmapping d_c");
+
+  return errors;
+}
+
+// Launch the kernel while the output buffer d_c is still mapped
+// Should result in "Invalid write to mapped buffer" error
+unsigned run3(Context cl, cl_kernel kernel,
+              cl_mem d_a, cl_mem d_b, cl_mem d_c, size_t N)
+{
+  cl_int err;
+  float *h_a, *h_b, *h_c;
+  size_t dataSize = N*sizeof(cl_float);
+
+  // Initialise data; NOTE(review): h_c is written but mapped with CL_MAP_READ
+  srand(0);
+  h_a = clEnqueueMapBuffer(cl.queue, d_a, CL_TRUE,
+                           CL_MAP_WRITE_INVALIDATE_REGION,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_a buffer");
+  h_b = clEnqueueMapBuffer(cl.queue, d_b, CL_TRUE,
+                           CL_MAP_WRITE_INVALIDATE_REGION,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_b buffer");
+  h_c = clEnqueueMapBuffer(cl.queue, d_c, CL_FALSE, CL_MAP_READ, 0, dataSize,
+                           0, NULL, NULL, &err);
+  checkError(err, "mapping d_c buffer");
+  for (unsigned i = 0; i < N; i++)
+  {
+    h_a[i] = rand()/(float)RAND_MAX;
+    h_b[i] = rand()/(float)RAND_MAX;
+    h_c[i] = 0;
+  }
+  // Only the inputs are unmapped; d_c stays mapped while the kernel runs
+  err = clEnqueueUnmapMemObject(cl.queue, d_a, h_a, 0, NULL, NULL);
+  checkError(err, "unmapping d_a");
+  err = clEnqueueUnmapMemObject(cl.queue, d_b, h_b, 0, NULL, NULL);
+  checkError(err, "unmapping d_b");
+
+  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
+  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
+  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
+  checkError(err, "setting kernel args");
+
+  err = clEnqueueNDRangeKernel(cl.queue, kernel,
+                               1, NULL, &N, NULL, 0, NULL, NULL);
+  checkError(err, "enqueuing kernel");
+
+  err = clFinish(cl.queue);
+  checkError(err, "running kernel");
+  // h_c still comes from the initial mapping; d_c is not re-mapped here
+  unsigned errors = checkResults(N, h_a, h_b, h_c);
+
+  err = clEnqueueUnmapMemObject(cl.queue, d_c, h_c, 0, NULL, NULL);
+  checkError(err, "unmapping d_c");
+
+  return errors;
+}
+
+// Map the input buffers a second time, for reading, before unmapping them
+// Should not result in any error
+unsigned run4(Context cl, cl_kernel kernel,
+              cl_mem d_a, cl_mem d_b, cl_mem d_c, size_t N)
+{
+  cl_int err;
+  float *h_a, *h_b, *h_c;
+  size_t dataSize = N*sizeof(cl_float);
+
+  // Initialise data; NOTE(review): h_c is written but mapped with CL_MAP_READ
+  srand(0);
+  h_a = clEnqueueMapBuffer(cl.queue, d_a, CL_TRUE,
+                           CL_MAP_WRITE_INVALIDATE_REGION,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_a buffer");
+  h_b = clEnqueueMapBuffer(cl.queue, d_b, CL_TRUE,
+                           CL_MAP_WRITE_INVALIDATE_REGION,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_b buffer");
+  h_c = clEnqueueMapBuffer(cl.queue, d_c, CL_FALSE, CL_MAP_READ, 0, dataSize,
+                           0, NULL, NULL, &err);
+  checkError(err, "mapping d_c buffer");
+  for (unsigned i = 0; i < N; i++)
+  {
+    h_a[i] = rand()/(float)RAND_MAX;
+    h_b[i] = rand()/(float)RAND_MAX;
+    h_c[i] = 0;
+  }
+  // Map d_a/d_b again with CL_MAP_READ; no unmap has been issued since the first
+  h_a = clEnqueueMapBuffer(cl.queue, d_a, CL_TRUE,
+                           CL_MAP_READ,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_a buffer");
+  h_b = clEnqueueMapBuffer(cl.queue, d_b, CL_TRUE,
+                           CL_MAP_READ,
+                           0, dataSize, 0, NULL, NULL, &err);
+  checkError(err, "mapping d_b buffer");
+  // NOTE(review): d_a/d_b were mapped twice but are unmapped once - confirm
+  err = clEnqueueUnmapMemObject(cl.queue, d_a, h_a, 0, NULL, NULL);
+  checkError(err, "unmapping d_a");
+  err = clEnqueueUnmapMemObject(cl.queue, d_b, h_b, 0, NULL, NULL);
+  checkError(err, "unmapping d_b");
+  err = clEnqueueUnmapMemObject(cl.queue, d_c, h_c, 0, NULL, NULL);
+  checkError(err, "unmapping d_c");
+
+  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
+  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
+  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
+  checkError(err, "setting kernel args");
+
+  err = clEnqueueNDRangeKernel(cl.queue, kernel,
+                               1, NULL, &N, NULL, 0, NULL, NULL);
+  checkError(err, "enqueuing kernel");
+  // Map the result for reading; CL_FALSE = non-blocking, clFinish waits below
+  h_c = clEnqueueMapBuffer(cl.queue, d_c, CL_FALSE, CL_MAP_READ, 0, dataSize,
+                           0, NULL, NULL, &err);
+  checkError(err, "mapping d_c buffer");
+
+  err = clFinish(cl.queue);
+  checkError(err, "running kernel");
+
+  unsigned errors = checkResults(N, h_a, h_b, h_c);
+
+  err = clEnqueueUnmapMemObject(cl.queue, d_c, h_c, 0, NULL, NULL);
+  checkError(err, "unmapping d_c");
+
+  return errors;
+}
+
+int main(int argc, char *argv[])
+{
+  // Builds the vecadd kernel and three N-element buffers, then runs run1-run4
+  cl_int err;
+  cl_mem d_a, d_b, d_c;
+
+  // Optional argv[1] is the element count N; reject anything that is not >= 1
+  long requested = (argc > 1) ? strtol(argv[1], NULL, 10) : 1;
+  if (requested < 1)
+  {
+    fprintf(stderr, "N must be a positive integer\n");
+    return 1;
+  }
+  size_t N = (size_t)requested;
+
+  Context cl = createContext(KERNEL_SOURCE);
+  cl_kernel kernel = clCreateKernel(cl.program, "vecadd", &err);
+  checkError(err, "creating kernel");
+
+  size_t dataSize = N*sizeof(cl_float);
+  d_a = clCreateBuffer(cl.context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
+  checkError(err, "creating d_a buffer");
+  d_b = clCreateBuffer(cl.context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
+  checkError(err, "creating d_b buffer");
+  d_c = clCreateBuffer(cl.context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err);
+  checkError(err, "creating d_c buffer");
+
+  unsigned errors = 0; // total of the mismatch counts returned by the runs
+  errors += run1(cl, kernel, d_a, d_b, d_c, N);
+  errors += run2(cl, kernel, d_a, d_b, d_c, N);
+  errors += run3(cl, kernel, d_a, d_b, d_c, N);
+  errors += run4(cl, kernel, d_a, d_b, d_c, N);
+
+  clReleaseMemObject(d_a);
+  clReleaseMemObject(d_b);
+  clReleaseMemObject(d_c);
+  clReleaseKernel(kernel);
+  releaseContext(cl);
+  // Exit status: 0 if no run reported a mismatch, 1 otherwise
+  return (errors != 0);
+}
+
+unsigned checkResults(size_t N, float *a, float *b, float *results)
+{
+  // Compare results[i] with a[i]+b[i] for i < N; returns the mismatch count
+  unsigned errors = 0;
+  for (unsigned i = 0; i < N; i++)
+  {
+    float ref = a[i] + b[i];
+    if (fabs(ref - results[i]) > TOL)
+    {
+      if (errors < MAX_ERRORS) // report only the first MAX_ERRORS mismatches
+      {
+        fprintf(stderr, "%4u: %.4f != %.4f\n", i, results[i], ref);
+      }
+      errors++;
+    }
+  }
+  if (errors)
+    printf("%u errors detected\n", errors);
+
+  return errors;
+}
diff --git a/tests/runtime/map_buffer.ref b/tests/runtime/map_buffer.ref
new file mode 100644
index 0000000..a64878d
--- /dev/null
+++ b/tests/runtime/map_buffer.ref
@@ -0,0 +1,4 @@
+ERROR Invalid read from buffer mapped for writing
+ERROR Invalid read from buffer mapped for writing
+
+ERROR Invalid write to mapped buffer

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/oclgrind.git



More information about the Pkg-opencl-commits mailing list