[beignet] 01/10: Imported Upstream version 1.3.0

Rebecca Palmer rnpalmer-guest at moszumanska.debian.org
Sun Jan 22 22:40:00 UTC 2017

This is an automated email from the git hooks/post-receive script.

rnpalmer-guest pushed a commit to branch master
in repository beignet.

commit 037c0b29c5b06f89e64ffc50542cbc89205317aa
Author: Rebecca N. Palmer <rebecca_palmer at zoho.com>
Date:   Sat Jan 21 13:30:00 2017 +0000

    Imported Upstream version 1.3.0
 Android.common.mk                                  |     2 +-
 CMake/FindMesaSrc.cmake                            |    26 -
 CMakeLists.txt                                     |   102 +-
 GetGenID.sh                                        |    50 +-
 backend/CMakeLists.txt                             |     5 +-
 backend/src/Android.mk                             |    11 +
 backend/src/CMakeLists.txt                         |    15 +
 backend/src/GBEConfig.h.in                         |     2 +
 backend/src/backend/context.cpp                    |     9 +-
 backend/src/backend/gen/gen_mesa_disasm.c          |    77 +-
 backend/src/backend/gen75_encoder.cpp              |     4 +-
 backend/src/backend/gen75_encoder.hpp              |     4 +-
 backend/src/backend/gen7_encoder.cpp               |     2 +-
 backend/src/backend/gen7_encoder.hpp               |     2 +-
 backend/src/backend/gen8_context.cpp               |   221 +-
 backend/src/backend/gen8_context.hpp               |     8 +
 backend/src/backend/gen8_encoder.cpp               |   264 +-
 backend/src/backend/gen8_encoder.hpp               |    16 +-
 backend/src/backend/gen8_instruction.hpp           |    59 +-
 backend/src/backend/gen9_context.cpp               |    71 +-
 backend/src/backend/gen9_context.hpp               |     1 +
 backend/src/backend/gen9_encoder.cpp               |   236 +
 backend/src/backend/gen9_encoder.hpp               |    10 +-
 backend/src/backend/gen9_instruction.hpp           |    84 +
 backend/src/backend/gen_context.cpp                |   572 +-
 backend/src/backend/gen_context.hpp                |     9 +-
 backend/src/backend/gen_defs.hpp                   |    13 +
 backend/src/backend/gen_encoder.cpp                |   215 +-
 backend/src/backend/gen_encoder.hpp                |    49 +-
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |     7 +
 backend/src/backend/gen_insn_selection.cpp         |  2125 +++-
 backend/src/backend/gen_insn_selection.hpp         |    16 +-
 backend/src/backend/gen_insn_selection.hxx         |     8 +
 .../src/backend/gen_insn_selection_optimize.cpp    |    22 +-
 backend/src/backend/gen_insn_selection_output.cpp  |   138 +-
 backend/src/backend/gen_insn_selection_output.hpp  |     2 +-
 backend/src/backend/gen_program.cpp                |    21 +-
 backend/src/backend/gen_reg_allocation.cpp         |    72 +-
 backend/src/backend/gen_register.hpp               |    18 +
 backend/src/backend/program.cpp                    |   165 +-
 backend/src/backend/program.h                      |    22 +
 backend/src/backend/program.hpp                    |    24 +
 backend/src/gbe_bin_interpreter.cpp                |     6 +
 backend/src/ir/constant.cpp                        |     6 +-
 backend/src/ir/constant.hpp                        |     7 +-
 backend/src/ir/function.cpp                        |     7 +-
 backend/src/ir/function.hpp                        |    17 +-
 backend/src/ir/instruction.cpp                     |    81 +-
 backend/src/ir/instruction.hpp                     |    15 +-
 backend/src/ir/instruction.hxx                     |     1 +
 backend/src/ir/lowering.cpp                        |    11 +-
 backend/src/ir/profile.cpp                         |    17 +-
 backend/src/ir/profile.hpp                         |    54 +-
 backend/src/ir/profiling.cpp                       |     2 +-
 backend/src/ir/register.cpp                        |     3 +
 backend/src/ir/register.hpp                        |     8 +-
 backend/src/ir/reloc.cpp                           |    87 +
 backend/src/ir/reloc.hpp                           |    90 +
 backend/src/ir/type.hpp                            |     4 +-
 backend/src/ir/unit.cpp                            |     6 +-
 backend/src/ir/unit.hpp                            |    15 +-
 backend/src/libocl/Android.mk                      |     1 -
 backend/src/libocl/CMakeLists.txt                  |   145 +-
 backend/src/libocl/include/ocl.h                   |    13 +-
 backend/src/libocl/include/ocl_atom_20.h           |   188 +
 backend/src/libocl/include/ocl_enqueue.h           |    90 +
 backend/src/libocl/include/ocl_image.h             |   222 +-
 backend/src/libocl/include/ocl_misc.h              |     9 +
 backend/src/libocl/include/ocl_pipe.h              |    51 +
 backend/src/libocl/include/ocl_sync.h              |     7 +-
 backend/src/libocl/include/ocl_types.h             |    43 +-
 backend/src/libocl/include/ocl_vload_20.h          |   150 +
 backend/src/libocl/include/ocl_workitem.h          |    20 +-
 backend/src/libocl/script/gen_vector.py            |     5 +-
 backend/src/libocl/script/ocl_integer.def          |     1 +
 backend/src/libocl/script/ocl_math_20.def          |   151 +
 backend/src/libocl/src/ocl_atom_20.cl              |   381 +
 backend/src/libocl/src/ocl_atomic_20.ll            |   165 +
 backend/src/libocl/src/ocl_barrier.ll              |    27 +-
 backend/src/libocl/src/ocl_barrier_20.ll           |    25 +
 backend/src/libocl/src/ocl_clz_20.ll               |    65 +
 backend/src/libocl/src/ocl_ctz.ll                  |    65 +
 backend/src/libocl/src/ocl_ctz_20.ll               |    65 +
 backend/src/libocl/src/ocl_enqueue.cl              |   238 +
 backend/src/libocl/src/ocl_geometric.cl            |     4 +
 backend/src/libocl/src/ocl_image.cl                |   218 +-
 backend/src/libocl/src/ocl_memcpy.cl               |    15 +
 backend/src/libocl/src/ocl_memset.cl               |     3 +
 backend/src/libocl/src/ocl_misc.cl                 |    24 +
 backend/src/libocl/src/ocl_pipe.cl                 |   296 +
 backend/src/libocl/src/ocl_sync.cl                 |     6 +-
 backend/src/libocl/src/ocl_vload_20.cl             |   284 +
 backend/src/libocl/src/ocl_workitem.cl             |    25 +-
 backend/src/libocl/tmpl/ocl_defines.tmpl.h         |     7 +-
 backend/src/libocl/tmpl/ocl_integer.tmpl.cl        |    12 +
 backend/src/libocl/tmpl/ocl_integer.tmpl.h         |    18 +
 backend/src/libocl/tmpl/ocl_math_20.tmpl.cl        |  3801 ++++++
 backend/src/libocl/tmpl/ocl_math_20.tmpl.h         |   209 +
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl           |   292 +-
 backend/src/libocl/tmpl/ocl_simd.tmpl.h            |   131 +-
 backend/src/llvm/ExpandLargeIntegers.cpp           |     5 +-
 backend/src/llvm/PromoteIntegers.cpp               |     1 +
 backend/src/llvm/StripAttributes.cpp               |     6 +-
 backend/src/llvm/llvm_bitcode_link.cpp             |    49 +-
 backend/src/llvm/llvm_device_enqueue.cpp           |   417 +
 backend/src/llvm/llvm_gen_backend.cpp              |   668 +-
 backend/src/llvm/llvm_gen_backend.hpp              |     7 +-
 backend/src/llvm/llvm_gen_ocl_function.hxx         |    66 +-
 backend/src/llvm/llvm_intrinsic_lowering.cpp       |     2 +
 backend/src/llvm/llvm_passes.cpp                   |    25 +-
 backend/src/llvm/llvm_scalarize.cpp                |    42 +-
 backend/src/llvm/llvm_to_gen.cpp                   |    22 +-
 backend/src/llvm/llvm_unroll.cpp                   |    14 +-
 backend/src/ocl_common_defines.h                   |    11 +-
 docs/Beignet.mdwn                                  |    13 +
 docs/NEWS.mdwn                                     |     3 +
 docs/howto/android-build-howto.mdwn                |    64 +
 include/CL/cl.h                                    |   389 +-
 include/CL/cl.hpp                                  | 12452 -------------------
 include/CL/cl_d3d10.h                              |     7 +-
 include/CL/cl_d3d11.h                              |     7 +-
 include/CL/cl_dx9_media_sharing.h                  |     9 +-
 include/CL/cl_egl.h                                |     9 +-
 include/CL/cl_ext.h                                |    49 +-
 include/CL/cl_gl.h                                 |     7 +-
 include/CL/cl_gl_ext.h                             |     7 +-
 include/CL/cl_platform.h                           |    38 +-
 include/CL/opencl.h                                |     7 +-
 kernels/compiler_atomic_functions_20.cl            |    53 +
 kernels/compiler_ceil64.spir                       |   Bin 0 -> 2152 bytes
 kernels/compiler_ctz.cl                            |    16 +
 kernels/compiler_device_enqueue.cl                 |    18 +
 kernels/compiler_generic_atomic.cl                 |    33 +
 kernels/compiler_generic_pointer.cl                |    33 +
 kernels/compiler_pipe_builtin.cl                   |   117 +
 kernels/compiler_program_global.cl                 |    77 +
 kernels/compiler_sub_group_shuffle.cl              |    22 +-
 kernels/compiler_sub_group_shuffle_down.cl         |    23 +-
 kernels/compiler_sub_group_shuffle_up.cl           |    23 +-
 kernels/compiler_sub_group_shuffle_xor.cl          |    23 +-
 kernels/compiler_subgroup_broadcast.cl             |    26 +-
 kernels/compiler_subgroup_buffer_block_read.cl     |    47 +-
 kernels/compiler_subgroup_buffer_block_write.cl    |    44 +-
 kernels/compiler_subgroup_image_block_read.cl      |    49 +-
 kernels/compiler_subgroup_image_block_write.cl     |    46 +-
 kernels/compiler_subgroup_reduce.cl                |    41 +
 kernels/compiler_subgroup_scan_exclusive.cl        |    55 +
 kernels/compiler_subgroup_scan_inclusive.cl        |    55 +
 src/Android.mk                                     |    18 +-
 src/CMakeLists.txt                                 |    42 +-
 src/cl_accelerator_intel.c                         |    17 +-
 src/cl_accelerator_intel.h                         |    11 +-
 src/cl_api.c                                       |  3369 +----
 src/cl_api_command_queue.c                         |   233 +
 src/cl_api_context.c                               |   174 +
 src/cl_api_device_id.c                             |    90 +
 src/cl_api_event.c                                 |   330 +
 src/cl_api_kernel.c                                |   422 +
 src/cl_api_mem.c                                   |  2435 ++++
 src/cl_api_platform_id.c                           |    65 +
 src/cl_api_program.c                               |   171 +
 src/cl_api_sampler.c                               |   127 +
 src/cl_base_object.c                               |   140 +
 src/cl_base_object.h                               |    84 +
 src/cl_cmrt.cpp                                    |     2 +-
 src/cl_command_queue.c                             |   326 +-
 src/cl_command_queue.h                             |    95 +-
 src/cl_command_queue_enqueue.c                     |   330 +
 src/cl_command_queue_gen7.c                        |   101 +-
 src/cl_context.c                                   |   270 +-
 src/cl_context.h                                   |    61 +-
 src/cl_device_enqueue.c                            |   201 +
 .../src/ocl_sync.cl => src/cl_device_enqueue.h     |    25 +-
 src/cl_device_id.c                                 |   588 +-
 src/cl_device_id.h                                 |    34 +-
 src/cl_driver.h                                    |    57 +-
 src/cl_driver_defs.c                               |    12 +-
 src/cl_enqueue.c                                   |   582 +-
 src/cl_enqueue.h                                   |    56 +-
 src/cl_event.c                                     |  1171 +-
 src/cl_event.h                                     |   148 +-
 src/cl_extensions.c                                |     5 +-
 src/cl_extensions.h                                |    11 +-
 src/cl_gbe_loader.cpp                              |    25 +
 src/cl_gbe_loader.h                                |     5 +
 src/cl_gl_api.c                                    |    19 +-
 src/cl_gt_device.h                                 |    21 +-
 src/cl_image.c                                     |    24 +-
 src/cl_image.h                                     |     1 +
 src/cl_kernel.c                                    |   112 +-
 src/cl_kernel.h                                    |    26 +-
 src/cl_khr_icd.c                                   |    30 +-
 src/cl_khr_icd.h                                   |     2 -
 src/cl_mem.c                                       |   648 +-
 src/cl_mem.h                                       |   114 +-
 src/cl_mem_gl.c                                    |     7 +-
 src/cl_platform_id.c                               |    53 +-
 src/cl_platform_id.h                               |    16 +-
 src/cl_program.c                                   |   143 +-
 src/cl_program.h                                   |    14 +-
 src/cl_sampler.c                                   |    54 +-
 src/cl_sampler.h                                   |    33 +-
 src/cl_thread.c                                    |   329 -
 src/cl_thread.h                                    |    52 -
 src/cl_utils.c                                     |    86 +
 src/cl_utils.h                                     |    90 +-
 src/intel/intel_cl_gl_share_image_info.h           |    18 +
 src/intel/intel_dri_resource_sharing.c             |   208 -
 src/intel/intel_dri_resource_sharing.h             |    39 -
 src/intel/intel_dri_resource_sharing_int.h         |   143 -
 src/intel/intel_driver.c                           |  1178 +-
 src/intel/intel_driver.h                           |     1 +
 src/intel/intel_gpgpu.c                            |    99 +-
 src/intel/intel_gpgpu.h                            |     1 +
 src/performance.c                                  |     6 +
 src/x11/mesa_egl_extension.c                       |   306 -
 src/x11/mesa_egl_extension.h                       |    20 -
 src/x11/mesa_egl_res_share.c                       |   135 -
 src/x11/mesa_egl_res_share.h                       |    44 -
 utests/CMakeLists.txt                              |    39 +-
 utests/compiler_atomic_functions_20.cpp            |   106 +
 utests/compiler_ctz.cpp                            |    62 +
 utests/compiler_device_enqueue.cpp                 |    36 +
 utests/compiler_fill_gl_image.cpp                  |    69 +-
 utests/compiler_generic_atomic.cpp                 |    45 +
 utests/compiler_generic_pointer.cpp                |    46 +
 utests/compiler_pipe_builtin.cpp                   |    69 +
 utests/compiler_program_global.cpp                 |    80 +
 utests/compiler_sampler.cpp                        |    14 +-
 utests/compiler_sub_group_shuffle.cpp              |    52 +-
 utests/compiler_sub_group_shuffle_down.cpp         |    54 +-
 utests/compiler_sub_group_shuffle_up.cpp           |    54 +-
 utests/compiler_sub_group_shuffle_xor.cpp          |    54 +-
 utests/compiler_subgroup_broadcast.cpp             |    38 +-
 utests/compiler_subgroup_buffer_block_read.cpp     |    73 +-
 utests/compiler_subgroup_buffer_block_write.cpp    |    74 +-
 utests/compiler_subgroup_image_block_read.cpp      |    98 +-
 utests/compiler_subgroup_image_block_write.cpp     |    73 +-
 utests/compiler_subgroup_reduce.cpp                |   170 +-
 utests/compiler_subgroup_scan_exclusive.cpp        |   173 +-
 utests/compiler_subgroup_scan_inclusive.cpp        |   166 +-
 utests/load_program_from_spir.cpp                  |     5 +-
 utests/multi_queue_events.cpp                      |   129 +
 utests/runtime_barrier_list.cpp                    |    11 +-
 utests/runtime_event.cpp                           |     4 +-
 utests/runtime_marker_list.cpp                     |    13 +-
 utests/runtime_pipe_query.cpp                      |    15 +
 utests/setenv.sh.in                                |     2 +
 utests/utest_helper.cpp                            |   203 +-
 utests/utest_helper.hpp                            |    11 +-
 250 files changed, 24693 insertions(+), 21135 deletions(-)

diff --git a/Android.common.mk b/Android.common.mk
index dcb3c7c..60cd23b 100644
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -2,7 +2,7 @@
 #include $(CLEAR_VARS)
 TOP_C_INCLUDE := bionic $(BEIGNET_ROOT_PATH)/include
-TOP_CPPFLAGS := -Wall -Wno-invalid-offsetof -mfpmath=sse -fno-rtti -Wcast-align -std=c++0x -msse2 -msse3 -mssse3 -msse4.1 -D__ANDROID__
+TOP_CPPFLAGS := -Wall -Wno-invalid-offsetof -mfpmath=sse -fno-rtti -Wcast-align -std=c++11 -msse2 -msse3 -mssse3 -msse4.1 -D__ANDROID__
 TOP_CFLAGS := -Wall -mfpmath=sse -msse2 -Wcast-align -msse2 -msse3 -mssse3 -msse4.1 -D__ANDROID__
 LLVM_INCLUDE_DIRS := external/llvm/device/include\
diff --git a/CMake/FindMesaSrc.cmake b/CMake/FindMesaSrc.cmake
deleted file mode 100644
index 978cb4e..0000000
--- a/CMake/FindMesaSrc.cmake
+++ /dev/null
@@ -1,26 +0,0 @@
-# Try to find mesa source code
-# Once done this will define
-# Find mesa source code.
-FIND_PATH(MESA_SOURCE_PREFIX src/mesa/main/texobj.c
-  ~/mesa
-  DOC "The mesa source directory which is needed for cl_khr_gl_sharing.")
-                         ${MESA_SOURCE_PREFIX}/include
-                         ${MESA_SOURCE_PREFIX}/src/mapi
-                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i965/
-                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i915/
-                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/common/)
-SET(MESA_SOURCE_FOUND 1 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
-SET(MESA_SOURCE_FOUND 0 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bac054c..02b5d88 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,11 +16,6 @@ endif ()
@@ -28,11 +23,6 @@ elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
   set(COMPILER "ICC")
-configure_file (
-  "src/OCLConfig.h.in"
-  "src/OCLConfig.h"
@@ -154,41 +144,38 @@ IF(DRM_INTEL_FOUND)
     MESSAGE(STATUS "Disable subslice total query support")
-  CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_pooled_eu" "" HAVE_DRM_INTEL_POOLED_EU)
+  CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_pooled_eu" ${DRM_INTEL_LIBDIR} HAVE_DRM_INTEL_POOLED_EU)
     MESSAGE(STATUS "Enable pooled eu query support")
     MESSAGE(STATUS "Disable pooled eu query support")
-  CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_min_eu_in_pool" "" HAVE_DRM_INTEL_MIN_EU_IN_POOL)
+  CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_min_eu_in_pool" ${DRM_INTEL_LIBDIR} HAVE_DRM_INTEL_MIN_EU_IN_POOL)
     MESSAGE(STATUS "Enable min eu in pool query support")
     MESSAGE(STATUS "Disable min eu in pool query support")
+  CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_bo_set_softpin_offset" ${DRM_INTEL_LIBDIR} HAVE_DRM_INTEL_BO_SET_SOFTPIN)
   MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
+#disable CMRT as default, since we do not see real case,
+#but see build issue of this feature
 pkg_check_modules(CMRT libcmrt)
 # Threads
-# OpenGL (not use cmake helper)
-pkg_check_modules(OPENGL gl)
-  MESSAGE(STATUS "Looking for OpenGL - found at ${OPENGL_PREFIX}")
-  MESSAGE(STATUS "Looking for OpenGL - not found")
 # Xext
 pkg_check_modules(XEXT REQUIRED xext)
@@ -208,20 +195,22 @@ ELSE(XFIXES_FOUND)
-pkg_check_modules(EGL egl)
-  MESSAGE(STATUS "Looking for EGL - found at ${EGL_PREFIX}")
-  MESSAGE(STATUS "Looking for EGL - not found")
-# cl_khr_gl_sharing requires to build with mesa source
-#  MESSAGE(STATUS "Looking for mesa source code - found at ${MESA_SOURCE_PREFIX}")
-#  MESSAGE(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
+OPTION(ENABLE_GL_SHARING "cl_khr_gl_sharing" OFF)
+  pkg_check_modules(OPENGL REQUIRED gl)
+    MESSAGE(STATUS "Looking for OpenGL - found at ${OPENGL_PREFIX}")
+    MESSAGE(STATUS "Looking for OpenGL - not found")
+  pkg_check_modules(EGL REQUIRED egl>=11.0.0)
+    MESSAGE(STATUS "Looking for EGL - found at ${EGL_PREFIX}")
+    MESSAGE(STATUS "Looking for EGL - not found")
@@ -238,10 +227,51 @@ ENDIF(OCLIcd_FOUND)
 OPTION(EXPERIMENTAL_DOUBLE "Enable experimental double support" OFF)
+OPTION(ENABLE_OPENCL_20 "Enable opencl 2.0 support" OFF)
+  Find_Program(LSPCI lspci)
+    MESSAGE(FATAL_ERROR "Looking for lspci - not found")
+                          RESULT_VARIABLE SUPPORT_OCL20_DEVICE
+                          OUTPUT_VARIABLE PCI_ID_NOT_USED)
+    MESSAGE(FATAL_ERROR "Only SKL and newer devices support OpenCL 2.0 now, your device don't support.")
+    MESSAGE(FATAL_ERROR "Please update libdrm to version 2.4.66 or later to enable OpenCL 2.0.")
+    MESSAGE(FATAL_ERROR "Please update LLVM to version 3.9 or later to enable OpenCL 2.0.")
+else (ENABLE_OPENCL_20)
+endif (ENABLE_OPENCL_20)
+configure_file (
+  "src/OCLConfig.h.in"
+  "src/OCLConfig.h"
diff --git a/GetGenID.sh b/GetGenID.sh
index a0e5f85..5e5cafd 100755
--- a/GetGenID.sh
+++ b/GetGenID.sh
@@ -12,34 +12,50 @@ genpciid+=(0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 0d0b 0d1b 0d2b 0d0e 0d1e
 genpciid+=(1602 1606 160a 160d 160e 1612 1616 161a 161d 161e 1622 1626 162a 162d 162e)
 genpciid+=(22b0 22b1 22b2 22b3)
+#Only enable OpenCL 2.0 after SKL.
-genpciid+=(1906 1916 1926 190e 191e 1902 1912 1932 190b 191b 192b 193b 190a 191a 192a 193a)
+genpciid_20=(1906 1916 1926 190e 191e 1902 1912 1932 190b 191b 192b 193b 190a 191a 192a 193a)
-genpciid+=(5a84 5a85)
+genpciid_20+=(5a84 5a85 1a84 1a85)
-genpciid+=(5906 5916 5926 5913 5921 5923 5927 5902 5912 5917)
-genpciid+=(590b 591b 593b 5908 590e 591e 5915 590a 591a 591d)
+genpciid_20+=(5906 5916 5926 5913 5921 5923 5927 5902 5912 5917)
+genpciid_20+=(590b 591b 593b 5908 590e 591e 5915 590a 591a 591d)
 pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F ] '{print $1}'))
 while [ $i -lt $n ]
-    id1=${pciid[$i]}
-    let j=0
+  id1=${pciid[$i]}
+  let j=0
-    while [ $j -lt $m ]
-    do
-	id2=${genpciid[$j]}
+  while [ $j -lt $m ]
+  do
+    id2=${genpciid[$j]}
-	if [ ${id1} == ${id2} ]
-	then
-	    echo ${id1}
-	    exit 0
-	fi
-	let j=j+1
-    done
+    if [ ${id1} == ${id2} ]
+    then
+      echo ${id1}
+      exit 0
+    fi
+    let j=j+1
+  done
-    let i=i+1
+  let j=0
+  while [ $j -lt $t ]
+  do
+    id2=${genpciid_20[$j]}
+    if [ ${id1} == ${id2} ]
+    then
+      echo ${id1}
+      exit 1
+    fi
+    let j=j+1
+  done
+  let i=i+1
+exit -1
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 915d60f..d2d8710 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
diff --git a/backend/src/Android.mk b/backend/src/Android.mk
index da4d787..47d4ea7 100644
--- a/backend/src/Android.mk
+++ b/backend/src/Android.mk
@@ -62,6 +62,8 @@ BACKEND_SRC_FILES:= \
     ir/immediate.cpp \
     ir/structurizer.hpp \
     ir/structurizer.cpp \
+    ir/reloc.hpp \
+    ir/reloc.cpp \
     backend/context.cpp \
     backend/context.hpp \
     backend/program.cpp \
@@ -80,6 +82,7 @@ BACKEND_SRC_FILES:= \
     llvm/PromoteIntegers.cpp \
     llvm/ExpandLargeIntegers.cpp \
     llvm/StripAttributes.cpp \
+    llvm/llvm_device_enqueue.cpp \
     llvm/llvm_to_gen.cpp \
     llvm/llvm_loadstore_optimization.cpp \
     llvm/llvm_gen_backend.hpp \
@@ -140,12 +143,16 @@ $(shell echo "  #define INTERP_OBJECT_DIR \"/system/lib64/libgbeinterp.so\"" >>
 $(shell echo "  #define OCL_BITCODE_BIN \"/system/lib/ocl/beignet.bc\"" >> $(gbe_config_file))
 $(shell echo "  #define OCL_HEADER_DIR \"/system/lib/ocl/include\"" >> $(gbe_config_file))
 $(shell echo "  #define OCL_PCH_OBJECT \"/system/lib/ocl/beignet.pch\"" >> $(gbe_config_file))
+$(shell echo "  #define OCL_BITCODE_BIN_20 \"/system/lib/ocl/beignet_20.bc\"" >> $(gbe_config_file))
+$(shell echo "  #define OCL_PCH_OBJECT_20 \"/system/lib/ocl/beigneti_20.pch\"" >> $(gbe_config_file))
 $(shell echo "#else /*__x86_64__*/" >> $(gbe_config_file))
 $(shell echo "  #define GBE_OBJECT_DIR \"/system/lib/libgbe.so\"" >> $(gbe_config_file))
 $(shell echo "  #define INTERP_OBJECT_DIR \"/system/lib/libgbeinterp.so\"" >> $(gbe_config_file))
 $(shell echo "  #define OCL_BITCODE_BIN \"/system/lib/ocl/beignet.bc\"" >> $(gbe_config_file))
 $(shell echo "  #define OCL_HEADER_DIR \"/system/lib/ocl/include\"" >> $(gbe_config_file))
 $(shell echo "  #define OCL_PCH_OBJECT \"/system/lib/ocl/beignet.pch\"" >> $(gbe_config_file))
+$(shell echo "  #define OCL_BITCODE_BIN_20 \"/system/lib/ocl/beignet_20.bc\"" >> $(gbe_config_file))
+$(shell echo "  #define OCL_PCH_OBJECT_20 \"/system/lib/ocl/beigneti_20.pch\"" >> $(gbe_config_file))
 $(shell echo "#endif" >> $(gbe_config_file))
 $(shell echo "#else /*__ANDROID__*/" >> $(gbe_config_file))
 $(shell echo "  #define GBE_OBJECT_DIR \"\"" >> $(gbe_config_file))
@@ -153,6 +160,8 @@ $(shell echo "  #define INTERP_OBJECT_DIR \"\"" >> $(gbe_config_file))
 $(shell echo "  #define OCL_BITCODE_BIN \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
 $(shell echo "  #define OCL_HEADER_DIR \"`pwd $(TOP)`/$(generated_path)/libocl/include\"" >> $(gbe_config_file))
 $(shell echo "  #define OCL_PCH_OBJECT \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
+$(shell echo "  #define OCL_BITCODE_BIN_20 \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
+$(shell echo "  #define OCL_PCH_OBJECT_20 \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
 $(shell echo "#endif" >> $(gbe_config_file))
 #Build HOST libgbe.so
@@ -162,6 +171,8 @@ LOCAL_C_INCLUDES := $(TOP_C_INCLUDE) \
+LOCAL_CPPFLAGS += -Wno-extra-semi -Wno-gnu-anonymous-struct -Wno-nested-anon-types
+LOCAL_CFLAGS += -Wno-extra-semi -Wno-gnu-anonymous-struct -Wno-nested-anon-types
 LOCAL_LDLIBS += -lpthread -lm -ldl -lLLVM -lclang
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 41eb5ec..7c1f4db 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -3,6 +3,10 @@ set (OCL_HEADER_DIR "${BEIGNET_INSTALL_DIR}/include")
 set (OCL_PCH_OBJECT "${BEIGNET_INSTALL_DIR}/beignet.pch")
 set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
+set (OCL_BITCODE_BIN_20 "${BEIGNET_INSTALL_DIR}/beignet_20.bc")
+set (OCL_PCH_OBJECT_20 "${BEIGNET_INSTALL_DIR}/beignet_20.pch")
+endif (ENABLE_OPENCL_20)
 configure_file (
@@ -19,6 +23,10 @@ set (LOCAL_INTERP_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbeinterp.so" PAREN
+set (LOCAL_OCL_PCH_OBJECT_20 "${OCL_OBJECT_DIR}/beignet_20.local.pch" PARENT_SCOPE)
+endif (ENABLE_OPENCL_20)
 set (GBE_SRC
@@ -73,6 +81,8 @@ set (GBE_SRC
+    ir/reloc.hpp
+    ir/reloc.cpp
@@ -91,6 +101,7 @@ set (GBE_SRC
+    llvm/llvm_device_enqueue.cpp
@@ -195,6 +206,10 @@ endif ()
+endif (ENABLE_OPENCL_20)
diff --git a/backend/src/GBEConfig.h.in b/backend/src/GBEConfig.h.in
index b5bec14..9514483 100644
--- a/backend/src/GBEConfig.h.in
+++ b/backend/src/GBEConfig.h.in
@@ -6,3 +6,5 @@
+#define OCL_PCH_OBJECT_20 "@OCL_PCH_OBJECT_20@"
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 675dc78..e9ddd17 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -345,7 +345,7 @@ namespace gbe
   Context::Context(const ir::Unit &unit, const std::string &name) :
     unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL), useDWLabel(false)
-    GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
+    GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS || unit.getPointerSize() == ir::POINTER_64_BITS);
     this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn), true);
     this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
     // r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
@@ -393,6 +393,7 @@ namespace gbe
     if(this->kernel != NULL) {
       this->kernel->scratchSize = this->alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
       this->kernel->ctx = this;
+      this->kernel->setUseDeviceEnqueue(fn.getUseDeviceEnqueue());
     return this->kernel;
@@ -471,6 +472,7 @@ namespace gbe
       kernel->args[argID].info.accessQual = arg.info.accessQual;
       kernel->args[argID].info.typeQual = arg.info.typeQual;
       kernel->args[argID].info.argName = arg.info.argName;
+      kernel->args[argID].info.typeSize = arg.info.typeSize;
       switch (arg.type) {
         case ir::FunctionArgument::VALUE:
         case ir::FunctionArgument::STRUCTURE:
@@ -498,6 +500,11 @@ namespace gbe
           kernel->args[argID].type = GBE_ARG_SAMPLER;
           kernel->args[argID].size = sizeof(void*);
+        case ir::FunctionArgument::PIPE:
+          kernel->args[argID].type = GBE_ARG_PIPE;
+          kernel->args[argID].size = sizeof(void*);
+          kernel->args[argID].bti = arg.bti;
+          break;
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 5653275..56fda89 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -50,6 +50,7 @@
 #include "backend/gen_defs.hpp"
 #include "backend/gen7_instruction.hpp"
+#include "backend/gen9_instruction.hpp"
 #include "src/cl_device_data.h"
 static const struct {
@@ -70,6 +71,7 @@ static const struct {
   [GEN_OPCODE_CBIT] = { .name = "cbit", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_F16TO32] = { .name = "f16to32", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_F32TO16] = { .name = "f32to16", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_BFREV] = { .name = "bfrev", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
@@ -103,6 +105,7 @@ static const struct {
   [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SENDS] = { .name = "sends", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -495,6 +498,24 @@ static const char *data_port1_data_cache_msg_type[] = {
   [13] = "Typed Surface Write",
+static const char *atomic_opration_type[] = {
+  [1] = "and",
+  [2] = "or",
+  [3] = "xor",
+  [4] = "xchg",
+  [5] = "inc",
+  [6] = "dec",
+  [7] = "add",
+  [8] = "sub",
+  [9] = "rsub",
+  [10] = "imax",
+  [11] = "imin",
+  [12] = "umax",
+  [13] = "umin",
+  [14] = "cmpxchg",
+  [15] = "invalid"
 static int column;
 static int gen_version;
@@ -573,6 +594,7 @@ static int gen_version;
 #define UNTYPED_RW_MSG_TYPE(inst)  GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.msg_type)
 #define BYTE_RW_SIMD_MODE(inst)    GEN_BITS_FIELD(inst, bits3.gen7_byte_rw.simd_mode)
 #define BYTE_RW_DATA_SIZE(inst)    GEN_BITS_FIELD(inst, bits3.gen7_byte_rw.data_size)
+#define UNTYPED_RW_AOP_TYPE(inst)  GEN_BITS_FIELD2(inst, bits3.gen7_atomic_op.aop_type, bits3.gen8_atomic_a64.aop_type)
 #define SCRATCH_RW_OFFSET(inst)    GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.offset)
 #define SCRATCH_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.block_size)
 #define SCRATCH_RW_INVALIDATE_AFTER_READ(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.invalidate_after_read)
@@ -1391,7 +1413,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
   } else if (OPCODE(inst) != GEN_OPCODE_SEND &&
-             OPCODE(inst) != GEN_OPCODE_SENDC) {
+             OPCODE(inst) != GEN_OPCODE_SENDC &&
+             OPCODE(inst) != GEN_OPCODE_SENDS) {
     err |= control(file, "conditional modifier", conditional_modifier,
                    COND_DST_OR_MODIFIER(inst), NULL);
     if (COND_DST_OR_MODIFIER(inst))
@@ -1406,7 +1429,20 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
     string(file, ")");
-  if (opcode[OPCODE(inst)].nsrc == 3) {
+  if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+    const union Gen9NativeInstruction *gen9_insn = (const union Gen9NativeInstruction *)inst;
+    pad(file, 16);
+    if (gen9_insn->bits1.sends.dest_reg_file_0 == 0)
+      reg(file, GEN_ARCHITECTURE_REGISTER_FILE, gen9_insn->bits1.sends.dest_reg_nr);
+    else
+      format(file, "g%d", gen9_insn->bits1.sends.dest_reg_nr);
+    pad(file, 32);
+    format(file, "g%d(addLen:%d)", gen9_insn->bits2.sends.src0_reg_nr, GENERIC_MSG_LENGTH(inst));
+    pad(file, 48);
+    format(file, "g%d(dataLen:%d)", gen9_insn->bits1.sends.src1_reg_nr, gen9_insn->bits2.sends.src1_length);
+    pad(file, 64);
+    format(file, "0x%08x", gen9_insn->bits3.ud);
+  } else if (opcode[OPCODE(inst)].nsrc == 3) {
     pad(file, 16);
     err |= dest_3src(file, inst);
@@ -1449,7 +1485,8 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
   if (OPCODE(inst) == GEN_OPCODE_SEND ||
-      OPCODE(inst) == GEN_OPCODE_SENDC) {
+      OPCODE(inst) == GEN_OPCODE_SENDC ||
+      OPCODE(inst) == GEN_OPCODE_SENDS) {
     enum GenMessageTarget target = COND_DST_OR_MODIFIER(inst);
@@ -1464,7 +1501,13 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
                      target, &space);
-    if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+    int immbti = 0;
+    if (OPCODE(inst) == GEN_OPCODE_SENDS) {
+      const union Gen9NativeInstruction *gen9_insn = (const union Gen9NativeInstruction *)inst;
+      immbti = !(gen9_insn->bits2.sends.sel_reg32_desc);
+    } else
+      immbti = (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE);
+    if (immbti) {
       switch (target) {
           format(file, " (bti: %d, msg_type: %d)",
@@ -1509,6 +1552,14 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
+            else if(UNTYPED_RW_MSG_TYPE(inst) == 6)
+              format(file, " (bti: %d, rgba: %d, %s, %s, %s, %s)",
+                  UNTYPED_RW_BTI(inst),
+                  UNTYPED_RW_RGBA(inst),
+                  data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+                  data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                  data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)],
+                  atomic_opration_type[UNTYPED_RW_AOP_TYPE(inst)]);
               format(file, " not implemented");
           } else {
@@ -1526,13 +1577,21 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
+            else if(UNTYPED_RW_MSG_TYPE(inst) == 2)
+              format(file, " (bti: %d, rgba: %d, %s, %s, %s, %s)",
+                  UNTYPED_RW_BTI(inst),
+                  UNTYPED_RW_RGBA(inst),
+                  data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+                  data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                  data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)],
+                  atomic_opration_type[UNTYPED_RW_AOP_TYPE(inst)]);
               format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
-                     UNTYPED_RW_BTI(inst),
-                     UNTYPED_RW_RGBA(inst),
-                     data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
-                     data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
-                     data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+                  UNTYPED_RW_BTI(inst),
+                  UNTYPED_RW_RGBA(inst),
+                  data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+                  data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                  data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
           format(file, " (bti: %d, %s)",
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
index fc37991..b82cc43 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -126,7 +126,7 @@ namespace gbe
     return gen7_insn->bits3.ud;
-  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
@@ -199,7 +199,7 @@ namespace gbe
     return insn->bits3.ud;
-  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
index d06f393..8877a50 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -42,9 +42,9 @@ namespace gbe
     virtual void JMPI(GenRegister src, bool longjmp = false);
     /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
     virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
-    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum, bool useSends);
     virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
-    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends);
     virtual void setHeader(GenNativeInstruction *insn);
     virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
                    uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index 4f35491..4b2cd9a 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -280,7 +280,7 @@ namespace gbe
-  void Gen7Encoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+  void Gen7Encoder::MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t size, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     const uint32_t msg_length = 1 + size;
     const uint32_t response_length = 0; // Size of registers
diff --git a/backend/src/backend/gen7_encoder.hpp b/backend/src/backend/gen7_encoder.hpp
index edb711d..7585b34 100644
--- a/backend/src/backend/gen7_encoder.hpp
+++ b/backend/src/backend/gen7_encoder.hpp
@@ -45,7 +45,7 @@ namespace gbe
     /*! MBlock read */
     virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
     /*! MBlock write */
-    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+    virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t elemSize, bool useSends);
 #endif /* __GBE_GEN7_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 09b38b2..34baee8 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -900,6 +900,32 @@ namespace gbe
+  void Gen8Context::emitUntypedReadA64Instruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t elemNum = insn.extra.elem;
+    p->UNTYPED_READA64(dst, src, elemNum);
+  }
+  void Gen8Context::emitUntypedWriteA64Instruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t elemNum = insn.extra.elem;
+    p->UNTYPED_WRITEA64(src, elemNum);
+  }
+  void Gen8Context::emitByteGatherA64Instruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t elemSize = insn.extra.elem;
+    p->BYTE_GATHERA64(dst, src, elemSize);
+  }
+  void Gen8Context::emitByteScatterA64Instruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t elemSize = insn.extra.elem;
+    p->BYTE_SCATTERA64(src, elemSize);
+  }
   void Gen8Context::emitRead64Instruction(const SelectionInstruction &insn)
     const uint32_t elemNum = insn.extra.elem;
@@ -942,6 +968,7 @@ namespace gbe
     GBE_ASSERT(elemNum == 1);
     const GenRegister addr = ra->genReg(insn.src(elemNum));
     const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
+    GenRegister data = ra->genReg(insn.src(elemNum+1));
     /* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
        which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -952,11 +979,15 @@ namespace gbe
     if (bti.file == GEN_IMMEDIATE_VALUE) {
-      p->UNTYPED_WRITE(addr, bti, elemNum*2);
+      p->UNTYPED_WRITE(addr, data, bti, elemNum*2, insn.extra.splitSend);
     } else {
       const GenRegister tmp = ra->genReg(insn.dst(elemNum));
       const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
-      unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+      unsigned desc = 0;
+      if (insn.extra.splitSend)
+        desc = p->generateUntypedWriteSendsMessageDesc(0, elemNum*2);
+      else
+        desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
       unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
@@ -964,11 +995,56 @@ namespace gbe
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-        p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+        p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum*2, insn.extra.splitSend);
       afterMessage(insn, bti, tmp, btiTmp, jip0);
+  void Gen8Context::emitRead64A64Instruction(const SelectionInstruction &insn) {
+    const uint32_t elemNum = insn.extra.elem;
+    GBE_ASSERT(elemNum == 1);
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    /* BDW's 64-bit store and load send instructions require the bti to be surfaceless,
+       which we cannot accept, so we just fall back to a 2 DW untyped read here. */
+    p->UNTYPED_READA64(dst, src, 2*elemNum);
+    for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+      GenRegister long_tmp = ra->genReg(insn.dst(elemID));
+      GenRegister the_long = ra->genReg(insn.dst(elemID + elemNum));
+      this->packLongVec(long_tmp, the_long, p->curr.execWidth);
+    }
+  }
+  void Gen8Context::emitWrite64A64Instruction(const SelectionInstruction &insn)
+  {
+    const uint32_t elemNum = insn.extra.elem;
+    GBE_ASSERT(elemNum == 1);
+    const GenRegister addr = ra->genReg(insn.src(elemNum));
+    /* BDW's 64-bit store and load send instructions require the bti to be surfaceless,
+       which we cannot accept, so we just fall back to a 2 DW untyped write here. */
+    for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+      GenRegister the_long = ra->genReg(insn.src(elemID));
+      GenRegister long_tmp = ra->genReg(insn.src(elemNum + 1 + elemID));
+      this->unpackLongVec(the_long, long_tmp, p->curr.execWidth);
+    }
+    p->UNTYPED_WRITEA64(addr, elemNum*2);
+  }
+  void Gen8Context::emitAtomicA64Instruction(const SelectionInstruction &insn)
+  {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const uint32_t function = insn.extra.function;
+    unsigned srcNum = insn.extra.elem;
+    const GenRegister bti = ra->genReg(insn.src(srcNum));
+    GBE_ASSERT(bti.value.ud == 0xff);
+    p->ATOMICA64(dst, function, src, bti, srcNum);
+  }
   void Gen8Context::emitPackLongInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
     const GenRegister dst = ra->genReg(insn.dst(0));
@@ -983,7 +1059,7 @@ namespace gbe
     const GenRegister dst = ra->genReg(insn.dst(0));
     /* Scalar register need not to convert. */
-    GBE_ASSERT(dst.hstride != GEN_HORIZONTAL_STRIDE_0 && src.hstride != GEN_HORIZONTAL_STRIDE_0);
+    GBE_ASSERT(dst.hstride != GEN_HORIZONTAL_STRIDE_0);
     this->unpackLongVec(src, dst, p->curr.execWidth);
@@ -1280,7 +1356,7 @@ namespace gbe
       nextDst = GenRegister::Qn(tempDst, 1);
       p->MOV(nextDst, nextSrc);
-    p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+    p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
     p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
@@ -1296,7 +1372,7 @@ namespace gbe
       nextDst = GenRegister::Qn(tempDst, 1);
       p->MOV(nextDst, nextSrc);
-    p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+    p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
     p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
@@ -1317,6 +1393,67 @@ namespace gbe
+  void ChvContext::emitStackPointer(void) {
+    using namespace ir;
+    // Only emit stack pointer computation if we use a stack
+    if (kernel->getStackSize() == 0)
+      return;
+    // Check that everything is consistent in the kernel code
+    const uint32_t perLaneSize = kernel->getStackSize();
+    GBE_ASSERT(perLaneSize > 0);
+    const GenRegister selStatckPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStatckPtr);
+    // borrow block ip as temporary register as we will
+    // initialize block ip later.
+    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+    const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+    loadLaneID(stackptr);
+    // We compute the per-lane stack pointer here
+    // threadId * perThreadSize + laneId*perLaneSize or
+    // (threadId * simdWidth + laneId)*perLaneSize
+    // let private address start from zero
+    //p->MOV(stackptr, GenRegister::immud(0));
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+      p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth));  //threadId * simdWidth
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg);  //threadId * simdWidth + laneId, must < 64K
+      p->curr.execWidth = 1;
+      p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
+      p->curr.execWidth = this->simdWidth;
+      p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId)*perLaneSize
+      if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+        const GenRegister selStatckPtr2 = this->simdWidth == 8 ?
+          GenRegister::ul8grf(ir::ocl::stackptr) :
+          GenRegister::ul16grf(ir::ocl::stackptr);
+        GenRegister stackptr2 = ra->genReg(selStatckPtr2);
+        GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
+        int simdWidth = p->curr.execWidth;
+        if (simdWidth == 16) {
+          // We need to do the second quarter first, because the dst type is QW
+          // while the src is DW. If we did the first quarter first, the 1st
+          // quarter's dst would overlap the 2nd quarter's src.
+          p->curr.execWidth = 8;
+          p->curr.quarterControl = GEN_COMPRESSION_Q2;
+          p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr,1));
+          p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp,1));
+        }
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        p->MOV(sp, stackptr);
+        p->MOV(stackptr2, sp);
+      }
+    p->pop();
+  }
   /* Init value according to WORKGROUP OP
    * Emit assert is invalid combination operation - datatype */
   static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
@@ -1351,6 +1488,10 @@ namespace gbe
         p->MOV(dataReg, GenRegister::immint64(0x0));
       else if (dataReg.type == GEN_TYPE_UL)
         p->MOV(dataReg, GenRegister::immuint64(0x0));
+      else if (dataReg.type == GEN_TYPE_W)
+        p->MOV(dataReg, GenRegister::immw(0x0));
+      else if (dataReg.type == GEN_TYPE_UW)
+        p->MOV(dataReg, GenRegister::immuw(0x0));
         GBE_ASSERT(0); /* unsupported data-type */
@@ -1371,6 +1512,10 @@ namespace gbe
         p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
       else if (dataReg.type == GEN_TYPE_UL)
         p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+      else if (dataReg.type == GEN_TYPE_W)
+        p->MOV(dataReg, GenRegister::immw(0x7FFF));
+      else if (dataReg.type == GEN_TYPE_UW)
+        p->MOV(dataReg, GenRegister::immuw(0xFFFF));
         GBE_ASSERT(0); /* unsupported data-type */
@@ -1391,6 +1536,10 @@ namespace gbe
         p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
       else if (dataReg.type == GEN_TYPE_UL)
         p->MOV(dataReg, GenRegister::immuint64(0x0));
+      else if (dataReg.type == GEN_TYPE_W)
+        p->MOV(dataReg, GenRegister::immw(0x8000));
+      else if (dataReg.type == GEN_TYPE_UW)
+        p->MOV(dataReg, GenRegister::immuw(0x0));
         GBE_ASSERT(0); /* unsupported data-type */
@@ -1650,7 +1799,7 @@ namespace gbe
     GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
     GenRegister localBarrier = ra->genReg(insn.src(5));
-    uint32_t wg_op = insn.extra.workgroupOp;
+    uint32_t wg_op = insn.extra.wgop.workgroupOp;
     uint32_t simd = p->curr.execWidth;
     int32_t jip0, jip1;
@@ -1669,8 +1818,8 @@ namespace gbe
     /* use of continuous GRF allocation from insn selection */
     GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
     GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
-    GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
-    GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
+    GenRegister msgAddr = GenRegister::retype(msg, GEN_TYPE_UD);
+    GenRegister msgData = GenRegister::retype(ra->genReg(insn.dst(3)), dst.type);
     /* do some calculation within each thread */
     wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
@@ -1705,13 +1854,15 @@ namespace gbe
       GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
       GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
-      p->MOV(msgData.offset(msgData, 0), threadDataL);
-      p->MOV(msgData.offset(msgData, 1), threadDataH);
+      GenRegister msgDataL = GenRegister::retype(msgData, GEN_TYPE_D);
+      GenRegister msgDataH = msgDataL.offset(msgDataL, 1);
       p->curr.execWidth = 8;
+      p->MOV(msgDataL, threadDataL);
+      p->MOV(msgDataH, threadDataH);
       p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
       p->ADD(msgAddr, msgAddr, msgSlmOff);
-      p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+      p->UNTYPED_WRITE(msgAddr, msgData, GenRegister::immw(0xFE), 2, insn.extra.wgop.splitSend);
@@ -1719,7 +1870,7 @@ namespace gbe
       p->MOV(msgData, threadData);
       p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
       p->ADD(msgAddr, msgAddr, msgSlmOff);
-      p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+      p->UNTYPED_WRITE(msgAddr, msgData, GenRegister::immw(0xFE), 1, insn.extra.wgop.splitSend);
     /* init partialData register, it will hold the final result */
@@ -1804,30 +1955,38 @@ namespace gbe
       else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
         || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
-        p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
         /* workaround QW datatype on CMP */
         if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
-            p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 1, 0),
-                       dst.offset(dst, 1, 0), partialData);
-            p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 2, 0),
-                       dst.offset(dst, 2, 0), partialData);
-            p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 3, 0),
-                       dst.offset(dst, 3, 0), partialData);
-        }
+          p->push();
+            p->curr.execWidth = 8;
+            p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+            if (simd == 16) {
+              p->curr.execWidth = 8;
+              p->curr.quarterControl = GEN_COMPRESSION_Q2;
+              p->SEL_CMP(GEN_CONDITIONAL_LE, GenRegister::Qn(dst, 1),
+                         GenRegister::Qn(dst, 1), GenRegister::Qn(partialData, 1));
+            }
+          p->pop();
+        } else
+          p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
       else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
         || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
-        p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
         /* workaround QW datatype on CMP */
         if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
-            p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 1, 0),
-                       dst.offset(dst, 1, 0), partialData);
-            p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 2, 0),
-                       dst.offset(dst, 2, 0), partialData);
-            p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 3, 0),
-                       dst.offset(dst, 3, 0), partialData);
-        }
+          p->push();
+            p->curr.execWidth = 8;
+            p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+            if (simd == 16) {
+              p->curr.execWidth = 8;
+              p->curr.quarterControl = GEN_COMPRESSION_Q2;
+              p->SEL_CMP(GEN_CONDITIONAL_GE, GenRegister::Qn(dst, 1),
+                         GenRegister::Qn(dst, 1), GenRegister::Qn(partialData, 1));
+            }
+          p->pop();
+        } else
+          p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
@@ -1857,7 +2016,7 @@ namespace gbe
     const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(0)), dst.type);
     GenRegister threadData = ra->genReg(insn.src(1));
-    uint32_t wg_op = insn.extra.workgroupOp;
+    uint32_t wg_op = insn.extra.wgop.workgroupOp;
     uint32_t simd = p->curr.execWidth;
     /* masked elements should be properly set to init value */
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index ec1358c..6b75540 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -66,8 +66,15 @@ namespace gbe
     virtual void emitFloatToI64Instruction(const SelectionInstruction &insn);
     virtual void emitI64MADSATInstruction(const SelectionInstruction &insn);
+    virtual void emitUntypedWriteA64Instruction(const SelectionInstruction &insn);
+    virtual void emitUntypedReadA64Instruction(const SelectionInstruction &insn);
+    virtual void emitByteGatherA64Instruction(const SelectionInstruction &insn);
+    virtual void emitByteScatterA64Instruction(const SelectionInstruction &insn);
     virtual void emitWrite64Instruction(const SelectionInstruction &insn);
     virtual void emitRead64Instruction(const SelectionInstruction &insn);
+    virtual void emitWrite64A64Instruction(const SelectionInstruction &insn);
+    virtual void emitRead64A64Instruction(const SelectionInstruction &insn);
+    virtual void emitAtomicA64Instruction(const SelectionInstruction &insn);
     virtual void emitI64MULInstruction(const SelectionInstruction &insn);
     virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
@@ -118,6 +125,7 @@ namespace gbe
     virtual void newSelection(void);
     virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                            GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+    virtual void emitStackPointer(void);
 #endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 277260f..a33fbac 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -73,17 +73,36 @@ namespace gbe
     Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
     const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
-    gen8_insn->bits3.gen7_untyped_rw.msg_type = msg_type;
-    gen8_insn->bits3.gen7_untyped_rw.bti = bti;
-    gen8_insn->bits3.gen7_untyped_rw.rgba = rgba;
+    gen8_insn->bits3.gen8_untyped_rw_a64.msg_type = msg_type;
+    gen8_insn->bits3.gen8_untyped_rw_a64.bti = bti;
+    gen8_insn->bits3.gen8_untyped_rw_a64.rgba = rgba;
     if (curr.execWidth == 8)
-      gen8_insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
+      gen8_insn->bits3.gen8_untyped_rw_a64.simd_mode = GEN_UNTYPED_SIMD8;
     else if (curr.execWidth == 16)
-      gen8_insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
+      gen8_insn->bits3.gen8_untyped_rw_a64.simd_mode = GEN_UNTYPED_SIMD16;
+  static void setDPByteScatterGatherA64(GenEncoder *p,
+                                     GenNativeInstruction *insn,
+                                     uint32_t bti,
+                                     uint32_t block_size,
+                                     uint32_t data_size,
+                                     uint32_t msg_type,
+                                     uint32_t msg_length,
+                                     uint32_t response_length)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
+    Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    gen8_insn->bits3.gen8_scatter_rw_a64.msg_type = msg_type;
+    gen8_insn->bits3.gen8_scatter_rw_a64.bti = bti;
+    gen8_insn->bits3.gen8_scatter_rw_a64.data_sz = data_size;
+    gen8_insn->bits3.gen8_scatter_rw_a64.block_sz = block_size;
+    GBE_ASSERT(p->curr.execWidth == 8);
+  }
   void Gen8Encoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
                                           unsigned char msg_type, uint32_t msg_length, bool header_present)
@@ -134,7 +153,7 @@ namespace gbe
     return gen8_insn->bits3.ud;
-  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
@@ -150,6 +169,48 @@ namespace gbe
       this->setSrc1(insn, bti);
+  unsigned Gen8Encoder::setAtomicA64MessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum, int type_long) {
+    Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    assert(srcNum <= 3);
+    if (this->curr.execWidth == 8) {
+      msg_length = srcNum + 1 + type_long;
+      if(srcNum == 3 && type_long)
+        msg_length++;
+      response_length = 1 + type_long;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2 * (srcNum + 1);
+      response_length = 2;
+    } else
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
+    gen8_insn->bits3.gen8_atomic_a64.msg_type = GEN8_P1_UNTYPED_ATOMIC_A64;
+    gen8_insn->bits3.gen8_atomic_a64.bti = bti;
+    gen8_insn->bits3.gen8_atomic_a64.return_data = 1;
+    gen8_insn->bits3.gen8_atomic_a64.aop_type = function;
+    gen8_insn->bits3.gen8_atomic_a64.data_size = type_long;
+    return gen8_insn->bits3.ud;
+  }
+  void Gen8Encoder::ATOMICA64(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    int type_long = (dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L) ? 1: 0;
+    setAtomicA64MessageDesc(insn, function, bti.value.ud, srcNum, type_long);
+  }
   unsigned Gen8Encoder::setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
@@ -207,7 +268,7 @@ namespace gbe
     return insn->bits3.ud;
-  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
@@ -229,6 +290,101 @@ namespace gbe
+  void Gen8Encoder::UNTYPED_READA64(GenRegister dst, GenRegister src, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 || elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    assert(this->curr.execWidth == 8);
+    if (this->curr.execWidth == 8) {
+      msg_length = 2;
+      response_length = elemNum;
+    } else
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   255, // stateless bti
+                   untypedRWMask[elemNum],
+                   GEN8_P1_UNTYPED_READ_A64,
+                   msg_length,
+                   response_length);
+  }
+  void Gen8Encoder::UNTYPED_WRITEA64(GenRegister msg, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 || elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    this->setHeader(insn);
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+      msg_length = 2 + elemNum;
+    } else
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   255, //stateless bti
+                   untypedRWMask[elemNum],
+                   GEN8_P1_UNTYPED_WRITE_A64,
+                   msg_length,
+                   response_length);
+  }
+  void Gen8Encoder::BYTE_GATHERA64(GenRegister dst, GenRegister src, uint32_t elemSize) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    //setByteGatherMessageDesc(insn, bti.value.ud, elemSize);
+    GBE_ASSERT(this->curr.execWidth == 8);
+    const uint32_t msg_length = 2;
+    const uint32_t response_length = 1;
+    setDPByteScatterGatherA64(this,
+                           insn,
+                           0xff,
+                           0x0,
+                           elemSize,
+                           GEN8_P1_BYTE_GATHER_A64,
+                           msg_length,
+                           response_length);
+  }
+  void Gen8Encoder::BYTE_SCATTERA64(GenRegister msg, uint32_t elemSize) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+    // only support simd8
+    GBE_ASSERT(this->curr.execWidth == 8);
+    this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    const uint32_t msg_length = 3;
+    const uint32_t response_length = 0;
+    setDPByteScatterGatherA64(this,
+                           insn,
+                           0xff,
+                           0x0,
+                           elemSize,
+                           GEN8_P1_BYTE_SCATTER_A64,
+                           msg_length,
+                           response_length);
+  }
   void Gen8Encoder::LOAD_INT64_IMM(GenRegister dest, GenRegister value) {
     MOV(dest, value);
@@ -275,6 +431,30 @@ namespace gbe
     this->setSrc1(&insn, GenRegister::immd(jip*8));
+  /*! Emit a memory fence SEND with commit enabled. dst is used both as the
+   *  destination and as the message source; flushRWCache drives the flush_rw
+   *  bit of the fence descriptor. */
+  void Gen8Encoder::FENCE(GenRegister dst, bool flushRWCache) {
+    GenNativeInstruction *send = this->next(GEN_OPCODE_SEND);
+    this->setHeader(send);
+    this->setDst(send, dst);
+    this->setSrc0(send, dst);
+    setMessageDescriptor(send, GEN_SFID_DATAPORT_DATA, 1, 1, 1);
+    // Fence-specific descriptor bits are written after the generic descriptor
+    Gen8NativeInstruction *native = &send->gen8_insn;
+    native->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
+    native->bits3.gen7_memory_fence.commit_enable = 0x1;
+    if (flushRWCache)
+      native->bits3.gen7_memory_fence.flush_rw = 1;
+    else
+      native->bits3.gen7_memory_fence.flush_rw = 0;
+  }
+  /*! Emit a sampler cache-flush message. The message source is register
+   *  ud8grf(0,0) and the result is written to dst. */
+  void Gen8Encoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
+    GenNativeInstruction *send = this->next(GEN_OPCODE_SEND);
+    this->setHeader(send);
+    this->setDst(send, dst);
+    this->setSrc0(send, GenRegister::ud8grf(0,0));
+    const unsigned flushMsg = GEN_SAMPLER_MESSAGE_CACHE_FLUSH;
+    const unsigned flushSimd = GEN_SAMPLER_SIMD_MODE_SIMD32_64;
+    setSamplerMessage(send, 0, 0, flushMsg, 1, 1, true, flushSimd, 0);
+  }
   void Gen8Encoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
     Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
@@ -406,9 +586,10 @@ namespace gbe
     assert(gen8_insn->bits1.da1.src0_reg_file != GEN_IMMEDIATE_VALUE);
-    if (reg.file == GEN_IMMEDIATE_VALUE)
+    if (reg.file == GEN_IMMEDIATE_VALUE) {
+      assert(!((reg.type == GEN_TYPE_L || reg.type == GEN_TYPE_UL || reg.type == GEN_TYPE_DF_IMM) && reg.value.u64 > 0xFFFFFFFFl));
       gen8_insn->bits3.ud = reg.value.ud;
-    else {
+    } else {
       assert (reg.address_mode == GEN_ADDRESS_DIRECT);
       if (gen8_insn->header.access_mode == GEN_ALIGN_1) {
         gen8_insn->bits3.da1.src1_subreg_nr = reg.subnr;
@@ -637,4 +818,69 @@ namespace gbe
     gen8_insn->bits1.da3srcacc.src2_abs = src2.absolute;
     gen8_insn->bits1.da3srcacc.src2_negate = src2.negation;
+  /*! Fill the message descriptor of an A64 OWord block read/write on data
+   *  port 1. A header is always marked present. */
+  static void setOBlockRWA64(GenEncoder *p,
+                             GenNativeInstruction *insn,
+                             uint32_t bti,
+                             uint32_t size,
+                             uint32_t msg_type,
+                             uint32_t msg_length,
+                             uint32_t response_length)
+  {
+    const GenMessageTarget target = GEN_SFID_DATAPORT1_DATA;
+    p->setMessageDescriptor(insn, target, msg_length, response_length);
+    Gen8NativeInstruction *native = &insn->gen8_insn;
+    native->bits3.gen8_block_rw_a64.msg_type = msg_type;
+    native->bits3.gen8_block_rw_a64.bti = bti;
+    // OWord block reads use the unaligned variant (sub type 1);
+    // every other message type uses sub type 0.
+    if (msg_type == GEN8_P1_BLOCK_READ_A64)
+      native->bits3.gen8_block_rw_a64.msg_sub_type = 1;
+    else
+      native->bits3.gen8_block_rw_a64.msg_sub_type = 0;
+    native->bits3.gen8_block_rw_a64.block_size = size;
+    native->bits3.gen8_block_rw_a64.header_present = 1;
+  }
+  /*! A64 OWord block read of ow_size OWords into dst; header is the
+   *  single-register message payload. */
+  void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) {
+    GenNativeInstruction *send = this->next(GEN_OPCODE_SEND);
+    const uint32_t payloadLen = 1;
+    // The response is counted in registers (ow_size / 2); a read smaller
+    // than one register still returns one register.
+    const uint32_t replyLen = (ow_size / 2 == 0) ? 1 : ow_size / 2;
+    const uint32_t blockSize = getOBlockSize(ow_size, dst.subnr == 0);
+    this->setHeader(send);
+    this->setDst(send, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(send, GenRegister::ud8grf(header.nr, 0));
+    this->setSrc1(send, GenRegister::immud(0));
+    setOBlockRWA64(this, send, bti, blockSize,
+                   GEN8_P1_BLOCK_READ_A64, payloadLen, replyLen);
+  }
+  /*! A64 OWord block write of ow_size OWords; the payload starts at header
+   *  and the destination is the null register. */
+  void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size) {
+    GenNativeInstruction *send = this->next(GEN_OPCODE_SEND);
+    // Data is counted in registers (ow_size / 2); a write smaller than one
+    // register still occupies one register.
+    const uint32_t dataRegs = (ow_size / 2 == 0) ? 1 : ow_size / 2;
+    // Payload = one header register + the data registers
+    const uint32_t payloadLen = 1 + dataRegs;
+    const uint32_t replyLen = 0;
+    const uint32_t blockSize = getOBlockSize(ow_size);
+    this->setHeader(send);
+    this->setSrc0(send, GenRegister::ud8grf(header.nr, 0));
+    this->setSrc1(send, GenRegister::immud(0));
+    this->setDst(send, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    setOBlockRWA64(this, send, bti, blockSize,
+                   GEN8_P1_BLOCK_WRITE_A64, payloadLen, replyLen);
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index 12b3765..fa62a8d 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -38,20 +38,27 @@ namespace gbe
     /*! Jump indexed instruction */
     virtual void JMPI(GenRegister src, bool longjmp = false);
+    virtual void FENCE(GenRegister dst, bool flushRWCache);
     /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
     virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
     virtual void F16TO32(GenRegister dest, GenRegister src0);
     virtual void F32TO16(GenRegister dest, GenRegister src0);
     virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
-    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
+    virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
     virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
-    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends);
+    virtual void UNTYPED_READA64(GenRegister dst, GenRegister src, uint32_t elemNum);
+    virtual void UNTYPED_WRITEA64(GenRegister src, uint32_t elemNum);
+    virtual void BYTE_GATHERA64(GenRegister dst, GenRegister src, uint32_t elemSize);
+    virtual void BYTE_SCATTERA64(GenRegister src, uint32_t elemSize);
     virtual void setHeader(GenNativeInstruction *insn);
     virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
                    uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
     virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
                                       unsigned char msg_type, uint32_t msg_length,
                                       bool header_present);
+    virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
     virtual void setDst(GenNativeInstruction *insn, GenRegister dest);
     virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg);
     virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
@@ -62,6 +69,7 @@ namespace gbe
                             GenRegister src1 = GenRegister::null());
     virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
     virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+    virtual unsigned setAtomicA64MessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum, int type_long);
     virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
     virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
     void setSrc0WithAcc(GenNativeInstruction *insn, GenRegister reg, uint32_t accN);
@@ -71,6 +79,10 @@ namespace gbe
                        uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc);
     void MADM(GenRegister dst, GenRegister src0, GenRegister src1, GenRegister src2,
               uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc, uint32_t src2Acc);
+    /*! A64 OBlock read; ow_size is the number of OWords to read */
+    virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size);
+    /*! A64 OBlock write; ow_size is the number of OWords to write */
+    virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size);
 #endif /* __GBE_GEN8_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index 549948a..446e7f9 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -540,7 +540,11 @@ union Gen8NativeInstruction
       /*! Memory fence */
       struct {
         uint32_t bti:8;
-        uint32_t pad:5;
+        uint32_t pad:1;
+        uint32_t flush_instruction:1;
+        uint32_t flush_texture:1;
+        uint32_t flush_constant:1;
+        uint32_t flush_rw:1;
         uint32_t commit_enable:1;
         uint32_t msg_type:4;
         uint32_t pad2:1;
@@ -566,6 +570,46 @@ union Gen8NativeInstruction
         uint32_t end_of_thread:1;
       } gen7_atomic_op;
+      /*! A64 atomic message descriptor. The field widths sum to 32 bits; the offsets noted below assume in-order, LSB-first packing (TODO confirm for the target ABI). */
+      struct {
+        uint32_t bti:8;               // offset 0, 8 bits
+        uint32_t aop_type:4;          // offset 8, 4 bits
+        uint32_t data_size:1;         // offset 12; NOTE(review): presumably selects the operand width (cf. type_long of setAtomicA64MessageDesc) - confirm
+        uint32_t return_data:1;       // offset 13
+        uint32_t msg_type:5;          // offset 14, 5 bits
+        uint32_t header_present:1;    // offset 19
+        uint32_t response_length:5;   // offset 20, 5 bits
+        uint32_t msg_length:4;        // offset 25, 4 bits
+        uint32_t pad3:2;              // offset 29, padding
+        uint32_t end_of_thread:1;     // offset 31
+      } gen8_atomic_a64;
+      // Gen8 A64 untyped read/write message descriptor. The field widths sum to 32 bits; offsets below assume in-order, LSB-first packing (TODO confirm).
+      struct {
+        uint32_t bti:8;               // offset 0, 8 bits
+        uint32_t rgba:4;              // offset 8, 4 bits
+        uint32_t simd_mode:2;         // offset 12, 2 bits
+        uint32_t msg_type:5;          // offset 14, 5 bits
+        uint32_t header_present:1;    // offset 19
+        uint32_t response_length:5;   // offset 20, 5 bits
+        uint32_t msg_length:4;        // offset 25, 4 bits
+        uint32_t pad2:2;              // offset 29, padding
+        uint32_t end_of_thread:1;     // offset 31
+      } gen8_untyped_rw_a64;
+      struct { // A64 scatter read/write descriptor; field widths sum to 32 bits
+        uint32_t bti:8;               // offset 0, 8 bits (offsets assume in-order, LSB-first packing - TODO confirm)
+        uint32_t block_sz:2; // offset 8; 00 = byte, 01 = dword
+        uint32_t data_sz:2; // offset 10; 0 -> 1 block, 1 -> 2 blocks
+        uint32_t ignored:2;           // offset 12, unused
+        uint32_t msg_type:5;  // offset 14; 10000 = scatter read, 11010 = scatter write, 11001 = A64 untyped write
+        uint32_t header_present:1;    // offset 19
+        uint32_t response_length:5;   // offset 20, 5 bits
+        uint32_t msg_length:4;        // offset 25, 4 bits
+        uint32_t pad2:2;              // offset 29, padding
+        uint32_t end_of_thread:1;     // offset 31
+      } gen8_scatter_rw_a64;
       struct {
         uint32_t src1_subreg_nr_high:1;
         uint32_t src1_reg_nr:8;
@@ -604,6 +648,19 @@ union Gen8NativeInstruction
         uint32_t end_of_thread:1;
       } gen7_msg_gw;
+    struct { // A64 OWord block read/write descriptor, filled in by setOBlockRWA64; field widths sum to 32 bits
+        uint32_t bti:8;               // offset 0, 8 bits (offsets assume in-order, LSB-first packing - TODO confirm)
+        uint32_t block_size:3; // offset 8; block size expressed in OWords
+        uint32_t msg_sub_type:2; // offset 11; 00 = OWord block R/W, 01 = unaligned OWord block read, 10 = OWord dual block R/W, 11 = HWord block R/W
+        uint32_t ignored:1;           // offset 13, unused
+        uint32_t msg_type:5;  // offset 14; 10100 = A64 block read, 10101 = A64 block write
+        uint32_t header_present:1;    // offset 19; setOBlockRWA64 always sets this to 1
+        uint32_t response_length:5;   // offset 20, 5 bits
+        uint32_t msg_length:4;        // offset 25, 4 bits
+        uint32_t pad2:2;              // offset 29, padding
+        uint32_t end_of_thread:1;     // offset 31
+      } gen8_block_rw_a64;
       struct {
         uint32_t jip:32;
       } gen8_branch;
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index dc05756..483b2c3 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -22,6 +22,7 @@
 #include "backend/gen9_context.hpp"
 #include "backend/gen_insn_selection.hpp"
+#include "backend/gen_program.hpp"
 namespace gbe
@@ -34,9 +35,10 @@ namespace gbe
     const GenRegister fenceDst = ra->genReg(insn.dst(0));
     uint32_t barrierType = insn.extra.barrierType;
     const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+    bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
-    if (barrierType == ir::syncGlobalBarrier) {
-      p->FENCE(fenceDst);
+    if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
+      p->FENCE(fenceDst, imageFence);
       p->MOV(fenceDst, fenceDst);
@@ -54,6 +56,10 @@ namespace gbe
       p->curr.predicate = GEN_PREDICATE_NONE;
+    if (imageFence) {
+      p->FLUSH_SAMPLERCACHE(fenceDst);
+      p->MOV(fenceDst, fenceDst);
+    }
   void BxtContext::newSelection(void) {
@@ -165,6 +171,67 @@ namespace gbe
+  /*! Compute the per-lane stack pointer:
+   *    (threadId * simdWidth + laneId) * perLaneSize
+   *  so that private addresses start from zero. When pointers are 64 bits
+   *  wide the 32-bit result is additionally widened into the QWORD stack
+   *  pointer register. Nothing is emitted for kernels without a stack. */
+  void BxtContext::emitStackPointer(void) {
+    using namespace ir;
+    const uint32_t perLaneSize = kernel->getStackSize();
+    if (perLaneSize == 0)
+      return;
+    const bool simd8 = (this->simdWidth == 8);
+    const GenRegister spSel = simd8 ? GenRegister::ud8grf(ir::ocl::stackptr)
+                                    : GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister sp32 = ra->genReg(spSel);
+    // The block IP register is borrowed as scratch space here because it
+    // is only initialized later.
+    const GenRegister scratchUW = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+    const GenRegister scratchUD = GenRegister::retype(scratchUW, GEN_TYPE_UD);
+    loadLaneID(sp32);
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      // scratch = threadId
+      p->AND(scratchUW, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff));
+      // scratch = threadId * simdWidth
+      p->MUL(scratchUW, scratchUW, GenRegister::immuw(this->simdWidth));
+      p->curr.execWidth = this->simdWidth;
+      // sp = threadId * simdWidth + laneId, which must stay below 64K
+      p->ADD(sp32, GenRegister::unpacked_uw(sp32), scratchUW);
+      p->curr.execWidth = 1;
+      p->MOV(scratchUD, GenRegister::immud(perLaneSize));
+      p->curr.execWidth = this->simdWidth;
+      // sp = (threadId * simdWidth + laneId) * perLaneSize
+      p->MUL(sp32, scratchUD, GenRegister::unpacked_uw(sp32));
+      if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+        const GenRegister spSel64 = simd8 ? GenRegister::ul8grf(ir::ocl::stackptr)
+                                          : GenRegister::ul16grf(ir::ocl::stackptr);
+        GenRegister sp64 = ra->genReg(spSel64);
+        GenRegister spLow = GenRegister::unpacked_ud(sp64.nr, sp64.subnr);
+        if (p->curr.execWidth == 16) {
+          // The second quarter has to be widened first: the destination is
+          // QW while the source is DW, so widening the first quarter first
+          // would overwrite the second quarter's source.
+          p->curr.execWidth = 8;
+          p->curr.quarterControl = GEN_COMPRESSION_Q2;
+          p->MOV(GenRegister::Qn(spLow, 1), GenRegister::Qn(sp32, 1));
+          p->MOV(GenRegister::Qn(sp64, 1), GenRegister::Qn(spLow, 1));
+        }
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        p->MOV(spLow, sp32);
+        p->MOV(sp64, spLow);
+      }
+    p->pop();
+  }
   void KblContext::newSelection(void) {
     this->sel = GBE_NEW(SelectionKbl, *this);
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 2f24b56..9977e9a 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -67,6 +67,7 @@ namespace gbe
     virtual void newSelection(void);
     virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                            GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+    virtual void emitStackPointer(void);
   /* This class is used to implement the kabylake
      specific logic for context. */
diff --git a/backend/src/backend/gen9_encoder.cpp b/backend/src/backend/gen9_encoder.cpp
index 80df50d..b37fd98 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -26,6 +26,14 @@
 #include "backend/gen9_encoder.hpp"
+#include "backend/gen9_instruction.hpp"
+static const uint32_t untypedRWMask[] = {
+  0
 namespace gbe
@@ -60,9 +68,237 @@ namespace gbe
      this->setDst(insn, dest);
      this->setSrc0(insn, msg);
+     this->setSrc1(insn, GenRegister::immud(0));
      setSamplerMessage(insn, bti, sampler, msg_type,
                        response_length, msg_length,
                        simd_mode, return_format);
+  /*! Encode the register operands of a split send (SENDS) instruction.
+   *  All three registers must start at sub-register 0 and are encoded in
+   *  direct addressing mode. src1 is always encoded as a general register. */
+  void Gen9Encoder::setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, GenRegister src0, GenRegister src1)
+  {
+    assert(dst.subnr == 0 && src0.subnr == 0 && src1.subnr == 0);
+    // Destination register file bit: 0 for the architecture register file
+    // (callers pass GenRegister::null() for writes), 1 for the general
+    // register file.
+    // NOTE(review): the first condition was missing from this text and has
+    // been reconstructed - confirm against the upstream source.
+    if (dst.file == GEN_ARCHITECTURE_REGISTER_FILE) {
+      gen9_insn->bits1.sends.dest_reg_file_0 = 0;
+    } else if (dst.file == GEN_GENERAL_REGISTER_FILE) {
+      gen9_insn->bits1.sends.dest_reg_file_0 = 1;
+    } else {
+      // The trailing else previously had no body of its own and swallowed
+      // the src1_reg_file_0 assignment below.
+      GBE_ASSERT(false && "setSendsOperands: unsupported destination register file");
+    }
+    gen9_insn->bits1.sends.src1_reg_file_0 = 1;
+    gen9_insn->bits1.sends.src1_reg_nr = src1.nr;
+    gen9_insn->bits1.sends.dest_subreg_nr = 0;
+    gen9_insn->bits1.sends.dest_reg_nr = dst.nr;
+    gen9_insn->bits1.sends.dest_address_mode = 0;  //direct mode
+    gen9_insn->bits2.sends.src0_subreg_nr = 0;
+    gen9_insn->bits2.sends.src0_reg_nr = src0.nr;
+    gen9_insn->bits2.sends.src0_address_mode = 0;
+  }
+  /*! Build the message descriptor of an untyped surface write issued as a
+   *  split send and return the resulting descriptor dword.
+   *  msg_length is 1 register in SIMD8 and 2 in SIMD16; the data length is
+   *  set separately by the caller through src1_length. */
+  unsigned Gen9Encoder::setUntypedWriteSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum)
+  {
+    uint32_t msg_length = 0;
+    const uint32_t response_length = 0;
+    if (this->curr.execWidth == 8) {
+      msg_length = 1;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2;
+    } else {
+      // The trailing else previously had no body of its own, so the
+      // descriptor setup below ran only for unsupported widths.
+      GBE_ASSERT(false && "setUntypedWriteSendsMessageDesc: unsupported execution width");
+    }
+    setDPUntypedRW(insn,
+                   bti,
+                   untypedRWMask[elemNum],
+                   GEN75_P1_UNTYPED_SURFACE_WRITE,
+                   msg_length,
+                   response_length);
+    return insn->bits3.ud;
+  }
+  /*! Untyped surface write. Without useSends this defers to the Gen8Encoder
+   *  implementation. With useSends a split send is emitted with addr as src0
+   *  and data as src1; the two must be different registers.
+   *  elemNum must be in [1, 4]. */
+  void Gen9Encoder::UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends)
+  {
+    if (!useSends)
+      Gen8Encoder::UNTYPED_WRITE(addr, data, bti, elemNum, false);
+    else {
+      GBE_ASSERT(addr.reg() != data.reg());
+      GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+      Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+      // Both bounds must hold; with "||" this check could never fail.
+      assert(elemNum >= 1 && elemNum <= 4);
+      this->setHeader(insn);
+      insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+      setSendsOperands(gen9_insn, GenRegister::null(), addr, data);
+      // src1 length: elemNum in SIMD8, twice that in SIMD16
+      if (this->curr.execWidth == 8) {
+        gen9_insn->bits2.sends.src1_length = elemNum;
+      } else if (this->curr.execWidth == 16) {
+        gen9_insn->bits2.sends.src1_length = 2 * elemNum;
+      } else {
+        // The trailing else previously had no body of its own, so the bti
+        // handling below was skipped for SIMD8 and SIMD16.
+        GBE_ASSERT(false && "UNTYPED_WRITE: unsupported execution width");
+      }
+      if (bti.file == GEN_IMMEDIATE_VALUE) {
+        gen9_insn->bits2.sends.sel_reg32_desc = 0;
+        setUntypedWriteSendsMessageDesc(insn, bti.value.ud, elemNum);
+      } else {
+        // Non-immediate bti: only sel_reg32_desc is set here, no immediate
+        // descriptor is written.
+        gen9_insn->bits2.sends.sel_reg32_desc = 1;
+      }
+    }
+  }
+  /*! Typed surface write. Without useSends this defers to the Gen8Encoder
+   *  implementation; otherwise a split send is emitted with header as src0
+   *  and data as src1 (they must be different registers). */
+  void Gen9Encoder::TYPED_WRITE(GenRegister header, GenRegister data, bool header_present, unsigned char bti, bool useSends)
+  {
+    if (!useSends) {
+      Gen8Encoder::TYPED_WRITE(header, data, header_present, bti, false);
+      return;
+    }
+    GBE_ASSERT(header.reg() != data.reg());
+    GenNativeInstruction *sends = this->next(GEN_OPCODE_SENDS);
+    Gen9NativeInstruction *native = &sends->gen9_insn;
+    assert(header_present);
+    this->setHeader(sends);
+    sends->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+    setSendsOperands(native, GenRegister::null(), header, data);
+    // src0 carries 5 registers (header + u + v + w + lod); src1 carries
+    // the 4 data registers.
+    native->bits2.sends.src1_length = 4;
+    native->bits2.sends.sel_reg32_desc = 0;
+    setTypedWriteMessage(sends, bti, GEN_TYPED_WRITE, 5, header_present);
+  }
+  /*! Build the message descriptor of a byte scatter issued as a split send
+   *  and return the resulting descriptor dword.
+   *  msg_length is 1 register in SIMD8 and 2 in SIMD16; the data length is
+   *  set separately by the caller through src1_length. */
+  unsigned Gen9Encoder::setByteScatterSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize)
+  {
+    uint32_t msg_length = 0;
+    const uint32_t response_length = 0;
+    if (this->curr.execWidth == 8) {
+      msg_length = 1;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2;
+    } else {
+      // The trailing else previously had no body of its own, so the
+      // descriptor setup below ran only for unsupported widths.
+      GBE_ASSERT(false && "setByteScatterSendsMessageDesc: unsupported execution width");
+    }
+    setDPByteScatterGather(insn,
+                           bti,
+                           elemSize,
+                           GEN7_BYTE_SCATTER,
+                           msg_length,
+                           response_length);
+    return insn->bits3.ud;
+  }
+  /*! Byte scatter. Without useSends this defers to the Gen8Encoder
+   *  implementation. With useSends a split send is emitted with addr as src0
+   *  and data as src1; the two must be different registers. */
+  void Gen9Encoder::BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemSize, bool useSends)
+  {
+    if (!useSends)
+      Gen8Encoder::BYTE_SCATTER(addr, data, bti, elemSize, false);
+    else {
+      GBE_ASSERT(addr.reg() != data.reg());
+      GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+      Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+      this->setHeader(insn);
+      insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+      setSendsOperands(gen9_insn, GenRegister::null(), addr, data);
+      // src1 length: 1 in SIMD8, 2 in SIMD16
+      if (this->curr.execWidth == 8) {
+        gen9_insn->bits2.sends.src1_length = 1;
+      } else if (this->curr.execWidth == 16) {
+        gen9_insn->bits2.sends.src1_length = 2;
+      } else {
+        // The trailing else previously had no body of its own, so the bti
+        // handling below was skipped for SIMD8 and SIMD16.
+        GBE_ASSERT(false && "BYTE_SCATTER: unsupported execution width");
+      }
+      if (bti.file == GEN_IMMEDIATE_VALUE) {
+        gen9_insn->bits2.sends.sel_reg32_desc = 0;
+        setByteScatterSendsMessageDesc(insn, bti.value.ud, elemSize);
+      } else {
+        // Non-immediate bti: only sel_reg32_desc is set here, no immediate
+        // descriptor is written.
+        gen9_insn->bits2.sends.sel_reg32_desc = 1;
+      }
+    }
+  }
+  /*! Atomic operation. Without useSends this defers to the Gen8Encoder
+   *  implementation. With useSends a split send is emitted with addr as src0
+   *  and data as src1; the two must be different registers.
+   *  NOTE(review): with srcNum == 1 there is no data operand and src1_length
+   *  becomes 0; presumably callers do not request a split send then - confirm. */
+  void Gen9Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends)
+  {
+    if (!useSends)
+      Gen8Encoder::ATOMIC(dst, function, addr, data, bti, srcNum, false);
+    else {
+      GBE_ASSERT(addr.reg() != data.reg());
+      GenNativeInstruction *insn = this->next(GEN_OPCODE_SENDS);
+      Gen9NativeInstruction *gen9_insn = &insn->gen9_insn;
+      this->setHeader(insn);
+      insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+      setSendsOperands(gen9_insn, dst, addr, data);
+      // src1 length: srcNum - 1 in SIMD8, twice that in SIMD16
+      if (this->curr.execWidth == 8) {
+        gen9_insn->bits2.sends.src1_length = srcNum - 1;
+      } else if (this->curr.execWidth == 16) {
+        gen9_insn->bits2.sends.src1_length = 2 * (srcNum - 1);
+      } else {
+        // The trailing else previously had no body of its own, so the bti
+        // handling below was skipped for SIMD8 and SIMD16.
+        GBE_ASSERT(false && "ATOMIC: unsupported execution width");
+      }
+      if (bti.file == GEN_IMMEDIATE_VALUE) {
+        gen9_insn->bits2.sends.sel_reg32_desc = 0;
+        // The descriptor is built for a single source regardless of srcNum;
+        // the remaining sources are accounted for by src1_length above.
+        setAtomicMessageDesc(insn, function, bti.value.ud, 1);
+      } else {
+        // Non-immediate bti: only sel_reg32_desc is set here, no immediate
+        // descriptor is written.
+        gen9_insn->bits2.sends.sel_reg32_desc = 1;
+      }
+    }
+  }
+  /*! OWord block write of ow_size OWords. Without useSends this defers to
+   *  the Gen8Encoder implementation; otherwise a split send is emitted with
+   *  header as src0 and data as src1 (they must be different registers). */
+  void Gen9Encoder::OBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t ow_size, bool useSends)
+  {
+    if (!useSends) {
+      Gen8Encoder::OBWRITE(header, data, bti, ow_size, false);
+      return;
+    }
+    GBE_ASSERT(data.reg() != header.reg());
+    GenNativeInstruction *sends = this->next(GEN_OPCODE_SENDS);
+    Gen9NativeInstruction *native = &sends->gen9_insn;
+    this->setHeader(sends);
+    sends->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+    setSendsOperands(native, GenRegister::null(), header, data);
+    // Data is counted in registers (ow_size / 2); a write smaller than one
+    // register still occupies one register.
+    const uint32_t payloadRegs = (ow_size / 2 == 0) ? 1 : ow_size / 2;
+    native->bits2.sends.src1_length = payloadRegs;
+    const uint32_t blockSize = getOBlockSize(ow_size);
+    // src0 holds only the header; nothing is returned
+    const uint32_t headerLen = 1;
+    const uint32_t replyLen = 0;
+    setOBlockRW(sends, bti, blockSize, GEN7_OBLOCK_WRITE, headerLen, replyLen);
+  }
+  /*! Media block write. Without useSends this defers to the Gen8Encoder
+   *  implementation; otherwise a split send is emitted with header as src0
+   *  and data as src1 (they must be different registers). data_size is
+   *  written to src1_length unchanged. */
+  void Gen9Encoder::MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t data_size, bool useSends)
+  {
+    if (!useSends) {
+      Gen8Encoder::MBWRITE(header, data, bti, data_size, false);
+      return;
+    }
+    GBE_ASSERT(data.reg() != header.reg());
+    GenNativeInstruction *sends = this->next(GEN_OPCODE_SENDS);
+    Gen9NativeInstruction *native = &sends->gen9_insn;
+    this->setHeader(sends);
+    sends->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+    setSendsOperands(native, GenRegister::null(), header, data);
+    native->bits2.sends.src1_length = data_size;
+    // src0 holds only the header; nothing is returned
+    const uint32_t headerLen = 1;
+    const uint32_t replyLen = 0;
+    setMBlockRW(sends, bti, GEN75_P1_MEDIA_TYPED_BWRITE, headerLen, replyLen);
+  }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp b/backend/src/backend/gen9_encoder.hpp
index 319e871..2eaa538 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -47,7 +47,15 @@ namespace gbe
                 uint32_t return_format,
                 bool isLD,
                 bool isUniform);
+    void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, GenRegister src0, GenRegister src1);
+    virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends);
+    virtual void TYPED_WRITE(GenRegister header, GenRegister data, bool header_present, unsigned char bti, bool useSends);
+    virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemSize, bool useSends);
+    virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
+    virtual void OBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t ow_size, bool useSends);
+    virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t data_size, bool useSends);
 #endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen9_instruction.hpp b/backend/src/backend/gen9_instruction.hpp
new file mode 100644
index 0000000..16114ca
--- /dev/null
+++ b/backend/src/backend/gen9_instruction.hpp
@@ -0,0 +1,84 @@
+ * Copyright © 2016 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Guo, Yejun <yejun.guo at intel.com>
+ */
+union Gen9NativeInstruction
+  struct {
+    struct {
+      uint32_t opcode:7;
+      uint32_t pad:1;
+      uint32_t access_mode:1;
+      uint32_t dependency_control:2;
+      uint32_t nib_ctrl:1;
+      uint32_t quarter_control:2;
+      uint32_t thread_control:2;
+      uint32_t predicate_control:4;
+      uint32_t predicate_inverse:1;
+      uint32_t execution_size:3;
+      uint32_t destreg_or_condmod:4;
+      uint32_t acc_wr_control:1;
+      uint32_t cmpt_control:1;
+      uint32_t debug_control:1;
+      uint32_t saturate:1;
+    } header;
+    union {
+      struct {
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t mask_control:1;
+        uint32_t dest_reg_file_0:1;
+        uint32_t src1_reg_file_0:1;
+        uint32_t dest_reg_type:4;
+        uint32_t pad0:3;
+        uint32_t src1_reg_nr:8;
+        uint32_t dest_subreg_nr:1;
+        uint32_t dest_reg_nr:8;
+        uint32_t pad1:1;
+        uint32_t pad2:1;    //direct mode is used
+        uint32_t dest_address_mode:1;
+      } sends;
+      uint32_t ud;
+    }bits1;
+    union {
+      struct {
+        uint32_t src1_length:4;     //exdesc_9_6
+        uint32_t src0_subreg_nr:1;
+        uint32_t src0_reg_nr:8;
+        uint32_t sel_reg32_desc:1;
+        uint32_t pad0:1;
+        uint32_t src0_address_mode:1;
+        uint32_t exdesc_31_16:16;
+      } sends;
+      uint32_t ud;
+    } bits2;
+    union {
+      uint32_t ud;
+    } bits3;
+  };
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 4f73237..bb104cf 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -248,6 +248,23 @@ namespace gbe
       p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
       p->curr.execWidth = this->simdWidth;
       p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize
+      if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+        const GenRegister selStatckPtr2 = this->simdWidth == 8 ?
+          GenRegister::ul8grf(ir::ocl::stackptr) :
+          GenRegister::ul16grf(ir::ocl::stackptr);
+        const GenRegister stackptr2 = ra->genReg(selStatckPtr2);
+        int simdWidth = p->curr.execWidth;
+        if (simdWidth == 16) {
+          // we need do second quarter first, because the dst type is QW,
+          // while the src is DW. If we do first quater first, the 1st
+          // quarter's dst would contain the 2nd quarter's src.
+          p->curr.execWidth = 8;
+          p->curr.quarterControl = GEN_COMPRESSION_Q2;
+          p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(stackptr,1));
+        }
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        p->MOV(stackptr2, stackptr);
+      }
@@ -274,6 +291,7 @@ namespace gbe
       case SEL_OP_F16TO32: p->F16TO32(dst, src); break;
       case SEL_OP_F32TO16: p->F32TO16(dst, src); break;
       case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src); break;
+      case SEL_OP_BFREV: p->BFREV(dst, src); break;
       case SEL_OP_CONVI64_TO_I:
         p->MOV(dst, src.bottom_half());
@@ -719,7 +737,7 @@ namespace gbe
         p->curr.quarterControl = 1;
         p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
-        p->MOV(GenRegister::offset(dst, 1, 0), indirect);
+        p->MOV(GenRegister::offset(dst, 0, 8 * typeSize(src0.type)), indirect);
       } else
@@ -1817,9 +1835,10 @@ namespace gbe
     const GenRegister fenceDst = ra->genReg(insn.dst(0));
     uint32_t barrierType = insn.extra.barrierType;
     const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+    bool imageFence = barrierType & ir::SYNC_IMAGE_FENCE;
-    if (barrierType == ir::syncGlobalBarrier) {
-      p->FENCE(fenceDst);
+    if (barrierType & ir::SYNC_GLOBAL_READ_FENCE) {
+      p->FENCE(fenceDst, imageFence);
       p->MOV(fenceDst, fenceDst);
@@ -1837,11 +1856,15 @@ namespace gbe
       p->curr.predicate = GEN_PREDICATE_NONE;
+    if (imageFence) {
+      p->FLUSH_SAMPLERCACHE(fenceDst);
+      p->MOV(fenceDst, fenceDst);
+    }
   void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
-    p->FENCE(dst);
+    p->FENCE(dst, false);
     p->MOV(dst, dst);
@@ -1870,26 +1893,34 @@ namespace gbe
   void GenContext::emitAtomicInstruction(const SelectionInstruction &insn) {
-    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister addr = ra->genReg(insn.src(0));
     const GenRegister dst = ra->genReg(insn.dst(0));
     const uint32_t function = insn.extra.function;
     unsigned srcNum = insn.extra.elem;
+    GenRegister data = addr;
+    if (srcNum > 1)
+      data = ra->genReg(insn.src(1));
     const GenRegister bti = ra->genReg(insn.src(srcNum));
     if (bti.file == GEN_IMMEDIATE_VALUE) {
-      p->ATOMIC(dst, function, src, bti, srcNum);
+      p->ATOMIC(dst, function, addr, data, bti, srcNum, insn.extra.splitSend);
     } else {
       GenRegister flagTemp = ra->genReg(insn.dst(1));
       GenRegister btiTmp = ra->genReg(insn.dst(2));
-      unsigned desc = p->generateAtomicMessageDesc(function, 0, srcNum);
+      unsigned desc = 0;
+      if (insn.extra.splitSend)
+        desc = p->generateAtomicMessageDesc(function, 0, 1);
+      else
+        desc = p->generateAtomicMessageDesc(function, 0, srcNum);
       unsigned jip0 = beforeMessage(insn, bti, flagTemp, btiTmp, desc);
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-        p->ATOMIC(dst, function, src, GenRegister::addr1(0), srcNum);
+        p->ATOMIC(dst, function, addr, data, GenRegister::addr1(0), srcNum, insn.extra.splitSend);
       afterMessage(insn, bti, flagTemp, btiTmp, jip0);
@@ -1986,10 +2017,10 @@ namespace gbe
     else { //size == 8
       payload.type = GEN_TYPE_UD;
       GBE_ASSERT(payload.hstride == GEN_HORIZONTAL_STRIDE_1);
-      loadBottomHalf(payload, src);
+      loadBottomHalf(payload, src.isdf()? GenRegister::retype(src, GEN_TYPE_UL) : src );
       uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
       this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
-      loadTopHalf(payload, src);
+      loadTopHalf(payload, src.isdf() ? GenRegister::retype(src, GEN_TYPE_UL) : src);
       this->scratchWrite(msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
@@ -2016,9 +2047,9 @@ namespace gbe
     } else {
       uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
       this->scratchRead(payload, msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
-      storeBottomHalf(dst, payload);
+      storeBottomHalf(GenRegister::ul8grf(dst.nr, dst.subnr), payload);
       this->scratchRead(payload, msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
-      storeTopHalf(dst, payload);
+      storeTopHalf(GenRegister::ul8grf(dst.nr, dst.subnr), payload);
@@ -2032,8 +2063,8 @@ namespace gbe
     if (bti.file == GEN_IMMEDIATE_VALUE) {
       p->UNTYPED_READ(dst, src, bti, elemNum);
     } else {
-      const GenRegister tmp = ra->genReg(insn.dst(elemNum));
-      const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
+      const GenRegister tmp = ra->genReg(insn.dst(insn.extra.elem));
+      const GenRegister btiTmp = ra->genReg(insn.dst(insn.extra.elem + 1));
       unsigned desc = p->generateUntypedReadMessageDesc(0, elemNum);
       unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
@@ -2123,7 +2154,7 @@ namespace gbe
     const GenRegister bti = ra->genReg(insn.src(elemNum+1));
     if (bti.file == GEN_IMMEDIATE_VALUE) {
-      p->UNTYPED_WRITE(src, bti, elemNum*2);
+      p->UNTYPED_WRITE(src, src, bti, elemNum*2, false);
     } else {
       const GenRegister tmp = ra->genReg(insn.dst(0));
       const GenRegister btiTmp = ra->genReg(insn.dst(1));
@@ -2135,22 +2166,27 @@ namespace gbe
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-        p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum*2);
+        p->UNTYPED_WRITE(src, src, GenRegister::addr1(0), elemNum*2, false);
       afterMessage(insn, bti, tmp, btiTmp, jip0);
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister addr = ra->genReg(insn.src(0));
+    GenRegister data = ra->genReg(insn.src(1));
     const uint32_t elemNum = insn.extra.elem;
     const GenRegister bti = ra->genReg(insn.src(elemNum+1));
     if (bti.file == GEN_IMMEDIATE_VALUE) {
-      p->UNTYPED_WRITE(src, bti, elemNum);
+      p->UNTYPED_WRITE(addr, data, bti, elemNum, insn.extra.splitSend);
     } else {
       const GenRegister tmp = ra->genReg(insn.dst(0));
       const GenRegister btiTmp = ra->genReg(insn.dst(1));
-      unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum);
+      unsigned desc = 0;
+      if (insn.extra.splitSend)
+        desc = p->generateUntypedWriteSendsMessageDesc(0, elemNum);
+      else
+        desc = p->generateUntypedWriteMessageDesc(0, elemNum);
       unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
@@ -2158,7 +2194,7 @@ namespace gbe
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-        p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum);
+        p->UNTYPED_WRITE(addr, data, GenRegister::addr1(0), elemNum, insn.extra.splitSend);
       afterMessage(insn, bti, tmp, btiTmp, jip0);
@@ -2190,16 +2226,21 @@ namespace gbe
   void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
-    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister addr = ra->genReg(insn.src(0));
+    GenRegister data = ra->genReg(insn.src(1));
     const uint32_t elemSize = insn.extra.elem;
     const GenRegister bti = ra->genReg(insn.src(2));
     if (bti.file == GEN_IMMEDIATE_VALUE) {
-      p->BYTE_SCATTER(src, bti, elemSize);
+      p->BYTE_SCATTER(addr, data, bti, elemSize, insn.extra.splitSend);
     } else {
       const GenRegister tmp = ra->genReg(insn.dst(0));
       const GenRegister btiTmp = ra->genReg(insn.dst(1));
-      unsigned desc = p->generateByteScatterMessageDesc(0, elemSize);
+      unsigned desc = 0;
+      if (insn.extra.splitSend)
+        desc = p->generateByteScatterSendsMessageDesc(0, elemSize);
+      else
+        desc = p->generateByteScatterMessageDesc(0, elemSize);
       unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
@@ -2207,13 +2248,38 @@ namespace gbe
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(insn.state.flag, insn.state.subFlag);
-        p->BYTE_SCATTER(src, GenRegister::addr1(0), elemSize);
+        p->BYTE_SCATTER(addr, data, GenRegister::addr1(0), elemSize, insn.extra.splitSend);
       afterMessage(insn, bti, tmp, btiTmp, jip0);
+  void GenContext::emitUntypedReadA64Instruction(const SelectionInstruction &insn) {
+    assert(0);
+  }
+  void GenContext::emitUntypedWriteA64Instruction(const SelectionInstruction &insn) {
+    assert(0);
+  }
+  void GenContext::emitByteGatherA64Instruction(const SelectionInstruction &insn) {
+    assert(0);
+  }
+  void GenContext::emitByteScatterA64Instruction(const SelectionInstruction &insn) {
+    assert(0);
+  }
+  void GenContext::emitRead64A64Instruction(const SelectionInstruction &insn) {
+    assert(0);
+  }
+  void GenContext::emitWrite64A64Instruction(const SelectionInstruction &insn) {
+    assert(0);
+  }
+  void GenContext::emitAtomicA64Instruction(const SelectionInstruction &insn) {
+    assert(0);
+  }
   void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
     for(uint32_t i = 0; i < insn.dstNum; i++) {
@@ -2399,8 +2465,9 @@ namespace gbe
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+    GenRegister data = ra->genReg(insn.src(5));
     const uint32_t bti = insn.getbti();
-    p->TYPED_WRITE(header, true, bti);
+    p->TYPED_WRITE(header, data, true, bti, insn.extra.typedWriteSplitSend);
   static void calcGID(GenRegister& reg, GenRegister& tmp, int flag, int subFlag, int dim, GenContext *gc)
@@ -2587,6 +2654,7 @@ namespace gbe
     uint32_t tsType = insn.extra.timestampType;
     GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+    (void) tsType;
     GBE_ASSERT(tsType == 1);
     GenRegister tmArf = GenRegister::tm0();
     GenRegister profilingReg[5];
@@ -2712,6 +2780,7 @@ namespace gbe
     GenRegister tmp = ra->genReg(insn.dst(0));
     uint32_t profilingType = insn.extra.profilingType;
     uint32_t bti = insn.extra.profilingBTI;
+    (void) profilingType;
     GBE_ASSERT(profilingType == 1);
     GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
     GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL);
@@ -2775,7 +2844,7 @@ namespace gbe
       p->curr.useFlag(insn.state.flag, insn.state.subFlag);
       p->curr.predicate = GEN_PREDICATE_NORMAL;
-      p->ATOMIC(incRes, GEN_ATOMIC_OP_INC, sndMsg, GenRegister::immud(bti), 1);
+      p->ATOMIC(incRes, GEN_ATOMIC_OP_INC, sndMsg, sndMsg, GenRegister::immud(bti), 1, false);
     } p->pop();
     // Calculate the final addr
@@ -2833,14 +2902,14 @@ namespace gbe
       // Write it out.
       p->curr.execWidth = 8;
       p->curr.noMask = 1;
-      p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+      p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
       p->ADD(addr, addr, GenRegister::immud(32));
       // time stamps
       for (int i = 0; i < 3; i++) {
         p->curr.execWidth = 8;
         p->MOV(data, GenRegister::retype(profilingReg[i], GEN_TYPE_UD));
-        p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+        p->UNTYPED_WRITE(addr, addr, GenRegister::immud(bti), 1, false);
         p->ADD(addr, addr, GenRegister::immud(32));
     } p->pop();
@@ -2878,6 +2947,10 @@ namespace gbe
         p->MOV(dataReg, GenRegister::immint64(0x0));
       else if (dataReg.type == GEN_TYPE_UL)
         p->MOV(dataReg, GenRegister::immuint64(0x0));
+      else if (dataReg.type == GEN_TYPE_W)
+        p->MOV(dataReg, GenRegister::immw(0x0));
+      else if (dataReg.type == GEN_TYPE_UW)
+        p->MOV(dataReg, GenRegister::immuw(0x0));
         GBE_ASSERT(0); /* unsupported data-type */
@@ -2896,6 +2969,10 @@ namespace gbe
         p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
       else if (dataReg.type == GEN_TYPE_UL)
         p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+      else if (dataReg.type == GEN_TYPE_W)
+        p->MOV(dataReg, GenRegister::immw(0x7FFF));
+      else if (dataReg.type == GEN_TYPE_UW)
+        p->MOV(dataReg, GenRegister::immuw(0xFFFF));
         GBE_ASSERT(0); /* unsupported data-type */
@@ -2914,6 +2991,10 @@ namespace gbe
         p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
       else if (dataReg.type == GEN_TYPE_UL)
         p->MOV(dataReg, GenRegister::immuint64(0x0));
+      else if (dataReg.type == GEN_TYPE_W)
+        p->MOV(dataReg, GenRegister::immw(0x8000));
+      else if (dataReg.type == GEN_TYPE_UW)
+        p->MOV(dataReg, GenRegister::immuw(0x0));
         GBE_ASSERT(0); /* unsupported data-type */
@@ -3173,7 +3254,7 @@ namespace gbe
     GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
     GenRegister localBarrier = ra->genReg(insn.src(5));
-    uint32_t wg_op = insn.extra.workgroupOp;
+    uint32_t wg_op = insn.extra.wgop.workgroupOp;
     uint32_t simd = p->curr.execWidth;
     int32_t jip0, jip1;
@@ -3192,8 +3273,8 @@ namespace gbe
     /* use of continuous GRF allocation from insn selection */
     GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
     GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
-    GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
-    GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
+    GenRegister msgAddr = GenRegister::retype(msg, GEN_TYPE_UD);
+    GenRegister msgData = GenRegister::retype(ra->genReg(insn.dst(3)), dst.type);
     /* do some calculation within each thread */
     wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
@@ -3228,13 +3309,15 @@ namespace gbe
       GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
       GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
-      p->MOV(msgData.offset(msgData, 0), threadDataL);
-      p->MOV(msgData.offset(msgData, 1), threadDataH);
+      GenRegister msgDataL = GenRegister::retype(msgData, GEN_TYPE_D);
+      GenRegister msgDataH = msgDataL.offset(msgDataL, 1);
       p->curr.execWidth = 8;
+      p->MOV(msgDataL, threadDataL);
+      p->MOV(msgDataH, threadDataH);
       p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
       p->ADD(msgAddr, msgAddr, msgSlmOff);
-      p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+      p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 2, false);
@@ -3242,7 +3325,7 @@ namespace gbe
       p->MOV(msgData, threadData);
       p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
       p->ADD(msgAddr, msgAddr, msgSlmOff);
-      p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+      p->UNTYPED_WRITE(msg, msg, GenRegister::immw(0xFE), 1, false);
     /* init partialData register, it will hold the final result */
@@ -3327,30 +3410,38 @@ namespace gbe
       else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
         || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
-        p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
         /* workaround QW datatype on CMP */
         if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
-            p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 1, 0),
-                       dst.offset(dst, 1, 0), partialData);
-            p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 2, 0),
-                       dst.offset(dst, 2, 0), partialData);
-            p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 3, 0),
-                       dst.offset(dst, 3, 0), partialData);
-        }
+          p->push();
+            p->curr.execWidth = 8;
+            p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+            if (simd == 16) {
+              p->curr.execWidth = 8;
+              p->curr.quarterControl = GEN_COMPRESSION_Q2;
+              p->SEL_CMP(GEN_CONDITIONAL_LE, GenRegister::Qn(dst, 1),
+                         GenRegister::Qn(dst, 1), GenRegister::Qn(partialData, 1));
+            }
+          p->pop();
+        } else
+          p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
       else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
         || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
-        p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
         /* workaround QW datatype on CMP */
         if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
-            p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 1, 0),
-                       dst.offset(dst, 1, 0), partialData);
-            p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 2, 0),
-                       dst.offset(dst, 2, 0), partialData);
-            p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 3, 0),
-                       dst.offset(dst, 3, 0), partialData);
-        }
+          p->push();
+            p->curr.execWidth = 8;
+            p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+            if (simd == 16) {
+              p->curr.execWidth = 8;
+              p->curr.quarterControl = GEN_COMPRESSION_Q2;
+              p->SEL_CMP(GEN_CONDITIONAL_GE, GenRegister::Qn(dst, 1),
+                         GenRegister::Qn(dst, 1), GenRegister::Qn(partialData, 1));
+            }
+          p->pop();
+        } else
+          p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
@@ -3380,7 +3471,7 @@ namespace gbe
     const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(0)), dst.type);
     GenRegister threadData = ra->genReg(insn.src(1));
-    uint32_t wg_op = insn.extra.workgroupOp;
+    uint32_t wg_op = insn.extra.wgop.workgroupOp;
     uint32_t simd = p->curr.execWidth;
     /* masked elements should be properly set to init value */
@@ -3398,27 +3489,25 @@ namespace gbe
   void GenContext::emitPrintfLongInstruction(GenRegister& addr, GenRegister& data,
-                                             GenRegister& src, uint32_t bti) {
+                                             GenRegister& src, uint32_t bti, bool useSends) {
     p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src.bottom_half());
-    p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+    p->UNTYPED_WRITE(addr, data, GenRegister::immud(bti), 1, useSends);
     p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
     p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src.top_half(this->simdWidth));
-    p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+    p->UNTYPED_WRITE(addr, data, GenRegister::immud(bti), 1, useSends);
     p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
   void GenContext::emitPrintfInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst = ra->genReg(insn.dst(0));
-    const GenRegister tmp0 = ra->genReg(insn.dst(1));
-    const GenRegister tmp1 = ra->genReg(insn.dst(2));
+    const GenRegister tmp0 = ra->genReg(insn.dst(0));
+    const GenRegister tmp1 = ra->genReg(insn.dst(1));
     GenRegister src;
     uint32_t srcNum = insn.srcNum;
-    if (insn.extra.continueFlag)
-      srcNum--;
     GenRegister addr = GenRegister::retype(tmp0, GEN_TYPE_UD);
     GenRegister data = GenRegister::retype(tmp1, GEN_TYPE_UD);
+    bool useSends = insn.extra.printfSplitSend;
     if (!insn.extra.continueFlag) {
       p->push(); {
@@ -3429,18 +3518,18 @@ namespace gbe
         p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
       } p->pop();
-      p->ATOMIC(addr, GEN_ATOMIC_OP_ADD, addr, GenRegister::immud(insn.extra.printfBTI), 2);
+      p->ATOMIC(addr, GEN_ATOMIC_OP_ADD, addr, data, GenRegister::immud(insn.extra.printfBTI), 2, useSends);
       /* Write out the header. */
       p->MOV(data, GenRegister::immud(0xAABBCCDD));
-      p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+      p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
       p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
       p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
-      p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+      p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
       p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
       p->MOV(data, GenRegister::immud(insn.extra.printfNum));
-      p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+      p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
       p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
@@ -3450,25 +3539,16 @@ namespace gbe
       src = ra->genReg(insn.src(i));
       if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D || src.type == GEN_TYPE_F) {
         p->MOV(GenRegister::retype(data, src.type), src);
-        p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+        p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
         p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
       } else if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_UB ) {
         p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src);
-        p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+        p->UNTYPED_WRITE(addr, data, GenRegister::immud(insn.extra.printfBTI), 1, useSends);
         p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
       } else if (src.type == GEN_TYPE_L || src.type == GEN_TYPE_UL ) {
-        emitPrintfLongInstruction(addr, data, src, insn.extra.printfBTI);
+        emitPrintfLongInstruction(addr, data, src, insn.extra.printfBTI, useSends);
-    if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
-      p->push();
-      p->curr.execWidth = 1;
-    }
-    p->MOV(dst, GenRegister::immd(0));
-    if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
-      p->pop();
-    }
   void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
@@ -3489,323 +3569,41 @@ namespace gbe
   void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_UD);
-    const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
-    const uint32_t vec_size = insn.extra.elem;
-    const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), GEN_TYPE_UD);
-    const uint32_t simdWidth = p->curr.execWidth;
-    // Make header
-    p->push();
-    {
-      // Copy r0 into the header first
-      p->curr.execWidth = 8;
-      p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.noMask = 1;
-      p->MOV(header, GenRegister::ud8grf(0, 0));
-      // Update the header with the current address
-      p->curr.execWidth = 1;
-      p->MOV(headeraddr, addr);
-      // Put zero in the general state base address
-      p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
-    }
-    p->pop();
-    // Now read the data, oword block read can only work with simd16 and no mask
-    if (vec_size == 1) {
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        p->OBREAD(dst, header, insn.getbti(), simdWidth / 4);
-      }
-      p->pop();
-    } else if (vec_size == 2) {
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2);
-      }
-      p->pop();
-      p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
-      p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth / 8));
-    } else if (vec_size == 4 || vec_size == 8) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < vec_size / 4; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          p->push();
-          {
-            p->curr.execWidth = 16;
-            p->curr.noMask = 1;
-            p->OBREAD(tmp, header, insn.getbti(), 8);
-          }
-          p->pop();
-          for (uint32_t j = 0; j < 4; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, j));
-        }
-      } else {
-        for (uint32_t i = 0; i < vec_size / 2; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          p->OBREAD(tmp, header, insn.getbti(), 8);
-          for (uint32_t j = 0; j < 2; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, j*2));
-        }
-      }
-    } else NOT_SUPPORTED;
+    const GenRegister header = ra->genReg(insn.src(0));
+    const GenRegister tmp = ra->genReg(insn.dst(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t ow_size = insn.extra.elem;
+    bool isA64 = bti == 255;
+    if (isA64)
+       p->OBREADA64(tmp, header, bti, ow_size);
+    else
+       p->OBREAD(tmp, header, bti, ow_size);
   void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
-    const uint32_t vec_size = insn.extra.elem;
-    const GenRegister tmp = GenRegister::offset(header, 1);
-    const uint32_t simdWidth = p->curr.execWidth;
-    uint32_t tmp_size = simdWidth * vec_size / 8;
-    tmp_size = tmp_size > 4 ? 4 : tmp_size;
-    p->push();
-      // Copy r0 into the header first
-      p->curr.execWidth = 8;
-      p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.noMask = 1;
-      p->MOV(header, GenRegister::ud8grf(0,0));
-      // Update the header with the current address
-      p->curr.execWidth = 1;
-      p->SHR(headeraddr, addr, GenRegister::immud(4));
-      // Put zero in the general state base address
-      p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
-    p->pop();
-    // Now write the data, oword block write can only work with simd16 and no mask
-    if (vec_size == 1) {
-      p->MOV(tmp, ra->genReg(insn.src(1)));
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        p->OBWRITE(header, insn.getbti(), simdWidth / 4);
-      }
-      p->pop();
-    } else if (vec_size == 2) {
-      p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ;
-      p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra->genReg(insn.src(2))) ;
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        p->OBWRITE(header, insn.getbti(), simdWidth / 2);
-      }
-      p->pop();
-    } else if (vec_size == 4 || vec_size == 8) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < vec_size / 4; i++) {
-          for (uint32_t j = 0; j < 4; j++)
-            p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j + i*4))) ;
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
-            }
-            p->pop();
-          }
-          p->push();
-          {
-            p->curr.execWidth = 16;
-            p->curr.noMask = 1;
-            p->OBWRITE(header, insn.getbti(), 8);
-          }
-          p->pop();
-        }
-      } else {
-        for (uint32_t i = 0; i < vec_size / 2; i++) {
-          for (uint32_t j = 0; j < 2; j++)
-            p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + j + i*2))) ;
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
-            }
-            p->pop();
-          }
-          p->OBWRITE(header, insn.getbti(), 8);
-        }
-      }
-    } else NOT_SUPPORTED;
+    const GenRegister header = ra->genReg(insn.src(0));
+    const GenRegister data = ra->genReg(insn.src(1));
+    const uint32_t bti = insn.getbti();
+    const uint32_t ow_size = insn.extra.elem;
+    bool isA64 = bti == 255;
+    if (isA64)
+       p->OBWRITEA64(header, bti, ow_size);
+    else
+       p->OBWRITE(header, data, bti, ow_size, insn.extra.splitSend);
   void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst = ra->genReg(insn.dst(1));
-    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
-    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
-    const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
-    const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
-    size_t vec_size = insn.extra.elem;
-    uint32_t blocksize = 0x1F | (vec_size-1) << 16;
-    if (simdWidth == 8)
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
-        p->curr.execWidth = 8;
-        p->MBREAD(dst, header, insn.getbti(), vec_size);
-      p->pop();
-    }
-    else if (simdWidth == 16)
-    {
-      const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-        // First half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
-        p->curr.execWidth = 8;
-        p->MBREAD(tmp, header, insn.getbti(), vec_size);
-        for (uint32_t i = 0; i < vec_size; i++)
-          p->MOV(ra->genReg(insn.dst(i + 1)), GenRegister::offset(tmp, i));
-        // Second half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->ADD(offsetx, offsetx, GenRegister::immud(32));
-        // Now read the data
-        p->curr.execWidth = 8;
-        p->MBREAD(tmp, header, insn.getbti(), vec_size);
-        // Move the reg to fit vector rule.
-        for (uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
-                 GenRegister::offset(tmp, i));
-      p->pop();
-    } else NOT_IMPLEMENTED;
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister header = ra->genReg(insn.src(0));
+    const size_t response_size = insn.extra.elem;
+    p->MBREAD(dst, header, insn.getbti(), response_size);
   void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
-    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    GenRegister offsetx, offsety, blocksizereg;
-    size_t vec_size = insn.extra.elem;
-    uint32_t blocksize = 0x1F | (vec_size-1) << 16;
-    offsetx = GenRegister::offset(header, 0, 0*4);
-    offsety = GenRegister::offset(header, 0, 1*4);
-    blocksizereg = GenRegister::offset(header, 0, 2*4);
-    if (simdWidth == 8)
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i)));
-        // Now read the data
-        p->MBWRITE(header, insn.getbti(), vec_size);
-      p->pop();
-    }
-    else
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-        // First half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i)));
-        p->MBWRITE(header, insn.getbti(), vec_size);
-        // Second half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->ADD(offsetx, offsetx, GenRegister::immud(32));
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
-        // Now write the data
-        p->MBWRITE(header, insn.getbti(), vec_size);
-      p->pop();
-    }
+    const GenRegister header = ra->genReg(insn.dst(0));
+    const GenRegister data = ra->genReg(insn.dst(1));
+    const size_t data_size = insn.extra.elem;
+    p->MBWRITE(header, data, insn.getbti(), data_size, insn.extra.splitSend);
@@ -3841,9 +3639,11 @@ namespace gbe
+    sel->addID();
-      outputSelectionIR(*this, this->sel);
+      outputSelectionIR(*this, this->sel, genKernel->getName());
     schedulePreRegAllocation(*this, *this->sel);
+    sel->addID();
     if (UNLIKELY(ra->allocate(*this->sel) == false))
       return false;
     schedulePostRegAllocation(*this, *this->sel);
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index fb3d4fe..7fd40d1 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -159,8 +159,15 @@ namespace gbe
     void emitMathInstruction(const SelectionInstruction &insn);
     virtual void emitRead64Instruction(const SelectionInstruction &insn);
     virtual void emitWrite64Instruction(const SelectionInstruction &insn);
+    virtual void emitRead64A64Instruction(const SelectionInstruction &insn);
+    virtual void emitWrite64A64Instruction(const SelectionInstruction &insn);
+    virtual void emitAtomicA64Instruction(const SelectionInstruction &insn);
     void emitUntypedReadInstruction(const SelectionInstruction &insn);
     void emitUntypedWriteInstruction(const SelectionInstruction &insn);
+    virtual void emitUntypedReadA64Instruction(const SelectionInstruction &insn);
+    virtual void emitUntypedWriteA64Instruction(const SelectionInstruction &insn);
+    virtual void emitByteGatherA64Instruction(const SelectionInstruction &insn);
+    virtual void emitByteScatterA64Instruction(const SelectionInstruction &insn);
     void emitAtomicInstruction(const SelectionInstruction &insn);
     void emitByteGatherInstruction(const SelectionInstruction &insn);
     void emitByteScatterInstruction(const SelectionInstruction &insn);
@@ -236,7 +243,7 @@ namespace gbe
     void calcGlobalXYZRange(GenRegister& reg, GenRegister& tmp, int flag, int subFlag);
     virtual void subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp);
     virtual void addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp);
-    virtual void emitPrintfLongInstruction(GenRegister& addr, GenRegister& data, GenRegister& src, uint32_t bti);
+    virtual void emitPrintfLongInstruction(GenRegister& addr, GenRegister& data, GenRegister& src, uint32_t bti, bool useSends);
     CompileErrorCode errCode;
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index bcbb23f..c34e1bb 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -54,6 +54,7 @@
 #include <stdint.h>
 #include "backend/gen7_instruction.hpp"
 #include "backend/gen8_instruction.hpp"
+#include "backend/gen9_instruction.hpp"
 // Gen EU defines
@@ -129,6 +130,7 @@ enum opcode {
   GEN_OPCODE_F32TO16 = 19,
   GEN_OPCODE_F16TO32 = 20,
   GEN_OPCODE_IF = 34,
@@ -147,6 +149,7 @@ enum opcode {
@@ -357,6 +360,14 @@ enum GenMessageTarget {
 #define GEN75_P1_ATOMIC_COUNTER_4X2    12 //1100: Atomic Counter Operation 4X2
 #define GEN75_P1_TYPED_SURFACE_WRITE   13 //1101: Typed Surface Write
+#define GEN8_P1_BLOCK_READ_A64        20 //10100
+#define GEN8_P1_BLOCK_WRITE_A64       21 //10101
+#define GEN8_P1_BYTE_GATHER_A64       16 //10000
+#define GEN8_P1_UNTYPED_READ_A64      17 //10001
+#define GEN8_P1_UNTYPED_ATOMIC_A64    18 //10010
+#define GEN8_P1_UNTYPED_WRITE_A64     25 //11001
+#define GEN8_P1_BYTE_SCATTER_A64      26 //11010
 /* Data port data cache scratch messages*/
 #define GEN_SCRATCH_READ                  0
 #define GEN_SCRATCH_WRITE                 1
@@ -417,6 +428,7 @@ enum GenMessageTarget {
 #define GEN5_SAMPLER_MESSAGE_SAMPLE_LD           7
+#define GEN_SAMPLER_MESSAGE_CACHE_FLUSH          0x1f
 /* for GEN5 only */
 #define GEN_SAMPLER_SIMD_MODE_SIMD4X2                   0
@@ -549,6 +561,7 @@ union GenNativeInstruction
   union Gen7NativeInstruction gen7_insn;
   union Gen8NativeInstruction gen8_insn;
+  union Gen9NativeInstruction gen9_insn;
   //Gen7 & Gen8 common field
   struct {
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 975e1c7..03ce0e2 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -203,7 +203,6 @@ namespace gbe
                                         unsigned msg_length, unsigned response_length,
                                         bool header_present, bool end_of_thread)
-     setSrc1(inst, GenRegister::immd(0));
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
@@ -238,8 +237,7 @@ namespace gbe
-  static void setDPByteScatterGather(GenEncoder *p,
-                                     GenNativeInstruction *insn,
+  void GenEncoder::setDPByteScatterGather(GenNativeInstruction *insn,
                                      uint32_t bti,
                                      uint32_t elem_size,
                                      uint32_t msg_type,
@@ -247,44 +245,59 @@ namespace gbe
                                      uint32_t response_length)
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
-    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_byte_rw.msg_type = msg_type;
     insn->bits3.gen7_byte_rw.bti = bti;
     insn->bits3.gen7_byte_rw.data_size = elem_size;
-    if (p->curr.execWidth == 8)
+    if (curr.execWidth == 8)
       insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD8;
-    else if (p->curr.execWidth == 16)
+    else if (curr.execWidth == 16)
       insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD16;
-  static void setOBlockRW(GenEncoder *p,
-                          GenNativeInstruction *insn,
-                          uint32_t bti,
-                          uint32_t size,
-                          uint32_t msg_type,
-                          uint32_t msg_length,
-                          uint32_t response_length)
+  void GenEncoder::setOBlockRW(GenNativeInstruction *insn,
+                               uint32_t bti,
+                               uint32_t block_size,
+                               uint32_t msg_type,
+                               uint32_t msg_length,
+                               uint32_t response_length)
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
-    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
-    assert(size == 2 || size == 4 || size == 8);
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_oblock_rw.msg_type = msg_type;
     insn->bits3.gen7_oblock_rw.bti = bti;
-    insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 3 : 4);
+    insn->bits3.gen7_oblock_rw.block_size = block_size;
     insn->bits3.gen7_oblock_rw.header_present = 1;
-  static void setMBlockRW(GenEncoder *p,
-                          GenNativeInstruction *insn,
-                          uint32_t bti,
-                          uint32_t msg_type,
-                          uint32_t msg_length,
-                          uint32_t response_length)
+  uint32_t GenEncoder::getOBlockSize(uint32_t oword_size, bool low_half)
+  {
+    /* Translate an OWord count into the encoded block-size value:
+     *   1 OWord  -> 0 (low 128 bits of the register) or 1 (high 128 bits),
+     *               selected by low_half
+     *   2 OWords -> 2
+     *   4 OWords -> 3
+     *   8 OWords -> 4 */
+    if (oword_size == 1)
+      return low_half ? 0 : 1;
+    if (oword_size == 2)
+      return 2;
+    if (oword_size == 4)
+      return 3;
+    if (oword_size == 8)
+      return 4;
+    /* Every other count is rejected; 0 is returned if NOT_SUPPORTED falls through. */
+    NOT_SUPPORTED;
+    return 0;
+  }
+  void GenEncoder::setMBlockRW(GenNativeInstruction *insn,
+                               uint32_t bti,
+                               uint32_t msg_type,
+                               uint32_t msg_length,
+                               uint32_t response_length)
     const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
-    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_mblock_rw.msg_type = msg_type;
     insn->bits3.gen7_mblock_rw.bti = bti;
     insn->bits3.gen7_mblock_rw.header_present = 1;
@@ -411,7 +424,31 @@ namespace gbe
     return insn->bits3.ud;
-  void GenEncoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+  unsigned GenEncoder::generateUntypedWriteSendsMessageDesc(unsigned bti, unsigned elemNum) {
+    // Fill a zeroed, throw-away instruction and hand back whatever
+    // setUntypedWriteSendsMessageDesc() returns for it.
+    GenNativeInstruction scratch;
+    memset(&scratch, 0, sizeof(scratch));
+    const unsigned desc = setUntypedWriteSendsMessageDesc(&scratch, bti, elemNum);
+    return desc;
+  }
+  /* Not implemented in this encoder: unconditionally asserts, and returns 0 when
+   * assertions are compiled out. Declared virtual in gen_encoder.hpp;
+   * presumably overridden by encoders that support split sends -- confirm. */
+  unsigned GenEncoder::setUntypedWriteSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum)
+  {
+    assert(0);
+    return 0;
+  }
+  /* Not implemented in this encoder: unconditionally asserts. Declared virtual in
+   * gen_encoder.hpp; presumably overridden by encoders with A64 support -- confirm. */
+  void GenEncoder::UNTYPED_READA64(GenRegister dst, GenRegister src, uint32_t elemNum) {
+    assert(0);
+  }
+  /* Not implemented in this encoder: unconditionally asserts. Declared virtual in
+   * gen_encoder.hpp; presumably overridden by encoders with A64 support -- confirm. */
+  void GenEncoder::UNTYPED_WRITEA64(GenRegister src, uint32_t elemNum){
+    assert(0);
+  }
+  /* Not implemented in this encoder: unconditionally asserts. Declared virtual in
+   * gen_encoder.hpp; presumably overridden by encoders with A64 support -- confirm. */
+  void GenEncoder::ATOMICA64(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+    assert(0);
+  }
+  void GenEncoder::UNTYPED_WRITE(GenRegister msg, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
@@ -449,8 +486,7 @@ namespace gbe
       response_length = 2;
     } else
-    setDPByteScatterGather(this,
-                           insn,
+    setDPByteScatterGather(insn,
@@ -482,6 +518,18 @@ namespace gbe
     return setByteScatterMessageDesc(&insn, bti, elemSize);
+  unsigned GenEncoder::generateByteScatterSendsMessageDesc(unsigned bti, unsigned elemSize) {
+    // Fill a zeroed, throw-away instruction and hand back whatever
+    // setByteScatterSendsMessageDesc() returns for it.
+    GenNativeInstruction scratch;
+    memset(&scratch, 0, sizeof(scratch));
+    const unsigned desc = setByteScatterSendsMessageDesc(&scratch, bti, elemSize);
+    return desc;
+  }
+  /* Not implemented in this encoder: unconditionally asserts, and returns 0 when
+   * assertions are compiled out. Declared virtual in gen_encoder.hpp;
+   * presumably overridden by encoders that support split sends -- confirm. */
+  unsigned GenEncoder::setByteScatterSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize)
+  {
+    assert(0);
+    return 0;
+  }
   unsigned GenEncoder::setByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
@@ -492,8 +540,7 @@ namespace gbe
     } else
-    setDPByteScatterGather(this,
-                           insn,
+    setDPByteScatterGather(insn,
@@ -502,7 +549,7 @@ namespace gbe
     return insn->bits3.ud;
-  void GenEncoder::BYTE_SCATTER(GenRegister msg, GenRegister bti, uint32_t elemSize) {
+  void GenEncoder::BYTE_SCATTER(GenRegister msg, GenRegister data, GenRegister bti, uint32_t elemSize, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
@@ -524,6 +571,13 @@ namespace gbe
       this->setSrc1(insn, bti);
+  /* Not implemented in this encoder: unconditionally asserts. Declared virtual in
+   * gen_encoder.hpp; presumably overridden by encoders with A64 support -- confirm. */
+  void GenEncoder::BYTE_GATHERA64(GenRegister dst, GenRegister src, uint32_t elemSize) {
+    assert(0);
+  }
+  /* Not implemented in this encoder: unconditionally asserts. Declared virtual in
+   * gen_encoder.hpp; presumably overridden by encoders with A64 support -- confirm. */
+  void GenEncoder::BYTE_SCATTERA64(GenRegister src, uint32_t elemSize){
+    assert(0);
+  }
   void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
@@ -589,15 +643,19 @@ namespace gbe
     return insn->bits3.ud;
+  /* Not implemented in this encoder: hits GBE_ASSERT(0) (project assertion macro,
+   * presumably fatal in checked builds -- confirm) and otherwise returns 0.
+   * Declared virtual in gen_encoder.hpp. */
+  unsigned GenEncoder::setAtomicA64MessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum, int type_long) {
+    GBE_ASSERT(0);
+    return 0;
+  }
-  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
     this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(addr.nr, 0));
     if (bti.file == GEN_IMMEDIATE_VALUE) {
       this->setSrc1(insn, GenRegister::immud(0));
       setAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
@@ -795,6 +853,7 @@ namespace gbe
  // ALU2(BRC)
  // ALU1(ENDIF)
  //  ALU1(IF)
@@ -873,27 +932,30 @@ namespace gbe
      this->setDst(insn, GenRegister::null());
      this->setSrc0(insn, src);
+     this->setSrc1(insn, GenRegister::immud(0));
      setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
      insn->bits3.msg_gateway.sub_function_id = GEN_BARRIER_MSG;
      insn->bits3.msg_gateway.notify = 0x1;
   void GenEncoder::FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN) {
-     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-     this->setHeader(insn);
-     this->setDst(insn, GenRegister::null());
-     this->setSrc0(insn, src);
-     setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
-     insn->bits3.msg_gateway.sub_function_id = GEN_FORWARD_MSG;
-     GBE_ASSERT(notifyN <= 2);
-     insn->bits3.msg_gateway.notify = notifyN;
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::null());
+    this->setSrc0(insn, src);
+    this->setSrc1(insn, GenRegister::immud(0));
+    setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
+    insn->bits3.msg_gateway.sub_function_id = GEN_FORWARD_MSG;
+    GBE_ASSERT(notifyN <= 2);
+    insn->bits3.msg_gateway.notify = notifyN;
-  void GenEncoder::FENCE(GenRegister dst) {
+  void GenEncoder::FENCE(GenRegister dst, bool flushRWCache) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     this->setDst(insn, dst);
     this->setSrc0(insn, dst);
+    this->setSrc1(insn, GenRegister::immud(0));
     setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA, 1, 1, 1);
     insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
     insn->bits3.gen7_memory_fence.commit_enable = 0x1;
@@ -1154,11 +1216,16 @@ namespace gbe
      this->setDst(insn, dest);
      this->setSrc0(insn, msg);
+     this->setSrc1(insn, GenRegister::immud(0));
      setSamplerMessage(insn, bti, sampler, msg_type,
                        response_length, msg_length,
                        simd_mode, return_format);
+  /* Sampler-cache flush is not available in this encoder: unconditionally asserts.
+   * Declared virtual in gen_encoder.hpp. */
+  void GenEncoder::FLUSH_SAMPLERCACHE(GenRegister dst) {
+    // Only Gen8+ supports flushing the sampler cache.
+    assert(0);
+  }
   void GenEncoder::setVmeMessage(GenNativeInstruction *insn,
                                 unsigned char bti,
@@ -1200,19 +1267,21 @@ namespace gbe
     this->setDst(insn, dest);
     this->setSrc0(insn, msg);
+    this->setSrc1(insn, GenRegister::immud(0));
     setVmeMessage(insn, bti, response_length, msg_length,
                   msg_type, vme_search_path_lut, lut_sub);
-  void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
+  void GenEncoder::TYPED_WRITE(GenRegister msg, GenRegister data, bool header_present, unsigned char bti, bool useSends)
-     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-     uint32_t msg_type = GEN_TYPED_WRITE;
-     uint32_t msg_length = header_present ? 9 : 8;
-     this->setHeader(insn);
-     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
-     this->setSrc0(insn, msg);
-     setTypedWriteMessage(insn, bti, msg_type, msg_length, header_present);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t msg_type = GEN_TYPED_WRITE;
+    uint32_t msg_length = header_present ? 9 : 8;
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    this->setSrc0(insn, msg);
+    this->setSrc1(insn, GenRegister::immud(0));
+    setTypedWriteMessage(insn, bti, msg_type, msg_length, header_present);
   static void setScratchMessage(GenEncoder *p,
                                    GenNativeInstruction *insn,
@@ -1258,72 +1327,86 @@ namespace gbe
      setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
-  void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+  void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     const uint32_t msg_length = 1;
-    const uint32_t response_length = size / 2; // Size is in regs
+    uint32_t sizeinreg = ow_size / 2;
+    // half reg should also have size 1
+    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+    const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
+    const uint32_t response_length = sizeinreg; // Size is in reg
     this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
-    setOBlockRW(this,
-                insn,
+    setOBlockRW(insn,
-                size,
+                block_size,
-  void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+  void GenEncoder::OBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t ow_size, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    const uint32_t msg_length = 1 + size / 2; // Size is in owords
+    uint32_t sizeinreg = ow_size / 2;
+    // half reg should also have size 1
+    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+    const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
     const uint32_t response_length = 0;
+    const uint32_t block_size = getOBlockSize(ow_size);
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
-    setOBlockRW(this,
-                insn,
+    setOBlockRW(insn,
-                size,
+                block_size,
-  void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+  void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     const uint32_t msg_length = 1;
-    const uint32_t response_length = size; // Size of registers
+    const uint32_t response_length = response_size; // Size of registers
     this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
-    setMBlockRW(this,
-                insn,
+    setMBlockRW(insn,
-  void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+  void GenEncoder::MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t data_size, bool useSends) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    const uint32_t msg_length = 1 + size;
+    const uint32_t msg_length = 1 + data_size;
     const uint32_t response_length = 0; // Size of registers
     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
-    setMBlockRW(this,
-                insn,
+    setMBlockRW(insn,
+  /* A64 OBlock read: empty in this encoder, so no instruction is emitted.
+   * NOTE(review): the other A64 stubs in this file assert(0); this one silently
+   * does nothing -- confirm that is intended. The header names the last
+   * parameter ow_size, not elemSize. */
+  void GenEncoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize) {
+  }
+  /* A64 OBlock write: empty in this encoder, so no instruction is emitted.
+   * NOTE(review): the other A64 stubs in this file assert(0); this one silently
+   * does nothing -- confirm that is intended. The header names the last
+   * parameter ow_size, not elemSize. */
+  void GenEncoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize) {
+  }
   void GenEncoder::EOT(uint32_t msg) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 0f835ca..3e45c81 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -131,6 +131,7 @@ namespace gbe
+    ALU1(BFREV)
 #undef ALU1
 #undef ALU2
 #undef ALU2_MOD
@@ -144,7 +145,7 @@ namespace gbe
     /*! Forward the gateway message. */
     void FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN = 0);
     /*! Memory fence message (to order loads and stores between threads) */
-    void FENCE(GenRegister dst);
+    virtual void FENCE(GenRegister dst, bool flushRWCache);
     /*! Jump indexed instruction */
     virtual void JMPI(GenRegister src, bool longjmp = false);
     /*! IF indexed instruction */
@@ -170,15 +171,25 @@ namespace gbe
     /*! Wait instruction (used for the barrier) */
     void WAIT(uint32_t n = 0);
     /*! Atomic instructions */
-    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister addr, GenRegister data, GenRegister bti, uint32_t srcNum, bool useSends);
+    /*! AtomicA64 instructions */
+    virtual void ATOMICA64(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
     /*! Untyped read (upto 4 channels) */
     virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
-    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends);
+    /*! Untyped read A64(upto 4 channels) */
+    virtual void UNTYPED_READA64(GenRegister dst, GenRegister src, uint32_t elemNum);
+    /*! Untyped write A64 (upto 4 channels) */
+    virtual void UNTYPED_WRITEA64(GenRegister src, uint32_t elemNum);
     /*! Byte gather (for unaligned bytes, shorts and ints) */
     void BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
-    void BYTE_SCATTER(GenRegister src, GenRegister bti, uint32_t elemSize);
+    virtual void BYTE_SCATTER(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemSize, bool useSends);
+    /*! Byte gather a64 (for unaligned bytes, shorts and ints) */
+    virtual void BYTE_GATHERA64(GenRegister dst, GenRegister src, uint32_t elemSize);
+    /*! Byte scatter a64 (for unaligned bytes, shorts and ints) */
+    virtual void BYTE_SCATTERA64(GenRegister src, uint32_t elemSize);
     /*! DWord gather (for constant cache read) */
     void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
     /*! for scratch memory read */
@@ -219,11 +230,14 @@ namespace gbe
                           uint32_t msg_type,
                           unsigned char vme_search_path_lut,
                           unsigned char lut_sub);
+    virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
     /*! TypedWrite instruction for texture */
     virtual void TYPED_WRITE(GenRegister header,
+                             GenRegister data,
                              bool header_present,
-                             unsigned char bti);
+                             unsigned char bti,
+                             bool useSends);
     /*! Extended math function (2 sources) */
     void MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1);
     /*! Extended math function (1 source) */
@@ -235,6 +249,8 @@ namespace gbe
     // Helper functions to encode
+    void setDPByteScatterGather(GenNativeInstruction *insn, uint32_t bti, uint32_t elem_size,
+                                     uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
     virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
                                 uint32_t msg_type, uint32_t msg_length,
                                 uint32_t response_length);
@@ -245,16 +261,21 @@ namespace gbe
                               unsigned msg_length, unsigned response_length,
                               bool header_present = false, bool end_of_thread = false);
     virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+    virtual unsigned setAtomicA64MessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum, int type_long);
     virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
     virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    virtual unsigned setUntypedWriteSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
     unsigned setByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
     unsigned setByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+    virtual unsigned setByteScatterSendsMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
     unsigned generateAtomicMessageDesc(unsigned function, unsigned bti, unsigned srcNum);
     unsigned generateUntypedReadMessageDesc(unsigned bti, unsigned elemNum);
     unsigned generateUntypedWriteMessageDesc(unsigned bti, unsigned elemNum);
+    unsigned generateUntypedWriteSendsMessageDesc(unsigned bti, unsigned elemNum);
     unsigned generateByteGatherMessageDesc(unsigned bti, unsigned elemSize);
     unsigned generateByteScatterMessageDesc(unsigned bti, unsigned elemSize);
+    unsigned generateByteScatterSendsMessageDesc(unsigned bti, unsigned elemSize);
     virtual void setHeader(GenNativeInstruction *insn) = 0;
     virtual void setDst(GenNativeInstruction *insn, GenRegister dest) = 0;
@@ -267,14 +288,24 @@ namespace gbe
     virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
                             GenRegister src1 = GenRegister::null());
     virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
+    /*! OBlock helper function */
+    uint32_t getOBlockSize(uint32_t oword_size, bool low_half = true);
+    void setMBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
+    /*! Fill the OBlock read/write message descriptor; block_size is the encoded value stored in the block_size field */
+    void setOBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t block_size, uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
     /*! OBlock read */
-    void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size);
     /*! OBlock write */
-    void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+    virtual void OBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t ow_size, bool useSends);
     /*! MBlock read */
-    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size);
     /*! MBlock write */
-    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+    virtual void MBWRITE(GenRegister header, GenRegister data, uint32_t bti, uint32_t data_size, bool useSends);
+    /*! A64 OBlock read */
+    virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size);
+    /*! A64 OBlock write */
+    virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size);
     GBE_CLASS(GenEncoder); //!< Use custom allocators
     virtual void alu3(uint32_t opcode, GenRegister dst,
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index c396626..c75557c 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -26,8 +26,14 @@ DECL_GEN7_SCHEDULE(Barrier,         80,        1,        1)
 DECL_GEN7_SCHEDULE(Fence,           80,        1,        1)
 DECL_GEN7_SCHEDULE(Read64,          80,        1,        1)
 DECL_GEN7_SCHEDULE(Write64,         80,        1,        1)
+DECL_GEN7_SCHEDULE(Read64A64,       80,        1,        1)
+DECL_GEN7_SCHEDULE(Write64A64,      80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedRead,     160,       1,        1)
 DECL_GEN7_SCHEDULE(UntypedWrite,    160,       1,        1)
+DECL_GEN7_SCHEDULE(UntypedReadA64,  160,       1,        1)
+DECL_GEN7_SCHEDULE(UntypedWriteA64, 160,       1,        1)
+DECL_GEN7_SCHEDULE(ByteGatherA64,   160,       1,        1)
+DECL_GEN7_SCHEDULE(ByteScatterA64,  160,       1,        1)
 DECL_GEN7_SCHEDULE(ByteGather,      160,       1,        1)
 DECL_GEN7_SCHEDULE(ByteScatter,     160,       1,        1)
 DECL_GEN7_SCHEDULE(DWordGather,     160,       1,        1)
@@ -41,6 +47,7 @@ DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
 DECL_GEN7_SCHEDULE(SpillReg,        20,        1,        1)
 DECL_GEN7_SCHEDULE(UnSpillReg,      160,       1,        1)
 DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
+DECL_GEN7_SCHEDULE(AtomicA64,       80,        1,        1)
 DECL_GEN7_SCHEDULE(I64MUL,          20,        40,      20)
 DECL_GEN7_SCHEDULE(I64SATADD,       20,        40,      20)
 DECL_GEN7_SCHEDULE(I64SATSUB,       20,        40,      20)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 2b89c7f..22b0ddc 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -144,6 +144,7 @@ namespace gbe
       case GEN_TYPE_UL: return TYPE_U64;
       case GEN_TYPE_F: return TYPE_FLOAT;
       case GEN_TYPE_DF: return TYPE_DOUBLE;
+      case GEN_TYPE_HF : return TYPE_HALF;
       default: NOT_SUPPORTED; return TYPE_FLOAT;
@@ -168,7 +169,7 @@ namespace gbe
   SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) :
     parent(NULL), opcode(op), dstNum(dst), srcNum(src)
-    extra.function = 0;
+    extra = { 0 };
   void SelectionInstruction::prepend(SelectionInstruction &other) {
@@ -183,9 +184,13 @@ namespace gbe
   bool SelectionInstruction::isRead(void) const {
     return this->opcode == SEL_OP_UNTYPED_READ ||
+           this->opcode == SEL_OP_UNTYPED_READA64 ||
            this->opcode == SEL_OP_READ64       ||
+           this->opcode == SEL_OP_READ64A64       ||
            this->opcode == SEL_OP_ATOMIC       ||
+           this->opcode == SEL_OP_ATOMICA64       ||
            this->opcode == SEL_OP_BYTE_GATHER  ||
+           this->opcode == SEL_OP_BYTE_GATHERA64  ||
            this->opcode == SEL_OP_SAMPLE ||
            this->opcode == SEL_OP_VME ||
            this->opcode == SEL_OP_DWORD_GATHER ||
@@ -209,9 +214,13 @@ namespace gbe
   bool SelectionInstruction::isWrite(void) const {
     return this->opcode == SEL_OP_UNTYPED_WRITE ||
+           this->opcode == SEL_OP_UNTYPED_WRITEA64 ||
            this->opcode == SEL_OP_WRITE64       ||
+           this->opcode == SEL_OP_WRITE64A64       ||
            this->opcode == SEL_OP_ATOMIC        ||
+           this->opcode == SEL_OP_ATOMICA64        ||
            this->opcode == SEL_OP_BYTE_SCATTER  ||
+           this->opcode == SEL_OP_BYTE_SCATTERA64  ||
            this->opcode == SEL_OP_TYPED_WRITE ||
            this->opcode == SEL_OP_OBWRITE ||
            this->opcode == SEL_OP_MBWRITE;
@@ -225,6 +234,50 @@ namespace gbe
     return this->opcode == SEL_OP_LABEL;
+  bool SelectionInstruction::sameAsDstRegion(uint32_t srcID) {
+    assert(srcID < srcNum);
+    if (dstNum == 0)
+      return true;
+    GenRegister &srcReg = this->src(srcID);
+    for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+      const GenRegister &dstReg = this->dst(dstID);
+      if (!dstReg.isSameRegion(srcReg))
+        return false;
+    }
+    return true;
+  }
+  bool SelectionInstruction::isNative(void) const {
+    return this->opcode == SEL_OP_NOT         || /* ALU1 */
+           this->opcode == SEL_OP_LZD         ||
+           this->opcode == SEL_OP_RNDZ        ||
+           this->opcode == SEL_OP_RNDE        ||
+           this->opcode == SEL_OP_RNDD        ||
+           this->opcode == SEL_OP_RNDU        ||
+           this->opcode == SEL_OP_FRC         ||
+           this->opcode == SEL_OP_F16TO32     ||
+           this->opcode == SEL_OP_F32TO16     ||
+           this->opcode == SEL_OP_CBIT        ||
+           this->opcode == SEL_OP_SEL         || /* ALU2 */
+           this->opcode == SEL_OP_AND         ||
+           this->opcode == SEL_OP_OR          ||
+           this->opcode == SEL_OP_XOR         ||
+           this->opcode == SEL_OP_SHR         ||
+           this->opcode == SEL_OP_SHL         ||
+           this->opcode == SEL_OP_RSR         ||
+           this->opcode == SEL_OP_RSL         ||
+           this->opcode == SEL_OP_ASR         ||
+           this->opcode == SEL_OP_SEL         ||
+           this->opcode == SEL_OP_ADD         ||
+           this->opcode == SEL_OP_MUL         ||
+           this->opcode == SEL_OP_FBH         ||
+           this->opcode == SEL_OP_FBL         ||
+           this->opcode == SEL_OP_MACH        ||
+           this->opcode == SEL_OP_MATH        ||
+           this->opcode == SEL_OP_LRP         || /* ALU3 */
+           this->opcode == SEL_OP_MAD;
+  }
   // SelectionVector
@@ -237,7 +290,7 @@ namespace gbe
   // SelectionBlock
-  SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb), isLargeBlock(false), endifLabel( (ir::LabelIndex) 0), removeSimpleIfEndif(false){}
+  SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb), endifLabel( (ir::LabelIndex) 0), removeSimpleIfEndif(false){}
   void SelectionBlock::append(ir::Register reg) { tmp.push_back(reg); }
@@ -364,7 +417,9 @@ namespace gbe
     /*! spill a register (insert spill/unspill instructions) */
     INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
     bool has32X32Mul() const { return bHas32X32Mul; }
+    bool hasSends() const { return bHasSends; }
     void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
+    void setHasSends(bool b) { bHasSends = b; }
     bool hasLongType() const { return bHasLongType; }
     bool hasDoubleType() const { return bHasDoubleType; }
     bool hasHalfType() const { return bHasHalfType; }
@@ -570,6 +625,7 @@ namespace gbe
+    ALU1(BFREV)
 #undef ALU1
 #undef ALU1WithTemp
 #undef ALU2
@@ -632,10 +688,16 @@ namespace gbe
     void WAIT(uint32_t n = 0);
     /*! Atomic instruction */
     void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, GenRegister bti, vector<GenRegister> temps);
+    /*! AtomicA64 instruction */
+    void ATOMICA64(Reg dst, uint32_t function, uint32_t srcNum, vector<GenRegister> src, GenRegister bti, vector<GenRegister> temps);
     /*! Read 64 bits float/int array */
     void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, const GenRegister bti, bool native_long, vector<GenRegister> temps);
     /*! Write 64 bits float/int array */
     void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, GenRegister bti, bool native_long, vector<GenRegister> temps);
+    /*! Read64 A64 */
+    void READ64A64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum);
+    /*! Write64 A64 */
+    void WRITE64A64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum);
     /*! Untyped read (up to 4 elements) */
     void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, GenRegister bti, vector<GenRegister> temps);
     /*! Untyped write (up to 4 elements) */
@@ -644,6 +706,14 @@ namespace gbe
     void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, vector<GenRegister> temps);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
     void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, vector <GenRegister> temps);
+    /*! Byte gather a64 (for unaligned bytes, shorts and ints) */
+    void BYTE_GATHERA64(Reg dst, Reg addr, uint32_t elemSize);
+    /*! Byte scatter a64 (for unaligned bytes, shorts and ints) */
+    void BYTE_SCATTERA64(GenRegister *msg, unsigned msgNum, uint32_t elemSize);
+    /*! Untyped read a64 (up to 4 elements) */
+    void UNTYPED_READA64(Reg addr, const GenRegister *dst, uint32_t dstNum, uint32_t elemNum);
+    /*! Untyped write a64 (up to 4 elements) */
+    void UNTYPED_WRITEA64(const GenRegister *msgs, uint32_t msgNum, uint32_t elemNum);
     /*! DWord scatter (for constant cache read) */
     void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
     /*! Unpack the uint to charN */
@@ -681,7 +751,7 @@ namespace gbe
     /*! Store the profiling info */
     void STORE_PROFILING(uint32_t profilingType, uint32_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister ts[5], int tsNum);
     /*! Printf */
-    void PRINTF(GenRegister dst, uint8_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister src[8],
+    void PRINTF(uint8_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister src[8],
                 int srcNum, uint16_t num, bool isContinue, uint32_t totalSize);
     /*! Multiply 64-bit integers */
     void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool native_long);
@@ -696,19 +766,19 @@ namespace gbe
                       GenRegister tmpData1,
                       GenRegister localThreadID, GenRegister localThreadNUM,
                       GenRegister tmpData2, GenRegister slmOff,
-                      vector<GenRegister> msg, uint32_t msgSizeReq,
+                      vector<GenRegister> msg,
                       GenRegister localBarrier);
     /*! Sub Group Operations */
     void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
                       GenRegister tmpData1, GenRegister tmpData2);
     /*! Oblock read */
-    void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+    void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t ow_size);
     /*! Oblock write */
-    void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+    void OBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t ow_size);
     /*! Media block read */
-    void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+    void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t response_size);
     /*! Media block write */
-    void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+    void MBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t data_size);
     /* common functions for both binary instruction and sel_cmp and compare instruction.
        It will handle the IMM or normal register assignment, and will try to avoid LOADI
@@ -777,9 +847,9 @@ namespace gbe
             GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
-    INLINE vector<GenRegister> getBTITemps(const ir::BTI &bti) {
+    INLINE vector<GenRegister> getBTITemps(const ir::AddressMode &AM) {
       vector<GenRegister> temps;
-      if (!bti.isConst) {
+      if (AM == ir::AM_DynamicBti) {
         temps.push_back(selReg(reg(ir::FAMILY_WORD, true), ir::TYPE_U16));
         temps.push_back(selReg(reg(ir::FAMILY_DWORD, true), ir::TYPE_U32));
@@ -798,6 +868,7 @@ namespace gbe
     bool bHasDoubleType;
     bool bHasHalfType;
     bool bLongRegRestrict;
+    bool bHasSends;
     uint32_t ldMsgOrder;
     bool slowByteGather;
     INLINE ir::LabelIndex newAuxLabel()
@@ -840,7 +911,7 @@ namespace gbe
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
     stateNum(0), vectorNum(0), bwdCodeGeneration(false), storeThreadMap(false),
     currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false), bHasLongType(false),
-    bHasDoubleType(false), bHasHalfType(false), bLongRegRestrict(false),
+    bHasDoubleType(false), bHasHalfType(false), bLongRegRestrict(false), bHasSends(false),
     ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false)
     const ir::Function &fn = ctx.getFunction();
@@ -1094,7 +1165,7 @@ namespace gbe
         mov->state.predicate = GEN_PREDICATE_NORMAL;
         mov->state.flag = 0;
-        mov->state.subFlag = 0;
+        mov->state.subFlag = 1;
       if (this->isScalarReg(insn->src(regID).reg()))
         mov->state.noMask = 1;
@@ -1128,7 +1199,7 @@ namespace gbe
         mov->state.predicate = GEN_PREDICATE_NORMAL;
         mov->state.flag = 0;
-        mov->state.subFlag = 0;
+        mov->state.subFlag = 1;
       if (simdWidth == 1) {
         mov->state.noMask = 1;
@@ -1318,6 +1389,48 @@ namespace gbe
     insn->extra.function = function;
     insn->extra.elem = msgPayload;
+    if (hasSends() && msgPayload > 1) {
+      insn->extra.splitSend = 1;
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = 1;
+      vector->offsetID = 0;
+      vector->reg = &insn->src(0);
+      vector->isSrc = 1;
+      vector = this->appendVector();
+      vector->regNum = msgPayload - 1;
+      vector->offsetID = 1;
+      vector->reg = &insn->src(1);
+      vector->isSrc = 1;
+    } else {
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = msgPayload; //bti not included in SelectionVector
+      vector->offsetID = 0;
+      vector->reg = &insn->src(0);
+      vector->isSrc = 1;
+    }
+  }
+  void Selection::Opaque::ATOMICA64(Reg dst, uint32_t function,
+                                 uint32_t msgPayload, vector<GenRegister> src,
+                                 GenRegister bti,
+                                 vector<GenRegister> temps) {
+    unsigned dstNum = 1 + temps.size();
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMICA64, dstNum, msgPayload + 1);
+    insn->dst(0) = dst;
+    if(temps.size()) {
+      insn->dst(1) = temps[0];
+      insn->dst(2) = temps[1];
+    }
+    for (uint32_t elemID = 0; elemID < msgPayload; ++elemID)
+      insn->src(elemID) = src[elemID];
+    insn->src(msgPayload) = bti;
+    insn->extra.function = function;
+    insn->extra.elem = msgPayload;
     SelectionVector *vector = this->appendVector();
     vector->regNum = msgPayload; //bti not included in SelectionVector
     vector->offsetID = 0;
@@ -1399,6 +1512,39 @@ namespace gbe
     srcVector->reg = &insn->src(0);
+  void Selection::Opaque::READ64A64(Reg addr,
+                                 const GenRegister *dst,
+                                 const GenRegister *tmp,
+                                 uint32_t elemNum)
+  {
+    SelectionInstruction *insn = NULL;
+    SelectionVector *srcVector = NULL;
+    SelectionVector *dstVector = NULL;
+    insn = this->appendInsn(SEL_OP_READ64A64,elemNum*2, 1);
+    srcVector = this->appendVector();
+    dstVector = this->appendVector();
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->dst(elemID) = tmp[elemID];
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->dst(elemID + elemNum) = dst[elemID];
+    insn->src(0) = addr;
+    insn->extra.elem = elemNum;
+    dstVector->regNum = elemNum;
+    dstVector->isSrc = 0;
+    dstVector->offsetID = 0;
+    dstVector->reg = &insn->dst(0);
+    srcVector->regNum = 1;
+    srcVector->offsetID = 0;
+    srcVector->isSrc = 1;
+    srcVector->reg = &insn->src(0);
+  }
   void Selection::Opaque::UNTYPED_READ(Reg addr,
                                        const GenRegister *dst,
                                        uint32_t elemNum,
@@ -1439,6 +1585,34 @@ namespace gbe
     srcVector->offsetID = 0;
     srcVector->reg = &insn->src(0);
+  void Selection::Opaque::UNTYPED_READA64(Reg addr,
+                                       const GenRegister *dst,
+                                       uint32_t dstNum,
+                                       uint32_t elemNum)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READA64, dstNum, 1);
+    SelectionVector *srcVector = this->appendVector();
+    SelectionVector *dstVector = this->appendVector();
+    if (this->isScalarReg(dst[0].reg()))
+      insn->state.noMask = 1;
+    // Regular instruction to encode
+    for (uint32_t id = 0; id < dstNum; ++id)
+      insn->dst(id) = dst[id];
+    insn->src(0) = addr;
+    insn->extra.elem = elemNum;
+    // Sends require contiguous allocation
+    dstVector->regNum = dstNum;
+    dstVector->isSrc = 0;
+    dstVector->offsetID = 0;
+    dstVector->reg = &insn->dst(0);
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->offsetID = 0;
+    srcVector->reg = &insn->src(0);
+  }
   void Selection::Opaque::WRITE64(Reg addr,
                                   const GenRegister *src,
@@ -1479,7 +1653,6 @@ namespace gbe
       // dst: srcNum, (flagTemp)
       // src: srcNum, addr, srcNum, bti.
       insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
-      vector = this->appendVector();
       for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
         insn->src(elemID) = src[elemID];
@@ -1500,10 +1673,29 @@ namespace gbe
       insn->extra.elem = srcNum;
-      vector->regNum = srcNum + 1;
-      vector->offsetID = srcNum;
-      vector->reg = &insn->src(srcNum);
-      vector->isSrc = 1;
+      if (hasSends()) {
+        insn->extra.splitSend = 1;
+        //addr regs
+        vector = this->appendVector();
+        vector->regNum = 1;
+        vector->offsetID = srcNum;
+        vector->reg = &insn->src(srcNum);
+        vector->isSrc = 1;
+        //data regs
+        vector = this->appendVector();
+        vector->regNum = srcNum;
+        vector->offsetID = srcNum+1;
+        vector->reg = &insn->src(srcNum+1);
+        vector->isSrc = 1;
+      } else {
+        vector = this->appendVector();
+        vector->regNum = srcNum + 1;
+        vector->offsetID = srcNum;
+        vector->reg = &insn->src(srcNum);
+        vector->isSrc = 1;
+      }
     if (bti.file != GEN_IMMEDIATE_VALUE) {
@@ -1512,6 +1704,38 @@ namespace gbe
+  void Selection::Opaque::WRITE64A64(Reg addr,
+                                  const GenRegister *src,
+                                  const GenRegister *tmp,
+                                  uint32_t srcNum)
+  {
+    SelectionVector *vector = NULL;
+    SelectionInstruction *insn = NULL;
+    const uint32_t dstNum = srcNum;
+    insn = this->appendInsn(SEL_OP_WRITE64A64, dstNum, srcNum*2 + 1);
+    vector = this->appendVector();
+    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+      insn->src(elemID) = src[elemID];
+    insn->src(srcNum) = addr;
+    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+      insn->src(srcNum + 1 + elemID) = tmp[elemID];
+    /* We also need to add the tmp register to dst, in order
+       to avoid the post schedule error. */
+    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+      insn->dst(elemID) = tmp[elemID];
+    insn->extra.elem = srcNum;
+    vector->regNum = srcNum + 1;
+    vector->offsetID = srcNum;
+    vector->reg = &insn->src(srcNum);
+    vector->isSrc = 1;
+  }
   void Selection::Opaque::UNTYPED_WRITE(Reg addr,
                                         const GenRegister *src,
                                         uint32_t elemNum,
@@ -1521,7 +1745,6 @@ namespace gbe
     unsigned dstNum = temps.size();
     unsigned srcNum = elemNum + 2 + temps.size();
     SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, dstNum, srcNum);
-    SelectionVector *vector = this->appendVector();
     if (bti.file != GEN_IMMEDIATE_VALUE) {
       insn->state.flag = 0;
@@ -1541,11 +1764,45 @@ namespace gbe
     insn->extra.elem = elemNum;
+    if (hasSends()) {
+      insn->extra.splitSend = 1;
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = elemNum;
+      vector->reg = &insn->src(1);
+      vector->offsetID = 1;
+      vector->isSrc = 1;
+      vector = this->appendVector();
+      vector->regNum = 1;
+      vector->reg = &insn->src(0);
+      vector->offsetID = 0;
+      vector->isSrc = 1;
+    } else {
     // Sends require contiguous allocation for the sources
+      SelectionVector *vector = this->appendVector();
     vector->regNum = elemNum+1;
     vector->reg = &insn->src(0);
     vector->offsetID = 0;
     vector->isSrc = 1;
+    }
+  }
+  void Selection::Opaque::UNTYPED_WRITEA64(const GenRegister *src,
+                                        uint32_t msgNum,
+                                        uint32_t elemNum)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITEA64, 0, msgNum);
+    SelectionVector *vector = this->appendVector();
+    // Regular instruction to encode
+    for (uint32_t id = 0; id < msgNum; ++id)
+      insn->src(id) = src[id];
+    insn->extra.elem = elemNum;
+    // Sends require contiguous allocation for the sources
+    vector->regNum = msgNum;
+    vector->reg = &insn->src(0);
+    vector->offsetID = 0;
+    vector->isSrc = 1;
   void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr,
@@ -1591,7 +1848,6 @@ namespace gbe
                                        GenRegister bti, vector<GenRegister> temps) {
     unsigned dstNum = temps.size();
     SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, dstNum, 3);
-    SelectionVector *vector = this->appendVector();
     if (bti.file != GEN_IMMEDIATE_VALUE) {
       insn->state.flag = 0;
@@ -1608,8 +1864,59 @@ namespace gbe
     insn->src(2) = bti;
     insn->extra.elem = elemSize;
-    // value and address are contiguous in the send
-    vector->regNum = 2;
+    if (hasSends()) {
+      insn->extra.splitSend = 1;
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = 1;
+      vector->isSrc = 1;
+      vector->offsetID = 0;
+      vector->reg = &insn->src(0);
+      vector = this->appendVector();
+      vector->regNum = 1;
+      vector->isSrc = 1;
+      vector->offsetID = 1;
+      vector->reg = &insn->src(1);
+    } else {
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = 2;
+      vector->isSrc = 1;
+      vector->offsetID = 0;
+      vector->reg = &insn->src(0);
+    }
+  }
+  void Selection::Opaque::BYTE_GATHERA64(Reg dst, Reg addr, uint32_t elemSize) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHERA64, 1, 1);
+    SelectionVector *srcVector = this->appendVector();
+    SelectionVector *dstVector = this->appendVector();
+    if (this->isScalarReg(dst.reg()))
+      insn->state.noMask = 1;
+    insn->src(0) = addr;
+    insn->dst(0) = dst;
+    insn->extra.elem = elemSize;
+    dstVector->regNum = 1;
+    dstVector->isSrc = 0;
+    dstVector->offsetID = 0;
+    dstVector->reg = &insn->dst(0);
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->offsetID = 0;
+    srcVector->reg = &insn->src(0);
+  }
+  void Selection::Opaque::BYTE_SCATTERA64(GenRegister *msg, uint32_t msgNum, uint32_t elemSize) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTERA64, 0, msgNum);
+    SelectionVector *vector = this->appendVector();
+    for (unsigned i = 0; i < msgNum; i++)
+      insn->src(i) = msg[i];
+    insn->extra.elem = elemSize;
+    vector->regNum = msgNum;
     vector->isSrc = 1;
     vector->offsetID = 0;
     vector->reg = &insn->src(0);
@@ -1930,51 +2237,41 @@ namespace gbe
-  void Selection::Opaque::PRINTF(GenRegister dst, uint8_t bti, GenRegister tmp0, GenRegister tmp1,
+  void Selection::Opaque::PRINTF(uint8_t bti, GenRegister tmp0, GenRegister tmp1,
                GenRegister src[8], int srcNum, uint16_t num, bool isContinue, uint32_t totalSize) {
-    if (isContinue) {
-      SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum + 1);
-      SelectionVector *vector = this->appendVector();
-      for (int i = 0; i < srcNum; i++)
-        insn->src(i) = src[i];
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 2, srcNum);
-      insn->src(srcNum) = tmp0;
+    for (int i = 0; i < srcNum; i++)
+      insn->src(i) = src[i];
-      insn->dst(0) = dst;
-      insn->dst(1) = tmp0;
-      insn->dst(2) = tmp1;
+    insn->dst(0) = tmp0;
+    insn->dst(1) = tmp1;
-      vector->regNum = 2;
-      vector->reg = &insn->dst(1);
+    if (hasSends()) {
+      insn->extra.printfSplitSend = 1;
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = 1;
+      vector->reg = &insn->dst(0);
       vector->offsetID = 0;
       vector->isSrc = 0;
-      insn->extra.printfSize = static_cast<uint16_t>(totalSize);
-      insn->extra.continueFlag = isContinue;
-      insn->extra.printfBTI = bti;
-      insn->extra.printfNum = num;
+      vector = this->appendVector();
+      vector->regNum = 1;
+      vector->reg = &insn->dst(1);
+      vector->offsetID = 1;
+      vector->isSrc = 0;
     } else {
-      SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum);
       SelectionVector *vector = this->appendVector();
-      for (int i = 0; i < srcNum; i++)
-        insn->src(i) = src[i];
-      insn->dst(0) = dst;
-      insn->dst(1) = tmp0;
-      insn->dst(2) = tmp1;
       vector->regNum = 2;
-      vector->reg = &insn->dst(1);
+      vector->reg = &insn->dst(0);
       vector->offsetID = 0;
       vector->isSrc = 0;
-      insn->extra.printfSize = static_cast<uint16_t>(totalSize);
-      insn->extra.continueFlag = isContinue;
-      insn->extra.printfBTI = bti;
-      insn->extra.printfNum = num;
+    insn->extra.printfSize = static_cast<uint16_t>(totalSize);
+    insn->extra.continueFlag = isContinue;
+    insn->extra.printfBTI = bti;
+    insn->extra.printfNum = num;
   void Selection::Opaque::WORKGROUP_OP(uint32_t wg_op,
@@ -1986,19 +2283,11 @@ namespace gbe
                                        GenRegister tmpData2,
                                        GenRegister slmOff,
                                        vector<GenRegister> msg,
-                                       uint32_t msgSizeReq,
                                        GenRegister localBarrier)
     SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 6);
-    SelectionVector *vector = this->appendVector();
-    /* allocate continuous GRF registers for READ/WRITE to SLM */
-    GBE_ASSERT(msg.size() >= msgSizeReq);
-    vector->regNum = msg.size();
-    vector->offsetID = 0;
-    vector->reg = &insn->dst(2);
-    vector->isSrc = 0;
-    insn->extra.workgroupOp = wg_op;
+    insn->extra.wgop.workgroupOp = wg_op;
     insn->dst(0) = dst;
     insn->dst(1) = tmpData1;
@@ -2011,6 +2300,29 @@ namespace gbe
     insn->src(3) = tmpData2;
     insn->src(4) = slmOff;
     insn->src(5) = localBarrier;
+    if (hasSends()) {
+      insn->extra.wgop.splitSend = 1;
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = 1;
+      vector->offsetID = 2;
+      vector->reg = &insn->dst(2);
+      vector->isSrc = 0;
+      vector = this->appendVector();
+      vector->regNum = msg.size() - 1;
+      vector->offsetID = 3;
+      vector->reg = &insn->dst(3);
+      vector->isSrc = 0;
+    } else {
+      /* allocate continuous GRF registers for READ/WRITE to SLM */
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = msg.size();
+      vector->offsetID = 2;
+      vector->reg = &insn->dst(2);
+      vector->isSrc = 0;
+    }
   void Selection::Opaque::SUBGROUP_OP(uint32_t wg_op,
@@ -2021,7 +2333,7 @@ namespace gbe
     SelectionInstruction *insn = this->appendInsn(SEL_OP_SUBGROUP_OP, 2, 2);
-    insn->extra.workgroupOp = wg_op;
+    insn->extra.wgop.workgroupOp = wg_op;
     insn->dst(0) = dst;
     insn->dst(1) = tmpData1;
@@ -2031,119 +2343,115 @@ namespace gbe
   void Selection::Opaque::OBREAD(GenRegister* dsts,
                                  uint32_t vec_size,
-                                 GenRegister addr,
                                  GenRegister header,
                                  uint32_t bti,
-                                 GenRegister* tmp,
-                                 uint32_t tmp_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + vec_size + tmp_size, 1);
+                                 uint32_t ow_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size, 1);
     SelectionVector *vector = this->appendVector();
-    insn->dst(0) = header;
+    insn->src(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i)
-      insn->dst(1 + i) = dsts[i];
-    for (uint32_t i = 0; i < tmp_size; ++i)
-      insn->dst(1 + i + vec_size) = tmp[i];
-    insn->src(0) = addr;
+      insn->dst(i) = dsts[i];
-    insn->extra.elem = vec_size; // number of vector size
+    insn->extra.elem = ow_size; // number of OWord size
     // tmp regs for OWORD read dst
-    vector->regNum = tmp_size;
-    vector->reg = &insn->dst(1 + vec_size);
-    vector->offsetID = 1 + vec_size;
+    vector->regNum = vec_size;
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
     vector->isSrc = 0;
-  void Selection::Opaque::OBWRITE(GenRegister addr,
+  void Selection::Opaque::OBWRITE(GenRegister header,
                                   GenRegister* values,
                                   uint32_t vec_size,
-                                  GenRegister header,
                                   uint32_t bti,
-                                  GenRegister* tmp,
-                                  uint32_t tmp_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1);
-    SelectionVector *vector = this->appendVector();
-    insn->src(0) = addr;
+                                  uint32_t ow_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, vec_size + 1);
+    insn->src(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i)
       insn->src(i + 1) = values[i];
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < tmp_size; ++i)
-      insn->dst(i + 1) = tmp[i];
-    insn->extra.elem = vec_size; // number of vector_size
+    insn->extra.elem = ow_size; // number of OWord_size
+    // For A64 write, we did not add sends support yet.
+    if (hasSends() && bti != 255) {
+      insn->extra.splitSend = 1;
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = 1;
+      vector->reg = &insn->src(0);
+      vector->offsetID = 0;
+      vector->isSrc = 1;
+      vector = this->appendVector();
+      vector->regNum = vec_size;
+      vector->reg = &insn->src(1);
+      vector->offsetID = 1;
+      vector->isSrc = 1;
+    } else {
+      // tmp regs for OWORD write header and values
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = vec_size + 1;
+      vector->reg = &insn->src(0);
+      vector->offsetID = 0;
+      vector->isSrc = 1;
+    }
-    // tmp regs for OWORD read dst
-    vector->regNum = tmp_size + 1;
-    vector->reg = &insn->dst(0);
-    vector->offsetID = 0;
-    vector->isSrc = 0;
   void Selection::Opaque::MBREAD(GenRegister* dsts,
-                                 GenRegister coordx,
-                                 GenRegister coordy,
+                                 uint32_t tmp_size,
                                  GenRegister header,
-                                 GenRegister* tmp,
                                  uint32_t bti,
-                                 uint32_t vec_size) {
+                                 uint32_t response_size) {
-    uint32_t simdWidth = curr.execWidth;
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2);
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < vec_size; ++i) {
-      insn->dst(i + 1) = dsts[i];
-      if(simdWidth == 16)
-        insn->dst(i + vec_size + 1) = tmp[i];
-    }
-    insn->src(0) = coordx;
-    insn->src(1) = coordy;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, tmp_size, 1);
+    insn->src(0) = header;
-    insn->extra.elem = vec_size; // vector size
+    insn->extra.elem = response_size; // send response length
-    // Only in simd 8 the data is in vector form
-    if(simdWidth == 8) {
-      SelectionVector *vector = this->appendVector();
-      vector->regNum = vec_size;
-      vector->reg = &insn->dst(1);
-      vector->offsetID = 1;
-      vector->isSrc = 0;
-    }
-    if(simdWidth == 16)
-    {
-      SelectionVector *vectortmp = this->appendVector();
-      vectortmp->regNum = vec_size;
-      vectortmp->reg = &insn->dst(vec_size + 1);
-      vectortmp->offsetID = vec_size + 1;
-      vectortmp->isSrc = 0;
+    for (uint32_t i = 0; i < tmp_size; ++i) {
+      insn->dst(i) = dsts[i];
+    SelectionVector *vector = this->appendVector();
+    vector->regNum = tmp_size;
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
+    vector->isSrc = 0;
-  void Selection::Opaque::MBWRITE(GenRegister coordx,
-                                  GenRegister coordy,
+  void Selection::Opaque::MBWRITE(GenRegister header,
                                   GenRegister* values,
-                                  GenRegister header,
-                                  GenRegister* tmp,
+                                  uint32_t tmp_size,
                                   uint32_t bti,
-                                  uint32_t vec_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size);
-    SelectionVector *vector = this->appendVector();
-    insn->src(0) = coordx;
-    insn->src(1) = coordy;
-    for (uint32_t i = 0; i < vec_size; ++i)
-      insn->src(2 + i) = values[i];
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < vec_size; ++i)
-      insn->dst(1 + i) = tmp[i];
-    insn->state = this->curr;
+                                  uint32_t data_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 + tmp_size);
+    insn->src(0) = header;
+    for (uint32_t i = 0; i < tmp_size; ++i)
+      insn->src(1 + i) = values[i];
-    insn->extra.elem = vec_size; // vector size
+    insn->extra.elem = data_size; // msg data part size
-    // We need to put the header and the data together
-    vector->regNum = 1 + vec_size;
-    vector->reg = &insn->dst(0);
-    vector->offsetID = 0;
-    vector->isSrc = 0;
+    if (hasSends()) {
+      insn->extra.splitSend = 1;
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = 1;
+      vector->reg = &insn->src(0);
+      vector->offsetID = 0;
+      vector->isSrc = 1;
+      vector = this->appendVector();
+      vector->regNum = tmp_size;
+      vector->reg = &insn->src(1);
+      vector->offsetID = 1;
+      vector->isSrc = 1;
+    } else {
+      // We need to put the header and the data together
+      SelectionVector *vector = this->appendVector();
+      vector->regNum = 1 + tmp_size;
+      vector->reg = &insn->src(0);
+      vector->offsetID = 0;
+      vector->isSrc = 1;
+    }
   // Boiler plate to initialize the selection library at c++ pre-main
@@ -2364,7 +2672,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
             this->curr.predicate = GEN_PREDICATE_NORMAL;
             this->curr.flag = 0;
-            this->curr.subFlag = 0;
+            this->curr.subFlag = 1;
         // If there is no branch at the end of this block.
@@ -2379,7 +2687,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
             this->curr.predicate = GEN_PREDICATE_NONE;
             this->curr.flag = 0;
-            this->curr.subFlag = 0;
+            this->curr.subFlag = 1;
         // If we are in if/endif fix mode, and this block is
@@ -2389,13 +2697,14 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
             this->block->insnList.size() != 0 &&
             this->block->insnList.size() % 1000 == 0 &&
             this->block->endifLabel.value() != 0) {
+          this->curr.flag = 0;
+          this->curr.subFlag = 1;
           ir::LabelIndex jip = this->block->endifLabel;
           this->ENDIF(GenRegister::immd(0), jip);
             this->curr.predicate = GEN_PREDICATE_NORMAL;
             this->IF(GenRegister::immd(0), jip, jip);
-          this->block->isLargeBlock = true;
         // Output the code in the current basic block
@@ -2524,6 +2833,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
+    this->opaque->setHasSends(true);
     opt_features = SIOF_LOGICAL_SRCMOD;
@@ -2545,6 +2855,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
+    this->opaque->setHasSends(true);
     opt_features = SIOF_LOGICAL_SRCMOD;
@@ -2553,7 +2864,6 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     uint32_t elemID = 0;
     uint32_t i;
     SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum);
-    SelectionVector *msgVector = this->appendVector();;
     for( i = 0; i < msgNum; ++i, ++elemID)
       insn->src(elemID) = msgs[i];
@@ -2561,11 +2871,31 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     insn->extra.msglen = msgNum;
     insn->extra.is3DWrite = is3D;
-    // Sends require contiguous allocation
-    msgVector->regNum = msgNum;
-    msgVector->isSrc = 1;
-    msgVector->offsetID = 0;
-    msgVector->reg = &insn->src(0);
+    if (hasSends()) {
+      assert(msgNum == 9);
+      insn->extra.typedWriteSplitSend = 1;
+      //header + coords
+      SelectionVector *msgVector = this->appendVector();
+      msgVector->regNum = 5;
+      msgVector->isSrc = 1;
+      msgVector->offsetID = 0;
+      msgVector->reg = &insn->src(0);
+      //data
+      msgVector = this->appendVector();
+      msgVector->regNum = 4;
+      msgVector->isSrc = 1;
+      msgVector->offsetID = 5;
+      msgVector->reg = &insn->src(5);
+    } else {
+      // Sends require contiguous allocation
+      SelectionVector *msgVector = this->appendVector();
+      msgVector->regNum = msgNum;
+      msgVector->isSrc = 1;
+      msgVector->offsetID = 0;
+      msgVector->reg = &insn->src(0);
+    }
   Selection::~Selection(void) { GBE_DELETE(this->opaque); }
@@ -2847,6 +3177,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
           case ir::OP_FBL: sel.FBL(dst, src); break;
           case ir::OP_CBIT: sel.CBIT(dst, src); break;
           case ir::OP_LZD: sel.LZD(dst, src); break;
+          case ir::OP_BFREV: sel.BFREV(dst, src); break;
           case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
           case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
           case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
@@ -3573,8 +3904,11 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
           reg == ir::ocl::lid1 ||
           reg == ir::ocl::lid2 ||
           reg == ir::ocl::lsize0 ||
-          reg == ir::ocl::lsize1||
-          reg == ir::ocl::lsize2)
+          reg == ir::ocl::lsize1 ||
+          reg == ir::ocl::lsize2 ||
+          reg == ir::ocl::enqlsize0 ||
+          reg == ir::ocl::enqlsize1 ||
+          reg == ir::ocl::enqlsize2)
         return true;
         return false;
@@ -3799,6 +4133,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         return GEN_BYTE_SCATTER_BYTE;
+  ir::Register generateLocalMask(Selection::Opaque &sel, GenRegister addr) {
+    sel.push();
+      ir::Register localMask = sel.reg(ir::FAMILY_BOOL);
+      sel.curr.physicalFlag = 0;
+      sel.curr.modFlag = 1;
+      sel.curr.predicate = GEN_PREDICATE_NONE;
+      sel.curr.flagIndex = localMask;
+      sel.CMP(GEN_CONDITIONAL_L, addr, GenRegister::immud(64*1024));
+    sel.pop();
+    return localMask;
+  }
   class LoadInstructionPattern : public SelectionPattern
@@ -3807,36 +4152,140 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     LoadInstructionPattern(void) : SelectionPattern(1, 1) {
-    void readDWord(Selection::Opaque &sel,
+    bool isReadConstantLegacy(const ir::LoadInstruction &load) const {
+      ir::AddressMode AM = load.getAddressMode();
+      ir::AddressSpace AS = load.getAddressSpace();
+      if (AM != ir::AM_Stateless && AS == ir::MEM_CONSTANT)
+        return true;
+      return false;
+    }
+    void untypedReadStateless(Selection::Opaque &sel,
+                              GenRegister addr,
+                              vector<GenRegister> &dst
+                              ) const {
+      using namespace ir;
+      GenRegister addrQ;
+      unsigned simdWidth = sel.curr.execWidth;
+      unsigned addrBytes = typeSize(addr.type);
+      unsigned valueNum = dst.size();
+      bool isUniform = sel.isScalarReg(dst[0].reg());
+      if (addrBytes == 4) {
+        addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+        sel.MOV(addrQ, addr);
+      } else if (addrBytes == 8) {
+        addrQ = addr;
+      } else
+      if (simdWidth == 8) {
+          sel.UNTYPED_READA64(addrQ, dst.data(), valueNum, valueNum);
+      } else if (simdWidth == 16) {
+        std::vector<GenRegister> tmpData;
+        for (unsigned i = 0; i < (valueNum+1)/2; i++) {
+          tmpData.push_back(sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32));
+        }
+        sel.push();
+          /* first quarter */
+          sel.curr.execWidth = 8;
+          sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+          sel.UNTYPED_READA64(GenRegister::Qn(addrQ, 0), tmpData.data(), (valueNum+1)/2, valueNum);
+          sel.push();
+            if (isUniform)
+              sel.curr.execWidth = 1;
+            for (unsigned k = 0; k < valueNum; k++) {
+              sel.MOV(GenRegister::Qn(dst[k], 0), GenRegister::Qn(tmpData[k/2], k%2));
+            }
+          sel.pop();
+          /* second quarter */
+          sel.curr.execWidth = 8;
+          sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+          sel.UNTYPED_READA64(GenRegister::Qn(addrQ, 1), tmpData.data(), (valueNum+1)/2, valueNum);
+          if (isUniform)
+            sel.curr.execWidth = 1;
+          for (unsigned k = 0; k < valueNum; k++) {
+            sel.MOV(GenRegister::Qn(dst[k], 1), GenRegister::Qn(tmpData[k/2], k%2));
+          }
+        sel.pop();
+      }
+    }
+    void shootUntypedReadMsg(Selection::Opaque &sel,
+                   const ir::LoadInstruction &insn,
                    vector<GenRegister> &dst,
                    GenRegister addr,
                    uint32_t valueNum,
-                   ir::BTI bti) const
+                   ir::AddressSpace addrSpace) const
-        GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
-        sel.UNTYPED_READ(addr, dst.data(), valueNum, b, sel.getBTITemps(bti));
+      using namespace ir;
+      unsigned addrBytes = typeSize(addr.type);
+      AddressMode AM = insn.getAddressMode();
+      /* Note on uniform LoadInstruction values: the all-lanes-active property
+       * (noMask, no predicate) only needs to be enforced when the value is UNIFORM;
+       * if the value is not uniform, just do things under predication or mask. */
+      bool isUniform = sel.isScalarReg(dst[0].reg());
+      sel.push();
+        if (isUniform) {
+          sel.curr.noMask = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+        }
+        vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+        if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+          if (AM == AM_DynamicBti) {
+            Register btiReg = insn.getBtiReg();
+            sel.UNTYPED_READ(addr, dst.data(), valueNum, sel.selReg(btiReg, TYPE_U32), btiTemp);
+          } else {
+            unsigned SI = insn.getSurfaceIndex();
+            sel.UNTYPED_READ(addr, dst.data(), valueNum, GenRegister::immud(SI), btiTemp);
+          }
+        } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn) ) {
+          // stateless mode, local/constant still use bti access
+          unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+          GenRegister addrDW = addr;
+          if (addrBytes == 8)
+            addrDW = convertU64ToU32(sel, addr);
+          sel.UNTYPED_READ(addrDW, dst.data(), valueNum, GenRegister::immud(bti), btiTemp);
+        } else if (addrSpace == ir::MEM_GENERIC) {
+          Register localMask = generateLocalMask(sel, addr);
+          sel.push();
+            sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+            GenRegister addrDW = addr;
+            if (addrBytes == 8)
+              addrDW = convertU64ToU32(sel, addr);
+            sel.UNTYPED_READ(addrDW, dst.data(), valueNum, GenRegister::immud(0xfe), btiTemp);
+            sel.curr.inversePredicate = 1;
+            untypedReadStateless(sel, addr, dst);
+          sel.pop();
+        } else {
+          untypedReadStateless(sel, addr, dst);
+        }
+      sel.pop();
     void emitUntypedRead(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
-                         ir::BTI bti) const
+                         ir::AddressSpace addrSpace) const
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> dst(valueNum);
       for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
-      readDWord(sel, dst, addr, valueNum, bti);
+      shootUntypedReadMsg(sel, insn, dst, addr, valueNum, addrSpace);
     void emitDWordGather(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
-                         ir::BTI bti) const
+                         ir::AddressSpace addrSpace) const
       using namespace ir;
-      GBE_ASSERT(bti.isConst == 1);
       GBE_ASSERT(insn.getValueNum() == 1);
       const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
@@ -3844,7 +4293,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
           sel.curr.noMask = 1;
-          sel.SAMPLE(&dst, 1, &addr, 1, bti.imm, 0, true, true);
+          sel.SAMPLE(&dst, 1, &addr, 1, BTI_CONSTANT, 0, true, true);
@@ -3857,52 +4306,144 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         if (sel.isScalarReg(addr.reg())) {
           sel.curr.noMask = 1;
-        sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
+        if (sel.getRegisterFamily(addr.reg()) == FAMILY_QWORD) {
+          // As we still use an offset instead of an absolute graphics address,
+          // it is safe to convert from u64 to u32.
+          GenRegister t = convertU64ToU32(sel, addr);
+          sel.SHR(addrDW, t, GenRegister::immud(2));
+        } else
+          sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
-      sel.DWORD_GATHER(dst, addrDW, bti.imm);
+      sel.DWORD_GATHER(dst, addrDW, BTI_CONSTANT);
+    }
+    void read64Legacy(Selection::Opaque &sel,
+                      GenRegister addr,
+                      vector<GenRegister> &dst,
+                      GenRegister bti,
+                      vector<GenRegister> &btiTemp) const {
+      const uint32_t valueNum = dst.size();
+      if (sel.hasLongType()) {
+        vector<GenRegister> tmp(valueNum);
+        for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+          tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+        }
+        sel.READ64(addr, dst.data(), tmp.data(), valueNum, bti, true, btiTemp);
+      } else {
+        sel.READ64(addr, dst.data(), NULL, valueNum, bti, false, btiTemp);
+      }
+    }
+    void read64Stateless(Selection::Opaque &sel,
+                         const GenRegister addr,
+                         vector<GenRegister> dst) const {
+      using namespace ir;
+      unsigned simdWidth = sel.ctx.getSimdWidth();
+      unsigned valueNum = dst.size();
+      vector<GenRegister> tmp(valueNum);
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+        tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+      }
+      unsigned addrBytes = typeSize(addr.type);
+      GenRegister addrQ;
+      sel.push();
+        if (addrBytes == 4) {
+          addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+          sel.MOV(addrQ, addr);
+        } else {
+          addrQ = addr;
+        }
+        if (simdWidth == 8) {
+          sel.READ64A64(addrQ, dst.data(), tmp.data(), valueNum);
+        } else {
+          assert(valueNum == 1);
+          GenRegister tmpAddr, tmpDst;
+          tmpAddr = GenRegister::Qn(addrQ, 0);
+          tmpDst = GenRegister::Qn(dst[0], 0);
+          sel.curr.execWidth = 8;
+          sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+          sel.READ64A64(tmpAddr, &tmpDst, tmp.data(), valueNum);
+          tmpAddr = GenRegister::Qn(addrQ, 1);
+          tmpDst = GenRegister::Qn(dst[0], 1);
+          sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+          sel.READ64A64(tmpAddr, &tmpDst, tmp.data(), valueNum);
+        }
+      sel.pop();
     void emitRead64(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
-                         ir::BTI bti) const
+                         ir::AddressSpace addrSpace) const
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      GBE_ASSERT(bti.isConst == 1);
       vector<GenRegister> dst(valueNum);
-      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
       for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
-      if (sel.hasLongType()) {
-        vector<GenRegister> tmp(valueNum);
-        for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
-          tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+      bool isUniform = sel.isScalarReg(insn.getValue(0));
+      unsigned addrBytes = typeSize(addr.type);
+      AddressMode AM = insn.getAddressMode();
+      vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+      sel.push();
+        if (isUniform) {
+          sel.curr.noMask = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+        if (AM != AM_Stateless) {
+          GenRegister b;
+          if (AM == AM_DynamicBti) {
+            b = sel.selReg(insn.getBtiReg(), TYPE_U32);
+          } else {
+            b = GenRegister::immud(insn.getSurfaceIndex());
+          }
+          read64Legacy(sel, addr, dst, b, btiTemp);
+        } else if (addrSpace == MEM_LOCAL || isReadConstantLegacy(insn)) {
+          GenRegister b = GenRegister::immud(addrSpace == MEM_LOCAL? 0xfe : BTI_CONSTANT);
+          GenRegister addrDW = addr;
+          if (addrBytes == 8)
+            addrDW = convertU64ToU32(sel, addr);
+          read64Legacy(sel, addrDW, dst, b, btiTemp);
+        } else if (addrSpace == ir::MEM_GENERIC) {
+          Register localMask = generateLocalMask(sel, addr);
+          sel.push();
+            sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+            GenRegister addrDW = addr;
+            if (addrBytes == 8)
+              addrDW = convertU64ToU32(sel, addr);
+            read64Legacy(sel, addrDW, dst, GenRegister::immud(0xfe), btiTemp);
-        sel.READ64(addr, dst.data(), tmp.data(), valueNum, b, true, sel.getBTITemps(bti));
-      } else {
-        sel.READ64(addr, dst.data(), NULL, valueNum, b, false, sel.getBTITemps(bti));
-      }
+            sel.curr.inversePredicate = 1;
+            read64Stateless(sel, addr, dst);
+          sel.pop();
+        } else {
+          read64Stateless(sel, addr, dst);
+        }
+      sel.pop();
     void readByteAsDWord(Selection::Opaque &sel,
+                        const ir::LoadInstruction &insn,
                         const uint32_t elemSize,
                         GenRegister address,
                         GenRegister dst,
                         bool isUniform,
-                        ir::BTI bti) const
+                        ir::AddressSpace addrSpace) const
       using namespace ir;
+        RegisterFamily addrFamily = sel.getRegisterFamily(address.reg());
+        Type addrType = getType(addrFamily);
         Register tmpReg = sel.reg(FAMILY_DWORD, isUniform);
-        GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+        GenRegister tmpAddr = sel.selReg(sel.reg(addrFamily, isUniform), addrType);
         GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32);
-        GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+        GenRegister addrOffset = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
         // Get dword aligned addr
@@ -3910,24 +4451,36 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
             sel.curr.noMask = 1;
             sel.curr.execWidth = 1;
-          sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
+          if (addrFamily == FAMILY_DWORD)
+            sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
+          else {
+            sel.MOV(tmpAddr, GenRegister::immuint64(0xfffffffffffffffc));
+            sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UL), tmpAddr);
+          }
+          vector<GenRegister> tmp;
+          tmp.push_back(tmpData);
+          shootUntypedReadMsg(sel, insn, tmp, tmpAddr, 1, addrSpace);
           if (isUniform)
             sel.curr.noMask = 1;
-          sel.UNTYPED_READ(tmpAddr, &tmpData, 1, b, sel.getBTITemps(bti));
           if (isUniform)
             sel.curr.execWidth = 1;
           // Get the remaining offset from aligned addr
-          sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
-          sel.SHL(tmpAddr, tmpAddr, GenRegister::immud(0x3));
-          sel.SHR(tmpData, tmpData, tmpAddr);
+          if (addrFamily == FAMILY_QWORD) {
+            sel.AND(addrOffset, sel.unpacked_ud(address.reg()), GenRegister::immud(0x3));
+          } else {
+            sel.AND(addrOffset, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
+          }
+          sel.SHL(addrOffset, addrOffset, GenRegister::immud(0x3));
+          sel.SHR(tmpData, tmpData, addrOffset);
           if (elemSize == GEN_BYTE_SCATTER_WORD)
-            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), GenRegister::unpacked_uw(tmpReg, isUniform, sel.isLongReg(tmpReg)));
           else if (elemSize == GEN_BYTE_SCATTER_BYTE)
-            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), GenRegister::unpacked_ub(tmpReg, isUniform));
@@ -3936,7 +4489,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
                                const ir::LoadInstruction &insn,
                                const uint32_t elemSize,
                                GenRegister address,
-                               ir::BTI bti) const
+                               ir::AddressSpace addrSpace) const
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
@@ -3955,7 +4508,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
-      readDWord(sel, tmp, address, tmpRegNum, bti);
+      shootUntypedReadMsg(sel, insn, tmp, address, tmpRegNum, addrSpace);
       for(uint32_t i = 0; i < tmpRegNum; i++) {
         unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
@@ -3976,6 +4529,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       using namespace ir;
       GBE_ASSERT(effectData.size() == effectDataNum);
       GBE_ASSERT(tmp.size() == effectDataNum + 1);
+      RegisterFamily addrFamily = sel.getRegisterFamily(address.reg());
         Register alignedFlag = sel.reg(FAMILY_BOOL, isUniform);
         GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
@@ -3984,7 +4538,12 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
           if (isUniform)
             sel.curr.noMask = 1;
-          sel.AND(shiftL, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(0x3));
+          if (addrFamily == FAMILY_QWORD) {
+            GenRegister t = convertU64ToU32(sel, address);
+            sel.AND(shiftL, t, GenRegister::immud(0x3));
+          } else {
+            sel.AND(shiftL, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
+          }
           sel.SHL(shiftL, shiftL, GenRegister::immud(0x3));
           sel.ADD(shiftH, GenRegister::negate(shiftL), GenRegister::immud(32));
           sel.curr.physicalFlag = 0;
@@ -4012,11 +4571,93 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
+    /* Used to transform an address from 64 bits to 32 bits. Note that dataport
+     * messages cannot accept a scalar register, so the address is converted to
+     * a non-uniform register here. */
+    GenRegister convertU64ToU32(Selection::Opaque &sel,
+                                GenRegister addr) const {
+      GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD);
+      GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+      sel.MOV(dst, unpacked);
+      return dst;
+    }
+    void byteGatherStateless(Selection::Opaque &sel,
+                             GenRegister addr,
+                             GenRegister dst,
+                             unsigned elemSize) const {
+      using namespace ir;
+      GenRegister addrQ;
+      unsigned simdWidth = sel.ctx.getSimdWidth();
+      unsigned addrBytes = typeSize(addr.type);
+      if (addrBytes == 4) {
+        addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+        sel.MOV(addrQ, addr);
+      } else {
+        addrQ = addr;
+      }
+      sel.push();
+        if (simdWidth == 8) {
+          sel.BYTE_GATHERA64(dst, addrQ, elemSize);
+        } else if (simdWidth == 16) {
+          sel.curr.execWidth = 8;
+          sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+          sel.BYTE_GATHERA64(GenRegister::Qn(dst, 0), GenRegister::Qn(addrQ, 0), elemSize);
+          sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+          sel.BYTE_GATHERA64(GenRegister::Qn(dst, 1), GenRegister::Qn(addrQ, 1), elemSize);
+        }
+      sel.pop();
+    }
+    void shootByteGatherMsg(Selection::Opaque &sel,
+                            const ir::LoadInstruction &insn,
+                            GenRegister dst,
+                            GenRegister addr,
+                            unsigned elemSize,
+                            bool isUniform,
+                            ir::AddressSpace addrSpace) const {
+      using namespace ir;
+      unsigned addrBytes = typeSize(addr.type);
+      AddressMode AM = insn.getAddressMode();
+      vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+      if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+        if (AM == AM_DynamicBti) {
+          Register btiReg = insn.getBtiReg();
+          sel.BYTE_GATHER(dst, addr, elemSize, sel.selReg(btiReg, TYPE_U32), btiTemp);
+        } else {
+          unsigned SI = insn.getSurfaceIndex();
+          sel.BYTE_GATHER(dst, addr, elemSize, GenRegister::immud(SI), btiTemp);
+        }
+      } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) {
+        unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+        GenRegister addrDW = addr;
+        if (addrBytes == 8) {
+          addrDW = convertU64ToU32(sel, addr);
+        }
+        sel.BYTE_GATHER(dst, addrDW, elemSize, GenRegister::immud(bti), btiTemp);
+      } else if (addrSpace == ir::MEM_GENERIC) {
+        Register localMask = generateLocalMask(sel, addr);
+        sel.push();
+          sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+          GenRegister addrDW = addr;
+          if (addrBytes == 8)
+            addrDW = convertU64ToU32(sel, addr);
+          sel.BYTE_GATHER(dst, addrDW, elemSize, GenRegister::immud(0xfe), btiTemp);
+          sel.curr.inversePredicate = 1;
+          byteGatherStateless(sel, addr, dst, elemSize);
+        sel.pop();
+      } else {
+        byteGatherStateless(sel, addr, dst, elemSize);
+      }
+    }
     void emitUnalignedByteGather(Selection::Opaque &sel,
                                  const ir::LoadInstruction &insn,
                                  const uint32_t elemSize,
                                  GenRegister address,
-                                 ir::BTI bti) const
+                                 ir::AddressSpace addrSpace) const
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
@@ -4024,6 +4665,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
                                  1 : sel.ctx.getSimdWidth();
       const bool isUniform = simdWidth == 1;
       RegisterFamily family = getFamily(insn.getValueType());
+      RegisterFamily addrFamily = sel.getRegisterFamily(address.reg());
+      Type addrType = getType(addrFamily);
       if(valueNum > 1) {
         GBE_ASSERT(!isUniform && "vector load should not be uniform. Something went wrong.");
@@ -4040,16 +4683,20 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
           uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
           vector<GenRegister> tmp(effectDataNum + 1);
-          vector<GenRegister> tmp2(effectDataNum + 1);
           vector<GenRegister> effectData(effectDataNum);
           for(uint32_t i = 0; i < effectDataNum + 1; i++)
-            tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
-          GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+          GenRegister alignedAddr = sel.selReg(sel.reg(addrFamily, isUniform), addrType);
             if (isUniform)
               sel.curr.noMask = 1;
-            sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
+            if (addrFamily == FAMILY_DWORD)
+              sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
+            else {
+              sel.MOV(alignedAddr,  GenRegister::immuint64(~0x3ul));
+              sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UL), alignedAddr);
+            }
           uint32_t remainedReg = effectDataNum + 1;
@@ -4057,15 +4704,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
           do {
             uint32_t width = remainedReg > 4 ? 4 : remainedReg;
             vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
-            vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
             if (pos != 0) {
-                if (isUniform)
-                  sel.curr.noMask = 1;
+              if (isUniform)
+                sel.curr.noMask = 1;
+              if (addrFamily == FAMILY_DWORD)
                 sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
+              else
+                sel.ADD(alignedAddr, alignedAddr, GenRegister::immuint64(pos * 4));
-            readDWord(sel, t1, alignedAddr, width, bti);
+            shootUntypedReadMsg(sel, insn, t1, alignedAddr, width, addrSpace);
             remainedReg -= width;
             pos += width;
           } while(remainedReg);
@@ -4082,7 +4731,6 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         } else {
           GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_BYTE);
-          GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
           vector<GenRegister> dst(valueNum);
           for(uint32_t i = 0; i < valueNum; i++)
             dst[i] = sel.selReg(insn.getValue(i), getType(family));
@@ -4106,7 +4754,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
               dataSize = GEN_BYTE_SCATTER_WORD;
               dataSize = GEN_BYTE_SCATTER_DWORD;
-            sel.BYTE_GATHER(readDst, addressForLoop, dataSize, b, sel.getBTITemps(bti));
+            shootByteGatherMsg(sel, insn, readDst, addressForLoop, dataSize, isUniform, addrSpace);
             // only 4 bytes is gathered even if valueLeft >= 4
             sel.UNPACK_BYTE(dst.data(), readDst, getFamilySize(FAMILY_BYTE), (valueLeft < 4 ? valueLeft : 4));
@@ -4122,23 +4770,22 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         GBE_ASSERT(insn.getValueNum() == 1);
         const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
-        if(sel.getSlowByteGather())
-          readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
+        if (sel.getSlowByteGather())
+          readByteAsDWord(sel, insn, elemSize, address, value, isUniform, addrSpace);
         else {
-          GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
           // We need a temporary register if we read bytes or words
-          Register dst = sel.reg(FAMILY_DWORD, isUniform);
+          Register dst = sel.reg(FAMILY_DWORD);
             if (isUniform)
               sel.curr.noMask = 1;
-            sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, elemSize, b, sel.getBTITemps(bti));
+            shootByteGatherMsg(sel, insn, sel.selReg(dst, ir::TYPE_U32), address, elemSize, isUniform, addrSpace);
             if (isUniform) {
               sel.curr.noMask = 1;
               sel.curr.execWidth = 1;
+              sel.curr.predicate = GEN_PREDICATE_NONE;
             if (elemSize == GEN_BYTE_SCATTER_WORD)
               sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst, isUniform));
@@ -4152,22 +4799,87 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     void emitOWordRead(Selection::Opaque &sel,
                        const ir::LoadInstruction &insn,
                        GenRegister address,
-                       ir::BTI bti) const
+                       ir::AddressSpace addrSpace) const
       using namespace ir;
+      uint32_t SI = insn.getSurfaceIndex();
       const uint32_t vec_size = insn.getValueNum();
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+      const Type type = insn.getValueType();
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      bool isA64 = SI == 255;
+      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
       vector<GenRegister> valuesVec;
-      for(uint32_t i = 0; i < vec_size; i++)
-        valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
-      // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size / 8;
-      tmp_size = tmp_size > 4 ? 4 : tmp_size;
       vector<GenRegister> tmpVec;
+      for(uint32_t i = 0; i < vec_size; i++)
+        valuesVec.push_back(sel.selReg(insn.getValue(i), type));
+      GenRegister headeraddr;
+      if (isA64)
+        headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+      else
+        headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 2 * 4), GEN_TYPE_UD);
+      // Make header
+      sel.push();
+      {
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+        // Update the header with the current address
+        sel.curr.execWidth = 1;
+        // Put zero in the general state base address
+        if (isA64)
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+        else {
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD));
+          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+        }
+      }
+      sel.pop();
+      /* For block read we need to unpack the block data into values, and for different
+       * simd widths and vector sizes with different type sizes, we may need to split the
+       * block read send message.
+       * A send message here is limited to 5 registers (1 header + 4 data),
+       * so each combination has its own message length and tmp vector size
+       *              |  simd8  | simd16 |  simd8 | simd16
+       *  r0  |header |         |        |        |
+       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
+       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
+       *  r3  |data   | ......  | ...... | ...... |  dw1
+       *  r4  |data   | ....... | ...... | ...... |  dw1
+       */
+      uint32_t totalSize = simdWidth * typeSize * vec_size;
+      uint32_t valueSize = simdWidth * typeSize;
+      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+      uint32_t msg_num = vec_size / tmp_size;
+      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
       for(uint32_t i = 0; i < tmp_size; i++)
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
-      sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm, &tmpVec[0], tmp_size);
+        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+      for (uint32_t i = 0; i < msg_num; i++) {
+          if (i > 0) {
+            sel.push();
+            {
+              // Update the address in header
+              sel.curr.execWidth = 1;
+              sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
+            }
+            sel.pop();
+          }
+          sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
+          for (uint32_t j = 0; j < tmp_size; j++)
+            sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
+      }
     // check whether all binded table index point to constant memory
@@ -4182,59 +4894,51 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       using namespace ir;
       const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
-      GenRegister address = sel.selReg(insn.getAddressRegister(), ir::TYPE_U32);
+      Register reg = insn.getAddressRegister();
+      GenRegister address = sel.selReg(reg, getType(sel.getRegisterFamily(reg)));
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
                  insn.getAddressSpace() == MEM_CONSTANT ||
                  insn.getAddressSpace() == MEM_PRIVATE ||
                  insn.getAddressSpace() == MEM_LOCAL ||
+                 insn.getAddressSpace() == MEM_GENERIC ||
                  insn.getAddressSpace() == MEM_MIXED);
       //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
-      BTI bti;
-      AddressMode am = insn.getAddressMode();
-      if (am == AM_StaticBti) {
-        bti.isConst = 1;
-        bti.imm = insn.getSurfaceIndex();
-      } else if (am == AM_DynamicBti) {
-        bti.isConst = 0;
-        bti.reg = insn.getBtiReg();
-      } else {
-        assert(0 && "stateless not supported yet");
-      }
+      AddressSpace addrSpace = insn.getAddressSpace();
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(sel, type);
-      bool allConstant = isAllConstant(bti);
       if (insn.isBlock())
-        this->emitOWordRead(sel, insn, address, bti);
-      else if (allConstant) {
+        this->emitOWordRead(sel, insn, address, addrSpace);
+      else if (isReadConstantLegacy(insn)) {
         // XXX TODO read 64bit constant through constant cache
         // Per HW Spec, constant cache messages can read at least DWORD data.
         // So, byte/short data type, we have to read through data cache.
         if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-          this->emitRead64(sel, insn, address, bti);
+          this->emitRead64(sel, insn, address, addrSpace);
         else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-          this->emitDWordGather(sel, insn, address, bti);
+          this->emitDWordGather(sel, insn, address, addrSpace);
         else if (insn.isAligned() == true)
-          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+          this->emitAlignedByteGather(sel, insn, elemSize, address, addrSpace);
-          this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
+          this->emitUnalignedByteGather(sel, insn, elemSize, address, addrSpace);
       } else {
         if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-          this->emitRead64(sel, insn, address, bti);
+          this->emitRead64(sel, insn, address, addrSpace);
         else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-          this->emitUntypedRead(sel, insn, address, bti);
+          this->emitUntypedRead(sel, insn, address, addrSpace);
         else if (insn.isAligned())
-          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+          this->emitAlignedByteGather(sel, insn, elemSize, address, addrSpace);
-          this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
+          this->emitUnalignedByteGather(sel, insn, elemSize, address, addrSpace);
       return true;
   class StoreInstructionPattern : public SelectionPattern
@@ -4242,44 +4946,316 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     StoreInstructionPattern(void) : SelectionPattern(1, 1) {
-    void emitUntypedWrite(Selection::Opaque &sel,
-                          const ir::StoreInstruction &insn,
+    GenRegister convertU64ToU32(Selection::Opaque &sel,  // Copy a 64-bit address into a newly allocated 32-bit register and return it.
+                                GenRegister addr) const {
+      GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD);  // UD-typed view of addr's register; presumably the low dword of each element -- TODO confirm
+      GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);  // fresh DWORD-family destination
+      sel.MOV(dst, unpacked);
+      return dst;  // callers use this as the address for the BTI 0xfe (local) messages
+    }
+    void untypedWriteStateless(Selection::Opaque &sel,  // Emit UNTYPED_WRITEA64 message(s) that store `value` at `address`.
+                               GenRegister address,
+                               vector<GenRegister> &value) const
+    {
+      using namespace ir;
+      unsigned simdWidth = sel.ctx.getSimdWidth();
+      unsigned int addrBytes = typeSize(address.type);  // only 4 and 8 are handled below
+      unsigned valueNum = value.size();
+      GenRegister addrQ;  // left as declared when no case below matches
+      if (addrBytes == 4) {
+        if (simdWidth == 8) {
+          addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+          sel.MOV(addrQ, address);  // copy the 32-bit address into a new U64 register
+        } else if (simdWidth == 16) {
+          addrQ = address;  // kept as-is; the per-quarter MOVs below write it into a UL-typed payload register
+        }
+      } else if (addrBytes == 8) {
+        addrQ = address;  // already 64-bit
+      }
+      if (simdWidth == 8) {
+        vector<GenRegister> msg;  // payload: address first, then every value
+        msg.push_back(addrQ);
+        for (unsigned k = 0; k < valueNum; k++)
+          msg.push_back(value[k]);
+        sel.UNTYPED_WRITEA64(msg.data(), valueNum+1, valueNum);
+      } else if (simdWidth == 16) {
+        vector<GenRegister> msgs;  // temporaries reused by both quarter messages
+        for (unsigned k = 0; k < (valueNum+1)/2+1; k++) {  // one register for the address plus one per pair of values
+          msgs.push_back(sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32));
+        }
+        sel.push();
+          /* do first quarter */
+          sel.curr.execWidth = 8;
+          sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+          sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0));  // msgs[0] carries the addresses, viewed as UL
+          for (unsigned k = 0; k < valueNum; k++) {
+            sel.MOV(GenRegister::Qn(msgs[k/2+1], k%2), GenRegister::Qn(value[k], 0));  // two consecutive values share one msgs register (parts 0 and 1)
+          }
+          sel.UNTYPED_WRITEA64(msgs.data(), (valueNum+1)/2+1, valueNum);
+          /* do second quarter */
+          sel.curr.execWidth = 8;
+          sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+          sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1));  // same temporaries, now loaded from part 1 of the sources
+          for (unsigned k = 0; k < valueNum; k++)
+            sel.MOV(GenRegister::Qn(msgs[k/2+1], k%2), GenRegister::Qn(value[k], 1));
+          sel.UNTYPED_WRITEA64(msgs.data(), (valueNum+1)/2+1, valueNum);
+        sel.pop();
+      }
+    }
+    void shootUntypedWriteMsg(Selection::Opaque &sel,  // Pick the untyped-write message form from the address mode and address space.
+                              const ir::StoreInstruction &insn,
+                              GenRegister &address,
+                              vector<GenRegister> &value,
+                              ir::AddressSpace addrSpace) const
+    {
+      using namespace ir;
+      unsigned int addrBytes = typeSize(address.type);
+      unsigned valueNum = value.size();
+      AddressMode AM = insn.getAddressMode();
+      vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+      if (AM == AM_DynamicBti || AM == AM_StaticBti) {  // BTI-based access: surface index comes from a register or an immediate
+        if (AM == AM_DynamicBti) {
+          Register btiReg = insn.getBtiReg();
+          sel.UNTYPED_WRITE(address, value.data(), valueNum, sel.selReg(btiReg, TYPE_U32), btiTemp);
+        } else {
+          unsigned SI = insn.getSurfaceIndex();
+          sel.UNTYPED_WRITE(address, value.data(), valueNum, GenRegister::immud(SI), btiTemp);
+        }
+      } else if (addrSpace == ir::MEM_LOCAL) {  // local memory: written with the immediate index 0xfe
+        GenRegister addr = address;
+        if (addrBytes == 8) {
+          addr = convertU64ToU32(sel, address);  // this message form takes a 32-bit address
+        }
+        sel.UNTYPED_WRITE(addr, value.data(), valueNum, GenRegister::immud(0xfe), btiTemp);
+      } else if (addrSpace == ir::MEM_GENERIC) {  // generic: emit both forms under complementary predicates
+        Register localMask = generateLocalMask(sel, address);  // presumably flags the lanes whose address is local -- TODO confirm
+        sel.push();
+          sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+          GenRegister addrDW = address;
+          if (addrBytes == 8)
+            addrDW = convertU64ToU32(sel, address);
+          sel.UNTYPED_WRITE(addrDW, value.data(), valueNum, GenRegister::immud(0xfe), btiTemp);
+          sel.curr.inversePredicate = 1;  // the stateless write below runs with the predicate inverted
+          untypedWriteStateless(sel, address, value);
+        sel.pop();
+      } else {  // everything else goes through the A64 stateless path
+        untypedWriteStateless(sel, address, value);
+      }
+    }
+    void emitUntypedWrite(Selection::Opaque &sel,  // Collect the store's values as UD registers and hand them to shootUntypedWriteMsg.
+                          const ir::StoreInstruction &insn,
+                          GenRegister address,
+                          ir::AddressSpace addrSpace) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      vector<GenRegister> value(valueNum);
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);  // every value is viewed as UD regardless of its IR type
+      shootUntypedWriteMsg(sel, insn, address, value, addrSpace);  // message form is chosen there
+    }
+    void write64Legacy(Selection::Opaque &sel,  // Emit a BTI-based WRITE64 of `value` at `address`.
+                       GenRegister address,
+                       vector<GenRegister> &value,
+                       GenRegister bti,
+                       vector<GenRegister> &btiTemp) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = value.size();
+      if (sel.hasLongType()) {
+        vector<GenRegister> tmp(valueNum);  // one freshly allocated UL temporary per value
+        for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+          tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+        }
+        sel.WRITE64(address, value.data(), tmp.data(), valueNum, bti, true, btiTemp);  // temporaries supplied, flag true
+      } else {
+        sel.WRITE64(address, value.data(), NULL, valueNum, bti, false, btiTemp);  // no temporaries, flag false
+      }
+    }
+    void write64Stateless(Selection::Opaque &sel,
                           GenRegister address,
-                          ir::BTI &bti) const
+                          vector<GenRegister> &value) const
       using namespace ir;
-      const uint32_t valueNum = insn.getValueNum();
-      vector<GenRegister> value(valueNum), tmps;
-      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+      unsigned simdWidth = sel.ctx.getSimdWidth();
+      unsigned int addrBytes = typeSize(address.type);
+      unsigned valueNum = value.size();
+      vector<GenRegister> tmp(valueNum);
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+        tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+      }
+      GenRegister addrQ;
+      if (addrBytes == 4) {
+        addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+        sel.MOV(addrQ, address);
+      } else {
+        addrQ = address;
+      }
-      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
-        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);
-      sel.UNTYPED_WRITE(address, value.data(), valueNum, b, sel.getBTITemps(bti));
-    }
+      sel.push();
+        if (simdWidth == 8) {
+          sel.WRITE64A64(addrQ, value.data(), tmp.data(), valueNum);
+        } else {
+          GenRegister tmpAddr, tmpSrc;
+          tmpAddr = GenRegister::Qn(addrQ, 0);
+          tmpSrc = GenRegister::Qn(value[0], 0);
+          GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+          /* SIMD16 long register is just enough for (SIMD8 A64 addr + SIMD8 long) */
+          sel.curr.execWidth = 8;
+          sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+          sel.MOV(GenRegister::Qn(tmp, 0), tmpAddr);
+          sel.UNPACK_LONG(GenRegister::Qn(tmp, 1), tmpSrc);
+          sel.UNTYPED_WRITEA64(&tmp, 1, 2);
+          tmpAddr = GenRegister::Qn(addrQ, 1);
+          tmpSrc = GenRegister::Qn(value[0], 1);
+          sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+          sel.MOV(GenRegister::Qn(tmp, 0), tmpAddr);
+          sel.UNPACK_LONG(GenRegister::Qn(tmp, 1), tmpSrc);
+          sel.UNTYPED_WRITEA64(&tmp, 1, 2);
+        }
+      sel.pop();
+    }
     void emitWrite64(Selection::Opaque &sel,
                      const ir::StoreInstruction &insn,
                      GenRegister address,
-                     ir::BTI &bti) const
+                     ir::AddressSpace addrSpace) const
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
       vector<GenRegister> src(valueNum);
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
         src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
-      if (sel.hasLongType()) {
-        vector<GenRegister> tmp(valueNum);
-        for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
-          tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+      AddressMode AM = insn.getAddressMode();
+      unsigned int addrBytes = typeSize(address.type);
+      vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+      if (AM != AM_Stateless) {
+        GenRegister b;
+        if (AM == AM_DynamicBti) {
+          b = sel.selReg(insn.getBtiReg(), TYPE_U32);
+        } else {
+          b = GenRegister::immud(insn.getSurfaceIndex());
+        }
+        write64Legacy(sel, address, src, b, btiTemp);
+      } else if (addrSpace == MEM_LOCAL) {
+        GenRegister b = GenRegister::immud(0xfe);
+        GenRegister addr = address;
+        if (addrBytes == 8) {
+          addr = convertU64ToU32(sel, address);
+        }
+        write64Legacy(sel, addr, src, b, btiTemp);
+      } else if (addrSpace == ir::MEM_GENERIC) {
+        Register localMask = generateLocalMask(sel, address);
+        sel.push();
+          sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+          GenRegister addrDW = address;
+          if (addrBytes == 8)
+            addrDW = convertU64ToU32(sel, address);
+          write64Legacy(sel, addrDW, src, GenRegister::immud(0xfe), btiTemp);
+          sel.curr.inversePredicate = 1;
+          write64Stateless(sel, address, src);
+        sel.pop();
+      } else {
+        GBE_ASSERT(sel.hasLongType());
+        write64Stateless(sel, address, src);
+      }
+    }
+    void byteScatterStateless(Selection::Opaque &sel,  // Emit BYTE_SCATTERA64 message(s) that store `data` at `address`.
+                              GenRegister address,
+                              GenRegister data,
+                              unsigned elemSize) const {
+      using namespace ir;
+      unsigned addrBytes = typeSize(address.type);
+      unsigned simdWidth = sel.ctx.getSimdWidth();
+      GenRegister addrQ;
+        if (addrBytes == 4) {
+          addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+          sel.MOV(addrQ, address);  // copy the 32-bit address into a new U64 register
+        } else {
+          addrQ = address;  // any other size is used unchanged
+        }
+        if (simdWidth == 8) {
+          GenRegister msg[2];  // payload: address, then data
+          msg[0] = addrQ;
+          msg[1] = data;
+          sel.BYTE_SCATTERA64(msg, 2, elemSize);
+        } else if (simdWidth == 16) {
+          GenRegister msgs[2];  // temporaries reused by both quarter messages
+          msgs[0] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+          msgs[1] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+          sel.push();
+            sel.curr.execWidth = 8;
+            /* do first quarter */
+            sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+            sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0));  // msgs[0] carries the addresses, viewed as UL
+            sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(data, 0));
+            sel.BYTE_SCATTERA64(msgs, 2, elemSize);
+            /* do second quarter */
+            sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+            sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1));
+            sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(data, 1));  // part 1 of data goes into part 0 of msgs[1]
+            sel.BYTE_SCATTERA64(msgs, 2, elemSize);
+          sel.pop();
+        }
+    }
+    void shootByteScatterMsg(Selection::Opaque &sel,
+                             const ir::StoreInstruction &insn,
+                             GenRegister address,
+                             GenRegister data,
+                             unsigned elemSize,
+                             ir::AddressSpace addrSpace) const
+    {
+      using namespace ir;
+      unsigned addrBytes = typeSize(address.type);
+      AddressMode AM = insn.getAddressMode();
+      vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+      if (AM != AM_Stateless) {
+        if (AM == AM_DynamicBti) {
+          Register btiReg = insn.getBtiReg();
+          sel.BYTE_SCATTER(address, data, elemSize, sel.selReg(btiReg, TYPE_U32), btiTemp);
+        } else {
+          unsigned SI = insn.getSurfaceIndex();
+          sel.BYTE_SCATTER(address, data, elemSize, GenRegister::immud(SI), btiTemp);
-        sel.WRITE64(address, src.data(), tmp.data(), valueNum, b, true, sel.getBTITemps(bti));
+      } else if (addrSpace == ir::MEM_LOCAL) {
+        GenRegister addr = address;
+        if (addrBytes == 8) {
+          addr = convertU64ToU32(sel, address);
+        }
+        sel.BYTE_SCATTER(addr, data, elemSize, GenRegister::immud(0xfe), btiTemp);
+      } else if (addrSpace == ir::MEM_GENERIC) {
+        Register localMask = generateLocalMask(sel, address);
+        sel.push();
+          sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+          GenRegister addrDW = address;
+          if (addrBytes == 8)
+            addrDW = convertU64ToU32(sel, address);
+          sel.BYTE_SCATTER(addrDW, data, elemSize, GenRegister::immud(0xfe), btiTemp);
+          sel.curr.inversePredicate = 1;
+          byteScatterStateless(sel, address, data, elemSize);
+        sel.pop();
       } else {
-        sel.WRITE64(address, src.data(), NULL, valueNum, b, false, sel.getBTITemps(bti));
+        byteScatterStateless(sel, address, data, elemSize);
@@ -4287,13 +5263,12 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
                          const ir::StoreInstruction &insn,
                          const uint32_t elemSize,
                          GenRegister address,
-                         ir::BTI &bti,
+                         ir::AddressSpace addrSpace,
                          bool isUniform) const
       using namespace ir;
       uint32_t valueNum = insn.getValueNum();
-      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
       if(valueNum > 1) {
         const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
         vector<GenRegister> value(valueNum);
@@ -4313,77 +5288,136 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
           sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
-        sel.UNTYPED_WRITE(address, tmp.data(), tmpRegNum, b, sel.getBTITemps(bti));
+        shootUntypedWriteMsg(sel, insn, address, tmp, addrSpace);
       } else {
         const GenRegister value = sel.selReg(insn.getValue(0));
         GBE_ASSERT(insn.getValueNum() == 1);
-        const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+        const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
-        sel.push();
-          if (isUniform) {
-            sel.curr.noMask = 1;
-            sel.curr.execWidth = 1;
-          }
+        if (elemSize == GEN_BYTE_SCATTER_WORD)
+          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
+        else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
-          if (elemSize == GEN_BYTE_SCATTER_WORD)
-            sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
-          else if (elemSize == GEN_BYTE_SCATTER_BYTE)
-            sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
-        sel.pop();
-        sel.BYTE_SCATTER(address, tmp, elemSize, b, sel.getBTITemps(bti));
+        shootByteScatterMsg(sel, insn, address, tmp, elemSize, addrSpace);
     void emitOWordWrite(Selection::Opaque &sel,
                         const ir::StoreInstruction &insn,
                         GenRegister address,
-                        ir::BTI bti) const
+                        ir::AddressSpace addrSpace) const
       using namespace ir;
+      uint32_t SI = insn.getSurfaceIndex();
       const uint32_t vec_size = insn.getValueNum();
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+      const Type type = insn.getValueType();
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      bool isA64 = SI == 255;
+      uint32_t offset_size = isA64 ? 128 : 8;
+      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
       vector<GenRegister> valuesVec;
-      for(uint32_t i = 0; i < vec_size; i++)
-        valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
-      // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size / 8;
-      tmp_size = tmp_size > 4 ? 4 : tmp_size;
       vector<GenRegister> tmpVec;
+      for(uint32_t i = 0; i < vec_size; i++)
+        valuesVec.push_back(sel.selReg(insn.getValue(i), type));
+      GenRegister headeraddr;
+      if (isA64)
+        headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+      else
+        headeraddr = GenRegister::toUniform(sel.getOffsetReg(header, 0, 2 * 4), GEN_TYPE_UD);
+      // Make header
+      sel.push();
+      {
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+        // Update the header with the current address
+        sel.curr.execWidth = 1;
+        // Put zero in the general state base address
+        if (isA64)
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+        else {
+          sel.SHR(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD), GenRegister::immud(4));
+          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+        }
+      }
+      sel.pop();
+      /* For block write we need to pack the block data into the tmp, and for different
+       * simd widths and vector sizes with different type sizes, we may need to split the
+       * block write send message.
+       * A send message here is limited to 5 registers (1 header + 4 data),
+       * so each combination has its own message length and tmp vector size
+       *              |  simd8  | simd16 |  simd8 | simd16
+       *  r0  |header |         |        |        |
+       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
+       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
+       *  r3  |data   | ......  | ...... | ...... |  dw1
+       *  r4  |data   | ....... | ...... | ...... |  dw1
+       */
+      uint32_t totalSize = simdWidth * typeSize * vec_size;
+      uint32_t valueSize = simdWidth * typeSize;
+      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+      uint32_t msg_num = vec_size / tmp_size;
+      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
       for(uint32_t i = 0; i < tmp_size; i++)
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
-      sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm, &tmpVec[0], tmp_size);
+        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+      for (uint32_t i = 0; i < msg_num; i++) {
+          for (uint32_t j = 0; j < tmp_size; j++)
+            sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);
+          if (i > 0) {
+            sel.push();
+            {
+              // Update the address in header
+              sel.curr.execWidth = 1;
+              sel.ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
+            }
+            sel.pop();
+          }
+          sel.push();
+            // In simd8 mode, when the data spans more than one register, execWidth 8
+            // gives a wrong result, so set the execWidth to 16.
+            sel.curr.execWidth = 16;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+            sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
+          sel.pop();
+      }
     virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
       using namespace ir;
       const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
-      GenRegister address = sel.selReg(insn.getAddressRegister(), ir::TYPE_U32);
+      Register reg = insn.getAddressRegister();
+      GenRegister address = sel.selReg(reg, getType(sel.getRegisterFamily(reg)));
+      AddressSpace addrSpace = insn.getAddressSpace();
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(sel, type);
       const bool isUniform = sel.isScalarReg(insn.getAddressRegister()) && sel.isScalarReg(insn.getValue(0));
-      BTI bti;
-      AddressMode am = insn.getAddressMode();
-      if (am == AM_StaticBti) {
-        bti.isConst = 1;
-        bti.imm = insn.getSurfaceIndex();
-      } else if (am == AM_DynamicBti) {
-        bti.isConst = 0;
-        bti.reg = insn.getBtiReg();
-      } else {
-        assert(0 && "stateless not supported yet");
-      }
       if (insn.isBlock())
-        this->emitOWordWrite(sel, insn, address, bti);
+        this->emitOWordWrite(sel, insn, address, addrSpace);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-        this->emitWrite64(sel, insn, address, bti);
+        this->emitWrite64(sel, insn, address, addrSpace);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-        this->emitUntypedWrite(sel, insn, address,  bti);
+        this->emitUntypedWrite(sel, insn, address, addrSpace);
       else {
-        this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
+        this->emitByteScatter(sel, insn, elemSize, address, addrSpace, isUniform);
@@ -4416,9 +5450,13 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       if (liveOut.contains(dst) || dag.computeBool)
         needStoreBool = true;
+      // Why is tmpDst set to null?
+      // For the listed types the compare instruction cannot write a
+      // bool (uw) result to a GRF directly, so an extra select is
+      // needed to generate the bool value in a GRF.
       if(type == TYPE_S64 || type == TYPE_U64 ||
          type == TYPE_DOUBLE || type == TYPE_FLOAT ||
-         type == TYPE_U32 ||  type == TYPE_S32 /*||
+         type == TYPE_U32 ||  type == TYPE_S32 || type == TYPE_HALF /*||
         tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_F);
@@ -4451,7 +5489,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         } else {
           if((type == TYPE_S64 || type == TYPE_U64 ||
               type == TYPE_DOUBLE || type == TYPE_FLOAT ||
-              type == TYPE_U32 ||  type == TYPE_S32))
+              type == TYPE_U32 ||  type == TYPE_S32 || type == TYPE_HALF))
             sel.curr.flagGen = 1;
           else if (sel.isScalarReg(dst)) {
             // If the dest reg is a scalar bool, we can't set it as
@@ -5456,34 +6494,154 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
+    /* Transform an address from 64 bits to 32 bits. Dataport messages cannot
+     * accept a scalar register, so the value is copied into a newly allocated
+     * non-uniform DWORD register, and that register is returned. */
+    GenRegister convertU64ToU32(Selection::Opaque &sel,
+                                GenRegister addr) const {
+      GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD);  // UD-typed view of addr; presumably the low dword of each element - confirm against unpacked_ud
+      GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);  // fresh U32 destination register
+      sel.MOV(dst, unpacked);
+      return dst;
+    }
+    void untypedAtomicA64Stateless(Selection::Opaque &sel,  // emits ATOMICA64: once for SIMD8, once per 8-lane quarter for SIMD16
+                              const ir::AtomicInstruction &insn,
+                              unsigned msgPayload,  // number of payload registers; asserted below to be at most 3
+                              GenRegister dst,
+                              GenRegister addr,
+                              GenRegister src1,
+                              GenRegister src2,
+                              GenRegister bti) const {
+      using namespace ir;
+      GenRegister addrQ;  // 64-bit form of addr, assigned below
+      const AtomicOps atomicOp = insn.getAtomicOpcode();
+      GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
+      unsigned addrBytes = typeSize(addr.type);
+      GBE_ASSERT(msgPayload <= 3);
+      unsigned simdWidth = sel.curr.execWidth;
+      AddressMode AM = insn.getAddressMode();
+      if (addrBytes == 4) {  // 4-byte address: MOV it into a newly allocated QWORD register
+        addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+        sel.MOV(addrQ, addr);
+      } else {
+        addrQ = addr;
+      }
+      if (simdWidth == 8) {  // SIMD8: address and both sources go into the message as they are
+        vector<GenRegister> msgs;
+        msgs.push_back(addr);  // NOTE(review): pushes addr, not addrQ, so the widening above is unused on this path - confirm intended
+        msgs.push_back(src1);
+        msgs.push_back(src2);
+        sel.ATOMICA64(dst, genAtomicOp, msgPayload, msgs, bti, sel.getBTITemps(AM));
+      } else if (simdWidth == 16) {  // SIMD16: issued as two execWidth-8 messages (Q1 then Q2)
+        vector<GenRegister> msgs;
+        RegisterFamily family = sel.getRegisterFamily(insn.getDst(0));  // temporaries use the destination's family and type
+        Type type = getType(family);
+        for (unsigned k = 0; k < msgPayload; k++) {  // one temporary per payload register
+          msgs.push_back(sel.selReg(sel.reg(family), type));
+        }
+        sel.push();  // state changes below are undone by the matching pop()
+        /* first quarter */
+        sel.curr.execWidth = 8;
+        sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+        sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0));  // msgs[0] viewed as UL receives quarter 0 of the 64-bit address
+        if(msgPayload > 1) {  // src1 placement depends on whether the data family is QWORD
+          if(family == ir::FAMILY_QWORD)
+            sel.MOV(GenRegister::Qn(msgs[0], 1), GenRegister::Qn(src1, 0));
+          else
+            sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src1, 0));
+        }
+        if(msgPayload > 2) {  // src2 placement depends on whether the data family is QWORD
+          if(family == ir::FAMILY_QWORD)
+            sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src2, 0));
+          else
+            sel.MOV(GenRegister::Qn(msgs[1], 1), GenRegister::Qn(src2, 0));
+        }
+        sel.ATOMICA64(GenRegister::Qn(dst, 0), genAtomicOp, msgPayload, msgs, bti, sel.getBTITemps(AM));  // result goes to quarter 0 of dst
+        /* second quarter */
+        sel.curr.execWidth = 8;
+        sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+        sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1));  // same temporaries are reused, now loaded from quarter 1
+        if(msgPayload > 1) {
+          if(family == ir::FAMILY_QWORD)
+            sel.MOV(GenRegister::Qn(msgs[0], 1), GenRegister::Qn(src1, 1));
+          else
+            sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src1, 1));
+        }
+        if(msgPayload > 2) {
+          if(family == ir::FAMILY_QWORD)
+            sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src2, 1));
+          else
+            sel.MOV(GenRegister::Qn(msgs[1], 1), GenRegister::Qn(src2, 1));
+        }
+        sel.ATOMICA64(GenRegister::Qn(dst, 1), genAtomicOp, msgPayload, msgs, bti, sel.getBTITemps(AM));  // result goes to quarter 1 of dst
+        sel.pop();
+      }  // NOTE(review): any execWidth other than 8 or 16 falls through without emitting an instruction
+    }
     INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
       using namespace ir;
       const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
-      ir::BTI b;
       const AtomicOps atomicOp = insn.getAtomicOpcode();
       unsigned srcNum = insn.getSrcNum();
       unsigned msgPayload;
+      Register reg = insn.getAddressRegister();
+      GenRegister address = sel.selReg(reg, getType(sel.getRegisterFamily(reg)));
+      AddressSpace addrSpace = insn.getAddressSpace();
+      GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
+                 insn.getAddressSpace() == MEM_PRIVATE ||
+                 insn.getAddressSpace() == MEM_LOCAL ||
+                 insn.getAddressSpace() == MEM_GENERIC ||
+                 insn.getAddressSpace() == MEM_MIXED);
+      unsigned addrBytes = typeSize(address.type);
       AddressMode AM = insn.getAddressMode();
       if (AM == AM_DynamicBti) {
-        b.reg = insn.getBtiReg();
         msgPayload = srcNum - 1;
       } else {
-        b.imm = insn.getSurfaceIndex();
-        b.isConst = 1;
         msgPayload = srcNum;
-      GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
-      GenRegister bti =  b.isConst ? GenRegister::immud(b.imm) : sel.selReg(b.reg, ir::TYPE_U32);
-      GenRegister src0 = sel.selReg(insn.getAddressRegister(), TYPE_U32);
+      Type type = getType(sel.getRegisterFamily(insn.getDst(0)));
+      GenRegister dst  = sel.selReg(insn.getDst(0), type);
+      GenRegister src0 = sel.selReg(insn.getAddressRegister(), type);
       GenRegister src1 = src0, src2 = src0;
-      if(msgPayload > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
-      if(msgPayload > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
+      if(msgPayload > 1) src1 = sel.selReg(insn.getSrc(1), type);
+      if(msgPayload > 2) src2 = sel.selReg(insn.getSrc(2), type);
       GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
-      sel.ATOMIC(dst, genAtomicOp, msgPayload, src0, src1, src2, bti, sel.getBTITemps(b));
+      if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+        if (AM == AM_DynamicBti) {
+          Register btiReg = insn.getBtiReg();
+          sel.ATOMIC(dst, genAtomicOp, msgPayload, address, src1, src2, sel.selReg(btiReg, type), sel.getBTITemps(AM));
+        } else {
+          unsigned SI = insn.getSurfaceIndex();
+          sel.ATOMIC(dst, genAtomicOp, msgPayload, address, src1, src2, GenRegister::immud(SI), sel.getBTITemps(AM));
+        }
+      } else if (addrSpace == ir::MEM_LOCAL) {
+        // stateless mode, local still use bti access
+        GenRegister addrDW = address;
+        if (addrBytes == 8)
+          addrDW = convertU64ToU32(sel, address);
+        sel.ATOMIC(dst, genAtomicOp, msgPayload, addrDW, src1, src2, GenRegister::immud(0xfe), sel.getBTITemps(AM));
+      } else if (addrSpace == ir::MEM_GENERIC) {
+          Register localMask = generateLocalMask(sel, address);
+          sel.push();
+            sel.curr.useVirtualFlag(localMask, GEN_PREDICATE_NORMAL);
+            GenRegister addrDW = address;
+            if (addrBytes == 8)
+              addrDW = convertU64ToU32(sel, address);
+            sel.ATOMIC(dst, genAtomicOp, msgPayload, addrDW, src1, src2, GenRegister::immud(0xfe), sel.getBTITemps(AM));
+            sel.curr.inversePredicate = 1;
+            untypedAtomicA64Stateless(sel, insn, msgPayload, dst, address, src1, src2, GenRegister::immud(0xff));
+          sel.pop();
+      } else
+        untypedAtomicA64Stateless(sel, insn, msgPayload, dst, address, src1, src2, GenRegister::immud(0xff));
       return true;
@@ -5639,6 +6797,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         sel.curr.noMask = 1;
         sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.flag = 0;
+        sel.curr.subFlag = 1;
         sel.cmpBlockIP(GEN_CONDITIONAL_LE, src0, src1);
@@ -5649,6 +6809,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         // this block, as it will always excute with all lanes activated.
           sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.curr.flag = 0;
+          sel.curr.subFlag = 1;
           sel.setBlockIP(src0, sel.ctx.getMaxLabel());
           sel.curr.predicate = GEN_PREDICATE_NONE;
           sel.curr.noMask = 1;
@@ -5667,6 +6829,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         // FIXME, if the last BRA is unconditional jump, we don't need to update the label here.
          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+         sel.curr.flag = 0;
+         sel.curr.subFlag = 1;
          sel.setBlockIP(src0, label.value());
@@ -5678,6 +6842,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
             (jip != nextLabel || sel.block->endifOffset != -1)) {
           // If it is required, insert a JUMP to bypass the block
+            sel.curr.flag = 0;
+            sel.curr.subFlag = 1;
             if (simdWidth == 8)
               sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
             else if (simdWidth == 16)
@@ -5692,6 +6858,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
+            sel.curr.flag = 0;
+            sel.curr.subFlag = 1;
             sel.curr.predicate = GEN_PREDICATE_NORMAL;
             if(!insn.getParent()->needEndif && insn.getParent()->needIf) {
               ir::LabelIndex label = insn.getParent()->endifLabel;
@@ -5831,86 +6999,97 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     INLINE bool emitOne(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren) const
-      using namespace ir;
-      const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
-      const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
-      const uint32_t dim = insn.getSrcNum() - 4;
-      if (simdWidth == 16) {
-        for(uint32_t i = 0; i < msgNum; i++)
-          msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-      } else {
-        uint32_t valueID = 0;
-        uint32_t msgID = 0;
-        msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        for(; msgID < 1 + dim; msgID++, valueID++)
-          msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
-        // fake v.
-        if (dim < 2)
-          msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        // fake w.
-        if (dim < 3)
-          msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        // LOD.
-        msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        for(; valueID < insn.getSrcNum(); msgID++, valueID++)
-          msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
-      }
+      const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG));
       sel.curr.predicate = GEN_PREDICATE_NONE;
       sel.curr.noMask = 1;
-      sel.MOV(msgs[0], GenRegister::immud(0));
+      sel.MOV(header, GenRegister::immud(0));
       sel.curr.execWidth = 1;
-      GenRegister channelEn = sel.getOffsetReg(msgs[0], 0, 7*4);
+      GenRegister channelEn = sel.getOffsetReg(header, 0, 7*4);
       // Enable all channels.
       sel.MOV(channelEn, GenRegister::immud(0xffff));
-      sel.curr.execWidth = 8;
-      // Set zero LOD.
-      if (simdWidth == 8)
-        sel.MOV(msgs[4], GenRegister::immud(0));
-      else
-        sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      if (simdWidth == 16)
+        emitWithSimd16(sel, insn, markChildren, header);
+      else if (simdWidth == 8)
+        emitWithSimd8(sel, insn, markChildren, header);
+      else
+      return true;
+    }
+    INLINE bool emitWithSimd16(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren, const GenRegister& header) const
+    {
+      using namespace ir;
+      GenRegister msgs[9]; // (header + U + V + W + LOD + 4)
+      msgs[0] = header;
+      for (uint32_t i = 1; i < 9; ++i) {
+        //SIMD16 will be split into two SIMD8,
+        //each virtual reg in msgs requires one physical reg with 8 DWORDs (32 bytes),
+        //so, declare with FAMILY_WORD, and the allocated size will be sizeof(WORD)*SIMD16 = 32 bytes
+        msgs[i] = sel.selReg(sel.reg(FAMILY_WORD), TYPE_U32);
+      }
+      const uint32_t dims = insn.getSrcNum() - 4;
       uint32_t bti = insn.getImageIndex();
-      if (simdWidth == 8)
-        sel.TYPED_WRITE(msgs, msgNum, bti, dim == 3);
-      else {
-        sel.push();
-        sel.curr.execWidth = 8;
-        for( uint32_t quarter = 0; quarter < 2; quarter++)
-        {
-          #define QUARTER_MOV0(msgs, msgid, src) \
-                    sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], GEN_TYPE_UD), msgid % 2), \
-                            GenRegister::Qn(src, quarter))
-          #define QUARTER_MOV1(msgs, msgid, src) \
-                  sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], src.type), msgid % 2), \
-                          GenRegister::Qn(src, quarter))
-          sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
-          // Set U,V,W
-          QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), insn.getCoordType()));
-          if (dim > 1)
-            QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), insn.getCoordType()));
-          if (dim > 2)
-            QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), insn.getCoordType()));
-          // Set R, G, B, A
-          QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(dim), insn.getSrcType()));
-          QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(dim + 1), insn.getSrcType()));
-          QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(dim + 2), insn.getSrcType()));
-          QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(dim + 3), insn.getSrcType()));
-          sel.TYPED_WRITE(msgs, msgNum, bti, dim == 3);
-          #undef QUARTER_MOV0
-          #undef QUARTER_MOV1
+      sel.push();
+      sel.curr.execWidth = 8;
+      for (uint32_t i = 0; i < 2; ++i) { //SIMD16 split to two SIMD8
+        sel.curr.quarterControl = (i == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
+        uint32_t msgid = 1;
+        for (uint32_t dim = 0; dim < dims; ++dim) {  //the coords
+          GenRegister coord = sel.selReg(insn.getSrc(dim), insn.getCoordType());
+          sel.MOV(GenRegister::retype(msgs[msgid++], coord.type), GenRegister::Qn(coord, i));
-        sel.pop();
+        while (msgid < 5)  //fill fake coords
+          sel.MOV(msgs[msgid++], GenRegister::immud(0));
+        for (uint32_t j = 0; j < 4; ++j) {  //the data
+          GenRegister data = sel.selReg(insn.getSrc(j + dims), insn.getSrcType());
+          sel.MOV(GenRegister::retype(msgs[msgid++], data.type), GenRegister::Qn(data, i));
+        }
+        sel.TYPED_WRITE(msgs, 9, bti, dims == 3);
+      }
+      sel.pop();
+      return true;
+    }
+    INLINE bool emitWithSimd8(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren, const GenRegister& header) const
+    {
+      using namespace ir;
+      GenRegister msgs[9]; // (header + U + V + W + LOD + 4)
+      msgs[0] = header;
+      const uint32_t dims = insn.getSrcNum() - 4;
+      uint32_t bti = insn.getImageIndex();
+      uint32_t msgid = 1;
+      for (uint32_t dim = 0; dim < dims; ++dim) {  //the coords
+        GenRegister coord = sel.selReg(insn.getSrc(dim), insn.getCoordType());
+        msgs[msgid++] = coord;
+      }
+      while (msgid < 5) {  //fill fake coords
+        GenRegister fake = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        sel.MOV(fake, GenRegister::immud(0));
+        msgs[msgid++] = fake;
+      for (uint32_t j = 0; j < 4; ++j) {  //the data
+        GenRegister data = sel.selReg(insn.getSrc(j + dims), insn.getSrcType());
+        msgs[msgid++] = data;
+      }
+      sel.TYPED_WRITE(msgs, 9, bti, dims == 3);
       return true;
     DECL_CTOR(TypedWriteInstruction, 1, 1);
@@ -6170,8 +7349,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       uint8_t BTI = insn.getBti();
       GenRegister tmp0, tmp1;
       uint32_t srcNum = insn.getSrcNum();
-      GenRegister dst = sel.selReg(insn.getDst(0), TYPE_S32);
-      //GBE_ASSERT(srcNum);
       uint32_t i = 0;
       uint32_t totalSize = 0;
       bool isContinue = false;
@@ -6192,14 +7370,14 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       i = 0;
       GenRegister regs[8];
       if (srcNum == 0) {
-          sel.PRINTF(dst, BTI, tmp0, tmp1, regs, srcNum, num, isContinue, totalSize);
+          sel.PRINTF(BTI, tmp0, tmp1, regs, srcNum, num, isContinue, totalSize);
       } else {
         do {
           uint32_t s = srcNum < 8 ? srcNum : 8;
           for (uint32_t j = 0; j < s; j++) {
             regs[j] = sel.selReg(insn.getSrc(i + j), insn.getType(i + j));
-          sel.PRINTF(dst, BTI, tmp0, tmp1, regs, s, num, isContinue, totalSize);
+          sel.PRINTF(BTI, tmp0, tmp1, regs, s, num, isContinue, totalSize);
           if (srcNum > 8) {
             srcNum -= 8;
@@ -6257,6 +7435,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       } else {
         // Update the PcIPs
         const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+        sel.curr.flag = 0;
+        sel.curr.subFlag = 1;
           sel.setBlockIP(ip, dst.value());
@@ -6323,6 +7503,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       } else {
         const LabelIndex next = bb.getNextBlock()->getLabelIndex();
         // Update the PcIPs
+        sel.curr.flag = 0;
+        sel.curr.subFlag = 1;
         sel.setBlockIP(ip, dst.value());
         sel.block->endifOffset = -1;
@@ -6440,10 +7622,15 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       GenRegister localBarrier = GenRegister::ud8grf(sel.reg(FAMILY_DWORD));
       /* Allocate registers for message sending
-       * (read/write to shared local memory) */
+       * (read/write to shared local memory),
+       * only one data (ud/ul) is needed for thread communication,
+       * we will always use SIMD8 to do the read/write
+       */
       vector<GenRegister> msg;
-      for(uint32_t i = 0; i < 6; i++)
-        msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
+      msg.push_back(GenRegister::ud8grf(sel.reg(ir::FAMILY_REG)));  //address
+      msg.push_back(GenRegister::ud8grf(sel.reg(ir::FAMILY_REG)));  //data
+      if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+        msg.push_back(GenRegister::ud8grf(sel.reg(ir::FAMILY_REG)));  //data
       /* Insert a barrier to make sure all the var we are interested in
          have been assigned the final value. */
@@ -6455,7 +7642,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       /* Perform workgroup op */
       sel.WORKGROUP_OP(workGroupOp, dst, src, tmpData1,
-                       localThreadID, localThreadNUM, tmpData2, slmOff, msg, 6,
+                       localThreadID, localThreadNUM, tmpData2, slmOff, msg,
       return true;
@@ -6713,20 +7900,78 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       using namespace ir;
       uint32_t vec_size = insn.getVectorSize();
       uint32_t simdWidth = sel.curr.execWidth;
+      const Type type = insn.getType();
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      uint32_t response_size = simdWidth * vec_size * typeSize / 32;
+      // ushort in simd8 will have half reg thus 0.5 reg size, but response length is still 1
+      response_size = response_size ? response_size : 1;
+      uint32_t block_width = typeSize * simdWidth;
+      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
       for (uint32_t i = 0; i < vec_size; ++i) {
-        valuesVec.push_back(sel.selReg(insn.getDst(i), TYPE_U32));
-        if(simdWidth == 16)
-          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+        valuesVec.push_back(sel.selReg(insn.getDst(i), type));
+        if(simdWidth == 16 && typeSize == 4)
+          tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
-      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
-      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
-      GenRegister *tmp = NULL;
-      if(simdWidth == 16)
-        tmp = &tmpVec[0];
-      sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize());
+      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
+      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
+      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);
+      // Make header
+      sel.push();
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+        // Update the header with the coord
+        sel.curr.execWidth = 1;
+        sel.MOV(offsetx, coordx);
+        sel.MOV(offsety, coordy);
+        // Update block width and height
+        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+      sel.pop();
+      if (simdWidth * typeSize < 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          // Now read the data
+          sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size);
+        sel.pop();
+      } else if (simdWidth * typeSize == 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(), vec_size);
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(valuesVec[i], tmpVec[i]);
+          // Second half
+          // Update the header with the coord
+          sel.curr.execWidth = 1;
+          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+          // Now read the data
+          sel.curr.execWidth = 8;
+          sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size);
+          // Move the reg to fit vector rule.
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]);
+        sel.pop();
+      } else NOT_IMPLEMENTED;
       return true;
     DECL_CTOR(MediaBlockReadInstruction, 1, 1);
@@ -6739,17 +7984,85 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       using namespace ir;
       uint32_t vec_size = insn.getVectorSize();
-      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
-      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+      const Type type = insn.getType();
+      uint32_t simdWidth = sel.curr.execWidth;
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      // ushort in simd8 will have half reg, but data length is still 1
+      uint32_t data_size = simdWidth * vec_size * typeSize / 32;
+      data_size = data_size? data_size : 1;
+      uint32_t block_width = typeSize * simdWidth;
+      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
-      for(uint32_t i = 0; i < vec_size; i++)
-      {
-        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), TYPE_U32));
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
-      }
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
-      sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
+      for (uint32_t i = 0; i < vec_size; ++i) {
+         valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
+        if(simdWidth == 16 && typeSize == 4)
+          tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
+        else
+          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+       }
+      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD);
+      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD);
+      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4);
+      // Make header
+      sel.push();
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+        // Update the header with the coord
+        sel.curr.execWidth = 1;
+        sel.MOV(offsetx, coordx);
+        sel.MOV(offsety, coordy);
+        // Update block width and height
+        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+      sel.pop();
+      if (simdWidth * typeSize < 64) {
+        for (uint32_t i = 0; i < vec_size; ++i) {
+            sel.MOV(tmpVec[i], valuesVec[i]);
+        }
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          // Now write the data
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size);
+        sel.pop();
+      } else if (simdWidth * typeSize == 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(tmpVec[i], valuesVec[i]);
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+          // Second half
+          // Update the header with the coord
+          sel.curr.execWidth = 1;
+          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+          sel.curr.execWidth = 8;
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
+          // Now write the data
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+          // Move the reg to fit vector rule.
+        sel.pop();
+      } else NOT_IMPLEMENTED;
       return true;
     DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 14ac05f..a99b8a9 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -82,6 +82,10 @@ namespace gbe
     bool isBranch(void) const;
     /*! Is it a label instruction (i.e. change the implicit mask) */
     bool isLabel(void) const;
+    /*! Is the src's gen register region the same as all dest regs' regions */
+    bool sameAsDstRegion(uint32_t srcID);
+    /*! Is it a simple native instruction (i.e. will be one simple ISA instruction) */
+    bool isNative(void) const;
     /*! Get the destination register */
     GenRegister &dst(uint32_t dstID) { return regs[dstID]; }
     /*! Get the source register */
@@ -104,6 +108,7 @@ namespace gbe
         uint16_t function:8;
         /*! elemSize for byte scatters / gathers, elemNum for untyped msg, operand number for atomic */
         uint16_t elem:8;
+        uint16_t splitSend:1;
       struct {
         /*! Number of sources in the tuple */
@@ -123,6 +128,7 @@ namespace gbe
         uint16_t bti:8;
         uint16_t msglen:5;
         uint16_t is3DWrite:1;
+        uint16_t typedWriteSplitSend:1;
       struct {
         uint16_t rdbti:8;
@@ -154,8 +160,12 @@ namespace gbe
         uint32_t printfBTI:8;
         uint32_t continueFlag:8;
         uint16_t printfSize;
+        uint16_t printfSplitSend:1;
-      uint32_t workgroupOp;
+      struct {
+        uint16_t workgroupOp;
+        uint16_t splitSend:1;
+      }wgop;
     } extra;
     /*! Gen opcode */
     uint8_t opcode;
@@ -209,6 +219,7 @@ namespace gbe
     // Allocates (with a linear allocator) and owns SelectionInstruction
     friend class Selection;
+  void outputSelectionInst(SelectionInstruction &insn);
   /*! Instructions like sends require to make registers contiguous in GRF */
   class SelectionVector : public NonCopyable, public intrusive_list_node
@@ -253,7 +264,6 @@ namespace gbe
     void append(SelectionInstruction *insn);
     /*! Append a new selection instruction at the beginning of the block */
     void prepend(SelectionInstruction *insn);
-    bool isLargeBlock;
     ir::LabelIndex endifLabel;
     int endifOffset;
     bool hasBarrier;
@@ -314,6 +324,8 @@ namespace gbe
     void optimize(void);
     uint32_t opt_features;
+    /* Add insn ID for sel IR */
+    void addID(void);
     const GenContext &getCtx();
     /*! Use custom allocators */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index ccaf526..5d96e9e 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -38,6 +38,7 @@ DECL_SELECTION_IR(I64MUL, I64MULInstruction)
+DECL_SELECTION_IR(ATOMICA64, AtomicA64Instruction)
 DECL_SELECTION_IR(MACH, BinaryInstruction)
 DECL_SELECTION_IR(CMP, CompareInstruction)
 DECL_SELECTION_IR(I64CMP, I64CompareInstruction)
@@ -54,10 +55,16 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)
+DECL_SELECTION_IR(UNTYPED_READA64, UntypedReadA64Instruction)
+DECL_SELECTION_IR(UNTYPED_WRITEA64, UntypedWriteA64Instruction)
 DECL_SELECTION_IR(READ64, Read64Instruction)
 DECL_SELECTION_IR(WRITE64, Write64Instruction)
+DECL_SELECTION_IR(READ64A64, Read64A64Instruction)
+DECL_SELECTION_IR(WRITE64A64, Write64A64Instruction)
+DECL_SELECTION_IR(BYTE_GATHERA64, ByteGatherA64Instruction)
+DECL_SELECTION_IR(BYTE_SCATTERA64, ByteScatterA64Instruction)
@@ -100,3 +107,4 @@ DECL_SELECTION_IR(OBREAD, OBReadInstruction)
diff --git a/backend/src/backend/gen_insn_selection_optimize.cpp b/backend/src/backend/gen_insn_selection_optimize.cpp
index b8aa776..512a5bd 100644
--- a/backend/src/backend/gen_insn_selection_optimize.cpp
+++ b/backend/src/backend/gen_insn_selection_optimize.cpp
@@ -18,11 +18,12 @@ namespace gbe
     uint32_t elements = 0;
     uint32_t elementSize = typeSize(reg.type);
     uint32_t width = GenRegister::width_size(reg);
-    assert(execWidth >= width);
+    // reg may be another insn's source, so this insn's execWidth is not guaranteed to be at least reg's width.
+    //assert(execWidth >= width);
     uint32_t height = execWidth / width;
     uint32_t vstride = GenRegister::vstride_size(reg);
     uint32_t hstride = GenRegister::hstride_size(reg);
-    uint32_t base = reg.subnr;
+    uint32_t base = reg.nr * GEN_REG_SIZE + reg.subnr;
     for (uint32_t i = 0; i < height; ++i) {
       uint32_t offsetInByte = base;
       for (uint32_t j = 0; j < width; ++j) {
@@ -132,7 +133,7 @@ namespace gbe
     for (ReplaceInfoMap::iterator pos = replaceInfoMap.begin(); pos != replaceInfoMap.end(); ++pos) {
       ReplaceInfo* info = pos->second;
       if (info->intermedia.reg() == var.reg()) {   //intermedia is overwritten
-        if (info->intermedia.quarter == var.quarter && info->intermedia.subnr == var.subnr) {
+        if (info->intermedia.quarter == var.quarter && info->intermedia.subnr == var.subnr && info->intermedia.nr == var.nr) {
           // We need to check the if intermedia is fully overwritten, they may be in some prediction state.
           if (CanBeReplaced(info, insn, var))
@@ -161,7 +162,7 @@ namespace gbe
     assert(insn.opcode == SEL_OP_MOV);
     const GenRegister& src = insn.src(0);
     const GenRegister& dst = insn.dst(0);
-    if (src.type != dst.type || src.file != dst.file)
+    if (src.type != dst.type || src.file != dst.file || src.hstride != dst.hstride)
     if (liveout.find(dst.reg()) != liveout.end())
@@ -207,7 +208,8 @@ namespace gbe
     if (info->insn.state.inversePredicate != insn.state.inversePredicate)
       return false;
-    if (info->intermedia.type == var.type && info->intermedia.quarter == var.quarter && info->intermedia.subnr == var.subnr) {
+    if (info->intermedia.type == var.type && info->intermedia.quarter == var.quarter &&
+        info->intermedia.subnr == var.subnr && info->intermedia.nr == var.nr) {
       uint32_t elements = CalculateElements(var, insn.state.execWidth);  //considering width, hstrid, vstrid and execWidth
       if (info->elements == elements)
         return true;
@@ -285,4 +287,14 @@ namespace gbe
     //do global optimization
+  void Selection::addID()
+  {
+    uint32_t insnID = 0;
+    for (auto &block : *blockList)
+      for (auto &insn : block.insnList) {
+        insn.ID  = insnID;
+        insnID += 2;
+      }
+  }
 } /* namespace gbe */
diff --git a/backend/src/backend/gen_insn_selection_output.cpp b/backend/src/backend/gen_insn_selection_output.cpp
index ed26650..f23e8c8 100644
--- a/backend/src/backend/gen_insn_selection_output.cpp
+++ b/backend/src/backend/gen_insn_selection_output.cpp
@@ -45,7 +45,7 @@ namespace gbe
           cout << "(abs)";
         cout << "%" << reg.value.reg;
         if (reg.subphysical)
-          cout << "." << reg.subnr;
+          cout << "." << reg.subnr + reg.nr * GEN_REG_SIZE;
         if (dst)
           cout << "<" << GenRegister::hstride_size(reg) << ">";
@@ -96,77 +96,91 @@ namespace gbe
 #define OP_NAME_LENGTH 512
-  void outputSelectionIR(GenContext &ctx, Selection* sel)
-  {
-    cout << "SELECTION IR begin:" << endl;
-    cout << "WARNING: not completed yet, welcome for the FIX!" << endl;
-    for (SelectionBlock &block : *sel->blockList) {
-      for (SelectionInstruction &insn : block.insnList) {
-        char opname[OP_NAME_LENGTH];
-        if (insn.isLabel()) {
-            cout << "  L" << insn.index << ":" << endl;
-            continue;
-        } else {
-          switch (insn.opcode) {
-            #define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: sprintf(opname, "%s", #OP); break;
-            #include "backend/gen_insn_selection.hxx"
-            #undef DECL_SELECTION_IR
-          }
-        }
+  void outputSelectionInst(SelectionInstruction &insn) {
+    cout<<"["<<insn.ID<<"]";
+    if (insn.state.predicate != GEN_PREDICATE_NONE) {
+      if (insn.state.physicalFlag == 0)
+        cout << "(f" << insn.state.flagIndex << ")\t";
+      else
+        cout << "(f" << insn.state.flag << "." << insn.state.subFlag << ")\t";
+    }
+    else
+      cout << "    \t";
-        if (insn.opcode == SEL_OP_CMP) {
-          switch (insn.extra.function) {
-            case GEN_CONDITIONAL_LE:
-              strcat(opname, ".le");
-              break;
-            case GEN_CONDITIONAL_L:
-              strcat(opname, ".l");
-              break;
-            case GEN_CONDITIONAL_GE:
-              strcat(opname, ".ge");
-              break;
-            case GEN_CONDITIONAL_G:
-              strcat(opname, ".g");
-              break;
-            case GEN_CONDITIONAL_EQ:
-              strcat(opname, ".eq");
-              break;
-            case GEN_CONDITIONAL_NEQ:
-              strcat(opname, ".neq");
-              break;
-          }
-        }
+    char opname[OP_NAME_LENGTH];
+    if (insn.isLabel()) {
+        cout << "  L" << insn.index << ":" << endl;
+        return;
+    } else {
+      switch (insn.opcode) {
+        #define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: sprintf(opname, "%s", #OP); break;
+        #include "backend/gen_insn_selection.hxx"
+        #undef DECL_SELECTION_IR
+      }
+    }
-        int n = strlen(opname);
-        if(n >= OP_NAME_LENGTH - 20) {
-          cout << "opname too long: " << opname << endl;
-          return;
-        }
+    if (insn.opcode == SEL_OP_CMP) {
+      switch (insn.extra.function) {
+        case GEN_CONDITIONAL_LE:
+          strcat(opname, ".le");
+          break;
+        case GEN_CONDITIONAL_L:
+          strcat(opname, ".l");
+          break;
+        case GEN_CONDITIONAL_GE:
+          strcat(opname, ".ge");
+          break;
+        case GEN_CONDITIONAL_G:
+          strcat(opname, ".g");
+          break;
+        case GEN_CONDITIONAL_EQ:
+          strcat(opname, ".eq");
+          break;
+        case GEN_CONDITIONAL_NEQ:
+          strcat(opname, ".neq");
+          break;
+      }
+    }
-        sprintf(&opname[n], "(%d)", insn.state.execWidth);
-        cout << "    " << left << setw(20) << opname;
+    int n = strlen(opname);
+    if(n >= OP_NAME_LENGTH - 20) {
+      cout << "opname too long: " << opname << endl;
+      return;
+    }
-        for (int i = 0; i < insn.dstNum; ++i)
-        {
-          GenRegister dst = insn.dst(i);
-          outputGenReg(dst, true);
-          cout << "\t";
-        }
+    sprintf(&opname[n], "(%d)", insn.state.execWidth);
+    cout << left << setw(20) << opname;
-        cout << ":\t";
+    for (int i = 0; i < insn.dstNum; ++i)
+    {
+      GenRegister dst = insn.dst(i);
+      outputGenReg(dst, true);
+      cout << "\t";
+    }
-        for (int i = 0; i < insn.srcNum; ++i)
-        {
-          GenRegister src = insn.src(i);
-          outputGenReg(src, false);
-          cout << "\t";
-        }
+    cout << ":\t";
-        cout << endl;
+    for (int i = 0; i < insn.srcNum; ++i)
+    {
+      GenRegister src = insn.src(i);
+      outputGenReg(src, false);
+      cout << "\t";
+    }
+    cout << endl;
+  }
+  void outputSelectionIR(GenContext &ctx, Selection* sel, const char* KernelName)
+  {
+    cout << KernelName <<"'s SELECTION IR begin:" << endl;
+    cout << "WARNING: not completed yet, welcome for the FIX!" << endl;
+    for (SelectionBlock &block : *sel->blockList) {
+      for (SelectionInstruction &insn : block.insnList) {
+        outputSelectionInst(insn);
       cout << endl;
-    cout << "SELECTION IR end." << endl << endl;
+    cout <<KernelName << "'s SELECTION IR end." << endl << endl;
diff --git a/backend/src/backend/gen_insn_selection_output.hpp b/backend/src/backend/gen_insn_selection_output.hpp
index dd372dc..e1c72af 100644
--- a/backend/src/backend/gen_insn_selection_output.hpp
+++ b/backend/src/backend/gen_insn_selection_output.hpp
@@ -6,7 +6,7 @@ namespace gbe
   class Selection;  // Pre ISA code
   class GenContext; // Handle compilation for Gen
-  void outputSelectionIR(GenContext &ctx, Selection* sel);
+  void outputSelectionIR(GenContext &ctx, Selection* sel, const char* KernelName);
 } /* namespace gbe */
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 4ef82d1..073ede6 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -212,6 +212,7 @@ namespace gbe {
       kernel = ctx->compileKernel();
       if (kernel != NULL) {
         GBE_ASSERT(ctx->getErrCode() == NO_ERROR);
+        kernel->setOclVersion(unit.getOclVersion());
@@ -351,8 +352,10 @@ namespace gbe {
     // if load 32 bit spir binary, the triple should be spir-unknown-unknown.
     llvm::Triple triple(module->getTargetTriple());
-    if(triple.getArchName() == "spir" && triple.getVendorName() == "unknown" && triple.getOSName() == "unknown"){
+    if (triple.getArchName() == "spir" && triple.getVendorName() == "unknown" && triple.getOSName() == "unknown"){
+    } else if (triple.getArchName() == "spir64" && triple.getVendorName() == "unknown" && triple.getOSName() == "unknown"){
+      module->setTargetTriple("spir64");
     if(module == NULL){
@@ -524,7 +527,7 @@ namespace gbe {
                                       size_t stringSize,
                                       char *err,
                                       size_t *errSize,
-                                      const char *          options)
+                                      const char * options)
     using namespace gbe;
@@ -544,7 +547,9 @@ namespace gbe {
         if (strstr(options, "-cl-fast-relaxed-math") != NULL)
           fast_relaxed_math = 1;
-    char *options_str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
+      char *options_str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
+      if (options_str == NULL)
+        return;
       memcpy(options_str, options, strlen(options) + 1);
       std::string optionStr(options_str);
       while (end != std::string::npos) {
@@ -565,11 +570,11 @@ namespace gbe {
     GenProgram* p = (GenProgram*) program;
     p->fast_relaxed_math = fast_relaxed_math;
     if (!dumpASMFileName.empty()) {
-        p->asm_file_name = dumpASMFileName.c_str();
-        FILE *asmDumpStream = fopen(dumpASMFileName.c_str(), "w");
-        if (asmDumpStream)
-          fclose(asmDumpStream);
-      }
+      p->asm_file_name = dumpASMFileName.c_str();
+      FILE *asmDumpStream = fopen(dumpASMFileName.c_str(), "w");
+      if (asmDumpStream)
+        fclose(asmDumpStream);
+    }
     // Try to compile the program
     llvm::Module* module = (llvm::Module*)p->module;
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 4451efb..d88b316 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -86,14 +86,18 @@ namespace gbe
     INLINE void getRegAttrib(ir::Register reg, uint32_t &regSize, ir::RegisterFamily *regFamily = NULL) const {
       // Note that byte vector registers use two bytes per byte (and can be
       // interleaved)
-      static const size_t familyVectorSize[] = {2,2,2,4,8};
-      static const size_t familyScalarSize[] = {2,2,2,4,8};
+      static const size_t familyVectorSize[] = {2,2,2,4,8,16,32};
+      static const size_t familyScalarSize[] = {2,2,2,4,8,16,32};
       using namespace ir;
       const bool isScalar = ctx.sel->isScalarReg(reg);
       const RegisterData regData = ctx.sel->getRegisterData(reg);
       const RegisterFamily family = regData.family;
-      const uint32_t typeSize = isScalar ? familyScalarSize[family] : familyVectorSize[family];
-      regSize = isScalar ? typeSize : ctx.getSimdWidth() * typeSize;
+      if (family == ir::FAMILY_REG)
+        regSize = 32;
+      else {
+        const uint32_t typeSize = isScalar ? familyScalarSize[family] : familyVectorSize[family];
+        regSize = isScalar ? typeSize : ctx.getSimdWidth() * typeSize;
+      }
       if (regFamily != NULL)
         *regFamily = family;
@@ -148,7 +152,6 @@ namespace gbe
     vector<GenRegInterval> intervals;
     /*! All the boolean register intervals on the corresponding BB*/
     typedef map<ir::Register, GenRegInterval> RegIntervalMap;
-    set<SelectionBlock *> flag0ReservedBlocks;
     map<SelectionBlock *, RegIntervalMap *> boolIntervalsMap;
     /*! Intervals sorting based on starting point positions */
     vector<GenRegInterval*> starting;
@@ -425,6 +428,12 @@ namespace gbe
   #define GET_FLAG_REG(insn) GenRegister::uwxgrf(IS_SCALAR_FLAG(insn) ? 1 : 8,\
   #define IS_TEMP_FLAG(insn) (insn.state.flag == 0 && insn.state.subFlag == 1)
+  #define NEED_DST_GRF_TYPE_FIX(ty) \
+          (ty == GEN_TYPE_F ||      \
+           ty == GEN_TYPE_HF ||     \
+           ty == GEN_TYPE_DF ||     \
+           ty == GEN_TYPE_UL ||     \
+           ty == GEN_TYPE_L)
   // Flag is a virtual flag, this function is to validate the virtual flag
   // to a physical flag. It is used to validate both temporary flag and the
   // non-temporary flag registers.
@@ -497,7 +506,7 @@ namespace gbe
       map<ir::Register, uint32_t> allocatedFlags;
       map<const GenRegInterval*, uint32_t> allocatedFlagIntervals;
-      const uint32_t flagNum = flag0ReservedBlocks.contains(&block) ?  2 : 3;
+      const uint32_t flagNum = 3;
       uint32_t freeFlags[] = {2, 3, 0};
       uint32_t freeNum = flagNum;
       if (boolIntervalsMap.find(&block) == boolIntervalsMap.end())
@@ -590,6 +599,14 @@ namespace gbe
         // is called a "conditional modifier"). The other instructions just read
         // it
         if (insn.state.physicalFlag == 0) {
+          // SEL.bool instruction, the dst register should be stored in GRF
+          // the pred flag is used by flag register
+          if (insn.opcode == SEL_OP_SEL) {
+            ir::Register dst = insn.dst(0).reg();
+            if (ctx.sel->getRegisterFamily(dst) == ir::FAMILY_BOOL &&
+                allocatedFlags.find(dst) != allocatedFlags.end())
+              allocatedFlags.erase(dst);
+          }
           auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex));
           if (it != allocatedFlags.end()) {
             insn.state.physicalFlag = 1;
@@ -629,19 +646,28 @@ namespace gbe
             if (insn.state.predicate != GEN_PREDICATE_NONE)
               validateFlag(selection, insn);
-          // This is a CMP for a pure flag booleans, we don't need to write result to
-          // the grf. And latter, we will not allocate grf for it.
           if (insn.opcode == SEL_OP_CMP &&
               (flagBooleans.contains(insn.dst(0).reg()) ||
                GenRegister::isNull(insn.dst(0)))) {
+            // This is a CMP for a pure flag booleans, we don't need to write result to
+            // the grf. And latter, we will not allocate grf for it.
             // set a temporary register to avoid switch in this block.
             bool isSrc = false;
             bool needMov = false;
             ir::Type ir_type = ir::TYPE_FLOAT;
-            if (insn.src(0).isint64())
-              ir_type = ir::TYPE_U64;
+            // below (src : dst) type mapping for 'cmp'
+            // is allowed by hardware
+            // B,W,D,F : F
+            // HF      : HF
+            // DF      : DF
+            // Q       : Q
+            if (NEED_DST_GRF_TYPE_FIX(insn.src(0).type))
+              ir_type = getIRType(insn.src(0).type);
             this->replaceReg(selection, &insn, 0, isSrc, ir_type, needMov);
           // If the instruction requires to generate (CMP for long/int/float..)
           // the flag value to the register, and it's not a pure flag boolean,
           // we need to use SEL instruction to generate the flag value to the UW8
@@ -1215,11 +1241,9 @@ namespace gbe
       // Update the intervals of each used register. Note that we do not
       // register allocate R0, so we skip all sub-registers in r0
       RegIntervalMap *boolsMap = new RegIntervalMap;
-      if (block.isLargeBlock)
-        flag0ReservedBlocks.insert(&block);
       for (auto &insn : block.insnList) {
         const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
-        insn.ID  = insnID;
+        assert(insnID == (int32_t)insn.ID);
         bool is3SrcOp = insn.opcode == SEL_OP_MAD;
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const GenRegister &selReg = insn.src(srcID);
@@ -1244,8 +1268,14 @@ namespace gbe
           if (this->intervals[reg].conflictReg == 0 ||
               this->intervals[reg].conflictReg > conflictReg)
           this->intervals[reg].conflictReg = conflictReg;
-          this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
-          this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+          int insnsrcID = insnID;
+          // If the instruction is simple, src and dst can reuse a register, so they are given different IDs.
+          // The insn may be split in the encoder; if the register regions are not the same, they can't be reused.
+          // Because it is hard to check here whether the insn will be split, only the register region is checked.
+          if (insn.isNative() && insn.sameAsDstRegion(srcID))
+            insnsrcID -= 1;
+          this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnsrcID);
+          this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnsrcID);
         for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
           const GenRegister &selReg = insn.dst(dstID);
@@ -1289,8 +1319,6 @@ namespace gbe
             // is out-of the if/endif region, so we have to borrow the f0
             // to get correct bits for all channels.
             boolsMap->find(reg)->second.minID = 0;
-            if (flag0ReservedBlocks.contains(&block))
-              flag0ReservedBlocks.erase(&block);
         } else {
           // Make sure that instruction selection stage didn't use physiacl flags incorrectly.
@@ -1299,11 +1327,10 @@ namespace gbe
                        insn.opcode == SEL_OP_JMPI ||
                        insn.state.predicate == GEN_PREDICATE_NONE ||
                        (block.hasBarrier && insn.opcode == SEL_OP_MOV) ||
-                       (insn.state.flag == 0 && insn.state.subFlag == 1) ||
-                       (block.removeSimpleIfEndif && insn.state.flag == 0 && insn.state.subFlag == 0) ));
+                       (insn.state.flag == 0 && insn.state.subFlag == 1) ));
         lastID = insnID;
-        insnID++;
+        insnID += 2;
       // All registers alive at the begining of the block must update their intervals.
@@ -1472,7 +1499,7 @@ do { \
       GBE_ASSERT(RA.contains(reg.reg()) != false);
       const uint32_t grfOffset = RA.find(reg.reg())->second;
-      const uint32_t suboffset = reg.subphysical ? reg.subnr : 0;
+      const uint32_t suboffset = reg.subphysical ? reg.nr * GEN_REG_SIZE + reg.subnr : 0;
       const GenRegister dst = setGenReg(reg, grfOffset + suboffset);
       if (reg.quarter != 0)
         return GenRegister::Qn(dst, reg.quarter);
@@ -1522,7 +1549,8 @@ do { \
       const ir::FunctionArgument &arg = this->opaque->ctx.getFunction().getArg(subType);
       if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
           arg.type == ir::FunctionArgument::LOCAL_POINTER  ||
-          arg.type == ir::FunctionArgument::CONSTANT_POINTER)
+          arg.type == ir::FunctionArgument::CONSTANT_POINTER||
+          arg.type == ir::FunctionArgument::PIPE)
         regSize = this->opaque->ctx.getPointerSize();
         regSize = arg.size;
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index a8eb2e4..6c73f5e 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -169,6 +169,12 @@ namespace gbe
+    void useVirtualFlag(ir::Register flag, unsigned pred) {
+      modFlag = 0;
+      physicalFlag = 0;
+      flagIndex = flag;
+      predicate = pred;
+    }
     void useFlag(int nr, int subnr) {
       flag = nr;
       subFlag = subnr;
@@ -268,6 +274,10 @@ namespace gbe
     static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
       GenRegister r = reg;
+      if(subnr >= 32){
+        nr += subnr / 32;
+        subnr = subnr % 32;
+      }
       r.nr += nr;
       r.subnr += subnr;
       r.subphysical = 1;
@@ -283,6 +293,14 @@ namespace gbe
       return r;
+    INLINE bool isSameRegion(GenRegister reg) const {
+      return reg.file == file &&
+             typeSize(reg.type) == typeSize(type) &&
+             reg.vstride == vstride &&
+             reg.width == width &&
+             reg.hstride == hstride;
+    }
     static INLINE uint32_t grfOffset(GenRegister reg) {
       return reg.nr * GEN_REG_SIZE + reg.subnr;
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 2224880..85d0aa9 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -89,7 +89,8 @@ namespace gbe {
   Kernel::Kernel(const std::string &name) :
     name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false),
         slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), printfSet(NULL),
-        profilingInfo(NULL) {}
+        profilingInfo(NULL), useDeviceEnqueue(false) {}
   Kernel::~Kernel(void) {
     if(ctx) GBE_DELETE(ctx);
     if(samplerSet) GBE_DELETE(samplerSet);
@@ -106,11 +107,14 @@ namespace gbe {
     return it->offset; // we found it!
-  Program::Program(uint32_t fast_relaxed_math) : fast_relaxed_math(fast_relaxed_math), constantSet(NULL) {}
+  Program::Program(uint32_t fast_relaxed_math) : fast_relaxed_math(fast_relaxed_math), 
+                               constantSet(NULL),
+                               relocTable(NULL) {}
   Program::~Program(void) {
     for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it)
     if (constantSet) delete constantSet;
+    if (relocTable) delete relocTable;
@@ -119,7 +123,10 @@ namespace gbe {
   IVAR(OCL_PROFILING_LOG, 0, 0, 1); // Int for different profiling types.
-  bool Program::buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel) {
+  bool Program::buildFromLLVMFile(const char *fileName,
+                                         const void* module,
+                                         std::string &error,
+                                         int optLevel) {
     ir::Unit *unit = new ir::Unit();
     llvm::Module * cloned_module = NULL;
     bool ret = false;
@@ -174,6 +181,8 @@ namespace gbe {
   bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
     constantSet = new ir::ConstantSet(unit.getConstantSet());
+    relocTable = new ir::RelocTable(unit.getRelocTable());
+    blockFuncs = unit.blockFuncs;
     const auto &set = unit.getFunctionSet();
     const uint32_t kernelNum = set.size();
     if (OCL_OUTPUT_GEN_IR) std::cout << unit;
@@ -212,6 +221,7 @@ namespace gbe {
     uint32_t ret_size = 0;
     uint32_t ker_num = kernels.size();
     uint32_t has_constset = 0;
+    uint32_t has_relocTable = 0;
@@ -227,6 +237,18 @@ namespace gbe {
+    if(relocTable) {
+      has_relocTable = 1;
+      OUT_UPDATE_SZ(has_relocTable);
+      uint32_t sz = relocTable->serializeToBin(outs);
+      if (!sz)
+        return 0;
+      ret_size += sz;
+    } else {
+      OUT_UPDATE_SZ(has_relocTable);
+    }
     for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it) {
       uint32_t sz = it->second->serializeToBin(outs);
@@ -247,6 +269,7 @@ namespace gbe {
     int has_constset = 0;
     uint32_t ker_num;
     uint32_t magic;
+    uint32_t has_relocTable = 0;
     if (magic != magic_begin)
@@ -263,6 +286,17 @@ namespace gbe {
       total_size += sz;
+    IN_UPDATE_SZ(has_relocTable);
+    if(has_relocTable) {
+      relocTable = new ir::RelocTable;
+      uint32_t sz = relocTable->deserializeFromBin(ins);
+      if (sz == 0)
+        return 0;
+      total_size += sz;
+    }
     for (uint32_t i = 0; i < ker_num; i++) {
@@ -303,6 +337,8 @@ namespace gbe {
     outs.write(name.c_str(), name.size());
     ret_size += sizeof(char)*name.size();
+    OUT_UPDATE_SZ(oclVersion);
     for (i = 0; i < argNum; i++) {
       KernelArgument& arg = args[i];
@@ -415,6 +451,8 @@ namespace gbe {
     name = c_name;
     delete[] c_name;
+    IN_UPDATE_SZ(oclVersion);
     args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, argNum);
     for (uint32_t i = 0; i < argNum; i++) {
@@ -616,7 +654,7 @@ namespace gbe {
   static bool buildModuleFromSource(const char *source, llvm::Module** out_module, llvm::LLVMContext* llvm_ctx,
                                     std::string dumpLLVMFileName, std::string dumpSPIRBinaryName, std::vector<std::string>& options, size_t stringSize, char *err,
-                                    size_t *errSize) {
+                                    size_t *errSize, uint32_t oclVersion) {
     // Arguments to pass to the clang frontend
     vector<const char *> args;
     bool bFastMath = false;
@@ -626,6 +664,17 @@ namespace gbe {
+    // ParseCommandLineOptions, which is used for the -mllvm args, cannot be used from multiple threads,
+    // and GVN now has a 100-instruction limit on block scans. So pass a bigger limit
+    // only once (guarded by the static flag below); this also fixes the multithread bug.
+    static bool ifsetllvm = false;
+    if(!ifsetllvm) {
+      args.push_back("-mllvm");
+      args.push_back("-memdep-block-scan-limit=200");
+      ifsetllvm = true;
+    }
@@ -643,7 +692,11 @@ namespace gbe {
-    args.push_back("spir");
+    if (oclVersion >= 200) {
+      args.push_back("spir64");
+      args.push_back("-fblocks");
+    } else
+      args.push_back("spir");
 #endif /* LLVM_VERSION_MINOR <= 2 */
@@ -691,7 +744,18 @@ namespace gbe {
     clang::LangOptions & lang_opts = Clang.getLangOpts();
     lang_opts.OpenCL = 1;
-    GBE_ASSERT(Clang.getFrontendOpts().LLVMArgs.empty() && "We do not have llvm args now");
+    //llvm flags need command line parsing to take effect
+    if (!Clang.getFrontendOpts().LLVMArgs.empty()) {
+      unsigned NumArgs = Clang.getFrontendOpts().LLVMArgs.size();
+      const char **Args = new const char*[NumArgs + 2];
+      Args[0] = "clang (LLVM option parsing)";
+      for (unsigned i = 0; i != NumArgs; ++i){
+        Args[i + 1] = Clang.getFrontendOpts().LLVMArgs[i].c_str();
+      }
+      Args[NumArgs + 1] = 0;
+      llvm::cl::ParseCommandLineOptions(NumArgs + 1, Args);
+      delete [] Args;
+    }
     // Create an action and make the compiler instance carry it out
     std::unique_ptr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction(llvm_ctx));
@@ -774,6 +838,7 @@ namespace gbe {
@@ -787,10 +852,9 @@ namespace gbe {
                                      int& optLevel,
                                      size_t stringSize,
                                      char *err,
-                                     size_t *errSize)
+                                     size_t *errSize,
+                                     uint32_t &oclVersion)
-    std::string dirs = OCL_PCH_PATH;
-    std::istringstream idirs(dirs);
     std::string pchFileName;
     bool findPCH = false;
 #if defined(__ANDROID__)
@@ -803,8 +867,6 @@ namespace gbe {
     std::string hdirs = OCL_HEADER_FILE_DIR;
     if(hdirs == "")
       hdirs = OCL_HEADER_DIR;
-    if(dirs == "")
-      dirs = OCL_PCH_OBJECT;
     std::istringstream hidirs(hdirs);
     std::string headerFilePath;
     bool findOcl = false;
@@ -816,6 +878,7 @@ namespace gbe {
+    (void) findOcl;
       if(options) {
@@ -831,6 +894,8 @@ namespace gbe {
     if (options) {
       char *c_str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
+      if (c_str == NULL)
+        return false;
       memcpy(c_str, options, strlen(options) + 1);
       std::string optionStr(c_str);
       const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
@@ -912,11 +977,16 @@ EXTEND_QUOTE:
         if(str.find("-cl-std=") != std::string::npos) {
           useDefaultCLCVersion = false;
-          if (str == "-cl-std=CL1.1")
+          if (str == "-cl-std=CL1.1") {
-          else if (str == "-cl-std=CL1.2")
+            oclVersion = 110;
+          } else if (str == "-cl-std=CL1.2") {
-          else {
+            oclVersion = 120;
+          } else if (str == "-cl-std=CL2.0") {
+            clOpt.push_back("-D__OPENCL_C_VERSION__=200");
+            oclVersion = 200;
+          } else {
             if (err && stringSize > 0 && errSize)
               *errSize = snprintf(err, stringSize, "Invalid build option: %s\n", str.c_str());
             return false;
@@ -952,8 +1022,15 @@ EXTEND_QUOTE:
     if (useDefaultCLCVersion) {
+#ifdef ENABLE_OPENCL_20
+      clOpt.push_back("-D__OPENCL_C_VERSION__=200");
+      clOpt.push_back("-cl-std=CL2.0");
+      oclVersion = 200;
+      oclVersion = 120;
     //for clCompilerProgram usage.
@@ -961,6 +1038,14 @@ EXTEND_QUOTE:
+    std::string dirs = OCL_PCH_PATH;
+    if(oclVersion >= 200)
+      dirs = OCL_PCH_20_PATH;
+    if(dirs == "") {
+      dirs = oclVersion >= 200 ? OCL_PCH_OBJECT_20 : OCL_PCH_OBJECT;
+    }
+    std::istringstream idirs(dirs);
     while (getline(idirs, pchFileName, ':')) {
       if(access(pchFileName.c_str(), R_OK) == 0) {
         findPCH = true;
@@ -991,10 +1076,11 @@ EXTEND_QUOTE:
     std::vector<std::string> clOpt;
     std::string dumpLLVMFileName, dumpASMFileName;
     std::string dumpSPIRBinaryName;
+    uint32_t oclVersion = 0;
     if (!processSourceAndOption(source, options, NULL, clOpt,
                                 dumpLLVMFileName, dumpASMFileName, dumpSPIRBinaryName,
-                                stringSize, err, errSize))
+                                stringSize, err, errSize, oclVersion))
       return NULL;
     gbe_program p;
@@ -1006,7 +1092,7 @@ EXTEND_QUOTE:
     if (buildModuleFromSource(source, &out_module, llvm_ctx, dumpLLVMFileName, dumpSPIRBinaryName, clOpt,
-                              stringSize, err, errSize)) {
+                              stringSize, err, errSize, oclVersion)) {
     // Now build the program from llvm
       size_t clangErrSize = 0;
       if (err != NULL && *errSize != 0) {
@@ -1053,9 +1139,10 @@ EXTEND_QUOTE:
     std::vector<std::string> clOpt;
     std::string dumpLLVMFileName, dumpASMFileName;
     std::string dumpSPIRBinaryName;
+    uint32_t oclVersion = 0;
     if (!processSourceAndOption(source, options, temp_header_path, clOpt,
                                 dumpLLVMFileName, dumpASMFileName, dumpSPIRBinaryName,
-                                optLevel, stringSize, err, errSize))
+                                optLevel, stringSize, err, errSize, oclVersion))
       return NULL;
     gbe_program p;
@@ -1070,7 +1157,7 @@ EXTEND_QUOTE:
     if (buildModuleFromSource(source, &out_module, llvm_ctx, dumpLLVMFileName, dumpSPIRBinaryName, clOpt,
-                              stringSize, err, errSize)) {
+                              stringSize, err, errSize, oclVersion)) {
     // Now build the program from llvm
       if (err != NULL) {
         GBE_ASSERT(errSize != NULL);
@@ -1164,12 +1251,30 @@ EXTEND_QUOTE:
+  static size_t programGetGlobalRelocCount(gbe_program gbeProgram) {
+    if (gbeProgram == NULL) return 0;
+    const gbe::Program *program = (const gbe::Program*) gbeProgram;
+    return program->getGlobalRelocCount();
+  }
+  static void programGetGlobalRelocTable(gbe_program gbeProgram, char *mem) {
+    if (gbeProgram == NULL) return;
+    const gbe::Program *program = (const gbe::Program*) gbeProgram;
+    program->getGlobalRelocTable(mem);
+  }
   static uint32_t programGetKernelNum(gbe_program gbeProgram) {
     if (gbeProgram == NULL) return 0;
     const gbe::Program *program = (const gbe::Program*) gbeProgram;
     return program->getKernelNum();
+  const static char* programGetDeviceEnqueueKernelName(gbe_program gbeProgram, uint32_t index) {
+    if (gbeProgram == NULL) return 0;
+    const gbe::Program *program = (const gbe::Program*) gbeProgram;
+    return program->getDeviceEnqueueKernelName(index);
+  }
   static gbe_kernel programGetKernelByName(gbe_program gbeProgram, const char *name) {
     if (gbeProgram == NULL) return NULL;
     const gbe::Program *program = (gbe::Program*) gbeProgram;
@@ -1228,6 +1333,8 @@ EXTEND_QUOTE:
         return (void *)(info->typeQual.c_str());
       case GBE_GET_ARG_INFO_NAME:
         return (void *)(info->argName.c_str());
+        return (void *)((size_t)info->typeSize);
@@ -1333,6 +1440,12 @@ EXTEND_QUOTE:
     return ps->getPrintfNum();
+  /*! Report Kernel::getUseDeviceEnqueue() for the kernel behind the opaque
+   *  handle. A NULL handle yields 0. */
+  static uint32_t kernelUseDeviceEnqueue(gbe_kernel gbeKernel) {
+    const gbe::Kernel *k = (const gbe::Kernel*) gbeKernel;
+    if (k == NULL)
+      return 0;
+    return k->getUseDeviceEnqueue();
+  }
   static void* kernelDupPrintfSet(gbe_kernel gbeKernel) {
     if (gbeKernel == NULL) return NULL;
     const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
@@ -1376,6 +1489,12 @@ EXTEND_QUOTE:
+  /*! Report Kernel::getOclVersion() for the kernel behind the opaque handle.
+   *  A NULL handle yields 0. */
+  static uint32_t kernelGetOclVersion(gbe_kernel gbeKernel) {
+    if (gbeKernel != NULL)
+      return ((const gbe::Kernel*) gbeKernel)->getOclVersion();
+    return 0;
+  }
   static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
     return 0u;
@@ -1405,11 +1524,14 @@ GBE_EXPORT_SYMBOL gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NU
 GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table = NULL;
 GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource = NULL;
 GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_cb *gbe_program_get_kernel = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_device_enqueue_kernel_name_cb *gbe_program_get_device_enqueue_kernel_name = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_name_cb *gbe_kernel_get_name = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_attributes_cb *gbe_kernel_get_attributes = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_code_cb *gbe_kernel_get_code = NULL;
@@ -1433,6 +1555,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data =
 GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_ocl_version_cb *gbe_kernel_get_ocl_version = NULL;
 GBE_EXPORT_SYMBOL gbe_output_profiling_cb *gbe_output_profiling = NULL;
 GBE_EXPORT_SYMBOL gbe_dup_profiling_cb *gbe_dup_profiling = NULL;
 GBE_EXPORT_SYMBOL gbe_get_profiling_bti_cb *gbe_get_profiling_bti = NULL;
@@ -1441,6 +1564,7 @@ GBE_EXPORT_SYMBOL gbe_dup_printfset_cb *gbe_dup_printfset = NULL;
 GBE_EXPORT_SYMBOL gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti = NULL;
 GBE_EXPORT_SYMBOL gbe_release_printf_info_cb *gbe_release_printf_info = NULL;
 GBE_EXPORT_SYMBOL gbe_output_printf_cb *gbe_output_printf = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_use_device_enqueue_cb *gbe_kernel_use_device_enqueue = NULL;
 namespace gbe
@@ -1455,9 +1579,12 @@ namespace gbe
       gbe_program_check_opt = gbe::programCheckOption;
       gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
       gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+      gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount;
+      gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable;
       gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
       gbe_program_delete = gbe::programDelete;
       gbe_program_get_kernel_num = gbe::programGetKernelNum;
+      gbe_program_get_device_enqueue_kernel_name = gbe::programGetDeviceEnqueueKernelName;
       gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
       gbe_program_get_kernel = gbe::programGetKernel;
       gbe_kernel_get_name = gbe::kernelGetName;
@@ -1483,6 +1610,7 @@ namespace gbe
       gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
       gbe_kernel_get_image_size = gbe::kernelGetImageSize;
       gbe_kernel_get_image_data = gbe::kernelGetImageData;
+      gbe_kernel_get_ocl_version = gbe::kernelGetOclVersion;
       gbe_get_profiling_bti = gbe::kernelGetProfilingBTI;
       gbe_get_printf_num = gbe::kernelGetPrintfNum;
       gbe_dup_profiling = gbe::kernelDupProfiling;
@@ -1491,6 +1619,7 @@ namespace gbe
       gbe_dup_printfset = gbe::kernelDupPrintfSet;
       gbe_release_printf_info = gbe::kernelReleasePrintfSet;
       gbe_output_printf = gbe::kernelOutputPrintf;
+      gbe_kernel_use_device_enqueue = gbe::kernelUseDeviceEnqueue;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index db770a6..e601c97 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -55,6 +55,7 @@ enum gbe_arg_type {
   GBE_ARG_LOCAL_PTR = 3,        // __local
   GBE_ARG_IMAGE = 4,            // image2d_t, image3d_t
   GBE_ARG_SAMPLER = 5,          // sampler_t
+  GBE_ARG_PIPE = 6,             // pipe
   GBE_ARG_INVALID = 0xffffffff
@@ -65,6 +66,7 @@ enum gbe_get_arg_info_value {
   GBE_GET_ARG_INFO_INVALID = 0xffffffff
@@ -86,6 +88,9 @@ enum gbe_curbe_type {
@@ -109,6 +114,9 @@ enum gbe_curbe_type {
@@ -257,6 +265,11 @@ extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_
 typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem);
 extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data;
+typedef size_t (gbe_program_get_global_reloc_count_cb)(gbe_program gbeProgram);
+extern gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count;
+typedef void (gbe_program_get_global_reloc_table_cb)(gbe_program gbeProgram, char *mem);
+extern gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table;
 /*! Get the size of defined samplers */
 typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
 extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
@@ -289,6 +302,9 @@ extern gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name;
 typedef gbe_kernel (gbe_program_get_kernel_cb)(gbe_program, uint32_t ID);
 extern gbe_program_get_kernel_cb *gbe_program_get_kernel;
+typedef const char* (gbe_program_get_device_enqueue_kernel_name_cb)(gbe_program, uint32_t ID);
+extern gbe_program_get_device_enqueue_kernel_name_cb *gbe_program_get_device_enqueue_kernel_name;
 /*! Get the kernel name */
 typedef const char *(gbe_kernel_get_name_cb)(gbe_kernel);
 extern gbe_kernel_get_name_cb *gbe_kernel_get_name;
@@ -361,6 +377,12 @@ extern gbe_kernel_use_slm_cb *gbe_kernel_use_slm;
 /*! Get slm size needed for kernel local variables */
 typedef int32_t (gbe_kernel_get_slm_size_cb)(gbe_kernel);
 extern gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size;
+/*! Get the kernel's opencl version. */
+typedef uint32_t (gbe_kernel_get_ocl_version_cb)(gbe_kernel);
+extern gbe_kernel_get_ocl_version_cb *gbe_kernel_get_ocl_version;
+/*! Whether the kernel uses device enqueue or not. */
+typedef uint32_t (gbe_kernel_use_device_enqueue_cb)(gbe_kernel);
+extern gbe_kernel_use_device_enqueue_cb *gbe_kernel_use_device_enqueue;
 /*mutex to lock global llvmcontext access.*/
 extern void acquireLLVMContextLock();
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 1f0ec55..1aff8b9 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -57,6 +57,7 @@ namespace gbe {
       std::string accessQual;
       std::string typeQual;
       std::string argName;
+      uint32_t typeSize;
     ArgInfo info;
@@ -125,6 +126,9 @@ namespace gbe {
     INLINE bool getUseSLM(void) const { return this->useSLM; }
     /*! get slm size for kernel local variable */
     INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
+    /*! Set and return the OpenCL version (120 for 1.2, 200 for 2.0) */
+    INLINE void setOclVersion(uint32_t version) { this->oclVersion = version; }
+    INLINE uint32_t getOclVersion(void) const { return this->oclVersion; }
     /*! Set sampler set. */
     void setSamplerSet(ir::SamplerSet *from) {
       samplerSet = from;
@@ -228,6 +232,12 @@ namespace gbe {
     virtual uint32_t serializeToBin(std::ostream& outs);
     virtual uint32_t deserializeFromBin(std::istream& ins);
     virtual void printStatus(int indent, std::ostream& outs);
+    /*! Current value of the useDeviceEnqueue flag */
+    INLINE bool getUseDeviceEnqueue(void) const {
+      return useDeviceEnqueue;
+    }
+    /*! Change the device enqueue info of the kernel; returns the value just stored */
+    INLINE bool setUseDeviceEnqueue(bool useDeviceEnqueue) {
+      return this->useDeviceEnqueue = useDeviceEnqueue;
+    }
     friend class Context;      //!< Owns the kernels
@@ -240,6 +250,7 @@ namespace gbe {
     uint32_t simdWidth;        //!< SIMD size for the kernel (lane number)
     uint32_t stackSize;        //!< Stack size (0 if unused)
     uint32_t scratchSize;      //!< Scratch memory size (may be 0 if unused)
+    uint32_t oclVersion;       //!< Opencl Version (120 for 1.2, 200 for 2.0)
     bool useSLM;               //!< SLM requires a special HW config
     uint32_t slmSize;          //!< slm size for kernel variable
     Context *ctx;              //!< Save context after compiler to alloc constant buffer curbe
@@ -249,6 +260,7 @@ namespace gbe {
     ir::ProfilingInfo *profilingInfo;  //!< Copy from the corresponding function.
     uint32_t compileWgSize[3]; //!< required work group size by kernel attribute.
     std::string functionAttributes; //!< function attribute qualifiers combined.
+    bool useDeviceEnqueue;          //!< Has device enqueue?
     GBE_CLASS(Kernel);         //!< Use custom allocators
@@ -285,6 +297,12 @@ namespace gbe {
       return kernel;
+    /*! Name stored at position index of blockFuncs, or NULL when index is
+     *  not below blockFuncs.size() */
+    const char *getDeviceEnqueueKernelName(uint32_t index) const {
+      return (index < blockFuncs.size()) ? blockFuncs[index].c_str() : NULL;
+    }
     /*! Build a program from a ir::Unit */
     bool buildFromUnit(const ir::Unit &unit, std::string &error);
     /*! Buils a program from a LLVM source code */
@@ -296,6 +314,8 @@ namespace gbe {
     /*! Get the content of global constant arrays */
     void getGlobalConstantData(char *mem) const { constantSet->getData(mem); }
+    /*! Entry count of the relocation table, as given by relocTable->getCount() */
+    uint32_t getGlobalRelocCount(void) const {
+      return this->relocTable->getCount();
+    }
+    /*! Hand the buffer p to relocTable->getData() */
+    void getGlobalRelocTable(char *p) const {
+      this->relocTable->getData(p);
+    }
     static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
     static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
@@ -327,6 +347,10 @@ namespace gbe {
     map<std::string, Kernel*> kernels;
     /*! Global (constants) outside any kernel */
     ir::ConstantSet *constantSet;
+    /*! relocation table */
+    ir::RelocTable *relocTable;
+    /*! device enqueue functions */
+    vector<std::string> blockFuncs;
     /*! Use custom allocators */
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
index 34d04dd..64cacd9 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -23,6 +23,7 @@
 #include "ir/constant.cpp"
 #include "ir/printf.cpp"
 #include "ir/profiling.cpp"
+#include "ir/reloc.cpp"
 #pragma GCC diagnostic ignored "-Wunused-function"
 #pragma GCC diagnostic ignored "-Wunused-variable"
@@ -39,6 +40,7 @@ struct BinInterpCallBackInitializer
     gbe_program_get_kernel_num = gbe::programGetKernelNum;
     gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
     gbe_program_get_kernel = gbe::programGetKernel;
+    gbe_program_get_device_enqueue_kernel_name = gbe::programGetDeviceEnqueueKernelName;
     gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
     gbe_kernel_get_code = gbe::kernelGetCode;
     gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
@@ -62,8 +64,11 @@ struct BinInterpCallBackInitializer
     gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
     gbe_program_delete = gbe::programDelete;
     gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+    gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount;
+    gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable;
     gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
     gbe_kernel_get_image_data = gbe::kernelGetImageData;
+    gbe_kernel_get_ocl_version = gbe::kernelGetOclVersion;
     gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
     gbe_get_profiling_bti = gbe::kernelGetProfilingBTI;
     gbe_dup_profiling = gbe::kernelDupProfiling;
@@ -73,6 +78,7 @@ struct BinInterpCallBackInitializer
     gbe_dup_printfset = gbe::kernelDupPrintfSet;
     gbe_release_printf_info = gbe::kernelReleasePrintfSet;
     gbe_output_printf = gbe::kernelOutputPrintf;
+    gbe_kernel_use_device_enqueue = gbe::kernelUseDeviceEnqueue;
   ~BinInterpCallBackInitializer() {
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
index 54ae3f1..f16f5b7 100644
--- a/backend/src/ir/constant.cpp
+++ b/backend/src/ir/constant.cpp
@@ -27,8 +27,7 @@
 namespace gbe {
 namespace ir {
-  void ConstantSet::append(const char *data,
-                           const std::string &name,
+  void ConstantSet::append(const std::string &name,
                            uint32_t size,
                            uint32_t alignment)
@@ -36,8 +35,7 @@ namespace ir {
     const uint32_t padding = offset - this->data.size();
     const Constant constant(name, size, alignment, offset);
-    for (uint32_t i = 0; i < padding; ++i) this->data.push_back(0);
-    for (uint32_t i = 0; i < size; ++i) this->data.push_back(data[i]);
+    this->data.resize(padding + size + this->data.size());
 #define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
index c9080b8..0835fad 100644
--- a/backend/src/ir/constant.hpp
+++ b/backend/src/ir/constant.hpp
@@ -69,7 +69,7 @@ namespace ir {
     /*! Append a new constant in the constant set */
-    void append(const char*, const std::string&, uint32_t size, uint32_t alignment);
+    void append(const std::string&, uint32_t size, uint32_t alignment);
     /*! Number of constants */
     size_t getConstantNum(void) const { return constants.size(); }
     /*! Get a special constant */
@@ -91,6 +91,11 @@ namespace ir {
       for (size_t i = 0; i < data.size(); i ++)
         mem[i] = data[i];
+    /*! Overwrite size elements of data, starting at index offset, with the
+     *  values read from mem.
+     *  \param mem source buffer; it is only read, hence const
+     *  \param offset index of the first element of data to overwrite
+     *  \param size number of elements copied from mem
+     *  NOTE(review): no bounds check is done here; presumably the caller has
+     *  already reserved [offset, offset + size) through append() - confirm. */
+    void setData(const char *mem, int offset, int size) {
+      for (int i = 0; i < size; i++) {
+        data[i+offset] = mem[i];
+      }
+    }
     ConstantSet() {}
     ConstantSet(const ConstantSet& other) : Serializable(other),
                 data(other.data), constants(other.constants) {}
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 2fe080a..4c19a42 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -44,7 +44,7 @@ namespace ir {
   Function::Function(const std::string &name, const Unit &unit, Profile profile) :
     name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0), stackSize(0),
-    wgBroadcastSLM(-1), tidMapSLM(-1)
+    wgBroadcastSLM(-1), tidMapSLM(-1), useDeviceEnqueue(false)
     samplerSet = GBE_NEW(SamplerSet);
@@ -62,6 +62,10 @@ namespace ir {
     return unit.getPointerFamily();
+  /*! The OpenCL version of a function is whatever its unit reports */
+  uint32_t Function::getOclVersion(void) const {
+    const uint32_t version = unit.getOclVersion();
+    return version;
+  }
   void Function::addLoop(LabelIndex preheader,
                         int parent,
                         const vector<LabelIndex> &bbs,
@@ -353,6 +357,7 @@ namespace ir {
           out << "structure." << input.size;
         case FunctionArgument::IMAGE: out << "image"; break;
+        case FunctionArgument::PIPE: out << "pipe"; break;
         default: break;
       out << " %" << input.reg << " " << input.name << std::endl;
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 71a6d07..5fcb14a 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -170,8 +170,9 @@ namespace ir {
       LOCAL_POINTER     = 2, // __local
       VALUE             = 3, // int, float
       STRUCTURE         = 4, // struct foo
-      IMAGE             = 5,  // image*d_t
-      SAMPLER           = 6
+      IMAGE             = 5, // image*d_t
+      SAMPLER           = 6,
+      PIPE              = 7  // pipe
     struct InfoFromLLVM { // All the info about passed by llvm, using -cl-kernel-arg-info
@@ -181,6 +182,7 @@ namespace ir {
       std::string accessQual;
       std::string typeQual;
       std::string argName; // My different from arg->getName()
+      uint32_t typeSize;
       // only llvm-3.6 or later has kernel_arg_base_type in metadata.
@@ -235,6 +237,9 @@ namespace ir {
                isImage2dT() || isImage2dArrayT() || isImage3dT();
+      /*! True when the type qualifier string is exactly "pipe" */
+      bool isPipeType() const {
+        const bool isPipe = (typeQual == "pipe");
+        return isPipe;
+      }
     /*! Create a function input argument */
@@ -551,6 +556,13 @@ namespace ir {
     /*! Output the control flow graph to .dot file */
     void outputCFG();
+    uint32_t getOclVersion(void) const;
+    /*! Current value of the useDeviceEnqueue flag */
+    INLINE bool getUseDeviceEnqueue(void) const {
+      return useDeviceEnqueue;
+    }
+    /*! Change the device enqueue info of the function; returns the value just stored */
+    INLINE bool setUseDeviceEnqueue(bool useDeviceEnqueue) {
+      return this->useDeviceEnqueue = useDeviceEnqueue;
+    }
     friend class Context;           //!< Can freely modify a function
     std::string name;               //!< Function name
@@ -578,6 +590,7 @@ namespace ir {
     std::string functionAttributes; //!< function attribute qualifiers combined.
     int32_t wgBroadcastSLM;         //!< Used for broadcast the workgroup value.
     int32_t tidMapSLM;              //!< Used to store the map between groupid and hw thread.
+    bool useDeviceEnqueue;          //!< Has device enqueue?
     GBE_CLASS(Function);            //!< Use custom allocator
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index ed64580..f0c3957 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1070,18 +1070,20 @@ namespace ir {
       public TupleDstPolicy<MediaBlockReadInstruction>
-      INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum) {
+      INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum, Type type) {
         this->opcode = OP_MBREAD;
         this->dst = dst;
         this->dstNum = vec_size;
         this->src = srcTuple;
         this->srcNum = srcNum;
         this->imageIdx = imageIdx;
+        this->type = type;
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
-        out << (int)this->getVectorSize();
+        out << "." << type << "."
+            << (int)this->getVectorSize();
         out << " {";
         for (uint32_t i = 0; i < dstNum; ++i)
           out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : "");
@@ -1092,12 +1094,14 @@ namespace ir {
       INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE uint8_t getVectorSize(void) const { return this->dstNum; }
+      INLINE Type getType(void) const { return this->type; }
       Tuple src;
       Tuple dst;
       uint8_t imageIdx;
       uint8_t srcNum;
       uint8_t dstNum;
+      Type type;
     class ALIGNED_INSTRUCTION MediaBlockWriteInstruction :
@@ -1107,17 +1111,19 @@ namespace ir {
-      INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) {
+      INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) {
         this->opcode = OP_MBWRITE;
         this->src = srcTuple;
         this->srcNum = srcNum;
         this->imageIdx = imageIdx;
         this->vec_size = vec_size;
+        this->type = type;
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
-        out << (int)this->getVectorSize()
+        out << "." << type << "."
+            << (int)this->getVectorSize()
             << " 2D surface id " << (int)this->getImageIndex()
             << " byte coord x %" << this->getSrc(fn, 0)
             << " row coord y %" << this->getSrc(fn, 1);
@@ -1128,12 +1134,14 @@ namespace ir {
       INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE uint8_t getVectorSize(void) const { return this->vec_size; }
+      INLINE Type getType(void) const { return this->type; }
       Tuple src;
       Register dst[0];
       uint8_t imageIdx;
       uint8_t srcNum;
       uint8_t vec_size;
+      Type type;
@@ -1349,10 +1357,11 @@ namespace ir {
       if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
         return false;
-      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
+      const RegisterFamily family = getFamily(this->type);
+      if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
         return false;
       for (uint32_t srcID = 0; srcID < srcNum-1u; ++srcID)
-        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
+        if (UNLIKELY(checkRegisterData(family, getSrc(fn, srcID+1u), fn, whyNot) == false))
           return false;
       return true;
@@ -1466,7 +1475,8 @@ namespace ir {
                                  SYNC_LOCAL_READ_FENCE |
                                  SYNC_LOCAL_WRITE_FENCE |
                                  SYNC_GLOBAL_READ_FENCE |
-                                 SYNC_GLOBAL_WRITE_FENCE;
+                                 SYNC_GLOBAL_WRITE_FENCE |
+                                 SYNC_IMAGE_FENCE;
       if (UNLIKELY(this->parameters > maxParams)) {
         whyNot = "Invalid parameters for sync instruction";
         return false;
@@ -1493,8 +1503,9 @@ namespace ir {
     INLINE bool SimdShuffleInstruction::wellFormed(const Function &fn, std::string &whyNot) const
-      if (UNLIKELY( this->type != TYPE_U32 && this->type != TYPE_S32 && this->type != TYPE_FLOAT)) {
-        whyNot = "Only support S32/U32/FLOAT type";
+      if (UNLIKELY( this->type != TYPE_U32 && this->type != TYPE_S32 && this->type != TYPE_FLOAT &&
+                    this->type != TYPE_U16 && this->type != TYPE_S16)) {
+        whyNot = "Only support S16/U16/S32/U32/FLOAT type";
         return false;
@@ -1643,12 +1654,8 @@ namespace ir {
             whyNot = "Wrong number of source.";
             return false;
           } else {
-            const RegisterFamily fam = fn.getPointerFamily();
-            for (uint32_t srcID = 1; srcID < this->srcNum; ++srcID) {
-              const Register regID = fn.getRegister(src, srcID);
-              if (UNLIKELY(checkRegisterData(fam, regID, fn, whyNot) == false))
-                return false;
-            }
+            if (UNLIKELY(checkRegisterData(FAMILY_DWORD, fn.getRegister(src, 1), fn, whyNot) == false))
+              return false;
@@ -1714,6 +1721,31 @@ namespace ir {
     INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
       out << "." << AS;
+#define OUT_ATOMIC_OP(TYPE)     \
+      case ATOMIC_OP_##TYPE:    \
+      {    out << "." << #TYPE; \
+          break; \
+      }
+      switch(atomicOp)
+      {
+        OUT_ATOMIC_OP(OR)
+        default:
+          out << "." << "INVALID";
+          assert(0);
+      };
       out << " %" << this->getDst(fn, 0);
       out << " {" << "%" << this->getSrc(fn, 0) << "}";
       for (uint32_t i = 1; i < srcNum; ++i)
@@ -1826,7 +1858,7 @@ namespace ir {
     static const char *syncStr[syncFieldNum] = {
-      "workgroup", "local_read", "local_write", "global_read", "global_write"
+      "workgroup", "local_read", "local_write", "global_read", "global_write", "image"
     INLINE void SyncInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1885,7 +1917,8 @@ namespace ir {
       out << " %" << this->getDst(fn, 0);
-      out << " %" << this->getSrc(fn, 0);
+      for (uint32_t i = 0; i < this->getSrcNum(); ++i)
+        out << " %" << this->getSrc(fn, i);
       if (this->workGroupOp == WORKGROUP_OP_BROADCAST) {
         do {
@@ -1910,7 +1943,7 @@ namespace ir {
         } while(0);
-      out << "TheadID Map at SLM: " << this->slmAddr;
+      out << " (TheadID Map at SLM: " << this->slmAddr << ")";
     INLINE void SubGroupInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1987,6 +2020,7 @@ namespace ir {
       case MEM_CONSTANT: return out << "constant";
       case MEM_PRIVATE: return out << "private";
       case MEM_MIXED: return out << "mixed";
+      case MEM_GENERIC: return out << "generic";
       case MEM_INVALID: return out << "invalid";
     return out;
@@ -2374,8 +2408,10 @@ DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti())
 DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), getType(fn, ID))
 DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), getImageIndex())
 DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), getVectorSize())
+DECL_MEM_FN(MediaBlockReadInstruction, Type, getType(void), getType())
 DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
 DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize())
+DECL_MEM_FN(MediaBlockWriteInstruction, Type, getType(void), getType())
 #undef DECL_MEM_FN
@@ -2437,6 +2473,7 @@ DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
@@ -2683,12 +2720,12 @@ DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
     return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, num).convert();
-  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum) {
-    return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum).convert();
+  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type) {
+    return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type).convert();
-  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) {
-    return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size).convert();
+  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) {
+    return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size, type).convert();
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index b2b0b49..16c2045 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -62,6 +62,7 @@ namespace ir {
     MEM_CONSTANT,   //!< Immutable global memory
     MEM_PRIVATE,    //!< Per thread private memory
     MEM_MIXED,      //!< mixed address space pointer.
+    MEM_GENERIC,    //!< generic address space pointer.
@@ -541,17 +542,19 @@ namespace ir {
-    SYNC_INVALID            = 1<<5
+    SYNC_IMAGE_FENCE        = 1<<5,
+    SYNC_INVALID            = 1<<6
   /*! 5 bits to encode all possible synchronization capablities */
-  static const uint32_t syncFieldNum = 5u;
+  static const uint32_t syncFieldNum = 6u;
   /*! When barrier(CLK_LOCAL_MEM_FENCE) is issued */
   static const uint32_t syncLocalBarrier = SYNC_WORKGROUP_EXEC |SYNC_LOCAL_WRITE_FENCE | SYNC_LOCAL_READ_FENCE;
   /*! When barrier(CLK_GLOBAL_MEM_FENCE) is issued */
   static const uint32_t syncGlobalBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE;
   /*! Sync instructions are used to order loads and stores for a given memory
    *  space and/or to serialize threads at a given point in the program
@@ -642,6 +645,7 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
     uint8_t getImageIndex() const;
     uint8_t getVectorSize() const;
+    Type getType(void) const;
   /*! Media Block Write.  */
@@ -651,6 +655,7 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
     uint8_t getImageIndex() const;
     uint8_t getVectorSize() const;
+    Type getType(void) const;
   /*! Specialize the instruction. Also performs typechecking first based on the
@@ -771,6 +776,8 @@ namespace ir {
   Instruction RNDZ(Type type, Register dst, Register src);
   /*! bswap.type dst src */
   Instruction BSWAP(Type type, Register dst, Register src);
+  /*! bfrev.type dst src */
+  Instruction BFREV(Type type, Register dst, Register src);
   /*! pow.type dst src0 src1 */
   Instruction POW(Type type, Register dst, Register src0, Register src1);
   /*! mul.type dst src0 src1 */
@@ -886,9 +893,9 @@ namespace ir {
   /*! printf */
   Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num);
   /*! media block read */
-  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum);
+  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type);
   /*! media block write */
-  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size);
+  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type);
 } /* namespace ir */
 } /* namespace gbe */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 7d755ae..81618eb 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -116,3 +116,4 @@ DECL_INSN(SUBGROUP, SubGroupInstruction)
 DECL_INSN(PRINTF, PrintfInstruction)
 DECL_INSN(MBREAD, MediaBlockReadInstruction)
 DECL_INSN(MBWRITE, MediaBlockWriteInstruction)
+DECL_INSN(BFREV, UnaryInstruction)
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 654a3bb..93bd96a 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -313,6 +313,7 @@ namespace ir {
         Instruction *insn = const_cast<Instruction*>(use->getInstruction());
         const Opcode opcode = insn->getOpcode();
         const uint32_t dstNum = insn->getDstNum();
+        (void) dstNum;
         GBE_ASSERT(dstNum == 1 || opcode == OP_LOAD);
         const Register dst = insn->getDst();
         auto it = addPtrInsns.find(derivedRegs[i]);
@@ -379,8 +380,14 @@ namespace ir {
         const uint32_t offset = valueID * size;
         const Register reg = load->getValue(valueID);
-        Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load->getAddressRegister(), offset);
+        Register addressReg = load->getAddressRegister();
+        if (fn->getPointerFamily() == FAMILY_QWORD) {
+          Register tmp = fn->newRegister(FAMILY_DWORD);
+          Instruction cvt = ir::CVT(ir::TYPE_U32, ir::TYPE_U64, tmp, load->getAddressRegister());
+          cvt.insert(ins_after, &ins_after);
+          addressReg = tmp;
+        }
+        Instruction mov = ir::INDIRECT_MOV(type, reg, arg, addressReg, offset);
         mov.insert(ins_after, &ins_after);
         replaced = true;
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index b16319a..212af0d 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -35,6 +35,7 @@ namespace ir {
         "group_id_0", "group_id_1", "group_id_2",
         "num_groups_0", "num_groups_1", "num_groups_2",
         "local_size_0", "local_size_1", "local_size_2",
+        "enqueued_local_size_0", "enqueued_local_size_1", "enqueued_local_size_2",
         "global_size_0", "global_size_1", "global_size_2",
         "global_offset_0", "global_offset_1", "global_offset_2",
         "stack_pointer", "stack_buffer",
@@ -47,7 +48,9 @@ namespace ir {
         "profiling_timestamps0", "profiling_timestamps1",
         "profiling_timestamps2", "profiling_timestamps3",
-        "threadid"
+        "threadid",
+        "constant_addrspace_start",
+        "stack_size", "enqueue_buffer_pointer",
@@ -72,13 +75,20 @@ namespace ir {
-      DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
+      if(fn.getOclVersion() >= 200) {
+        DECL_NEW_REG(FAMILY_QWORD, stackptr, 0);
+      } else {
+        DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
+      }
       DECL_NEW_REG(FAMILY_DWORD, barrierid, 1);
@@ -95,6 +105,9 @@ namespace ir {
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index eab7892..ebd5142 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -53,30 +53,36 @@ namespace ir {
     static const Register lsize0 = Register(9);    // get_local_size(0)
     static const Register lsize1 = Register(10);   // get_local_size(1)
     static const Register lsize2 = Register(11);   // get_local_size(2)
-    static const Register gsize0 = Register(12);   // get_global_size(0)
-    static const Register gsize1 = Register(13);   // get_global_size(1)
-    static const Register gsize2 = Register(14);   // get_global_size(2)
-    static const Register goffset0 = Register(15); // get_global_offset(0)
-    static const Register goffset1 = Register(16); // get_global_offset(1)
-    static const Register goffset2 = Register(17); // get_global_offset(2)
-    static const Register stackptr = Register(18); // stack pointer
-    static const Register stackbuffer = Register(19); // stack buffer base address.
-    static const Register blockip = Register(20);  // blockip
-    static const Register barrierid = Register(21);// barrierid
-    static const Register threadn = Register(22);  // number of threads
-    static const Register workdim = Register(23);  // work dimention.
-    static const Register zero = Register(24);     //  scalar register holds zero.
-    static const Register one = Register(25);     //  scalar register holds one. 
-    static const Register retVal = Register(26);   // helper register to do data flow analysis.
-    static const Register dwblockip = Register(27);  // blockip
-    static const Register profilingbptr = Register(28); // buffer addr for profiling.
-    static const Register profilingts0 = Register(29); // timestamp for profiling.
-    static const Register profilingts1 = Register(30); // timestamp for profiling.
-    static const Register profilingts2 = Register(31); // timestamp for profiling.
-    static const Register profilingts3 = Register(32); // timestamp for profiling.
-    static const Register profilingts4 = Register(33); // timestamp for profiling.
-    static const Register threadid = Register(34); // the thread id of this thread.
-    static const uint32_t regNum = 35;             // number of special registers
+    static const Register enqlsize0 = Register(12);    // get_enqueued_local_size(0)
+    static const Register enqlsize1 = Register(13);   // get_enqueued_local_size(1)
+    static const Register enqlsize2 = Register(14);   // get_enqueued_local_size(2)
+    static const Register gsize0 = Register(15);   // get_global_size(0)
+    static const Register gsize1 = Register(16);   // get_global_size(1)
+    static const Register gsize2 = Register(17);   // get_global_size(2)
+    static const Register goffset0 = Register(18); // get_global_offset(0)
+    static const Register goffset1 = Register(19); // get_global_offset(1)
+    static const Register goffset2 = Register(20); // get_global_offset(2)
+    static const Register stackptr = Register(21); // stack pointer
+    static const Register stackbuffer = Register(22); // stack buffer base address.
+    static const Register blockip = Register(23);  // blockip
+    static const Register barrierid = Register(24);// barrierid
+    static const Register threadn = Register(25);  // number of threads
+    static const Register workdim = Register(26);  // work dimension.
+    static const Register zero = Register(27);     //  scalar register holds zero.
+    static const Register one = Register(28);     //  scalar register holds one.
+    static const Register retVal = Register(29);   // helper register to do data flow analysis.
+    static const Register dwblockip = Register(30);  // blockip
+    static const Register profilingbptr = Register(31); // buffer addr for profiling.
+    static const Register profilingts0 = Register(32); // timestamp for profiling.
+    static const Register profilingts1 = Register(33); // timestamp for profiling.
+    static const Register profilingts2 = Register(34); // timestamp for profiling.
+    static const Register profilingts3 = Register(35); // timestamp for profiling.
+    static const Register profilingts4 = Register(36); // timestamp for profiling.
+    static const Register threadid = Register(37); // the thread id of this thread.
+    static const Register constant_addrspace = Register(38);  // starting address of program-scope constant
+    static const Register stacksize = Register(39); // stack buffer total size
+    static const Register enqueuebufptr = Register(40); // enqueue buffer address .
+    static const uint32_t regNum = 41;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
diff --git a/backend/src/ir/profiling.cpp b/backend/src/ir/profiling.cpp
index 09537fa..ac61e9b 100644
--- a/backend/src/ir/profiling.cpp
+++ b/backend/src/ir/profiling.cpp
@@ -58,7 +58,7 @@ namespace ir
       proLog = ((proLog << 32) & 0xffffffff00000000) + log->timestampPrologLo;
       uint64_t epiLog = log->timestampEpilogHi;
       epiLog = ((epiLog << 32) & 0xffffffff00000000) + log->timestampEpilogLo;
-      printf(" | dispatch Mask:%4x prolog:%10lu  epilog:%10lu |\n", log->dispatchMask, proLog, epiLog);
+      printf(" | dispatch Mask:%4x prolog:%10" PRIu64 "  epilog:%10" PRIu64 " |\n", log->dispatchMask, proLog, epiLog);
       printf(" | globalX:%4d~%4d  globalY:%4d~%4d  globalZ:%4d~%4d |\n", log->gidXStart, log->gidXEnd,
           log->gidYStart, log->gidYEnd, log->gidZStart, log->gidZEnd);
diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp
index 8200c31..1e78722 100644
--- a/backend/src/ir/register.cpp
+++ b/backend/src/ir/register.cpp
@@ -35,6 +35,9 @@ namespace ir {
       case FAMILY_WORD: return out << "word";
       case FAMILY_DWORD: return out << "dword";
       case FAMILY_QWORD: return out << "qword";
+      case FAMILY_OWORD: return out << "oword";
+      case FAMILY_HWORD: return out << "hword";
+      case FAMILY_REG: return out << "reg";
     return out;
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index 11ab756..09af24e 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -45,11 +45,14 @@ namespace ir {
     FAMILY_BYTE  = 1,
     FAMILY_WORD  = 2,
     FAMILY_DWORD = 3,
+    FAMILY_QWORD = 4,
+    FAMILY_OWORD = 5,
+    FAMILY_HWORD = 6,
+    FAMILY_REG   = 7
   INLINE char getFamilyName(RegisterFamily family) {
-    static char registerFamilyName[] = {'b', 'B', 'W', 'D', 'Q'};
+    static char registerFamilyName[] = {'b', 'B', 'W', 'D', 'Q', 'O', 'H', 'R'};
     return registerFamilyName[family];
@@ -59,6 +62,7 @@ namespace ir {
       case FAMILY_WORD: return 2;
       case FAMILY_DWORD: return 4;
       case FAMILY_QWORD: return 8;
+      case FAMILY_REG: return 32;
       default: NOT_SUPPORTED;
     return 0;
diff --git a/backend/src/ir/reloc.cpp b/backend/src/ir/reloc.cpp
new file mode 100644
index 0000000..4884610
--- /dev/null
+++ b/backend/src/ir/reloc.cpp
@@ -0,0 +1,87 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+ * \file reloc.cpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "reloc.hpp"
+namespace gbe {
+namespace ir {
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+  /*! Serialize the relocation table to outs in the order: magic_begin, entry count, (refOffset, defOffset) per entry, magic_end, size field. Returns ret_size. */
+  uint32_t RelocTable::serializeToBin(std::ostream& outs) {
+    uint32_t ret_size = 0; // counter handed to every OUT_UPDATE_SZ; NOTE(review): presumably SERIALIZE_OUT writes the element and adds its size here (macro defined elsewhere) - confirm
+    uint32_t sz = 0;
+    OUT_UPDATE_SZ(magic_begin); // leading marker
+    sz = getCount(); // number of entries, emitted next so the reader knows how many pairs follow
+    OUT_UPDATE_SZ(sz);
+    RelocEntry entry(0, 0); // scratch copy; RelocEntry has no default constructor
+    for (uint32_t i = 0; i < sz; ++i) {
+      entry = entries[i];
+      OUT_UPDATE_SZ(entry.refOffset);
+      OUT_UPDATE_SZ(entry.defOffset);
+    }
+    OUT_UPDATE_SZ(magic_end); // trailing marker
+    OUT_UPDATE_SZ(ret_size); // the size counter itself is the last field emitted
+    return ret_size;
+  }
+  uint32_t RelocTable::deserializeFromBin(std::istream& ins) { // returns total_size on success, 0 on a magic or size mismatch
+    uint32_t total_size = 0; // counter handed to every IN_UPDATE_SZ; NOTE(review): presumably DESERIALIZE_IN reads the element and adds its size here (macro defined elsewhere) - confirm
+    uint32_t magic;
+    uint32_t refOffset;
+    uint32_t defOffset;
+    uint32_t sz = 0;
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin) // stream does not start with the table marker
+      return 0;
+    IN_UPDATE_SZ(sz); // number of (refOffset, defOffset) pairs that follow
+    for (uint32_t i = 0; i < sz; i++) {
+      IN_UPDATE_SZ(refOffset);
+      IN_UPDATE_SZ(defOffset);
+      addEntry(refOffset, defOffset); // appended to entries; nothing in this function clears the table first
+    }
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end) // trailing marker missing; entries already added above are kept
+      return 0;
+    uint32_t total_bytes;
+    IN_UPDATE_SZ(total_bytes); // size field stored as the last element of the stream
+    if (total_bytes + sizeof(total_size) != total_size) // stored size plus the size field itself must equal what was counted while reading
+      return 0;
+    return total_size;
+  }
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/reloc.hpp b/backend/src/ir/reloc.hpp
new file mode 100644
index 0000000..de33a8a
--- /dev/null
+++ b/backend/src/ir/reloc.hpp
@@ -0,0 +1,90 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+ * \file reloc.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_RELOC_HPP__
+#define __GBE_IR_RELOC_HPP__
+#include "sys/vector.hpp"
+#include <string.h>
+namespace gbe {
+namespace ir {
+  /*! One relocation record: a pair of unsigned offsets, refOffset and defOffset.
+   *  NOTE(review): presumably where a reference sits and where its definition sits - confirm against the users of RelocTable.
+   */
+  struct RelocEntry {
+    RelocEntry(unsigned int rO, unsigned int dO): // rO initializes refOffset, dO initializes defOffset
+      refOffset(rO),
+      defOffset(dO) {}
+    unsigned int refOffset;
+    unsigned int defOffset;
+  };
+  class RelocTable : public NonCopyable, public Serializable // ordered list of RelocEntry with binary (de)serialization
+  {
+    public:
+      void addEntry(unsigned refOffset, unsigned defOffset) { // append one entry at the end of the table
+        entries.push_back(RelocEntry(refOffset, defOffset));
+      }
+      RelocTable() : Serializable() {}
+      RelocTable(const RelocTable& other) : Serializable(other), // copies the other table's entries
+                                            entries(other.entries) {}
+      uint32_t getCount() { return entries.size(); } // number of entries, narrowed to uint32_t
+      void getData(char *p) { // raw byte copy of all entries into p; does nothing if the table is empty or p is NULL
+        if (entries.size() > 0 && p)
+          memcpy(p, entries.data(), entries.size()*sizeof(RelocEntry)); // p must have room for getCount()*sizeof(RelocEntry) bytes
+      }
+    static const uint32_t magic_begin = TO_MAGIC('R', 'E', 'L', 'C'); // marker written first by serializeToBin
+    static const uint32_t magic_end = TO_MAGIC('C', 'L', 'E', 'R'); // marker written after the last entry (same letters, reversed)
+    /* serialized format, in stream order:
+       magic_begin       |
+       entry count       |
+       entry_0_refOffset |
+       entry_0_defOffset |
+       entry_1_refOffset |
+       entry_1_defOffset |
+       ........          |
+       entry_n_refOffset |
+       entry_n_defOffset |
+       magic_end         |
+       total_size
+    */
+    /*! Write the table to outs / read it back from ins, using the format above. */
+    virtual uint32_t serializeToBin(std::ostream& outs);
+    virtual uint32_t deserializeFromBin(std::istream& ins);
+    private:
+      vector<RelocEntry> entries; // the entries, in insertion order
+      GBE_CLASS(RelocTable);
+  };
+} /* namespace ir */
+} /* namespace gbe */
+#endif /* __GBE_IR_RELOC_HPP__ */
diff --git a/backend/src/ir/type.hpp b/backend/src/ir/type.hpp
index d528859..3ac758f 100644
--- a/backend/src/ir/type.hpp
+++ b/backend/src/ir/type.hpp
@@ -86,8 +86,8 @@ namespace ir {
       case FAMILY_WORD: return TYPE_U16;
       case FAMILY_DWORD: return TYPE_U32;
       case FAMILY_QWORD: return TYPE_U64;
-    };
-    return TYPE_U32;
+      default: return TYPE_U32;
+    }
 } /* namespace ir */
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
index c9cb15e..79e129d 100644
--- a/backend/src/ir/unit.cpp
+++ b/backend/src/ir/unit.cpp
@@ -30,6 +30,7 @@ namespace ir {
   Unit::Unit(PointerSize pointerSize) : pointerSize(pointerSize), valid(true) {
     profilingInfo = GBE_NEW(ProfilingInfo);
     inProfilingMode = false;
+    oclVersion = 120;
   Unit::~Unit(void) {
     for (const auto &pair : functions) GBE_DELETE(pair.second);
@@ -50,12 +51,11 @@ namespace ir {
     functions[name] = fn;
     return fn;
-  void Unit::newConstant(const char *data,
-                         const std::string &name,
+  void Unit::newConstant(const std::string &name,
                          uint32_t size,
                          uint32_t alignment)
-    constantSet.append(data, name, size, alignment);
+    constantSet.append(name, size, alignment);
   std::ostream &operator<< (std::ostream &out, const Unit &unit) {
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 10a1af6..46d7be7 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -28,7 +28,9 @@
 #include "ir/register.hpp"
 #include "ir/profiling.hpp"
 #include "ir/printf.hpp"
+#include "ir/reloc.hpp"
 #include "sys/map.hpp"
+#include <string.h>
 #include "llvm/IR/Instructions.h"
@@ -39,15 +41,13 @@ namespace ir {
   class Function;
   class ProfilingInfo;
-  /*! Complete unit of compilation. It contains a set of functions and a set of
-   *  constant the functions may refer to.
-   */
   class Unit : public NonCopyable
     typedef map<std::string, Function*> FunctionSet;
     /*! Moved from printf pass */
     map<llvm::CallInst*, PrintfSet::PrintfFmt*> printfs;
+    vector<std::string> blockFuncs;
     /*! Create an empty unit */
     Unit(PointerSize pointerSize = POINTER_32_BITS);
     /*! Release everything (*including* the function pointers) */
@@ -59,7 +59,7 @@ namespace ir {
     /*! Return NULL if the function already exists */
     Function *newFunction(const std::string &name);
     /*! Create a new constant in the constant set */
-    void newConstant(const char*, const std::string&, uint32_t size, uint32_t alignment);
+    void newConstant(const std::string&, uint32_t size, uint32_t alignment);
     /*! Apply the given functor on all the functions */
     template <typename T>
     INLINE void apply(const T &functor) const {
@@ -68,6 +68,7 @@ namespace ir {
     /*! Return the size of the pointers manipulated */
     INLINE PointerSize getPointerSize(void) const { return pointerSize; }
+    INLINE void setPointerSize(PointerSize size) { pointerSize = size; }
     /*! Return the family of registers that contain pointer */
     INLINE RegisterFamily getPointerFamily(void) const {
       if (this->getPointerSize() == POINTER_32_BITS)
@@ -77,6 +78,8 @@ namespace ir {
     /*! Return the constant set */
     ConstantSet& getConstantSet(void) { return constantSet; }
+    const RelocTable& getRelocTable(void) const  { return relocTable; }
+    RelocTable& getRelocTable(void)   { return relocTable; }
     /*! Return the constant set */
     const ConstantSet& getConstantSet(void) const { return constantSet; }
     /*! Get profiling info in this function */
@@ -87,13 +90,17 @@ namespace ir {
     bool getInProfilingMode(void) const { return inProfilingMode; }
     void setValid(bool value) { valid = value; }
     bool getValid() { return valid; }
+    void setOclVersion(uint32_t version) { oclVersion = version; }
+    uint32_t getOclVersion() const { return oclVersion; }
     friend class ContextInterface; //!< Can free modify the unit
     FunctionSet functions; //!< All the defined functions
     ConstantSet constantSet; //!< All the constants defined in the unit
+    RelocTable relocTable;
     PointerSize pointerSize; //!< Size shared by all pointers
     ProfilingInfo *profilingInfo; //!< profilingInfo store the information for profiling.
+    uint32_t oclVersion;
     bool valid;
     bool inProfilingMode;
diff --git a/backend/src/libocl/Android.mk b/backend/src/libocl/Android.mk
index 8e45c12..08044af 100644
--- a/backend/src/libocl/Android.mk
+++ b/backend/src/libocl/Android.mk
@@ -86,4 +86,3 @@ $(shell $(HOST_OUT)/bin/llvm-link -o ${generated_sources}/../beignet.bc $(addpre
 $(shell $(HOST_OUT)/bin/clang -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${generated_sources}/include/ --relocatable-pch -emit-pch -isysroot ${generated_sources} -x cl ${generated_sources}/include/ocl.h -o ${generated_sources}/../beignet.pch)
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 1d1ec68..c68ecb0 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -2,6 +2,8 @@ PROJECT(LIBOCL)
 SET (OCL_HEADER_FILES ${OCL_OBJECT_DIR}/include/ocl_defines.h)
     COMMAND mkdir -p ${OCL_OBJECT_DIR}/include/
@@ -30,11 +32,11 @@ MACRO(COPY_THE_HEADER _mod)
     ENDIF(orgin_name STREQUAL output_name)
+MACRO(COPY_THE_SOURCE _source _mod)
     # Use the python script to generate the header files.
     STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.cl" output_name ${_mod})
     STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/src/\\1.cl" orgin_name ${_mod})
+    SET(${_source} ${${_source}} ${output_name})
     IF(orgin_name STREQUAL output_name)
     ELSE(orgin_name STREQUAL output_name)
@@ -50,14 +52,27 @@ ENDMACRO(COPY_THE_SOURCE)
 SET (OCL_COPY_HEADERS ocl ocl_types ocl_float ocl_printf)
+SET (OCL_COPY_MODULES ocl_workitem ocl_async ocl_sync ocl_memcpy
+                      ocl_memset ocl_misc ocl_geometric ocl_image ocl_work_group)
-SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_memcpy
-                      ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image ocl_work_group)
+SET (OCL_COPY_MODULES_12 ocl_vload ocl_atom)
+SET (OCL_COPY_MODULES_20 ocl_vload_20 ocl_atom_20 ocl_pipe ocl_enqueue)
@@ -77,11 +92,11 @@ MACRO(GENERATE_HEADER_PY _mod)
 	COMMENT "Generate the header by python: ${output_name}"
     STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.cl" output_name ${_mod})
     STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/tmpl/\\1.tmpl.cl" tmpl_name ${_mod})
     STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/script/\\1.def" def_name ${_mod})
+    SET(${_source} ${${_source}} ${output_name})
     ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
 	COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/src/
 	COMMAND cat ${tmpl_name} > ${output_name}
@@ -91,12 +106,24 @@ MACRO(GENERATE_SOURCE_PY _mod)
-SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math ocl_simd)
+SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_simd)
     # Use the python script to generate the header files.
@@ -129,15 +156,15 @@ FOREACH(M ${OCL_BASH_GENERATED_MODULES})
+SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -triple spir -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2" -D__OPENCL_C_VERSION__=120)
+SET (CLANG_OCL_FLAGS_20 -fno-builtin -ffp-contract=off -triple spir64 -cl-kernel-arg-info -fblocks -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL2.0" -D__OPENCL_C_VERSION__=200)
-SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2")
+MACRO(ADD_CL_TO_BC_TARGET _file _output _clang_flag)
     # CMake seems can not add pattern rule, use MACRO to replace.
-    STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" output_name ${_file})
-    ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
-	#COMMAND echo ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${LIBOCL_BINARY_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
-	COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
+	#COMMAND echo ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${LIBOCL_BINARY_DIR}/include/ -emit-llvm-bc -o ${output_name} -x cl ${_file}
+	COMMAND ${CLANG_EXECUTABLE} -cc1 ${_clang_flag} -I ${OCL_OBJECT_DIR}/include/ -emit-llvm-bc -o ${_output} -x cl ${_file}
 	COMMENT "Compiling ${_file}"
@@ -145,14 +172,16 @@ ENDMACRO(ADD_CL_TO_BC_TARGET)
-    ADD_CL_TO_BC_TARGET(${f})
     STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
-    SET(OCL_BC_FILES ${OCL_BC_FILES} ${bc_name})
+    SET(OCL_BC_FILES_12 ${OCL_BC_FILES_12} ${bc_name})
+    ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS}")
+    STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+    SET(OCL_BC_FILES_12 ${OCL_BC_FILES_12} ${bc_name})
+    ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS}")
 # handle the ll files
@@ -178,42 +207,90 @@ MACRO(ADD_LL_TO_BC_TARGET M)
 	#COMMAND echo ${LLVM_INSTALL_DIR}llvm-as -o ${output_name} ${srcll_name}
 	COMMAND ${LLVM_AS_EXECUTABLE} -o ${output_name} ${srcll_name}
 	DEPENDS ${srcll_name}
-	COMMENT "Compiling ${srcll_name}"
+	COMMENT "Compiling ${output_name}"
-SET (OCL_LL_MODULES ocl_barrier ocl_clz)
+SET (OCL_LL_MODULES_12 ocl_barrier ocl_clz ocl_ctz)
     STRING(REGEX REPLACE "\(o.*\)" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
-    SET(OCL_BC_FILES ${OCL_BC_FILES} ${bc_name})
+    SET(OCL_BC_FILES_12 ${OCL_BC_FILES_12} ${bc_name})
     COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/
-    #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES}
+    #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES_12}
     COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet.bc"
     COMMAND mkdir -p ${OCL_OBJECT_DIR}
-    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
+    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
     COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.local.pch"
     COMMAND mkdir -p ${OCL_OBJECT_DIR}
-    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.pch
+    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.pch
     COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.pch"
+    STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1_20.bc" bc_name ${f})
+    SET(OCL_BC_FILES_20 ${OCL_BC_FILES_20} ${bc_name})
+    ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS_20}")
+    STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+    SET(OCL_BC_FILES_20 ${OCL_BC_FILES_20} ${bc_name})
+    ADD_CL_TO_BC_TARGET(${f} ${bc_name} "${CLANG_OCL_FLAGS_20}")
+  SET (OCL_LL_MODULES_20 ocl_barrier_20 ocl_clz_20 ocl_ctz_20 ocl_atomic_20)
+    COPY_THE_LL(${f})
+    ADD_LL_TO_BC_TARGET(${f})
+    STRING(REGEX REPLACE "\(o.*\)" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+    SET(OCL_BC_FILES_20 ${OCL_BC_FILES_20} ${bc_name})
+    COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/
+    #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES}
+    COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet_20.bc"
+    )
+    COMMAND mkdir -p ${OCL_OBJECT_DIR}
+    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS_20} -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet_20.local.pch
+    COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet_20.local.pch"
+    )
+    COMMAND mkdir -p ${OCL_OBJECT_DIR}
+    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS_20} -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet_20.pch
+    COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet_20.pch"
+    )
+endif (ENABLE_OPENCL_20)
+add_custom_target(beignet_bitcode ALL DEPENDS ${OCL_OBJECT_DIR}/beignet.bc ${OCL_OBJECT_DIR}/beignet_20.bc ${OCL_OBJECT_DIR}/beignet.pch ${OCL_OBJECT_DIR}/beignet_20.pch ${OCL_OBJECT_DIR}/beignet.local.pch ${OCL_OBJECT_DIR}/beignet_20.local.pch)
 add_custom_target(beignet_bitcode ALL DEPENDS ${OCL_OBJECT_DIR}/beignet.bc ${OCL_OBJECT_DIR}/beignet.pch ${OCL_OBJECT_DIR}/beignet.local.pch)
+endif (ENABLE_OPENCL_20)
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 5e3a788..2548cb7 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -83,21 +83,29 @@
 #include "ocl_types.h"
 #include "ocl_as.h"
 #include "ocl_async.h"
-#include "ocl_atom.h"
 #include "ocl_common.h"
 #include "ocl_convert.h"
 #include "ocl_float.h"
 #include "ocl_geometric.h"
 #include "ocl_image.h"
 #include "ocl_integer.h"
-#include "ocl_math.h"
 #include "ocl_memcpy.h"
 #include "ocl_memset.h"
 #include "ocl_misc.h"
 #include "ocl_printf.h"
 #include "ocl_relational.h"
 #include "ocl_sync.h"
+#if (__OPENCL_C_VERSION__ >= 200)
+#include "ocl_vload_20.h"
+#include "ocl_atom_20.h"
+#include "ocl_pipe.h"
+#include "ocl_math_20.h"
+#include "ocl_enqueue.h"
 #include "ocl_vload.h"
+#include "ocl_atom.h"
+#include "ocl_math.h"
 #include "ocl_workitem.h"
 #include "ocl_simd.h"
 #include "ocl_work_group.h"
@@ -114,6 +122,7 @@
 #define cl_khr_fp16
 #define cl_khr_3d_image_writes
 #define cl_intel_subgroups
+#define cl_intel_subgroups_short
 #pragma OPENCL EXTENSION cl_khr_fp64 : disable
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
diff --git a/backend/src/libocl/include/ocl_atom_20.h b/backend/src/libocl/include/ocl_atom_20.h
new file mode 100644
index 0000000..9e34c31
--- /dev/null
+++ b/backend/src/libocl/include/ocl_atom_20.h
@@ -0,0 +1,188 @@
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_ATOM20_H__
+#define __OCL_ATOM20_H__
+#include "ocl_types.h"
+// Atomic functions
+OVERLOADABLE uint atomic_add(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_add(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_add(volatile __global int *p, int val);
+OVERLOADABLE int atomic_add(volatile __local int *p, int val);
+OVERLOADABLE uint atomic_sub(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_sub(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_sub(volatile __global int *p, int val);
+OVERLOADABLE int atomic_sub(volatile __local int *p, int val);
+OVERLOADABLE uint atomic_and(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_and(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_and(volatile __global int *p, int val);
+OVERLOADABLE int atomic_and(volatile __local int *p, int val);
+OVERLOADABLE uint atomic_or(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_or(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_or(volatile __global int *p, int val);
+OVERLOADABLE int atomic_or(volatile __local int *p, int val);
+OVERLOADABLE uint atomic_xor(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_xor(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_xor(volatile __global int *p, int val);
+OVERLOADABLE int atomic_xor(volatile __local int *p, int val);
+OVERLOADABLE uint atomic_xchg(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_xchg(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_xchg(volatile __global int *p, int val);
+OVERLOADABLE int atomic_xchg(volatile __local int *p, int val);
+OVERLOADABLE int atomic_min(volatile __global int *p, int val);
+OVERLOADABLE int atomic_min(volatile __local int *p, int val);
+OVERLOADABLE int atomic_max(volatile __global int *p, int val);
+OVERLOADABLE int atomic_max(volatile __local int *p, int val);
+OVERLOADABLE uint atomic_min(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_min(volatile __local uint *p, uint val);
+OVERLOADABLE uint atomic_max(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_max(volatile __local uint *p, uint val);
+OVERLOADABLE float atomic_xchg (volatile __global float *p, float val);
+OVERLOADABLE float atomic_xchg (volatile __local float *p, float val);
+OVERLOADABLE uint atomic_inc (volatile __global uint *p);
+OVERLOADABLE uint atomic_inc (volatile __local uint *p);
+OVERLOADABLE int atomic_inc (volatile __global int *p);
+OVERLOADABLE int atomic_inc (volatile __local int *p);
+OVERLOADABLE uint atomic_dec (volatile __global uint *p);
+OVERLOADABLE uint atomic_dec (volatile __local uint *p);
+OVERLOADABLE int atomic_dec (volatile __global int *p);
+OVERLOADABLE int atomic_dec (volatile __local int *p);
+OVERLOADABLE uint atomic_cmpxchg (volatile __global uint *p, uint cmp, uint val);
+OVERLOADABLE uint atomic_cmpxchg (volatile __local uint *p, uint cmp, uint val);
+OVERLOADABLE int atomic_cmpxchg (volatile __global int *p, int cmp, int val);
+OVERLOADABLE int atomic_cmpxchg (volatile __local int *p, int cmp, int val);
+// XXX for conformance test
+// The following atom_xxx api is on OpenCL spec 1.0.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+//OpenCL 2.0 features
+CTYPE __gen_ocl_atomic_exchange##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope);  \
+CTYPE __gen_ocl_atomic_fetch_add##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_sub##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_or##POSTFIX(volatile ATYPE *p,  CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_xor##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_and##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_imin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_umin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_imax##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_umax##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope);\
+CTYPE __gen_ocl_atomic_compare_exchange_strong##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int sucess, int failure, int scope); \
+CTYPE __gen_ocl_atomic_compare_exchange_weak##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int sucess, int failure, int scope);
+ATOMIC_GEN_FUNCTIONS(atomic_int, int, 32)
+ATOMIC_GEN_FUNCTIONS(atomic_long, long, 64)
+float __gen_ocl_atomic_exchangef(volatile atomic_int *p, float val, int order, int scope);
+float __gen_ocl_atomic_fetch_addf(volatile atomic_int *p, float val, int order, int scope);
+/* only used to initialize global address space */
+//#define ATOMIC_VAR_INIT(C value)
+OVERLOADABLE void atomic_init(volatile ATYPE *object, CTYPE desired);  \
+OVERLOADABLE void atomic_store(volatile ATYPE *object, CTYPE desired);  \
+OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order);  \
+OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope);  \
+OVERLOADABLE CTYPE  atomic_load(volatile ATYPE *object);  \
+OVERLOADABLE CTYPE  atomic_load_explicit(volatile ATYPE *object, memory_order order);  \
+OVERLOADABLE CTYPE  atomic_load_explicit(volatile ATYPE *object, memory_order order, memory_scope scope);  \
+OVERLOADABLE CTYPE  atomic_exchange(volatile ATYPE *object, CTYPE desired);  \
+OVERLOADABLE CTYPE  atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order);  \
+OVERLOADABLE CTYPE  atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope);  \
+OVERLOADABLE bool atomic_compare_exchange_strong(volatile ATYPE *object, CTYPE *expected, CTYPE desired);  \
+OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure);  \
+OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope);  \
+OVERLOADABLE bool atomic_compare_exchange_weak(volatile ATYPE *object, CTYPE *expected, CTYPE desired);  \
+OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure);  \
+OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope);  \
+OVERLOADABLE CTYPE  atomic_fetch_add(volatile ATYPE *object, MTYPE1 desired);  \
+OVERLOADABLE CTYPE  atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order);  \
+OVERLOADABLE CTYPE  atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope);  \
+OVERLOADABLE CTYPE  atomic_fetch_sub(volatile ATYPE *object, MTYPE1 desired);  \
+OVERLOADABLE CTYPE  atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order);  \
+OVERLOADABLE CTYPE  atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope);  \
+OVERLOADABLE CTYPE  atomic_fetch_or(volatile ATYPE *object, MTYPE2 desired);  \
+OVERLOADABLE CTYPE  atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order);  \
+OVERLOADABLE CTYPE  atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope);  \
+OVERLOADABLE CTYPE  atomic_fetch_xor(volatile ATYPE *object, MTYPE2 desired);  \
+OVERLOADABLE CTYPE  atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order);  \
+OVERLOADABLE CTYPE  atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope);  \
+OVERLOADABLE CTYPE  atomic_fetch_and(volatile ATYPE *object, MTYPE2 desired);  \
+OVERLOADABLE CTYPE  atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order);  \
+OVERLOADABLE CTYPE  atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope);  \
+OVERLOADABLE CTYPE  atomic_fetch_min(volatile ATYPE *object, MTYPE2 desired);  \
+OVERLOADABLE CTYPE  atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order);  \
+OVERLOADABLE CTYPE  atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope);  \
+OVERLOADABLE CTYPE  atomic_fetch_max(volatile ATYPE *object, MTYPE2 desired);  \
+OVERLOADABLE CTYPE  atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order);  \
+OVERLOADABLE CTYPE  atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope);
+ATOMIC_FUNCTIONS(atomic_int, int, int, int)
+ATOMIC_FUNCTIONS(atomic_uint, uint, uint, uint)
+ATOMIC_FUNCTIONS(atomic_long, long, long, long)
+ATOMIC_FUNCTIONS(atomic_ulong, ulong, ulong, ulong)
+ATOMIC_FUNCTIONS(atomic_float, float, float, float)
+OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object);
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order);
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object);
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order);
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope);
+#endif  /* __OCL_ATOM20_H__ */
diff --git a/backend/src/libocl/include/ocl_enqueue.h b/backend/src/libocl/include/ocl_enqueue.h
new file mode 100644
index 0000000..6479df7
--- /dev/null
+++ b/backend/src/libocl/include/ocl_enqueue.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_ENQUEUE_H__
+#define __OCL_ENQUEUE_H__
+#include "ocl_types.h"
+#define CLK_SUCCESS 0
+#define CL_COMPLETE 0
+struct ndrange_info_t {
+  int type;
+  int global_work_size[3];
+  int local_work_size[3];
+  int global_work_offset[3];
+struct Block_literal {
+  void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
+  int flags;
+  int reserved;
+  __global void (*invoke)(void *, ...);
+  struct Block_descriptor_1 {
+    unsigned long int reserved;         // NULL
+    unsigned long int size;         // sizeof(struct Block_literal_1)
+    // optional helper functions
+    void (*copy_helper)(void *dst, void *src);     // IFF (1<<25)
+    void (*dispose_helper)(void *src);             // IFF (1<<25)
+    // required ABI.2010.3.16
+    const char *signature;                         // IFF (1<<30)
+  } *descriptor;
+  // imported variables
+clk_event_t create_user_event(void);
+void retain_event(clk_event_t event);
+void release_event(clk_event_t event);
+void set_user_event_status(clk_event_t event, int status);
+bool is_valid_event(clk_event_t event);
+void capture_event_profiling_info(clk_event_t event, int name, global void *value);
+uint __get_kernel_work_group_size_impl(__private void *block);
+uint __get_kernel_preferred_work_group_multiple_impl(__private void *block);
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void));
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+                                clk_event_t *event_ret, void (^block)(void));
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, __private void *block, uint size0, ...);
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+                                clk_event_t *event_ret,  __private void *block, uint size0, ...);
+queue_t get_default_queue(void);
+int __gen_enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void), int size);
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes);
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size);
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size);
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_size, size_t local_work_size);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2]);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2]);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size_t global_work_size[2], const size_t local_work_size[2]);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3]);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3]);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3]);
+int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret);
diff --git a/backend/src/libocl/include/ocl_image.h b/backend/src/libocl/include/ocl_image.h
index cdb3411..5a679aa 100644
--- a/backend/src/libocl/include/ocl_image.h
+++ b/backend/src/libocl/include/ocl_image.h
@@ -20,28 +20,77 @@
 #include "ocl_types.h"
-OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, const sampler_t sampler, float coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_t cl_image, int coord);
-OVERLOADABLE void write_imagei(write_only image1d_t cl_image, int coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image1d_t cl_image, float coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, const sampler_t sampler, float coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_t cl_image, int coord);
-OVERLOADABLE void write_imageui(write_only image1d_t cl_image, int coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image1d_t cl_image, float coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, const sampler_t sampler, float coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_t cl_image, int coord);
-OVERLOADABLE void write_imagef(write_only image1d_t cl_image, int coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image1d_t cl_image, float coord, float4 color);
-OVERLOADABLE int4 read_imagei(read_only image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imagei(write_only image1d_buffer_t cl_image, int coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imageui(write_only image1d_buffer_t cl_image, int coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image1d_buffer_t cl_image, float coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image1d_buffer_t cl_image, int coord);
-OVERLOADABLE void write_imagef(write_only image1d_buffer_t cl_image, int coord, float4 color);
+#define int1 int
+#define float1 float
+  OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, const sampler_t sampler, int##N coord); \
+  OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, const sampler_t sampler, float##N coord);
+  OVERLOADABLE DATA_YPE read_image ## SUFFIX(IMG_TYPE cl_image, int##N coord);
+  OVERLOADABLE void write_image ## SUFFIX(IMG_TYPE cl_image, int##N coord, DATA_YPE color);
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_IMAGE(IMG_TYPE, N) \
+    DECL_IMAGE_TYPE_WRITE(write_only IMG_TYPE, N) \
+#define DECL_IMAGE(IMG_TYPE, N) \
+DECL_IMAGE(image1d_t, 1)
+DECL_IMAGE(image2d_t, 2)
+DECL_IMAGE(image1d_array_t, 2)
+DECL_IMAGE(image3d_t, 3)
+DECL_IMAGE(image3d_t, 4)
+DECL_IMAGE(image2d_array_t, 3)
+DECL_IMAGE(image2d_array_t, 4)
+#undef DECL_IMAGE
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_IMAGE(IMG_TYPE, N) \
+    DECL_IMAGE_TYPE_WRITE(write_only IMG_TYPE, N) \
+#define DECL_IMAGE(IMG_TYPE, N) \
+DECL_IMAGE(image1d_buffer_t, 1)
+#undef int1
+#undef float1
+#undef DECL_IMAGE
 OVERLOADABLE int get_image_channel_data_type(read_only image1d_t image);
 OVERLOADABLE int get_image_channel_order(read_only image1d_t image);
@@ -51,37 +100,6 @@ OVERLOADABLE int get_image_channel_data_type(read_only image1d_buffer_t image);
 OVERLOADABLE int get_image_channel_order(read_only image1d_buffer_t image);
 OVERLOADABLE int get_image_width(read_only image1d_buffer_t image);
-OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_t cl_image, int2 coord);
-OVERLOADABLE void write_imagei(write_only image2d_t cl_image, int2 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image2d_t cl_image, float2 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_t cl_image, int2 coord);
-OVERLOADABLE void write_imageui(write_only image2d_t cl_image, int2 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image2d_t cl_image, float2 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image2d_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_t cl_image, int2 coord);
-OVERLOADABLE void write_imagef(write_only image2d_t cl_image, int2 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image2d_t cl_image, float2 coord, float4 color);
-OVERLOADABLE int4 read_imagei(read_only image1d_array_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_array_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE int4 read_imagei(read_only image1d_array_t cl_image, int2 coord);
-OVERLOADABLE void write_imagei(write_only image1d_array_t cl_image, int2 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image1d_array_t cl_image, float2 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image1d_array_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_array_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE uint4 read_imageui(read_only image1d_array_t cl_image, int2 coord);
-OVERLOADABLE void write_imageui(write_only image1d_array_t cl_image, int2 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image1d_array_t cl_image, float2 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image1d_array_t cl_image, const sampler_t sampler, int2 coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_array_t cl_image, const sampler_t sampler, float2 coord);
-OVERLOADABLE float4 read_imagef(read_only image1d_array_t cl_image, int2 coord);
-OVERLOADABLE void write_imagef(write_only image1d_array_t cl_image, int2 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image1d_array_t cl_image, float2 coord, float4 color);
 OVERLOADABLE int get_image_channel_data_type(read_only image2d_t image);
 OVERLOADABLE int get_image_channel_order(read_only image2d_t image);
 OVERLOADABLE int get_image_width(read_only image2d_t image);
@@ -93,69 +111,6 @@ OVERLOADABLE int get_image_channel_order(read_only image1d_array_t image);
 OVERLOADABLE int get_image_width(read_only image1d_array_t image);
 OVERLOADABLE size_t get_image_array_size(read_only image1d_array_t image);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, int4 coord);
-OVERLOADABLE void write_imagei(write_only image3d_t cl_image, int4 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image3d_t cl_image, float4 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, int4 coord);
-OVERLOADABLE void write_imageui(write_only image3d_t cl_image, int4 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image3d_t cl_image, float4 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, int4 coord);
-OVERLOADABLE void write_imagef(write_only image3d_t cl_image, int4 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image3d_t cl_image, float4 coord, float4 color);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE int4 read_imagei(read_only image3d_t cl_image, int3 coord);
-OVERLOADABLE void write_imagei(write_only image3d_t cl_image, int3 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image3d_t cl_image, float3 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE uint4 read_imageui(read_only image3d_t cl_image, int3 coord);
-OVERLOADABLE void write_imageui(write_only image3d_t cl_image, int3 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image3d_t cl_image, float3 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE float4 read_imagef(read_only image3d_t cl_image, int3 coord);
-OVERLOADABLE void write_imagef(write_only image3d_t cl_image, int3 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image3d_t cl_image, float3 coord, float4 color);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, int4 coord);
-OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, int4 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, float4 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, int4 coord);
-OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, int4 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, float4 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, int4 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, float4 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, int4 coord);
-OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, int4 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, float4 coord, float4 color);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE int4 read_imagei(read_only image2d_array_t cl_image, int3 coord);
-OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, int3 coord, int4 color);
-OVERLOADABLE void write_imagei(write_only image2d_array_t cl_image, float3 coord, int4 color);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE uint4 read_imageui(read_only image2d_array_t cl_image, int3 coord);
-OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, int3 coord, uint4 color);
-OVERLOADABLE void write_imageui(write_only image2d_array_t cl_image, float3 coord, uint4 color);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, int3 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, const sampler_t sampler, float3 coord);
-OVERLOADABLE float4 read_imagef(read_only image2d_array_t cl_image, int3 coord);
-OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, int3 coord, float4 color);
-OVERLOADABLE void write_imagef(write_only image2d_array_t cl_image, float3 coord, float4 color);
 OVERLOADABLE int get_image_channel_data_type(read_only image3d_t image);
 OVERLOADABLE int get_image_channel_order(read_only image3d_t image);
 OVERLOADABLE int get_image_width(read_only image3d_t image);
@@ -205,4 +160,39 @@ OVERLOADABLE int2 get_image_dim(write_only image2d_array_t image);
 OVERLOADABLE size_t get_image_array_size(write_only image2d_array_t image);
+#if (__OPENCL_C_VERSION__ >= 200)
+OVERLOADABLE int get_image_channel_data_type(read_write image1d_t image);
+OVERLOADABLE int get_image_channel_order(read_write image1d_t image);
+OVERLOADABLE int get_image_width(read_write image1d_t image);
+OVERLOADABLE int get_image_channel_data_type(read_write image1d_buffer_t image);
+OVERLOADABLE int get_image_channel_order(read_write image1d_buffer_t image);
+OVERLOADABLE int get_image_width(read_write image1d_buffer_t image);
+OVERLOADABLE int get_image_channel_data_type(read_write image2d_t image);
+OVERLOADABLE int get_image_channel_order(read_write image2d_t image);
+OVERLOADABLE int get_image_width(read_write image2d_t image);
+OVERLOADABLE int get_image_height(read_write image2d_t image);
+OVERLOADABLE int2 get_image_dim(read_write image2d_t image);
+OVERLOADABLE int get_image_channel_data_type(read_write image1d_array_t image);
+OVERLOADABLE int get_image_channel_order(read_write image1d_array_t image);
+OVERLOADABLE int get_image_width(read_write image1d_array_t image);
+OVERLOADABLE size_t get_image_array_size(read_write image1d_array_t image);
+OVERLOADABLE int get_image_channel_data_type(read_write image3d_t image);
+OVERLOADABLE int get_image_channel_order(read_write image3d_t image);
+OVERLOADABLE int get_image_width(read_write image3d_t image);
+OVERLOADABLE int get_image_height(read_write image3d_t image);
+OVERLOADABLE int get_image_depth(read_write image3d_t image);
+OVERLOADABLE int4 get_image_dim(read_write image3d_t image);
+OVERLOADABLE int get_image_channel_data_type(read_write image2d_array_t image);
+OVERLOADABLE int get_image_channel_order(read_write image2d_array_t image);
+OVERLOADABLE int get_image_width(read_write image2d_array_t image);
+OVERLOADABLE int get_image_height(read_write image2d_array_t image);
+OVERLOADABLE int2 get_image_dim(read_write image2d_array_t image);
+OVERLOADABLE size_t get_image_array_size(read_write image2d_array_t image);
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 7d4abab..2c0d700 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -152,4 +152,13 @@ uint8 __gen_ocl_vme(image2d_t, image2d_t,
                    uint, uint, uint, uint,
                    uint, uint, uint, uint,
                    int, int, int);
+bool __gen_ocl_in_local(size_t p);
+bool __gen_ocl_in_private(size_t p);
+#if (__OPENCL_C_VERSION__ >= 200)
+local void *__to_local(generic void *p);
+global void *__to_global(generic void *p);
+private void *__to_private(generic void *p);
diff --git a/backend/src/libocl/include/ocl_pipe.h b/backend/src/libocl/include/ocl_pipe.h
new file mode 100644
index 0000000..349b1dd
--- /dev/null
+++ b/backend/src/libocl/include/ocl_pipe.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_PIPE_H__
+#define __OCL_PIPE_H__
+#include "ocl_types.h"
+#include "ocl_work_group.h"
+#include "ocl_simd.h"
+/* The pipe read function. */
+int __read_pipe_2(pipe int p, __generic void* dst);
+int __read_pipe_4(pipe int p, reserve_id_t id, uint index, void* dst);
+reserve_id_t __reserve_read_pipe(pipe int p, uint num);
+void __commit_read_pipe(pipe int p, reserve_id_t rid);
+reserve_id_t __work_group_reserve_read_pipe(pipe int p, uint num);
+void __work_group_commit_read_pipe(pipe int p, reserve_id_t rid);
+reserve_id_t __sub_group_reserve_read_pipe(pipe int p, uint num);
+void __sub_group_commit_read_pipe(pipe int p, reserve_id_t rid);
+/* The pipe write function. */
+int __write_pipe_2(pipe int p, __generic void* src);
+int __write_pipe_4(pipe int p, reserve_id_t id, uint index, void* src);
+reserve_id_t __reserve_write_pipe(pipe int p, uint num);
+void __commit_write_pipe(pipe int p, reserve_id_t rid);
+reserve_id_t __work_group_reserve_write_pipe(pipe int p, uint num);
+void __work_group_commit_write_pipe(pipe int p, reserve_id_t rid);
+reserve_id_t __sub_group_reserve_write_pipe(pipe int p, uint num);
+void __sub_group_commit_write_pipe(pipe int p, reserve_id_t rid);
+/* The reserve_id_t function. */
+bool is_valid_reserve_id(reserve_id_t rid);
+/* The pipe query function. */
+uint __get_pipe_num_packets(pipe int p);
+uint __get_pipe_max_packets(pipe int p);
diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h
index 312928e..22ff89a 100644
--- a/backend/src/libocl/include/ocl_sync.h
+++ b/backend/src/libocl/include/ocl_sync.h
@@ -23,14 +23,11 @@
 // Synchronization functions
-#define CLK_LOCAL_MEM_FENCE  (1 << 0)
-#define CLK_GLOBAL_MEM_FENCE (1 << 1)
-typedef uint cl_mem_fence_flags;
 OVERLOADABLE void barrier(cl_mem_fence_flags flags);
 OVERLOADABLE void debugwait(void);
 OVERLOADABLE void mem_fence(cl_mem_fence_flags flags);
 OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags);
 OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags);
+#define work_group_barrier barrier
+cl_mem_fence_flags get_fence(void *ptr);
 #endif  /* __OCL_SYNC_H__ */
diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h
index eb4c3b4..327624b 100644
--- a/backend/src/libocl/include/ocl_types.h
+++ b/backend/src/libocl/include/ocl_types.h
@@ -20,6 +20,11 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
 #include "ocl_defines.h"
 #define NULL 0
@@ -32,8 +37,6 @@
 #define PURE __attribute__((pure))
 #define CONST __attribute__((const))
 #define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
-// FIXME, clang's opencl FE doesn't support static.
-#define static
 // OpenCL built-in scalar data types
@@ -44,8 +47,12 @@ typedef unsigned int uint;
 typedef unsigned long ulong;
 typedef __typeof__(sizeof(int)) size_t;
 typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
-typedef signed int intptr_t;
-typedef unsigned int uintptr_t;
+// Derive intptr_t/uintptr_t from the compiler's predefined macros: when
+// __INTPTR_WIDTH__ is N, these expand to __INT<N>_TYPE__ / __UINT<N>_TYPE__.
+// The two-level macro lets __INTPTR_WIDTH__ expand before ## pastes the tokens.
+#define __int_t_type(a,b,c) a##b##c
+#define __int_type(type,n) __int_t_type(type,n,_TYPE__)
+typedef __int_type(__INT,__INTPTR_WIDTH__) intptr_t;
+typedef __int_type(__UINT,__INTPTR_WIDTH__) uintptr_t;
+#undef __int_type
+#undef __int_t_type
 // OpenCL address space
@@ -84,6 +91,34 @@ DEF(half);
 #undef DEF
+// OpenCL atomic related types
+// memory fence flags: each is a distinct single bit; cl_mem_fence_flags is a plain uint
+#define CLK_LOCAL_MEM_FENCE  (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+#define  CLK_IMAGE_MEM_FENCE (1 << 2)
+typedef uint cl_mem_fence_flags;
+// memory order: ordering argument of the *_explicit atomic built-ins (implicit values 0..4)
+typedef enum {
+	memory_order_relaxed,
+	memory_order_acquire,
+	memory_order_release,
+	memory_order_acq_rel,
+	memory_order_seq_cst
+} memory_order;
+// memory scope: scope argument of the *_explicit atomic built-ins (implicit values 0..4)
+typedef enum {
+	memory_scope_work_item,
+	memory_scope_work_group,
+	memory_scope_device,
+	memory_scope_all_svm_devices,
+	memory_scope_sub_group,
+} memory_scope;
 // OpenCL built-in event types
 // FIXME:
diff --git a/backend/src/libocl/include/ocl_vload_20.h b/backend/src/libocl/include/ocl_vload_20.h
new file mode 100644
index 0000000..3f7fc62
--- /dev/null
+++ b/backend/src/libocl/include/ocl_vload_20.h
@@ -0,0 +1,150 @@
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_VLOAD_20_H__
+#define __OCL_VLOAD_20_H__
+#include "ocl_types.h"
+// Vector loads and stores
+// These loads and stores use untyped reads and writes, so we can simply
+// cast to vector loads / stores. This is not C99 compliant because of the
+// aliasing rules, but that is harmless here: TBAA is not enabled in the compiler.
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p);
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p);
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);
+OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p);  \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);  \
+OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p);  \
+OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p);  \
+OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p);
+OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p);
+  DECL_BYTE_RD_SPACE(TYPE, __generic) \
+  DECL_BYTE_WR_SPACE(TYPE, __generic) \
+  DECL_BYTE_RD_SPACE(TYPE, __constant)
+OVERLOADABLE float vload_half(size_t offset, const SPACE half *p);  \
+OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p);  \
+OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p); \
+OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p); \
+OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p); \
+OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p); \
+OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p);  \
+OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p);  \
+OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p);  \
+OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p);  \
+OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p); \
+OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p); \
+OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p);  \
+OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p);  \
+OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p);
+#endif  /* __OCL_VLOAD_20_H__ */
diff --git a/backend/src/libocl/include/ocl_workitem.h b/backend/src/libocl/include/ocl_workitem.h
index c3b0bdb..1a96aa8 100644
--- a/backend/src/libocl/include/ocl_workitem.h
+++ b/backend/src/libocl/include/ocl_workitem.h
@@ -21,15 +21,15 @@
 #include "ocl_types.h"
 OVERLOADABLE uint get_work_dim(void);
-OVERLOADABLE uint get_global_size(uint dimindx);
-OVERLOADABLE uint get_global_id(uint dimindx);
-OVERLOADABLE uint get_local_size(uint dimindx);
-OVERLOADABLE uint get_enqueued_local_size(uint dimindx);
-OVERLOADABLE uint get_local_id(uint dimindx);
-OVERLOADABLE uint get_num_groups(uint dimindx);
-OVERLOADABLE uint get_group_id(uint dimindx);
-OVERLOADABLE uint get_global_offset(uint dimindx);
-OVERLOADABLE uint get_global_linear_id(void);
-OVERLOADABLE uint get_local_linear_id(void);
+OVERLOADABLE size_t get_global_size(uint dimindx);
+OVERLOADABLE size_t get_global_id(uint dimindx);
+OVERLOADABLE size_t get_local_size(uint dimindx);
+OVERLOADABLE size_t get_enqueued_local_size(uint dimindx);
+OVERLOADABLE size_t get_local_id(uint dimindx);
+OVERLOADABLE size_t get_num_groups(uint dimindx);
+OVERLOADABLE size_t get_group_id(uint dimindx);
+OVERLOADABLE size_t get_global_offset(uint dimindx);
+OVERLOADABLE size_t get_global_linear_id(void);
+OVERLOADABLE size_t get_local_linear_id(void);
 #endif  /* __OCL_WORKITEM_H__ */
diff --git a/backend/src/libocl/script/gen_vector.py b/backend/src/libocl/script/gen_vector.py
index 10e8634..1bc8e59 100755
--- a/backend/src/libocl/script/gen_vector.py
+++ b/backend/src/libocl/script/gen_vector.py
@@ -114,7 +114,7 @@ def _prefix(prefix, dtype):
         return prefix + '_' + dtype
     return dtype
-memspaces = ["__local ", "__private ", "__global "]
+memspaces = ["__local ", "__private ", "__global ", "__generic "]
 def stripMemSpace(t):
     if t[0:2] == '__':
@@ -254,7 +254,8 @@ class builtinProto():
                     tmpType = line[i]
                 if tmpType == '__local' or   \
                    tmpType == '__private' or \
-                   tmpType == '__global':
+                   tmpType == '__global' or\
+                   tmpType == '__generic':
                    memSpace = tmpType + ' '
                    stripped += 1
diff --git a/backend/src/libocl/script/ocl_integer.def b/backend/src/libocl/script/ocl_integer.def
index c35c242..5e41c34 100644
--- a/backend/src/libocl/script/ocl_integer.def
+++ b/backend/src/libocl/script/ocl_integer.def
@@ -7,6 +7,7 @@ gentype rhadd (gentype x, gentype y)
 gentype clamp (gentype x, gentype minval, gentype maxval)
 gentype clamp (gentype x, sgentype minval, sgentype maxval)
 gentype clz (gentype x)
+gentype ctz (gentype x)
 gentype mad_hi (gentype a, gentype b, gentype c)
 gentype mad_sat (gentype a, gentype b, gentype c)
 gentype max (gentype x,  gentype y)
diff --git a/backend/src/libocl/script/ocl_math_20.def b/backend/src/libocl/script/ocl_math_20.def
new file mode 100644
index 0000000..7392108
--- /dev/null
+++ b/backend/src/libocl/script/ocl_math_20.def
@@ -0,0 +1,151 @@
+gentype acos (gentype)
+gentype acosh (gentype)
+gentype acospi (gentype x)
+gentype asin (gentype)
+gentype asinh (gentype)
+gentype asinpi (gentype x)
+gentype atan (gentype y_over_x)
+gentype atan2 (gentype y, gentype x)
+gentype atanh (gentype)
+gentype atanpi (gentype x)
+gentype atan2pi (gentype y, gentype x)
+gentype cbrt (gentype)
+gentype ceil (gentype)
+gentype copysign (gentype x, gentype y)
+gentype cos (gentype)
+gentype cosh (gentype)
+gentype cospi (gentype x)
+gentype erfc (gentype)
+gentype erf (gentype)
+gentype exp (gentype x)
+gentype exp2 (gentype)
+gentype exp10 (gentype)
+gentype expm1 (gentype x)
+gentype fabs (gentype)
+gentype fdim (gentype x, gentype y)
+gentype floor (gentype)
+# XXX we use madd for fma
+gentype fma (gentype a, gentype b, gentype c)
+gentype fmax (gentype x, gentype y)
+gentypef fmax (gentypef x, float y)
+gentypeh fmax (gentypeh x, half y)
+gentyped fmax (gentyped x, double y)
+gentype fmin (gentype x, gentype y)
+gentypef fmin (gentypef x, float y)
+gentypeh fmin (gentypeh x, half y)
+gentyped fmin (gentyped x, double y)
+gentype fmod (gentype x, gentype y)
+gentype fract (gentype x, __generic gentype *iptr)
+floatn frexp (floatn x, __generic intn *exp)
+float frexp (float x, __generic int *exp)
+halfn frexp (halfn x, __generic intn *exp)
+half frexp (half x, __generic int *exp)
+doublen frexp (doublen x, __generic intn *exp)
+double frexp (double x, __generic int *exp)
+gentype hypot (gentype x, gentype y)
+intn ilogb (floatn x)
+int ilogb (float x)
+shortn ilogb (halfn x)
+short ilogb (half x)
+intn ilogb (doublen x)
+int ilogb (double x)
+floatn ldexp (floatn x, intn k)
+floatn ldexp (floatn x, int k)
+float ldexp (float x, int k)
+halfn ldexp (halfn x, intn k)
+halfn ldexp (halfn x, int k)
+half ldexp (half x, int k)
+doublen ldexp (doublen x, intn k)
+doublen ldexp (doublen x, int k)
+double ldexp (double x, int k)
+gentype lgamma (gentype x)
+floatn lgamma_r (floatn x, __generic intn *signp)
+float lgamma_r (float x, __generic int *signp)
+halfn lgamma_r (halfn x, __generic intn *signp)
+half lgamma_r (half x, __generic int *signp)
+#doublen lgamma_r (doublen x, __generic intn *signp)
+#double lgamma_r (double x, __generic int *signp)
+gentype log (gentype)
+gentype log2 (gentype)
+gentype log10 (gentype)
+gentype log1p (gentype x)
+gentype logb (gentype x)
+gentype mad (gentype a, gentype b, gentype c)
+gentype maxmag (gentype x, gentype y)
+gentype minmag (gentype x, gentype y)
+gentype modf (gentype x, __generic gentype *iptr)
+floatn nan (uintn nancode)
+float nan (uint nancode)
+halfn nan (ushortn nancode)
+half nan (ushort nancode)
+doublen nan (ulongn nancode)
+double nan (ulong nancode)
+gentype nextafter (gentype x, gentype y)
+gentype pow (gentype x, gentype y)
+floatn pown (floatn x, intn y)
+float pown (float x, int y)
+halfn pown (halfn x, intn y)
+half pown (half x, int y)
+doublen pown (doublen x, intn y)
+double pown (double x, int y)
+gentype powr (gentype x, gentype y)
+gentype remainder (gentype x, gentype y)
+floatn remquo (floatn x, floatn y, __generic intn *quo)
+float remquo (float x, float y, __generic int *quo)
+halfn remquo (halfn x, halfn y, __generic intn *quo)
+half remquo (half x, half y, __generic int *quo)
+doublen remquo (doublen x, doublen y, __generic intn *quo)
+double remquo (double x, double y, __generic int *quo)
+gentype rint (gentype)
+floatn rootn (floatn x, intn y)
+halfn rootn (halfn x, intn y)
+doublen rootn (doublen x, intn y)
+# NOTE(review): the next prototype returns doublen but takes scalar double/int
+# arguments, unlike the pown entries (double pown (double x, int y)).
+# Confirm whether the return type should be double.
+doublen rootn (double x, int y)
+gentype round (gentype x)
+gentype rsqrt (gentype)
+gentype sin (gentype)
+gentype sincos (gentype x, __generic gentype *cosval)
+gentype sinh (gentype)
+gentype sinpi (gentype x)
+gentype sqrt (gentype)
+gentype tan (gentype)
+gentype tanh (gentype)
+gentype tanpi (gentype x)
+gentype tgamma (gentype)
+gentype trunc (gentype)
+# XXX we already defined all native and non-native
+# functions to the same one.
+gentype native_cos (gentype x)
+gentype native_divide (gentype x, gentype y)
+gentype native_exp (gentype x)
+gentype native_exp2 (gentype x)
+gentype native_exp10 (gentype x)
+gentype native_log (gentype x)
+gentype native_log2 (gentype x)
+gentype native_log10 (gentype x)
+gentype native_powr (gentype x, gentype y)
+gentype native_recip (gentype x)
+gentype native_rsqrt (gentype x)
+gentype native_sin (gentype x)
+gentype native_sqrt (gentype x)
+gentype native_tan (gentype x)
+gentype half_cos (gentype x)
+gentype half_divide (gentype x, gentype y)
+gentype half_exp (gentype x)
+gentype half_exp2 (gentype x)
+gentype half_exp10 (gentype x)
+gentype half_log (gentype x)
+gentype half_log2 (gentype x)
+gentype half_log10 (gentype x)
+gentype half_powr (gentype x, gentype y)
+gentype half_recip (gentype x)
+gentype half_rsqrt (gentype x)
+gentype half_sin (gentype x)
+gentype half_sqrt (gentype x)
+gentype half_tan (gentype x)
diff --git a/backend/src/libocl/src/ocl_atom_20.cl b/backend/src/libocl/src/ocl_atom_20.cl
new file mode 100644
index 0000000..ca200bc
--- /dev/null
+++ b/backend/src/libocl/src/ocl_atom_20.cl
@@ -0,0 +1,381 @@
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_atom_20.h"
+#include "ocl_as.h"
+#include "ocl_sync.h"
+OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
+// Defines atomic_<NAME>(volatile SPACE TYPE *p, TYPE val) for one address
+// space. The call is forwarded to __gen_ocl_<PREFIX><NAME> with the pointer
+// cast to SPACE uint *, and the result is cast back to TYPE.
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
+  }
+  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)        \
+  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
+DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
+DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
+// Float flavour of DECL_ATOMIC_OP_SPACE: the operand goes through as_uint()
+// before the call to __gen_ocl_<PREFIX><NAME> (pointer cast to SPACE uint *),
+// and the returned uint goes through as_float().
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return as_float(__gen_ocl_##PREFIX##NAME((SPACE uint *)p, as_uint(val))); \
+  }
+DECL_ATOMIC_OP_SPACE(xchg, float, __global, atomic_)
+DECL_ATOMIC_OP_SPACE(xchg, float, __local, atomic_)
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
+  }
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
+  }
+// XXX for conformance test
+// The following atom_xxx API comes from the OpenCL 1.0 spec,
+// but the conformance test suite still exercises it.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+// OpenCL 2.0 features.
+// Defines atomic_<NAME>(volatile ATYPE *p, CTYPE val) without explicit
+// ordering arguments. It calls __gen_ocl_atomic_<PREFIX> with the pointer cast
+// to STYPE*, always passing memory_order_seq_cst and memory_scope_device, and
+// casts the result to CTYPE.
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE)                       \
+  OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val) { \
+    return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device);              \
+  }
+  OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val) { \
+    CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);   \
+    bool ret = oldValue == *expected; \
+    *expected = oldValue; \
+    return ret;  \
+  }
+// Defines atomic_<NAME>(volatile ATYPE *p) as a read: it calls
+// __gen_ocl_atomic_<PREFIX> with operand 0 (seq_cst, device scope) and returns
+// the builtin's result cast to CTYPE. The instantiations in this file use a
+// fetch_add builtin for PREFIX.
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE)                       \
+  OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p) { \
+    return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, memory_order_seq_cst, memory_scope_device);              \
+  }
+// Defines void atomic_<NAME>(volatile ATYPE *p, CTYPE val): calls
+// __gen_ocl_atomic_<PREFIX> (seq_cst, device scope) and discards its result.
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE)                       \
+  OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val) { \
+    __gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device);              \
+  }
+  DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+  /* The atomic_ulong variant stays disabled, as before. It must be a block  \
+     comment: a line comment ending in a backslash is spliced with the next  \
+     line and would also comment out the atomic_long instantiation below.    \
+     DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) */ \
+  DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+  DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+  DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int)  \
+  /* The atomic_ulong variant stays disabled, as before. It must be a block  \
+     comment: a line comment ending in a backslash is spliced with the next  \
+     line and would also comment out the atomic_long instantiation below.    \
+     DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) */ \
+  DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+  DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+  DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int)  \
+  /* The atomic_ulong variant stays disabled, as before. It must be a block  \
+     comment: a line comment ending in a backslash is spliced with the next  \
+     line and would also comment out the atomic_long instantiation below.    \
+     DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) */ \
+  DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+  DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+  DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int)   \
+  /* The atomic_ulong variant stays disabled, as before. It must be a block  \
+     comment: a line comment ending in a backslash is spliced with the next  \
+     line and would also comment out the atomic_long instantiation below.    \
+     DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) */ \
+  DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+DECL_ATOMIC_OP(exchange,  exchange)
+DECL_ATOMIC_OP(fetch_add, fetch_add)
+DECL_ATOMIC_OP(fetch_sub, fetch_sub)
+DECL_ATOMIC_OP(fetch_and, fetch_and)
+DECL_ATOMIC_OP(fetch_or,  fetch_or)
+DECL_ATOMIC_OP(fetch_xor, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load, fetch_add)
+DECL_ATOMIC_NO_RET_OP(init, exchange)
+DECL_ATOMIC_NO_RET_OP(store, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(exchange, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(init, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(store, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_LOAD_TYPE(load, fetch_addf, atomic_float, atomic_int, float)
+// with memory_order.
+// memory_order flavour: defines atomic_<NAME>(p, val, order). The caller's
+// order is forwarded to __gen_ocl_atomic_<PREFIX>; the scope is fixed to
+// memory_scope_device.
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE)                       \
+  OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \
+    return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device);              \
+  }
+  OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure) { \
+    CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure, memory_scope_device);   \
+    bool ret = oldValue == *expected; \
+    *expected = oldValue; \
+    return ret;  \
+  }
+// memory_order flavour of the load helper: calls __gen_ocl_atomic_<PREFIX>
+// with operand 0 and the caller's order; the scope is fixed to
+// memory_scope_device.
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE)                       \
+  OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order) { \
+    return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, memory_scope_device);              \
+  }
+// memory_order flavour of the void helper: forwards val and the caller's
+// order to __gen_ocl_atomic_<PREFIX> (scope fixed to memory_scope_device) and
+// discards the result.
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE)                       \
+  OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \
+    __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device);              \
+  }
+DECL_ATOMIC_OP(exchange_explicit,  exchange)
+DECL_ATOMIC_OP(fetch_add_explicit, fetch_add)
+DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub)
+DECL_ATOMIC_OP(fetch_and_explicit, fetch_and)
+DECL_ATOMIC_OP(fetch_or_explicit,  fetch_or)
+DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add)
+DECL_ATOMIC_NO_RET_OP(store_explicit, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(exchange_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(init_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(store_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_LOAD_TYPE(load_explicit, fetch_addf, atomic_float, atomic_int, float)
+// with memory_order and memory_scope
+// memory_order + memory_scope flavour: defines atomic_<NAME>(p, val, order,
+// scope) and forwards both order and scope to __gen_ocl_atomic_<PREFIX>.
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE)                       \
+  OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \
+    return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope);              \
+  }
+  OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure, memory_scope scope) { \
+    CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure,  scope);   \
+    bool ret = oldValue == *expected; \
+    *expected = oldValue; \
+    return ret;  \
+  }
+// memory_order + memory_scope flavour of the load helper: calls
+// __gen_ocl_atomic_<PREFIX> with operand 0 and the caller's order and scope.
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE)                       \
+  OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order, memory_scope scope) { \
+    return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, scope);              \
+  }
+// memory_order + memory_scope flavour of the void helper: forwards val, order
+// and scope to __gen_ocl_atomic_<PREFIX> and discards the result.
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE)                       \
+  OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \
+    __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope);              \
+  }
+DECL_ATOMIC_OP(exchange_explicit,  exchange)
+DECL_ATOMIC_OP(fetch_add_explicit, fetch_add)
+DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub)
+DECL_ATOMIC_OP(fetch_and_explicit, fetch_and)
+DECL_ATOMIC_OP(fetch_or_explicit,  fetch_or)
+DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add)
+DECL_ATOMIC_NO_RET_OP(store_explicit, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(exchange_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(init_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_NO_RET_TYPE(store_explicit, exchangef, atomic_float, atomic_int, float)
+DECL_ATOMIC_LOAD_TYPE(load_explicit, fetch_addf, atomic_float, atomic_int, float)
+// Test-and-set on an atomic_flag, implemented as a strong 32-bit
+// compare-exchange (expected 0, new value 1; seq_cst, device scope) on the
+// flag viewed as an atomic_int.
+// Returns true when the value reported by the builtin equals 1, false otherwise.
+OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object) {
+  atomic_int * temp = (atomic_int*)object;
+  int expected = 0;
+  int new_value = 1;
+  int oldValue = __gen_ocl_atomic_compare_exchange_strong32(temp, expected, new_value, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+  if(oldValue == new_value)
+    return true;
+  else
+    return false;
+// Test-and-set with an explicit memory order, implemented as a strong 32-bit
+// compare-exchange (expected 0, new value 1) on the flag viewed as atomic_int.
+// NOTE: `order` is not forwarded; the builtin is always called with
+// memory_order_seq_cst and memory_scope_device.
+// Returns true when the value reported by the builtin equals 1, false otherwise.
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order) {
+  atomic_int * temp = (atomic_int*)object;
+  int expected = 0;
+  int new_value = 1;
+  int oldValue = __gen_ocl_atomic_compare_exchange_strong32(temp, expected, new_value, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+  if(oldValue == new_value)
+    return true;
+  else
+    return false;
+// Test-and-set with explicit memory order and scope, implemented as a strong
+// 32-bit compare-exchange (expected 0, new value 1) on the flag viewed as
+// atomic_int.
+// NOTE: neither `order` nor `scope` is forwarded; the builtin is always called
+// with memory_order_seq_cst and memory_scope_device.
+// Returns true when the value reported by the builtin equals 1, false otherwise.
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){
+  atomic_int * temp = (atomic_int*)object;
+  int expected = 0;
+  int new_value = 1;
+  int oldValue = __gen_ocl_atomic_compare_exchange_strong32(temp, expected, new_value, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+  if(oldValue == new_value)
+    return true;
+  else
+    return false;
+// Clears the flag by exchanging 0 into it (seq_cst, device scope), with the
+// flag viewed as an atomic_int; the previous value is discarded.
+OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object){
+  atomic_int * temp = (atomic_int*)object;
+  __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+// Clears the flag by exchanging 0 into it; the previous value is discarded.
+// NOTE: `order` is not forwarded; the exchange is always requested with
+// memory_order_seq_cst and memory_scope_device.
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order){
+  atomic_int * temp = (atomic_int*)object;
+  __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+// Clears the flag by exchanging 0 into it; the previous value is discarded.
+// NOTE: neither `order` nor `scope` is forwarded; the exchange is always
+// requested with memory_order_seq_cst and memory_scope_device.
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){
+  atomic_int * temp = (atomic_int*)object;
+  __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope){
diff --git a/backend/src/libocl/src/ocl_atomic_20.ll b/backend/src/libocl/src/ocl_atomic_20.ll
new file mode 100644
index 0000000..38efac0
--- /dev/null
+++ b/backend/src/libocl/src/ocl_atomic_20.ll
@@ -0,0 +1,165 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+;32bit version.
+define i32 @__gen_ocl_atomic_exchange32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_exchangef(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_add32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_addf(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_sub32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_or32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_xor32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_and32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_imin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_imax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_umin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_fetch_umax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %value seq_cst
+    ret i32 %0
+define i32 @__gen_ocl_atomic_compare_exchange_strong32(i32 addrspace(4)* nocapture %ptr,i32 %compare, i32 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+  %0 = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  ret i32 %1
+; Weak 32-bit compare-exchange on a generic (addrspace 4) pointer.
+; Returns element 0 of the cmpxchg result pair (the i32 value).
+; %success, %failure and %scope are not referenced in the body: the cmpxchg
+; is always emitted as seq_cst / seq_cst.
+define i32 @__gen_ocl_atomic_compare_exchange_weak32(i32 addrspace(4)* nocapture %ptr,i32 %compare, i32 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+  %0 = cmpxchg weak volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  ret i32 %1
+;64bit version
+define i64 @__gen_ocl_atomic_exchange64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+define i64 @__gen_ocl_atomic_fetch_add64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+define i64 @__gen_ocl_atomic_fetch_sub64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+define i64 @__gen_ocl_atomic_fetch_or64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+define i64 @__gen_ocl_atomic_fetch_xor64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+define i64 @__gen_ocl_atomic_fetch_and64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+define i64 @__gen_ocl_atomic_fetch_imin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+define i64 @__gen_ocl_atomic_fetch_imax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+define i64 @__gen_ocl_atomic_fetch_umin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+define i64 @__gen_ocl_atomic_fetch_umax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+    %0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %value seq_cst
+    ret i64 %0
+; Strong 64-bit compare-exchange on a generic (addrspace 4) pointer.
+; Returns element 0 of the cmpxchg result pair (the i64 value).
+; %success, %failure and %scope are not referenced in the body: the cmpxchg
+; is always emitted as seq_cst / seq_cst.
+define i64 @__gen_ocl_atomic_compare_exchange_strong64(i64 addrspace(4)* nocapture %ptr,i64 %compare, i64 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+  %0 = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst
+  %1 = extractvalue { i64, i1 } %0, 0
+  ret i64 %1
+; Weak 64-bit compare-exchange on a generic (addrspace 4) pointer.
+; Returns element 0 of the cmpxchg result pair (the i64 value).
+; %success, %failure and %scope are not referenced in the body: the cmpxchg
+; is always emitted as seq_cst / seq_cst.
+define i64 @__gen_ocl_atomic_compare_exchange_weak64(i64 addrspace(4)* nocapture %ptr,i64 %compare, i64 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+  %0 = cmpxchg weak volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst
+  %1 = extractvalue { i64, i1 } %0, 0
+  ret i64 %1
diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
index 9416f80..502ee67 100644
--- a/backend/src/libocl/src/ocl_barrier.ll
+++ b/backend/src/libocl/src/ocl_barrier.ll
@@ -11,34 +11,11 @@ declare i32 @_get_local_mem_fence() nounwind alwaysinline
 declare i32 @_get_global_mem_fence() nounwind alwaysinline
 declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
 declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
-declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
 declare void @__gen_ocl_debugwait() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate
 define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
-  %1 = icmp eq i32 %flags, 3
-  br i1 %1, label %barrier_local_global, label %barrier_local_check
-  call void @__gen_ocl_barrier_local_and_global()
-  br label %done
-  %2 = icmp eq i32 %flags, 1
-  br i1 %2, label %barrier_local, label %barrier_global_check
-  call void @__gen_ocl_barrier_local()
-  br label %done
-  %3 = icmp eq i32 %flags, 2
-  br i1 %3, label %barrier_global, label %done
-  call void @__gen_ocl_barrier_global()
-  br label %done
+  call void @__gen_ocl_barrier(i32 %flags)
   ret void
diff --git a/backend/src/libocl/src/ocl_barrier_20.ll b/backend/src/libocl/src/ocl_barrier_20.ll
new file mode 100644
index 0000000..8935076
--- /dev/null
+++ b/backend/src/libocl/src/ocl_barrier_20.ll
@@ -0,0 +1,25 @@
+;XXX FIXME: as LLVM IR can't use macros, we hardcoded 3, 1, 2
+;here; we may need a more graceful way to handle this type
+;of values later.
+;#define CLK_LOCAL_MEM_FENCE  (1 << 0)
+;#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+declare i32 @_get_local_mem_fence() nounwind alwaysinline
+declare i32 @_get_global_mem_fence() nounwind alwaysinline
+declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_debugwait() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier(i32) nounwind alwaysinline noduplicate
+define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
+  call void @__gen_ocl_barrier(i32 %flags)
+  ret void
+define void @_Z9debugwaitv() nounwind noduplicate alwaysinline {
+  call void @__gen_ocl_debugwait()
+  ret void
diff --git a/backend/src/libocl/src/ocl_clz_20.ll b/backend/src/libocl/src/ocl_clz_20.ll
new file mode 100644
index 0000000..19f4e35
--- /dev/null
+++ b/backend/src/libocl/src/ocl_clz_20.ll
@@ -0,0 +1,65 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+define i8 @clz_s8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+define i8 @clz_u8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+define i16 @clz_s16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+define i16 @clz_u16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+define i32 @clz_s32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+define i32 @clz_u32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+; clz_s64: count leading zeros of a 64-bit value using two 32-bit ctlz calls.
+; The i64 is viewed as <2 x i32>; the datalayout is little-endian ("e"), so
+; element 0 is the low word and element 1 is the high word.
+define i64 @clz_s64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0)
+  ; ctlz(high) < 32 means the high word is non-zero and decides the result;
+  ; otherwise the answer is 32 + ctlz(low), which gives 64 for x == 0.
+  %cmp = icmp ult i32 %call2, 32
+  %4 = add i32 %call1, 32
+  %5 = select i1 %cmp, i32 %call2, i32 %4
+  ; Build on zeroinitializer instead of undef so that the upper 32 bits of
+  ; the returned i64 are defined as zero rather than left undefined.
+  %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
+; clz_u64: count leading zeros of a 64-bit value using two 32-bit ctlz calls.
+; The i64 is viewed as <2 x i32>; the datalayout is little-endian ("e"), so
+; element 0 is the low word and element 1 is the high word.
+define i64 @clz_u64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0)
+  ; ctlz(high) < 32 means the high word is non-zero and decides the result;
+  ; otherwise the answer is 32 + ctlz(low), which gives 64 for x == 0.
+  %cmp = icmp ult i32 %call2, 32
+  %4 = add i32 %call1, 32
+  %5 = select i1 %cmp, i32 %call2, i32 %4
+  ; Build on zeroinitializer instead of undef so that the upper 32 bits of
+  ; the returned i64 are defined as zero rather than left undefined.
+  %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
diff --git a/backend/src/libocl/src/ocl_ctz.ll b/backend/src/libocl/src/ocl_ctz.ll
new file mode 100644
index 0000000..f30bd0a
--- /dev/null
+++ b/backend/src/libocl/src/ocl_ctz.ll
@@ -0,0 +1,65 @@
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+define i8 @ctz_s8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.cttz.i8(i8 %x, i1 0)
+  ret i8 %call
+define i8 @ctz_u8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.cttz.i8(i8 %x, i1 0)
+  ret i8 %call
+define i16 @ctz_s16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.cttz.i16(i16 %x, i1 0)
+  ret i16 %call
+define i16 @ctz_u16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.cttz.i16(i16 %x, i1 0)
+  ret i16 %call
+define i32 @ctz_s32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+  ret i32 %call
+define i32 @ctz_u32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+  ret i32 %call
+; ctz_s64: count trailing zeros of a 64-bit value using two 32-bit cttz calls.
+; The i64 is viewed as <2 x i32>; the datalayout is little-endian ("e"), so
+; element 0 is the low word and element 1 is the high word.
+define i64 @ctz_s64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+  ; cttz(low) < 32 means the low word is non-zero and decides the result;
+  ; otherwise the answer is 32 + cttz(high), which gives 64 for x == 0.
+  %cmp = icmp ult i32 %call1, 32
+  %4 = add i32 %call2, 32
+  %5 = select i1 %cmp, i32 %call1, i32 %4
+  ; Build on zeroinitializer instead of undef so that the upper 32 bits of
+  ; the returned i64 are defined as zero rather than left undefined.
+  %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
+; ctz_u64: count trailing zeros of a 64-bit value using two 32-bit cttz calls.
+; The i64 is viewed as <2 x i32>; the datalayout is little-endian ("e"), so
+; element 0 is the low word and element 1 is the high word.
+define i64 @ctz_u64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+  ; cttz(low) < 32 means the low word is non-zero and decides the result;
+  ; otherwise the answer is 32 + cttz(high), which gives 64 for x == 0.
+  %cmp = icmp ult i32 %call1, 32
+  %4 = add i32 %call2, 32
+  %5 = select i1 %cmp, i32 %call1, i32 %4
+  ; Build on zeroinitializer instead of undef so that the upper 32 bits of
+  ; the returned i64 are defined as zero rather than left undefined.
+  %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
diff --git a/backend/src/libocl/src/ocl_ctz_20.ll b/backend/src/libocl/src/ocl_ctz_20.ll
new file mode 100644
index 0000000..0a79b26
--- /dev/null
+++ b/backend/src/libocl/src/ocl_ctz_20.ll
@@ -0,0 +1,65 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+define i8 @ctz_s8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.cttz.i8(i8 %x, i1 0)
+  ret i8 %call
+define i8 @ctz_u8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.cttz.i8(i8 %x, i1 0)
+  ret i8 %call
+define i16 @ctz_s16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.cttz.i16(i16 %x, i1 0)
+  ret i16 %call
+define i16 @ctz_u16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.cttz.i16(i16 %x, i1 0)
+  ret i16 %call
+define i32 @ctz_s32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+  ret i32 %call
+define i32 @ctz_u32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.cttz.i32(i32 %x, i1 0)
+  ret i32 %call
+; ctz_s64: count trailing zeros of a 64-bit value using two 32-bit cttz calls.
+; The i64 is viewed as <2 x i32>; the datalayout is little-endian ("e"), so
+; element 0 is the low word and element 1 is the high word.
+define i64 @ctz_s64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+  ; cttz(low) < 32 means the low word is non-zero and decides the result;
+  ; otherwise the answer is 32 + cttz(high), which gives 64 for x == 0.
+  %cmp = icmp ult i32 %call1, 32
+  %4 = add i32 %call2, 32
+  %5 = select i1 %cmp, i32 %call1, i32 %4
+  ; Build on zeroinitializer instead of undef so that the upper 32 bits of
+  ; the returned i64 are defined as zero rather than left undefined.
+  %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
+; ctz_u64: count trailing zeros of a 64-bit value using two 32-bit cttz calls.
+; The i64 is viewed as <2 x i32>; the datalayout is little-endian ("e"), so
+; element 0 is the low word and element 1 is the high word.
+define i64 @ctz_u64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+  ; cttz(low) < 32 means the low word is non-zero and decides the result;
+  ; otherwise the answer is 32 + cttz(high), which gives 64 for x == 0.
+  %cmp = icmp ult i32 %call1, 32
+  %4 = add i32 %call2, 32
+  %5 = select i1 %cmp, i32 %call1, i32 %4
+  ; Build on zeroinitializer instead of undef so that the upper 32 bits of
+  ; the returned i64 are defined as zero rather than left undefined.
+  %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
diff --git a/backend/src/libocl/src/ocl_enqueue.cl b/backend/src/libocl/src/ocl_enqueue.cl
new file mode 100644
index 0000000..dc8fa3b
--- /dev/null
+++ b/backend/src/libocl/src/ocl_enqueue.cl
@@ -0,0 +1,238 @@
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_types.h"
+#include "ocl_enqueue.h"
+#include "ocl_workitem.h"
+#include "ocl_atom.h"
+queue_t get_default_queue(void)
+  queue_t queue;
+  return queue; //return NULL queue
+ndrange_t __gen_ocl_set_ndrange_info(__private struct ndrange_info_t *info);
+__private struct ndrange_info_t* __gen_ocl_get_ndrange_info(ndrange_t info);
+__global int* __gen_ocl_get_enqueue_info_addr(void);
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void))
+  int i;
+  __private struct Block_literal *literal = (__private struct Block_literal *)block;
+  __private uchar *data = (__private uchar *)block;
+  int size = literal->descriptor->size;
+  literal->descriptor->reserved = 0;
+  __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
+  int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t));
+  __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int);
+  __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+  *((__global struct ndrange_info_t *)addr) = *info;
+  addr += sizeof(*info);
+  for(i=0; i< size; i++) {
+    addr[i] = data[i];
+  }
+  return 0;
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+                                clk_event_t *event_ret, void (^block)(void))
+  return enqueue_kernel(q, flag, ndrange, block);
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes)
+  int i;
+  __private struct Block_literal* literal = (__private struct Block_literal *)block;
+  __private uchar* data = (__private uchar *)block;
+  int size = literal->descriptor->size;
+  int slm_size = count * sizeof(int);
+  literal->descriptor->reserved = slm_size;
+  __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
+  int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t) + slm_size);
+  __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int);
+  __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+  *((__global struct ndrange_info_t *)addr) = *info;
+  addr += sizeof(*info);
+  for(i=0; i < size; i++) {
+    addr[i] = data[i];
+  }
+  addr += size;
+  for(i=0; i < count; i++) {
+    ((__global int *)addr)[i] = slm_sizes[i];
+  }
+  return 0;
+clk_event_t create_user_event(void)
+  clk_event_t e;
+  return e;
+void retain_event(clk_event_t event)
+  return;
+void release_event(clk_event_t event)
+  return;
+void set_user_event_status(clk_event_t event, int status)
+  return;
+bool is_valid_event(clk_event_t event)
+  return 1;
+uint __get_kernel_work_group_size_impl(__private void *block)
+  return 256;
+uint __get_kernel_preferred_work_group_multiple_impl(__private  void *block)
+  return 16;
+void capture_event_profiling_info(clk_event_t event, int name, global void *value)
+  //fake profiling data
+  ((__global ulong *)value)[0] = 0x3000;
+  ((__global ulong *)value)[1] = 0x6000;
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size)
+  struct ndrange_info_t info;
+  info.type = 0x1;
+  info.global_work_size[0] = global_work_size;
+  return __gen_ocl_set_ndrange_info(&info);
+  //return ndrange;
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size)
+  struct ndrange_info_t info;
+  info.type = 0x2;
+  info.global_work_size[0] = global_work_size;
+  info.local_work_size[0] = local_work_size;
+  return __gen_ocl_set_ndrange_info(&info);
+ // return ndrange;
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_size, size_t local_work_size)
+  struct ndrange_info_t info;
+  info.type = 0x3;
+  info.global_work_size[0] = global_work_size;
+  info.local_work_size[0] = local_work_size;
+  info.global_work_offset[0] = global_work_offset;
+  return __gen_ocl_set_ndrange_info(&info);
+  //return ndrange;
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2])
+  struct ndrange_info_t info;
+  info.type = 0x11;
+  info.global_work_size[0] = global_work_size[0];
+  info.global_work_size[1] = global_work_size[1];
+  return __gen_ocl_set_ndrange_info(&info);
+  //return ndrange;
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2])
+  struct ndrange_info_t info;
+  info.type = 0x12;
+  info.global_work_size[0] = global_work_size[0];
+  info.global_work_size[1] = global_work_size[1];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  return __gen_ocl_set_ndrange_info(&info);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size_t global_work_size[2], const size_t local_work_size[2])
+  struct ndrange_info_t info;
+  info.type = 0x13;
+  info.global_work_size[0] = global_work_size[0];
+  info.global_work_size[1] = global_work_size[1];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  info.global_work_offset[0] = global_work_offset[0];
+  info.global_work_offset[1] = global_work_offset[1];
+  return __gen_ocl_set_ndrange_info(&info);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3])
+  struct ndrange_info_t info;
+  info.type = 0x21;
+  info.global_work_size[0] = global_work_size[0];
+  info.global_work_size[1] = global_work_size[1];
+  info.global_work_size[2] = global_work_size[2];
+  return __gen_ocl_set_ndrange_info(&info);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3])
+  struct ndrange_info_t info;
+  info.type = 0x22;
+  info.global_work_size[0] = global_work_size[0];
+  info.global_work_size[1] = global_work_size[1];
+  info.global_work_size[2] = global_work_size[2];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  info.local_work_size[2] = local_work_size[2];
+  return __gen_ocl_set_ndrange_info(&info);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3])
+  struct ndrange_info_t info;
+  info.type = 0x23;
+  info.global_work_size[0] = global_work_size[0];
+  info.global_work_size[1] = global_work_size[1];
+  info.global_work_size[2] = global_work_size[2];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  info.local_work_size[2] = local_work_size[2];
+  info.global_work_offset[0] = global_work_offset[0];
+  info.global_work_offset[1] = global_work_offset[1];
+  info.global_work_offset[2] = global_work_offset[2];
+  return __gen_ocl_set_ndrange_info(&info);
+int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret)
+  return 0;
diff --git a/backend/src/libocl/src/ocl_geometric.cl b/backend/src/libocl/src/ocl_geometric.cl
index cf98503..af39ed3 100644
--- a/backend/src/libocl/src/ocl_geometric.cl
+++ b/backend/src/libocl/src/ocl_geometric.cl
@@ -18,7 +18,11 @@
 #include "ocl_geometric.h"
 #include "ocl_common.h"
 #include "ocl_relational.h"
+#if (__OPENCL_C_VERSION__ >= 200)
+#include "ocl_math_20.h"
 #include "ocl_math.h"
 #include "ocl_float.h"
 CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32");
diff --git a/backend/src/libocl/src/ocl_image.cl b/backend/src/libocl/src/ocl_image.cl
index a1125a8..2febfda 100644
--- a/backend/src/libocl/src/ocl_image.cl
+++ b/backend/src/libocl/src/ocl_image.cl
@@ -16,7 +16,11 @@
 #include "ocl_image.h"
+#if (__OPENCL_C_VERSION__ >= 200)
+#include "ocl_math_20.h"
 #include "ocl_math.h"
 #include "ocl_integer.h"
 #include "ocl_common.h"
 #include "ocl_convert.h"
@@ -77,6 +81,42 @@ DECL_GEN_OCL_QUERY_IMAGE(write_only image2d_t)
 DECL_GEN_OCL_QUERY_IMAGE(write_only image2d_array_t)
 DECL_GEN_OCL_QUERY_IMAGE(write_only image3d_t)
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_GEN_OCL_RW_IMAGE_WR(image_type, n) \
+  OVERLOADABLE int4 __gen_ocl_read_imagei(read_write image_type image, sampler_t sampler,            \
+                                          float ##n coord, uint sampler_offset);          \
+  OVERLOADABLE int4 __gen_ocl_read_imagei(read_write image_type image, sampler_t sampler,            \
+                                          int ##n coord, uint sampler_offset);            \
+  OVERLOADABLE uint4 __gen_ocl_read_imageui(read_write image_type image, sampler_t sampler,          \
+                                            float ##n coord, uint sampler_offset);        \
+  OVERLOADABLE uint4 __gen_ocl_read_imageui(read_write image_type image, sampler_t sampler,          \
+                                            int ##n coord, uint sampler_offset);          \
+  OVERLOADABLE float4 __gen_ocl_read_imagef(read_write image_type image, sampler_t sampler,          \
+                                            float ##n coord, uint sampler_offset);        \
+  OVERLOADABLE float4 __gen_ocl_read_imagef(read_write image_type image, sampler_t sampler,          \
+                                            int ##n coord, uint sampler_offset);          \
+  OVERLOADABLE void __gen_ocl_write_imagei(read_write image_type image, int ##n coord , int4 color); \
+  OVERLOADABLE void __gen_ocl_write_imageui(read_write image_type image, int ##n coord, uint4 color);\
+  OVERLOADABLE void __gen_ocl_write_imagef(read_write image_type image, int ##n coord, float4 color);
+DECL_GEN_OCL_RW_IMAGE_WR(image1d_t, 1)
+DECL_GEN_OCL_RW_IMAGE_WR(image1d_buffer_t, 2)
+DECL_GEN_OCL_RW_IMAGE_WR(image1d_array_t, 2)
+DECL_GEN_OCL_RW_IMAGE_WR(image1d_array_t, 4)
+DECL_GEN_OCL_RW_IMAGE_WR(image2d_t, 2)
+DECL_GEN_OCL_RW_IMAGE_WR(image2d_array_t, 3)
+DECL_GEN_OCL_RW_IMAGE_WR(image3d_t, 3)
+DECL_GEN_OCL_RW_IMAGE_WR(image2d_array_t, 4)
+DECL_GEN_OCL_RW_IMAGE_WR(image3d_t, 4)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image1d_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image1d_buffer_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image1d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image2d_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image2d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(read_write image3d_t)
 // helper functions to validate array index.
@@ -160,6 +200,51 @@ INLINE_OVERLOADABLE int3 __gen_validate_array_index(int3 coord, write_only image
   return coord;
+#if (__OPENCL_C_VERSION__ >= 200)
+INLINE_OVERLOADABLE float2 __gen_validate_array_index(float2 coord, read_write image1d_array_t image)
+  float array_size = __gen_ocl_get_image_depth(image);
+  coord.s1 = clamp(rint(coord.s1), 0.f, array_size - 1.f);
+  return coord;
+INLINE_OVERLOADABLE float4 __gen_validate_array_index(float4 coord, read_write image2d_array_t image)
+  float array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
+  return coord;
+INLINE_OVERLOADABLE float3 __gen_validate_array_index(float3 coord, read_write image2d_array_t image)
+  float array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
+  return coord;
+INLINE_OVERLOADABLE int2 __gen_validate_array_index(int2 coord, read_write image1d_array_t image)
+  int array_size = __gen_ocl_get_image_depth(image);
+  coord.s1 = clamp(coord.s1, 0, array_size - 1);
+  return coord;
+INLINE_OVERLOADABLE int4 __gen_validate_array_index(int4 coord, read_write image2d_array_t image)
+  int array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(coord.s2, 0, array_size - 1);
+  return coord;
+INLINE_OVERLOADABLE int3 __gen_validate_array_index(int3 coord, read_write image2d_array_t image)
+  int array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(coord.s2, 0, array_size - 1);
+  return coord;
 // For non array image type, we need to do nothing.
 #define GEN_VALIDATE_ARRAY_INDEX(coord_type, image_type) \
 INLINE_OVERLOADABLE coord_type __gen_validate_array_index(coord_type coord, image_type image) \
@@ -190,6 +275,19 @@ GEN_VALIDATE_ARRAY_INDEX(int3, write_only image3d_t)
 GEN_VALIDATE_ARRAY_INDEX(float, write_only image1d_buffer_t)
 GEN_VALIDATE_ARRAY_INDEX(int, write_only image1d_buffer_t)
+#if (__OPENCL_C_VERSION__ >= 200)
+GEN_VALIDATE_ARRAY_INDEX(float, read_write image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(int, read_write image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(float2, read_write image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(int2, read_write image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(float4, read_write image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int4, read_write image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float3, read_write image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int3, read_write image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float, read_write image1d_buffer_t)
+GEN_VALIDATE_ARRAY_INDEX(int, read_write image1d_buffer_t)
 // Helper functions to work around some coordiate boundary issues.
 // The major issue on Gen7/Gen7.5 are the sample message could not sampling
@@ -390,9 +488,9 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
                                           convert_float ##n (tmpCoord), 0);   \
-#define DECL_READ_IMAGE_NOSAMPLER(image_type, image_data_type,                \
+#define DECL_READ_IMAGE_NOSAMPLER(access_qual, image_type, image_data_type,   \
                                   suffix, coord_type, n)                      \
-  OVERLOADABLE image_data_type read_image ##suffix(read_only image_type cl_image,       \
+  OVERLOADABLE image_data_type read_image ##suffix(access_qual image_type cl_image, \
                                                coord_type coord)              \
   {                                                                           \
     coord = __gen_validate_array_index(coord, cl_image);                      \
@@ -402,8 +500,8 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
              cl_image, defaultSampler, convert_float ##n (coord), 0);         \
-#define DECL_WRITE_IMAGE(image_type, image_data_type, suffix, coord_type)     \
-  OVERLOADABLE void write_image ##suffix(write_only image_type cl_image,                 \
+#define DECL_WRITE_IMAGE(access_qual, image_type, image_data_type, suffix, coord_type)   \
+  OVERLOADABLE void write_image ##suffix(access_qual image_type cl_image,    \
                                          coord_type coord,                    \
                                          image_data_type color)               \
   {                                                                           \
@@ -411,13 +509,25 @@ INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
     __gen_ocl_write_image ##suffix(cl_image, fixedCoord, color);              \
+#if (__OPENCL_C_VERSION__ >= 200)
 #define DECL_IMAGE(int_clamping_fix, image_type, image_data_type, suffix, n)  \
   DECL_READ_IMAGE0(int_clamping_fix, image_type,                              \
                    image_data_type, suffix, int ##n, n)                       \
   DECL_READ_IMAGE1(int_clamping_fix, image_type,                              \
                    image_data_type, suffix, float ##n, n)                     \
-  DECL_READ_IMAGE_NOSAMPLER(image_type, image_data_type, suffix, int ##n, n)  \
-  DECL_WRITE_IMAGE(image_type, image_data_type, suffix, int ##n)              \
+  DECL_READ_IMAGE_NOSAMPLER(read_only, image_type, image_data_type, suffix, int ##n, n)  \
+  DECL_READ_IMAGE_NOSAMPLER(read_write, image_type, image_data_type, suffix, int ##n, n) \
+  DECL_WRITE_IMAGE(write_only, image_type, image_data_type, suffix, int ##n) \
+  DECL_WRITE_IMAGE(read_write, image_type, image_data_type, suffix, int ##n)
+#define DECL_IMAGE(int_clamping_fix, image_type, image_data_type, suffix, n)  \
+  DECL_READ_IMAGE0(int_clamping_fix, image_type,                              \
+                   image_data_type, suffix, int ##n, n)                       \
+  DECL_READ_IMAGE1(int_clamping_fix, image_type,                              \
+                   image_data_type, suffix, float ##n, n)                     \
+  DECL_READ_IMAGE_NOSAMPLER(read_only, image_type, image_data_type, suffix, int ##n, n)  \
+  DECL_WRITE_IMAGE(write_only, image_type, image_data_type, suffix, int ##n)
 // 1D
 #define DECL_IMAGE_TYPE(image_type, n)                                        \
@@ -432,9 +542,9 @@ DECL_IMAGE_TYPE(image3d_t, 3)
 DECL_IMAGE_TYPE(image2d_array_t, 4)
 DECL_IMAGE_TYPE(image2d_array_t, 3)
-#define DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(image_type, image_data_type,       \
+#define DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(access_qual, image_type, image_data_type, \
                                   suffix, coord_type)                         \
-  OVERLOADABLE image_data_type read_image ##suffix(read_only image_type cl_image,       \
+  OVERLOADABLE image_data_type read_image ##suffix(access_qual image_type cl_image,       \
                                                coord_type coord)              \
   {                                                                           \
     sampler_t defaultSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE \
@@ -446,8 +556,8 @@ DECL_IMAGE_TYPE(image2d_array_t, 3)
              cl_image, defaultSampler, convert_float2(effectCoord), 0);       \
-#define DECL_WRITE_IMAGE1D_BUFFER(image_type, image_data_type, suffix, coord_type)     \
-  OVERLOADABLE void write_image ##suffix(write_only image_type cl_image,                 \
+#define DECL_WRITE_IMAGE1D_BUFFER(access_qual, image_type, image_data_type, suffix, coord_type) \
+  OVERLOADABLE void write_image ##suffix(access_qual image_type cl_image,                 \
                                          coord_type coord,                    \
                                          image_data_type color)               \
   {                                                                           \
@@ -457,11 +567,20 @@ DECL_IMAGE_TYPE(image2d_array_t, 3)
     __gen_ocl_write_image ##suffix(cl_image, effectCoord, color);              \
+#if (__OPENCL_C_VERSION__ >= 200)
+#define DECL_IMAGE_1DBuffer(int_clamping_fix, image_data_type, suffix)        \
+  DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(read_only, image1d_buffer_t, image_data_type,  \
+                                     suffix, int)                             \
+  DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(read_write, image1d_buffer_t, image_data_type, \
+                                     suffix, int)                             \
+  DECL_WRITE_IMAGE1D_BUFFER(write_only, image1d_buffer_t, image_data_type, suffix, int) \
+  DECL_WRITE_IMAGE1D_BUFFER(read_write, image1d_buffer_t, image_data_type, suffix, int)
 #define DECL_IMAGE_1DBuffer(int_clamping_fix, image_data_type, suffix)        \
-  DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(image1d_buffer_t, image_data_type,       \
+  DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(read_only, image1d_buffer_t, image_data_type,       \
                                      suffix, int)                             \
-  DECL_WRITE_IMAGE1D_BUFFER(image1d_buffer_t, image_data_type, suffix, int)
+  DECL_WRITE_IMAGE1D_BUFFER(write_only, image1d_buffer_t, image_data_type, suffix, int)
@@ -535,12 +654,23 @@ INLINE_OVERLOADABLE int4 __gen_fixup_1darray_coord(int2 coord, image1d_array_t i
                                           convert_float2 (tmpCoord), 0);      \
+#if (__OPENCL_C_VERSION__ >= 200)
 #define DECL_IMAGE_1DArray(int_clamping_fix, image_data_type, suffix)         \
   DECL_READ_IMAGE0_1DArray(int_clamping_fix, image_data_type, suffix, int2)   \
   DECL_READ_IMAGE1_1DArray(int_clamping_fix, image_data_type,                 \
                            suffix, float2)                                    \
-  DECL_READ_IMAGE_NOSAMPLER(image1d_array_t, image_data_type, suffix, int2, 2)\
-  DECL_WRITE_IMAGE(image1d_array_t, image_data_type, suffix, int2)            \
+  DECL_READ_IMAGE_NOSAMPLER(read_only, image1d_array_t, image_data_type, suffix, int2, 2) \
+  DECL_READ_IMAGE_NOSAMPLER(read_write, image1d_array_t, image_data_type, suffix, int2, 2)\
+  DECL_WRITE_IMAGE(write_only, image1d_array_t, image_data_type, suffix, int2) \
+  DECL_WRITE_IMAGE(read_write, image1d_array_t, image_data_type, suffix, int2)
+#define DECL_IMAGE_1DArray(int_clamping_fix, image_data_type, suffix)         \
+  DECL_READ_IMAGE0_1DArray(int_clamping_fix, image_data_type, suffix, int2)   \
+  DECL_READ_IMAGE1_1DArray(int_clamping_fix, image_data_type,                 \
+                           suffix, float2)                                    \
+  DECL_READ_IMAGE_NOSAMPLER(read_only, image1d_array_t, image_data_type, suffix, int2, 2) \
+  DECL_WRITE_IMAGE(write_only, image1d_array_t, image_data_type, suffix, int2)
@@ -579,6 +709,15 @@ DECL_IMAGE_INFO_COMMON(write_only image3d_t)
 DECL_IMAGE_INFO_COMMON(write_only image2d_array_t)
+#if (__OPENCL_C_VERSION__ >= 200)
+DECL_IMAGE_INFO_COMMON(read_write image1d_t)
+DECL_IMAGE_INFO_COMMON(read_write image1d_buffer_t)
+DECL_IMAGE_INFO_COMMON(read_write image1d_array_t)
+DECL_IMAGE_INFO_COMMON(read_write image2d_t)
+DECL_IMAGE_INFO_COMMON(read_write image3d_t)
+DECL_IMAGE_INFO_COMMON(read_write image2d_array_t)
 // 2D extra Info
 OVERLOADABLE int get_image_height(read_only image2d_t image)
@@ -598,6 +737,17 @@ OVERLOADABLE int2 get_image_dim(write_only image2d_t image)
   return (int2){get_image_width(image), get_image_height(image)};
+#if (__OPENCL_C_VERSION__ >= 200)
+OVERLOADABLE int get_image_height(read_write image2d_t image)
+  return __gen_ocl_get_image_height(image);
+OVERLOADABLE int2 get_image_dim(read_write image2d_t image)
+  return (int2){get_image_width(image), get_image_height(image)};
 // End of 2D
 // 3D extra Info
@@ -633,6 +783,24 @@ OVERLOADABLE int4 get_image_dim(write_only image3d_t image)
+#if (__OPENCL_C_VERSION__ >= 200)
+OVERLOADABLE int get_image_height(read_write image3d_t image)
+  return __gen_ocl_get_image_height(image);
+OVERLOADABLE int get_image_depth(read_write image3d_t image)
+  return __gen_ocl_get_image_depth(image);
+OVERLOADABLE int4 get_image_dim(read_write image3d_t image)
+  return (int4) (get_image_width(image),
+                 get_image_height(image),
+                 get_image_depth(image),
+                 0);
 // 2D Array extra Info
 OVERLOADABLE int get_image_height(read_only image2d_array_t image)
@@ -660,6 +828,20 @@ OVERLOADABLE size_t get_image_array_size(write_only image2d_array_t image)
   return __gen_ocl_get_image_depth(image);
+#if (__OPENCL_C_VERSION__ >= 200)
+// read_write image2d_array_t: height comes from __gen_ocl_get_image_height().
+OVERLOADABLE int get_image_height(read_write image2d_array_t image)
+  return __gen_ocl_get_image_height(image);
+// read_write image2d_array_t: dimensions are returned as (width, height).
+OVERLOADABLE int2 get_image_dim(read_write image2d_array_t image)
+  return (int2){get_image_width(image), get_image_height(image)};
+// read_write image2d_array_t: the array size is read through
+// __gen_ocl_get_image_depth().
+OVERLOADABLE size_t get_image_array_size(read_write image2d_array_t image)
+  return __gen_ocl_get_image_depth(image);
 // 1D Array info
 OVERLOADABLE size_t get_image_array_size(read_only image1d_array_t image)
@@ -671,4 +853,10 @@ OVERLOADABLE size_t get_image_array_size(write_only image1d_array_t image)
   return __gen_ocl_get_image_depth(image);
+#if (__OPENCL_C_VERSION__ >= 200)
+// read_write image1d_array_t: the array size is read through
+// __gen_ocl_get_image_depth().
+OVERLOADABLE size_t get_image_array_size(read_write image1d_array_t image)
+  return __gen_ocl_get_image_depth(image);
 // End of 1DArray
diff --git a/backend/src/libocl/src/ocl_memcpy.cl b/backend/src/libocl/src/ocl_memcpy.cl
index 85f490f..131574d 100644
--- a/backend/src/libocl/src/ocl_memcpy.cl
+++ b/backend/src/libocl/src/ocl_memcpy.cl
@@ -37,13 +37,28 @@ void __gen_memcpy_ ##NAME (DST_SPACE uchar* dst, SRC_SPACE uchar* src, size_t si
   } \
+#if (__OPENCL_C_VERSION__ >= 200)
diff --git a/backend/src/libocl/src/ocl_memset.cl b/backend/src/libocl/src/ocl_memset.cl
index d8bc5df..dda7e55 100644
--- a/backend/src/libocl/src/ocl_memset.cl
+++ b/backend/src/libocl/src/ocl_memset.cl
@@ -41,4 +41,7 @@ void __gen_memset_ ##NAME (DST_SPACE uchar* dst, uchar val, size_t size) { \
 DECL_MEMSET_FN(g, __global)
 DECL_MEMSET_FN(l, __local)
 DECL_MEMSET_FN(p, __private)
+#if (__OPENCL_C_VERSION__ >= 200)
+DECL_MEMSET_FN(n, __generic)
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index 94bf178..3b2eb92 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -229,3 +229,27 @@ struct time_stamp __gen_ocl_get_timestamp(void) {
   return val;
+/* Returns true when the address value p lies strictly between 0 and
+ * 64*1024; any other value yields false. */
+bool __gen_ocl_in_local(size_t p) {
+  return (p > 0) && (p < 64*1024);
+}
+#if (__OPENCL_C_VERSION__ >= 200)
+/* Converts a generic pointer to a local pointer. The result is NULL when
+ * __gen_ocl_in_local() rejects the address. */
+local void *__to_local(generic void *p) {
+  if (__gen_ocl_in_local((size_t)p))
+    return (local void*)p;
+  return NULL;
+}
+/* Converts a generic pointer to a private pointer. The result is NULL when
+ * __gen_ocl_in_private() rejects the address. */
+private void *__to_private(generic void *p) {
+  if (__gen_ocl_in_private((size_t)p))
+    return (private void*)p;
+  return NULL;
+}
+/* Converts a generic pointer to a global pointer. An address is taken to be
+ * global when neither __gen_ocl_in_local() nor __gen_ocl_in_private()
+ * accepts it; otherwise the result is NULL. */
+global void *__to_global(generic void *p) {
+  const size_t addr = (size_t)p;
+  const bool is_local = __gen_ocl_in_local(addr);
+  const bool is_private = __gen_ocl_in_private(addr);
+  if (!is_local && !is_private)
+    return (global void*)p;
+  return NULL;
+}
diff --git a/backend/src/libocl/src/ocl_pipe.cl b/backend/src/libocl/src/ocl_pipe.cl
new file mode 100644
index 0000000..7bfd370
--- /dev/null
+++ b/backend/src/libocl/src/ocl_pipe.cl
@@ -0,0 +1,296 @@
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_pipe.h"
+#include "ocl_atom.h"
+#include "ocl_workitem.h"
+#define PIPE_SUCCESS 0
+#define PIPE_EMPTY -2
+#define PIPE_FULL -3
+#define PIPE_HEADER_SZ 128
+#define RID_MAGIC 0xDE
+#define RIDT ushort
+PURE CONST __global void* __gen_ocl_get_pipe(pipe int p);
+PURE CONST ulong __gen_ocl_get_rid(reserve_id_t rid);
+PURE CONST reserve_id_t __gen_ocl_make_rid(ulong rid);
+/*
+ * Reads one packet from pipe p into dst.
+ *
+ * Header words used: [0] packet capacity, [1] packet size in bytes,
+ * [3] read index, [6] stored-packet counter. Packet data starts
+ * PIPE_HEADER_SZ bytes after the header base.
+ *
+ * Returns PIPE_SUCCESS after copying a packet, or PIPE_EMPTY when no packet
+ * could be claimed.
+ */
+int __read_pipe_2(pipe int p, __generic void* dst)
+{
+  __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+  /* atomic_sub returns the value held *before* the decrement, so a previous
+   * count of 0 (or below) means there was no packet to claim. */
+  int data_size = atomic_sub(pheader + 6, 1);
+  if(data_size <= 0){
+    atomic_add(pheader + 6, 1); /* undo the claim */
+    return PIPE_EMPTY;
+  }
+  __global char* psrc = (__global char*)pheader + PIPE_HEADER_SZ;
+  int pack_num = pheader[0];
+  int pack_size = pheader[1];
+  /* Take the next read index; the reader that took the last slot pulls the
+   * shared index back by one full lap. */
+  int read_ptr = atomic_add(pheader + 3, 1);
+  if(read_ptr == pack_num - 1)
+    atomic_sub(pheader + 3, pack_num);
+  read_ptr = read_ptr % pack_num;
+  for(int i = 0; i < pack_size ; i++)
+    ((char*)dst)[i] = psrc[i + read_ptr*pack_size];
+  return PIPE_SUCCESS;
+}
+/*
+ * Reads the packet at position `index` of reservation `id` from pipe p
+ * into dst.
+ *
+ * The reservation id is decoded as RIDT fields: [0] start slot,
+ * [1] number of reserved packets.
+ *
+ * Returns 0 on success, or PIPE_INDEX_OUTRANGE when index is not below the
+ * reserved packet count.
+ */
+int __read_pipe_4(pipe int p, reserve_id_t id, uint index, void* dst)
+{
+  __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+  __global char* psrc = (__global char*)pheader + PIPE_HEADER_SZ;
+  ulong uid = __gen_ocl_get_rid(id);
+  RIDT* pid = (RIDT*)&uid;
+  RIDT start_pt = pid[0];
+  RIDT reserve_size = pid[1];
+  /* Valid indices are 0 .. reserve_size - 1, so index == reserve_size is
+   * already outside the reservation. */
+  if(index >= reserve_size) return PIPE_INDEX_OUTRANGE;
+  int pack_num = pheader[0];
+  int pack_size = pheader[1];
+  int read_ptr = (start_pt + index) % pack_num;
+  int offset = read_ptr * pack_size;
+  for(int i = 0; i < pack_size ; i++)
+    ((char*)dst)[i] = psrc[i + offset];
+  return 0;
+}
+/*
+ * Appends one packet, copied from src, to pipe p.
+ *
+ * Returns 0 after the packet is stored, or PIPE_FULL when the stored-packet
+ * counter had already reached the pipe capacity.
+ */
+int __write_pipe_2(pipe int p, __generic void* src)
+{
+  __global int* hdr = (__global int*)__gen_ocl_get_pipe(p);
+  const int capacity = hdr[0];
+
+  /* Claim one slot in the packet counter; undo the claim when the pipe was
+   * already full. */
+  const int stored_before = atomic_add(hdr + 6, 1);
+  if(stored_before >= capacity){
+    atomic_sub(hdr + 6, 1);
+    return PIPE_FULL;
+  }
+
+  __global char* payload = (__global char*)hdr + PIPE_HEADER_SZ;
+  const int packet_bytes = hdr[1];
+
+  /* Take the next write index; the writer that took the last slot pulls the
+   * shared index back by one full lap. */
+  int slot = atomic_add(hdr + 2, 1);
+  if(slot == capacity - 1)
+    atomic_sub(hdr + 2, capacity);
+  slot = slot % capacity;
+
+  for(int b = 0; b < packet_bytes; b++)
+    payload[b + slot * packet_bytes] = ((char*)src)[b];
+  return 0;
+}
+/*
+ * Writes the packet at src to position `index` of reservation `id` in
+ * pipe p.
+ *
+ * The reservation id is decoded as RIDT fields: [0] start slot,
+ * [1] number of reserved packets.
+ *
+ * Returns PIPE_SUCCESS (0) on success, matching __read_pipe_4, or
+ * PIPE_INDEX_OUTRANGE when index is not below the reserved packet count.
+ */
+int __write_pipe_4(pipe int p, reserve_id_t id, uint index, void* src)
+{
+  __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+  __global char* psrc = (__global char*)pheader + PIPE_HEADER_SZ;
+  ulong uid = __gen_ocl_get_rid(id);
+  RIDT* pid = (RIDT*)&uid;
+  RIDT start_pt = pid[0];
+  RIDT reserve_size = pid[1];
+  /* Valid indices are 0 .. reserve_size - 1, so index == reserve_size is
+   * already outside the reservation. */
+  if(index >= reserve_size) return PIPE_INDEX_OUTRANGE;
+  int pack_num = pheader[0];
+  int pack_size = pheader[1];
+  int write_ptr = (start_pt + index) % pack_num;
+  int offset = write_ptr * pack_size;
+  for(int i = 0; i < pack_size ; i++)
+    psrc[i + offset] = ((char*)src)[i];
+  /* Report success as 0 rather than the packet size, so callers that test
+   * the result against 0 see a successful write. */
+  return PIPE_SUCCESS;
+}
+/*
+ * Reserves `num` packets of pipe p for reading.
+ *
+ * Returns a reservation id packing the start slot, the packet count and
+ * RID_MAGIC, or the id built from 0 when fewer than `num` packets are
+ * available.
+ */
+reserve_id_t __reserve_read_pipe(pipe int p, uint num)
+{
+  __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+  int data_size = atomic_sub(pheader + 6, num);
+  /* Test the sign before comparing with the unsigned num: another work-item
+   * whose reservation is about to be rolled back can leave the counter
+   * negative for a moment, and a negative value converted to uint would
+   * look like a very large packet count. */
+  if(data_size < 0 || (uint)data_size < num){
+    atomic_add(pheader + 6, num); /* roll the claim back */
+    return __gen_ocl_make_rid(0l);
+  }
+  int pack_num = pheader[0];
+  int read_ptr = atomic_add(pheader + 3, num);
+  if(read_ptr == pack_num - num)
+    atomic_sub(pheader + 3, pack_num);
+  ulong uid = 0l;
+  RIDT* pid = (RIDT*)&uid;
+  pid[0] = read_ptr % pack_num;
+  pid[1] = num;
+  pid[2] = RID_MAGIC ;
+  return __gen_ocl_make_rid(uid);
+}
+/* Commit of a read reservation: the body is empty, so this does nothing. */
+void __commit_read_pipe(pipe int p, reserve_id_t rid) {}
+/*
+ * Work-group variant of the read reservation. Only the work-item whose
+ * local linear id is 0 touches the pipe header; the slot it obtains is then
+ * broadcast to the whole group.
+ *
+ * Returns a reservation id packing the start slot, the packet count and
+ * RID_MAGIC, or the id built from 0 when the reservation failed (the
+ * broadcast slot is still DEAD_PTR).
+ */
+reserve_id_t __work_group_reserve_read_pipe(pipe int p, uint num)
+{
+  uint rid_ptr = DEAD_PTR;
+  if(get_local_linear_id()==0){
+    __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+    int data_size = atomic_sub(pheader + 6, num);
+    /* Test the sign before comparing with the unsigned num, so a counter
+     * that is negative for a moment is not read as a huge packet count. */
+    if(data_size < 0 || (uint)data_size < num){
+      /* Not enough packets: roll the claim back and leave rid_ptr at
+       * DEAD_PTR. The read index must not be advanced on this path. */
+      atomic_add(pheader + 6, num);
+    } else {
+      int pack_num = pheader[0];
+      int read_ptr = atomic_add(pheader + 3, num);
+      if(read_ptr == pack_num - num)
+        atomic_sub(pheader + 3, pack_num);
+      rid_ptr = read_ptr % pack_num;
+    }
+  }
+  ulong uid = 0l;
+  RIDT* pid = (RIDT*)&uid;
+  rid_ptr = work_group_broadcast(rid_ptr,0,0,0);
+  pid[0] = rid_ptr;
+  pid[1] = num;
+  pid[2] = RID_MAGIC ;
+  if(rid_ptr == DEAD_PTR)
+    uid = 0l;
+  return __gen_ocl_make_rid(uid);
+}
+/* Work-group commit of a read reservation: the body is empty, so this does nothing. */
+void __work_group_commit_read_pipe(pipe int p, reserve_id_t rid) {}
+/*
+ * Sub-group variant of the read reservation.
+ *
+ * Returns a reservation id packing the start slot, the packet count and
+ * RID_MAGIC, or the id built from 0 when fewer than `num` packets are
+ * available.
+ */
+reserve_id_t __sub_group_reserve_read_pipe(pipe int p, uint num)
+{
+  __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+  int data_size = atomic_sub(pheader + 6, num);
+  /* Test the sign before comparing with the unsigned num, so a counter that
+   * is negative for a moment is not read as a huge packet count. */
+  if(data_size < 0 || (uint)data_size < num){
+    atomic_add(pheader + 6, num); /* roll the claim back */
+    return __gen_ocl_make_rid(0l);
+  }
+  int pack_num = pheader[0];
+  int read_ptr = atomic_add(pheader + 3, num);
+  if(read_ptr == pack_num - num)
+    atomic_sub(pheader + 3, pack_num);
+  ulong uid = 0l;
+  RIDT* pid = (RIDT*)&uid;
+  pid[0] = read_ptr % pack_num;
+  pid[1] = num;
+  pid[2] = RID_MAGIC ;
+  return __gen_ocl_make_rid(uid);
+}
+/* Sub-group commit of a read reservation: the body is empty, so this does nothing. */
+void __sub_group_commit_read_pipe(pipe int p, reserve_id_t rid) {}
+/*
+ * Reserves `num` packet slots of pipe p for writing.
+ *
+ * Returns a reservation id packing the start slot, the packet count and
+ * RID_MAGIC, or the id built from 0 when the pipe cannot take `num` more
+ * packets.
+ */
+reserve_id_t __reserve_write_pipe(pipe int p, uint num)
+{
+  __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+  int pack_num = pheader[0];
+  int data_size = atomic_add(pheader + 6, num);
+  /* Reject num > capacity first: `pack_num - num` would otherwise be
+   * evaluated as unsigned and wrap around. The remaining comparison is done
+   * in signed arithmetic so a counter that is negative for a moment is not
+   * read as a huge packet count. */
+  if(num > (uint)pack_num || data_size > pack_num - (int)num){
+    atomic_sub(pheader + 6, num); /* roll the claim back */
+    return __gen_ocl_make_rid(0l);
+  }
+  int write_ptr = atomic_add(pheader + 2, num);
+  if(write_ptr == pack_num - num)
+    atomic_sub(pheader + 2, pack_num);
+  ulong uid = 0l;
+  RIDT* pid = (RIDT*)&uid;
+  pid[0] = write_ptr % pack_num;
+  pid[1] = num;
+  pid[2] = RID_MAGIC ;
+  return __gen_ocl_make_rid(uid);
+}
+/* Commit of a write reservation: the body is empty, so this does nothing. */
+void __commit_write_pipe(pipe int p, reserve_id_t rid) {}
+/*
+ * Work-group variant of the write reservation. Only the work-item whose
+ * local linear id is 0 touches the pipe header; the slot it obtains is then
+ * broadcast to the whole group.
+ *
+ * Returns a reservation id packing the start slot, the packet count and
+ * RID_MAGIC, or the id built from 0 when the reservation failed (the
+ * broadcast slot is still DEAD_PTR).
+ */
+reserve_id_t __work_group_reserve_write_pipe(pipe int p, uint num)
+{
+  uint rid_ptr = DEAD_PTR;
+  if(get_local_linear_id()==0){
+    __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+    int pack_num = pheader[0];
+    int data_size = atomic_add(pheader + 6, num);
+    /* Reject num > capacity first so `pack_num - num` cannot wrap as an
+     * unsigned value; compare the counter in signed arithmetic. */
+    if(num > (uint)pack_num || data_size > pack_num - (int)num){
+      /* No room: roll the claim back and leave rid_ptr at DEAD_PTR. The
+       * write index must not be advanced on this path. */
+      atomic_sub(pheader + 6, num);
+    } else {
+      int write_ptr = atomic_add(pheader + 2, num);
+      if(write_ptr == pack_num - num)
+        atomic_sub(pheader + 2, pack_num);
+      rid_ptr = write_ptr % pack_num;
+    }
+  }
+  ulong uid = 0l;
+  RIDT* pid = (RIDT*)&uid;
+  rid_ptr = work_group_broadcast(rid_ptr,0,0,0);
+  pid[0] = rid_ptr;
+  pid[1] = num;
+  pid[2] = RID_MAGIC ;
+  if(rid_ptr == DEAD_PTR)
+    uid = 0l;
+  return __gen_ocl_make_rid(uid);
+}
+/* Work-group commit of a write reservation: the body is empty, so this does nothing. */
+void __work_group_commit_write_pipe(pipe int p, reserve_id_t rid) {}
+/*
+ * Sub-group variant of the write reservation.
+ *
+ * Returns a reservation id packing the start slot, the packet count and
+ * RID_MAGIC, or the id built from 0 when the pipe cannot take `num` more
+ * packets.
+ */
+reserve_id_t __sub_group_reserve_write_pipe(pipe int p, uint num)
+{
+  __global int* pheader = (__global int*)__gen_ocl_get_pipe(p);
+  int pack_num = pheader[0];
+  int data_size = atomic_add(pheader + 6, num);
+  /* Reject num > capacity first so `pack_num - num` cannot wrap as an
+   * unsigned value; compare the counter in signed arithmetic. */
+  if(num > (uint)pack_num || data_size > pack_num - (int)num){
+    atomic_sub(pheader + 6, num); /* roll the claim back */
+    return __gen_ocl_make_rid(0l);
+  }
+  int write_ptr = atomic_add(pheader + 2, num);
+  if(write_ptr == pack_num - num)
+    atomic_sub(pheader + 2, pack_num);
+  ulong uid = 0l;
+  RIDT* pid = (RIDT*)&uid;
+  pid[0] = write_ptr % pack_num;
+  pid[1] = num;
+  pid[2] = RID_MAGIC ;
+  return __gen_ocl_make_rid(uid);
+}
+/* Sub-group commit of a write reservation: the body is empty, so this does nothing. */
+void __sub_group_commit_write_pipe(pipe int p, reserve_id_t rid) {}
+/* A reservation id is valid when its packet-count field (RIDT index 1) is
+ * non-zero and its magic field (RIDT index 2) equals RID_MAGIC. */
+bool is_valid_reserve_id(reserve_id_t rid)
+{
+  ulong raw = __gen_ocl_get_rid(rid);
+  RIDT* fields = (RIDT*)&raw;
+  return fields[1] != 0 && fields[2] == RID_MAGIC;
+}
+/* Query Function */
+/* Returns header word 0 of pipe p, which the other pipe routines in this
+ * file use as the packet capacity. */
+uint __get_pipe_max_packets(pipe int p)
+{
+  __global int* hdr = __gen_ocl_get_pipe(p);
+  const int capacity = hdr[0];
+  return capacity;
+}
+/* Returns header word 6 of pipe p, the counter the read/write routines in
+ * this file adjust when packets are claimed or added. */
+uint __get_pipe_num_packets(pipe int p)
+{
+  __global int* hdr = __gen_ocl_get_pipe(p);
+  const int stored = hdr[6];
+  return stored;
+}
diff --git a/backend/src/libocl/src/ocl_sync.cl b/backend/src/libocl/src/ocl_sync.cl
index b6efef8..590596a 100644
--- a/backend/src/libocl/src/ocl_sync.cl
+++ b/backend/src/libocl/src/ocl_sync.cl
@@ -16,10 +16,10 @@
 #include "ocl_sync.h"
+#include "ocl_misc.h"
 void __gen_ocl_barrier_local(void);
 void __gen_ocl_barrier_global(void);
-void __gen_ocl_barrier_local_and_global(void);
 void __gen_ocl_debugwait(void);
 OVERLOADABLE void mem_fence(cl_mem_fence_flags flags) {
@@ -30,3 +30,7 @@ OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags) {
 OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags) {
+cl_mem_fence_flags get_fence(void *ptr) {
+  bool cond = __gen_ocl_in_local((size_t)ptr);
diff --git a/backend/src/libocl/src/ocl_vload_20.cl b/backend/src/libocl/src/ocl_vload_20.cl
new file mode 100644
index 0000000..ab06aa2
--- /dev/null
+++ b/backend/src/libocl/src/ocl_vload_20.cl
@@ -0,0 +1,284 @@
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#include "ocl_vload_20.h"
+#include "ocl_relational.h"
+// These loads and stores use untyped reads and writes, so we can simply
+// cast to vector loads / stores. This is not C99 compliant because of the
+// aliasing rules, but it is harmless here: TBAA is not enabled in the compiler.
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+} \
+OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 3 * offset) = v.s0; \
+  *(p + 3 * offset + 1) = v.s1; \
+  *(p + 3 * offset + 2) = v.s2; \
+} \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
+} \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
+} \
+OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
+} \
+OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
+} \
+OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
+OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 2 * offset) = v.s0; \
+  *(p + 2 * offset + 1) = v.s1; \
+} \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 3 * offset) = v.s0; \
+  *(p + 3 * offset + 1) = v.s1; \
+  *(p + 3 * offset + 2) = v.s2; \
+} \
+OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
+  vstore2(v.lo, 2*offset, p); \
+  vstore2(v.hi, 2*offset, p+2); \
+} \
+OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
+  vstore4(v.lo, 2*offset, p); \
+  vstore4(v.hi, 2*offset, p+4); \
+} \
+OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
+  vstore8(v.lo, 2*offset, p); \
+  vstore8(v.hi, 2*offset, p+8); \
+  DECL_BYTE_RD_SPACE(TYPE, __generic) \
+  DECL_BYTE_RD_SPACE(TYPE, __constant) \
+  DECL_BYTE_WR_SPACE(TYPE, __generic)
+PURE CONST float __gen_ocl_f16to32(short h);
+PURE CONST short __gen_ocl_f32to16(float f);
+/* float -> half bit pattern, rounding toward positive infinity.
+ * Converts with __gen_ocl_f32to16, converts the result back, and when the
+ * input is still greater than that value adjusts the bit pattern by
+ * +1 (sign bit clear) or -1 (sign bit set). */
+OVERLOADABLE short f32to16_rtp(float f) {
+  const short nearest = __gen_ocl_f32to16(f);
+  const float back = __gen_ocl_f16to32(nearest);
+  if (!(f > back))
+    return nearest;
+  return nearest - signbit(f) * 2 + 1;
+}
+/* float -> half bit pattern, rounding toward negative infinity.
+ * Converts with __gen_ocl_f32to16, converts the result back, and when that
+ * value is greater than the input adjusts the bit pattern by
+ * -1 (sign bit clear) or +1 (sign bit set). */
+OVERLOADABLE short f32to16_rtn(float f) {
+  const short nearest = __gen_ocl_f32to16(f);
+  const float back = __gen_ocl_f16to32(nearest);
+  if (!(back > f))
+    return nearest;
+  return nearest + signbit(f) * 2 - 1;
+}
+/* float -> half bit pattern, rounding toward zero.
+ * Converts with __gen_ocl_f32to16, converts the result back, and subtracts 1
+ * from the bit pattern when the converted value lies further from zero than
+ * the input (greater for a clear sign bit, smaller for a set sign bit). */
+OVERLOADABLE short f32to16_rtz(float f) {
+  const short nearest = __gen_ocl_f32to16(f);
+  const float back = __gen_ocl_f16to32(nearest);
+  const bool overshoot = signbit(f) ? (back < f) : (back > f);
+  if (overshoot)
+    return nearest - 1;
+  return nearest;
+}
+OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
+  return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
+} \
+OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p) { \
+  return vload_half(offset, p); \
+} \
+OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
+  return (float2)(vload_half(offset*2, p), \
+                  vload_half(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p) { \
+  return (float2)(vloada_half(offset*2, p), \
+                  vloada_half(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
+  return (float3)(vload_half(offset*3, p), \
+                  vload_half(offset*3 + 1, p), \
+                  vload_half(offset*3 + 2, p)); \
+} \
+OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
+  return (float3)(vload_half(offset*4, p), \
+                  vload_half(offset*4 + 1, p), \
+                  vload_half(offset*4 + 2, p)); \
+} \
+OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
+  return (float4)(vload_half2(offset*2, p), \
+                  vload_half2(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p) { \
+  return (float4)(vloada_half2(offset*2, p), \
+                  vloada_half2(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
+  return (float8)(vload_half4(offset*2, p), \
+                  vload_half4(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p) { \
+  return (float8)(vloada_half4(offset*2, p), \
+                  vloada_half4(offset*2 + 1, p)); \
+} \
+OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
+  return (float16)(vload_half8(offset*2, p), \
+                   vload_half8(offset*2 + 1, p)); \
+OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p) { \
+  return (float16)(vloada_half8(offset*2, p), \
+                   vloada_half8(offset*2 + 1, p)); \
+OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  *(SPACE short *)(p + offset) = FUNC(data); \
+} \
+OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.lo, offset*2, p); \
+  vstore_half##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, offset*3, p); \
+  vstore_half##ROUND(data.s1, offset*3 + 1, p); \
+  vstore_half##ROUND(data.s2, offset*3 + 2, p); \
+} \
+OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, offset*4, p); \
+  vstore_half##ROUND(data.s1, offset*4 + 1, p); \
+  vstore_half##ROUND(data.s2, offset*4 + 2, p); \
+} \
+OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data.lo, offset*2, p); \
+  vstore_half2##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data.lo, offset*2, p); \
+  vstore_half4##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half8##ROUND(data, offset, p); \
+} \
+OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+  vstore_half8##ROUND(data.lo, offset*2, p); \
+  vstore_half8##ROUND(data.hi, offset*2 + 1, p); \
+} \
+OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+  vstore_half16##ROUND(data, offset, p); \
+  DECL_HALF_ST_SPACE_ROUND(SPACE,  , __gen_ocl_f32to16) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn) \
diff --git a/backend/src/libocl/src/ocl_workitem.cl b/backend/src/libocl/src/ocl_workitem.cl
index 235f12b..eb6210d 100644
--- a/backend/src/libocl/src/ocl_workitem.cl
+++ b/backend/src/libocl/src/ocl_workitem.cl
@@ -30,6 +30,7 @@ PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
 PURE CONST unsigned int __gen_ocl_##NAME##2(void);
@@ -37,7 +38,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
-OVERLOADABLE unsigned NAME(unsigned int dim) {             \
+OVERLOADABLE size_t NAME(unsigned int dim) {             \
   if (dim == 0) return __gen_ocl_##NAME##0();        \
   else if (dim == 1) return __gen_ocl_##NAME##1();   \
   else if (dim == 2) return __gen_ocl_##NAME##2();   \
@@ -46,24 +47,18 @@ OVERLOADABLE unsigned NAME(unsigned int dim) {             \
 DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
 DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_enqueued_local_size, 1)
 DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
 DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
 DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
 DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
-OVERLOADABLE uint get_global_id(uint dim) {
-  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
+OVERLOADABLE size_t get_global_id(uint dim) {
+  return get_local_id(dim) + get_enqueued_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
-OVERLOADABLE uint get_enqueued_local_size (uint dimindx)
-  //TODO: should be different with get_local_size when support
-  //non-uniform work-group size
-  return get_local_size(dimindx);
-OVERLOADABLE uint get_global_linear_id(void)
+OVERLOADABLE size_t get_global_linear_id(void)
   uint dim = __gen_ocl_get_work_dim();
   if (dim == 1) return get_global_id(0) - get_global_offset(0);
@@ -76,12 +71,12 @@ OVERLOADABLE uint get_global_linear_id(void)
   else return 0;
-OVERLOADABLE uint get_local_linear_id(void)
+OVERLOADABLE size_t get_local_linear_id(void)
   uint dim = __gen_ocl_get_work_dim();
   if (dim == 1) return get_local_id(0);
-  else if (dim == 2) return get_local_id(1) * get_local_size (0) + get_local_id(0);
-  else if (dim == 3) return (get_local_id(2) * get_local_size(1) * get_local_size(0)) +
-                            (get_local_id(1) * get_local_size(0)) + get_local_id(0);
+  // NOTE(review): the 3-D case below multiplies by get_local_size(0) in its
+  // first term but by get_enqueued_local_size(0) in its second term. The two
+  // presumably differ only for non-uniform work-groups; confirm which size
+  // is intended and use it consistently.
+  else if (dim == 2) return get_local_id(1) * get_enqueued_local_size(0) + get_local_id(0);
+  else if (dim == 3) return (get_local_id(2) * get_enqueued_local_size(1) * get_local_size(0)) +
+                            (get_local_id(1) * get_enqueued_local_size(0)) + get_local_id(0);
   else return 0;
diff --git a/backend/src/libocl/tmpl/ocl_defines.tmpl.h b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
index f5c65df..c16a99f 100644
--- a/backend/src/libocl/tmpl/ocl_defines.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
@@ -18,13 +18,18 @@
 #ifndef __OCL_COMMON_DEF_H__
 #define __OCL_COMMON_DEF_H__
-#define __OPENCL_VERSION__ 120
 #define __CL_VERSION_1_0__ 100
 #define __CL_VERSION_1_1__ 110
 #define __CL_VERSION_1_2__ 120
 #define CL_VERSION_1_0 100
 #define CL_VERSION_1_1 110
 #define CL_VERSION_1_2 120
+#if (__OPENCL_C_VERSION__ >= 200)
+#define __OPENCL_VERSION__ 200
+#define CL_VERSION_2_0 200
+#define __OPENCL_VERSION__ 120
 #define __ENDIAN_LITTLE__ 1
 #define __IMAGE_SUPPORT__ 1
 #define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 7e7f4ae..3327389 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -40,6 +40,18 @@ SDEF(long, s, 64);
 SDEF(ulong, u, 64);
 #undef SDEF
+#define SDEF(TYPE, TYPE_NAME, SIZE)        \
+OVERLOADABLE TYPE ctz(TYPE x){ return ctz_##TYPE_NAME##SIZE(x);}
+SDEF(char, s, 8);
+SDEF(uchar, u, 8);
+SDEF(short, s, 16);
+SDEF(ushort, u, 16);
+SDEF(int, s, 32);
+SDEF(uint, u, 32);
+SDEF(long, s, 64);
+SDEF(ulong, u, 64);
+#undef SDEF
 #define SDEF(TYPE)        \
 OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
index 4b3b5ae..ac1800b 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
@@ -54,6 +54,24 @@ uint   clz_u32(uint);
 long   clz_s64(long);
 ulong  clz_u64(ulong);
+OVERLOADABLE char ctz(char x);
+OVERLOADABLE uchar ctz(uchar x);
+OVERLOADABLE short ctz(short x);
+OVERLOADABLE ushort ctz(ushort x);
+OVERLOADABLE int ctz(int x);
+OVERLOADABLE uint ctz(uint x);
+OVERLOADABLE long ctz(long x);
+OVERLOADABLE ulong ctz(ulong x);
+char   ctz_s8(char);
+uchar  ctz_u8(uchar);
+short  ctz_s16(short);
+ushort ctz_u16(ushort);
+int    ctz_s32(int);
+uint   ctz_u32(uint);
+long   ctz_s64(long);
+ulong  ctz_u64(ulong);
 OVERLOADABLE char popcount(char x);
 OVERLOADABLE uchar popcount(uchar x);
 OVERLOADABLE short popcount(short x);
diff --git a/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl b/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl
new file mode 100644
index 0000000..d47e0a2
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl
@@ -0,0 +1,3801 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_math_20.h"
+#include "ocl_float.h"
+#include "ocl_relational.h"
+#include "ocl_common.h"
+#include "ocl_integer.h"
+extern constant int __ocl_math_fastpath_flag; /* non-zero: sin/cos/tan/log1p below return their fastpath variant */
+CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32"); /* symbol name bound to llvm.fabs.f32 */
+CONST float __gen_ocl_sin(float x) __asm("llvm.sin" ".f32");
+CONST float __gen_ocl_cos(float x) __asm("llvm.cos" ".f32");
+CONST float __gen_ocl_sqrt(float x) __asm("llvm.sqrt" ".f32");
+PURE CONST float __gen_ocl_rsqrt(float x); /* no __asm label: not bound to an LLVM intrinsic here */
+CONST float __gen_ocl_log(float x) __asm("llvm.log2" ".f32"); /* note: base-2 logarithm (llvm.log2) */
+CONST float __gen_ocl_exp(float x) __asm("llvm.exp2" ".f32"); /* note: base-2 exponential (llvm.exp2) */
+PURE CONST float __gen_ocl_pow(float x, float y) __asm("llvm.pow" ".f32");
+PURE CONST float __gen_ocl_rcp(float x); /* no __asm label: not bound to an LLVM intrinsic here */
+CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32"); /* round toward zero */
+CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32"); /* llvm.rint rounding */
+CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32"); /* round up */
+CONST float __gen_ocl_rndd(float x) __asm("llvm.floor" ".f32"); /* round down */
+/* native functions */
+/* Thin wrappers over the __gen_ocl_* primitives declared above; they add no
+ * range checks or special-case handling of their own. */
+OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
+OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
+/* __gen_ocl_log is bound to llvm.log2, so it already is a base-2 logarithm. */
+OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
+/* ln(x) = log2(x) * ln(2) */
+OVERLOADABLE float native_log(float x) {
+  return native_log2(x) * 0.6931472002f;
+}
+/* log10(x) = log2(x) * log10(2) */
+OVERLOADABLE float native_log10(float x) {
+  return native_log2(x) * 0.3010299956f;
+}
+OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
+OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
+OVERLOADABLE float native_tan(float x) {
+  return native_sin(x) / native_cos(x);
+}
+/* __gen_ocl_exp is bound to llvm.exp2; the other bases rescale the argument
+ * first (M_LOG2E_F / M_LOG210_F are presumably log2(e) / log2(10) -- defined
+ * outside this file). */
+OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
+OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
+OVERLOADABLE float native_exp10(float x) { return __gen_ocl_exp(M_LOG210_F*x); }
+OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+/* Fast path */
+/* Short closed-form variants built directly on the native_* and __gen_ocl_*
+ * primitives. sin/cos/tan in this file return these when
+ * __ocl_math_fastpath_flag is non-zero. */
+OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x) {
+    return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x) {
+    return native_log(x + native_sqrt(x * x + 1));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x) {
+    return 0.5f * native_log((1 + x) / (1 - x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x) {
+    return __gen_ocl_pow(x, 0.3333333333f);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x) {
+    return native_cos(x);
+}
+/* cosh(x) = (1 + e^-2x) / (2 e^-x) */
+OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x) {
+    return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x) {
+    return __gen_ocl_cos(x * M_PI_F);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x) {
+    return native_exp(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x) {
+    return native_exp10(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x) {
+    return __gen_ocl_pow(M_E_F, x) - 1;
+}
+/* Quotient rounded toward zero (rndz). */
+OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y) {
+    return x-y*__gen_ocl_rndz(x/y);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y) {
+    return __gen_ocl_sqrt(x*x + y*y);
+}
+OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x) {
+    return __gen_ocl_rndd(native_log2(x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n) {
+    return __gen_ocl_pow(2, n) * x;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x) {
+    return native_log(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x) {
+    return native_log2(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x) {
+    return native_log10(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x) {
+    return native_log(x + 1);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x) {
+    return __gen_ocl_rndd(native_log2(x));
+}
+/* Quotient rounded with rnde (llvm.rint). */
+OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y) {
+    return x-y*__gen_ocl_rnde(x/y);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n) {
+    return __gen_ocl_pow(x, 1.f / n);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x) {
+    return native_sin(x);
+}
+/* Returns sin(x) and stores cos(x) through cosval. */
+OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, float *cosval) {
+    *cosval = native_cos(x);
+    return native_sin(x);
+}
+/* sinh(x) = (1 - e^-2x) / (2 e^-x) */
+OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x) {
+    return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x) {
+    return __gen_ocl_sin(x * M_PI_F);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x) {
+    return native_tan(x);
+}
+/* tanh(x) = (1 - e^-2x) / (1 + e^-2x) */
+OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x) {
+    float y = native_exp(-2 * x);
+    return (1 - y) / (1 + y);
+}
+/* Internal implementations, high accuracy. */
+/* floor(x): forwards to the round-down primitive (__gen_ocl_rndd). */
+OVERLOADABLE float __gen_ocl_internal_floor(float x)
+{
+  const float rounded_down = __gen_ocl_rndd(x);
+  return rounded_down;
+}
+/* Returns a value with the magnitude bits of x and the sign bit of y.
+ * Works on the raw bit pattern, so it is exact for every input. */
+OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+  union { unsigned u; float f; } ux, uy;
+  ux.f = x;
+  uy.f = y;
+  /* low 31 bits (exponent + mantissa) from x, bit 31 (sign) from y */
+  ux.u = (ux.u & 0x7fffffff) | (uy.u & 0x80000000u);
+  return ux.f;
+}
+OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  /* Natural logarithm core with no input screening of its own: the callers in
+   * this file (log, log2, log10) reject negative and Inf/NaN inputs before
+   * calling it. x is split as 2^k * (1+f) and ln(1+f) is evaluated with the
+   * Lg1..Lg4 polynomial in s = f/(2+f). */
+  union { unsigned int i; float f; } u;
+  const float
+  ln2_hi = 6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,  /* 0x3717f7d1 */
+  Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lg3 = 2.8571429849e-01, /* 3E924925 */
+  Lg4 = 2.2222198546e-01; /* 3E638E29 */
+  float fsq, f, s, z, R, w, t1, t2, partial;
+  int k, ix, i, j;
+  u.f = x;  ix = u.i;
+  k = (ix>>23) - 127;                 /* unbiased exponent */
+  ix &= 0x007fffff;                   /* mantissa bits only */
+  /* i is 0x800000 when the mantissa is large enough that x is renormalised
+   * with exponent -1 instead of 0; k is bumped by one to compensate. */
+  i = (ix + (0x95f64<<3)) & 0x800000;
+  u.i = ix | (i^0x3f800000); x = u.f;
+  k += (i>>23);
+  f = x - 1.0f;
+  fsq = f * f;
+  if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
+      R = fsq * (0.5f - 0.33333333333333333f * f);
+      return k * ln2_hi + k * ln2_lo + f - R;
+  }
+  s = f / (2.0f + f);
+  z = s * s;
+  i = ix - (0x6147a << 3);
+  w = z * z;
+  j = (0x6b851 << 3) - ix;
+  t1= w * mad(w, Lg4, Lg2);
+  t2= z * mad(w, Lg3, Lg1);
+  i |= j;                             /* i > 0 only when ix lies between the two thresholds */
+  R = t2 + t1;
+  partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
+  return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;
+}
+/* Natural logarithm. Returns NAN when the sign bit of x is set and when x is
+ * +Inf or NaN; every other input is handed to __gen_ocl_internal_log_valid. */
+OVERLOADABLE float __gen_ocl_internal_log(float x)
+{
+  union { unsigned int i; float f; } u;
+  u.f = x;
+  int ix = u.i;
+  if (ix < 0 )
+    return NAN;  /* log(-#) = NaN */
+  if (ix >= 0x7f800000)   /* exponent field all ones: Inf or NaN */
+    return NAN;
+  return __gen_ocl_internal_log_valid(x);
+}
+/* Base-10 logarithm. Returns NAN when the sign bit of x is set and when x is
+ * +Inf or NaN. Otherwise x is rescaled to 2^(-i) * mantissa with i in {0,1},
+ * and the result is assembled as y*log10(2) + log10(e)*ln(rescaled x). */
+OVERLOADABLE float __gen_ocl_internal_log10(float x)
+{
+  union { float f; unsigned i; } u;
+  const float
+  ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
+  log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
+  log10_2lo  =  7.9034151668e-07; /* 0x355427db */
+  float y;
+  int i, k, hx;
+  u.f = x; hx = u.i;
+  if (hx<0)
+    return NAN; /* log(-#) = NaN */
+  if (hx >= 0x7f800000)
+    return NAN;
+  k = (hx >> 23) - 127;                      /* unbiased exponent */
+  i  = ((unsigned)k & 0x80000000) >> 31;     /* 1 when k is negative, else 0 */
+  hx = (hx&0x007fffff) | ((0x7f-i) << 23);   /* same mantissa, exponent forced to -i */
+  y  = (float)(k + i);
+  u.i = hx; x = u.f;
+  return  y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
+}
+/* Base-2 logarithm: ln(x) scaled by 1/ln(2). Returns NAN when the sign bit of
+ * x is set and when x is +Inf or NaN. */
+OVERLOADABLE float __gen_ocl_internal_log2(float x)
+{
+  const float invln2 = 0x1.715476p+0f;   /* 1/ln(2) */
+  int ix;
+  union { float f; int i; } u;
+  u.f = x; ix = u.i;
+  if (ix < 0)
+    return NAN;    /** log(-#) = NaN */
+  if (ix >= 0x7f800000)
+    return NAN;
+  return invln2 * __gen_ocl_internal_log_valid(x);
+}
+/* Returns x * 2^n computed by editing the exponent field directly.
+ * Zero, Inf and NaN are passed through (NaN/Inf as x+x); results that leave
+ * the float range become a signed huge*huge / tiny*tiny product. */
+float __gen_ocl_scalbnf (float x, int n){
+  /* copy from fdlibm */
+  float two25 = 3.355443200e+07,	/* 0x4c000000 */
+  twom25 = 2.9802322388e-08,	        /* 0x33000000 */
+  huge = 1.0e+30,
+  tiny = 1.0e-30;
+  int k,ix;
+  /* ix must hold the bit pattern of x before any field is extracted from it. */
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  k = (ix&0x7f800000)>>23; /* extract exponent */
+  if (k==0) {	/* 0 or subnormal x */
+    if ((ix&0x7fffffff)==0) return x; /* +-0 */
+    x *= two25;                       /* scale the subnormal into the normal range */
+    GEN_OCL_GET_FLOAT_WORD(ix,x);     /* reload the bits of the scaled value */
+    k = ((ix&0x7f800000)>>23) - 25;
+  }
+  if (k==0xff) return x+x;	/* NaN or Inf */
+  if (n< -50000)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  if (n> 50000 || k+n > 0xfe)
+    return huge*__gen_ocl_internal_copysign(huge,x); /* overflow  */
+  /* Now k and n are bounded we know that k = k+n does not overflow. */
+  k = k+n;
+  if (k > 0) { /* normal result */
+    GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+    return x;
+  }
+  if (k <= -25)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  k += 25;				/* subnormal result */
+  GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+  return x*twom25;
+}
+/* Hex digits of 2/pi in 12-bit groups (0xA2F 0x983 0x6E4 ...), preceded by
+ * two zero entries; payne_hanek() indexes this table. */
+const __constant unsigned int two_over_pi[] = {
+0, 0, 0xA2F, 0x983, 0x6E4, 0xe44, 0x152, 0x9FC,
+0x275, 0x7D1, 0xF53, 0x4DD, 0xC0D, 0xB62,
+0x959, 0x93C, 0x439, 0x041, 0xFE5, 0x163,
+};
+// The main idea is from "Radian Reduction for Trigonometric Functions"
+// written by Mary H. Payne and Robert N. Hanek. Also another reference
+// is "A Continued-Fraction Analysis of Trigonometric Argument Reduction"
+// written by Roger Alan Smith, who gave the worst case in this paper.
+// for single float, worst x = 0x1.47d0fep34, and there are 29 bit
+// leading zeros in the fraction part of x*(2.0/pi). so we need at least
+// 29 (leading zero)+ 24 (fraction )+12 (integer) + guard bits. that is,
+// 65 + guard bits, as we calculate in 12*7 = 84bits, which means we have
+// about 19 guard bits. If we need further precision, we may need more
+// guard bits
+// Note we place two 0 in two_over_pi, which is used to handle input less
+// than 0x1.0p23
+//
+// Returns the integer part of x*(2/pi) (after rounding to the nearest
+// quadrant) and stores the reduced argument, scaled back by pi/2, in *y.
+int payne_hanek(float x, float *y) {
+  union { float f; unsigned u;} ieee;
+  ieee.f = x;
+  unsigned u = ieee.u;
+  int k = ((u & 0x7f800000) >> 23)-127;
+  int ma = (u & 0x7fffff) | 0x800000;      // 24-bit mantissa with the hidden bit
+  unsigned  high, low;
+  high = (ma & 0xfff000) >> 12;
+  low = ma & 0xfff;
+  // To tune the macros below, you need to fully understand the algorithm
+#define CALC_BLOCKS 7
+#define ZERO_BITS 2
+  unsigned result[CALC_BLOCKS];
+  // round down, note we need 2 bits integer precision
+  int index = (k-23-2) < 0 ? (k-23-2-11)/12 : (k-23-2)/12;
+  // 12-bit x 12-bit partial products of the mantissa halves with 2/pi
+  for (int i = 0; i < CALC_BLOCKS; i++) {
+    result[i] =  low * two_over_pi[index+i+ZERO_BITS] ;
+    result[i] +=  high * two_over_pi[index+i+1+ZERO_BITS];
+  }
+  // propagate carries so that every block keeps only 12 bits
+  for (int i = CALC_BLOCKS-1; i > 0; i--) {
+    int temp = result[i] >> 12;
+    result[i]  -= temp << 12;
+    result[i-1] += temp;
+  }
+#undef CALC_BLOCKS
+#undef ZERO_BITS
+  // get number of integer digits in result[0], note we only consider 12 valid bits
+  // and also it means the fraction digits in result[0] is (12-intDigit)
+  int intDigit = index*(-12) + (k-23);
+  // As the integer bits may be all included in result[0], and also maybe
+  // some bits in result[0], and some in result[1]. So we merge successive bits,
+  // which makes easy coding.
+  unsigned b0 = (result[0] << 12) | result[1];
+  unsigned b1 = (result[2] << 12) | result[3];
+  unsigned b2 = (result[4] << 12) | result[5];
+  unsigned b3 = (result[6] << 12);
+  unsigned intPart = b0 >> (24-intDigit);
+  unsigned fract1 = ((b0 << intDigit) | (b1 >> (24-intDigit))) & 0xffffff;
+  unsigned fract2 = ((b1 << intDigit) | (b2 >> (24-intDigit))) & 0xffffff;
+  unsigned fract3 = ((b2 << intDigit) | (b3 >> (24-intDigit))) & 0xffffff;
+  // larger than 0.5? which mean larger than pi/4, we need
+  // transform from [0,pi/2] to [-pi/4, pi/4] through -(1.0-fract)
+  int largerPiBy4 = ((fract1 & 0x800000) != 0);
+  int sign = largerPiBy4 ? 1 : 0;
+  intPart = largerPiBy4 ? (intPart+1) : intPart;
+  fract1 = largerPiBy4 ? (fract1 ^ 0x00ffffff) : fract1;
+  fract2 = largerPiBy4 ? (fract2 ^ 0x00ffffff) : fract2;
+  fract3 = largerPiBy4 ? (fract3 ^ 0x00ffffff) : fract3;
+  int leadingZero = (fract1 == 0);
+  // +1 is for the hidden bit 1 in floating-point format
+  int exponent = leadingZero ? -(24+1) : -(0+1);
+  fract1 = leadingZero ? fract2 : fract1;
+  fract2 = leadingZero ? fract3 : fract2;
+  // fract1 may have leading zeros, add it
+  int shift = clz(fract1)-8;
+  exponent += -shift;
+  float pio2 = 0x1.921fb6p+0;
+  unsigned fdigit = ((fract1 << shift) | (fract2 >> (24-shift))) & 0xffffff;
+  // we know that denormal number will not appear here
+  // (sign is shifted as unsigned so bit 31 is set without signed overflow)
+  ieee.u = ((unsigned)sign << 31) | ((exponent+127) << 23) | (fdigit & 0x7fffff);
+  *y = ieee.f * pio2;
+  return intPart;
+}
+/* Argument reduction for small x (the caller in this file uses it for
+ * x < 4000): picks y = (int)(x*2/pi + 0.5), stores x - y*(pi/2) in
+ * *remainder and returns y.
+ * pi/2 is supplied as six 12-bit pieces (piBy2_1h .. piBy2_3l) and y is split
+ * into yh (low 12 mantissa bits cleared) plus yl. */
+int argumentReduceSmall(float x, float * remainder) {
+  union {
+    float f;
+    unsigned u;
+  } ieee;
+  float twoByPi = 2.0f/3.14159265f;
+  float piBy2_1h = (float) 0xc90/0x1.0p11,
+        piBy2_1l = (float) 0xfda/0x1.0p23,
+        piBy2_2h = (float) 0xa22/0x1.0p35,
+        piBy2_2l = (float) 0x168/0x1.0p47,
+        piBy2_3h = (float) 0xc23/0x1.0p59,
+        piBy2_3l = (float) 0x4c4/0x1.0p71;
+  float y = (float)(int)(twoByPi * x + 0.5f);
+  ieee.f = y;
+  ieee.u = ieee.u & 0xfffff000;
+  float yh = ieee.f;
+  float yl = y - yh;
+  /* rem = x - (yh + yl) * (sum of the six pieces): every product is
+   * subtracted. The original added the two yl*piBy2_2* products; yl is zero
+   * whenever y fits in 12 bits, so in-range results are unchanged. */
+  float rem = x - yh*piBy2_1h - yh*piBy2_1l - yl*piBy2_1h - yl*piBy2_1l;
+  rem = rem - yh*piBy2_2h - yh*piBy2_2l - yl*piBy2_2h - yl*piBy2_2l;
+  rem = rem - yh*piBy2_3h - yh*piBy2_3l - yl*piBy2_3h - yl*piBy2_3l;
+  *remainder = rem;
+  return (int)y;
+}
+/* Reduces x modulo pi/2: stores the remainder in *y and returns the quadrant
+ * count. Inputs below 4000 use argumentReduceSmall(); everything else goes
+ * through payne_hanek(). */
+int __ieee754_rem_pio2f(float x, float *y) {
+  if (x < 4000.0f) {
+    return argumentReduceSmall(x, y);
+  }
+  return payne_hanek(x, y);
+}
+/* Polynomial sine kernel: x + x^3 * (S1 + S2*x^2 + S3*x^4 + S4*x^6).
+ * sin() in this file calls it directly for |x| <= pi/4 and on reduced
+ * arguments otherwise. */
+OVERLOADABLE float __kernel_sinf(float x)
+{
+  /* copied from fdlibm */
+  const float
+  S1  = -1.6666667163e-01, /* 0xbe2aaaab */
+  S2  =  8.3333337680e-03, /* 0x3c088889 */
+  S3  = -1.9841270114e-04, /* 0xb9500d01 */
+  S4  =  2.7557314297e-06; /* 0x3638ef1b */
+  float z,r,v;
+  z =  x*x;
+  v =  z*x;                                      /* x^3 */
+  r = mad(z, mad(z, mad(z, S4, S3), S2), S1);    /* Horner form in z */
+  return mad(v, r, x);
+}
+/* Polynomial cosine kernel for x + y, where y is the low-order tail of the
+ * argument (the callers in this file pass 0). Below |x| = 0.3 the series is
+ * used directly; above it x/4 is split off first, as in fdlibm. */
+float __kernel_cosf(float x, float y)
+{
+  /* copied from fdlibm */
+  const float
+  one =  1.0000000000e+00, /* 0x3f800000 */
+  C1  =  4.1666667908e-02, /* 0x3d2aaaab */
+  C2  = -1.3888889225e-03, /* 0xbab60b61 */
+  C3  =  2.4801587642e-05; /* 0x37d00d01 */
+  float a,hz,z,r,qx;
+  int ix;
+  /* ix must be loaded from x before it is masked; it was read uninitialised. */
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;     /* ix = |x|'s high word*/
+  z  = x*x;
+  r = z * mad(z, mad(z, C3, C2), C1);
+  if(ix < 0x3e99999a)       /* if |x| < 0.3 */
+      return one - ((float)0.5*z - (z*r - x*y));
+  else {
+      GEN_OCL_SET_FLOAT_WORD(qx,ix-0x01000000); /* x/4 */
+      hz = (float)0.5*z-qx;
+      a  = one-qx;
+      return a - (hz - (z*r-x*y));
+  }
+}
+/* sin(x). Returns the fastpath variant when __ocl_math_fastpath_flag is set.
+ * Otherwise: Inf/NaN give NaN (x-x), |x| <= pi/4 uses the sine kernel
+ * directly, and larger arguments are reduced modulo pi/2 first. */
+OVERLOADABLE float sin(float x)
+{
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_sin(x);
+  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
+  float y;
+  int n, ix;
+  float negative = x < 0.0f? -1.0f : 1.0f;
+  x = fabs(x);
+  /* ix must be loaded from x before it is masked; it was read uninitialised. */
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;
+    /* sin(Inf or NaN) is NaN */
+  if (ix >= 0x7f800000) return x-x;
+  if(x <= pio4)
+    return negative * __kernel_sinf(x);
+  /* argument reduction needed */
+  else {
+      n = __ieee754_rem_pio2f(x,&y);
+      float s = __kernel_sinf(y);
+      float c = __kernel_cosf(y,0.0f);
+      /* odd quadrants use the cosine kernel; quadrants 2 and 3 flip the sign */
+      float ret = (n&1) ? negative*c : negative*s;
+      return (n&3)> 1? -1.0f*ret : ret;
+  }
+}
+/* cos(x). Returns the fastpath variant when __ocl_math_fastpath_flag is set.
+ * Otherwise: Inf/NaN give NaN (x-x), |x| <= pi/4 uses the cosine kernel
+ * directly, and larger arguments are reduced modulo pi/2 first. */
+OVERLOADABLE float cos(float x)
+{
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_cos(x);
+  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
+  float y;
+  int n, ix;
+  x = __gen_ocl_fabs(x);
+  /* ix must be loaded from x before it is masked; it was read uninitialised. */
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;
+    /* cos(Inf or NaN) is NaN */
+  if (ix >= 0x7f800000) return x-x;
+  if(x <= pio4)
+    return __kernel_cosf(x, 0.f);
+  /* argument reduction needed */
+  else {
+      n = __ieee754_rem_pio2f(x,&y);
+      n &= 3;
+      float c = __kernel_cosf(y, 0.0f);
+      float s = __kernel_sinf(y);
+      float v = (n&1) ? s : c;
+      /* n&3   return
+          0    cos(y)
+          1   -sin(y)
+          2   -cos(y)
+          3    sin(y)
+      */
+      int mask = (n>>1) ^ n;   /* low bit set exactly for n == 1 and n == 2 */
+      float sign = (mask&1) ? -1.0f : 1.0f;
+      return sign * v;
+  }
+}
+/* Tangent kernel for x + y (y is the low-order tail of the argument).
+ * iy == 1 returns tan(x + y); any other value (tan() passes -1) returns
+ * -1/tan(x + y). */
+float __kernel_tanf(float x, float y, int iy)
+{
+  /* copied from fdlibm */
+        float z,r,v,w,s;
+        int ix,hx;
+        const float
+        one   =  1.0000000000e+00, /* 0x3f800000 */
+        pio4  =  7.8539812565e-01, /* 0x3f490fda */
+        pio4lo=  3.7748947079e-08; /* 0x33222168 */
+        /* Only T[0]..T[7] are assigned and read below. */
+        float T[13];// =  {
+         T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
+         T[1] = 1.3333334029e-01; /* 0x3e088889 */
+         T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
+         T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
+         T[4] = 8.8632395491e-03; /* 0x3c11371f */
+         T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
+         T[6] = 1.4562094584e-03; /* 0x3abede48 */
+         T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
+        GEN_OCL_GET_FLOAT_WORD(hx,x);
+        ix = hx&0x7fffffff;     /* high word of |x| */
+        if(ix<0x31800000)                       /* x < 2**-28 */
+            {if((int)x==0) {                    /* generate inexact */
+                if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
+                else return (iy==1)? x: -one/x;
+            }
+            }
+        if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
+            if(hx<0) {x = -x; y = -y;}
+            z = pio4-x;
+            w = pio4lo-y;
+            x = z+w; y = 0.0;
+        }
+        z       =  x*x;
+        w       =  z*z;
+        /* Break x^5*(T[1]+x^2*T[2]+...) into
+         *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+         *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
+         */
+        r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]);
+        v = z* mad(w, mad(w, T[6], T[4]), T[2]);
+        s = z*x;
+        r = mad(z, mad(s, r + v, y), y);
+        r += T[0]*s;
+        w = x+r;
+        if(ix>=0x3f2ca140) {
+            v = (float)iy;
+            return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
+        }
+        if(iy==1) return w;
+        else
+            return -1.0/(x+r);
+}
+/* tan(x). Returns the fastpath variant when __ocl_math_fastpath_flag is set.
+ * Otherwise Inf/NaN give NaN (x-x) and every other input is reduced modulo
+ * pi/2 and passed to the tangent kernel. */
+OVERLOADABLE float tan(float x)
+{
+    if (__ocl_math_fastpath_flag)
+      return __gen_ocl_internal_fastpath_tan(x);
+    float y;
+    int n, ix;
+    float negative = x < 0.0f? -1.0f : 1.0f;
+    x = negative * x;
+    /* ix must be loaded from x before it is masked; it was read uninitialised. */
+    GEN_OCL_GET_FLOAT_WORD(ix,x);
+    ix &= 0x7fffffff;
+    /* tan(Inf or NaN) is NaN */
+    if (ix>=0x7f800000) return x-x;            /* NaN */
+    /* argument reduction needed */
+    else {
+      n = __ieee754_rem_pio2f(x,&y);
+      return negative * __kernel_tanf(y,0.0f,1-((n&1)<<1)); /*   1 -- n even
+                                                              -1 -- n odd */
+    }
+}
+/* cos(pi * x). Inf/NaN give NAN and |x| > 2^24 gives 1. Otherwise |x| is
+ * folded into m in [0, 2) and the switch on floor(4*m) selects a sine or
+ * cosine kernel. */
+OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+  int ix;
+  if(isinf(x) || isnan(x)) { return NAN; }
+  if(x < 0.0f) { x = -x; }
+  if(x> 0x1.0p24) return 1.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;                          /* fractional part */
+  if((ix&0x1) != 0) m+=1.0f;        /* odd integer part: m moves into [1, 2) */
+  ix = __gen_ocl_internal_floor(m*4.0f);
+  switch(ix) {
+   case 0:
+    return __kernel_cosf(m*M_PI_F, 0.0f);
+   case 1:
+   case 2:
+    return __kernel_sinf((0.5f-m)*M_PI_F);
+   case 3:
+   case 4:
+    return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
+   case 5:
+   case 6:
+    return __kernel_sinf((m-1.5f)*M_PI_F);
+   default:
+    return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
+   }
+}
+/* sin(pi * x). Inf/NaN give NAN and |x| > 2^24 gives 0. Otherwise |x| is
+ * folded into m in [0, 2) and the switch on floor(4*m) selects a sine or
+ * cosine kernel; the sign of x is applied to the result. */
+OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+  float sign = 1.0f;
+  int ix;
+  /* NaN is rejected up front, as cospi does, instead of reaching the
+   * float-to-int conversions below. */
+  if(isinf(x) || isnan(x)) return NAN;
+  if(x < 0.0f) { x = -x; sign = -1.0f; }
+  if(x> 0x1.0p24) return 0.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;                          /* fractional part */
+  if((ix&0x1) != 0) m+=1.0f;        /* odd integer part: m moves into [1, 2) */
+  ix = __gen_ocl_internal_floor(m*4.0f);
+  switch(ix) {
+   case 0:
+    return sign*__kernel_sinf(m*M_PI_F);
+   case 1:
+   case 2:
+    return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
+   case 3:
+   case 4:
+    return -sign*__kernel_sinf((m-1.0f)*M_PI_F);
+   case 5:
+   case 6:
+    return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
+   default:
+    return -sign*__kernel_sinf((2.0f-m)*M_PI_F);
+   }
+}
+OVERLOADABLE float lgamma(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+    /* Log-gamma of x. Special cases handled below: Inf/NaN return x*x, zero
+     * returns (x+1)/0, very small |x| returns -log(|x|), and negative x uses
+     * the sinpi-based reflection term nadj (a negative integer or |x| >= 2^23
+     * returns (-x)/0). Logarithms are taken with native_log(). */
+    const float
+        zero=  0.,
+        one =  1.0000000000e+00,
+        pi  =  3.1415927410e+00,
+        a0  =  7.7215664089e-02,
+        a1  =  3.2246702909e-01,
+        a2  =  6.7352302372e-02,
+        a3  =  2.0580807701e-02,
+        a4  =  7.3855509982e-03,
+        a5  =  2.8905137442e-03,
+        a6  =  1.1927076848e-03,
+        a7  =  5.1006977446e-04,
+        a8  =  2.2086278477e-04,
+        a9  =  1.0801156895e-04,
+        a10 =  2.5214456400e-05,
+        a11 =  4.4864096708e-05,
+        tc  =  1.4616321325e+00,
+        tf  = -1.2148628384e-01,
+        tt  =  6.6971006518e-09,
+        t0  =  4.8383611441e-01,
+        t1  = -1.4758771658e-01,
+        t2  =  6.4624942839e-02,
+        t3  = -3.2788541168e-02,
+        t4  =  1.7970675603e-02,
+        t5  = -1.0314224288e-02,
+        t6  =  6.1005386524e-03,
+        t7  = -3.6845202558e-03,
+        t8  =  2.2596477065e-03,
+        t9  = -1.4034647029e-03,
+        t10 =  8.8108185446e-04,
+        t11 = -5.3859531181e-04,
+        t12 =  3.1563205994e-04,
+        t13 = -3.1275415677e-04,
+        t14 =  3.3552918467e-04,
+        u0  = -7.7215664089e-02,
+        u1  =  6.3282704353e-01,
+        u2  =  1.4549225569e+00,
+        u3  =  9.7771751881e-01,
+        u4  =  2.2896373272e-01,
+        u5  =  1.3381091878e-02,
+        v1  =  2.4559779167e+00,
+        v2  =  2.1284897327e+00,
+        v3  =  7.6928514242e-01,
+        v4  =  1.0422264785e-01,
+        v5  =  3.2170924824e-03,
+        s0  = -7.7215664089e-02,
+        s1  =  2.1498242021e-01,
+        s2  =  3.2577878237e-01,
+        s3  =  1.4635047317e-01,
+        s4  =  2.6642270386e-02,
+        s5  =  1.8402845599e-03,
+        s6  =  3.1947532989e-05,
+        r1  =  1.3920053244e+00,
+        r2  =  7.2193557024e-01,
+        r3  =  1.7193385959e-01,
+        r4  =  1.8645919859e-02,
+        r5  =  7.7794247773e-04,
+        r6  =  7.3266842264e-06,
+        w0  =  4.1893854737e-01,
+        w1  =  8.3333335817e-02,
+        w2  = -2.7777778450e-03,
+        w3  =  7.9365057172e-04,
+        w4  = -5.9518753551e-04,
+        w5  =  8.3633989561e-04,
+        w6  = -1.6309292987e-03;
+    float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+    int i, hx, ix;
+    /* Read the bit pattern through a union, as the log helpers in this file
+     * do, rather than through a pointer cast. */
+    union { float f; int i; } bits;
+    nadj = 0;
+    bits.f = x;
+    hx = bits.i;
+    ix = hx & 0x7fffffff;
+    if (ix >= 0x7f800000)          /* Inf or NaN */
+        return x * x;
+    if (ix == 0)                   /* +-0 */
+        return ((x + one) / zero);
+    if (ix < 0x1c800000) {         /* very small |x|: result is -log(|x|) */
+        if (hx < 0) {
+            return -native_log(-x);
+        } else
+            return -native_log(x);
+    }
+    if (hx < 0) {
+        if (ix >= 0x4b000000)      /* |x| >= 2^23 */
+            return ((-x) / zero);
+        t = __gen_ocl_internal_sinpi(x);
+        if (t == zero)             /* sinpi(x) == 0 */
+            return ((-x) / zero);
+        nadj = native_log(pi / __gen_ocl_fabs(t * x));
+        x = -x;
+    }
+    if (ix == 0x3f800000 || ix == 0x40000000)   /* |x| is 1 or 2 */
+        r = 0;
+    else if (ix < 0x40000000) {                 /* |x| < 2 */
+        if (ix <= 0x3f666666) {
+            r = -native_log(x);
+            if (ix >= 0x3f3b4a20) {
+                y = one - x;
+                i = 0;
+            } else if (ix >= 0x3e6d3308) {
+                y = x - (tc - one);
+                i = 1;
+            } else {
+                y = x;
+                i = 2;
+            }
+        } else {
+            r = zero;
+            if (ix >= 0x3fdda618) {
+                y = (float) 2.0 - x;
+                i = 0;
+            }
+            else if (ix >= 0x3F9da620) {
+                y = x - tc;
+                i = 1;
+            }
+            else {
+                y = x - one;
+                i = 2;
+            }
+        }
+        switch (i) {
+        case 0:
+            z = y * y;
+            p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
+            p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
+            p = mad(y, p1, p2);
+            r += (p - (float) 0.5 * y);
+            break;
+        case 1:
+            z = y * y;
+            w = z * y;
+            p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
+            p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
+            p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
+            p = mad(p1, z, mad(w, mad(y, p3, p2), -tt));
+            r += (tf + p);
+            break;
+        case 2:
+            p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
+            p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one);
+            r += (-(float) 0.5 * y + p1 / p2);
+        }
+    } else if (ix < 0x41000000) {               /* 2 <= |x| < 8 */
+        i = (int) x;
+        y = x - (float) i;
+        p =y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
+        q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one);
+        r = .5f * y + p / q;
+        z = one;
+        /* Accumulate (y+2)(y+3)...(y+i-1) into z; each case deliberately
+         * falls through to the next. */
+        switch (i) {
+        case 7:
+            z *= (y + 6.0f);
+            /* fallthrough */
+        case 6:
+            z *= (y + 5.0f);
+            /* fallthrough */
+        case 5:
+            z *= (y + 4.0f);
+            /* fallthrough */
+        case 4:
+            z *= (y + 3.0f);
+            /* fallthrough */
+        case 3:
+            z *= (y + 2.0f);
+            r += native_log(z);
+            break;
+        }
+    } else if (ix < 0x5c800000) {               /* 8 <= |x| < 2^58 */
+        t = native_log(x);
+        z = one / x;
+        y = z * z;
+        w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
+        r = (x - .5f) * (t - one) + w;
+    } else
+        r = x * (native_log(x) - one);
+    if (hx < 0)
+        r = nadj - r;
+    return r;
+}
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+#define BODY \
+    const float  \
+        zero=  0.,  \
+        one =  1.0000000000e+00,  \
+        pi  =  3.1415927410e+00,  \
+        a0  =  7.7215664089e-02,  \
+        a1  =  3.2246702909e-01,  \
+        a2  =  6.7352302372e-02,  \
+        a3  =  2.0580807701e-02,  \
+        a4  =  7.3855509982e-03,  \
+        a5  =  2.8905137442e-03,  \
+        a6  =  1.1927076848e-03,  \
+        a7  =  5.1006977446e-04,  \
+        a8  =  2.2086278477e-04,  \
+        a9  =  1.0801156895e-04,  \
+        a10 =  2.5214456400e-05,  \
+        a11 =  4.4864096708e-05,  \
+        tc  =  1.4616321325e+00,  \
+        tf  = -1.2148628384e-01,  \
+        tt  =  6.6971006518e-09,  \
+        t0  =  4.8383611441e-01,  \
+        t1  = -1.4758771658e-01,  \
+        t2  =  6.4624942839e-02,  \
+        t3  = -3.2788541168e-02,  \
+        t4  =  1.7970675603e-02,  \
+        t5  = -1.0314224288e-02,  \
+        t6  =  6.1005386524e-03,  \
+        t7  = -3.6845202558e-03,  \
+        t8  =  2.2596477065e-03,  \
+        t9  = -1.4034647029e-03,  \
+        t10 =  8.8108185446e-04,  \
+        t11 = -5.3859531181e-04,  \
+        t12 =  3.1563205994e-04,  \
+        t13 = -3.1275415677e-04,  \
+        t14 =  3.3552918467e-04,  \
+        u0  = -7.7215664089e-02,  \
+        u1  =  6.3282704353e-01,  \
+        u2  =  1.4549225569e+00,  \
+        u3  =  9.7771751881e-01,  \
+        u4  =  2.2896373272e-01,  \
+        u5  =  1.3381091878e-02,  \
+        v1  =  2.4559779167e+00,  \
+        v2  =  2.1284897327e+00,  \
+        v3  =  7.6928514242e-01,  \
+        v4  =  1.0422264785e-01,  \
+        v5  =  3.2170924824e-03,  \
+        s0  = -7.7215664089e-02,  \
+        s1  =  2.1498242021e-01,  \
+        s2  =  3.2577878237e-01,  \
+        s3  =  1.4635047317e-01,  \
+        s4  =  2.6642270386e-02,  \
+        s5  =  1.8402845599e-03,  \
+        s6  =  3.1947532989e-05,  \
+        r1  =  1.3920053244e+00,  \
+        r2  =  7.2193557024e-01,  \
+        r3  =  1.7193385959e-01,  \
+        r4  =  1.8645919859e-02,  \
+        r5  =  7.7794247773e-04,  \
+        r6  =  7.3266842264e-06,  \
+        w0  =  4.1893854737e-01,  \
+        w1  =  8.3333335817e-02,  \
+        w2  = -2.7777778450e-03,  \
+        w3  =  7.9365057172e-04,  \
+        w4  = -5.9518753551e-04,  \
+        w5  =  8.3633989561e-04,  \
+        w6  = -1.6309292987e-03;  \
+	float t, y, z, nadj, p, p1, p2, p3, q, r, w;  \
+	int i, hx, ix;  \
+	nadj = 0;  \
+	hx = *(int *)&x;  \
+	*signgamp = 1;  \
+	ix = hx & 0x7fffffff;  \
+	if (ix >= 0x7f800000)  \
+		return x * x;  \
+	if (ix == 0)  \
+		return ((x + one) / zero);  \
+	if (ix < 0x1c800000) {  \
+		if (hx < 0) {  \
+			*signgamp = -1;  \
+			return -native_log(-x);  \
+		} else  \
+			return -native_log(x);  \
+	}  \
+	if (hx < 0) {  \
+		if (ix >= 0x4b000000)  \
+			return ((-x) / zero);  \
+		t = __gen_ocl_internal_sinpi(x);  \
+		if (t == zero)  \
+			return ((-x) / zero);  \
+		nadj = native_log(pi / __gen_ocl_fabs(t * x));  \
+		if (t < zero)  \
+			*signgamp = -1;  \
+		x = -x;  \
+	}  \
+	if (ix == 0x3f800000 || ix == 0x40000000)  \
+		r = 0;  \
+	else if (ix < 0x40000000) {  \
+		if (ix <= 0x3f666666) {  \
+			r = -native_log(x);  \
+			if (ix >= 0x3f3b4a20) {  \
+				y = one - x;  \
+				i = 0;  \
+			} else if (ix >= 0x3e6d3308) {  \
+				y = x - (tc - one);  \
+				i = 1;  \
+			} else {  \
+				y = x;  \
+				i = 2;  \
+			}  \
+		} else {  \
+			r = zero;  \
+			if (ix >= 0x3fdda618) {  \
+				y = (float) 2.0 - x;  \
+				i = 0;  \
+			}  \
+			else if (ix >= 0x3F9da620) {  \
+				y = x - tc;  \
+				i = 1;  \
+			}  \
+			else {  \
+				y = x - one;  \
+				i = 2;  \
+			}  \
+		}  \
+		switch (i) {  \
+		case 0:  \
+			z = y * y;  \
+			p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);	\
+			p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);	\
+			p = mad(y, p1, p2);	\
+			r = r - mad(y, 0.5f, -p);	\
+			break;  \
+		case 1:  \
+			z = y * y;  \
+			w = z * y;  \
+			p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);	\
+			p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);	\
+			p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);	\
+			p = z * p1 + mad(w, mad(y, p3, p2), -tt);	\
+			r += (tf + p);  \
+			break;  \
+		case 2:  \
+			p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);	\
+			p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one);	\
+			r = r + mad(y, -0.5f, p1 / p2);	\
+		}  \
+	} else if (ix < 0x41000000) {  \
+		i = (int) x;  \
+		t = zero;  \
+		y = x - (float) i;  \
+		p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);		\
+		q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one);	\
+		r = mad(y, 0.5f, p / q);	\
+		z = one;  \
+		switch (i) {  \
+		case 7:  \
+			z *= (y + (float) 6.0);  \
+		case 6:  \
+			z *= (y + (float) 5.0);  \
+		case 5:  \
+			z *= (y + (float) 4.0);  \
+		case 4:  \
+			z *= (y + (float) 3.0);  \
+		case 3:  \
+			z *= (y + (float) 2.0);  \
+			r += native_log(z);  \
+			break;  \
+		}  \
+		  \
+	} else if (ix < 0x5c800000) {  \
+		t = native_log(x);  \
+		z = one / x;  \
+		y = z * z;  \
+		w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);  \
+		r = (x - .5f) * (t - one) + w;  \
+	} else  \
+		r = x * (native_log(x) - one);	\
+	if (hx < 0)  \
+		r = nadj - r;  \
+	return r;
+OVERLOADABLE float lgamma_r(float x, int *signgamp) { BODY; }  /* the whole function body is the BODY macro defined just above */
+#undef BODY  /* release the name; BODY is defined again further down for another function */
+/*
+ * log1p(x): natural logarithm of (1 + x) for a float argument, following
+ * the fdlibm s_log1pf.c algorithm.
+ *
+ * When __ocl_math_fastpath_flag is set the call is forwarded to
+ * __gen_ocl_internal_fastpath_log1p.  Otherwise 1+x is written as
+ * 2^k * (1+f) and the result is assembled as k*ln2 + log1p(f), where ln2 is
+ * carried as ln2_hi + ln2_lo and c is the rounding error of forming 1+x.
+ *
+ * Special cases handled explicitly:
+ *   x == -1        -> -inf
+ *   x <  -1        -> NaN
+ *   x is +inf/NaN  -> x + x
+ *   |x| < 2**-29   -> x (or x - x*x/2)
+ */
+OVERLOADABLE float log1p(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_log1p(x);
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  const float
+  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =    3.355443200e+07, /* 0x4c000000 */
+  Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lp3 = 2.8571429849e-01, /* 3E924925 */
+  Lp4 = 2.2222198546e-01; /* 3E638E29 */
+  const float zero = 0.0;
+  float hfsq,f,c,s,z,R,u;
+  int k,hx,hu,ax;
+  union {float f; unsigned i;} un;
+  un.f = x;  hx = un.i;       /* raw bits of x */
+  ax = hx&0x7fffffff;         /* raw bits of |x| */
+  k = 1;
+  if (hx < 0x3ed413d7) {      /* x < 0.41422  */
+      if(ax>=0x3f800000) {    /* x <= -1.0 */
+          if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=-inf */
+          else return (x-x)/(x-x);  /* log1p(x<-1)=NaN */
+      }
+      if(ax<0x31000000) {     /* |x| < 2**-29 */
+          if(two25+x>zero     /* raise inexact */
+              &&ax<0x24800000)    /* |x| < 2**-54 */
+              return x;
+          else
+              return x - x*x*(float)0.5;
+      }
+      if(hx>0||hx<=((int)0xbe95f61f)) {
+          k=0;f=x;hu=1;}  /* -0.2929<x<0.41422: no reduction needed */
+  }
+  if (hx >= 0x7f800000) return x+x;   /* +inf or NaN */
+  if(k!=0) {
+      if(hx<0x5a000000) {
+          u  = (float)1.0+x;
+          un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+          /* correction term: rounding error of 1+x, relative to u */
+          c  = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
+          c /= u;
+      } else {
+          /* x is so large that 1+x == x in float */
+          u  = x;
+          un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+          c  = 0;
+      }
+      hu &= 0x007fffff;
+      if(hu<0x3504f7) {
+          un.i = hu|0x3f800000; u = un.f;/* normalize u */
+      } else {
+          k += 1;
+          un.i = hu|0x3f000000; u = un.f;  /* normalize u/2 */
+          hu = (0x00800000-hu)>>2;
+      }
+      f = u-(float)1.0;
+  }
+  hfsq=(float)0.5*f*f;
+  if(hu==0)
+  { /* |f| < 2**-20 */
+      if(f==zero)
+      {
+          if(k==0) return zero;
+          else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
+      }
+      /* R = f*f/2 - f*f*f/3, i.e. hfsq*(1 - 2f/3) as in fdlibm.  The
+       * previous form mad(hfsq, 1.0f, -0.666f*f) evaluated hfsq - 2f/3. */
+      R = hfsq * mad(f, -0.66666666666666666f, 1.0f);
+      if(k==0) return f-R; else
+          return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
+  }
+  s = f/((float)2.0+f);
+  z = s*s;
+  R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
+  if(k==0)
+      return f + mad(hfsq + R, s, -hfsq);
+  else
+      return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
+}
+OVERLOADABLE float logb(float x) {  /* exponent of x returned as a float, read from the IEEE-754 exponent field */
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_logb(x);
+  union {float f; unsigned i;} u;  /* gives access to the raw bits of x */
+  u.f = x;
+  int e =  ((u.i & 0x7f800000) >> 23);  /* biased 8-bit exponent field */
+  float r1 = e-127;  /* unbiased exponent, the answer for normal numbers */
+  float r2 = -INFINITY;  /* answer when the exponent field is zero */
+  float r3 = x*x;  /* +inf for +/-inf input, NaN for NaN input */
+    /* sub normal or +/-0 */
+  float r = e == 0 ? r2 : r1;
+    /* inf & nan */
+  return e == 0xff ? r3 : r;
+OVERLOADABLE int ilogb(float x) {  /* exponent of x returned as an int */
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_ilogb(x);
+  union { int i; float f; } u;  /* gives access to the raw bits of x */
+  if (isnan(x))
+    return FP_ILOGBNAN;
+  if (isinf(x))
+    return 0x7FFFFFFF;
+  u.f = x;
+  u.i &= 0x7fffffff;  /* drop the sign bit: bits of |x| */
+  if (u.i == 0)
+    return FP_ILOGB0;
+  if (u.i >= 0x800000)  /* exponent field non-zero: normal number */
+    return (u.i >> 23) - 127;
+  int r = -126;  /* subnormal: start from the smallest normal exponent */
+  int a = u.i & 0x7FFFFF;  /* mantissa bits, non-zero here */
+  while(a < 0x800000) {  /* shift until the leading one reaches bit 23 */
+    a <<= 1;
+    r --;  /* each shift lowers the exponent by one */
+  }
+  return r;
+OVERLOADABLE float nan(uint code) {  /* always yields NAN; the code argument is not used */
+  return NAN;
+OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {  /* tan(pi*x), evaluated on |x| minus its floor */
+  float sign = 1.0f;
+  int ix;
+  if(isinf(x)) return NAN;
+  if(x < 0.0f) { x = -x; sign = -1.0f; }  /* work on |x|; sign is multiplied back in below */
+  if(x> 0x1.0p24) return 0.0f;  /* above 2^24 every float is an even integer */
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;  /* integer part; its parity picks the sign at the special points */
+  m = x-m;  /* x minus its floor */
+  int n = __gen_ocl_internal_floor(m*4.0f);  /* which quarter of the period m falls in */
+  if(m == 0.5f) {  /* pole of the tangent */
+    return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
+  }
+  if(m == 0.0f) {  /* integer argument: result is a signed zero */
+    return (ix&0x1) == 0 ? 0.0f : -0.0f;  /* NOTE(review): sign is not applied here, unlike the m == 0.5 case -- confirm intended */
+  }
+  switch(n) {
+    case 0:
+      return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
+    case 1:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);  /* tan(pi*m) = 1/tan(pi*(0.5-m)) */
+    case 2:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);  /* same identity; the argument is negative here */
+    default:
+      return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);  /* tan(pi*m) = -tan(pi*(1-m)) */
+  }
+/*
+ * Cube root of a float, adapted from fdlibm's cbrtf.
+ *
+ * The sign is stripped and restored at the end.  NaN and infinity return
+ * x+x, zero returns x itself, and subnormal inputs return a zero that
+ * carries the sign of x.  For normal inputs a rough estimate is obtained
+ * by dividing the bit pattern by three and adding the bias constant B1; it
+ * is then refined by a rational step and one Newton iteration.
+ */
+OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+  /* copied from fdlibm */
+  const unsigned
+  B1 = 709958130, /* B1 = (84+2/3-0.03306235651)*2**23 */
+  B2 = 642849266; /* B2 = (76+2/3-0.03306235651)*2**23 */
+  const float
+  C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
+  D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
+  E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
+  F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
+  G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
+  float r,s,t, w;
+  int hx;
+  uint sign;
+  uint high;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);  /* load the bits of x; hx was read uninitialized before */
+  sign=hx&0x80000000;     /* sign= sign(x) */
+  hx  ^=sign;
+  if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
+  if(hx==0)
+      return(x);    /* cbrt(0) is itself */
+  GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
+    /* rough cbrt to 5 bits */
+  if(hx<0x00800000)     /* subnormal number */
+    {
+    //SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
+     //t*=x; GET_FLOAT_WORD(high,t); SET_FLOAT_WORD(t,high/3+B2);
+      /* Return a signed zero.  This must be a comparison: the former
+       * "sign = 0" assignment always selected -0.0f. */
+      t = (sign == 0) ? 0.0f : -0.0f;
+      return t;
+    }
+  else
+    GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
+    /* new cbrt to 23 bits */
+  r=t*t/x;
+  s=mad(r, t, C);
+  t*=G+F/(s+E+D/s);
+    /* one step newton iteration */
+  s=t*t;
+  r=x/s;
+  w=t+t;
+  r=(r-t)/(w+r);
+  t=mad(t, r, t);
+    /* restore the sign bit */
+  GEN_OCL_GET_FLOAT_WORD(high,t);  /* bits of the magnitude; high was read uninitialized before */
+  GEN_OCL_SET_FLOAT_WORD(t,high|sign);
+  return(t);
+}
+#define BODY \
+  *cosval = cos(x); \
+  return sin(x);
+OVERLOADABLE float sincos(float x, float *cosval) {  /* returns sin(x) and stores cos(x) through cosval */
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_sincos(x, cosval);
+  BODY;  /* expands to the store of cos(x) followed by the return of sin(x) */
+#undef BODY
+INLINE float __gen_ocl_asin_util(float x) {  /* rational approximation x + x*P(x*x)/Q(x*x), shared by asin and acos below */
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  float
+  pS0 =  1.66666666666666657415e-01,  /* numerator coefficients pS0..pS4 */
+  pS1 = -3.25565818622400915405e-01,
+  pS2 =  2.01212532134862925881e-01,
+  pS3 = -4.00555345006794114027e-02,
+  pS4 =  7.91534994289814532176e-04,
+  qS1 = -2.40339491173441421878e+00,  /* denominator coefficients qS1..qS4; the constant term is 1 */
+  qS2 =  2.02094576023350569471e+00,
+  qS3 = -6.88283971605453293030e-01,
+  qS4 =  7.70381505559019352791e-02;
+  float t = x*x;
+  float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1), pS0);  /* t * P(t), Horner form */
+  float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1), 1.0f);  /* Q(t), Horner form */
+  float w = p / q;
+  return mad(x, w, x);  /* x + x*w */
+OVERLOADABLE float __gen_ocl_internal_asin(float x) {  /* arc sine; NaN for |x| > 1 */
+  uint ix;
+  union { uint i; float f; } u;  /* gives access to the raw bits of x */
+  u.f = x;
+  ix = u.i & 0x7fffffff;  /* bits of |x| */
+  if(ix == 0x3f800000) {  /* |x| == 1 */
+    return x * M_PI_2_F;  /* asin(|1|)=+-pi/2 with inexact */
+  }
+  if(ix > 0x3f800000) {            /* |x|>= 1 */
+    return  NAN;          /* asin(|x|>1) is NaN */
+  }
+  if(ix < 0x32000000) {            /* if |x| < 2**-27 */
+    if(HUGE_VALF + x > FLT_ONE) return x;   /* return x with inexact if x!=0*/
+  }
+  if(x < -0.5) {  /* reduce through sqrt((1+x)/2) so the helper sees a small argument */
+    return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
+  } else if(x > 0.5) {  /* mirror-image reduction for the positive side */
+    return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
+  } else {  /* |x| <= 0.5: the approximation is applied directly */
+    return __gen_ocl_asin_util(x);
+  }
+OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {  /* asin(x) divided by pi */
+  return __gen_ocl_internal_asin(x) / M_PI_F;
+OVERLOADABLE float __gen_ocl_internal_acos(float x) {  /* arc cosine built on the asin helpers */
+  if(x > 0.5)  /* near 1: evaluate 2*asin(sqrt((1-x)/2)) directly rather than subtracting from pi/2 */
+    return 2 * __gen_ocl_asin_util(native_sqrt((1-x)/2));
+  else  /* otherwise acos(x) = pi/2 - asin(x) */
+    return M_PI_2_F - __gen_ocl_internal_asin(x);
+OVERLOADABLE float __gen_ocl_internal_acospi(float x) {  /* acos(x) divided by pi */
+  return __gen_ocl_internal_acos(x) / M_PI_F;
+__constant float atanhi[4] = {  /* high parts of the four atan reference values, indexed by id in __gen_ocl_internal_atan */
+  4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
+  7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
+  9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
+  1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
+__constant float atanlo[4] = {  /* low-order parts matching atanhi; hi + lo together give the reference value */
+  5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
+  3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
+  3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
+  7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
+/*
+ * Arc tangent of a float, adapted from fdlibm's atanf.
+ *
+ * |x| is reduced into one of five intervals.  For the four upper intervals
+ * (id 0..3) the argument is mapped to a small value t so that
+ * atan(|x|) = atanhi[id] + atanlo[id] + atan(t); for |x| < 0.4375 (id -1)
+ * the polynomial is applied to x directly.  The sign of x is restored on
+ * return.  NaN returns x+x; |x| >= 2^34 returns +-pi/2.
+ */
+OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+  /* copied from fdlibm */
+  float aT[11];
+  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
+  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
+  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
+  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
+  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
+  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
+  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
+  const float one = 1.0, huge = 1.0e30;
+  float w,s1,s2,z;
+  int ix,hx,id;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);  /* load the bits of x; hx was read uninitialized before */
+  ix = hx&0x7fffffff;            /* bits of |x| */
+  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
+      if(ix>0x7f800000)
+          return x+x;   /* NaN */
+      if(hx>0) return  atanhi[3]+atanlo[3];
+      else     return -atanhi[3]-atanlo[3];
+  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
+      if (ix < 0x31000000) {  /* |x| < 2^-29 */
+          if(huge+x>one) return x;  /* raise inexact */
+      }
+      id = -1;
+  } else {
+  x = __gen_ocl_fabs(x);
+  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
+      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
+          id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
+      } else {      /* 11/16<=|x|< 19/16 */
+          id = 1; x  = (x-one)/(x+one);
+      }
+  } else {
+      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
+          id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
+      } else {      /* 2.4375 <= |x| < 2^34 */
+          id = 3; x  = -(float)1.0/x;
+      }
+  }}
+    /* end of argument reduction */
+  z = x*x;
+  w = z*z;
+    /* break sum from i=0 to 6 aT[i]z**(i+1) into odd and even poly */
+  s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]);
+  s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]);
+  if (id<0) return x - x*(s1+s2);
+  else {
+      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
+      return (hx<0)? -z:z;
+  }
+}
+OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {  /* atan(x) divided by pi */
+  return __gen_ocl_internal_atan(x) / M_PI_F;
+// XXX work-around PTX profile
+OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }  /* forwards directly to native_sqrt */
+OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }  /* forwards directly to native_rsqrt */
+/*
+ * Two-argument arc tangent atan2(y, x), adapted from fdlibm's atan2f.
+ *
+ * m encodes the quadrant as 2*sign(x) + sign(y).  Zeros, infinities and NaN
+ * are answered from tables of exact values (with "tiny" added to signal
+ * inexact); otherwise atan(|y/x|) is computed and moved into the quadrant
+ * selected by m.  When both inputs are subnormal they are first converted
+ * to normal floats with the same ratio, since Gen does not support
+ * denormals.
+ */
+OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
+  /* copied from fdlibm */
+  float z;
+  int k,m,hx,hy,ix,iy;
+  const float
+  tiny  = 1.0e-30,
+  zero  = 0.0,
+  pi_o_4  = 7.8539818525e-01, /* 0x3f490fdb */
+  pi_o_2  = 1.5707963705e+00, /* 0x3fc90fdb */
+  pi      = 3.1415927410e+00, /* 0x40490fdb */
+  pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
+  GEN_OCL_GET_FLOAT_WORD(hx,x);  /* load the bits of x; hx was read uninitialized before */
+  ix = hx&0x7fffffff;
+  GEN_OCL_GET_FLOAT_WORD(hy,y);  /* load the bits of y; hy was read uninitialized before */
+  iy = hy&0x7fffffff;
+  if((ix>0x7f800000)||
+     (iy>0x7f800000)) /* x or y is NaN */
+     return x+y;
+  if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y);   /* x=1.0 */
+  m = ((hy>>31)&1)|((hx>>30)&2);  /* 2*sign(x)+sign(y) */
+    /* when y = 0 */
+  if(iy==0) {
+      switch(m) {
+    case 0:
+    case 1: return y;   /* atan(+-0,+anything)=+-0 */
+    case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
+    case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
+      }
+  }
+    /* when x = 0 */
+  if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
+  /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
+  if(ix <= 0x7fffff && iy <= 0x7fffff) {
+    /* the integer mantissas keep the ratio; the factor is +1 or -1 */
+    x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
+    y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
+  }
+    /* when x is INF */
+  if(ix==0x7f800000) {
+      if(iy==0x7f800000) {
+    switch(m) {
+        case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
+        case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
+        case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
+        case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
+    }
+      } else {
+    switch(m) {
+        case 0: return  zero  ; /* atan(+...,+INF) */
+        case 1: return -zero  ; /* atan(-...,+INF) */
+        case 2: return  pi+tiny  ;  /* atan(+...,-INF) */
+        case 3: return -pi-tiny  ;  /* atan(-...,-INF) */
+    }
+      }
+  }
+    /* when y is INF */
+  if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+    /* compute y/x */
+  k = (iy-ix)>>23;   /* difference of the binary exponents */
+  if(k > 60) z=pi_o_2+(float)0.5*pi_lo;   /* |y/x| >  2**60 */
+  else if(hx<0&&k<-60) z=0.0;   /* |y|/x < -2**60 */
+  else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
+  switch (m) {
+      case 0: return       z  ; /* atan(+,+) */
+      case 1: {
+          /* negate z by flipping its sign bit */
+          uint zh;
+          GEN_OCL_GET_FLOAT_WORD(zh,z);
+          GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
+        }
+        return       z  ; /* atan(-,+) */
+      case 2: return  pi-(z-pi_lo);/* atan(+,-) */
+      default: /* case 3 */
+            return  (z-pi_lo)-pi;/* atan(-,-) */
+  }
+}
+OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {  /* atan2(y, x) divided by pi */
+  return __gen_ocl_internal_atan2(y, x) / M_PI_F;
+OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }  /* thin wrapper over __gen_ocl_fabs */
+OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }  /* thin wrapper over __gen_ocl_rndz (presumably round toward zero -- confirm) */
+OVERLOADABLE float __gen_ocl_internal_round(float x) {  /* rounds to an integral value, moving halfway cases away from y */
+  float y = __gen_ocl_rndz(x);  /* same primitive that __gen_ocl_internal_trunc wraps */
+  if (__gen_ocl_fabs(x - y) >= 0.5f)  /* dropped part is at least one half */
+    y += __gen_ocl_internal_copysign(1.f, x);  /* step one unit in the direction of x's sign */
+  return y;
+OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }  /* thin wrapper over __gen_ocl_rndu (presumably round up -- confirm) */
+OVERLOADABLE float __gen_ocl_internal_rint(float x) {  /* thin wrapper over __gen_ocl_rnde (presumably round to nearest even -- confirm) */
+  return __gen_ocl_rnde(x);
+/*
+ * exp(x) for a float argument, adapted from fdlibm's expf.
+ *
+ * x is reduced to r = x - k*ln2 with |r| <= 0.5*ln2 (ln2 carried as a
+ * hi/lo pair), exp(r) is approximated by a rational expression in r, and
+ * the result is scaled by 2^k by adding k to the exponent field.
+ *
+ * Special cases: NaN -> x+x; +inf -> +inf; -inf -> 0; overflow returns
+ * huge*huge and underflow returns twom100*twom100.
+ */
+OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
+  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
+  twom100 = 7.8886090522e-31,      /* 2**-100=0x0d800000 */
+  ivln2   =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one = 1.0,
+  huge = 1.0e+30,
+  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+  P2 = -2.7777778450e-03; /* 0xbb360b61 */
+  float y,hi=0.0,lo=0.0,c,t;
+  int k=0,xsb;
+  unsigned hx;
+  float ln2HI_0 = 6.9313812256e-01;    /* 0x3f317180 */
+  float ln2HI_1 = -6.9313812256e-01;   /* 0xbf317180 */
+  float ln2LO_0 = 9.0580006145e-06;    /* 0x3717f7d1 */
+  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
+  float half_0 = 0.5;
+  float half_1 = -0.5;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);  /* load the bits of x; hx was read uninitialized before */
+  xsb = (hx>>31)&1;      /* sign bit of x */
+  hx &= 0x7fffffff;      /* high word of |x| */
+  /* filter out non-finite argument */
+  if(hx >= 0x42b17218) {         /* if |x|>=88.721... */
+    if(hx>0x7f800000)
+      return x+x;            /* NaN */
+    if(hx==0x7f800000)
+      return (xsb==0)? x:0.0;    /* exp(+-inf)={inf,0} */
+    if(x > o_threshold) return huge*huge; /* overflow */
+    if(x < u_threshold) return twom100*twom100; /* underflow */
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {      /* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {    /* and |x| < 1.5 ln2 */
+      /* k is +1 or -1: subtract (or add) ln2 exactly once */
+      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
+    } else {
+      float tmp = xsb == 1 ? half_1 : half_0;
+      k  = ivln2*x+tmp;      /* nearest integer to x/ln2 */
+      t  = k;
+      hi = x - t*ln2HI_0;    /* t*ln2HI is exact here */
+      lo = t*ln2LO_0;
+    }
+    x  = hi - lo;
+  }
+  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
+    if(huge+x>one) return one+x;/* trigger inexact */
+  }
+  else k = 0;
+  /* x is now in primary range */
+  t  = x*x;
+  c  = x - t*(P1+t*P2);
+  if(k==0)
+    return one-((x*c)/(c-(float)2.0)-x);
+  else
+    y = one-((lo-(x*c)/((float)2.0-c))-hi);
+  if(k >= -125) {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);  /* bits of y; hy was read uninitialized before */
+    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));    /* add k to y's exponent */
+    return y;
+  } else {
+    /* result would be subnormal: scale by 2^(k+100), then by 2^-100 */
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);  /* bits of y; hy was read uninitialized before */
+    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k+100 to y's exponent */
+    return y*twom100;
+  }
+}
+/* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/*
+ * Error function erf(x) for a float argument, after s_erff.c.
+ *
+ * The magnitude of x selects one of several approximations:
+ *   |x| < 2**-28          x + efx*x
+ *   |x| < 0.84375         x + x*PP(x*x)/QQ(x*x)
+ *   0.84375 <= |x| < 1.25 +-(erx + PA(|x|-1)/QA(|x|-1))
+ *   1.25 <= |x| < 6       +-(1 - exp(-z*z-0.5625)*exp((z-x)*(z+x)+R/S)/|x|)
+ *   |x| >= 6              +-(1 - tiny)
+ * where z is |x| with its low mantissa bits cleared.  erf(+-inf) = +-1 and
+ * erf(NaN) = NaN.
+ */
+INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+const float
+tiny = 1.0e-30,
+half_val=  5.0000000000e-01, /* 0x3F000000 */
+one =  1.0000000000e+00, /* 0x3F800000 */
+two =  2.0000000000e+00, /* 0x40000000 */
+	/* c = (subfloat)0.84506291151 */
+erx =  8.4506291151e-01, /* 0x3f58560b */
+/*
+ * Coefficients for approximation to  erf on [0,0.84375]
+ */
+efx =  1.2837916613e-01, /* 0x3e0375d4 */
+efx8=  1.0270333290e+00, /* 0x3f8375d4 */
+pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
+pp1  = -3.2504209876e-01, /* 0xbea66beb */
+pp2  = -2.8481749818e-02, /* 0xbce9528f */
+pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
+pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
+qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
+qq2  =  6.5022252500e-02, /* 0x3d852a63 */
+qq3  =  5.0813062117e-03, /* 0x3ba68116 */
+qq4  =  1.3249473704e-04, /* 0x390aee49 */
+qq5  = -3.9602282413e-06, /* 0xb684e21a */
+/*
+ * Coefficients for approximation to  erf  in [0.84375,1.25]
+ */
+pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
+pa1  =  4.1485610604e-01, /* 0x3ed46805 */
+pa2  = -3.7220788002e-01, /* 0xbebe9208 */
+pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
+pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
+pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
+pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
+qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
+qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
+qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
+qa4  =  1.2617121637e-01, /* 0x3e013307 */
+qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
+qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
+/*
+ * Coefficients for approximation to  erfc in [1.25,1/0.35]
+ */
+ra0  = -9.8649440333e-03, /* 0xbc21a093 */
+ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
+ra2  = -1.0558626175e+01, /* 0xc128f022 */
+ra3  = -6.2375331879e+01, /* 0xc2798057 */
+ra4  = -1.6239666748e+02, /* 0xc322658c */
+ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
+ra6  = -8.1287437439e+01, /* 0xc2a2932b */
+ra7  = -9.8143291473e+00, /* 0xc11d077e */
+sa1  =  1.9651271820e+01, /* 0x419d35ce */
+sa2  =  1.3765776062e+02, /* 0x4309a863 */
+sa3  =  4.3456588745e+02, /* 0x43d9486f */
+sa4  =  6.4538726807e+02, /* 0x442158c9 */
+sa5  =  4.2900814819e+02, /* 0x43d6810b */
+sa6  =  1.0863500214e+02, /* 0x42d9451f */
+sa7  =  6.5702495575e+00, /* 0x40d23f7c */
+sa8  = -6.0424413532e-02, /* 0xbd777f97 */
+/*
+ * Coefficients for approximation to  erfc in [1/.35,28]
+ */
+rb0  = -9.8649431020e-03, /* 0xbc21a092 */
+rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
+rb2  = -1.7757955551e+01, /* 0xc18e104b */
+rb3  = -1.6063638306e+02, /* 0xc320a2ea */
+rb4  = -6.3756646729e+02, /* 0xc41f6441 */
+rb5  = -1.0250950928e+03, /* 0xc480230b */
+rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
+sb1  =  3.0338060379e+01, /* 0x41f2b459 */
+sb2  =  3.2579251099e+02, /* 0x43a2e571 */
+sb3  =  1.5367296143e+03, /* 0x44c01759 */
+sb4  =  3.1998581543e+03, /* 0x4547fdbb */
+sb5  =  2.5530502930e+03, /* 0x451f90ce */
+sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
+sb7  = -2.2440952301e+01; /* 0xc1b38712 */
+	int hx,ix,i;
+	float R,S,P,Q,s,y,z,r;
+	GEN_OCL_GET_FLOAT_WORD(hx,x);	/* load the bits of x; hx was read uninitialized before */
+	ix = hx&0x7fffffff;
+	if(ix>=0x7f800000) {		/* erf(nan)=nan */
+	    i = ((unsigned int)hx>>31)<<1;	/* 0 for positive x, 2 for negative x */
+	    return (float)(1-i)+one/x;	/* erf(+-inf)=+-1 */
+	}
+	if(ix < 0x3f580000) {		/* |x|<0.84375 */
+	    if(ix < 0x31800000) { 	/* |x|<2**-28 */
+	        if (ix < 0x04000000)
+		    /*avoid underflow */
+		    return (float)0.125*((float)8.0*x+efx8*x);
+		return x + efx*x;
+	    }
+	    z = x*x;
+	    r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+	    s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1), one);
+	    y = r / s;
+	    return mad(x, y, x);
+	}
+	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
+	    s = __gen_ocl_internal_fabs(x)-one;
+	    P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+	    Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
+	    if(hx>=0) return erx + P/Q; else return -erx - P/Q;
+	}
+	if (ix >= 0x40c00000) {		/* inf>|x|>=6 */
+	    if(hx>=0) return one-tiny; else return tiny-one;
+	}
+	x = __gen_ocl_internal_fabs(x);
+	s = one/(x*x);
+	if(ix< 0x4036DB6E) {	/* |x| < 1/0.35 */
+	    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+	    		ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+	    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+	    		sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
+	} else {	/* |x| >= 1/0.35 */
+	    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+	    		rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+	    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+	    		sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
+	}
+	/* z = |x| with the low 12 mantissa bits cleared, so z*z is exact */
+	GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
+	r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
+	if(hx>=0) return one-r/x; else return  r/x-one;
+}
+/*
+ * Complementary error function erfc(x) = 1 - erf(x) for a float argument,
+ * after s_erff.c.
+ *
+ * The magnitude of x selects one of several approximations:
+ *   |x| < 2**-56          1 - x
+ *   |x| < 0.84375         1 - (x + x*y), or 0.5 - (x - 0.5 + x*y) for x >= 1/4
+ *   0.84375 <= |x| < 1.25 (1 - erx) - P/Q for x >= 0, 1 + (erx + P/Q) for x < 0
+ *   1.25 <= |x| < 28      exp(-z*z-0.5625)*exp((z-x)*(z+x)+R/S)/|x|, or
+ *                         2 minus that for negative x (2 - tiny below -6)
+ *   |x| >= 28             tiny*tiny for x > 0, 2 - tiny for x < 0
+ * erfc(+inf) = 0, erfc(-inf) = 2, erfc(NaN) = NaN.
+ */
+INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+const float
+tiny = 1.0e-30,
+half_val=  5.0000000000e-01, /* 0x3F000000 */
+one =  1.0000000000e+00, /* 0x3F800000 */
+two =  2.0000000000e+00, /* 0x40000000 */
+	/* c = (subfloat)0.84506291151 */
+erx =  8.4506291151e-01, /* 0x3f58560b */
+/*
+ * Coefficients for approximation to  erf on [0,0.84375]
+ */
+efx =  1.2837916613e-01, /* 0x3e0375d4 */
+efx8=  1.0270333290e+00, /* 0x3f8375d4 */
+pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
+pp1  = -3.2504209876e-01, /* 0xbea66beb */
+pp2  = -2.8481749818e-02, /* 0xbce9528f */
+pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
+pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
+qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
+qq2  =  6.5022252500e-02, /* 0x3d852a63 */
+qq3  =  5.0813062117e-03, /* 0x3ba68116 */
+qq4  =  1.3249473704e-04, /* 0x390aee49 */
+qq5  = -3.9602282413e-06, /* 0xb684e21a */
+/*
+ * Coefficients for approximation to  erf  in [0.84375,1.25]
+ */
+pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
+pa1  =  4.1485610604e-01, /* 0x3ed46805 */
+pa2  = -3.7220788002e-01, /* 0xbebe9208 */
+pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
+pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
+pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
+pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
+qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
+qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
+qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
+qa4  =  1.2617121637e-01, /* 0x3e013307 */
+qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
+qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
+/*
+ * Coefficients for approximation to  erfc in [1.25,1/0.35]
+ */
+ra0  = -9.8649440333e-03, /* 0xbc21a093 */
+ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
+ra2  = -1.0558626175e+01, /* 0xc128f022 */
+ra3  = -6.2375331879e+01, /* 0xc2798057 */
+ra4  = -1.6239666748e+02, /* 0xc322658c */
+ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
+ra6  = -8.1287437439e+01, /* 0xc2a2932b */
+ra7  = -9.8143291473e+00, /* 0xc11d077e */
+sa1  =  1.9651271820e+01, /* 0x419d35ce */
+sa2  =  1.3765776062e+02, /* 0x4309a863 */
+sa3  =  4.3456588745e+02, /* 0x43d9486f */
+sa4  =  6.4538726807e+02, /* 0x442158c9 */
+sa5  =  4.2900814819e+02, /* 0x43d6810b */
+sa6  =  1.0863500214e+02, /* 0x42d9451f */
+sa7  =  6.5702495575e+00, /* 0x40d23f7c */
+sa8  = -6.0424413532e-02, /* 0xbd777f97 */
+/*
+ * Coefficients for approximation to  erfc in [1/.35,28]
+ */
+rb0  = -9.8649431020e-03, /* 0xbc21a092 */
+rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
+rb2  = -1.7757955551e+01, /* 0xc18e104b */
+rb3  = -1.6063638306e+02, /* 0xc320a2ea */
+rb4  = -6.3756646729e+02, /* 0xc41f6441 */
+rb5  = -1.0250950928e+03, /* 0xc480230b */
+rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
+sb1  =  3.0338060379e+01, /* 0x41f2b459 */
+sb2  =  3.2579251099e+02, /* 0x43a2e571 */
+sb3  =  1.5367296143e+03, /* 0x44c01759 */
+sb4  =  3.1998581543e+03, /* 0x4547fdbb */
+sb5  =  2.5530502930e+03, /* 0x451f90ce */
+sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
+sb7  = -2.2440952301e+01; /* 0xc1b38712 */
+	int hx,ix;
+	float R,S,P,Q,s,y,z,r;
+	GEN_OCL_GET_FLOAT_WORD(hx,x);	/* load the bits of x; hx was read uninitialized before */
+	ix = hx&0x7fffffff;
+	if(ix>=0x7f800000) {			/* erfc(nan)=nan */
+						/* erfc(+-inf)=0,2 */
+	    return (float)(((unsigned int)hx>>31)<<1)+one/x;
+	}
+	if(ix < 0x3f580000) {		/* |x|<0.84375 */
+	    if(ix < 0x23800000)  	/* |x|<2**-56 */
+		return one-x;
+	    z = x*x;
+	    r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+	    s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2), qq1), one);
+	    y = r/s;
+	    if(hx < 0x3e800000) {  	/* x<1/4 */
+		return one-(x+x*y);
+	    } else {
+		r = x*y;
+		r += (x-half_val);
+	        return half_val - r ;
+	    }
+	}
+	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
+	    s = __gen_ocl_internal_fabs(x)-one;
+	    P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+	    Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
+	    if(hx>=0) {
+	        z  = one-erx; return z - P/Q;
+	    } else {
+		z = erx+P/Q; return one+z;
+	    }
+	}
+	if (ix < 0x41e00000) {		/* |x|<28 */
+	    x = __gen_ocl_internal_fabs(x);
+	    s = one/(x*x);
+	    if(ix< 0x4036DB6D) {	/* |x| < 1/.35 ~ 2.857143*/
+		R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+				ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+		S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+				sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
+	    } else {			/* |x| >= 1/.35 ~ 2.857143 */
+		if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
+		/* the two assignments below are NOT governed by the if above */
+		R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+				rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+		S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+				sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
+	    }
+	    /* z = |x| with the low 13 mantissa bits cleared */
+	    GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
+	    r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*
+			__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
+	    if(hx>0) {
+		float ret = r/x;
+		return ret;
+	    } else
+		return two-r/x;
+	} else {
+	    if(hx>0) {
+		return tiny*tiny;
+	    } else
+		return two-tiny;
+	}
+}
+/*
+ * fmod(x, y): floating-point remainder of x/y with the sign of x, computed
+ * exactly by shift-and-subtract on the integer mantissas (after fdlibm's
+ * fmodf).
+ *
+ * Returns NaN, via (x*y)/(x*y), when y is zero, x is not finite, or y is
+ * NaN.  Returns x unchanged when |x| < |y| and a zero with the sign of x
+ * when |x| == |y| or the remainder is zero.
+ */
+OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
+  //return x-y*__gen_ocl_rndz(x/y);
+  float one = 1.0;
+  float Zero[2];
+  int n,hx,hy,hz,ix,iy,sx,i;
+  Zero[0] = 0.0;
+  Zero[1] = -0.0;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);  /* load the bits of x; hx was read uninitialized before */
+  GEN_OCL_GET_FLOAT_WORD(hy,y);  /* load the bits of y; hy was read uninitialized before */
+  sx = hx&0x80000000;        /* sign of x */
+  hx ^=sx;       /* |x| */
+  hy &= 0x7fffffff;  /* |y| */
+  /* purge off exception values */
+  if(hy==0||(hx>=0x7f800000)||       /* y=0,or x not finite */
+  (hy>0x7f800000))           /* or y is NaN */
+    return (x*y)/(x*y);
+  if(hx<hy) return x;            /* |x|<|y| return x */
+  if(hx==hy)
+    return Zero[(unsigned)sx>>31];   /* |x|=|y| return x*0*/
+  /* determine ix = ilogb(x) */
+  if(hx<0x00800000) {    /* subnormal x */
+    for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
+  } else ix = (hx>>23)-127;
+  /* determine iy = ilogb(y) */
+  if(hy<0x00800000) {    /* subnormal y */
+    for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
+  } else iy = (hy>>23)-127;
+  /* set up hx, hy as integer mantissas and align y to x */
+  if(ix >= -126)
+    hx = 0x00800000|(0x007fffff&hx);
+  else {     /* subnormal x, shift x to normal */
+    n = -126-ix;
+    hx = hx<<n;
+  }
+  if(iy >= -126)
+    hy = 0x00800000|(0x007fffff&hy);
+  else {     /* subnormal y, shift y to normal */
+    n = -126-iy;
+    hy = hy<<n;
+  }
+  /* fix point fmod */
+  n = ix - iy;
+  while(n--) {
+    hz=hx-hy;
+    if(hz<0){hx = hx+hx;}
+    else {
+      if(hz==0)      /* return sign(x)*0 */
+        return Zero[(unsigned)sx>>31];
+      hx = hz+hz;
+    }
+  }
+  hz=hx-hy;
+  if(hz>=0) {hx=hz;}
+    /* convert back to floating value and restore the sign */
+  if(hx==0)          /* return sign(x)*0 */
+    return Zero[(unsigned)sx>>31];
+  while(hx<0x00800000) {     /* normalize x */
+    hx = hx+hx;
+    iy -= 1;
+  }
+  if(iy>= -126) {        /* normalize output */
+    hx = ((hx-0x00800000)|((iy+127)<<23));
+    /* write the result back; without this store the input x was returned */
+    GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+  } else {       /* subnormal output */
+    n = -126 - iy;
+    hx >>= n;
+    GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+    x *= one;        /* create necessary signal */
+  }
+  return x;      /* exact output */
+}
+/*
+ * expm1(x) = exp(x) - 1 for a float argument, adapted from fdlibm's expm1f.
+ *
+ * x is reduced to r = x - k*ln2 (ln2 carried as ln2_hi + ln2_lo, reduction
+ * error kept in c), expm1(r) is approximated by a rational expression, and
+ * the result is rebuilt for the chosen k by adding k to the exponent field
+ * of an intermediate value.
+ *
+ * Special cases: NaN -> x+x; +inf -> +inf; -inf -> -1; overflow returns
+ * huge*huge; x < -27*ln2 returns tiny-one; |x| < 2**-25 returns x.
+ */
+OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+  //return __gen_ocl_pow(M_E_F, x) - 1;
+  float Q1 = -3.3333335072e-02, /* 0xbd088889 */
+  ln2_hi = 6.9313812256e-01,   /* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,   /* 0x3717f7d1 */
+  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+  huge = 1.0e30,
+  tiny = 1.0e-30,
+  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one    =  1.0,
+  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
+  float y,hi,lo,c,t,e,hxs,hfx,r1;
+  int k,xsb;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);  /* load the bits of x; hx was read uninitialized before */
+  xsb = hx&0x80000000;
+  /* sign bit of x */
+  //if(xsb==0)
+  //y=x;
+  //else
+  //y= -x; /* y = |x| */
+  y = __gen_ocl_internal_fabs(x);
+  hx &= 0x7fffffff;      /* high word of |x| */
+  /* filter out huge and non-finite argument */
+  if(hx >= 0x4195b844) {         /* if |x|>=27*ln2 */
+    if(hx >= 0x42b17218) {       /* if |x|>=88.721... */
+      if(hx>0x7f800000)
+        return x+x;      /* NaN */
+      if(hx==0x7f800000)
+        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+      if(x > o_threshold)
+        return huge*huge; /* overflow */
+    }
+    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+      if(x+tiny<(float)0.0)  /* raise inexact */
+        return tiny-one; /* return -1 */
+    }
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+      if(xsb==0){
+        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
+      } else {
+        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
+      }
+    } else {
+      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+      t  = k;
+      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+      lo = t*ln2_lo;
+    }
+    x  = hi - lo;
+    c  = (hi-x)-lo;      /* rounding error of the reduction */
+  } else if(hx < 0x33000000) {   /* when |x|<2**-25, return x */
+    //t = huge+x; /* return x with inexact flags when x!=0 */
+    //return x - (t-(huge+x));
+    return x;
+  } else k = 0;
+  /* x is now in primary range */
+  hfx = (float)0.5*x;
+  hxs = x*hfx;
+  r1 = one+hxs*(Q1+hxs*Q2);
+  t = (float)3.0-r1*hfx;
+  e = hxs*((r1-t)/((float)6.0 - x*t));
+  if(k==0)
+    return x - (x*e-hxs);        /* c is 0 */
+  else{
+    e = (x*(e-c)-c);
+    e -= hxs;
+    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+    if(k==1){
+      if(x < (float)-0.25)
+        return -(float)2.0*(e-(x+(float)0.5));
+      else
+        return  (one+(float)2.0*(x-e));
+    }
+    if (k <= -2 || k>56) {    /* suffice to return exp(x)-1 */
+      int i;
+      y = one-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);  /* bits of y; i was read uninitialized before */
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));   /* add k to y's exponent */
+      return y-one;
+    }
+    t = one;
+    if(k<23) {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+      y = t-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);  /* bits of y; i was read uninitialized before */
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));   /* add k to y's exponent */
+    } else {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));  /* 2^-k */
+      y = x-(e+t);
+      y += one;
+      GEN_OCL_GET_FLOAT_WORD(i,y);  /* bits of y; i was read uninitialized before */
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));   /* add k to y's exponent */
+    }
+  }
+  return y;
+}
+/*
+ * Inverse hyperbolic cosine for a float argument, adapted from fdlibm's
+ * acoshf.
+ *
+ *   x < 1          NaN, via (x-x)/(x-x)
+ *   x >= 2**28     log(x) + ln2 (x+x for inf or NaN)
+ *   x == 1         0
+ *   2 < x < 2**28  log(2x - 1/(x + sqrt(x*x - 1)))
+ *   1 < x <= 2     log1p(t + sqrt(2t + t*t)) with t = x - 1
+ */
+OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+  float one  = 1.0,
+  ln2    = 6.9314718246e-01;/* 0x3f317218 */
+  float t;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);  /* load the bits of x; hx was read uninitialized before */
+  if(hx<0x3f800000) {    /* x < 1 */
+    return (x-x)/(x-x);
+  } else if(hx >=0x4d800000) {   /* x > 2**28 */
+    if(hx >=0x7f800000) {/* x is inf or NaN */
+      return x+x;
+    } else
+      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
+  } else if (hx==0x3f800000) {
+    return 0.0;          /* acosh(1) = 0 */
+  } else if (hx > 0x40000000) {  /* 2**28 > x > 2 */
+    t=x*x;
+    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
+  } else {           /* 1<x<=2 */
+    t = x-one;
+    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
+  }
+}
+/*
+ * Single-precision asinh(x). The magnitude is computed on |x| in one of
+ * three ranges (|x| > 2**14, 2 < |x| <= 2**14, |x| <= 2) and the sign of x
+ * is copied onto the result at the end. Inputs with |x| < 2**-14 are
+ * returned unchanged; inf/NaN return x+x.
+ */
+OVERLOADABLE float __gen_ocl_internal_asinh(float x){
+  //return native_log(x + native_sqrt(x * x + 1));
+  float one =  1.0000000000e+00, /* 0x3F800000 */
+  ln2 =  6.9314718246e-01, /* 0x3f317218 */
+  huge=  1.0000000000e+30;
+  float w;
+  int hx,ix;
+  /* Raw bits of x; ix is the same value with the sign bit cleared. */
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix< 0x38000000) {	/* |x|<2**-14 */
+    if(huge+x>one) return x;	/* return x inexact except 0 */
+  }
+  if(ix>0x47000000) {/* |x| > 2**14 */
+    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
+    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
+  } else {
+    float xa = __gen_ocl_internal_fabs(x);
+    if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
+      w = __gen_ocl_internal_log(mad(xa, 2.0f, one / (__gen_ocl_sqrt(mad(xa, xa, one)) + xa)));
+    } else {		/* 2.0 > |x| > 2**-14 */
+      float t = xa*xa;
+      w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
+    }
+  }
+  return __gen_ocl_internal_copysign(w, x);
+}
+/*
+ * Single-precision sinh(x). h carries +-0.5 with the sign of x, so every
+ * branch computes on |x| and multiplies by h:
+ *   |x| < 22                 -> via expm1(|x|)
+ *   22 <= |x| < 0x42b17180   -> h * exp(|x|)
+ *   up to 0x42b2d4fc         -> h * exp(|x|/2) * exp(|x|/2)
+ *   larger                   -> x * shuge (overflow)
+ * inf/NaN return x+x.
+ */
+OVERLOADABLE float __gen_ocl_internal_sinh(float x){
+  //return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+  float one = 1.0,
+  shuge = 1.0e37;
+  float t,w,h;
+  int ix,jx;
+  /* jx: raw bits of x (sign test); ix: magnitude bits (range tests). */
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff;
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x+x;
+  h = 0.5;
+  if (jx<0) h = -h;
+  /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
+  if (ix < 0x41b00000) {		/* |x|<22 */
+    if (ix<0x31800000)	/* |x|<2**-28 */
+      if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
+    t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
+    if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
+    return h*(t+t/(t+one));
+  }
+  /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
+  if (ix < 0x42b17180)  return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
+  /* |x| in [log(maxdouble), overflowthreshold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
+    t = h*w;
+    return t*w;
+  }
+  /* |x| > overflowthreshold, sinh(x) overflow */
+  return x*shuge;
+}
+/*
+ * Single-precision tanh(x). The result magnitude z is computed from |x|
+ * with expm1 and negated at the end when x is negative. |x| >= 22 yields
+ * one - tiny; +-0 is returned unchanged; inf gives +-1 and NaN stays NaN.
+ */
+OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+  //float y = native_exp(-2 * x);
+  //return (1 - y) / (1 + y);
+  float one=1.0, two=2.0, tiny = 1.0e-30;
+  float t,z;
+  int jx,ix;
+  /* jx: raw bits of x (its sign selects the final sign); ix: |x| bits. */
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff;
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) {
+    if (jx>=0)
+      return one/x+one; /* tanh(+-inf)=+-1 */
+    else
+      return one/x-one; /* tanh(NaN) = NaN */
+  }
+  if (ix < 0x41b00000) { /* |x|<22 */
+    if (ix == 0)
+      return x;		/* x == +-0 */
+    if (ix<0x24000000) 	/* |x|<2**-55 */
+      return x*(one+x);    	/* tanh(small) = small */
+    if (ix>=0x3f800000) {	/* |x|>=1  */
+      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
+      z = one - two/(t+two);
+    } else {
+      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
+      z= -t/(t+two);
+    }
+  } else { /* |x| > 22, return +-1 */
+    z = one - tiny;		/* raised inexact flag */
+  }
+  return (jx>=0)? z: -z;
+}
+/*
+ * Single-precision cosh(x), computed on |x|:
+ *   |x| < 0.5*ln2          -> 1 + expm1(|x|)^2 / (2*(1+expm1(|x|)))
+ *   0.5*ln2 <= |x| < 22    -> 0.5*exp(|x|) + 0.5/exp(|x|)
+ *   22 <= |x| < 0x42b17180 -> 0.5*exp(|x|)
+ *   up to 0x42b2d4fc       -> 0.5*exp(|x|/2)*exp(|x|/2)
+ *   inf/NaN                -> x*x
+ *   otherwise              -> huge*huge (overflow)
+ */
+OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+  //return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+  float halF = 0.5,
+  huge = 1.0e+30,
+  tiny = 1.0e-30,
+  one = 1.0;
+  float t,w;
+  int ix;
+  /* Load the bits of x before masking off the sign. */
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;
+  /* |x| in [0,22] */
+  if (ix < 0x41b00000) {
+    /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
+    if(ix<0x3eb17218) {
+      t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
+      w = one+t;
+      if (ix<0x24000000) return w;	/* cosh(tiny) = 1 */
+      return one+(t*t)/(w+w);
+    }
+    /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|))/2 */
+    t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
+    return halF*t+halF/t;
+  }
+  /* |x| in [22, log(maxdouble)] return half*exp(|x|) */
+  if (ix < 0x42b17180)  return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
+  /* |x| in [log(maxdouble), overflowthreshold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x));
+    t = halF*w;
+    return t*w;
+  }
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x*x;
+  /* |x| > overflowthreshold, cosh(x) overflow */
+  return huge*huge;
+}
+/*
+ * Single-precision remainder(x, p).
+ * Returns (x*p)/(x*p) (NaN) when p is zero, x is not finite or p is NaN.
+ * Otherwise x is first reduced with fmod(x, 2p), then |x| is brought into
+ * [-|p|/2, |p|/2] by at most two subtractions of |p|, and finally the
+ * sign bit of the original x is XOR-ed back onto the result.
+ */
+OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
+  //return x-y*__gen_ocl_rnde(x/y);
+  float zero = 0.0;
+  int hx,hp;
+  unsigned sx;
+  float p_half;
+  /* Raw bits of both operands; sx keeps the sign of x for the final step. */
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hp,p);
+  sx = hx&0x80000000;
+  hp &= 0x7fffffff;
+  hx &= 0x7fffffff;
+  /* purge off exception values */
+  if(hp==0) return (x*p)/(x*p);	        /* p = 0 */
+  if((hx>=0x7f800000)||               /* x not finite */
+    ((hp>0x7f800000)))	               /* p is NaN */
+    return (x*p)/(x*p);
+  if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
+  if ((hx-hp)==0) return zero*x;
+  x = __gen_ocl_fabs(x);
+  p = __gen_ocl_fabs(p);
+  if (hp<0x01000000) {
+    /* p is tiny: compare 2x with p instead of x with p/2 */
+    if(x+x>p) {
+      x-=p;
+      if(x+x>=p) x -= p;
+    }
+  } else {
+    p_half = (float)0.5*p;
+    if(x>p_half) {
+      x-=p;
+      if(x>=p_half) x -= p;
+    }
+  }
+  /* The reduction above ran on |x|; flip the sign bit if x was negative. */
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
+  return x;
+}
+/* Internal ldexp: returns __gen_ocl_scalbnf(x, n). */
+OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
+  x = __gen_ocl_scalbnf(x,n);
+  return x;
+}
+/*
+ * Single-precision atanh(x), computed on xa = |x| with log1p and given
+ * the sign of x at the end.
+ *   xa < 2**-28  -> x
+ *   xa < 0.5     -> 0.5*log1p(2xa + 2xa*xa/(1-xa))
+ *   xa < 1       -> 0.5*log1p(2xa/(1-xa))
+ *   xa > 1       -> (x-x)/(x-x), i.e. NaN
+ *   otherwise    -> x/0.0f (reached for |x| == 1 and for NaN, where the
+ *                   isless/isgreater tests are all false)
+ */
+OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+  //return 0.5f * native_sqrt((1 + x) / (1 - x));
+  float xa = __gen_ocl_fabs (x);
+  float t;
+  if (isless (xa, 0.5f)){
+    if (xa < 0x1.0p-28f) return x;
+    t = xa + xa;
+    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
+  } else if (isless (xa, 1.0f)){
+    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
+  } else{
+    if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
+    return x / 0.0f;
+  }
+  return __gen_ocl_internal_copysign(t, x);
+}
+/*
+ * Single-precision 10**x.
+ * x is split as g + n*log10(2) with n = floor(x*log2(10) + 0.5); 10**g is
+ * approximated by 1 + g*P(g), where P is the degree-5 polynomial below
+ * evaluated with Horner's rule, and the result is scaled by 2**n via ldexp.
+ * Returns 0 for x < -MAXL10, INFINITY for any infinite x that passes that
+ * test, and exactly 1 for x == 0.
+ */
+OVERLOADABLE float __gen_ocl_internal_exp10(float x){
+  float px, qx,ans;
+  short n;
+  int i;
+  float*p;
+  float MAXL10 = 38.230809449325611792;
+  float LOG210 = 3.32192809488736234787e0;
+  /* LG102A + LG102B together represent log10(2) in two pieces */
+  float LG102A = 3.00781250000000000000E-1;
+  float LG102B = 2.48745663981195213739E-4;
+  float P[6];
+  P[0] = 2.063216740311022E-001;
+  P[1] = 5.420251702225484E-001;
+  P[2] = 1.171292686296281E+000;
+  P[3] = 2.034649854009453E+000;
+  P[4] = 2.650948748208892E+000;
+  P[5] = 2.302585167056758E+000;
+  if( x < -MAXL10 ) return 0.0;
+  if( isinf(x))  return INFINITY;
+  /* The following is necessary because range reduction blows up: */
+  if( x == 0 )return 1.0;
+  /* Express 10**x = 10**g 2**n
+    *	 = 10**g 10**( n log10(2) )
+    *	 = 10**( g + n log10(2) )
+    */
+  px = x * LOG210;
+  qx = __gen_ocl_internal_floor( px + 0.5 );
+  n = qx;
+  x -= qx * LG102A;
+  x -= qx * LG102B;
+  /* rational approximation for exponential
+    * of the fractional part:
+    * 10**x - 1  =  2x P(x**2)/( Q(x**2) - P(x**2) )
+    */
+  p = P;
+  ans = *p++;
+  i = 5;
+  do{
+    ans = ans * x  +  *p++;
+  }
+  while( --i );
+  px = 1.0 + x * ans;
+  /* multiply by power of 2 */
+  x = __gen_ocl_internal_ldexp( px, n );
+  return x;
+}
+/* cospi(x): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_cospi. */
+OVERLOADABLE float cospi(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_cospi(x);
+  return __gen_ocl_internal_cospi(x);
+}
+/* cosh(x): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_cosh. */
+OVERLOADABLE float cosh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_cosh(x);
+  return  __gen_ocl_internal_cosh(x);
+}
+/* acos(x): forwards unconditionally to __gen_ocl_internal_acos. */
+OVERLOADABLE float acos(float x) {
+  return __gen_ocl_internal_acos(x);
+}
+/* acospi(x): forwards unconditionally to __gen_ocl_internal_acospi. */
+OVERLOADABLE float acospi(float x) {
+  return __gen_ocl_internal_acospi(x);
+}
+/* acosh(x): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_acosh. */
+OVERLOADABLE float acosh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_acosh(x);
+  return __gen_ocl_internal_acosh(x);
+}
+/* sinpi(x): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_sinpi. */
+OVERLOADABLE float sinpi(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_sinpi(x);
+  return __gen_ocl_internal_sinpi(x);
+}
+/* sinh(x): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_sinh. */
+OVERLOADABLE float sinh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_sinh(x);
+  return __gen_ocl_internal_sinh(x);
+}
+/* asin(x): forwards unconditionally to __gen_ocl_internal_asin. */
+OVERLOADABLE float asin(float x) {
+  return __gen_ocl_internal_asin(x);
+}
+/* asinpi(x): forwards unconditionally to __gen_ocl_internal_asinpi. */
+OVERLOADABLE float asinpi(float x) {
+  return __gen_ocl_internal_asinpi(x);
+}
+/* asinh(x): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_asinh. */
+OVERLOADABLE float asinh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_asinh(x);
+  return __gen_ocl_internal_asinh(x);
+}
+/* tanpi(x): forwards unconditionally to __gen_ocl_internal_tanpi. */
+OVERLOADABLE float tanpi(float x) {
+  return __gen_ocl_internal_tanpi(x);
+}
+/* tanh(x): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_tanh. */
+OVERLOADABLE float tanh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_tanh(x);
+  return __gen_ocl_internal_tanh(x);
+}
+/* atan(x): forwards unconditionally to __gen_ocl_internal_atan. */
+OVERLOADABLE float atan(float x) {
+  return __gen_ocl_internal_atan(x);
+}
+/* atan2(y, x): forwards unconditionally to __gen_ocl_internal_atan2,
+   keeping the (y, x) argument order. */
+OVERLOADABLE float atan2(float y, float x) {
+  return __gen_ocl_internal_atan2(y, x);
+}
+/* atan2pi(y, x): forwards unconditionally to __gen_ocl_internal_atan2pi,
+   keeping the (y, x) argument order. */
+OVERLOADABLE float atan2pi(float y, float x) {
+  return __gen_ocl_internal_atan2pi(y, x);
+}
+/* atanpi(x): forwards unconditionally to __gen_ocl_internal_atanpi. */
+OVERLOADABLE float atanpi(float x) {
+  return __gen_ocl_internal_atanpi(x);
+}
+/* atanh(x): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_atanh. */
+OVERLOADABLE float atanh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_atanh(x);
+  return __gen_ocl_internal_atanh(x);
+}
+/* cbrt(x): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_cbrt. */
+OVERLOADABLE float cbrt(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_cbrt(x);
+  return __gen_ocl_internal_cbrt(x);
+}
+/* rint(x): forwards unconditionally to __gen_ocl_internal_rint. */
+OVERLOADABLE float rint(float x) {
+  return __gen_ocl_internal_rint(x);
+}
+/* copysign(x, y): forwards unconditionally to __gen_ocl_internal_copysign. */
+OVERLOADABLE float copysign(float x, float y) {
+  return __gen_ocl_internal_copysign(x, y);
+}
+/* erf(x): forwards unconditionally to __gen_ocl_internal_erf. */
+OVERLOADABLE float erf(float x) {
+  return __gen_ocl_internal_erf(x);
+}
+/* erfc(x): forwards unconditionally to __gen_ocl_internal_erfc. */
+OVERLOADABLE float erfc(float x) {
+  return __gen_ocl_internal_erfc(x);
+}
+/* fmod(x, y): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_fmod. */
+OVERLOADABLE float fmod (float x, float y) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_fmod(x, y);
+  return __gen_ocl_internal_fmod(x, y);
+}
+/* remainder(x, p): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set, otherwise __gen_ocl_internal_remainder. */
+OVERLOADABLE float remainder(float x, float p) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_remainder(x, p);
+  return __gen_ocl_internal_remainder(x, p);
+}
+/* ldexp(x, n): uses the fast-path implementation when
+   __ocl_math_fastpath_flag is set. On the precise path any x that compares
+   equal to zero is replaced by +0.0f before __gen_ocl_internal_ldexp runs. */
+OVERLOADABLE float ldexp(float x, int n) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_ldexp(x, n);
+  if (x == (float)0.0f) x = 0.0f;
+  return __gen_ocl_internal_ldexp(x, n);
+}
+CONST OVERLOADABLE float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32"); /* float overload; the __asm label concatenates to "llvm.fma.f32" */
+CONST OVERLOADABLE half __gen_ocl_mad(half a, half b, half c) __asm("llvm.fma" ".f16"); /* half overload; the __asm label concatenates to "llvm.fma.f16" */
+PURE CONST float __gen_ocl_fmax(float a, float b); /* prototype only; the definition is not in this part of the file */
+PURE CONST float __gen_ocl_fmin(float a, float b); /* prototype only; the definition is not in this part of the file */
+/* mad(a, b, c): forwards unconditionally to __gen_ocl_mad. */
+OVERLOADABLE float mad(float a, float b, float c) {
+  return __gen_ocl_mad(a, b, c);
+}
+/*
+ * frexp(x, exp): returns x with its exponent field replaced by 0x3F000000
+ * (so the magnitude lies in [0.5, 1)) and stores the matching power of two
+ * in *exp. NaN, infinity and +-0 are returned unchanged with *exp = 0.
+ * Subnormal inputs are normalised by shifting the mantissa left.
+ */
+OVERLOADABLE float frexp(float x, int *exp) {
+  if (isnan(x) || isinf(x)) {
+    *exp = 0;
+    return x;
+  }
+  uint bits = as_uint(x);
+  uint mag = bits & 0x7FFFFFFFu;	/* |x| as an integer */
+  if (mag == 0) {
+    *exp = 0;
+    return x;
+  }
+  if (mag >= 0x800000) {
+    /* normal number: read the exponent, keep sign and mantissa */
+    *exp = (mag >> 23) - 126;
+    return as_float((bits & (0x807FFFFFu)) | 0x3F000000);
+  }
+  /* subnormal: shift until the leading one reaches the implicit-bit slot */
+  int shift = -126;
+  for (; mag < 0x400000; mag <<= 1)
+    shift --;
+  mag <<= 1;
+  *exp = shift;
+  return as_float((mag & (0x807FFFFFu)) | (bits & 0x80000000u) | 0x3F000000);
+}
+/*
+ * nextafter(x, y): steps the bit pattern of x by one unit towards y.
+ *   - if either argument is NaN, returns x+y
+ *   - if both have identical bits, or both are zero, returns y
+ *   - if x is zero, returns the value with bit pattern 1 carrying the sign of y
+ * Otherwise the integer encoding hx is incremented or decremented depending
+ * on the signs and ordering of hx and hy.
+ */
+OVERLOADABLE float nextafter(float x, float y) {
+  int hx, hy, ix, iy;
+  hx = as_int(x);
+  hy = as_int(y);
+  ix = hx & 0x7fffffff;
+  iy = hy & 0x7fffffff;
+  if(ix == 0)
+    ix = hx & 0x7fffff;
+  if(iy == 0)
+    iy = hy & 0x7fffff;
+  if(ix>0x7f800000 || iy>0x7f800000)
+    return x+y;
+  if(hx == hy)
+    return y;
+  if(ix == 0) {
+    if(iy == 0)
+      return y;
+    else
+      return as_float((hy&0x80000000) | 1);
+  }
+  if(hx >= 0) {
+    /* x positive: a larger encoding is a larger value */
+    if(hx > hy) {
+      hx -= 1;
+    } else {
+      hx += 1;
+    }
+  } else {
+    /* x negative: decrementing the encoding moves x towards zero */
+    if(hy >= 0 || hx > hy){
+      hx -= 1;
+    } else {
+      hx += 1;
+    }
+  }
+  return as_float(hx);
+}
+/*
+ * modf(x, i): stores __gen_ocl_rndz(x) in *i and returns x - *i.
+ * NaN input yields nan(0u) for both outputs; infinite input stores x in *i
+ * and returns a zero carrying the sign bit of x.
+ */
+OVERLOADABLE float modf(float x, float *i) {
+  uint bits = as_uint(x);
+  uint mag = bits & 0x7FFFFFFF;
+  if (mag > 0x7F800000) {	/* NaN */
+    *i = nan(0u);
+    return nan(0u);
+  }
+  if (mag == 0x7F800000) {	/* +-infinity */
+    *i = x;
+    return as_float(bits & 0x80000000u);
+  }
+  *i = __gen_ocl_rndz(x);
+  return x - *i;
+}
+/* Internal fmax for float: returns max(a, b). */
+OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) {
+  return max(a,b);
+}
+/* Internal fmin for float: returns min(a, b). */
+OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) {
+  return min(a,b);
+}
+/* Internal fmax for half operands: returns max(a, b) as a float. */
+OVERLOADABLE float __gen_ocl_internal_fmax(half a, half b) {
+  return max(a,b);
+}
+/* Internal fmin for half operands: returns min(a, b) as a float. */
+OVERLOADABLE float __gen_ocl_internal_fmin(half a, half b) {
+  return min(a,b);
+}
+/* maxmag(x, y): returns whichever argument has the larger magnitude;
+   when neither |x| > |y| nor |y| > |x| holds, returns max(x, y). */
+OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a > b ? x : b > a ? y : max(x, y);
+}
+/* minmag(x, y): returns whichever argument has the smaller magnitude;
+   when neither |x| < |y| nor |y| < |x| holds, returns min(x, y). */
+OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a < b ? x : b < a ? y : min(x, y);
+}
+/* fdim(x, y): returns x - y when x > y, otherwise +0. A NaN argument is
+   returned as-is (x is checked before y). */
+OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+  if(isnan(x))
+    return x;
+  if(isnan(y))
+    return y;
+  return x > y ? (x - y) : +0.f;
+}
+/*
+ * The pow/pown high-precision implementations are copied from the msun library.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/*
+ * Single-precision pow(x, y).
+ *
+ * Outline:
+ *   1. Read the raw bits of x and y; subnormal magnitudes are treated as 0.
+ *   2. Handle special cases (y == 0, x == 1, NaN, y == +-inf, y == +-1,
+ *      y == 2, y == 0.5, x == +-0/+-inf/+-1, negative x with non-integer y).
+ *   3. Compute log2(|x|) as t1 + t2 (a high part and a low correction).
+ *   4. Split y into y1 + (y - y1) and form z = y*log2(|x|) as p_h + p_l.
+ *   5. Detect overflow/underflow from the bits of z, then evaluate 2**z and
+ *      scale by the integer part n.
+ * sn carries the sign of the result (-1 for negative x with odd integer y).
+ *
+ * Each GEN_OCL_SET_FLOAT_WORD(v, bits & mask) below truncates the low
+ * mantissa bits of a value, so the integer holding those bits must be
+ * loaded with GEN_OCL_GET_FLOAT_WORD immediately beforehand.
+ */
+OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
+  float z,ax,z_h,z_l,p_h,p_l;
+  float y1,t1,t2,r,s,sn,t,u,v,w;
+  int i,j,k,yisint,n;
+  int hx,hy,ix,iy,is;
+  float bp[2],dp_h[2],dp_l[2],
+  zero    =  0.0,
+  one	=  1.0,
+  two	=  2.0,
+  two24	=  16777216.0,	/* 0x4b800000 */
+  huge	=  1.0e30,
+  tiny    =  1.0e-30,
+  /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
+  L1  =  6.0000002384e-01, /* 0x3f19999a */
+  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
+  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
+  P2   = -2.7777778450e-03, /* 0xbb360b61 */
+  lg2  =  6.9314718246e-01, /* 0x3f317218 */
+  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
+  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
+  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
+  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+  bp[0] = 1.0,bp[1] = 1.5,
+  dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
+  dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
+  /* Raw bits of both operands; all special-case tests work on these. */
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
+  if (ix < 0x00800000) {	   /* x < 2**-126  */
+    ix = 0;/* Gen does not support subnormal number now */
+  }
+  if (iy < 0x00800000) {	  /* y < 2**-126  */
+    iy = 0;/* Gen does not support subnormal number now */
+  }
+   /* y==zero: x**0 = 1 */
+  if(iy==0) return one;
+  /* pow(+1, y) returns 1 for any y, even a NAN */
+  if(hx==0x3f800000) return one;
+  /* +-NaN return x+y */
+  if(ix > 0x7f800000 || iy > 0x7f800000)
+    return (x+0.0f)+y+(0.0f);
+  /* determine if y is an odd int when x < 0
+     * yisint = 0	... y is not an integer
+     * yisint = 1	... y is an odd int
+     * yisint = 2	... y is an even int
+     */
+  yisint  = 0;
+  if(hx<0) {
+    if(iy>=0x4b800000) yisint = 2; /* even integer y */
+    else if(iy>=0x3f800000) {
+      k = (iy>>23)-0x7f;	   /* exponent */
+      j = iy>>(23-k);
+      if((j<<(23-k))==iy) yisint = 2-(j&1);
+    }
+  }
+  /* special value of y */
+  if (iy==0x7f800000) {	/* y is +-inf */
+    if (ix==0x3f800000)
+      //return  y - y;	/* inf**+-1 is NaN */
+      return one;
+    else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
+      return (hy>=0)? y: zero;
+    else			/* (|x|<1)**-,+inf = inf,0 */
+      return (hy<0)?-y: zero;
+  }
+  if(iy==0x3f800000) {	/* y is  +-1 */
+    if(hy<0) return one/x; else return x;
+  }
+  if(hy==0x40000000) return x*x; /* y is  2 */
+  if(hy==0x3f000000) {	/* y is  0.5 */
+    if(hx>=0)return __gen_ocl_sqrt(x);
+  }
+  ax   = __gen_ocl_fabs(x);
+    /* special value of x */
+  if(ix==0x7f800000||ix==0||ix==0x3f800000){
+    z = ax;			/*x is +-0,+-inf,+-1*/
+    if(hy<0) z = one/z;	/* z = (1/|x|) */
+    if(hx<0) {
+      if(((ix-0x3f800000)|yisint)==0) {
+        z = (z-z)/(z-z); /* (-1)**non-int is NaN */
+      } else if(yisint==1)
+        z = -z;		/* (x<0)**odd = -(|x|**odd) */
+    }
+    return z;
+  }
+  n = ((uint)hx>>31)-1;
+  /* (x<0)**(non-int) is NaN */
+  if((n|yisint)==0) return (x-x)/(x-x);
+  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+  /* |y| is huge */
+  if(iy>0x4d000000) { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one */
+    if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
+    if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
+    /* now |1-x| is tiny <= 2**-20, suffice to compute
+          log(x) by x-x^2/2+x^3/3-x^4/4 */
+    t = ax-1;		/* t has 20 trailing zeros */
+    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
+    v = t*ivln2_l-w*ivln2;
+    t1 = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    t2 = v-(t1-u);
+  } else {
+    float s2,s_h,s_l,t_h,t_l;
+    n = 0;
+	/* take care subnormal number */
+    //if(ix<0x00800000)
+      //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+    n  += ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+	/* determine interval */
+    ix = j|0x3f800000;		/* normalize ix */
+    if(j<=0x1cc471) k=0;	/* |x|<sqrt(3/2) */
+    else if(j<0x5db3d7) k=1;	/* |x|<sqrt(3)   */
+    else {k=0;n+=1;ix -= 0x00800000;}
+	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
+    v = one/(ax+bp[k]);
+    s = u*v;
+    s_h = s;
+    GEN_OCL_GET_FLOAT_WORD(is,s_h);
+    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+    /* t_h=ax+bp[k] High */
+    is = ((ix>>1)&0xfffff000)|0x20000000;
+    GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+    t_l = ax - (t_h-bp[k]);
+    s_l = v*((u-s_h*t_h)-s_h*t_l);
+    /* compute log(ax) */
+    s2 = s*s;
+    r = s2*s2*(L1+s2*L2);
+    r += s_l*(s_h+s);
+    s2  = s_h*s_h;
+    t_h = 3.0f+s2+r;
+    GEN_OCL_GET_FLOAT_WORD(is,t_h);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+    t_l = r-((t_h-3.0f)-s2);
+    /* u+v = s*(1+...) */
+    u = s_h*t_h;
+    v = s_l*t_h+t_l*s;
+    /* 2/(3log2)*(s+...) */
+    p_h = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,p_h);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+    p_l = v-(p_h-u);
+    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
+    z_l = cp_l*p_h+p_l*cp+dp_l[k];
+    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+    t = (float)n;
+    t1 = (((z_h+z_l)+dp_h[k])+t);
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+  }
+  /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+  GEN_OCL_GET_FLOAT_WORD(is,y);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
+  p_l = (y-y1)*t1+y*t2;
+  p_h = y1*t1;
+  z = p_l+p_h;
+  /* j: bits of z = y*log2|x|, used for the range checks below */
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  if (j>0x43000000)				/* if z > 128 */
+    return sn*huge*huge;			/* overflow */
+  else if (j==0x43000000) {			/* if z == 128 */
+    if(p_l+ovt>z-p_h) return sn*huge*huge;	/* overflow */
+  }
+  else if ((j&0x7fffffff)>0x43160000)		/* z <= -150 */
+    return sn*tiny*tiny;			/* underflow */
+  else if (j==0xc3160000){			/* z == -150 */
+    if(p_l<=z-p_h) return sn*tiny*tiny;		/* underflow */
+  }
+  /*
+    * compute 2**(p_h+p_l)
+    */
+  i = j&0x7fffffff;
+  k = (i>>23)-0x7f;
+  n = 0;
+  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
+    n = j+(0x00800000>>(k+1));
+    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
+    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+    n = ((n&0x007fffff)|0x00800000)>>(23-k);
+    if(j<0) n = -n;
+    p_h -= t;
+  }
+  t = p_l+p_h;
+  GEN_OCL_GET_FLOAT_WORD(is,t);
+  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
+  u = t*lg2_h;
+  v = (p_l-(t-p_h))*lg2+t*lg2_l;
+  z = u+v;
+  w = v-(z-u);
+  t  = z*z;
+  t1  = z - t*(P1+t*P2);
+  r  = (z*t1)/(t1-two)-(w+z*w);
+  z  = one-(r-z);
+  /* add n to the exponent field of z */
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  j += (n<<23);
+  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);	/* subnormal output */
+  else GEN_OCL_SET_FLOAT_WORD(z,j);
+  return sn*z;
+}
+/*
+ * tgamma(x): single-precision gamma function.
+ *   -Inf, negative integers      -> NAN
+ *   +Inf or NaN                  -> x
+ *   x >= 36                      -> INFINITY (overflow)
+ *   -FLT_EPSILON/4 <= x <= 0     -> 1/x
+ *   x <= -42                     -> 0 carrying the sign of sinpi(x)
+ *   |x| < 4                      -> exp(lgamma(x_adj)) * prod, after shifting
+ *                                   |x| to x_adj via the recurrence
+ *   |x| >= 4                     -> Stirling's approximation, with a power
+ *                                   of two (exp2_adj) factored out and
+ *                                   re-applied through ldexp
+ * For negative x the reflection M_PI_F / (-x * sinpi(x) * gamma(|x|)) is used.
+ */
+OVERLOADABLE float tgamma (float x)
+{
+  /* based on glibc __ieee754_gammaf_r by Ulrich Drepper <drepper at cygnus.com> */
+  unsigned int hx;
+  /* Raw bits of x, needed for the infinity/NaN tests below. */
+  hx = as_uint(x);
+  if (hx == 0xff800000)
+    {
+      /* x == -Inf.  According to ISO this is NaN.  */
+      return NAN;
+    }
+  if ((hx & 0x7f800000) == 0x7f800000)
+    {
+      /* Positive infinity (return positive infinity) or NaN (return
+	 NaN).  */
+      return x;
+    }
+  if (x < 0.0f && __gen_ocl_internal_floor (x) == x)
+    {
+      /* integer x < 0 */
+      return NAN;
+    }
+  if (x >= 36.0f)
+    {
+      /* Overflow.  */
+      return INFINITY;
+    }
+  else if (x <= 0.0f && x >= -FLT_EPSILON / 4.0f)
+    {
+      return 1.0f / x;
+    }
+  else
+    {
+      float sinpix = __gen_ocl_internal_sinpi(x);
+      if (x <= -42.0f)
+	/* Underflow.  */
+	{return 0.0f * sinpix /*for sign*/;}
+      int exp2_adj = 0;
+      float x_abs = __gen_ocl_fabs(x);
+      float gam0;
+      if (x_abs < 4.0f) {
+        /* gamma = exp(lgamma) is only accurate for small lgamma */
+        float prod,x_adj;
+        if (x_abs < 0.5f) {
+          prod = 1.0f / x_abs;
+          x_adj = x_abs + 1.0f;
+        } else if (x_abs <= 1.5f) {
+          prod = 1.0f;
+          x_adj = x_abs;
+        } else if (x_abs < 2.5f) {
+          x_adj = x_abs - 1.0f;
+          prod = x_adj;
+        } else {
+          x_adj = x_abs - 2.0f;
+          prod = x_adj * (x_abs - 1.0f);
+        }
+        gam0 = __gen_ocl_internal_exp (lgamma (x_adj)) * prod;
+      }
+      else {
+        /* Compute gamma (X) using Stirling's approximation,
+  	 starting by computing pow (X, X) with a power of 2
+  	 factored out to avoid intermediate overflow.  */
+        float x_int = __gen_ocl_internal_round (x_abs);
+        float x_frac = x_abs - x_int;
+        int x_log2;
+        float x_mant = frexp (x_abs, &x_log2);
+        if (x_mant < M_SQRT1_2_F)
+          {
+          x_log2--;
+          x_mant *= 2.0f;
+          }
+        exp2_adj = x_log2 * (int) x_int;
+        float ret = (__gen_ocl_internal_pow(x_mant, x_abs)
+  		   * exp2 (x_log2 * x_frac)
+  		   * __gen_ocl_internal_exp (-x_abs)
+  		   * sqrt (2.0f * M_PI_F / x_abs) );
+        float x2 = x_abs * x_abs;
+        float bsum = (0x3.403404p-12f / x2 -0xb.60b61p-12f) / x2 + 0x1.555556p-4f;
+        gam0 = ret + ret * __gen_ocl_internal_expm1 (bsum / x_abs);
+      }
+      if (x > 0.0f) {return __gen_ocl_internal_ldexp (gam0, exp2_adj);}
+      /* negative x: reflection formula */
+      float gam1 = M_PI_F / (-x * sinpix * gam0);
+      return __gen_ocl_internal_ldexp (gam1, -exp2_adj);
+    }
+}
+/*
+ * Single-precision pown(x, y) for an integer exponent y.
+ *
+ * Same scheme as the float-exponent pow: special cases first, then
+ * log2(|x|) as t1 + t2, then z = y*log2(|x|) as p_h + p_l, then 2**z.
+ * Because a 32-bit integer may not be exactly representable as a float,
+ * y is split into fy = (float)y, a truncated head y1 of fy, and the
+ * integer residue y3 = y - (int)fy.
+ * sn is -1 when x is negative and y is odd, otherwise +1; it is applied
+ * to every overflow/underflow return so the sign of the result is kept.
+ *
+ * Each GEN_OCL_SET_FLOAT_WORD(v, bits & mask) truncates low mantissa bits,
+ * so the integer holding those bits is loaded with GEN_OCL_GET_FLOAT_WORD
+ * immediately beforehand.
+ */
+float __gen_ocl_internal_pown(float x, int y) {
+  const float
+  bp[] = {1.0, 1.5,},
+  dp_h[] = { 0.0, 5.84960938e-01,}, /* 0x3f15c000 */
+  dp_l[] = { 0.0, 1.56322085e-06,}, /* 0x35d1cfdc */
+  zero    =  0.0,
+  one =  1.0,
+  two =  2.0,
+  two24 =  16777216.0,  /* 0x4b800000 */
+  huge  =  1.0e30,
+  tiny    =  1.0e-30,
+    /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
+  L1  =  6.0000002384e-01, /* 0x3f19999a */
+  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
+  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
+  P2   = -2.7777778450e-03, /* 0xbb360b61 */
+  lg2  =  6.9314718246e-01, /* 0x3f317218 */
+  lg2_h  =  0x1.62ep-1,
+  lg2_l  =  0x1.0bfbe8p-15,
+  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
+  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+  float z,ax,z_h,z_l,p_h,p_l;
+  float y1,t1,t2,r,s,t,u,v,w;
+  int i,j,k,yisint,n;
+  int hx,ix,iy,is;
+  /* Raw bits of x; ix is the magnitude, iy is |y| as a plain integer. */
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  iy = y > 0 ? y&0x7fffffff : (-y)&0x7fffffff;
+    /* y==zero: x**0 = 1 */
+  if(y==0) return one;
+    /* +-NaN return NAN */
+  if(ix > 0x7f800000)
+    return NAN;
+    /* determine if y is an odd int
+     * yisint = 1 ... y is an odd int
+     * yisint = 2 ... y is an even int
+     */
+  yisint = y&1 ? 1 : 2;
+  if (y == 1) return x;
+  if (y == -1) return one/x;
+  if (y == 2) return x*x;
+  ax   = __gen_ocl_fabs(x);
+   /* special value of x */
+  if(ix==0x7f800000||ix==0||ix==0x3f800000){
+      z = ax;     /*x is +-0,+-inf,+-1*/
+      if(y<0) z = one/z; /* z = (1/|x|) */
+      if(hx<0) {
+      if(yisint==1)
+        z = -z;   /* (x<0)**odd = -(|x|**odd) */
+      }
+      return z;
+  }
+  float sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+  if(((((unsigned)hx>>31)-1)|(yisint-1))==0)
+      sn = -one; /* (-ve)**(odd int) */
+    /* |y| is huge */
+  if(iy>0x08000000) { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one; the underflow result
+       keeps the sign sn, like the overflow result */
+      if(ix<0x3f7ffff8) return (y<0)? sn*huge*huge:sn*tiny*tiny;
+      if(ix>0x3f800007) return (y>0)? sn*huge*huge:sn*tiny*tiny;
+    /* now |1-x| is tiny <= 2**-20, suffice to compute
+     log(x) by x-x^2/2+x^3/3-x^4/4 */
+      t = ax-1;   /* t has 20 trailing zeros */
+      w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
+      u = ivln2_h*t;  /* ivln2_h has 16 sig. bits */
+      v = t*ivln2_l-w*ivln2;
+      t1 = u+v;
+      GEN_OCL_GET_FLOAT_WORD(is,t1);
+      GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+      t2 = v-(t1-u);
+  } else {
+    float s2,s_h,s_l,t_h,t_l;
+    n = 0;
+    /* take care subnormal number */
+//      if(ix<0x00800000)
+//    {ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+    n  += ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+    /* determine interval */
+    ix = j|0x3f800000;    /* normalize ix */
+    if(j<=0x1cc471) k=0;  /* |x|<sqrt(3/2) */
+    else if(j<0x5db3d7) k=1;  /* |x|<sqrt(3)   */
+    else {k=0;n+=1;ix -= 0x00800000;}
+    /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+    u = ax-bp[k];   /* bp[0]=1.0, bp[1]=1.5 */
+    v = one/(ax+bp[k]);
+    s = u*v;
+    s_h = s;
+    GEN_OCL_GET_FLOAT_WORD(is,s_h);
+    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+    /* t_h=ax+bp[k] High */
+    GEN_OCL_SET_FLOAT_WORD(t_h, (((ix>>1)|0x20000000)+0x00400000+(k<<21)) &0xfffff000);
+    t_l = ax - (t_h-bp[k]);
+    s_l = v*((u-s_h*t_h)-s_h*t_l);
+    /* compute log(ax) */
+    s2 = s*s;
+    r = s2*s2*(L1+s2*L2);
+    r += s_l*(s_h+s);
+    s2  = s_h*s_h;
+    t_h = (float)3.0+s2+r;
+    GEN_OCL_GET_FLOAT_WORD(is,t_h);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+    t_l = r-((t_h-(float)3.0)-s2);
+    /* u+v = s*(1+...) */
+    u = s_h*t_h;
+    v = s_l*t_h+t_l*s;
+    /* 2/(3log2)*(s+...) */
+    p_h = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,p_h);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+    p_l = v-(p_h-u);
+    z_h = cp_h*p_h;   /* cp_h+cp_l = 2/(3*log2) */
+    z_l = cp_l*p_h+p_l*cp+dp_l[k];
+    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+    t = (float)n;
+    t1 = (((z_h+z_l)+dp_h[k])+t);
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+  }
+  /* split up y into y1+y2+y3 and compute (y1+y2+y3)*(t1+t2) */
+  float fy = (float)y;
+  float y3 = (float)(y-(int)fy);
+  GEN_OCL_GET_FLOAT_WORD(is,fy);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+  p_l = (fy-y1)*t1 + y3*t1 + fy*t2 + y3*t2;
+  p_h = y1*t1;
+  z = p_l+p_h;
+  /* j: bits of z = y*log2|x|, used for the range checks below */
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  if (j>0x43000000)       /* if z > 128 */
+      return sn*huge*huge;       /* overflow */
+  else if (j==0x43000000) {     /* if z == 128 */
+      if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
+  }
+  else if ((j&0x7fffffff)>0x43160000)   /* z <= -150 */
+      return sn*tiny*tiny;       /* underflow */
+  else if (j==0xc3160000){      /* z == -150 */
+      if(p_l<=z-p_h) return sn*tiny*tiny;    /* underflow */
+  }
+    /*
+     * compute 2**(p_h+p_l)
+     */
+  i = j&0x7fffffff;
+  k = (i>>23)-0x7f;
+  n = 0;
+  if(i>0x3f000000) {    /* if |z| > 0.5, set n = [z+0.5] */
+      n = j+(0x00800000>>(k+1));
+      k = ((n&0x7fffffff)>>23)-0x7f;  /* new k for n */
+      GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+      n = ((n&0x007fffff)|0x00800000)>>(23-k);
+      if(j<0) n = -n;
+      p_h -= t;
+      z -= n;
+  }
+  t = z;
+  GEN_OCL_GET_FLOAT_WORD(is,t);
+  GEN_OCL_SET_FLOAT_WORD(t,is&0xfffff000);
+  u = t*lg2_h;
+  v = (p_l-(t-p_h))*lg2+t*lg2_l;
+  z = u+v;
+  w = v-(z-u);
+  t  = z*z;
+  t1  = z - t*(P1+t*P2);
+  r  = (z*t1)/(t1-two)-(w+z*w);
+  z  = one-(r-z);
+  /* add n to the exponent field of z */
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  j += (n<<23);
+  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);  /* subnormal output */
+  else GEN_OCL_SET_FLOAT_WORD(z,j);
+  return sn*z;
+}
+/*
+ * hypot(x, y): sqrt(x*x + y*y) without unnecessary overflow or underflow.
+ * Uses the fast-path implementation when __ocl_math_fastpath_flag is set.
+ * Otherwise both operands are scaled by the exponent of the larger one
+ * (frexp/ldexp) before squaring, and the result is scaled back.
+ * Returns INFINITY if either argument is infinite, else x + y (NaN) if
+ * either is NaN.
+ */
+OVERLOADABLE float hypot(float x, float y) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_hypot(x, y);
+  //return __gen_ocl_sqrt(x*x + y*y);
+  float a,b,an,bn,cn;
+  int e;
+  if (isfinite (x) && isfinite (y)){      /* Determine absolute values.  */
+    x = __gen_ocl_fabs (x);
+    y = __gen_ocl_fabs (y);
+    /* Find the bigger and the smaller one.  */
+    a = max(x,y);
+    b = min(x,y);
+    /* Now 0 <= b <= a.  */
+    /* Write a = an * 2^e, b = bn * 2^e with 0 <= bn <= an < 1.  */
+    an = frexp (a, &e);
+    bn = ldexp (b, - e);
+    /* Through the normalization, no unneeded overflow or underflow will occur here.  */
+    cn = __gen_ocl_sqrt (an * an + bn * bn);
+    return ldexp (cn, e);
+  }else{
+    if (isinf (x) || isinf (y))  /* x or y is infinite.  Return +Infinity.  */
+      return INFINITY;
+    else        /* x or y is NaN.  Return NaN.  */
+      return x + y;
+  }
+}
+/*
+ * fract(x, p): stores floor(x) in *p and returns x - floor(x), clamped to
+ * at most 0x1.FFFFFep-1F. NaN input is written to *p and returned as-is;
+ * infinite input returns a zero whose sign follows x.
+ */
+OVERLOADABLE float fract(float x, float *p) {
+  if (isnan(x)) {
+    *p = x;
+    return x;
+  }
+  *p = __gen_ocl_internal_floor(x);
+  if (isinf(x)) {
+    return x > 0 ? +0. : -0.;
+  }
+  /* the clamp keeps the fractional part from reaching 1.0 */
+  return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
+}
+#define BODY \
+  float Zero[2]; \
+  int n,hx,hy,hz,ix,iy,sx,i,sy; \
+  uint q,sxy; \
+  Zero[0] = 0.0;Zero[1] = -0.0; \
+  if (x == 0.0f) { x = 0.0f; }; \
+  if (y == 0.0f) { y = 0.0f; }\
+  sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
+  hx ^=sx; hy &= 0x7fffffff; \
+  if (hx < 0x00800000)hx = 0;if (hy < 0x00800000)hy = 0; \
+  if(hy==0||hx>=0x7f800000||hy>0x7f800000){ \
+    *quo = 0;return NAN; \
+  } \
+  if( hy == 0x7F800000 || hx == 0 ) { \
+    *quo = 0;return x; \
+  } \
+  if( hx == hy ) { \
+    *quo = (x == y) ? 1 : -1; \
+    return sx ? -0.0 : 0.0; \
+  } \
+  if(hx<hy) { \
+    q = 0; \
+    goto fixup; \
+  } else if(hx==hy) { \
+    *quo = (sxy ? -1 : 1); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  ix = (hx>>23)-127; \
+  iy = (hy>>23)-127; \
+  hx = 0x00800000|(0x007fffff&hx); \
+  hy = 0x00800000|(0x007fffff&hy); \
+  n = ix - iy; \
+  q = 0; \
+  while(n--) { \
+    hz=hx-hy; \
+    if(hz<0) hx = hx << 1; \
+    else {hx = hz << 1; q++;} \
+    q <<= 1; \
+  } \
+  hz=hx-hy; \
+  if(hz>=0) {hx=hz;q++;} \
+  if(hx==0) { \
+    q &= 0x0000007f; \
+    *quo = (sxy ? -q : q); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  while(hx<0x00800000) { \
+    hx <<= 1;iy -= 1; \
+  } \
+  if(iy>= -126) { \
+    hx = ((hx-0x00800000)|((iy+127)<<23)); \
+  } else {\
+    n = -126 - iy; \
+    hx >>= n; \
+  } \
+fixup: \
+  if(hx<0x00800000){ \
+    GEN_OCL_GET_FLOAT_WORD(hy,y); \
+    hy &= 0x7fffffff; \
+    if(hx+hx > hy ||(hx+hx==hy && (q & 1)))q++; \
+    x = 0; \
+  }else{ \
+    y = __gen_ocl_fabs(y); \
+    if (y < 0x1p-125f) { \
+      if (x+x>y || (x+x==y && (q & 1))) { \
+        q++;x-=y; \
+      } \
+    }else if (x>0.5f*y || (x==0.5f*y && (q & 1))) { \
+      q++;x-=y; \
+    } \
+  } \
+  int sign = sx==sy?0:1; \
+  q &= 0x0000007f; \
+  *quo = (sign ? -q : q); \
+  return x;
+OVERLOADABLE float remquo(float x, float y, int *quo) {
+#undef BODY
+OVERLOADABLE float powr(float x, float y) {
+  unsigned int hx, sx, hy, sy;
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_pow(x,y);
+  else {
+    if (isnan(x) || isnan(y)) return NAN;
+    sx = (hx & 0x80000000) >> 31;
+    sy = (hy & 0x80000000) >> 31;
+    if ((hx&0x7fffffff) < 0x00800000) {	   /* x < 2**-126  */
+      x = 0.0f;/* Gen does not support subnormal number now */
+      hx = hx &0x80000000;
+    }
+    if ((hy&0x7fffffff) < 0x00800000) {	  /* y < 2**-126  */
+      y = 0.0;/* Gen does not support subnormal number now */
+      hy = hy &0x80000000;
+    }
+    // (x < 0) ** y = NAN (y!=0)
+    if ((sx && (hx & 0x7fffffff))) return NAN;
+    // +/-0 ** +/-0 = NAN
+    if ( !(hx&0x7fffffff) && !(hy&0x7fffffff)) return NAN;
+    // +inf ** +/-0 = NAN
+    if ( ((hx & 0x7f800000) ==0x7f800000) && !(hy&0x7fffffff)) return NAN;
+    // others except nan/inf/0 ** 0 = 1.0
+    if (!(hy&0x7fffffff)) return 1.0f;
+    // +1 ** inf = NAN; +1 ** finite = 1;
+    if (hx == 0x3f800000) {
+      return isinf(y) ? NAN : 1.0f;
+    }
+    if ( !(hx & 0x7fffffff)) {
+        // +/-0 ** y<0 = +inf
+        // +/-0 ** y>0 = +0
+      return sy ? INFINITY : 0.0f;
+    }
+    return __gen_ocl_internal_pow(x,y);
+  }
+OVERLOADABLE float pown(float x, int n) {
+  if (__ocl_math_fastpath_flag) {
+    if (x == 0.f && n == 0)
+      return 1.f;
+    if (x < 0.f && (n&1) )
+      return -powr(-x, n);
+    return powr(x, n);
+  } else {
+    int ix;
+    float sign = ix < 0 ? -1.0f : 1.0f;
+    if (x == 0.0f) x = sign * 0.0f;
+    return __gen_ocl_internal_pown(x, n);
+  }
+OVERLOADABLE float pow(float x, float y) {
+  if (!__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_pow(x,y);
+  else {
+    int n;
+    if (x == 0.f && y == 0.f)
+      return 1.f;
+    if (x >= 0.f)
+      return powr(x, y);
+    n = y;
+    if ((float)n == y)//is exact integer
+      return pown(x, n);
+    return NAN;
+  }
+OVERLOADABLE float rootn(float x, int n) {
+  float ax,re;
+  int sign = 0;
+  int hx;
+  if( n == 0 )return NAN;
+  // Gen does not support denorm, flush to zero
+  if ((hx & 0x7fffffff) < 0x00800000) {
+    x = hx < 0 ? -0.0f : 0.0f;
+  }
+  //rootn ( x, n )  returns a NaN for x < 0 and n is even.
+  if( x < 0 && 0 == (n&1) )
+    return NAN;
+  if( x == 0.0 ){
+    switch( n & 0x80000001 ){
+      //rootn ( +-0,  n ) is +0 for even n > 0.
+      case 0:
+        return 0.0f;
+      //rootn ( +-0,  n ) is +-0 for odd n > 0.
+      case 1:
+        return x;
+      //rootn ( +-0,  n ) is +inf for even n < 0.
+      case 0x80000000:
+        return INFINITY;
+      //rootn ( +-0,  n ) is +-inf for odd n < 0.
+      case 0x80000001:
+        return __gen_ocl_internal_copysign(INFINITY, x);
+    }
+  }
+  ax = __gen_ocl_fabs(x);
+  if(x <0.0f && (n&1))
+    sign = 1;
+  if (__ocl_math_fastpath_flag)
+    re = __gen_ocl_pow(ax, 1.f/n);
+  else
+    re = __gen_ocl_internal_pow(ax,1.f/n);
+  if(sign)
+    re = -re;
+  return re;
+OVERLOADABLE float fabs(float x) {
+  return __gen_ocl_internal_fabs(x);
+OVERLOADABLE float trunc(float x) {
+  return  __gen_ocl_internal_trunc(x);
+OVERLOADABLE float round(float x) {
+  return __gen_ocl_internal_round(x);
+OVERLOADABLE float floor(float x) {
+  return __gen_ocl_internal_floor(x);
+OVERLOADABLE float ceil(float x) {
+  return __gen_ocl_internal_ceil(x);
+OVERLOADABLE float log(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_log(x);
+  /* Use native instruction when it has enough precision */
+  if((x > 0x1.1p0) || (x <= 0))
+    return __gen_ocl_internal_fastpath_log(x);
+  return  __gen_ocl_internal_log(x);
+OVERLOADABLE float log2(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_log2(x);
+  /* Use native instruction when it has enough precision */
+  if((x > 0x1.1p0) || (x <= 0))
+    return __gen_ocl_internal_fastpath_log2(x);
+  return  __gen_ocl_internal_log2(x);
+OVERLOADABLE float log10(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_log10(x);
+  /* Use native instruction when it has enough precision */
+  if((x > 0x1.1p0) || (x <= 0))
+    return __gen_ocl_internal_fastpath_log10(x);
+  return  __gen_ocl_internal_log10(x);
+OVERLOADABLE float exp(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_exp(x);
+  /* Use native instruction when it has enough precision */
+  if (x > -0x1.6p1 && x < 0x1.6p1)
+    return __gen_ocl_internal_fastpath_exp(x);
+  return  __gen_ocl_internal_exp(x);
+OVERLOADABLE float exp2(float x) {
+  /* Use native instruction when it has enough precision, exp2 always */
+  return native_exp2(x);
+OVERLOADABLE float exp10(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_exp10(x);
+  return  __gen_ocl_internal_exp10(x);
+OVERLOADABLE float expm1(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_expm1(x);
+  return  __gen_ocl_internal_expm1(x);
+OVERLOADABLE float fmin(float a, float b) {
+  return __gen_ocl_internal_fmin(a, b);
+OVERLOADABLE float fmax(float a, float b) {
+  return __gen_ocl_internal_fmax(a, b);
+OVERLOADABLE float fma(float a, float b, float c) {
+  return mad(a, b, c);
+OVERLOADABLE float fdim(float x, float y) {
+  return __gen_ocl_internal_fdim(x, y);
+OVERLOADABLE float maxmag(float x, float y) {
+  return __gen_ocl_internal_maxmag(x, y);
+OVERLOADABLE float minmag(float x, float y) {
+  return __gen_ocl_internal_minmag(x, y);
+/* So far, the HW does not support half-float math functions.
+   We just do the conversion and call the float version here. */
+OVERLOADABLE half cospi(half x) {
+  float _x = (float)x;
+  return (half)cospi(_x);
+OVERLOADABLE half cosh(half x) {
+  float _x = (float)x;
+  return (half)cosh(_x);
+OVERLOADABLE half acos(half x) {
+  float _x = (float)x;
+  return (half)acos(_x);
+OVERLOADABLE float half_cos(float x) {
+  return (float)cos(x);
+OVERLOADABLE float half_divide(float x, float y) {
+  return (float)native_divide(x, y);
+OVERLOADABLE float half_exp(float x) {
+  return (float)native_exp(x);
+OVERLOADABLE float half_exp2(float x){
+  return (float)native_exp2(x);
+OVERLOADABLE float half_exp10(float x){
+  return (float)native_exp10(x);
+OVERLOADABLE float half_log(float x){
+  return (float)native_log(x);
+OVERLOADABLE float half_log2(float x){
+  return (float)native_log2(x);
+OVERLOADABLE float half_log10(float x){
+  return (float)native_log10(x);
+OVERLOADABLE float half_powr(float x, float y){
+  return (float)powr(x, y);
+OVERLOADABLE float half_recip(float x){
+  return (float)native_recip(x);
+OVERLOADABLE float half_rsqrt(float x){
+  return (float)native_rsqrt(x);
+OVERLOADABLE float half_sin(float x){
+  return (float)sin(x);
+OVERLOADABLE float half_sqrt(float x){
+  return (float)native_sqrt(x);
+OVERLOADABLE float half_tan(float x){
+  return (float)tan(x);
+OVERLOADABLE half acospi(half x) {
+  float _x = (float)x;
+  return (half)acospi(_x);
+OVERLOADABLE half acosh(half x) {
+  float _x = (float)x;
+  return (half)acosh(_x);
+OVERLOADABLE half sinpi(half x) {
+  float _x = (float)x;
+  return (half)sinpi(_x);
+OVERLOADABLE half sinh(half x) {
+  float _x = (float)x;
+  return (half)sinh(_x);
+OVERLOADABLE half asin(half x) {
+  float _x = (float)x;
+  return (half)asin(_x);
+OVERLOADABLE half asinpi(half x) {
+  float _x = (float)x;
+  return (half)asinpi(_x);
+OVERLOADABLE half asinh(half x) {
+  float _x = (float)x;
+  return (half)asinh(_x);
+OVERLOADABLE half tanpi(half x) {
+  float _x = (float)x;
+  return (half)tanpi(_x);
+OVERLOADABLE half tanh(half x) {
+  float _x = (float)x;
+  return (half)tanh(_x);
+OVERLOADABLE half atan(half x) {
+  float _x = (float)x;
+  return (half)atan(_x);
+/* Half-precision atan2: widen both operands to float, evaluate the float
+   atan2(y, x) overload, and narrow the result back to half. */
+OVERLOADABLE half atan2(half y, half x) {
+  float _x = (float)x;
+  float _y = (float)y;
+  /* Preserve the (y, x) argument order of the float overload; passing
+     (_x, _y) would compute the angle of the transposed point. */
+  return (half)atan2(_y, _x);
+/* Half-precision atan2pi: widen both operands to float, evaluate the float
+   atan2pi(y, x) overload, and narrow the result back to half. */
+OVERLOADABLE half atan2pi(half y, half x) {
+  float _x = (float)x;
+  float _y = (float)y;
+  /* Preserve the (y, x) argument order of the float overload; passing
+     (_x, _y) would compute the angle of the transposed point. */
+  return (half)atan2pi(_y, _x);
+OVERLOADABLE half atanpi(half x) {
+  float _x = (float)x;
+  return (half)atanpi(_x);
+OVERLOADABLE half atanh(half x) {
+  float _x = (float)x;
+  return (half)atanh(_x);
+OVERLOADABLE half cbrt(half x) {
+  float _x = (float)x;
+  return (half)cbrt(_x);
+OVERLOADABLE half rint(half x) {
+  float _x = (float)x;
+  return (half)rint(_x);
+OVERLOADABLE half copysign(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)copysign(_x, _y);
+OVERLOADABLE half erf(half x) {
+  float _x = (float)x;
+  return (half)erf(_x);
+OVERLOADABLE half erfc(half x) {
+  float _x = (float)x;
+  return (half)erfc(_x);
+OVERLOADABLE half fmod(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)fmod(_x, _y);
+OVERLOADABLE half remainder(half x, half p) {
+  float _x = (float)x;
+  float _p = (float)p;
+  return (half)remainder(_x, _p);
+OVERLOADABLE half ldexp(half x, int n) {
+  float _x = (float)x;
+  return (half)ldexp(_x, n);
+OVERLOADABLE half powr(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)powr(_x, _y);
+OVERLOADABLE half pow(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)pow(_x, _y);
+//pow is implemented above; it forwards to the float pow overload
+OVERLOADABLE half fabs(half x) {
+  float _x = (float)x;
+  return (half)fabs(_x);
+OVERLOADABLE half trunc(half x) {
+  float _x = (float)x;
+  return (half)trunc(_x);
+OVERLOADABLE half round(half x) {
+  float _x = (float)x;
+  return (half)round(_x);
+OVERLOADABLE half floor(half x) {
+  float _x = (float)x;
+  return (half)floor(_x);
+OVERLOADABLE half ceil(half x) {
+  float _x = (float)x;
+  return (half)ceil(_x);
+OVERLOADABLE half log(half x) {
+  float _x = (float)x;
+  return (half)log(_x);
+OVERLOADABLE half log2(half x) {
+  float _x = (float)x;
+  return (half)log2(_x);
+OVERLOADABLE half log10(half x) {
+  float _x = (float)x;
+  return (half)log10(_x);
+OVERLOADABLE half exp(half x) {
+  float _x = (float)x;
+  return (half)exp(_x);
+OVERLOADABLE half exp10(half x) {
+  float _x = (float)x;
+  return (half)exp10(_x);
+OVERLOADABLE half expm1(half x) {
+  float _x = (float)x;
+  return (half)expm1(_x);
+OVERLOADABLE half fmin(half a, half b) {
+  return __gen_ocl_internal_fmin(a, b);
+OVERLOADABLE half fmax(half a, half b) {
+  return __gen_ocl_internal_fmax(a, b);
+OVERLOADABLE half fma(half a, half b, half c) {
+  float _a = (float)a;
+  float _b = (float)b;
+  float _c = (float)c;
+  return (half)fma(_a, _b, _c);
+OVERLOADABLE half fdim(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)fdim(_x, _y);
+OVERLOADABLE half maxmag(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)maxmag(_x, _y);
+OVERLOADABLE half minmag(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)minmag(_x, _y);
+OVERLOADABLE half exp2(half x) {
+  float _x = (float)x;
+  return (half)exp2(_x);
+OVERLOADABLE half mad(half a, half b, half c) {
+  return __gen_ocl_mad(a,b,c);
+OVERLOADABLE half sin(half x) {
+  float _x = (float)x;
+  return (half)sin(_x);
+OVERLOADABLE half cos(half x) {
+  float _x = (float)x;
+  return (half)cos(_x);
+OVERLOADABLE half tan(half x) {
+  float _x = (float)x;
+  return (half)tan(_x);
+OVERLOADABLE half tgamma(half x) {
+  float _x = (float)x;
+  return (half)tgamma(_x);
+OVERLOADABLE half lgamma(half x) {
+  float _x = (float)x;
+  return (half)lgamma(_x);
+OVERLOADABLE half lgamma_r(half x, int *signgamp) {
+  float _x = (float)x;
+  return (half)lgamma_r(_x, signgamp);
+OVERLOADABLE half log1p(half x) {
+  float _x = (float)x;
+  return (half)log1p(_x);
+OVERLOADABLE half logb(half x) {
+  float _x = (float)x;
+  return (half)logb(_x);
+OVERLOADABLE int ilogb(half x) {
+  float _x = (float)x;
+  return ilogb(_x);
+OVERLOADABLE half nan(ushort code) {
+  return (half)NAN;
+OVERLOADABLE half sincos(half x, half *cosval) {
+  float _x = (float)x;
+  float _cosval;
+  half ret = (half)sincos(_x, &_cosval);
+  *cosval = (half)_cosval;
+  return ret;
+OVERLOADABLE half sqrt(half x) {
+  float _x = (float)x;
+  return (half)sqrt(_x);
+OVERLOADABLE half rsqrt(half x) {
+  float _x = (float)x;
+  return (half)rsqrt(_x);
+OVERLOADABLE half frexp(half x, int *exp) {
+  float _x = (float)x;
+  return (half)frexp(_x, exp);
+OVERLOADABLE half nextafter(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)nextafter(_x, _y);
+OVERLOADABLE half modf(half x, half *i) {
+  float _x = (float)x;
+  float _i;
+  half ret = (half)modf(_x, &_i);
+  *i = (half)_i;
+  return ret;
+OVERLOADABLE half hypot(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)hypot(_x, _y);
+OVERLOADABLE half fract(half x, half *p) {
+  float _x = (float)x;
+  float _p;
+  half ret = (half)fract(_x, &_p);
+  *p = (half)_p;
+  return ret;
+OVERLOADABLE half remquo(half x, half y, int *quo) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)remquo(_x, _y, quo);
+OVERLOADABLE half pown(half x, int n) {
+  float _x = (float)x;
+  return (half)pown(_x, n);
+OVERLOADABLE half rootn(half x, int n) {
+  float _x = (float)x;
+  return (half)rootn(_x, n);
diff --git a/backend/src/libocl/tmpl/ocl_math_20.tmpl.h b/backend/src/libocl/tmpl/ocl_math_20.tmpl.h
new file mode 100644
index 0000000..271075c
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_math_20.tmpl.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_MATH_20_H__
+#define __OCL_MATH_20_H__
+#include "ocl_types.h"
+OVERLOADABLE float cospi(float x);
+OVERLOADABLE float cosh(float x);
+OVERLOADABLE float acos(float x);
+OVERLOADABLE float acospi(float x);
+OVERLOADABLE float acosh(float x);
+OVERLOADABLE float sinpi(float x);
+OVERLOADABLE float sinh(float x);
+OVERLOADABLE float asin(float x);
+OVERLOADABLE float asinpi(float x);
+OVERLOADABLE float asinh(float x);
+OVERLOADABLE float tanpi(float x);
+OVERLOADABLE float tanh(float x);
+OVERLOADABLE float atan(float x);
+OVERLOADABLE float atan2(float y, float x);
+OVERLOADABLE float atan2pi(float y, float x);
+OVERLOADABLE float atanpi(float x);
+OVERLOADABLE float atanh(float x);
+OVERLOADABLE float cbrt(float x);
+OVERLOADABLE float rint(float x);
+OVERLOADABLE float copysign(float x, float y);
+OVERLOADABLE float erf(float x);
+OVERLOADABLE float erfc(float x);
+OVERLOADABLE float fmod (float x, float y);
+OVERLOADABLE float remainder(float x, float p);
+OVERLOADABLE float ldexp(float x, int n);
+OVERLOADABLE float powr(float x, float y);
+OVERLOADABLE float pow(float x, float y);
+//both pow and powr are declared above
+OVERLOADABLE float fabs(float x);
+OVERLOADABLE float trunc(float x);
+OVERLOADABLE float round(float x);
+OVERLOADABLE float floor(float x);
+OVERLOADABLE float ceil(float x);
+OVERLOADABLE float log(float x);
+OVERLOADABLE float log2(float x);
+OVERLOADABLE float log10(float x);
+OVERLOADABLE float exp(float x);
+OVERLOADABLE float exp10(float x);
+OVERLOADABLE float expm1(float x);
+OVERLOADABLE float fmin(float a, float b);
+OVERLOADABLE float fmax(float a, float b);
+OVERLOADABLE float fma(float a, float b, float c);
+OVERLOADABLE float fdim(float x, float y);
+OVERLOADABLE float maxmag(float x, float y);
+OVERLOADABLE float minmag(float x, float y);
+OVERLOADABLE float exp2(float x);
+OVERLOADABLE float mad(float a, float b, float c);
+OVERLOADABLE float sin(float x);
+OVERLOADABLE float cos(float x);
+OVERLOADABLE float tan(float x);
+OVERLOADABLE float tgamma(float x);
+OVERLOADABLE float lgamma(float x);
+OVERLOADABLE float lgamma_r(float x, int *signgamp);
+OVERLOADABLE float log1p(float x);
+OVERLOADABLE float logb(float x);
+OVERLOADABLE int ilogb(float x);
+OVERLOADABLE float nan(uint code);
+OVERLOADABLE float sincos(float x, float *cosval);
+OVERLOADABLE float sqrt(float x);
+OVERLOADABLE float rsqrt(float x);
+OVERLOADABLE float frexp(float x, int *exp);
+OVERLOADABLE float nextafter(float x, float y);
+OVERLOADABLE float modf(float x, float *i);
+OVERLOADABLE float hypot(float x, float y);
+OVERLOADABLE float fract(float x, float *p);
+OVERLOADABLE float remquo(float x, float y, int *quo);
+OVERLOADABLE float pown(float x, int n);
+OVERLOADABLE float rootn(float x, int n);
+// native
+OVERLOADABLE float native_cos(float x);
+OVERLOADABLE float native_divide(float x, float y);
+OVERLOADABLE float native_exp(float x);
+OVERLOADABLE float native_exp2(float x);
+OVERLOADABLE float native_exp10(float x);
+OVERLOADABLE float native_log(float x);
+OVERLOADABLE float native_log2(float x);
+OVERLOADABLE float native_log10(float x);
+OVERLOADABLE float native_powr(float x, float y);
+OVERLOADABLE float native_recip(float x);
+OVERLOADABLE float native_rsqrt(float x);
+OVERLOADABLE float native_sin(float x);
+OVERLOADABLE float native_sqrt(float x);
+OVERLOADABLE float native_tan(float x);
+// Half float version.
+OVERLOADABLE half cospi(half x);
+OVERLOADABLE half cosh(half x);
+OVERLOADABLE half acos(half x);
+OVERLOADABLE half acospi(half x);
+OVERLOADABLE half acosh(half x);
+OVERLOADABLE half sinpi(half x);
+OVERLOADABLE half sinh(half x);
+OVERLOADABLE half asin(half x);
+OVERLOADABLE half asinpi(half x);
+OVERLOADABLE half asinh(half x);
+OVERLOADABLE half tanpi(half x);
+OVERLOADABLE half tanh(half x);
+OVERLOADABLE half atan(half x);
+OVERLOADABLE half atan2(half y, half x);
+OVERLOADABLE half atan2pi(half y, half x);
+OVERLOADABLE half atanpi(half x);
+OVERLOADABLE half atanh(half x);
+OVERLOADABLE half cbrt(half x);
+OVERLOADABLE half rint(half x);
+OVERLOADABLE half copysign(half x, half y);
+OVERLOADABLE half erf(half x);
+OVERLOADABLE half erfc(half x);
+OVERLOADABLE half fmod (half x, half y);
+OVERLOADABLE half remainder(half x, half p);
+OVERLOADABLE half ldexp(half x, int n);
+OVERLOADABLE half powr(half x, half y);
+OVERLOADABLE half pow(half x, half y);
+//both pow and powr are declared above
+OVERLOADABLE half fabs(half x);
+OVERLOADABLE half trunc(half x);
+OVERLOADABLE half round(half x);
+OVERLOADABLE half floor(half x);
+OVERLOADABLE half ceil(half x);
+OVERLOADABLE half log(half x);
+OVERLOADABLE half log2(half x);
+OVERLOADABLE half log10(half x);
+OVERLOADABLE half exp(half x);
+OVERLOADABLE half exp10(half x);
+OVERLOADABLE half expm1(half x);
+OVERLOADABLE half fmin(half a, half b);
+OVERLOADABLE half fmax(half a, half b);
+OVERLOADABLE half fma(half a, half b, half c);
+OVERLOADABLE half fdim(half x, half y);
+OVERLOADABLE half maxmag(half x, half y);
+OVERLOADABLE half minmag(half x, half y);
+OVERLOADABLE half exp2(half x);
+OVERLOADABLE half mad(half a, half b, half c);
+OVERLOADABLE half sin(half x);
+OVERLOADABLE half cos(half x);
+OVERLOADABLE half tan(half x);
+OVERLOADABLE half tgamma(half x);
+OVERLOADABLE half lgamma(half x);
+OVERLOADABLE half lgamma_r(half x, int *signgamp);
+OVERLOADABLE half log1p(half x);
+OVERLOADABLE half logb(half x);
+OVERLOADABLE int ilogb(half x);
+OVERLOADABLE half nan(ushort code);
+OVERLOADABLE half sincos(half x, half *cosval);
+OVERLOADABLE half sqrt(half x);
+OVERLOADABLE half rsqrt(half x);
+OVERLOADABLE half frexp(half x, int *exp);
+OVERLOADABLE half nextafter(half x, half y);
+OVERLOADABLE half modf(half x, half *i);
+OVERLOADABLE half hypot(half x, half y);
+OVERLOADABLE half fract(half x, half *p);
+OVERLOADABLE half remquo(half x, half y, int *quo);
+OVERLOADABLE half pown(half x, int n);
+OVERLOADABLE half rootn(half x, int n);
+// native half
+OVERLOADABLE half native_cos(half x);
+OVERLOADABLE half native_divide(half x, half y);
+OVERLOADABLE half native_exp(half x);
+OVERLOADABLE half native_exp2(half x);
+OVERLOADABLE half native_exp10(half x);
+OVERLOADABLE half native_log(half x);
+OVERLOADABLE half native_log2(half x);
+OVERLOADABLE half native_log10(half x);
+OVERLOADABLE half native_powr(half x, half y);
+OVERLOADABLE half native_recip(half x);
+OVERLOADABLE half native_rsqrt(half x);
+OVERLOADABLE half native_sin(half x);
+OVERLOADABLE half native_sqrt(half x);
+OVERLOADABLE half native_tan(half x);
+// half accuracy
+OVERLOADABLE float half_cos(float x);
+OVERLOADABLE float half_divide(float x, float y);
+OVERLOADABLE float half_exp(float x);
+OVERLOADABLE float half_exp2(float x);
+OVERLOADABLE float half_exp10(float x);
+OVERLOADABLE float half_log(float x);
+OVERLOADABLE float half_log2(float x);
+OVERLOADABLE float half_log10(float x);
+OVERLOADABLE float half_powr(float x, float y);
+OVERLOADABLE float half_recip(float x);
+OVERLOADABLE float half_rsqrt(float x);
+OVERLOADABLE float half_sin(float x);
+OVERLOADABLE float half_sqrt(float x);
+OVERLOADABLE float half_tan(float x);
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 8e22015..97e33fe 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -38,17 +38,9 @@ uint get_sub_group_size(void)
 /* broadcast */
-    OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, size_t local_id); \
-    OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, size_t local_id) { \
+    OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, uint local_id); \
+    OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, uint local_id) { \
       return __gen_ocl_sub_group_broadcast(a, local_id); \
-    } \
-    OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y); \
-    OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y) { \
-      return __gen_ocl_sub_group_broadcast(a, local_id_x, local_id_y);  \
-    } \
-    OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z); \
-    OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z) { \
-      return __gen_ocl_sub_group_broadcast(a, local_id_x, local_id_y, local_id_z); \
@@ -58,8 +50,17 @@ BROADCAST_IMPL(ulong)
+OVERLOADABLE short intel_sub_group_broadcast(short a, uint local_id) {
+  return __gen_ocl_sub_group_broadcast(a, local_id);
+OVERLOADABLE ushort intel_sub_group_broadcast(ushort a, uint local_id) {
+  return __gen_ocl_sub_group_broadcast(a, local_id);
     OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_##RANGE##_##OP(bool sign, GEN_TYPE x); \
@@ -75,6 +76,8 @@ RANGE_OP(reduce, add, ulong, false)
 RANGE_OP(reduce, add, half, true)
 RANGE_OP(reduce, add, float, true)
 RANGE_OP(reduce, add, double, true)
+RANGE_OP(reduce, add, short, true)
+RANGE_OP(reduce, add, ushort, false)
 /* reduce min */
 RANGE_OP(reduce, min, int, true)
 RANGE_OP(reduce, min, uint, false)
@@ -83,6 +86,8 @@ RANGE_OP(reduce, min, ulong, false)
 RANGE_OP(reduce, min, half, true)
 RANGE_OP(reduce, min, float, true)
 RANGE_OP(reduce, min, double, true)
+RANGE_OP(reduce, min, short, true)
+RANGE_OP(reduce, min, ushort, false)
 /* reduce max */
 RANGE_OP(reduce, max, int, true)
 RANGE_OP(reduce, max, uint, false)
@@ -91,6 +96,8 @@ RANGE_OP(reduce, max, ulong, false)
 RANGE_OP(reduce, max, half, true)
 RANGE_OP(reduce, max, float, true)
 RANGE_OP(reduce, max, double, true)
+RANGE_OP(reduce, max, short, true)
+RANGE_OP(reduce, max, ushort, false)
 /* scan_inclusive add */
 RANGE_OP(scan_inclusive, add, int, true)
@@ -100,6 +107,8 @@ RANGE_OP(scan_inclusive, add, ulong, false)
 RANGE_OP(scan_inclusive, add, half, true)
 RANGE_OP(scan_inclusive, add, float, true)
 RANGE_OP(scan_inclusive, add, double, true)
+RANGE_OP(scan_inclusive, add, short, true)
+RANGE_OP(scan_inclusive, add, ushort, false)
 /* scan_inclusive min */
 RANGE_OP(scan_inclusive, min, int, true)
 RANGE_OP(scan_inclusive, min, uint, false)
@@ -108,6 +117,8 @@ RANGE_OP(scan_inclusive, min, ulong, false)
 RANGE_OP(scan_inclusive, min, half, true)
 RANGE_OP(scan_inclusive, min, float, true)
 RANGE_OP(scan_inclusive, min, double, true)
+RANGE_OP(scan_inclusive, min, short, true)
+RANGE_OP(scan_inclusive, min, ushort, false)
 /* scan_inclusive max */
 RANGE_OP(scan_inclusive, max, int, true)
 RANGE_OP(scan_inclusive, max, uint, false)
@@ -116,6 +127,8 @@ RANGE_OP(scan_inclusive, max, ulong, false)
 RANGE_OP(scan_inclusive, max, half, true)
 RANGE_OP(scan_inclusive, max, float, true)
 RANGE_OP(scan_inclusive, max, double, true)
+RANGE_OP(scan_inclusive, max, short, true)
+RANGE_OP(scan_inclusive, max, ushort, false)
 /* scan_exclusive add */
 RANGE_OP(scan_exclusive, add, int, true)
@@ -125,6 +138,8 @@ RANGE_OP(scan_exclusive, add, ulong, false)
 RANGE_OP(scan_exclusive, add, half, true)
 RANGE_OP(scan_exclusive, add, float, true)
 RANGE_OP(scan_exclusive, add, double, true)
+RANGE_OP(scan_exclusive, add, short, true)
+RANGE_OP(scan_exclusive, add, ushort, false)
 /* scan_exclusive min */
 RANGE_OP(scan_exclusive, min, int, true)
 RANGE_OP(scan_exclusive, min, uint, false)
@@ -133,6 +148,8 @@ RANGE_OP(scan_exclusive, min, ulong, false)
 RANGE_OP(scan_exclusive, min, half, true)
 RANGE_OP(scan_exclusive, min, float, true)
 RANGE_OP(scan_exclusive, min, double, true)
+RANGE_OP(scan_exclusive, min, short, true)
+RANGE_OP(scan_exclusive, min, ushort, false)
 /* scan_exclusive max */
 RANGE_OP(scan_exclusive, max, int, true)
 RANGE_OP(scan_exclusive, max, uint, false)
@@ -141,92 +158,267 @@ RANGE_OP(scan_exclusive, max, ulong, false)
 RANGE_OP(scan_exclusive, max, half, true)
 RANGE_OP(scan_exclusive, max, float, true)
 RANGE_OP(scan_exclusive, max, double, true)
+RANGE_OP(scan_exclusive, max, short, true)
+RANGE_OP(scan_exclusive, max, ushort, false)
 #undef RANGE_OP
-PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p);
-PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global uint* p);
-PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global uint* p);
-PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global uint* p);
+    OVERLOADABLE GEN_TYPE intel_sub_group_##RANGE##_##OP(GEN_TYPE x) { \
+      return __gen_ocl_sub_group_##RANGE##_##OP(SIGN, x);  \
+    }
+INTEL_RANGE_OP(reduce, add, short, true)
+INTEL_RANGE_OP(reduce, add, ushort, false)
+INTEL_RANGE_OP(reduce, min, short, true)
+INTEL_RANGE_OP(reduce, min, ushort, false)
+INTEL_RANGE_OP(reduce, max, short, true)
+INTEL_RANGE_OP(reduce, max, ushort, false)
+INTEL_RANGE_OP(scan_inclusive, add, short, true)
+INTEL_RANGE_OP(scan_inclusive, add, ushort, false)
+INTEL_RANGE_OP(scan_inclusive, min, short, true)
+INTEL_RANGE_OP(scan_inclusive, min, ushort, false)
+INTEL_RANGE_OP(scan_inclusive, max, short, true)
+INTEL_RANGE_OP(scan_inclusive, max, ushort, false)
+INTEL_RANGE_OP(scan_exclusive, add, short, true)
+INTEL_RANGE_OP(scan_exclusive, add, ushort, false)
+INTEL_RANGE_OP(scan_exclusive, min, short, true)
+INTEL_RANGE_OP(scan_exclusive, min, ushort, false)
+INTEL_RANGE_OP(scan_exclusive, max, short, true)
+INTEL_RANGE_OP(scan_exclusive, max, ushort, false)
+PURE CONST uint __gen_ocl_sub_group_block_read_ui_mem(const global uint* p);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_mem2(const global uint* p);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_mem4(const global uint* p);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_mem8(const global uint* p);
 OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
-  return __gen_ocl_sub_group_block_read_mem(p);
+  return __gen_ocl_sub_group_block_read_ui_mem(p);
 OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
-  return __gen_ocl_sub_group_block_read_mem2(p);
+  return __gen_ocl_sub_group_block_read_ui_mem2(p);
 OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
-  return __gen_ocl_sub_group_block_read_mem4(p);
+  return __gen_ocl_sub_group_block_read_ui_mem4(p);
 OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
-  return __gen_ocl_sub_group_block_read_mem8(p);
+  return __gen_ocl_sub_group_block_read_ui_mem8(p);
+OVERLOADABLE uint intel_sub_group_block_read_ui(const global uint* p)
+  return __gen_ocl_sub_group_block_read_ui_mem(p);
-void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint data);
-void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2 data);
-void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4 data);
-void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8 data);
-OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint data)
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p)
-  __gen_ocl_sub_group_block_write_mem(p, data);
+  return __gen_ocl_sub_group_block_read_ui_mem2(p);
-OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data)
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p)
-  __gen_ocl_sub_group_block_write_mem2(p, data);
+  return __gen_ocl_sub_group_block_read_ui_mem4(p);
-OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data)
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p)
-  __gen_ocl_sub_group_block_write_mem4(p, data);
+  return __gen_ocl_sub_group_block_read_ui_mem8(p);
+void __gen_ocl_sub_group_block_write_ui_mem(global uint* p, uint data);
+void __gen_ocl_sub_group_block_write_ui_mem2(global uint* p, uint2 data);
+void __gen_ocl_sub_group_block_write_ui_mem4(global uint* p, uint4 data);
+void __gen_ocl_sub_group_block_write_ui_mem8(global uint* p, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(global uint* p, uint data)
+  __gen_ocl_sub_group_block_write_ui_mem(p, data);
+OVERLOADABLE void intel_sub_group_block_write2(global uint* p, uint2 data)
+  __gen_ocl_sub_group_block_write_ui_mem2(p, data);
-OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data)
+OVERLOADABLE void intel_sub_group_block_write4(global uint* p,uint4 data)
-  __gen_ocl_sub_group_block_write_mem8(p, data);
+  __gen_ocl_sub_group_block_write_ui_mem4(p, data);
+OVERLOADABLE void intel_sub_group_block_write8(global uint* p,uint8 data)
+  __gen_ocl_sub_group_block_write_ui_mem8(p, data);
+OVERLOADABLE void intel_sub_group_block_write_ui(global uint* p, uint data)
+  __gen_ocl_sub_group_block_write_ui_mem(p, data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(global uint* p, uint2 data)
+  __gen_ocl_sub_group_block_write_ui_mem2(p, data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(global uint* p,uint4 data)
+  __gen_ocl_sub_group_block_write_ui_mem4(p, data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(global uint* p,uint8 data)
+  __gen_ocl_sub_group_block_write_ui_mem8(p, data);
-PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y);
-PURE CONST uint2 __gen_ocl_sub_group_block_read_image2(image2d_t p, int x, int y);
-PURE CONST uint4 __gen_ocl_sub_group_block_read_image4(image2d_t p, int x, int y);
-PURE CONST uint8 __gen_ocl_sub_group_block_read_image8(image2d_t p, int x, int y);
+PURE CONST uint __gen_ocl_sub_group_block_read_ui_image(image2d_t p, int x, int y);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_image2(image2d_t p, int x, int y);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_image4(image2d_t p, int x, int y);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_image8(image2d_t p, int x, int y);
 OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord)
-  return __gen_ocl_sub_group_block_read_image(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
 OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord)
-  return __gen_ocl_sub_group_block_read_image2(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
 OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord)
-  return __gen_ocl_sub_group_block_read_image4(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
 OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord)
-  return __gen_ocl_sub_group_block_read_image8(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
-void __gen_ocl_sub_group_block_write_image(image2d_t p, int x, int y, uint data);
-void __gen_ocl_sub_group_block_write_image2(image2d_t p, int x, int y, uint2 data);
-void __gen_ocl_sub_group_block_write_image4(image2d_t p, int x, int y, uint4 data);
-void __gen_ocl_sub_group_block_write_image8(image2d_t p, int x, int y, uint8 data);
+OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t p, int2 cord)
+  return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t p, int2 cord)
+  return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t p, int2 cord)
+  return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t p, int2 cord)
+  return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
+void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, uint data);
+void __gen_ocl_sub_group_block_write_ui_image2(image2d_t p, int x, int y, uint2 data);
+void __gen_ocl_sub_group_block_write_ui_image4(image2d_t p, int x, int y, uint4 data);
+void __gen_ocl_sub_group_block_write_ui_image8(image2d_t p, int x, int y, uint8 data);
 OVERLOADABLE void intel_sub_group_block_write(image2d_t p, int2 cord, uint data)
-  __gen_ocl_sub_group_block_write_image(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
 OVERLOADABLE void intel_sub_group_block_write2(image2d_t p, int2 cord, uint2 data)
-  __gen_ocl_sub_group_block_write_image2(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
 OVERLOADABLE void intel_sub_group_block_write4(image2d_t p, int2 cord, uint4 data)
-  __gen_ocl_sub_group_block_write_image4(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
 OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 data)
-  __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
+OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t p, int2 cord, uint data)
+  __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t p, int2 cord, uint2 data)
+  __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t p, int2 cord, uint4 data)
+  __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t p, int2 cord, uint8 data)
+  __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
+PURE CONST ushort __gen_ocl_sub_group_block_read_us_mem(const global ushort* p);
+PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_mem2(const global ushort* p);
+PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_mem4(const global ushort* p);
+PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_mem8(const global ushort* p);
+OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p)
+  return __gen_ocl_sub_group_block_read_us_mem(p);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p)
+  return __gen_ocl_sub_group_block_read_us_mem2(p);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p)
+  return __gen_ocl_sub_group_block_read_us_mem4(p);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p)
+  return __gen_ocl_sub_group_block_read_us_mem8(p);
+void __gen_ocl_sub_group_block_write_us_mem(global ushort* p, ushort data);
+void __gen_ocl_sub_group_block_write_us_mem2(global ushort* p, ushort2 data);
+void __gen_ocl_sub_group_block_write_us_mem4(global ushort* p, ushort4 data);
+void __gen_ocl_sub_group_block_write_us_mem8(global ushort* p, ushort8 data);
+OVERLOADABLE void intel_sub_group_block_write_us(global ushort* p, ushort data)
+  __gen_ocl_sub_group_block_write_us_mem(p, data);
+OVERLOADABLE void intel_sub_group_block_write_us2(global ushort* p, ushort2 data)
+  __gen_ocl_sub_group_block_write_us_mem2(p, data);
+OVERLOADABLE void intel_sub_group_block_write_us4(global ushort* p,ushort4 data)
+  __gen_ocl_sub_group_block_write_us_mem4(p, data);
+OVERLOADABLE void intel_sub_group_block_write_us8(global ushort* p,ushort8 data)
+  __gen_ocl_sub_group_block_write_us_mem8(p, data);
+PURE CONST ushort __gen_ocl_sub_group_block_read_us_image(image2d_t p, int x, int y);
+PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_image2(image2d_t p, int x, int y);
+PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_image4(image2d_t p, int x, int y);
+PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_image8(image2d_t p, int x, int y);
+OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t p, int2 cord)
+  return __gen_ocl_sub_group_block_read_us_image(p, cord.x, cord.y);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t p, int2 cord)
+  return __gen_ocl_sub_group_block_read_us_image2(p, cord.x, cord.y);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t p, int2 cord)
+  return __gen_ocl_sub_group_block_read_us_image4(p, cord.x, cord.y);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t p, int2 cord)
+  return __gen_ocl_sub_group_block_read_us_image8(p, cord.x, cord.y);
+void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, ushort data);
+void __gen_ocl_sub_group_block_write_us_image2(image2d_t p, int x, int y, ushort2 data);
+void __gen_ocl_sub_group_block_write_us_image4(image2d_t p, int x, int y, ushort4 data);
+void __gen_ocl_sub_group_block_write_us_image8(image2d_t p, int x, int y, ushort8 data);
+OVERLOADABLE void intel_sub_group_block_write_us(image2d_t p, int2 cord, ushort data)
+  __gen_ocl_sub_group_block_write_us_image(p, cord.x, cord.y, data);
+OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t p, int2 cord, ushort2 data)
+  __gen_ocl_sub_group_block_write_us_image2(p, cord.x, cord.y, data);
+OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t p, int2 cord, ushort4 data)
+  __gen_ocl_sub_group_block_write_us_image4(p, cord.x, cord.y, data);
+OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t p, int2 cord, ushort8 data)
+  __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, data);
 OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
   TYPE res0, res1; \
@@ -238,6 +430,8 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
 #define SHUFFLE_UP(TYPE) \
@@ -251,6 +445,8 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) { \
 #undef SHUFFLE_UP
 #define SHUFFLE_XOR(TYPE) \
 OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
@@ -259,4 +455,6 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index ae3b379..608551b 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -35,30 +35,18 @@ uint get_sub_group_id(void);
 uint get_sub_group_local_id(void);
 /* broadcast */
-OVERLOADABLE int sub_group_broadcast(int a, size_t local_id);
-OVERLOADABLE uint sub_group_broadcast(uint a, size_t local_id);
-OVERLOADABLE long sub_group_broadcast(long a, size_t local_id);
-OVERLOADABLE ulong sub_group_broadcast(ulong a, size_t local_id);
-OVERLOADABLE half sub_group_broadcast(half a, size_t local_id);
-OVERLOADABLE float sub_group_broadcast(float a, size_t local_id);
-OVERLOADABLE double sub_group_broadcast(double a, size_t local_id);
-OVERLOADABLE int sub_group_broadcast(int a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE uint sub_group_broadcast(uint a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE long sub_group_broadcast(long a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE ulong sub_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE half sub_group_broadcast(half a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE float sub_group_broadcast(float a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE double sub_group_broadcast(double a, size_t local_id_x, size_t local_id_y);
-OVERLOADABLE int sub_group_broadcast(int a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE uint sub_group_broadcast(uint a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE long sub_group_broadcast(long a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE ulong sub_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE half sub_group_broadcast(half a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE float sub_group_broadcast(float a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
-OVERLOADABLE double sub_group_broadcast(double a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE int sub_group_broadcast(int a,uint local_id);
+OVERLOADABLE uint sub_group_broadcast(uint a, uint local_id);
+OVERLOADABLE long sub_group_broadcast(long a, uint local_id);
+OVERLOADABLE ulong sub_group_broadcast(ulong a, uint local_id);
+OVERLOADABLE half sub_group_broadcast(half a, uint local_id);
+OVERLOADABLE float sub_group_broadcast(float a, uint local_id);
+OVERLOADABLE double sub_group_broadcast(double a, uint local_id);
+OVERLOADABLE short sub_group_broadcast(short a,uint local_id);
+OVERLOADABLE ushort sub_group_broadcast(ushort a, uint local_id);
+OVERLOADABLE short intel_sub_group_broadcast(short a, uint local_id);
+OVERLOADABLE ushort intel_sub_group_broadcast(ushort a, uint local_id);
 /* reduce add */
 OVERLOADABLE int sub_group_reduce_add(int x);
 OVERLOADABLE uint sub_group_reduce_add(uint x);
@@ -67,6 +55,10 @@ OVERLOADABLE ulong sub_group_reduce_add(ulong x);
 OVERLOADABLE half sub_group_reduce_add(half x);
 OVERLOADABLE float sub_group_reduce_add(float x);
 OVERLOADABLE double sub_group_reduce_add(double x);
+OVERLOADABLE short sub_group_reduce_add(short x);
+OVERLOADABLE ushort sub_group_reduce_add(ushort x);
+OVERLOADABLE short intel_sub_group_reduce_add(short x);
+OVERLOADABLE ushort intel_sub_group_reduce_add(ushort x);
 /* reduce min */
 OVERLOADABLE int sub_group_reduce_min(int x);
@@ -76,6 +68,10 @@ OVERLOADABLE ulong sub_group_reduce_min(ulong x);
 OVERLOADABLE half sub_group_reduce_min(half x);
 OVERLOADABLE float sub_group_reduce_min(float x);
 OVERLOADABLE double sub_group_reduce_min(double x);
+OVERLOADABLE short sub_group_reduce_min(short x);
+OVERLOADABLE ushort sub_group_reduce_min(ushort x);
+OVERLOADABLE short intel_sub_group_reduce_min(short x);
+OVERLOADABLE ushort intel_sub_group_reduce_min(ushort x);
 /* reduce max */
 OVERLOADABLE int sub_group_reduce_max(int x);
@@ -85,6 +81,10 @@ OVERLOADABLE ulong sub_group_reduce_max(ulong x);
 OVERLOADABLE half sub_group_reduce_max(half x);
 OVERLOADABLE float sub_group_reduce_max(float x);
 OVERLOADABLE double sub_group_reduce_max(double x);
+OVERLOADABLE short sub_group_reduce_max(short x);
+OVERLOADABLE ushort sub_group_reduce_max(ushort x);
+OVERLOADABLE short intel_sub_group_reduce_max(short x);
+OVERLOADABLE ushort intel_sub_group_reduce_max(ushort x);
 /* scan_inclusive add */
 OVERLOADABLE int sub_group_scan_inclusive_add(int x);
@@ -94,6 +94,10 @@ OVERLOADABLE ulong sub_group_scan_inclusive_add(ulong x);
 OVERLOADABLE half sub_group_scan_inclusive_add(half x);
 OVERLOADABLE float sub_group_scan_inclusive_add(float x);
 OVERLOADABLE double sub_group_scan_inclusive_add(double x);
+OVERLOADABLE short sub_group_scan_inclusive_add(short x);
+OVERLOADABLE ushort sub_group_scan_inclusive_add(ushort x);
+OVERLOADABLE short intel_sub_group_scan_inclusive_add(short x);
+OVERLOADABLE ushort intel_sub_group_scan_inclusive_add(ushort x);
 /* scan_inclusive min */
 OVERLOADABLE int sub_group_scan_inclusive_min(int x);
@@ -103,6 +107,10 @@ OVERLOADABLE ulong sub_group_scan_inclusive_min(ulong x);
 OVERLOADABLE half sub_group_scan_inclusive_min(half x);
 OVERLOADABLE float sub_group_scan_inclusive_min(float x);
 OVERLOADABLE double sub_group_scan_inclusive_min(double x);
+OVERLOADABLE short sub_group_scan_inclusive_min(short x);
+OVERLOADABLE ushort sub_group_scan_inclusive_min(ushort x);
+OVERLOADABLE short intel_sub_group_scan_inclusive_min(short x);
+OVERLOADABLE ushort intel_sub_group_scan_inclusive_min(ushort x);
 /* scan_inclusive max */
 OVERLOADABLE int sub_group_scan_inclusive_max(int x);
@@ -112,6 +120,10 @@ OVERLOADABLE ulong sub_group_scan_inclusive_max(ulong x);
 OVERLOADABLE half sub_group_scan_inclusive_max(half x);
 OVERLOADABLE float sub_group_scan_inclusive_max(float x);
 OVERLOADABLE double sub_group_scan_inclusive_max(double x);
+OVERLOADABLE short sub_group_scan_inclusive_max(short x);
+OVERLOADABLE ushort sub_group_scan_inclusive_max(ushort x);
+OVERLOADABLE short intel_sub_group_scan_inclusive_max(short x);
+OVERLOADABLE ushort intel_sub_group_scan_inclusive_max(ushort x);
 /* scan_exclusive add */
 OVERLOADABLE int sub_group_scan_exclusive_add(int x);
@@ -121,6 +133,10 @@ OVERLOADABLE ulong sub_group_scan_exclusive_add(ulong x);
 OVERLOADABLE half sub_group_scan_exclusive_add(half x);
 OVERLOADABLE float sub_group_scan_exclusive_add(float x);
 OVERLOADABLE double sub_group_scan_exclusive_add(double x);
+OVERLOADABLE short sub_group_scan_exclusive_add(short x);
+OVERLOADABLE ushort sub_group_scan_exclusive_add(ushort x);
+OVERLOADABLE short intel_sub_group_scan_exclusive_add(short x);
+OVERLOADABLE ushort intel_sub_group_scan_exclusive_add(ushort x);
 /* scan_exclusive min */
 OVERLOADABLE int sub_group_scan_exclusive_min(int x);
@@ -130,6 +146,10 @@ OVERLOADABLE ulong sub_group_scan_exclusive_min(ulong x);
 OVERLOADABLE half sub_group_scan_exclusive_min(half x);
 OVERLOADABLE float sub_group_scan_exclusive_min(float x);
 OVERLOADABLE double sub_group_scan_exclusive_min(double x);
+OVERLOADABLE short sub_group_scan_exclusive_min(short x);
+OVERLOADABLE ushort sub_group_scan_exclusive_min(ushort x);
+OVERLOADABLE short intel_sub_group_scan_exclusive_min(short x);
+OVERLOADABLE ushort intel_sub_group_scan_exclusive_min(ushort x);
 /* scan_exclusive max */
 OVERLOADABLE int sub_group_scan_exclusive_max(int x);
@@ -139,21 +159,36 @@ OVERLOADABLE ulong sub_group_scan_exclusive_max(ulong x);
 OVERLOADABLE half sub_group_scan_exclusive_max(half x);
 OVERLOADABLE float sub_group_scan_exclusive_max(float x);
 OVERLOADABLE double sub_group_scan_exclusive_max(double x);
+OVERLOADABLE short sub_group_scan_exclusive_max(short x);
+OVERLOADABLE ushort sub_group_scan_exclusive_max(ushort x);
+OVERLOADABLE short intel_sub_group_scan_exclusive_max(short x);
+OVERLOADABLE ushort intel_sub_group_scan_exclusive_max(ushort x);
 /* shuffle */
 OVERLOADABLE half intel_sub_group_shuffle(half x, uint c);
 OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
 OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+OVERLOADABLE short intel_sub_group_shuffle(short x, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle(ushort x, uint c);
 OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c);
 OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c);
+OVERLOADABLE short intel_sub_group_shuffle_down(short x, short y, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle_down(ushort x, ushort y, uint c);
 OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c);
 OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c);
+OVERLOADABLE short intel_sub_group_shuffle_up(short x, short y, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle_up(ushort x, ushort y, uint c);
 OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
 OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);
+OVERLOADABLE short intel_sub_group_shuffle_xor(short x, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle_xor(ushort x, uint c);
 /* blocak read/write */
 OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
@@ -161,10 +196,10 @@ OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
 OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
 OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);
-OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data);
-OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data);
-OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data);
-OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(__global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write2(__global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write4(__global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write8(__global uint* p, uint8 data);
 OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord);
 OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord);
@@ -175,3 +210,43 @@ OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2 byte_coord,
 OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2 byte_coord, uint2 data);
 OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2 byte_coord, uint4 data);
 OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2 byte_coord, uint8 data);
+OVERLOADABLE uint intel_sub_group_block_read_ui(const global uint* p);
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p);
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p);
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p);
+OVERLOADABLE void intel_sub_group_block_write_ui(__global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(__global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(__global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(__global uint* p, uint8 data);
+OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t image, int2 byte_coord);
+OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t image, int2 byte_coord, uint data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t image, int2 byte_coord, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t image, int2 byte_coord, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t image, int2 byte_coord, uint8 data);
+OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p);
+OVERLOADABLE void intel_sub_group_block_write_us(__global ushort* p, ushort data);
+OVERLOADABLE void intel_sub_group_block_write_us2(__global ushort* p, ushort2 data);
+OVERLOADABLE void intel_sub_group_block_write_us4(__global ushort* p, ushort4 data);
+OVERLOADABLE void intel_sub_group_block_write_us8(__global ushort* p, ushort8 data);
+OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t image, int2 byte_coord);
+OVERLOADABLE void intel_sub_group_block_write_us(image2d_t image, int2 byte_coord, ushort data);
+OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t image, int2 byte_coord, ushort2 data);
+OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t image, int2 byte_coord, ushort4 data);
+OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t image, int2 byte_coord, ushort8 data);
diff --git a/backend/src/llvm/ExpandLargeIntegers.cpp b/backend/src/llvm/ExpandLargeIntegers.cpp
index 1ee294f..60740f5 100644
--- a/backend/src/llvm/ExpandLargeIntegers.cpp
+++ b/backend/src/llvm/ExpandLargeIntegers.cpp
@@ -324,12 +324,14 @@ static Value *buildVectorOrScalar(ConversionState &State, IRBuilder<> &IRB, Smal
     Value * vec = NULL;
     unsigned ElemNo = Elements.size();
     Type *ElemTy = Elements[0]->getType();
+    // If it is an illegal integer type, these instructions will be split
+    // further, which is why these temporary values should be erased.
     bool KeepInsert = isLegalBitSize(ElemTy->getPrimitiveSizeInBits() * ElemNo);
     for (unsigned i = 0; i < ElemNo; ++i) {
       Value *tmp = vec ? vec : UndefValue::get(VectorType::get(ElemTy, ElemNo));
       Value *idx = ConstantInt::get(IntTy, i);
       vec = IRB.CreateInsertElement(tmp, Elements[i], idx);
-      if (!KeepInsert) {
+      if (!KeepInsert && !isa<Constant>(vec)) {
@@ -436,6 +438,7 @@ static void convertInstruction(Instruction *Inst, ConversionState &State,
       State.recordConverted(Trunc, NewInst);
     } else {
       TypePair Tys = getExpandedIntTypes(Trunc->getType());
+      (void) OpTys;
       assert(Tys.Lo == OpTys.Lo);
       Value *Lo = Ops.Lo;
       Value *Hi = IRB.CreateTrunc(Ops.Hi, Tys.Hi, Twine(Name, ".hi"));
diff --git a/backend/src/llvm/PromoteIntegers.cpp b/backend/src/llvm/PromoteIntegers.cpp
index adba004..a500311 100644
--- a/backend/src/llvm/PromoteIntegers.cpp
+++ b/backend/src/llvm/PromoteIntegers.cpp
@@ -151,6 +151,7 @@ static Value *convertConstant(Constant *C, bool SignExt=false) {
   } else {
     errs() << "Value: " << *C << "\n";
     report_fatal_error("Unexpected constant value");
+    return NULL;
diff --git a/backend/src/llvm/StripAttributes.cpp b/backend/src/llvm/StripAttributes.cpp
index 3bf3853..9d07c29 100644
--- a/backend/src/llvm/StripAttributes.cpp
+++ b/backend/src/llvm/StripAttributes.cpp
@@ -89,10 +89,12 @@ namespace {
 char StripAttributes::ID = 0;
 bool StripAttributes::runOnFunction(Function &Func) {
-  if (!gbe::isKernelFunction(Func))
-    Func.addFnAttr(Attribute::AlwaysInline);
+  if (!gbe::isKernelFunction(Func)) {
+    Func.addFnAttr(Attribute::AlwaysInline);
+    Func.setLinkage(GlobalValue::LinkOnceAnyLinkage);
+  }
   for (Function::iterator BB = Func.begin(), E = Func.end();
        BB != E; ++BB) {
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index a3f9886..89d5e7c 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -26,18 +26,23 @@
 #include "src/GBEConfig.h"
 #include "llvm_includes.hpp"
 #include "llvm/llvm_gen_backend.hpp"
+#include "ir/unit.hpp"
 using namespace llvm;
 namespace gbe
-  static Module* createOclBitCodeModule(LLVMContext& ctx, bool strictMath)
+  static Module* createOclBitCodeModule(LLVMContext& ctx,
+                                                 bool strictMath,
+                                                 uint32_t oclVersion)
-    std::string bitCodeFiles = OCL_BITCODE_LIB_PATH;
+    std::string bitCodeFiles = oclVersion >= 200 ?
+                               OCL_BITCODE_LIB_20_PATH : OCL_BITCODE_LIB_PATH;
     if(bitCodeFiles == "")
-      bitCodeFiles = OCL_BITCODE_BIN;
+      bitCodeFiles = oclVersion >= 200 ? OCL_BITCODE_BIN_20 : OCL_BITCODE_BIN;
     std::istringstream bitCodeFilePath(bitCodeFiles);
     std::string FilePath;
     bool findBC = false;
@@ -86,11 +91,11 @@ namespace gbe
         llvm::Function * callFunc = call->getCalledFunction();
-        if(!callFunc) {
-          continue;
-        }
+        //if(!callFunc) {
+        //  continue;
+        //}
-        if (callFunc->getIntrinsicID() != 0)
+        if (callFunc && callFunc->getIntrinsicID() != 0)
         std::string fnName = call->getCalledValue()->stripPointerCasts()->getName();
@@ -135,12 +140,16 @@ namespace gbe
-  Module* runBitCodeLinker(Module *mod, bool strictMath)
+  Module* runBitCodeLinker(Module *mod, bool strictMath, ir::Unit &unit)
     LLVMContext& ctx = mod->getContext();
     std::set<std::string> materializedFuncs;
     std::vector<GlobalValue *> Gvs;
-    Module* clonedLib = createOclBitCodeModule(ctx, strictMath);
+    uint32_t oclVersion = getModuleOclVersion(mod);
+    ir::PointerSize size = oclVersion >= 200 ? ir::POINTER_64_BITS : ir::POINTER_32_BITS;
+    unit.setPointerSize(size);
+    Module* clonedLib = createOclBitCodeModule(ctx, strictMath, oclVersion);
     if (clonedLib == NULL)
       return NULL;
@@ -182,6 +191,28 @@ namespace gbe
+    if (oclVersion >= 200) {
+      builtinFuncs.push_back("__gen_memcpy_gn");
+      builtinFuncs.push_back("__gen_memcpy_pn");
+      builtinFuncs.push_back("__gen_memcpy_ln");
+      builtinFuncs.push_back("__gen_memcpy_ng");
+      builtinFuncs.push_back("__gen_memcpy_np");
+      builtinFuncs.push_back("__gen_memcpy_nl");
+      builtinFuncs.push_back("__gen_memcpy_nc");
+      builtinFuncs.push_back("__gen_memcpy_nn");
+      builtinFuncs.push_back("__gen_memset_n");
+      builtinFuncs.push_back("__gen_memcpy_gn_align");
+      builtinFuncs.push_back("__gen_memcpy_pn_align");
+      builtinFuncs.push_back("__gen_memcpy_ln_align");
+      builtinFuncs.push_back("__gen_memcpy_ng_align");
+      builtinFuncs.push_back("__gen_memcpy_np_align");
+      builtinFuncs.push_back("__gen_memcpy_nl_align");
+      builtinFuncs.push_back("__gen_memcpy_nc_align");
+      builtinFuncs.push_back("__gen_memcpy_nn_align");
+      builtinFuncs.push_back("__gen_memset_n_align");
+    }
     for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) {
       if (SF->isDeclaration()) continue;
       if (!isKernelFunction(*SF)) continue;
diff --git a/backend/src/llvm/llvm_device_enqueue.cpp b/backend/src/llvm/llvm_device_enqueue.cpp
new file mode 100644
index 0000000..ee236de
--- /dev/null
+++ b/backend/src/llvm/llvm_device_enqueue.cpp
@@ -0,0 +1,417 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <list>
+#include "llvm_includes.hpp"
+#include "ir/unit.hpp"
+#include "llvm_gen_backend.hpp"
+#include "ocl_common_defines.h"
+using namespace llvm;
+namespace gbe {
+  /* Return I as a BitCastInst when it is a bitcast of a Function whose name
+   * contains "_invoke"; return NULL for any other instruction. The name test
+   * is the heuristic used to tell block invoke functions from other function
+   * bitcasts. */
+  BitCastInst *isInvokeBitcast(Instruction *I) {
+    BitCastInst *bitcast = dyn_cast<BitCastInst>(I);
+    if (!bitcast)
+      return NULL;
+
+    Value *source = bitcast->getOperand(0);
+
+    // The value being cast must have pointer-to-function type ...
+    PointerType *sourcePtrTy = dyn_cast<PointerType>(source->getType());
+    if (!sourcePtrTy)
+      return NULL;
+    if (!sourcePtrTy->getElementType()->isFunctionTy())
+      return NULL;
+
+    // ... and must be a Function itself, not some other function pointer.
+    Function *target = dyn_cast<Function>(source);
+    if (!target)
+      return NULL;
+
+    // Heuristic: only functions with "_invoke" in their name count as blocks.
+    const std::string targetName = target->getName();
+    if (targetName.find("_invoke") != std::string::npos)
+      return bitcast;
+    return NULL;
+  }
+  void mutateArgAddressSpace(Argument *arg)
+  {
+    std::list<Value *>WorkList;
+    WorkList.push_back(arg);
+    while(!WorkList.empty()) {
+      Value *v = WorkList.front();
+      for (Value::use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
+        // After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
+        // which is more straightforward.
+        User *theUser = *iter;
+        User *theUser = iter->getUser();
+        // becareful with sub operation
+        if (isa<StoreInst>(theUser) || isa<LoadInst>(theUser))
+          continue;
+        WorkList.push_back(theUser);
+      }
+      PointerType *ty = dyn_cast<PointerType>(v->getType());
+      if(ty == NULL) continue;   //should only one argument, private pointer type
+      ty = PointerType::get(ty->getPointerElementType(), 1);
+      v->mutateType(ty);
+      WorkList.pop_front();
+    }
+  }
+  /* Build a kernel-style copy of the block invoke function Fn.
+   *
+   * A new function is created under Fn's name whose address-space-0 pointer
+   * parameters are moved to address space 1 (global); all other parameters
+   * keep their type. Fn's body is cloned into it, Fn itself is renamed with a
+   * "__d" prefix, and the copy is appended to mod and tagged with the
+   * kernel_arg_* metadata nodes.
+   *
+   * Returns the newly created function. */
+  Function* setFunctionAsKernel(Module *mod, Function *Fn)
+  {
+    LLVMContext &Context = mod->getContext();
+    Type *intTy = IntegerType::get(mod->getContext(), 32);
+    // MDNode for the kernel argument address space qualifiers.
+    SmallVector<llvm::Metadata *, 8> addressQuals;
+    // MDNode for the kernel argument access qualifiers (images only).
+    SmallVector<llvm::Metadata *, 8> accessQuals;
+    // MDNode for the kernel argument type names.
+    SmallVector<llvm::Metadata *, 8> argTypeNames;
+    // MDNode for the kernel argument base type names.
+    SmallVector<llvm::Metadata *, 8> argBaseTypeNames;
+    // MDNode for the kernel argument type qualifiers.
+    SmallVector<llvm::Metadata *, 8> argTypeQuals;
+    // MDNode for the kernel argument names.
+    SmallVector<llvm::Metadata *, 8> argNames;
+    // Because the parameter types change, the invoke function must be
+    // re-created and the old one replaced.
+    std::vector<Type *> ParamTys;
+    ValueToValueMapTy VMap;
+    for (Function::arg_iterator I = Fn->arg_begin(), E = Fn->arg_end(); I != E; ++I) {
+      // Non-pointer parameters keep their own type; previously the NULL
+      // result of dyn_cast<PointerType> was pushed for them.
+      Type *paramTy = I->getType();
+      PointerType *ty = dyn_cast<PointerType>(paramTy);
+      if(ty && ty->getAddressSpace() == 0) // Force the address space to global
+        paramTy = PointerType::get(ty->getPointerElementType(), 1);
+      ParamTys.push_back(paramTy);
+    }
+    FunctionType* NewFT = FunctionType::get(Fn->getReturnType(), ParamTys, false);
+    Function* NewFn = Function::Create(NewFT, Function::ExternalLinkage, Fn->getName());
+    SmallVector<ReturnInst*, 8> Returns;
+    // Map every old argument to its counterpart in the new function so the
+    // cloned body refers to the new arguments.
+    Function::arg_iterator NewFnArgIt = NewFn->arg_begin();
+    for (Function::arg_iterator I = Fn->arg_begin(), E = Fn->arg_end(); I != E; ++I) {
+      std::string ArgName = I->getName();
+      NewFnArgIt->setName(ArgName);
+      VMap[&*I] = &(*NewFnArgIt++);
+    }
+    CloneFunctionInto(NewFn, Fn, VMap, /*ModuleLevelChanges=*/true, Returns);
+    // Rename the original first so the copy can take over its name in mod.
+    Fn->setName("__d" + Fn->getName());
+    mod->getFunctionList().push_back(NewFn);
+    for (Function::arg_iterator I = NewFn->arg_begin(), E = NewFn->arg_end(); I != E; ++I) {
+      PointerType *ty = dyn_cast<PointerType>(I->getType());
+      // Mutate the address space of all pointers derived from the argument
+      // from private to global.
+      if(ty && ty->getAddressSpace() == 1)
+        mutateArgAddressSpace(&*I);
+      // A non-pointer argument has no address space: record 0 for it instead
+      // of dereferencing a NULL PointerType.
+      unsigned addrSpace = ty ? ty->getAddressSpace() : 0;
+      addressQuals.push_back(llvm::ConstantAsMetadata::get(ConstantInt::get(intTy, addrSpace)));
+      accessQuals.push_back(llvm::MDString::get(Context, "none"));
+      argTypeNames.push_back(llvm::MDString::get(Context, "char*"));
+      argBaseTypeNames.push_back(llvm::MDString::get(Context, "char*"));
+      argTypeQuals.push_back(llvm::MDString::get(Context, ""));
+      argNames.push_back(llvm::MDString::get(Context, I->getName()));
+    }
+    //If run to here, llvm version always > 3.9, add the version check just for build.
+    NewFn->setMetadata("kernel_arg_addr_space",
+                    llvm::MDNode::get(Context, addressQuals));
+    NewFn->setMetadata("kernel_arg_access_qual",
+                    llvm::MDNode::get(Context, accessQuals));
+    NewFn->setMetadata("kernel_arg_type",
+                    llvm::MDNode::get(Context, argTypeNames));
+    NewFn->setMetadata("kernel_arg_base_type",
+                    llvm::MDNode::get(Context, argBaseTypeNames));
+    NewFn->setMetadata("kernel_arg_type_qual",
+                    llvm::MDNode::get(Context, argTypeQuals));
+    NewFn->setMetadata("kernel_arg_name",
+                    llvm::MDNode::get(Context, argNames));
+    return NewFn;
+    // NOTE(review): the two statements below are unreachable as listed; they
+    // look like the fallback arm of the version check mentioned above, whose
+    // preprocessor lines are not present in this listing - confirm.
+    assert(0);  //only opencl 2.0 could reach here.
+    return Fn;
+  }
+  /* When I is a bitcast, redirect all of its uses to v and return it; the
+   * instruction itself is not erased here. Returns NULL, changing nothing,
+   * for any other kind of instruction. */
+  Instruction* replaceInst(Instruction *I, Value *v)
+  {
+    BitCastInst *bitcast = dyn_cast<BitCastInst>(I);
+    if(bitcast == NULL)
+      return NULL;
+    bitcast->replaceAllUsesWith(v);
+    return bitcast;
+  }
+  void collectDeviceEnqueueInfo(Module *mod, ir::Unit &unit) // Rewrites block invoke bitcasts, indirect block calls and enqueue_kernel calls in mod; block function names are recorded in unit.blockFuncs.
+  {
+    std::set<Instruction*> deadInsnSet; // instructions replaced during the scan; erased at the end
+    std::set<Function*> deadFunctionSet; // functions passed to setFunctionAsKernel; erased at the end
+    std::map<Value*, std::string> blocks; // value identifying a block object -> name of its invoke function
+    if (getModuleOclVersion(mod) < 200) // nothing to do for modules below OpenCL version 200
+      return;
+    for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) {
+      Function *f = &*SF;
+      if (f->isDeclaration()) continue;
+      for (inst_iterator I = inst_begin(f), E = inst_end(f); I != E; ++I) {
+        if (BitCastInst* bt = isInvokeBitcast(&*I)) {
+          /* Handle a block descriptor: replace the bitcast of the block invoke
+           * function with that function's index in unit.blockFuncs. */
+          Function *Fn = dyn_cast<Function>(bt->getOperand(0));
+          std::string fnName = Fn->getName();
+          int index = -1;
+          for(size_t i=0; i<unit.blockFuncs.size(); i++) { // reuse the index if the name is already registered
+            if(unit.blockFuncs[i] == fnName) {
+              index = i;
+              break;
+            }
+          }
+          if(index == -1){ // first sighting: append the name
+            unit.blockFuncs.push_back(fnName);
+            index = unit.blockFuncs.size() - 1;
+          }
+          for (Value::use_iterator iter = bt->use_begin(); iter != bt->use_end(); ++iter) { // record the GEP base of every store of this bitcast
+            User *theUser = *iter; // NOTE(review): this and the next line both declare theUser; they look like the two arms of an LLVM-version #if whose directive lines are absent from this listing
+            User *theUser = iter->getUser();
+            if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
+              GetElementPtrInst * gep = dyn_cast<GetElementPtrInst>(st->getPointerOperand());
+              if(gep)
+                blocks[gep->getOperand(0)] = fnName;
+            }
+          }
+          if(StoreInst* st = dyn_cast<StoreInst>(&*I)) { // NOTE(review): I is a BitCastInst in this branch, so this cast looks like it can never succeed - confirm
+            GetElementPtrInst * gep = dyn_cast<GetElementPtrInst>(st->getPointerOperand());
+            if(gep)
+              blocks[gep->getOperand(0)] = fnName;
+          }
+          Value *v = Constant::getIntegerValue(bt->getType(), APInt(unit.getPointerSize(), index)); // constant holding index, given the bitcast's own type
+          bt->replaceAllUsesWith(v);
+          deadInsnSet.insert(bt);
+        }
+        if(CallInst *CI = dyn_cast<CallInst>(&*I)) {
+          IRBuilder<> builder(CI->getParent(), BasicBlock::iterator(CI)); // insertion point is at CI
+          if(CI->getCalledFunction() == NULL) {
+            //indirect call: walk the use-def chain to find the definition of the called function
+            SmallVector<Value*, 16> args(CI->op_begin(), CI->op_end()-1); // every operand except the last one
+            Value *v = CI->getCalledValue();
+            BitCastInst* bt = dyn_cast<BitCastInst>(v); // expected chain: bitcast <- load <- GEP <- bitcast; any mismatch leaves the call untouched
+            if(bt == NULL)
+              continue;
+            LoadInst* ld = dyn_cast<LoadInst>(bt->getOperand(0));
+            if(ld == NULL)
+              continue;
+            GetElementPtrInst * gep = dyn_cast<GetElementPtrInst>(ld->getPointerOperand());
+            if(gep == NULL)
+              continue;
+            BitCastInst* fnPointer = dyn_cast<BitCastInst>(gep->getOperand(0));
+            if(fnPointer == NULL)
+              continue;
+            if(BitCastInst* bt = dyn_cast<BitCastInst>(fnPointer->getOperand(0))) { // case 1: the source is another bitcast whose operand is looked up in 'blocks'
+              std::string fnName = blocks[bt->getOperand(0)]; // NOTE(review): std::map::operator[] yields "" for an unknown key - confirm a hit is guaranteed
+              Function* f = mod->getFunction(fnName);
+              CallInst *newCI = builder.CreateCall(f, args);
+              CI->replaceAllUsesWith(newCI);
+              deadInsnSet.insert(CI);
+              continue;
+            }
+            //case 2: the source is a global variable
+            if(GlobalVariable* gv = dyn_cast<GlobalVariable>(fnPointer->getOperand(0))) {
+              Constant *c = gv->getInitializer();
+              ConstantExpr *expr = dyn_cast<ConstantExpr>(c->getOperand(3)); // presumably operand 3 of the initializer is the invoke function pointer - TODO confirm against the block literal layout
+              BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction()); // NOTE(review): none of the dyn_cast results in this branch are checked for NULL
+              Function* f = dyn_cast<Function>(bt->getOperand(0));
+              CallInst *newCI = builder.CreateCall(f, args);
+              CI->replaceAllUsesWith(newCI);
+              deadInsnSet.insert(CI);
+              continue;
+            }
+            ld = dyn_cast<LoadInst>(fnPointer->getOperand(0)); // case 3: the source is loaded from memory
+            if(ld == NULL)
+              continue;
+            if(GlobalVariable *gv = dyn_cast<GlobalVariable>(ld->getPointerOperand())) { // case 3a: loaded from a global whose initializer is a cast of another global
+              ConstantExpr *expr = dyn_cast<ConstantExpr>(gv->getInitializer());
+              BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
+              GlobalVariable *block_literal = dyn_cast<GlobalVariable>(bt->getOperand(0));
+              Constant *v = block_literal->getInitializer();
+              expr = dyn_cast<ConstantExpr>(v->getOperand(3));
+              bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
+              Function* f = dyn_cast<Function>(bt->getOperand(0));
+              CallInst *newCI = builder.CreateCall(f, args);
+              CI->replaceAllUsesWith(newCI);
+              deadInsnSet.insert(CI);
+              continue;
+            }
+            if(AllocaInst *ai = dyn_cast<AllocaInst>(ld->getPointerOperand())) { // case 3b: loaded from an alloca; use the operand of a bitcast stored into it
+              Value *v = NULL;
+              for (Value::use_iterator iter = ai->use_begin(); iter != ai->use_end(); ++iter) {
+                User *theUser = *iter; // NOTE(review): duplicate declaration of theUser, see the first occurrence above
+                User *theUser = iter->getUser();
+                if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
+                  bt = dyn_cast<BitCastInst>(st->getValueOperand());
+                  if(bt)
+                    v = bt->getOperand(0);
+                }
+              }
+              if(blocks.find(v) == blocks.end()) { // not recorded yet: if v is a global, resolve its name from the initializer and cache it
+                if(GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+                  Constant *c = gv->getInitializer();
+                  ConstantExpr *expr = dyn_cast<ConstantExpr>(c->getOperand(3));
+                  BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
+                  Function* f = dyn_cast<Function>(bt->getOperand(0));
+                  blocks[v] = f->getName();
+                }
+              }
+              std::string fnName = blocks[v];
+              Function* f = mod->getFunction(fnName);
+              CallInst *newCI = builder.CreateCall(f, args);
+              CI->replaceAllUsesWith(newCI);
+              deadInsnSet.insert(CI);
+              continue;
+            }
+            //none of the handled patterns matched: the called function's definition cannot be found
+            assert(0);
+          } else {
+            //direct call: only calls whose callee name contains "enqueue_kernel" are handled
+            Function *fn = CI->getCalledFunction();
+            if (fn->getName().find("enqueue_kernel") == std::string::npos)
+              continue;
+            //the block argument is argument 3, or argument 6 when argument 3 has an integer type
+            int block_index = 3;
+            Type *type = CI->getArgOperand(block_index)->getType();
+            if(type->isIntegerTy())
+                block_index = 6;
+            Value *block = CI->getArgOperand(block_index);
+            while(isa<BitCastInst>(block)) // look through any chain of bitcasts
+               block = dyn_cast<BitCastInst>(block)->getOperand(0);
+            LoadInst *ld = dyn_cast<LoadInst>(block);
+            Value *v = NULL;
+            if(ld) { // block loaded from memory: use the operand of a bitcast stored to the same address
+              Value *block = ld->getPointerOperand();
+              for (Value::use_iterator iter = block->use_begin(); iter != block->use_end(); ++iter) {
+                User *theUser = *iter; // NOTE(review): duplicate declaration of theUser, see the first occurrence above
+                User *theUser = iter->getUser();
+                if(StoreInst *st = dyn_cast<StoreInst>(theUser)) {
+                  BitCastInst *bt = dyn_cast<BitCastInst>(st->getValueOperand());
+                  if(bt)
+                    v = bt->getOperand(0);
+                }
+              }
+              if(blocks.find(v) == blocks.end()) { // not recorded yet: if v is a global, resolve its name from the initializer and cache it
+                if(GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+                  Constant *c = gv->getInitializer();
+                  ConstantExpr *expr = dyn_cast<ConstantExpr>(c->getOperand(3));
+                  BitCastInst *bt = dyn_cast<BitCastInst>(expr->getAsInstruction());
+                  Function* f = dyn_cast<Function>(bt->getOperand(0));
+                  blocks[v] = f->getName();
+                }
+              }
+            } else if(isa<AllocaInst>(block)) { // block is an alloca: it is used directly as the key into 'blocks'
+              v = block;
+            }
+            std::string fnName = blocks[v]; // NOTE(review): v may still be NULL or unknown here, giving "" and a NULL f below - confirm
+            Function* f = mod->getFunction(fnName);
+            deadFunctionSet.insert(f); // the original is erased at the end; setFunctionAsKernel returns its replacement
+            f = setFunctionAsKernel(mod, f);
+            if( fn->isVarArg() ) {
+              //variadic enqueue (with slm): convert it to a __gen_enqueue_kernel_slm call,
+              //passing the arguments that follow the block through an alloca'd i32 array.
+              int start = block_index + 1;
+              int count = CI->getNumArgOperands() - start;
+              Type *intTy = IntegerType::get(mod->getContext(), 32);
+              AllocaInst *AI = builder.CreateAlloca(intTy, ConstantInt::get(intTy, count)); // count elements of i32
+              for(uint32_t i = start; i < CI->getNumArgOperands(); i++) { // copy each trailing argument into its array slot
+                Value *ptr = builder.CreateGEP(AI, ConstantInt::get(intTy, i-start));
+                builder.CreateStore(CI->getArgOperand(i), ptr);
+              }
+              SmallVector<Value*, 16> args(CI->op_begin(), CI->op_begin() + 3); // new argument list: first three operands, the block, the count, the array
+              args.push_back(CI->getArgOperand(block_index));
+              args.push_back(ConstantInt::get(intTy, count));
+              args.push_back(AI);
+              std::vector<Type *> ParamTys;
+              for (Value** I = args.begin(); I != args.end(); ++I)
+                ParamTys.push_back((*I)->getType());
+              CallInst* newCI = builder.CreateCall(cast<llvm::Function>(mod->getOrInsertFunction(
+                              "__gen_enqueue_kernel_slm", FunctionType::get(intTy, ParamTys, false))), args);
+              CI->replaceAllUsesWith(newCI);
+              deadInsnSet.insert(CI);
+            }
+          }
+        }
+      }
+    }
+    for (auto it: deadInsnSet) { // erasure is deferred to here, after the instruction scan
+      it->eraseFromParent();
+    }
+    for (auto it: deadFunctionSet) {
+      it->eraseFromParent();
+    }
+  }
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 0570766..664d2ff 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -254,6 +254,7 @@ namespace gbe
       case 1: return ir::MEM_GLOBAL;
       case 2: return ir::MEM_CONSTANT;
       case 3: return ir::MEM_LOCAL;
+      case 4: return ir::MEM_GENERIC;
     return ir::MEM_GLOBAL;
@@ -280,6 +281,38 @@ namespace gbe
     return CPV;
+#define TYPESIZE(TYPE,VECT,SZ) else if( name == std::string(#TYPE).append(" __attribute__((ext_vector_type("#VECT")))") ) return VECT*SZ;
+  else if(name == #TYPE) return SZ;\
+  static uint32_t getTypeSize(Module* M, const ir::Unit &unit, std::string& name) { // Map a type name to a size; names not handled by the chain below are looked up in M as "struct.<name>".
+      if(name == "size_t") return sizeof(size_t); // NOTE(review): this is the size_t of the compiler building this file - confirm it matches the target's pointer width
+      TYPESIZEVEC(char,1) // presumably each TYPESIZEVEC use adds else-if branches for the scalar and its ext_vector_type forms; the macro's #define is not visible in this listing
+      TYPESIZEVEC(unsigned char,1)
+      TYPESIZEVEC(short,2)
+      TYPESIZEVEC(unsigned short,2)
+      TYPESIZEVEC(half,2)
+      TYPESIZEVEC(int,4)
+      TYPESIZEVEC(unsigned int,4)
+      TYPESIZEVEC(float,4)
+      TYPESIZEVEC(double,8)
+      TYPESIZEVEC(long,8)
+      TYPESIZEVEC(unsigned long,8)
+      else{
+        StructType *StrTy = M->getTypeByName("struct."+name); // fall back to a named struct type in the module
+        if(StrTy)
+          return getTypeByteSize(unit,StrTy);
+      }
+      GBE_ASSERTM(false, "Unspported type name"); // reached only when no branch above returned
+      return 0;
+  }
+#undef TYPESIZE
   /*! Handle the LLVM IR Value to Gen IR register translation. This has 2 roles:
    *  - Split the LLVM vector into several scalar values
    *  - Handle the transparent copies (bitcast or use of intrincics functions
@@ -553,7 +586,7 @@ namespace gbe
     virtual bool doInitialization(Module &M);
     /*! helper function for parsing global constant data */
-    void getConstantData(const Constant * c, void* mem, uint32_t& offset) const;
+    void getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &) const;
     void collectGlobalConstant(void) const;
     ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t index = 0u);
     const ir::Immediate &processConstantImm(Constant *CPV, int32_t index = 0u);
@@ -689,6 +722,8 @@ namespace gbe
     DECL_VISIT_FN(BranchInst, BranchInst);
     DECL_VISIT_FN(AllocaInst, AllocaInst);
+    DECL_VISIT_FN(AtomicRMWInst, AtomicRMWInst);
+    DECL_VISIT_FN(AtomicCmpXchgInst, AtomicCmpXchgInst);
     // Emit unary instructions from gen native function
@@ -700,8 +735,8 @@ namespace gbe
     // Emit subgroup instructions
     void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
     // Emit subgroup instructions
-    void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
-    void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
+    void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
+    void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
     uint8_t appendSampler(CallSite::arg_iterator AI);
     uint8_t getImageID(CallInst &I);
@@ -735,6 +770,7 @@ namespace gbe
         return NULL;
       return unit.printfs[inst];
+    void emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple);
       void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug infomation in context for subsequently passing to Gen insn
       ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
@@ -894,7 +930,8 @@ namespace gbe
               pointerOrigMap.insert(std::make_pair(work, pointers));
             } else {
               // update the pointer source here,
-              (*ptrIter).second[0] = ptr;
+              if ((!isa<SelectInst>(work) && !isa<PHINode>(work)))
+                (*ptrIter).second[0] = ptr;
@@ -940,7 +977,8 @@ namespace gbe
             pointerOrigMap.insert(std::make_pair(pointer, pointers));
           } else {
             // update the pointer source here,
-            (*ptrIter).second[0] = ptr;
+            if ((!isa<SelectInst>(pointer) && !isa<PHINode>(pointer)))
+              (*ptrIter).second[0] = ptr;
         } else {
@@ -1188,8 +1226,13 @@ namespace gbe
         case 2:
-          new_bti = BTI_CONSTANT;
+          // ocl 2.0, constant pointer use separate bti
+          if(legacyMode)
+            new_bti = BTI_CONSTANT;//btiBase;
+          else {
+            new_bti = btiBase;//btiBase;
+            incBtiBase();
+          }
         case 3:
           new_bti = BTI_LOCAL;
@@ -1230,9 +1273,11 @@ namespace gbe
     MDNode *typeNameNode = NULL;
     MDNode *typeBaseNameNode = NULL;
+    MDNode *typeQualNode = NULL;
     typeNameNode = F.getMetadata("kernel_arg_type");
     typeBaseNameNode = F.getMetadata("kernel_arg_base_type");
+    typeQualNode = F.getMetadata("kernel_arg_type_qual");
     MDNode *node = getKernelFunctionMetadata(&F);
     for(uint j = 0;node && j < node->getNumOperands() - 1; j++) {
@@ -1242,6 +1287,8 @@ namespace gbe
       if (!attrName) continue;
       if (attrName->getString() == "kernel_arg_type") {
         typeNameNode = attrNode;
+      } else if (attrName->getString() == "kernel_arg_type_qual") {
+        typeQualNode = attrNode;
       if (attrName->getString() == "kernel_arg_base_type") {
         typeBaseNameNode = attrNode;
@@ -1263,9 +1310,12 @@ namespace gbe
       if(typeBaseNameNode) {
         llvmInfo.typeBaseName= (cast<MDString>(typeBaseNameNode->getOperand(opID)))->getString();
+      llvmInfo.typeName= (cast<MDString>(typeNameNode->getOperand(opID)))->getString();
+      llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(opID)))->getString();
       bool isImage = llvmInfo.isImageType();
-      if (I->getType()->isPointerTy() || isImage) {
-        BtiMap.insert(std::make_pair(&*I, getNewBti(&*I, isImage)));
+      bool isPipe = llvmInfo.isPipeType();
+      if (I->getType()->isPointerTy() || isImage || isPipe) {
+        BtiMap.insert(std::make_pair(&*I, getNewBti(&*I, isImage || isPipe)));
@@ -1319,12 +1369,12 @@ namespace gbe
-          Type *int32Ty = Type::getInt32Ty(ptr->getContext());
-          Value *v1 = Builder.CreatePtrToInt(pointerOp, int32Ty);
+          Type *ptyTy = IntegerType::get(ptr->getContext(), getTypeBitSize(unit, ptr->getType()));
+          Value *v1 = Builder.CreatePtrToInt(pointerOp, ptyTy);
-          Value *v2 = Builder.CreatePtrToInt(getSinglePointerOrigin(pointerOp), int32Ty);
-          Value *v3 = Builder.CreatePtrToInt(base, int32Ty);
-          Value *v4 = Builder.CreatePtrToInt(bti, int32Ty);
+          Value *v2 = Builder.CreatePtrToInt(getSinglePointerOrigin(pointerOp), ptyTy);
+          Value *v3 = Builder.CreatePtrToInt(base, ptyTy);
+          Value *v4 = Builder.CreatePtrToInt(bti, ptyTy);
           // newLocBase = (pointer - origin) + base_start
           Value *diff = Builder.CreateSub(v1, v2);
           Value *newLocBase = Builder.CreateAdd(v3, diff);
@@ -1390,8 +1440,8 @@ namespace gbe
     // storing/loading pointer would introduce revisit
-    for (std::vector<Value *>::iterator iter = revisit.begin(); iter != revisit.end(); ++iter) {
-      findPointerEscape(*iter, mixedPtr, true, revisit);
+    for (size_t i = 0; i < revisit.size(); ++i) {
+      findPointerEscape(revisit[i], mixedPtr, true, revisit);
     // the second pass starts from mixed pointer
@@ -1445,22 +1495,61 @@ namespace gbe
-  void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset) const {
+  void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &relocs) const {
     Type * type = c->getType();
     Type::TypeID id = type->getTypeID();
+    if (isa<ConstantExpr>(c)) {
+      const ConstantExpr *expr = dyn_cast<ConstantExpr>(c);
+      Value *pointer = expr->getOperand(0);
+      if (expr->getOpcode() == Instruction::GetElementPtr) {
+        uint32_t constantOffset = 0;
+        CompositeType* CompTy = cast<CompositeType>(pointer->getType());
+        for(uint32_t op=1; op<expr->getNumOperands(); ++op) {
+            int32_t TypeIndex;
+            ConstantInt* ConstOP = dyn_cast<ConstantInt>(expr->getOperand(op));
+            GBE_ASSERTM(ConstOP != NULL, "must be constant index");
+            TypeIndex = ConstOP->getZExtValue();
+            GBE_ASSERT(TypeIndex >= 0);
+            constantOffset += getGEPConstOffset(unit, CompTy, TypeIndex);
+            CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+        }
+        ir::Constant cc = unit.getConstantSet().getConstant(pointer->getName());
+        unsigned int defOffset = cc.getOffset();
+        relocs.push_back(ir::RelocEntry(offset, defOffset + constantOffset));
+        uint32_t size = getTypeByteSize(unit, type);
+        memset((char*)mem+offset, 0, size);
+        offset += size;
+      } else if (expr->isCast()) {
+        Constant *constPtr = cast<Constant>(pointer);
+        getConstantData(constPtr, mem, offset, relocs);
+        offset += getTypeByteSize(unit, type);
+      }
+      return;
+    }
+    if (isa<GlobalVariable>(c)) {
+      ir::Constant cc = unit.getConstantSet().getConstant(c->getName());
+      unsigned int defOffset = cc.getOffset();
+      relocs.push_back(ir::RelocEntry(offset, defOffset));
+      uint32_t size = getTypeByteSize(unit, type);
+      memset((char*)mem+offset, 0, size);
+      offset += size;
+      return;
+    }
     if(isa<UndefValue>(c)) {
       uint32_t size = getTypeByteSize(unit, type);
       offset += size;
-    } else if(isa<ConstantAggregateZero>(c)) {
+    } else if(isa<ConstantAggregateZero>(c) || isa<ConstantPointerNull>(c)) {
       uint32_t size = getTypeByteSize(unit, type);
       memset((char*)mem+offset, 0, size);
       offset += size;
     switch(id) {
       case Type::TypeID::StructTyID:
@@ -1478,7 +1567,7 @@ namespace gbe
             offset += padding/8;
             const Constant* sub = cast<Constant>(c->getOperand(op));
-            getConstantData(sub, mem, offset);
+            getConstantData(sub, mem, offset, relocs);
@@ -1499,7 +1588,7 @@ namespace gbe
             uint32_t ops = c->getNumOperands();
             for(uint32_t op = 0; op < ops; ++op) {
               Constant * ca = dyn_cast<Constant>(c->getOperand(op));
-              getConstantData(ca, mem, offset);
+              getConstantData(ca, mem, offset, relocs);
               offset += padding;
@@ -1538,30 +1627,75 @@ namespace gbe
           offset += sizeof(double);
+      case Type::TypeID::HalfTyID:
+        {
+          const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+          llvm::APFloat apf = cf->getValueAPF();
+          llvm::APInt api = apf.bitcastToAPInt();
+          uint64_t v64 = api.getZExtValue();
+          uint16_t v16 = static_cast<uint16_t>(v64);
+          *(unsigned short *)((char*)mem+offset) = v16;
+          offset += sizeof(short);
+          break;
+        }
+      case Type::TypeID::PointerTyID:
+        {
+          break;
+        }
+        {
+          c->dump();
+        }
+  /* True when the global variable lives in address space 0, 1 or 2
+   * (private, global or constant); false for any other address space. */
+  static bool isProgramGlobal(const GlobalVariable &v) {
+    switch (v.getType()->getAddressSpace()) {
+      case 0:  // private
+      case 1:  // global
+      case 2:  // constant
+        return true;
+      default:
+        return false;
+    }
+  }
   void GenWriter::collectGlobalConstant(void) const {
     const Module::GlobalListType &globalList = TheModule->getGlobalList();
+    // The first pass just create the global variable constants
     for(auto i = globalList.begin(); i != globalList.end(); i ++) {
       const GlobalVariable &v = *i;
-      if(!v.isConstantUsed()) continue;
       const char *name = v.getName().data();
-      ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace());
-      if(addrSpace == ir::AddressSpace::MEM_CONSTANT || v.isConstant()) {
-        GBE_ASSERT(v.hasInitializer());
-        const Constant *c = v.getInitializer();
-        Type * type = c->getType();
+      vector<ir::RelocEntry> relocs;
+      if(isProgramGlobal(v)) {
+        Type * type = v.getType()->getPointerElementType();
         uint32_t size = getTypeByteSize(unit, type);
-        void* mem = malloc(size);
-        uint32_t offset = 0;
-        getConstantData(c, mem, offset);
         uint32_t alignment = getAlignmentByte(unit, type);
-        unit.newConstant((char *)mem, name, size, alignment);
-        free(mem);
+        unit.newConstant(name, size, alignment);
+      }
+    }
+    // the second pass to initialize the data
+    for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+      const GlobalVariable &v = *i;
+      const char *name = v.getName().data();
+      if(isProgramGlobal(v)) {
+        if (v.hasInitializer()) {
+          vector<ir::RelocEntry> relocs;
+          uint32_t offset = 0;
+          ir::Constant &con = unit.getConstantSet().getConstant(name);
+          void* mem = malloc(con.getSize());
+          const Constant *c = v.getInitializer();
+          getConstantData(c, mem, offset, relocs);
+          unit.getConstantSet().setData((char*)mem, con.getOffset(), con.getSize());
+          free(mem);
+          if (!legacyMode) {
+            uint32_t refOffset = unit.getConstantSet().getConstant(name).getOffset();
+            for (uint32_t k = 0; k < relocs.size(); k++) {
+              unit.getRelocTable().addEntry(
+                  refOffset + relocs[k].refOffset,
+                  relocs[k].defOffset
+                  );
+            }
+          }
+        }
@@ -1571,6 +1705,9 @@ namespace gbe
     // Initialize
     TheModule = &M;
+    uint32_t oclVersion = getModuleOclVersion(TheModule);
+    legacyMode = oclVersion >= 200 ? false : true;
+    unit.setOclVersion(oclVersion);
     return false;
@@ -1703,7 +1840,10 @@ namespace gbe
       // NULL pointers
       if(isa<ConstantPointerNull>(CPV)) {
-        return ctx.newImmediate(uint32_t(0));
+        if (ctx.getPointerFamily() == ir::FAMILY_QWORD)
+          return ctx.newImmediate(uint64_t(0));
+        else
+          return ctx.newImmediate(uint32_t(0));
       const Type::TypeID typeID = CPV->getType()->getTypeID();
@@ -2222,6 +2362,14 @@ namespace gbe
         if(typeNameNode) {
           llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(opID)))->getString();
+          // LLVM 3.9 image type names include the access qualifier, which does not match the OpenCL spec, so erase it.
+          std::vector<std::string> filters = {"__read_only ", "__write_only "};
+          for (uint32_t i = 0; i < filters.size(); i++) {
+            size_t pos = llvmInfo.typeName.find(filters[i]);
+            if (pos != std::string::npos) {
+              llvmInfo.typeName = llvmInfo.typeName.erase(pos, filters[i].length());
+            }
+          }
           llvmInfo.typeBaseName = (cast<MDString>(typeBaseNameNode->getOperand(opID)))->getString();
@@ -2273,6 +2421,11 @@ namespace gbe
           (void)ctx.getFunction().getSamplerSet()->append(reg, &ctx);
+        if(llvmInfo.isPipeType()) {
+          llvmInfo.typeSize = getTypeSize(F.getParent(),unit,llvmInfo.typeName);
+          ctx.input(argName, ir::FunctionArgument::PIPE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), BtiMap.find(&*I)->second);
+          continue;
+        }
         if (type->isPointerTy() == false)
           ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
@@ -2807,6 +2960,8 @@ namespace gbe
         const Constant *c = v.getInitializer();
         Type *ty = c->getType();
         uint32_t oldSlm = f.getSLMSize();
+        // FIXME: temporarily reserve 4 bytes so that nothing is placed at SLM address 0
+        if (oldSlm == 0) oldSlm = 4;
         uint32_t align = 8 * getAlignmentByte(unit, ty);
         uint32_t padding = getPadding(oldSlm*8, align);
@@ -2814,32 +2969,24 @@ namespace gbe
         ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
-        ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32));
-      } else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) {
-        GBE_ASSERT(v.hasInitializer());
-        this->newRegister(const_cast<GlobalVariable*>(&v));
-        ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
-        ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
-        ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
-      } else {
+        ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(oldSlm + padding/8, getType(ctx, v.getType())));
+        } else if(addrSpace == ir::MEM_CONSTANT
+               || addrSpace == ir::MEM_GLOBAL
+               || v.isConstant()) {
         if(v.getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
           regTranslator.newScalarProxy(ir::ocl::profilingbptr, const_cast<GlobalVariable*>(&v));
-        } else if(v.getName().str().substr(0, 4) == ".str") {
-          /* When there are multi printf statements in multi kernel fucntions within the same
-             translate unit, if they have the same sting parameter, such as
-             kernel_func1 () {
-               printf("Line is %d\n", line_num1);
-             }
-             kernel_func2 () {
-               printf("Line is %d\n", line_num2);
-             }
-             The Clang will just generate one global string named .strXXX to represent "Line is %d\n"
-             So when translating the kernel_func1, we can not unref that global var, so we will
-             get here. Just ignore it to avoid assert. */
         } else {
-          GBE_ASSERT(0 && "Unsupported private memory access pattern");
+          this->newRegister(const_cast<GlobalVariable*>(&v));
+          ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+          ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
+          ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType())));
+          if (!legacyMode) {
+            ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace, reg);
+          }
+      } else if(addrSpace == ir::MEM_PRIVATE) {
+          this->newRegister(const_cast<GlobalVariable*>(&v));
@@ -3331,7 +3478,10 @@ namespace gbe
       case Instruction::FPTrunc:
       case Instruction::Trunc:
-      break;
+        break;
+      case Instruction::AddrSpaceCast:
+        regTranslator.newValueProxy(srcValue, dstValue);
+        break;
       default: NOT_SUPPORTED;
@@ -3339,6 +3489,8 @@ namespace gbe
   void GenWriter::emitCastInst(CastInst &I) {
     switch (I.getOpcode())
+      case Instruction::AddrSpaceCast:
+        break;
       case Instruction::PtrToInt:
       case Instruction::IntToPtr:
@@ -3628,6 +3780,7 @@ namespace gbe
           case Intrinsic::ctlz:
+          case Intrinsic::cttz:
           case Intrinsic::bswap:
@@ -3679,6 +3832,12 @@ namespace gbe
         regTranslator.newScalarProxy(ir::ocl::lsize1, dst); break;
       case GEN_OCL_GET_LOCAL_SIZE2:
         regTranslator.newScalarProxy(ir::ocl::lsize2, dst); break;
+        regTranslator.newScalarProxy(ir::ocl::enqlsize0, dst); break;
+        regTranslator.newScalarProxy(ir::ocl::enqlsize1, dst); break;
+        regTranslator.newScalarProxy(ir::ocl::enqlsize2, dst); break;
         regTranslator.newScalarProxy(ir::ocl::gsize0, dst); break;
@@ -3741,7 +3900,7 @@ namespace gbe
       case GEN_OCL_FORCE_SIMD16:
       case GEN_OCL_LBARRIER:
       case GEN_OCL_GBARRIER:
-      case GEN_OCL_LGBARRIER:
+      case GEN_OCL_BARRIER:
       case GEN_OCL_WRITE_IMAGE_I:
@@ -3827,6 +3986,7 @@ namespace gbe
       case GEN_OCL_SIMD_SIZE:
       case GEN_OCL_READ_TM:
       case GEN_OCL_REGION:
+      case GEN_OCL_IN_PRIVATE:
       case GEN_OCL_SIMD_ID:
       case GEN_OCL_VME:
@@ -3853,16 +4013,47 @@ namespace gbe
       case GEN_OCL_LRP:
+      case GEN_OCL_GET_PIPE:
+      {
+        Value *srcValue = I.getOperand(0);
+        if( BtiMap.find(dst) == BtiMap.end())
+        {
+          unsigned tranBti = BtiMap.find(srcValue)->second;
+          BtiMap.insert(std::make_pair(dst, tranBti));
+        }
+        regTranslator.newValueProxy(srcValue, dst);
+        break;
+      }
+      case GEN_OCL_MAKE_RID:
+      case GEN_OCL_GET_RID:
+      {
+        Value *srcValue = I.getOperand(0);
+        regTranslator.newValueProxy(srcValue, dst);
+        break;
+      }
+        regTranslator.newScalarProxy(ir::ocl::enqueuebufptr, dst);
+        break;
       case GEN_OCL_PRINTF:
         this->newRegister(&I);  // fall through
       case GEN_OCL_PUTS:
@@ -3877,14 +4068,22 @@ namespace gbe
       case GEN_OCL_DEBUGWAIT:
       case GEN_OCL_NOT_FOUND:
@@ -3904,6 +4103,107 @@ namespace gbe
     ctx.ALU1(opcode, type, dst, src);
+  void GenWriter::regAllocateAtomicCmpXchgInst(AtomicCmpXchgInst &I) { // register-allocation step for an LLVM cmpxchg instruction
+    this->newRegister(&I); // registers for I; emitAtomicCmpXchgInst reads them back with getRegister(&I, 0) and getRegister(&I, 1)
+  }
+  void GenWriter::emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple) { // shared by the cmpxchg and atomicrmw emitters: resolves how llvmPtr is addressed, then emits ctx.ATOMIC writing into dst
+    ir::Register pointer = this->getRegister(llvmPtr);
+    ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace()); // initial value from the LLVM pointer type; the legacy branch below overrides it
+    // Addressing operands handed to ctx.ATOMIC; they are filled in by the legacy / non-legacy branches below
+    ir::Register ptr;
+    ir::Register btiReg;
+    unsigned SurfaceIndex = 0xff; // stays 0xff unless the BTI turns out to be a compile-time constant
+    ir::AddressMode AM;
+    if (legacyMode) { // legacy path: the address is an offset from the pointer's base, qualified by a BTI
+      Value *bti = getBtiRegister(llvmPtr);
+      Value *ptrBase = getPointerBase(llvmPtr);
+      ir::Register baseReg = this->getRegister(ptrBase);
+      if (isa<ConstantInt>(bti)) { // constant BTI: use it as the surface index and derive the address space from it
+        AM = ir::AM_StaticBti;
+        SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+        addrSpace = btiToGen(SurfaceIndex);
+      } else { // non-constant BTI: keep it in a register and treat the address space as mixed
+        AM = ir::AM_DynamicBti;
+        addrSpace = ir::MEM_MIXED;
+        btiReg = this->getRegister(bti);
+      }
+      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+      ptr = ctx.reg(pointerFamily);
+      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg); // ptr = pointer - base, computed as U32
+    } else { // non-legacy path: stateless access using the pointer register unchanged
+      AM = ir::AM_Stateless;
+      ptr = pointer;
+    }
+    ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, SurfaceIndex);
+  }
+  void GenWriter::emitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { // lowers an LLVM cmpxchg: one ATOMIC_OP_CMPXCHG via emitAtomicInstHelper, then an EQ that fills the second result register
+    // Pointer operand the atomic operates on
+    Value *llvmPtr = I.getPointerOperand();
+    ir::AtomicOps opcode = ir::ATOMIC_OP_CMPXCHG;
+    uint32_t payloadNum = 0;
+    vector<ir::Register> payload;
+    const ir::Register oldValue = this->getRegister(&I, 0); // result 0: destination of the atomic
+    const ir::Register compareRet = this->getRegister(&I, 1); // result 1: written by the EQ at the end
+    const ir::Register expected = this->getRegister(I.getCompareOperand());
+    payload.push_back(this->getRegister(I.getCompareOperand())); // payload[0]: the compare operand (same register as `expected`)
+    payloadNum++;
+    payload.push_back(this->getRegister(I.getNewValOperand())); // payload[1]: the new value operand
+    payloadNum++;
+    ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType()); // operation type is the pointee type of llvmPtr
+    const ir::Tuple payloadTuple = payloadNum == 0 ? // payloadNum is 2 at this point, so the arrayTuple branch is the one taken
+                                   ir::Tuple(0) :
+                                   ctx.arrayTuple(&payload[0], payloadNum);
+    this->emitAtomicInstHelper(opcode, type, oldValue, llvmPtr, payloadTuple);
+    ctx.EQ(type, compareRet, oldValue, expected); // compareRet = (oldValue == expected)
+  }
+  void GenWriter::regAllocateAtomicRMWInst(AtomicRMWInst &I) { // register-allocation step for an LLVM atomicrmw instruction
+    this->newRegister(&I); // register for I; emitAtomicRMWInst reads it back with getRegister(&I)
+  }
+  static INLINE ir::AtomicOps atomicOpsLLVMToGen(llvm::AtomicRMWInst::BinOp llvmOp) { // maps an LLVM atomicrmw operation to the matching ir::AtomicOps value
+    switch(llvmOp) {
+      case llvm::AtomicRMWInst::Xchg: return ir::ATOMIC_OP_XCHG;
+      case llvm::AtomicRMWInst::Add:  return ir::ATOMIC_OP_ADD;
+      case llvm::AtomicRMWInst::Sub:  return ir::ATOMIC_OP_SUB;
+      case llvm::AtomicRMWInst::And:  return ir::ATOMIC_OP_AND;
+      case llvm::AtomicRMWInst::Or:   return ir::ATOMIC_OP_OR;
+      case llvm::AtomicRMWInst::Xor:  return ir::ATOMIC_OP_XOR;
+      case llvm::AtomicRMWInst::Max:  return ir::ATOMIC_OP_IMAX; // Max/Min map to the IMAX/IMIN opcodes
+      case llvm::AtomicRMWInst::Min:  return ir::ATOMIC_OP_IMIN;
+      case llvm::AtomicRMWInst::UMax: return ir::ATOMIC_OP_UMAX; // UMax/UMin map to the UMAX/UMIN opcodes
+      case llvm::AtomicRMWInst::UMin: return ir::ATOMIC_OP_UMIN;
+      case llvm::AtomicRMWInst::Nand: // no mapping for Nand: shares the BAD_BINOP path
+      case llvm::AtomicRMWInst::BAD_BINOP: break;
+    }
+    GBE_ASSERT(false); // only reached for Nand, BAD_BINOP, or a value not listed in the switch
+    return ir::ATOMIC_OP_INVALID;
+  }
+  void GenWriter::emitAtomicRMWInst(AtomicRMWInst &I) { // lowers an LLVM atomicrmw: translates the operation and emits it through emitAtomicInstHelper
+    // Operation kind and pointer operand (operand 0) of the atomicrmw
+    llvm::AtomicRMWInst::BinOp llvmOpcode = I.getOperation();
+    Value *llvmPtr = I.getOperand(0);
+    ir::AtomicOps opcode = atomicOpsLLVMToGen(llvmOpcode);
+    const ir::Register dst = this->getRegister(&I);
+    uint32_t payloadNum = 0;
+    vector<ir::Register> payload;
+    payload.push_back(this->getRegister(I.getOperand(1))); // single payload entry: the register of operand 1
+    payloadNum++;
+    ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType()); // operation type is the pointee type of llvmPtr
+    const ir::Tuple payloadTuple = payloadNum == 0 ? // payloadNum is 1 at this point, so the arrayTuple branch is the one taken
+                                   ir::Tuple(0) :
+                                   ctx.arrayTuple(&payload[0], payloadNum);
+    this->emitAtomicInstHelper(opcode, type, dst, llvmPtr, payloadTuple);
+  }
   void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
     CallSite::arg_iterator AI = CS.arg_begin();
     CallSite::arg_iterator AE = CS.arg_end();
@@ -4047,6 +4347,7 @@ namespace gbe
       ctx.SUBGROUP(opcode, getRegister(&I), srcTuple, 1, ir::TYPE_S32);
     } else if (opcode == ir::WORKGROUP_OP_BROADCAST) {
       int argNum = CS.arg_size();
+      GBE_ASSERT(argNum == 2);
       std::vector<ir::Register> src(argNum);
       for (int i = 0; i < argNum; i++) {
         src[i] = this->getRegister(*(AI++));
@@ -4076,7 +4377,7 @@ namespace gbe
     GBE_ASSERT(AI == AE);
-  void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+  void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) {
     CallSite::arg_iterator AI = CS.arg_begin();
     CallSite::arg_iterator AE = CS.arg_end();
     GBE_ASSERT(AI != AE);
@@ -4112,7 +4413,6 @@ namespace gbe
       ptr = pointer;
-    ir::Type type = ir::TYPE_U32;
     GBE_ASSERT(AM != ir::AM_DynamicBti);
@@ -4133,7 +4433,7 @@ namespace gbe
     GBE_ASSERT(AI == AE);
-  void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+  void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) {
     CallSite::arg_iterator AI = CS.arg_begin();
     CallSite::arg_iterator AE = CS.arg_end();
     GBE_ASSERT(AI != AE);
@@ -4149,7 +4449,7 @@ namespace gbe
         srcTupleData.push_back(getRegister(*(AI), i));
       const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size);
-      ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size);
+      ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size, type);
     } else {
       ir::Register src[2];
       src[0] = getRegister(*(AI++));
@@ -4159,7 +4459,7 @@ namespace gbe
         dstTupleData.push_back(getRegister(&I, i));
       const ir::Tuple srctuple = ctx.arrayTuple(src, 2);
       const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
-      ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2);
+      ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2, type);
     GBE_ASSERT(AI == AE);
@@ -4307,6 +4607,56 @@ namespace gbe
+          case Intrinsic::cttz:
+          {
+            Type *llvmDstType = I.getType();
+            ir::Type dstType = getType(ctx, llvmDstType);
+            Type *llvmSrcType = I.getOperand(0)->getType();
+            ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
+            //the llvm.cttz.i64 is lowered to two llvm.cttz.i32 calls in ocl_ctz.ll
+            GBE_ASSERT(srcType != ir::TYPE_U64);
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register src = this->getRegister(I.getOperand(0));
+            uint32_t imm_value = 0;
+            if(srcType == ir::TYPE_U16) {
+              imm_value = 0xFFFF0000;
+            }else if(srcType == ir::TYPE_U8) {
+              imm_value = 0xFFFFFF00;
+            }
+            if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) {
+              ir::ImmediateIndex imm;
+              ir::Type tmpType = ir::TYPE_S32;
+              ir::Type revType = ir::TYPE_U32;
+              imm = ctx.newIntegerImmediate(imm_value, revType);
+              const ir::RegisterFamily family = getFamily(revType);
+              const ir::Register immReg = ctx.reg(family);
+              ctx.LOADI(ir::TYPE_U32, immReg, imm);
+              ir::Register tmp0 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp1 = ctx.reg(getFamily(revType));
+              ir::Register tmp2 = ctx.reg(getFamily(revType));
+              ir::Register revTmp = ctx.reg(getFamily(revType));
+              ctx.CVT(tmpType, srcType, tmp0, src);
+              //gen does not have 'tzd', so reverse first
+              ctx.ADD(revType, tmp1, tmp0, immReg);
+              ctx.ALU1(ir::OP_BFREV, revType, revTmp, tmp1);
+              ctx.ALU1(ir::OP_LZD, ir::TYPE_U32, tmp2, revTmp);
+              ctx.CVT(dstType, tmpType, dst, tmp2);
+            }
+            else
+            {
+              GBE_ASSERT(srcType == ir::TYPE_U32);
+              ir::Type revType = ir::TYPE_U32;
+              ir::Register revTmp = ctx.reg(getFamily(revType));
+              ctx.ALU1(ir::OP_BFREV, revType, revTmp, src);
+              ctx.ALU1(ir::OP_LZD, ir::TYPE_U32, dst, revTmp);
+            }
+          }
+          break;
           case Intrinsic::fma:
           case Intrinsic::fmuladd:
@@ -4433,7 +4783,26 @@ namespace gbe
             ctx.VME(imageID, dstTuple, srcTuple, dst_length, src_length,
                     msg_type, vme_search_path_lut_x.getIntegerValue(),
+            break;
+          }
+          case GEN_OCL_IN_PRIVATE:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            uint32_t stackSize = ctx.getFunction().getStackSize();
+            if (stackSize == 0) {
+              ir::ImmediateIndex imm = ctx.newImmediate((bool)0);
+              ctx.LOADI(ir::TYPE_BOOL, dst, imm);
+            } else {
+              ir::Register cmp0 = ctx.reg(ir::FAMILY_BOOL);
+              ir::Register cmp1 = ctx.reg(ir::FAMILY_BOOL);
+              const ir::Register src0 = this->getRegister(*AI);
+              ir::Register tmp = ctx.reg(ir::FAMILY_QWORD);
+              ctx.GE(ir::TYPE_U64, cmp0, src0, ir::ocl::stackbuffer);
+              ctx.ADD(ir::TYPE_U64, tmp, ir::ocl::stackbuffer, ir::ocl::stacksize);
+              ctx.LT(ir::TYPE_U64, cmp1, src0, tmp);
+              ctx.AND(ir::TYPE_BOOL, dst, cmp0, cmp1);
+            }
           case GEN_OCL_REGION:
@@ -4456,7 +4825,31 @@ namespace gbe
           case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
           case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
           case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
-          case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
+          case GEN_OCL_BARRIER:
+          {
+            Constant *CPV = dyn_cast<Constant>(*AI);
+            unsigned syncFlag = 0;
+            if (CPV) {
+              const ir::Immediate &x = processConstantImm(CPV);
+              unsigned barrierArg = x.getIntegerValue();
+              if (barrierArg & 0x1) {
+                syncFlag |= ir::syncLocalBarrier;
+              }
+              if (barrierArg & 0x2) {
+                syncFlag |= ir::syncGlobalBarrier;
+              }
+              if (barrierArg & 0x4) {
+                syncFlag |= ir::syncImageBarrier;
+              }
+            } else {
+              // FIXME we default it to do global fence and barrier.
+              // we need to do runtime check here.
+              syncFlag = ir::syncLocalBarrier | ir::syncGlobalBarrier;
+            }
+            ctx.SYNC(syncFlag);
+            break;
+          }
           case GEN_OCL_ATOMIC_ADD0:
           case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break;
           case GEN_OCL_ATOMIC_SUB0:
@@ -4529,6 +4922,7 @@ namespace gbe
             bool isFloatCoord = coordType == ir::TYPE_FLOAT;
             bool requiredFloatCoord = samplerOffset == 0;
+            (void) isFloatCoord;
             GBE_ASSERT(isFloatCoord == requiredFloatCoord);
             vector<ir::Register> dstTupleData, srcTupleData;
@@ -4904,6 +5298,7 @@ namespace gbe
             Value *bti = getBtiRegister(llvmPtr);
             GBE_ASSERT(isa<ConstantInt>(bti)); //Should never be mixed pointer.
             uint32_t index = cast<ConstantInt>(bti)->getZExtValue();
+            (void) index;
             GBE_ASSERT(btiToGen(index) == ir::MEM_GLOBAL);
             GBE_ASSERT(AI != AE);
@@ -4992,38 +5387,99 @@ namespace gbe
             ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2);
             this->emitBlockReadWriteMemInst(I, CS, false, 1); break;
             this->emitBlockReadWriteMemInst(I, CS, false, 2); break;
             this->emitBlockReadWriteMemInst(I, CS, false, 4); break;
             this->emitBlockReadWriteMemInst(I, CS, false, 8); break;
             this->emitBlockReadWriteMemInst(I, CS, true, 1); break;
             this->emitBlockReadWriteMemInst(I, CS, true, 2); break;
             this->emitBlockReadWriteMemInst(I, CS, true, 4); break;
             this->emitBlockReadWriteMemInst(I, CS, true, 8); break;
             this->emitBlockReadWriteImageInst(I, CS, false, 1); break;
             this->emitBlockReadWriteImageInst(I, CS, false, 2); break;
             this->emitBlockReadWriteImageInst(I, CS, false, 4); break;
             this->emitBlockReadWriteImageInst(I, CS, false, 8); break;
             this->emitBlockReadWriteImageInst(I, CS, true, 1); break;
             this->emitBlockReadWriteImageInst(I, CS, true, 2); break;
             this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
             this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
+            this->emitBlockReadWriteMemInst(I, CS, false, 1, ir::TYPE_U16); break;
+            this->emitBlockReadWriteMemInst(I, CS, false, 2, ir::TYPE_U16); break;
+            this->emitBlockReadWriteMemInst(I, CS, false, 4, ir::TYPE_U16); break;
+            this->emitBlockReadWriteMemInst(I, CS, false, 8, ir::TYPE_U16); break;
+            this->emitBlockReadWriteMemInst(I, CS, true, 1, ir::TYPE_U16); break;
+            this->emitBlockReadWriteMemInst(I, CS, true, 2, ir::TYPE_U16); break;
+            this->emitBlockReadWriteMemInst(I, CS, true, 4, ir::TYPE_U16); break;
+            this->emitBlockReadWriteMemInst(I, CS, true, 8, ir::TYPE_U16); break;
+            this->emitBlockReadWriteImageInst(I, CS, false, 1, ir::TYPE_U16); break;
+            this->emitBlockReadWriteImageInst(I, CS, false, 2, ir::TYPE_U16); break;
+            this->emitBlockReadWriteImageInst(I, CS, false, 4, ir::TYPE_U16); break;
+            this->emitBlockReadWriteImageInst(I, CS, false, 8, ir::TYPE_U16); break;
+            this->emitBlockReadWriteImageInst(I, CS, true, 1, ir::TYPE_U16); break;
+            this->emitBlockReadWriteImageInst(I, CS, true, 2, ir::TYPE_U16); break;
+            this->emitBlockReadWriteImageInst(I, CS, true, 4, ir::TYPE_U16); break;
+            this->emitBlockReadWriteImageInst(I, CS, true, 8, ir::TYPE_U16); break;
+          case GEN_OCL_GET_PIPE:
+          case GEN_OCL_MAKE_RID:
+          case GEN_OCL_GET_RID:
+          {
+            break;
+          }
+          {
+            GBE_ASSERT(AI != AE);
+            Value *srcValue = *AI;
+            ++AI;
+            Value *dstValue = &I;
+            regTranslator.newValueProxy(srcValue, dstValue);
+            break;
+          }
+          {
+            GBE_ASSERT(AI != AE);
+            Value *srcValue = *AI;
+            ++AI;
+            Value *dstValue = &I;
+            regTranslator.newValueProxy(srcValue, dstValue);
+            break;
+          }
+          {
+            ctx.getFunction().setUseDeviceEnqueue(true);
+            break;
+          }
           default: break;
@@ -5073,15 +5529,23 @@ namespace gbe
       uint32_t prevStackPtr = ctx.getFunction().getStackSize();
       uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr;
       if (step != 0) {
-        ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
+        ir::ImmediateIndex stepImm;
+        ir::Type pointerTy = getType(pointerFamily);
+        if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+          stepImm = ctx.newImmediate(uint32_t(step));
+        else
+          stepImm = ctx.newImmediate(uint64_t(step));
         ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
-        ctx.LOADI(ir::TYPE_U32, stepReg, stepImm);
-        ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
+        ctx.LOADI(pointerTy, stepReg, stepImm);
+        ctx.ADD(pointerTy, stack, stack, stepReg);
     // Set the destination register properly
-    ctx.MOV(imm.getType(), dst, stack);
+    if (legacyMode)
+      ctx.MOV(imm.getType(), dst, stack);
+    else
+      ctx.ADD(imm.getType(), dst, stack, ir::ocl::stackbuffer);
     ctx.LOADI(imm.getType(), reg, immIndex);
     ctx.ADD(imm.getType(), stack, stack, reg);
@@ -5249,7 +5713,7 @@ namespace gbe
       // but later ArgumentLower pass need to match exact load/addImm pattern
       // so, I avoid subtracting zero base to satisfy ArgumentLower pass.
       if (!zeroBase)
-        ctx.SUB(ir::TYPE_U32, mPtr, pointer, baseReg);
+        ctx.SUB(getType(ctx, llvmPtr->getType()), mPtr, pointer, baseReg);
         mPtr = pointer;
     } else {
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index f2a278e..1ab77c9 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -153,7 +153,12 @@ namespace gbe
   llvm::FunctionPass* createSamplerFixPass();
   /*! Add all the function call of ocl to our bitcode. */
-  llvm::Module* runBitCodeLinker(llvm::Module *mod, bool strictMath);
+  llvm::Module* runBitCodeLinker(llvm::Module *mod, bool strictMath, ir::Unit &unit);
+  /*! Get the module's OpenCL version from its metadata. */
+  uint32_t getModuleOclVersion(const llvm::Module *M);
+  void collectDeviceEnqueueInfo(llvm::Module *mod, ir::Unit &unit);
   void* getPrintfInfo(llvm::CallInst* inst);
 } /* namespace gbe */
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 48a72d1..86485da 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -10,6 +10,9 @@ DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS2, __gen_ocl_get_num_groups2)
 DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE0, __gen_ocl_get_local_size0)
 DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE1, __gen_ocl_get_local_size1)
 DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE2, __gen_ocl_get_local_size2)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE0, __gen_ocl_get_enqueued_local_size0)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE1, __gen_ocl_get_enqueued_local_size1)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE2, __gen_ocl_get_enqueued_local_size2)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE0, __gen_ocl_get_global_size0)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE1, __gen_ocl_get_global_size1)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
@@ -27,7 +30,7 @@ DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
 // Barrier function
 DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
 DECL_LLVM_GEN_FUNCTION(GBARRIER, __gen_ocl_barrier_global)
-DECL_LLVM_GEN_FUNCTION(LGBARRIER, __gen_ocl_barrier_local_and_global)
 // To force SIMD8/16 compilation
 DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8,  __gen_ocl_force_simd8)
@@ -169,6 +172,7 @@ DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle)
 DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
+DECL_LLVM_GEN_FUNCTION(IN_PRIVATE, __gen_ocl_in_private)
@@ -217,22 +221,48 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_sub_group_scan_in
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_inclusive_max)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2, __gen_ocl_sub_group_block_read_mem2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4, __gen_ocl_sub_group_block_read_mem4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8, __gen_ocl_sub_group_block_read_mem8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2, __gen_ocl_sub_group_block_write_mem2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4, __gen_ocl_sub_group_block_write_mem4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8, __gen_ocl_sub_group_block_write_mem8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE8, __gen_ocl_sub_group_block_read_image8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE, __gen_ocl_sub_group_block_write_image)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_write_image2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM, __gen_ocl_sub_group_block_read_ui_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM2, __gen_ocl_sub_group_block_read_ui_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM4, __gen_ocl_sub_group_block_read_ui_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM8, __gen_ocl_sub_group_block_read_ui_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM, __gen_ocl_sub_group_block_write_ui_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM2, __gen_ocl_sub_group_block_write_ui_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM4, __gen_ocl_sub_group_block_write_ui_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM8, __gen_ocl_sub_group_block_write_ui_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE, __gen_ocl_sub_group_block_read_ui_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE2, __gen_ocl_sub_group_block_read_ui_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE4, __gen_ocl_sub_group_block_read_ui_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE8, __gen_ocl_sub_group_block_read_ui_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE, __gen_ocl_sub_group_block_write_ui_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE2, __gen_ocl_sub_group_block_write_ui_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE4, __gen_ocl_sub_group_block_write_ui_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE8, __gen_ocl_sub_group_block_write_ui_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM, __gen_ocl_sub_group_block_read_us_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM2, __gen_ocl_sub_group_block_read_us_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM4, __gen_ocl_sub_group_block_read_us_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM8, __gen_ocl_sub_group_block_read_us_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM, __gen_ocl_sub_group_block_write_us_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM2, __gen_ocl_sub_group_block_write_us_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM4, __gen_ocl_sub_group_block_write_us_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM8, __gen_ocl_sub_group_block_write_us_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE, __gen_ocl_sub_group_block_read_us_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE2, __gen_ocl_sub_group_block_read_us_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE4, __gen_ocl_sub_group_block_read_us_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE8, __gen_ocl_sub_group_block_read_us_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE, __gen_ocl_sub_group_block_write_us_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE2, __gen_ocl_sub_group_block_write_us_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE4, __gen_ocl_sub_group_block_write_us_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE8, __gen_ocl_sub_group_block_write_us_image8)
 // common function
+// pipe function
+DECL_LLVM_GEN_FUNCTION(GET_PIPE, __gen_ocl_get_pipe)
+DECL_LLVM_GEN_FUNCTION(GET_RID, __gen_ocl_get_rid)
+DECL_LLVM_GEN_FUNCTION(MAKE_RID, __gen_ocl_make_rid)
+//Enqueue function
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index c26e96a..f01bb51 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -54,6 +54,8 @@ namespace gbe {
             return 'c';
           case 3:
             return 'l';
+          case 4:
+            return 'n';
             assert(0 && "Non support address space");
             return '\0';
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 02dd4bf..367a2c3 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -65,6 +65,27 @@ namespace gbe
     return bKernel;
+  uint32_t getModuleOclVersion(const llvm::Module *M) {  // Returns the module's OpenCL version encoded as major * 100 + minor * 10.
+    uint32_t oclVersion = 120;  // value returned when the version metadata is absent or has no operands
+    NamedMDNode *version = M->getNamedMetadata("opencl.ocl.version");
+    if (version == NULL)
+      return oclVersion;
+    uint32_t ops = version->getNumOperands();
+    if(ops > 0) {
+      uint32_t major = 0, minor = 0;
+      MDNode* node = version->getOperand(0);  // only the first version entry is consulted
+      major = mdconst::extract<ConstantInt>(node->getOperand(0))->getZExtValue();  // operand 0 of the entry is read as the major number
+      minor = mdconst::extract<ConstantInt>(node->getOperand(1))->getZExtValue();  // operand 1 of the entry is read as the minor number
+      major = cast<ConstantInt>(node->getOperand(0))->getZExtValue();  // NOTE(review): this cast<> pair repeats the two assignments above; presumably the two pairs are alternatives for different LLVM versions and the guard lines are not visible here - confirm against upstream
+      minor = cast<ConstantInt>(node->getOperand(1))->getZExtValue();
+      oclVersion = major * 100 + minor * 10;
+    }
+    return oclVersion;
+  }
   int32_t getPadding(int32_t offset, int32_t align) {
     return (align - (offset % align)) % align; 
@@ -262,7 +283,7 @@ namespace gbe
+#if 0
         //HACK TODO: Inserted by type replacement.. this code could break something????
         if(getTypeByteSize(unit, operand->getType())>4)
@@ -286,7 +307,7 @@ namespace gbe
                   "", GEPInst);
         Value* tmpMul = operand;
         if (size != 1) {
           tmpMul = BinaryOperator::Create(Instruction::Mul, newConstSize, operand,
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 615fb50..8850abb 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -682,10 +682,14 @@ namespace gbe {
             *CI = InsertToVector(call, *CI);
@@ -693,22 +697,32 @@ namespace gbe {
               *CI = InsertToVector(call, *CI);
             if ((*CI)->getType()->isVectorTy())
               *CI = InsertToVector(call, *CI);
           case GEN_OCL_VME:
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index e108810..bef4df1 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -136,6 +136,9 @@ namespace gbe
+    MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr before inlining.
+    MPM.add(createFunctionInliningPass(20000));
+    MPM.add(createBarrierNodupPass(true));    // restore noduplicate fnAttr after inlining.
     MPM.add(createStripAttributesPass());     // Strip unsupported attributes and calling conventions.
     MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
@@ -146,9 +149,6 @@ namespace gbe
     MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
     MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
     MPM.add(createPruneEHPass());             // Remove dead EH info
-    MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr before inlining.
-    MPM.add(createFunctionInliningPass(20000));
-    MPM.add(createBarrierNodupPass(true));    // restore noduplicate fnAttr after inlining.
@@ -318,11 +318,23 @@ namespace gbe
     if (!cl_mod) return false;
+    legacy::PassManager passes__;
+    PassManager passes__;
+    //run ExpandConstantExprPass before collectDeviceEnqueueInfo
+    //to simplify the analyze of block.
+    passes__.add(createExpandConstantExprPass());    // constant prop may generate ConstantExpr
+    passes__.run(*cl_mod);
+    /* Must call before materialize when link */
+    collectDeviceEnqueueInfo(cl_mod, unit);
     std::unique_ptr<Module> M;
-    /* Before do any thing, we first filter in all CL functions in bitcode. */ 
-    M.reset(runBitCodeLinker(cl_mod, strictMath));
+    /* Before do any thing, we first filter in all CL functions in bitcode. */
+    /* Also set unit's pointer size in runBitCodeLinker */
+    M.reset(runBitCodeLinker(cl_mod, strictMath, unit));
     if (!module)
       delete cl_mod;
     if (M.get() == 0)
diff --git a/backend/src/llvm/llvm_unroll.cpp b/backend/src/llvm/llvm_unroll.cpp
index 8a492d6..e24dc4f 100644
--- a/backend/src/llvm/llvm_unroll.cpp
+++ b/backend/src/llvm/llvm_unroll.cpp
@@ -103,13 +103,11 @@ namespace gbe {
       void setUnrollID(Loop *L, bool enable) {
-        if (!enable && disabledLoops.find(L) != disabledLoops.end())
-           return;
+        assert(enable);
         LLVMContext &Context = L->getHeader()->getContext();
         SmallVector<Metadata *, 2> forceUnroll;
         forceUnroll.push_back(MDString::get(Context, "llvm.loop.unroll.enable"));
-        forceUnroll.push_back(ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), enable)));
         MDNode *forceUnrollNode = MDNode::get(Context, forceUnroll);
         SmallVector<Metadata *, 4> Vals;
@@ -127,8 +125,6 @@ namespace gbe {
         // Set operand 0 to refer to the loop id itself.
         NewLoopID->replaceOperandWith(0, NewLoopID);
-        if (!enable)
-          disabledLoops.insert(L);
       static bool hasPrivateLoadStore(Loop *L) {
@@ -190,7 +186,8 @@ namespace gbe {
         if (currTripCount > 32) {
           shouldUnroll = false;
-          setUnrollID(currL, false);
+          //Don't change the unrollID if doesn't force unroll.
+          //setUnrollID(currL, false);
           return shouldUnroll;
@@ -206,7 +203,8 @@ namespace gbe {
               parentTripCount = SE->getSmallConstantTripCount(parentL, parentExitBlock);
           if (parentTripCount != 0 && currTripCount * parentTripCount > 32) {
-            setUnrollID(parentL, false);
+            //Don't change the unrollID if doesn't force unroll.
+            //setUnrollID(parentL, false);
@@ -243,8 +241,6 @@ namespace gbe {
       virtual const char *getPassName() const {
         return "SPIR backend: custom loop unrolling pass";
-    private:
-      std::set<Loop *> disabledLoops;
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index 42e6cc4..b9b5c6f 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -5,7 +5,7 @@
 // Common defines for Image intrinsics
 // Channel order
-#define CLK_HAS_ALPHA(color) (color == CLK_A || color == CLK_RA || color == CLK_RGBA || color == CLK_BGRA || color == CLK_ARGB)
+#define CLK_HAS_ALPHA(color) (color == CLK_A || color == CLK_RA || color == CLK_RGBA || color == CLK_BGRA || color == CLK_ARGB || color == CLK_sRGBA || color == CLK_sBGRA)
 enum {
   CLK_R = 0x10B0,
   CLK_A = 0x10B1,
@@ -29,6 +29,11 @@ enum {
   CLK_RGx = 0x10BB,
   CLK_RGBx = 0x10BC
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_2_0)
+  ,
+  CLK_sRGBA = 0x10C1,
+  CLK_sBGRA = 0x10C2
@@ -118,8 +123,4 @@ typedef enum clk_sampler_type {
 } clk_sampler_type;
-// Memory synchronization
-#define CLK_LOCAL_MEM_FENCE     (1 << 0)
-#define CLK_GLOBAL_MEM_FENCE    (1 << 1)
 #endif   /* __OCL_COMMON_DEFINES__ */
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index 64d33dc..5c62b4c 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -158,6 +158,18 @@ Supported Targets
  * 6th Generation Intel Core Processors "Skylake" and "Kabylake".
  * 5th Generation Intel Atom Processors "Broxten" or "Apollolake".
+OpenCL 2.0
+Since release v1.3.0, Beignet supports OpenCL 2.0. By default, OpenCL 2.0 support is disabled; you can enable it by running cmake with the option
+-DENABLE_OPENCL_20=1. Please remember that enabling OpenCL 2.0 has some dependencies. First, OpenCL 2.0 is only supported on targets
+from Skylake onwards, including Skylake, Kabylake and Apollolake. Second, clang supports all OpenCL 2.0 features only from version 3.9, so to enable OpenCL 2.0
+you must update LLVM/clang to 3.9 or later. It also requires libdrm 2.4.66 or later.
+With OpenCL 2.0 enabled, Beignet complies with the OpenCL 2.0 spec, but some OpenCL 2.0 features, such as pipes and device queues, are simulated in software
+and give no performance gain; this applies especially to device queues.
+If you build Beignet with OpenCL 2.0 enabled and your kernels don't use any OpenCL 2.0 features, please pass the build option -cl-std=CL1.2 for
+better performance: OpenCL 2.0 uses more registers and has lots of int64 operations, which may hurt performance. Beignet will continue to improve
+OpenCL 2.0 performance.
 Known Issues
@@ -273,6 +285,7 @@ Documents for OpenCL application developers
 - [[V4l2 Buffer Sharing|Beignet/howto/v4l2-buffer-sharing-howto]]
 - [[Video Motion Estimation|Beignet/howto/video-motion-estimation-howto]]
 - [[Stand Alone Unit Test|Beignet/howto/stand-alone-utest-howto]]
+- [[Android build|Beignet/android-build-howto]]
 The wiki URL is as below:
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index 2ef0a89..465f38b 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,5 +1,8 @@
 # News
+## Jan 20, 2017
+[Beignet 1.3.0](https://01.org/beignet/downloads/beignet-1.3.0-2017-01-20) is released. This is a major release. Please see the release notes for more information.
 ## Nov 4, 2016
 [Beignet 1.2.1](https://01.org/beignet/downloads/beignet-1.2.1-2016-11-04) is released. This is a bug-fix release.
diff --git a/docs/howto/android-build-howto.mdwn b/docs/howto/android-build-howto.mdwn
new file mode 100644
index 0000000..be4d6a0
--- /dev/null
+++ b/docs/howto/android-build-howto.mdwn
@@ -0,0 +1,64 @@
+Android build HowTo
+Beignet supports the Android system, but of the several Android versions, Beignet
+only provides build files for Android 5.1.1 (Lollipop). This document describes
+how to build and install the Beignet driver and utests on Android devices.
+Please note that Beignet disables PCH on Android, so kernel compilation is very
+slow; we recommend using a cross-compiler and OpenCL binary programs on Android.
+You must have the Android source code and have built the Android system successfully.
+Build Beignet
+Beignet is built like any other Android native library; the steps are as follows.
+- Add Beignet to the Android source tree.
+  Put Beignet's source code in an appropriate directory under the Android source tree,
+  such as vendor/intel/ufo or external.
+- Set up android's environment and choose a target
+  Run `source build/envsetup.sh` and `lunch`.
+- Build the clang target library.
+  Android only builds clang for the host, but Beignet depends on the target libclang.so. Beignet provides
+  a Lollipop patch for clang (version 3.5) to build the target libclang.so; you can download the patch
+  from [this patch](https://01.org/sites/default/files/enable-clang-device-build-for-beignet.patch).
+  Change directory to external/clang, run `git apply` to apply the patch, and run `mm` to build
+  libclang.so.
+- Change to Beignet's directory and run make.
+  Run `mm -B`; if it fails the first time, run `mm -B` again. When it finishes, the Beignet libraries and
+  utests binaries will be generated in out/target/product/<target>/system/.
+Install Beignet to target device
+To install Beignet, you need to `adb push` all Beignet and clang libraries to the corresponding
+directories on the device, including the following files:
+out/target/product/<target>/system/lib64/libclang.so to /system/lib64/
+out/target/product/<target>/system/lib64/libcl.so to /system/lib64/
+out/target/product/<target>/system/lib64/libgbeinterp.so to /system/lib64/
+out/target/product/<target>/system/lib64/libgbe.so to /system/lib64/
+out/target/product/<target>/system/lib/libclang.so to /system/lib/
+out/target/product/<target>/system/lib/libcl.so to /system/lib/
+out/target/product/<target>/system/lib/libgbeinterp.so to /system/lib/
+out/target/product/<target>/system/lib/libgbe.so to /system/lib/
+Additionally, the bitcode file and the OCL header files also need to be copied to the device, as follows:
+out/target/product/<target>/gen/SHARED_LIBRARIES/libgbe_intermediates/beignet.bc  to /system/lib/ocl/
+out/target/product/<target>/gen/SHARED_LIBRARIES/libgbe_intermediates/libocl/include/  to /system/lib/ocl/include/
+If your application is linked to libOpenCL.so, you also need to create a soft link: `ln -s libcl.so libOpenCL.so`.
+If you want to run the utests on the device, you also need to copy the utests files:
+out/target/product/<target>/system/bin/utest_run-x86 to /system/bin/utest_run-x86
+out/target/product/<target>/system/bin/utest_run-x86_64 to /system/bin/utest_run-x86_64
+out/target/product/<target>/system/lib64/libutests.so to /system/lib64/
+out/target/product/<target>/system/lib/libcl.so to /system/lib/
+You also need to copy the utests' kernels to the device:
+<Beignet path>/kernels/ to /system/lib/ocl/
+and set the environment variable "OCL_KERNEL_PATH=/system/lib/ocl/kernels/" before running the utests.
diff --git a/include/CL/cl.h b/include/CL/cl.h
index 316565d..116f5d5 100644
--- a/include/CL/cl.h
+++ b/include/CL/cl.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
@@ -55,16 +60,19 @@ typedef cl_bitfield         cl_device_fp_config;
 typedef cl_uint             cl_device_mem_cache_type;
 typedef cl_uint             cl_device_local_mem_type;
 typedef cl_bitfield         cl_device_exec_capabilities;
+typedef cl_bitfield         cl_device_svm_capabilities;
 typedef cl_bitfield         cl_command_queue_properties;
 typedef intptr_t            cl_device_partition_property;
 typedef cl_bitfield         cl_device_affinity_domain;
 typedef intptr_t            cl_context_properties;
 typedef cl_uint             cl_context_info;
+typedef cl_bitfield         cl_queue_properties;
 typedef cl_uint             cl_command_queue_info;
 typedef cl_uint             cl_channel_order;
 typedef cl_uint             cl_channel_type;
 typedef cl_bitfield         cl_mem_flags;
+typedef cl_bitfield         cl_svm_mem_flags;
 typedef cl_uint             cl_mem_object_type;
 typedef cl_uint             cl_mem_info;
 typedef cl_bitfield         cl_mem_migration_flags;
@@ -74,6 +82,8 @@ typedef cl_uint             cl_addressing_mode;
 typedef cl_uint             cl_filter_mode;
 typedef cl_uint             cl_sampler_info;
 typedef cl_bitfield         cl_map_flags;
+typedef intptr_t            cl_pipe_properties;
+typedef cl_uint             cl_pipe_info;
 typedef cl_uint             cl_program_info;
 typedef cl_uint             cl_program_build_info;
 typedef cl_uint             cl_program_binary_type;
@@ -87,7 +97,8 @@ typedef cl_uint             cl_kernel_work_group_info;
 typedef cl_uint             cl_event_info;
 typedef cl_uint             cl_command_type;
 typedef cl_uint             cl_profiling_info;
+typedef cl_bitfield         cl_sampler_properties;
+typedef cl_uint             cl_kernel_exec_info;
 typedef struct _cl_image_format {
     cl_channel_order        image_channel_order;
@@ -104,7 +115,13 @@ typedef struct _cl_image_desc {
     size_t                  image_slice_pitch;
     cl_uint                 num_mip_levels;
     cl_uint                 num_samples;
-    cl_mem                  buffer;
+#ifdef __GNUC__
+    __extension__   /* Prevents warnings about anonymous union in -pedantic builds */
+    union {
+      cl_mem                  buffer;
+      cl_mem                  mem_object;
+    };
 } cl_image_desc;
 typedef struct _cl_buffer_region {
@@ -176,11 +193,14 @@ typedef struct _cl_buffer_region {
 #define CL_INVALID_COMPILER_OPTIONS                 -66
 #define CL_INVALID_LINKER_OPTIONS                   -67
+#define CL_INVALID_PIPE_SIZE                        -69
+#define CL_INVALID_DEVICE_QUEUE                     -70
 /* OpenCL Version */
 #define CL_VERSION_1_0                              1
 #define CL_VERSION_1_1                              1
 #define CL_VERSION_1_2                              1
+#define CL_VERSION_2_0                              1
 /* cl_bool */
 #define CL_FALSE                                    0
@@ -204,82 +224,98 @@ typedef struct _cl_buffer_region {
 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
 /* cl_device_info */
-#define CL_DEVICE_TYPE                              0x1000
-#define CL_DEVICE_VENDOR_ID                         0x1001
-#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
-#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
-#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
-#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
-#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
-#define CL_DEVICE_ADDRESS_BITS                      0x100D
-#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
-#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
-#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
-#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
-#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
-#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
-#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
-#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
-#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
-#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
-#define CL_DEVICE_MAX_SAMPLERS                      0x1018
-#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
-#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
-#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
-#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
-#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
-#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
-#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
-#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
-#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
-#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
-#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
-#define CL_DEVICE_AVAILABLE                         0x1027
-#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
-#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
-#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
-#define CL_DEVICE_NAME                              0x102B
-#define CL_DEVICE_VENDOR                            0x102C
-#define CL_DRIVER_VERSION                           0x102D
-#define CL_DEVICE_PROFILE                           0x102E
-#define CL_DEVICE_VERSION                           0x102F
-#define CL_DEVICE_EXTENSIONS                        0x1030
-#define CL_DEVICE_PLATFORM                          0x1031
-#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
+#define CL_DEVICE_TYPE                                  0x1000
+#define CL_DEVICE_VENDOR_ID                             0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                     0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS              0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE                   0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES                   0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT            0x1008
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY                   0x100C
+#define CL_DEVICE_ADDRESS_BITS                          0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS                   0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS                  0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                    0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                     0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                    0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                     0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                    0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                     0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                         0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                    0x1017
+#define CL_DEVICE_MAX_SAMPLERS                          0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN                   0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE              0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                      0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE                 0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE             0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE                 0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                       0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE              0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                     0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                        0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                        0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT              0x1024
+#define CL_DEVICE_ENDIAN_LITTLE                         0x1026
+#define CL_DEVICE_AVAILABLE                             0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                    0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES                0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                      0x102A    /* deprecated */
+#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES              0x102A
+#define CL_DEVICE_NAME                                  0x102B
+#define CL_DEVICE_VENDOR                                0x102C
+#define CL_DRIVER_VERSION                               0x102D
+#define CL_DEVICE_PROFILE                               0x102E
+#define CL_DEVICE_VERSION                               0x102F
+#define CL_DEVICE_EXTENSIONS                            0x1030
+#define CL_DEVICE_PLATFORM                              0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG                      0x1032
 /* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
-#define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
-#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
-#define CL_DEVICE_OPENCL_C_VERSION                  0x103D
-#define CL_DEVICE_LINKER_AVAILABLE                  0x103E
-#define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
-#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
-#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
-#define CL_DEVICE_PARENT_DEVICE                     0x1042
-#define CL_DEVICE_PARTITION_PROPERTIES              0x1044
-#define CL_DEVICE_PARTITION_TYPE                    0x1046
-#define CL_DEVICE_REFERENCE_COUNT                   0x1047
-#define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
-#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
+#define CL_DEVICE_HOST_UNIFIED_MEMORY                   0x1035   /* deprecated */
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR              0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT             0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT               0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG              0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT             0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF              0x103C
+#define CL_DEVICE_OPENCL_C_VERSION                      0x103D
+#define CL_DEVICE_LINKER_AVAILABLE                      0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS                      0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE                 0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE                  0x1041
+#define CL_DEVICE_PARENT_DEVICE                         0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES             0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES                  0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN             0x1045
+#define CL_DEVICE_PARTITION_TYPE                        0x1046
+#define CL_DEVICE_REFERENCE_COUNT                       0x1047
+#define CL_DEVICE_PRINTF_BUFFER_SIZE                    0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                 0x104A
+#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS             0x104C
+#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE              0x104D
+#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE              0x1050
+#define CL_DEVICE_MAX_ON_DEVICE_QUEUES                  0x1051
+#define CL_DEVICE_MAX_ON_DEVICE_EVENTS                  0x1052
+#define CL_DEVICE_SVM_CAPABILITIES                      0x1053
+#define CL_DEVICE_MAX_PIPE_ARGS                         0x1055
+#define CL_DEVICE_PIPE_MAX_PACKET_SIZE                  0x1057
 /* cl_device_fp_config - bitfield */
 #define CL_FP_DENORM                                (1 << 0)
@@ -307,6 +343,8 @@ typedef struct _cl_buffer_region {
 /* cl_command_queue_properties - bitfield */
 #define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+#define CL_QUEUE_ON_DEVICE                          (1 << 2)
+#define CL_QUEUE_ON_DEVICE_DEFAULT                  (1 << 3)
 /* cl_context_info  */
 #define CL_CONTEXT_REFERENCE_COUNT                  0x1080
@@ -325,20 +363,27 @@ typedef struct _cl_buffer_region {
 /* cl_device_affinity_domain */
-#define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
-#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
-#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
-#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
-#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA               (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE           (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE           (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE           (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE           (1 << 4)
+/* cl_device_svm_capabilities */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS                       (1 << 3)
 /* cl_command_queue_info */
 #define CL_QUEUE_CONTEXT                            0x1090
 #define CL_QUEUE_DEVICE                             0x1091
 #define CL_QUEUE_REFERENCE_COUNT                    0x1092
 #define CL_QUEUE_PROPERTIES                         0x1093
+#define CL_QUEUE_SIZE                               0x1094
-/* cl_mem_flags - bitfield */
+/* cl_mem_flags and cl_svm_mem_flags - bitfield */
 #define CL_MEM_READ_WRITE                           (1 << 0)
 #define CL_MEM_WRITE_ONLY                           (1 << 1)
 #define CL_MEM_READ_ONLY                            (1 << 2)
@@ -349,6 +394,9 @@ typedef struct _cl_buffer_region {
 #define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
 #define CL_MEM_HOST_READ_ONLY                       (1 << 8)
 #define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER                (1 << 10)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_SVM_ATOMICS                          (1 << 11)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_KERNEL_READ_AND_WRITE                (1 << 12)
 /* cl_mem_migration_flags - bitfield */
 #define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
@@ -370,6 +418,11 @@ typedef struct _cl_buffer_region {
 #define CL_RGBx                                     0x10BC
 #define CL_DEPTH                                    0x10BD
 #define CL_DEPTH_STENCIL                            0x10BE
+#define CL_sRGB                                     0x10BF
+#define CL_sRGBx                                    0x10C0
+#define CL_sRGBA                                    0x10C1
+#define CL_sBGRA                                    0x10C2
+#define CL_ABGR                                     0x10C3
 /* cl_channel_type */
 #define CL_SNORM_INT8                               0x10D0
@@ -397,6 +450,7 @@ typedef struct _cl_buffer_region {
 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
 #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
 #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+#define CL_MEM_OBJECT_PIPE                          0x10F7
 /* cl_mem_info */
 #define CL_MEM_TYPE                                 0x1100
@@ -408,6 +462,7 @@ typedef struct _cl_buffer_region {
 #define CL_MEM_CONTEXT                              0x1106
 #define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
 #define CL_MEM_OFFSET                               0x1108
+#define CL_MEM_USES_SVM_POINTER                     0x1109
 /* cl_image_info */
 #define CL_IMAGE_FORMAT                             0x1110
@@ -421,6 +476,10 @@ typedef struct _cl_buffer_region {
 #define CL_IMAGE_BUFFER                             0x1118
 #define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
 #define CL_IMAGE_NUM_SAMPLES                        0x111A
+/* cl_pipe_info */
+#define CL_PIPE_PACKET_SIZE                         0x1120
+#define CL_PIPE_MAX_PACKETS                         0x1121
 /* cl_addressing_mode */
 #define CL_ADDRESS_NONE                             0x1130
@@ -439,6 +498,9 @@ typedef struct _cl_buffer_region {
 #define CL_SAMPLER_NORMALIZED_COORDS                0x1152
 #define CL_SAMPLER_ADDRESSING_MODE                  0x1153
 #define CL_SAMPLER_FILTER_MODE                      0x1154
+#define CL_SAMPLER_MIP_FILTER_MODE                  0x1155
+#define CL_SAMPLER_LOD_MIN                          0x1156
+#define CL_SAMPLER_LOD_MAX                          0x1157
 /* cl_map_flags - bitfield */
 #define CL_MAP_READ                                 (1 << 0)
@@ -461,6 +523,7 @@ typedef struct _cl_buffer_region {
 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
 #define CL_PROGRAM_BUILD_LOG                        0x1183
 #define CL_PROGRAM_BINARY_TYPE                      0x1184
 /* cl_program_binary_type */
 #define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
@@ -506,6 +569,7 @@ typedef struct _cl_buffer_region {
 #define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
 #define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
 #define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
+#define CL_KERNEL_ARG_TYPE_PIPE                     (1 << 3)
 /* cl_kernel_work_group_info */
 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
@@ -514,6 +578,10 @@ typedef struct _cl_buffer_region {
 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
 #define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
+/* cl_kernel_exec_info */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS                0x11B6
 /* cl_event_info  */
 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
@@ -548,6 +616,11 @@ typedef struct _cl_buffer_region {
 #define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
 #define CL_COMMAND_FILL_BUFFER                      0x1207
 #define CL_COMMAND_FILL_IMAGE                       0x1208
+#define CL_COMMAND_SVM_FREE                         0x1209
+#define CL_COMMAND_SVM_MEMCPY                       0x120A
+#define CL_COMMAND_SVM_MEMFILL                      0x120B
+#define CL_COMMAND_SVM_MAP                          0x120C
+#define CL_COMMAND_SVM_UNMAP                        0x120D
 /* command execution status */
 #define CL_COMPLETE                                 0x0
@@ -563,6 +636,7 @@ typedef struct _cl_buffer_region {
 #define CL_PROFILING_COMMAND_SUBMIT                 0x1281
 #define CL_PROFILING_COMMAND_START                  0x1282
 #define CL_PROFILING_COMMAND_END                    0x1283
+#define CL_PROFILING_COMMAND_COMPLETE               0x1284
@@ -638,10 +712,10 @@ clGetContextInfo(cl_context         /* context */,
 /* Command Queue APIs */
 extern CL_API_ENTRY cl_command_queue CL_API_CALL
-clCreateCommandQueue(cl_context                     /* context */, 
-                     cl_device_id                   /* device */, 
-                     cl_command_queue_properties    /* properties */,
-                     cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateCommandQueueWithProperties(cl_context               /* context */,
+                                   cl_device_id             /* device */,
+                                   const cl_queue_properties *    /* properties */,
+                                   cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
@@ -679,6 +753,14 @@ clCreateImage(cl_context              /* context */,
               void *                  /* host_ptr */,
               cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreatePipe(cl_context                 /* context */,
+             cl_mem_flags               /* flags */,
+             cl_uint                    /* pipe_packet_size */,
+             cl_uint                    /* pipe_max_packets */,
+             const cl_pipe_properties * /* properties */,
+             cl_int *                   /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
@@ -706,19 +788,36 @@ clGetImageInfo(cl_mem           /* image */,
                size_t           /* param_value_size */,
                void *           /* param_value */,
                size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPipeInfo(cl_mem           /* pipe */,
+              cl_pipe_info     /* param_name */,
+              size_t           /* param_value_size */,
+              void *           /* param_value */,
+              size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
-clSetMemObjectDestructorCallback(  cl_mem /* memobj */, 
-                                    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
-                                    void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;  
+clSetMemObjectDestructorCallback(cl_mem /* memobj */,
+                                 void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+                                 void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;
+/* SVM Allocation APIs */
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAlloc(cl_context       /* context */,
+           cl_svm_mem_flags /* flags */,
+           size_t           /* size */,
+           cl_uint          /* alignment */) CL_API_SUFFIX__VERSION_2_0;
+clSVMFree(cl_context        /* context */,
+          void *            /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;
 /* Sampler APIs */
 extern CL_API_ENTRY cl_sampler CL_API_CALL
-clCreateSampler(cl_context          /* context */,
-                cl_bool             /* normalized_coords */, 
-                cl_addressing_mode  /* addressing_mode */, 
-                cl_filter_mode      /* filter_mode */,
-                cl_int *            /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateSamplerWithProperties(cl_context                     /* context */,
+                              const cl_sampler_properties *  /* normalized_coords */,
+                              cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
@@ -837,6 +936,17 @@ clSetKernelArg(cl_kernel    /* kernel */,
                const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointer(cl_kernel    /* kernel */,
+                         cl_uint      /* arg_index */,
+                         const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfo(cl_kernel            /* kernel */,
+                    cl_kernel_exec_info  /* param_name */,
+                    size_t               /* param_value_size */,
+                    const void *         /* param_value */) CL_API_SUFFIX__VERSION_2_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
 clGetKernelInfo(cl_kernel       /* kernel */,
                 cl_kernel_info  /* param_name */,
                 size_t          /* param_value_size */,
@@ -1122,13 +1232,6 @@ clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
                        cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueTask(cl_command_queue  /* command_queue */,
-              cl_kernel         /* kernel */,
-              cl_uint           /* num_events_in_wait_list */,
-              const cl_event *  /* event_wait_list */,
-              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
-extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
 					  void (CL_CALLBACK * /*user_func*/)(void *), 
                       void *            /* args */,
@@ -1141,17 +1244,67 @@ clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
                       cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+clEnqueueMarkerWithWaitList(cl_command_queue  /* command_queue */,
                             cl_uint           /* num_events_in_wait_list */,
                             const cl_event *  /* event_wait_list */,
                             cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+clEnqueueBarrierWithWaitList(cl_command_queue  /* command_queue */,
                              cl_uint           /* num_events_in_wait_list */,
                              const cl_event *  /* event_wait_list */,
                              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFree(cl_command_queue  /* command_queue */,
+                 cl_uint           /* num_svm_pointers */,
+                 void *[]          /* svm_pointers[] */,
+                 void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+                                                        cl_uint          /* num_svm_pointers */,
+                                                        void *[]         /* svm_pointers[] */,
+                                                        void *           /* user_data */),
+                 void *            /* user_data */,
+                 cl_uint           /* num_events_in_wait_list */,
+                 const cl_event *  /* event_wait_list */,
+                 cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpy(cl_command_queue  /* command_queue */,
+                   cl_bool           /* blocking_copy */,
+                   void *            /* dst_ptr */,
+                   const void *      /* src_ptr */,
+                   size_t            /* size */,
+                   cl_uint           /* num_events_in_wait_list */,
+                   const cl_event *  /* event_wait_list */,
+                   cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFill(cl_command_queue  /* command_queue */,
+                    void *            /* svm_ptr */,
+                    const void *      /* pattern */,
+                    size_t            /* pattern_size */,
+                    size_t            /* size */,
+                    cl_uint           /* num_events_in_wait_list */,
+                    const cl_event *  /* event_wait_list */,
+                    cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMap(cl_command_queue  /* command_queue */,
+                cl_bool           /* blocking_map */,
+                cl_map_flags      /* flags */,
+                void *            /* svm_ptr */,
+                size_t            /* size */,
+                cl_uint           /* num_events_in_wait_list */,
+                const cl_event *  /* event_wait_list */,
+                cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmap(cl_command_queue  /* command_queue */,
+                  void *            /* svm_ptr */,
+                  cl_uint           /* num_events_in_wait_list */,
+                  const cl_event *  /* event_wait_list */,
+                  cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
 /* Extension function access
@@ -1205,7 +1358,29 @@ clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* Deprecated OpenCL 2.0 APIs */
+clCreateCommandQueue(cl_context                     /* context */,
+                     cl_device_id                   /* device */,
+                     cl_command_queue_properties    /* properties */,
+                     cl_int *                       /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+clCreateSampler(cl_context          /* context */,
+                cl_bool             /* normalized_coords */,
+                cl_addressing_mode  /* addressing_mode */,
+                cl_filter_mode      /* filter_mode */,
+                cl_int *            /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+clEnqueueTask(cl_command_queue  /* command_queue */,
+              cl_kernel         /* kernel */,
+              cl_uint           /* num_events_in_wait_list */,
+              const cl_event *  /* event_wait_list */,
+              cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
 #ifdef __cplusplus
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
deleted file mode 100644
index 38fac19..0000000
--- a/include/CL/cl.hpp
+++ /dev/null
@@ -1,12452 +0,0 @@
- * Copyright (c) 2008-2013 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- ******************************************************************************/
-/*! \file
- *
- *   \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and 
- *       OpenCL 1.2 (rev 15)    
- *   \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
- *   
- *   Additions and fixes from:
- *       Brian Cole, March 3rd 2010 and April 2012 
- *       Matt Gruenke, April 2012.
- *       Bruce Merry, February 2013.
- *       Tom Deakin and Simon McIntosh-Smith, July 2013
- *   
- *   \version 1.2.6
- *   \date August 2013
- *
- *   Optional extension support
- *
- *         cl
- *         cl_ext_device_fission
- *				#define USE_CL_DEVICE_FISSION
- */
-/*! \mainpage
- * \section intro Introduction
- * For many large applications C++ is the language of choice and so it seems
- * reasonable to define C++ bindings for OpenCL.
- *
- *
- * The interface is contained with a single C++ header file \em cl.hpp and all
- * definitions are contained within the namespace \em cl. There is no additional
- * requirement to include \em cl.h and to use either the C++ or original C
- * bindings it is enough to simply include \em cl.hpp.
- *
- * The bindings themselves are lightweight and correspond closely to the
- * underlying C API. Using the C++ bindings introduces no additional execution
- * overhead.
- *
- * For detail documentation on the bindings see:
- *
- * The OpenCL C++ Wrapper API 1.2 (revision 09)
- *  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
- *
- * \section example Example
- *
- * The following example shows a general use case for the C++
- * bindings, including support for the optional exception feature and
- * also the supplied vector and string classes, see following sections for
- * decriptions of these features.
- *
- * \code
- * 
- * #if defined(__APPLE__) || defined(__MACOSX)
- * #include <OpenCL/cl.hpp>
- * #else
- * #include <CL/cl.hpp>
- * #endif
- * #include <cstdio>
- * #include <cstdlib>
- * #include <iostream>
- * 
- *  const char * helloStr  = "__kernel void "
- *                           "hello(void) "
- *                           "{ "
- *                           "  "
- *                           "} ";
- * 
- *  int
- *  main(void)
- *  {
- *     cl_int err = CL_SUCCESS;
- *     try {
- *
- *       std::vector<cl::Platform> platforms;
- *       cl::Platform::get(&platforms);
- *       if (platforms.size() == 0) {
- *           std::cout << "Platform size 0\n";
- *           return -1;
- *       }
- *
- *       cl_context_properties properties[] = 
- *          { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
- *       cl::Context context(CL_DEVICE_TYPE_CPU, properties); 
- * 
- *       std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
- * 
- *       cl::Program::Sources source(1,
- *           std::make_pair(helloStr,strlen(helloStr)));
- *       cl::Program program_ = cl::Program(context, source);
- *       program_.build(devices);
- * 
- *       cl::Kernel kernel(program_, "hello", &err);
- * 
- *       cl::Event event;
- *       cl::CommandQueue queue(context, devices[0], 0, &err);
- *       queue.enqueueNDRangeKernel(
- *           kernel, 
- *           cl::NullRange, 
- *           cl::NDRange(4,4),
- *           cl::NullRange,
- *           NULL,
- *           &event); 
- * 
- *       event.wait();
- *     }
- *     catch (cl::Error err) {
- *        std::cerr 
- *           << "ERROR: "
- *           << err.what()
- *           << "("
- *           << err.err()
- *           << ")"
- *           << std::endl;
- *     }
- * 
- *    return EXIT_SUCCESS;
- *  }
- * 
- * \endcode
- *
- */
-#ifndef CL_HPP_
-#define CL_HPP_
-#ifdef _WIN32
-#include <windows.h>
-#include <malloc.h>
-#include <iterator>
-#include <intrin.h>
-#if defined(__CL_ENABLE_EXCEPTIONS)
-#include <exception>
-#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
-#pragma push_macro("max")
-#undef max
-#if defined(USE_DX_INTEROP)
-#include <CL/cl_d3d10.h>
-#include <CL/cl_dx9_media_sharing.h>
-#endif // _WIN32
-#include <CL/cl_ext.h>
-#if defined(__APPLE__) || defined(__MACOSX)
-#include <OpenGL/OpenGL.h>
-#include <OpenCL/opencl.h>
-#include <libkern/OSAtomic.h>
-#include <GL/gl.h>
-#include <CL/opencl.h>
-#endif // !__APPLE__
-// To avoid accidentally taking ownership of core OpenCL types
-// such as cl_kernel constructors are made explicit
-// under OpenCL 1.2
-#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-#define __CL_EXPLICIT_CONSTRUCTORS explicit
-#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-// Define deprecated prefixes and suffixes to ensure compilation
-// in case they are not pre-defined
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#if !defined(CL_CALLBACK)
-#define CL_CALLBACK
-#endif //CL_CALLBACK
-#include <utility>
-#include <limits>
-#if !defined(__NO_STD_VECTOR)
-#include <vector>
-#if !defined(__NO_STD_STRING)
-#include <string>
-#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
-#include <alloca.h>
-#include <emmintrin.h>
-#include <xmmintrin.h>
-#endif // linux
-#include <cstring>
-/*! \namespace cl
- *
- * \brief The OpenCL C++ bindings are defined within this namespace.
- *
- */
-namespace cl {
-class Memory;
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
-#define __INIT_CL_EXT_FCN_PTR(name) \
-    if(!pfn_##name) { \
-        pfn_##name = (PFN_##name) \
-            clGetExtensionFunctionAddress(#name); \
-        if(!pfn_##name) { \
-        } \
-    }
-#endif // #if defined(CL_VERSION_1_1)
-#if defined(CL_VERSION_1_2)
-#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
-    if(!pfn_##name) { \
-        pfn_##name = (PFN_##name) \
-            clGetExtensionFunctionAddressForPlatform(platform, #name); \
-        if(!pfn_##name) { \
-        } \
-    }
-#endif // #if defined(CL_VERSION_1_1)
-class Program;
-class Device;
-class Context;
-class CommandQueue;
-class Memory;
-class Buffer;
-#if defined(__CL_ENABLE_EXCEPTIONS)
-/*! \brief Exception class 
- * 
- *  This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
- */
-class Error : public std::exception
-    cl_int err_;
-    const char * errStr_;
-    /*! \brief Create a new CL error exception for a given error code
-     *  and corresponding message.
-     * 
-     *  \param err error code value.
-     *
-     *  \param errStr a descriptive string that must remain in scope until
-     *                handling of the exception has concluded.  If set, it
-     *                will be returned by what().
-     */
-    Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
-    {}
-    ~Error() throw() {}
-    /*! \brief Get error string associated with exception
-     *
-     * \return A memory pointer to the error message string.
-     */
-    virtual const char * what() const throw ()
-    {
-        if (errStr_ == NULL) {
-            return "empty";
-        }
-        else {
-            return errStr_;
-        }
-    }
-    /*! \brief Get error code associated with exception
-     *
-     *  \return The error code.
-     */
-    cl_int err(void) const { return err_; }
-#define __ERR_STR(x) #x
-#define __ERR_STR(x) NULL
-namespace detail
-#if defined(__CL_ENABLE_EXCEPTIONS)
-static inline cl_int errHandler (
-    cl_int err,
-    const char * errStr = NULL)
-    if (err != CL_SUCCESS) {
-        throw Error(err, errStr);
-    }
-    return err;
-static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
-    (void) errStr; // suppress unused variable warning
-    return err;
-#define __GET_DEVICE_INFO_ERR               __ERR_STR(clGetDeviceInfo)
-#define __GET_PLATFORM_INFO_ERR             __ERR_STR(clGetPlatformInfo)
-#define __GET_DEVICE_IDS_ERR                __ERR_STR(clGetDeviceIDs)
-#define __GET_PLATFORM_IDS_ERR              __ERR_STR(clGetPlatformIDs)
-#define __GET_CONTEXT_INFO_ERR              __ERR_STR(clGetContextInfo)
-#define __GET_EVENT_INFO_ERR                __ERR_STR(clGetEventInfo)
-#define __GET_EVENT_PROFILE_INFO_ERR        __ERR_STR(clGetEventProfileInfo)
-#define __GET_MEM_OBJECT_INFO_ERR           __ERR_STR(clGetMemObjectInfo)
-#define __GET_IMAGE_INFO_ERR                __ERR_STR(clGetImageInfo)
-#define __GET_SAMPLER_INFO_ERR              __ERR_STR(clGetSamplerInfo)
-#define __GET_KERNEL_INFO_ERR               __ERR_STR(clGetKernelInfo)
-#if defined(CL_VERSION_1_2)
-#define __GET_KERNEL_ARG_INFO_ERR               __ERR_STR(clGetKernelArgInfo)
-#endif // #if defined(CL_VERSION_1_2)
-#define __GET_KERNEL_WORK_GROUP_INFO_ERR    __ERR_STR(clGetKernelWorkGroupInfo)
-#define __GET_PROGRAM_INFO_ERR              __ERR_STR(clGetProgramInfo)
-#define __GET_PROGRAM_BUILD_INFO_ERR        __ERR_STR(clGetProgramBuildInfo)
-#define __GET_COMMAND_QUEUE_INFO_ERR        __ERR_STR(clGetCommandQueueInfo)
-#define __CREATE_CONTEXT_ERR                __ERR_STR(clCreateContext)
-#define __CREATE_CONTEXT_FROM_TYPE_ERR      __ERR_STR(clCreateContextFromType)
-#define __GET_SUPPORTED_IMAGE_FORMATS_ERR   __ERR_STR(clGetSupportedImageFormats)
-#define __CREATE_BUFFER_ERR                 __ERR_STR(clCreateBuffer)
-#define __COPY_ERR                          __ERR_STR(cl::copy)
-#define __CREATE_SUBBUFFER_ERR              __ERR_STR(clCreateSubBuffer)
-#define __CREATE_GL_BUFFER_ERR              __ERR_STR(clCreateFromGLBuffer)
-#define __CREATE_GL_RENDER_BUFFER_ERR       __ERR_STR(clCreateFromGLBuffer)
-#define __GET_GL_OBJECT_INFO_ERR            __ERR_STR(clGetGLObjectInfo)
-#if defined(CL_VERSION_1_2)
-#define __CREATE_IMAGE_ERR                  __ERR_STR(clCreateImage)
-#define __CREATE_GL_TEXTURE_ERR             __ERR_STR(clCreateFromGLTexture)
-#define __IMAGE_DIMENSION_ERR               __ERR_STR(Incorrect image dimensions)
-#endif // #if defined(CL_VERSION_1_2)
-#define __CREATE_SAMPLER_ERR                __ERR_STR(clCreateSampler)
-#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
-#define __CREATE_USER_EVENT_ERR             __ERR_STR(clCreateUserEvent)
-#define __SET_USER_EVENT_STATUS_ERR         __ERR_STR(clSetUserEventStatus)
-#define __SET_EVENT_CALLBACK_ERR            __ERR_STR(clSetEventCallback)
-#define __WAIT_FOR_EVENTS_ERR               __ERR_STR(clWaitForEvents)
-#define __CREATE_KERNEL_ERR                 __ERR_STR(clCreateKernel)
-#define __SET_KERNEL_ARGS_ERR               __ERR_STR(clSetKernelArg)
-#define __CREATE_PROGRAM_WITH_SOURCE_ERR    __ERR_STR(clCreateProgramWithSource)
-#define __CREATE_PROGRAM_WITH_BINARY_ERR    __ERR_STR(clCreateProgramWithBinary)
-#if defined(CL_VERSION_1_2)
-#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR    __ERR_STR(clCreateProgramWithBuiltInKernels)
-#endif // #if defined(CL_VERSION_1_2)
-#define __BUILD_PROGRAM_ERR                 __ERR_STR(clBuildProgram)
-#if defined(CL_VERSION_1_2)
-#define __COMPILE_PROGRAM_ERR                  __ERR_STR(clCompileProgram)
-#endif // #if defined(CL_VERSION_1_2)
-#define __CREATE_KERNELS_IN_PROGRAM_ERR     __ERR_STR(clCreateKernelsInProgram)
-#define __CREATE_COMMAND_QUEUE_ERR          __ERR_STR(clCreateCommandQueue)
-#define __SET_COMMAND_QUEUE_PROPERTY_ERR    __ERR_STR(clSetCommandQueueProperty)
-#define __ENQUEUE_READ_BUFFER_ERR           __ERR_STR(clEnqueueReadBuffer)
-#define __ENQUEUE_READ_BUFFER_RECT_ERR      __ERR_STR(clEnqueueReadBufferRect)
-#define __ENQUEUE_WRITE_BUFFER_ERR          __ERR_STR(clEnqueueWriteBuffer)
-#define __ENQUEUE_WRITE_BUFFER_RECT_ERR     __ERR_STR(clEnqueueWriteBufferRect)
-#define __ENQEUE_COPY_BUFFER_ERR            __ERR_STR(clEnqueueCopyBuffer)
-#define __ENQEUE_COPY_BUFFER_RECT_ERR       __ERR_STR(clEnqueueCopyBufferRect)
-#define __ENQUEUE_FILL_BUFFER_ERR           __ERR_STR(clEnqueueFillBuffer)
-#define __ENQUEUE_READ_IMAGE_ERR            __ERR_STR(clEnqueueReadImage)
-#define __ENQUEUE_WRITE_IMAGE_ERR           __ERR_STR(clEnqueueWriteImage)
-#define __ENQUEUE_COPY_IMAGE_ERR            __ERR_STR(clEnqueueCopyImage)
-#define __ENQUEUE_FILL_IMAGE_ERR           __ERR_STR(clEnqueueFillImage)
-#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  __ERR_STR(clEnqueueCopyImageToBuffer)
-#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  __ERR_STR(clEnqueueCopyBufferToImage)
-#define __ENQUEUE_MAP_BUFFER_ERR            __ERR_STR(clEnqueueMapBuffer)
-#define __ENQUEUE_MAP_IMAGE_ERR             __ERR_STR(clEnqueueMapImage)
-#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR      __ERR_STR(clEnqueueUnMapMemObject)
-#define __ENQUEUE_NDRANGE_KERNEL_ERR        __ERR_STR(clEnqueueNDRangeKernel)
-#define __ENQUEUE_TASK_ERR                  __ERR_STR(clEnqueueTask)
-#define __ENQUEUE_NATIVE_KERNEL             __ERR_STR(clEnqueueNativeKernel)
-#if defined(CL_VERSION_1_2)
-#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR   __ERR_STR(clEnqueueMigrateMemObjects)
-#endif // #if defined(CL_VERSION_1_2)
-#define __ENQUEUE_ACQUIRE_GL_ERR            __ERR_STR(clEnqueueAcquireGLObjects)
-#define __ENQUEUE_RELEASE_GL_ERR            __ERR_STR(clEnqueueReleaseGLObjects)
-#define __RETAIN_ERR                        __ERR_STR(Retain Object)
-#define __RELEASE_ERR                       __ERR_STR(Release Object)
-#define __FLUSH_ERR                         __ERR_STR(clFlush)
-#define __FINISH_ERR                        __ERR_STR(clFinish)
-#define __VECTOR_CAPACITY_ERR               __ERR_STR(Vector capacity error)
- * CL 1.2 version that uses device fission.
- */
-#if defined(CL_VERSION_1_2)
-#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevices)
-#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevicesEXT)
-#endif // #if defined(CL_VERSION_1_2)
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
-#define __ENQUEUE_MARKER_ERR                __ERR_STR(clEnqueueMarker)
-#define __ENQUEUE_WAIT_FOR_EVENTS_ERR       __ERR_STR(clEnqueueWaitForEvents)
-#define __ENQUEUE_BARRIER_ERR               __ERR_STR(clEnqueueBarrier)
-#define __UNLOAD_COMPILER_ERR               __ERR_STR(clUnloadCompiler)
-#define __CREATE_GL_TEXTURE_2D_ERR          __ERR_STR(clCreateFromGLTexture2D)
-#define __CREATE_GL_TEXTURE_3D_ERR          __ERR_STR(clCreateFromGLTexture3D)
-#define __CREATE_IMAGE2D_ERR                __ERR_STR(clCreateImage2D)
-#define __CREATE_IMAGE3D_ERR                __ERR_STR(clCreateImage3D)
-#endif // #if defined(CL_VERSION_1_1)
-//! \endcond
- * CL 1.2 marker and barrier commands
- */
-#if defined(CL_VERSION_1_2)
-#define __ENQUEUE_MARKER_WAIT_LIST_ERR                __ERR_STR(clEnqueueMarkerWithWaitList)
-#define __ENQUEUE_BARRIER_WAIT_LIST_ERR               __ERR_STR(clEnqueueBarrierWithWaitList)
-#endif // #if defined(CL_VERSION_1_2)
-#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
-typedef std::string STRING_CLASS;
-#elif !defined(__USE_DEV_STRING) 
-/*! \class string
- * \brief Simple string class, that provides a limited subset of std::string
- * functionality but avoids many of the issues that come with that class.
- *  \note Deprecated. Please use std::string as default or
- *  re-define the string class to match the std::string
- *  interface by defining STRING_CLASS
- */
-    ::size_t size_;
-    char * str_;
-    //! \brief Constructs an empty string, allocating no memory.
-    string(void) : size_(0), str_(NULL)
-    {
-    }
-    /*! \brief Constructs a string populated from an arbitrary value of
-     *  specified size.
-     * 
-     *  An extra '\0' is added, in case none was contained in str.
-     *
-     *  \param str the initial value of the string instance.  Note that '\0'     
-     *             characters receive no special treatment.  If NULL,
-     *             the string is left empty, with a size of 0.
-     *
-     *  \param size the number of characters to copy from str.
-     */
-    string(const char * str, ::size_t size) :
-        size_(size),
-        str_(NULL)
-    {
-        if( size > 0 ) {
-            str_ = new char[size_+1];
-            if (str_ != NULL) {
-                memcpy(str_, str, size_  * sizeof(char));
-                str_[size_] = '\0';
-            }
-            else {
-                size_ = 0;
-            }
-        }
-    }
-    /*! \brief Constructs a string populated from a null-terminated value.
-     *
-     *  \param str the null-terminated initial value of the string instance.
-     *             If NULL, the string is left empty, with a size of 0.
-     */
-    string(const char * str) :
-        size_(0),
-        str_(NULL)
-    {
-        if( str ) {
-            size_= ::strlen(str);
-        }
-        if( size_ > 0 ) {
-            str_ = new char[size_ + 1];
-            if (str_ != NULL) {
-                memcpy(str_, str, (size_ + 1) * sizeof(char));
-            }
-        }
-    }
-    void resize( ::size_t n )
-    {
-        if( size_ == n ) {
-            return;
-        }
-        if (n == 0) {
-            if( str_ ) {
-                delete [] str_;
-            }
-            str_ = NULL;
-            size_ = 0;
-        } 
-        else {
-            char *newString = new char[n + 1];
-            int copySize = n;
-            if( size_ < n ) {
-                copySize = size_;
-            }
-            size_ = n;
-            if(str_) {
-                memcpy(newString, str_, (copySize + 1) * sizeof(char));
-            }
-            if( copySize < size_ ) {
-                memset(newString + copySize, 0, size_ - copySize);
-            }
-            newString[size_] = '\0';
-            delete [] str_;
-            str_ = newString;
-        }
-    }
-    const char& operator[] ( ::size_t pos ) const
-    {
-        return str_[pos];
-    }
-    char& operator[] ( ::size_t pos )
-    {
-        return str_[pos];
-    }
-    /*! \brief Copies the value of another string to this one.
-     *
-     *  \param rhs the string to copy.
-     *
-     *  \returns a reference to the modified instance.
-     */
-    string& operator=(const string& rhs)
-    {
-        if (this == &rhs) {
-            return *this;
-        }
-        if( str_ != NULL ) {
-            delete [] str_;
-            str_ = NULL;
-            size_ = 0;
-        }
-        if (rhs.size_ == 0 || rhs.str_ == NULL) {
-            str_ = NULL;
-            size_ = 0;
-        } 
-        else {
-            str_ = new char[rhs.size_ + 1];
-            size_ = rhs.size_;
-            if (str_ != NULL) {
-                memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
-            }
-            else {
-                size_ = 0;
-            }
-        }
-        return *this;
-    }
-    /*! \brief Constructs a string by copying the value of another instance.
-     *
-     *  \param rhs the string to copy.
-     */
-    string(const string& rhs) :
-        size_(0),
-        str_(NULL)
-    {
-        *this = rhs;
-    }
-    //! \brief Destructor - frees memory used to hold the current value.
-    ~string()
-    {
-        delete[] str_;
-        str_ = NULL;
-    }
-    //! \brief Queries the length of the string, excluding any added '\0's.
-    ::size_t size(void) const   { return size_; }
-    //! \brief Queries the length of the string, excluding any added '\0's.
-    ::size_t length(void) const { return size(); }
-    /*! \brief Returns a pointer to the private copy held by this instance,
-     *  or "" if empty/unset.
-     */
-    const char * c_str(void) const { return (str_) ? str_ : "";}
-typedef cl::string STRING_CLASS;
-#endif // #elif !defined(__USE_DEV_STRING) 
-#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
-#define VECTOR_CLASS std::vector
-#elif !defined(__USE_DEV_VECTOR) 
-#define VECTOR_CLASS cl::vector 
-#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
-/*! \class vector
- * \brief Fixed sized vector implementation that mirroring 
- *
- *  \note Deprecated. Please use std::vector as default or
- *  re-define the vector class to match the std::vector
- *  interface by defining VECTOR_CLASS
- *  \note Not recommended for use with custom objects as
- *  current implementation will construct N elements
- *
- * std::vector functionality.
- *  \brief Fixed sized vector compatible with std::vector.
- *
- *  \note
- *  This differs from std::vector<> not just in memory allocation,
- *  but also in terms of when members are constructed, destroyed,
- *  and assigned instead of being copy constructed.
- *
- *  \param T type of element contained in the vector.
- *
- *  \param N maximum size of the vector.
- */
-template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
-    T data_[N];
-    unsigned int size_;
-    //! \brief Constructs an empty vector with no memory allocated.
-    vector() :  
-        size_(static_cast<unsigned int>(0))
-    {}
-    //! \brief Deallocates the vector's memory and destroys all of its elements.
-    ~vector() 
-    {
-        clear();
-    }
-    //! \brief Returns the number of elements currently contained.
-    unsigned int size(void) const
-    {
-        return size_;
-    }
-    /*! \brief Empties the vector of all elements.
-     *  \note
-     *  This does not deallocate memory but will invoke destructors
-     *  on contained elements.
-     */
-    void clear()
-    {
-        while(!empty()) {
-            pop_back();
-        }
-    }
-    /*! \brief Appends an element after the last valid element.
-     * The element is copy-constructed in place (placement new) into the
-     * next free slot of data_, then size_ is incremented.
-     *
-     * Calling this on a vector that has reached capacity is documented to
-     * throw an exception if exceptions are enabled.
-     * NOTE(review): in this text the else branch taken at capacity is
-     * empty, so as shown the call is silently ignored - confirm against
-     * the complete header.
-     *
-     *  \param x value to copy into the new last element.
-     */
-    void push_back (const T& x)
-    { 
-        if (size() < N) {    
-            new (&data_[size_]) T(x);
-            size_++;
-        } else {
-        }
-    }
-    /*! \brief Removes the last valid element from the vector.
-     * size_ is decremented first and the destructor of the element that
-     * was last is then invoked explicitly.
-     *
-     * Calling this on an empty vector is documented to throw an exception
-     * if exceptions are enabled.
-     * NOTE(review): in this text the else branch taken when size_ is 0
-     * is empty, so as shown the call is a no-op - confirm against the
-     * complete header.
-     */
-    void pop_back(void)
-    {
-        if (size_ != 0) {
-            --size_;
-            data_[size_].~T();
-        } else {
-        }
-    }
-    /*! \brief Copy constructor.
-     *
-     *  size_ is first set to the source's element count; when that count
-     *  is non-zero the elements are then copied over with assign().
-     *
-     *  \param vec the vector to copy.
-     */
-    vector(const vector<T, N>& vec) :
-        size_(vec.size_)
-    {
-        if (size_ == 0) {
-            return;
-        }
-        assign(vec.begin(), vec.end());
-    }
-    /*! \brief Constructs a vector holding copies of a single value.
-     *
-     *  The vector starts empty and push_back(val) is called once per
-     *  requested element.
-     *
-     *  \param size number of initial elements.
-     *
-     *  \param val value of initial elements.
-     */
-    vector(unsigned int size, const T& val = T()) :
-        size_(0)
-    {
-        unsigned int remaining = size;
-        while (remaining != 0) {
-            push_back(val);
-            --remaining;
-        }
-    }
-    /*! \brief Replaces the contents of this vector with a copy of another.
-     *
-     *  Assigning a vector to itself leaves it untouched. An empty source
-     *  clears this vector; otherwise the source range is copied with
-     *  assign().
-     *
-     *  \param rhs vector to copy.
-     *
-     *  \returns a reference to this.
-     */
-    vector<T, N>& operator=(const vector<T, N>& rhs)
-    {
-        if (this != &rhs) {
-            if (rhs.size_ == 0) {
-                clear();
-            } else {
-                assign(rhs.begin(), rhs.end());
-            }
-        }
-        return *this;
-    }
-    /*! \brief Element-wise equality comparison with another vector.
-     *
-     *  Two vectors are equal when they hold the same number of elements
-     *  and no pair of elements at the same position compares unequal
-     *  with operator!=.
-     *
-     *  \param vec the vector against which to compare.
-     */
-    bool operator==(vector<T,N> &vec)
-    {
-        if (size() != vec.size()) {
-            return false;
-        }
-        unsigned int pos = 0;
-        while (pos < size()) {
-            if (operator[](pos) != vec[pos]) {
-                return false;
-            }
-            ++pos;
-        }
-        return true;
-    }
-    //! \brief Conversion operator to T*; yields a pointer to the start of data_.
-    operator T* ()             { return data_; }
-    //! \brief Conversion operator to const T*; yields a pointer to the start of data_.
-    operator const T* () const { return data_; }
-    //! \brief Returns true when the vector currently holds no elements.
-    bool empty (void) const
-    {
-        return size() == 0;
-    }
-    //! \brief Returns the maximum number of elements this instance can hold (the template parameter N).
-    unsigned int max_size (void) const
-    {
-        return N;
-    }
-    //! \brief Returns the fixed capacity N; always the same value as max_size().
-    unsigned int capacity () const
-    {
-        return N;
-    }
-    /*! \brief Returns a reference to a given element.
-     *
-     *  \param index which element to access.
-     *
-     *  \note
-     *  The caller is responsible for ensuring index is >= 0 and < size().
-     *  No bounds check is performed here.
-     */
-    T& operator[](int index)
-    {
-        return data_[index];
-    }
-    /*! \brief Returns a const reference to a given element.
-     *
-     *  \param index which element to access.
-     *
-     *  \note
-     *  The caller is responsible for ensuring index is >= 0 and < size().
-     *  No bounds check is performed here.
-     */
-    const T& operator[](int index) const
-    {
-        return data_[index];
-    }
-    /*! \brief Replaces the contents with copies of a source iterator range.
-     *
-     *  The vector is cleared first, then each element of the range is
-     *  appended with push_back().
-     *
-     *  \param start Beginning iterator of source range
-     *  \param end End iterator of source range
-     *
-     *  \note
-     *  Will throw an exception if exceptions are enabled and size exceeded.
-     */
-    template<class I>
-    void assign(I start, I end)
-    {
-        clear();
-        for (; start != end; start++) {
-            push_back(*start);
-        }
-    }
-    /*! \class iterator
-     * \brief Const iterator class for vectors
-     *
-     * Stores a pointer to the vector being traversed together with an
-     * integer position. Dereferencing yields a const reference, so
-     * elements cannot be modified through an iterator. No bounds
-     * checking is performed on increment, decrement or dereference.
-     */
-    class iterator
-    {
-    private:
-        const vector<T,N> *vec_;
-        int index_;
-        /**
-         * Internal iterator constructor to capture reference
-         * to the vector it iterates over rather than taking 
-         * the vector by copy.
-         *
-         * For an empty vector the position is forced to -1 regardless
-         * of the requested index, so begin() and end() compare equal.
-         */
-        iterator (const vector<T,N> &vec, int index) :
-            vec_(&vec),
-            index_(vec.empty() ? -1 : index)
-        {
-        }
-    public:
-        //! \brief Constructs a detached iterator (no vector, position -1).
-        // Initializers are listed in member declaration order.
-        iterator(void) : 
-            vec_(NULL),
-            index_(-1)
-        {
-        }
-        //! \brief Copies both the vector pointer and the position.
-        iterator(const iterator& rhs) :
-            vec_(rhs.vec_),
-            index_(rhs.index_)
-        {
-        }
-        ~iterator(void) {}
-        //! \brief Returns an iterator at position 0 of vec.
-        static iterator begin(const cl::vector<T,N> &vec)
-        {
-            iterator i(vec, 0);
-            return i;
-        }
-        //! \brief Returns an iterator one past the last element of vec.
-        static iterator end(const cl::vector<T,N> &vec)
-        {
-            iterator i(vec, vec.size());
-            return i;
-        }
-        //! \brief Equal when both refer to the same vector and position.
-        // Declared const so that const iterators can be compared as well.
-        bool operator==(iterator i) const
-        {
-            return ((vec_ == i.vec_) && 
-                    (index_ == i.index_));
-        }
-        //! \brief Negation of operator==.
-        bool operator!=(iterator i) const
-        {
-            return (!(*this==i));
-        }
-        //! \brief Pre-increment: advances and returns this iterator.
-        iterator& operator++()
-        {
-            ++index_;
-            return *this;
-        }
-        //! \brief Post-increment: advances and returns the previous state.
-        iterator operator++(int)
-        {
-            iterator retVal(*this);
-            ++index_;
-            return retVal;
-        }
-        //! \brief Pre-decrement: steps back and returns this iterator.
-        iterator& operator--()
-        {
-            --index_;
-            return *this;
-        }
-        //! \brief Post-decrement: steps back and returns the previous state.
-        iterator operator--(int)
-        {
-            iterator retVal(*this);
-            --index_;
-            return retVal;
-        }
-        //! \brief Returns a const reference to the element at the current position.
-        const T& operator *() const
-        {
-            return (*vec_)[index_];
-        }
-    };
-    //! \brief Returns an iterator positioned at the first element (delegates to iterator::begin).
-    iterator begin(void)
-    {
-        return iterator::begin(*this);
-    }
-    //! \brief Const overload of begin(); returns the same iterator type.
-    iterator begin(void) const
-    {
-        return iterator::begin(*this);
-    }
-    //! \brief Returns an iterator positioned one past the last element (delegates to iterator::end).
-    iterator end(void)
-    {
-        return iterator::end(*this);
-    }
-    //! \brief Const overload of end(); returns the same iterator type.
-    iterator end(void) const
-    {
-        return iterator::end(*this);
-    }
-    //! \brief Returns a reference to data_[0]; no check is made that the vector is non-empty.
-    T& front(void)
-    {
-        return data_[0];
-    }
-    /*! \brief Returns a reference to the last valid element.
-     *
-     *  \note
-     *  The last element lives at index size_-1, matching the const
-     *  overload. The caller is responsible for ensuring the vector is
-     *  not empty; no check is performed.
-     */
-    T& back(void)
-    {
-        return data_[size_-1];
-    }
-    //! \brief Returns a const reference to data_[0]; no check is made that the vector is non-empty.
-    const T& front(void) const
-    {
-        return data_[0];
-    }
-    //! \brief Returns a const reference to data_[size_-1]; no check is made that the vector is non-empty.
-    const T& back(void) const
-    {
-        return data_[size_-1];
-    }
-#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
-namespace detail {
-    /*
-     * Compare and exchange primitives are needed for handling of defaults.
-     *
-     * The Windows branch uses InterlockedCompareExchange and the fallback
-     * branch uses the __sync_val_compare_and_swap builtin; both store
-     * exchange into *dest only when *dest equals comparand and return the
-     * value *dest held beforehand.
-     *
-     * NOTE(review): the Apple branch calls OSAtomicOr32Orig(exchange, dest)
-     * and never reads comparand, i.e. it performs an atomic OR and returns
-     * the original value rather than a conditional swap. Presumably the
-     * callers use bit-flag states so that this is sufficient - confirm
-     * against the call sites.
-    */
-    inline int compare_exchange(volatile int * dest, int exchange, int comparand)
-    {
-#ifdef _WIN32
-        return (int)(InterlockedCompareExchange(
-           (volatile long*)dest, 
-           (long)exchange, 
-           (long)comparand));
-#elif defined(__APPLE__) || defined(__MACOSX)
-		return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest);
-#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
-        return (__sync_val_compare_and_swap(
-            dest, 
-            comparand, 
-            exchange));
-#endif // !_WIN32
-    }
-    // Memory fence implemented with the _mm_mfence() intrinsic.
-    inline void fence() { _mm_mfence(); }
-}; // namespace detail
-/*! \brief class used to interface between C++ and
- *  OpenCL C calls that require arrays of size_t values, whose
- *  size is known statically.
- */
-template <int N>
-class size_t
-    ::size_t data_[N];
-    //! \brief Default constructor: every one of the N entries of data_ is set to 0.
-    size_t()
-    {
-        int remaining = N;
-        while (remaining > 0) {
-            --remaining;
-            data_[remaining] = 0;
-        }
-    }
-    //! \brief Returns a reference to entry index of data_; no bounds check is performed.
-    ::size_t& operator[](int index)
-    {
-        return data_[index];
-    }
-    //! \brief Returns a const reference to entry index of data_; no bounds check is performed.
-    const ::size_t& operator[](int index) const
-    {
-        return data_[index];
-    }
-    //! \brief Conversion operator to ::size_t*; yields a pointer to the start of data_.
-    operator ::size_t* ()             { return data_; }
-    //! \brief Conversion operator to const ::size_t*; yields a pointer to the start of data_.
-    operator const ::size_t* () const { return data_; }
-namespace detail {
-// Generic getInfoHelper. The final parameter is used to guide overload
-// resolution: the actual parameter passed is an int, which makes this
-// a worse conversion sequence than a specialization that declares the
-// parameter as an int.
-template<typename Functor, typename T>
-inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
-    return f(name, sizeof(T), param, NULL);
-// Specialized getInfoHelper for VECTOR_CLASS params
-template <typename Func, typename T>
-inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
-    ::size_t required;
-    cl_int err = f(name, 0, NULL, &required);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    T* value = (T*) alloca(required);
-    err = f(name, required, value, NULL);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    param->assign(&value[0], &value[required/sizeof(T)]);
-    return CL_SUCCESS;
-/* Specialization for reference-counted types. This depends on the
- * existence of Wrapper<T>::cl_type, and none of the other types having the
- * cl_type member. Note that simply specifying the parameter as Wrapper<T>
- * does not work, because when using a derived type (e.g. Context) the generic
- * template will provide a better match.
- */
-template <typename Func, typename T>
-inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
-    ::size_t required;
-    cl_int err = f(name, 0, NULL, &required);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    typename T::cl_type * value = (typename T::cl_type *) alloca(required);
-    err = f(name, required, value, NULL);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    ::size_t elements = required / sizeof(typename T::cl_type);
-    param->assign(&value[0], &value[elements]);
-    for (::size_t i = 0; i < elements; i++)
-    {
-        if (value[i] != NULL)
-        {
-            err = (*param)[i].retain();
-            if (err != CL_SUCCESS) {
-                return err;
-            }
-        }
-    }
-    return CL_SUCCESS;
-// Specialized for getInfo<CL_PROGRAM_BINARIES>
-template <typename Func>
-inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
-    cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    return CL_SUCCESS;
-// Specialized GetInfoHelper for STRING_CLASS params
-template <typename Func>
-inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
-    ::size_t required;
-    cl_int err = f(name, 0, NULL, &required);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    char* value = (char*) alloca(required);
-    err = f(name, required, value, NULL);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    *param = value;
-    return CL_SUCCESS;
-// Specialized GetInfoHelper for cl::size_t params
-template <typename Func, ::size_t N>
-inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
-    ::size_t required;
-    cl_int err = f(name, 0, NULL, &required);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    ::size_t* value = (::size_t*) alloca(required);
-    err = f(name, required, value, NULL);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    for(int i = 0; i < N; ++i) {
-        (*param)[i] = value[i];
-    }
-    return CL_SUCCESS;
-template<typename T> struct ReferenceHandler;
-/* Specialization for reference-counted types. This depends on the
- * existence of Wrapper<T>::cl_type, and none of the other types having the
- * cl_type member. Note that simply specifying the parameter as Wrapper<T>
- * does not work, because when using a derived type (e.g. Context) the generic
- * template will provide a better match.
- */
-template<typename Func, typename T>
-inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
-    typename T::cl_type value;
-    cl_int err = f(name, sizeof(value), &value, NULL);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-    *param = value;
-    if (value != NULL)
-    {
-        err = param->retain();
-        if (err != CL_SUCCESS) {
-            return err;
-        }
-    }
-    return CL_SUCCESS;
-#define __PARAM_NAME_INFO_1_0(F) \
-    F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
-    F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
-    F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
-    F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
-    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
-    \
-    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
-    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
-    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
-    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
-    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
-    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
-    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
-    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
-    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
-    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
-    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
-    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
-    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
-    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
-    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
-    F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
-    F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
-    F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
-    F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
-    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
-    F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
-    F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \
-    F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \
-    F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \
-    F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \
-    F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \
-    F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \
-    \
-    F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
-    F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \
-    F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \
-    \
-    F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
-    F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
-    F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
-    F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \
-    \
-    F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
-    F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
-    F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
-    F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
-    \
-    F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
-    F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
-    F(cl_mem_info, CL_MEM_SIZE, ::size_t) \
-    F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
-    F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
-    F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
-    F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
-    \
-    F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
-    F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \
-    F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \
-    F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \
-    F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \
-    F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \
-    F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \
-    \
-    /* Sampler queries: value types follow the clGetSamplerInfo table. */ \
-    F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
-    F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
-    F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \
-    F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \
-    F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \
-    \
-    F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
-    F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
-    F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
-    F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<Device>) \
-    F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
-    F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
-    F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
-    \
-    F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
-    F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \
-    F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \
-    \
-    F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
-    F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
-    F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
-    F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
-    \
-    F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \
-    F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \
-    F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
-    \
-    F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
-    F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
-    F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
-    F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
-#if defined(CL_VERSION_1_1)
-#define __PARAM_NAME_INFO_1_1(F) \
-    F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
-    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
-    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
-    F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
-    F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
-    F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
-    \
-    F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
-    F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
-    \
-    F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \
-    F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
-    \
-    F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
-#endif // CL_VERSION_1_1
-#if defined(CL_VERSION_1_2)
-#define __PARAM_NAME_INFO_1_2(F) \
-    F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \
-    \
-    F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \
-    F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \
-    \
-    F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \
-    \
-    F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \
-    \
-    F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \
-    F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \
-    F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \
-    F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \
-    \
-    F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \
-    F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS<cl_device_partition_property>) \
-    F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS<cl_device_partition_property>)  \
-    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
-    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \
-    F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
-#endif // #if defined(CL_VERSION_1_2)
-    F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
-    F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
-    F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
-    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
-    F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
-template <typename enum_type, cl_int Name>
-struct param_traits {};
-#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \
-struct token;                                        \
-template<>                                           \
-struct param_traits<detail:: token,param_name>       \
-{                                                    \
-    enum { value = param_name };                     \
-    typedef T param_type;                            \
-#if defined(CL_VERSION_1_1)
-#endif // CL_VERSION_1_1
-#if defined(CL_VERSION_1_2)
-#endif // CL_VERSION_1_2
-// Convenience functions
-template <typename Func, typename T>
-inline cl_int
-getInfo(Func f, cl_uint name, T* param)
-    return getInfoHelper(f, name, param, 0);
-template <typename Func, typename Arg0>
-struct GetInfoFunctor0
-    Func f_; const Arg0& arg0_;
-    cl_int operator ()(
-        cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
-    { return f_(arg0_, param, size, value, size_ret); }
-template <typename Func, typename Arg0, typename Arg1>
-struct GetInfoFunctor1
-    Func f_; const Arg0& arg0_; const Arg1& arg1_;
-    cl_int operator ()(
-        cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
-    { return f_(arg0_, arg1_, param, size, value, size_ret); }
-template <typename Func, typename Arg0, typename T>
-inline cl_int
-getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
-    GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
-    return getInfoHelper(f0, name, param, 0);
-template <typename Func, typename Arg0, typename Arg1, typename T>
-inline cl_int
-getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
-    GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
-    return getInfoHelper(f0, name, param, 0);
-template<typename T>
-struct ReferenceHandler
-{ };
-#if defined(CL_VERSION_1_2)
- * OpenCL 1.2 devices do have retain/release.
- */
-template <>
-struct ReferenceHandler<cl_device_id>
-    /**
-     * Retain the device.
-     * \param device A valid device created using createSubDevices
-     * \return 
-     *   CL_SUCCESS if the function executed successfully.
-     *   CL_INVALID_DEVICE if device was not a valid subdevice
-     */
-    static cl_int retain(cl_device_id device)
-    { return ::clRetainDevice(device); }
-    /**
-     * Release the device.
-     * \param device A valid device created using createSubDevices
-     * \return 
-     *   CL_SUCCESS if the function executed successfully.
-     *   CL_INVALID_DEVICE if device was not a valid subdevice
-     */
-    static cl_int release(cl_device_id device)
-    { return ::clReleaseDevice(device); }
-#else // #if defined(CL_VERSION_1_2)
- * OpenCL 1.1 devices do not have retain/release.
- */
-template <>
-struct ReferenceHandler<cl_device_id>
-    // cl_device_id does not have retain().
-    static cl_int retain(cl_device_id)
-    { return CL_SUCCESS; }
-    // cl_device_id does not have release().
-    static cl_int release(cl_device_id)
-    { return CL_SUCCESS; }
-#endif // #if defined(CL_VERSION_1_2)
-template <>
-struct ReferenceHandler<cl_platform_id>
-    // cl_platform_id does not have retain().
-    static cl_int retain(cl_platform_id)
-    { return CL_SUCCESS; }
-    // cl_platform_id does not have release().
-    static cl_int release(cl_platform_id)
-    { return CL_SUCCESS; }
-template <>
-struct ReferenceHandler<cl_context>
-    static cl_int retain(cl_context context)
-    { return ::clRetainContext(context); }
-    static cl_int release(cl_context context)
-    { return ::clReleaseContext(context); }
-template <>
-struct ReferenceHandler<cl_command_queue>
-    static cl_int retain(cl_command_queue queue)
-    { return ::clRetainCommandQueue(queue); }
-    static cl_int release(cl_command_queue queue)
-    { return ::clReleaseCommandQueue(queue); }
-template <>
-struct ReferenceHandler<cl_mem>
-    static cl_int retain(cl_mem memory)
-    { return ::clRetainMemObject(memory); }
-    static cl_int release(cl_mem memory)
-    { return ::clReleaseMemObject(memory); }
-template <>
-struct ReferenceHandler<cl_sampler>
-    static cl_int retain(cl_sampler sampler)
-    { return ::clRetainSampler(sampler); }
-    static cl_int release(cl_sampler sampler)
-    { return ::clReleaseSampler(sampler); }
-template <>
-struct ReferenceHandler<cl_program>
-    static cl_int retain(cl_program program)
-    { return ::clRetainProgram(program); }
-    static cl_int release(cl_program program)
-    { return ::clReleaseProgram(program); }
-template <>
-struct ReferenceHandler<cl_kernel>
-    static cl_int retain(cl_kernel kernel)
-    { return ::clRetainKernel(kernel); }
-    static cl_int release(cl_kernel kernel)
-    { return ::clReleaseKernel(kernel); }
-template <>
-struct ReferenceHandler<cl_event>
-    static cl_int retain(cl_event event)
-    { return ::clRetainEvent(event); }
-    static cl_int release(cl_event event)
-    { return ::clReleaseEvent(event); }
-// Extracts version number with major in the upper 16 bits, minor in the lower 16
-static cl_uint getVersion(const char *versionInfo)
-    int highVersion = 0;
-    int lowVersion = 0;
-    int index = 7;
-    while(versionInfo[index] != '.' ) {
-        highVersion *= 10;
-        highVersion += versionInfo[index]-'0';
-        ++index;
-    }
-    ++index;
-    while(versionInfo[index] != ' ' ) {
-        lowVersion *= 10;
-        lowVersion += versionInfo[index]-'0';
-        ++index;
-    }
-    return (highVersion << 16) | lowVersion;
-static cl_uint getPlatformVersion(cl_platform_id platform)
-    ::size_t size = 0;
-    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
-    char *versionInfo = (char *) alloca(size);
-    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size);
-    return getVersion(versionInfo);
-static cl_uint getDevicePlatformVersion(cl_device_id device)
-    cl_platform_id platform;
-    clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
-    return getPlatformVersion(platform);
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-static cl_uint getContextPlatformVersion(cl_context context)
-    // The platform cannot be queried directly, so we first have to grab a
-    // device and obtain its platform
-    ::size_t size = 0;
-    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
-    if (size == 0)
-        return 0;
-    cl_device_id *devices = (cl_device_id *) alloca(size);
-    clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL);
-    return getDevicePlatformVersion(devices[0]);
-#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-template <typename T>
-class Wrapper
-    typedef T cl_type;
-    cl_type object_;
-    //! \brief Default constructor - the wrapped handle starts out as NULL.
-    Wrapper() : object_(NULL) { }
-    //! \brief Wraps an existing handle; the handle is stored as-is and retain() is not called here.
-    Wrapper(const cl_type &obj) : object_(obj) { }
-    //! \brief Releases the wrapped handle if it is non-NULL; the result of release() is discarded.
-    ~Wrapper()
-    {
-        if (object_ != NULL) { release(); }
-    }
-    //! \brief Copy constructor - shares the handle of rhs and retains it when non-NULL.
-    Wrapper(const Wrapper<cl_type>& rhs) : object_(rhs.object_)
-    {
-        if (object_ == NULL) {
-            return;
-        }
-        detail::errHandler(retain(), __RETAIN_ERR);
-    }
-    /*! \brief Copy assignment - releases the current handle, then shares
-     *  and retains the handle held by rhs.
-     *
-     *  \param rhs wrapper whose handle is to be shared.
-     *
-     *  \returns a reference to this.
-     */
-    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
-    {
-        // Guard against self-assignment: releasing first could drop the
-        // last reference, and the following retain() would then act on
-        // a handle that has already been destroyed.
-        if (this != &rhs) {
-            if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
-            object_ = rhs.object_;
-            if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
-        }
-        return *this;
-    }
-    /*! \brief Assignment from a raw handle.
-     *
-     *  The handle currently held (if any) is released, then rhs is stored
-     *  as-is; retain() is not called on the new handle here.
-     */
-    Wrapper<cl_type>& operator = (const cl_type &rhs)
-    {
-        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
-        object_ = rhs;
-        return *this;
-    }
-    //! \brief Returns a copy of the wrapped handle.
-    cl_type operator ()() const { return object_; }
-    //! \brief Returns a reference to the wrapped handle, allowing it to be overwritten in place.
-    cl_type& operator ()() { return object_; }
-    template<typename Func, typename U>
-    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
-    //! \brief Forwards to ReferenceHandler<cl_type>::retain for the wrapped handle and returns its result.
-    cl_int retain() const
-    {
-        return ReferenceHandler<cl_type>::retain(object_);
-    }
-    //! \brief Forwards to ReferenceHandler<cl_type>::release for the wrapped handle and returns its result.
-    cl_int release() const
-    {
-        return ReferenceHandler<cl_type>::release(object_);
-    }
-template <>
-class Wrapper<cl_device_id>
-    typedef cl_device_id cl_type;
-    cl_type object_;
-    bool referenceCountable_;
-    /*! \brief Decides whether retain/release should be forwarded for a device.
-     *
-     *  A NULL device is never reference countable. Otherwise the version
-     *  reported by getDevicePlatformVersion() must be greater than
-     *  (1 << 16) + 1, i.e. newer than 1.1 in the major/minor encoding.
-     */
-    static bool isReferenceCountable(cl_device_id device)
-    {
-        if (device == NULL) {
-            return false;
-        }
-        int version = getDevicePlatformVersion(device);
-        return version > ((1 << 16) + 1);
-    }
-    //! \brief Default constructor - NULL handle that is not reference countable.
-    Wrapper() :
-        object_(NULL),
-        referenceCountable_(false)
-    {
-    }
-    //! \brief Wraps an existing device handle and records whether it is reference countable.
-    Wrapper(const cl_type &obj) :
-        object_(obj),
-        referenceCountable_(isReferenceCountable(obj))
-    {
-    }
-    //! \brief Calls release() when the handle is non-NULL; the result of release() is discarded.
-    ~Wrapper()
-    {
-        if (object_ != NULL) { release(); }
-    }
-    /*! \brief Copy constructor - shares the device handle of rhs.
-     *
-     *  The reference-countable flag is recomputed from the handle rather
-     *  than copied from rhs; a non-NULL handle is then retained.
-     */
-    Wrapper(const Wrapper<cl_type>& rhs) :
-        object_(rhs.object_),
-        referenceCountable_(isReferenceCountable(rhs.object_))
-    {
-        if (object_ == NULL) {
-            return;
-        }
-        detail::errHandler(retain(), __RETAIN_ERR);
-    }
-    /*! \brief Copy assignment - releases the current device handle, then
-     *  shares the handle and reference-countable flag of rhs and retains
-     *  the new handle.
-     *
-     *  \param rhs wrapper whose handle is to be shared.
-     *
-     *  \returns a reference to this.
-     */
-    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
-    {
-        // Guard against self-assignment: releasing first could drop the
-        // last reference, and the following retain() would then act on
-        // a handle that has already been destroyed.
-        if (this != &rhs) {
-            if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
-            object_ = rhs.object_;
-            referenceCountable_ = rhs.referenceCountable_;
-            if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
-        }
-        return *this;
-    }
-    /*! \brief Assignment from a raw device handle.
-     *
-     *  The handle currently held (if any) is released using the old
-     *  reference-countable flag; rhs is then stored as-is and the flag
-     *  is recomputed for it. retain() is not called on the new handle.
-     */
-    Wrapper<cl_type>& operator = (const cl_type &rhs)
-    {
-        if (object_ != NULL) {
-            detail::errHandler(release(), __RELEASE_ERR);
-        }
-        object_ = rhs;
-        referenceCountable_ = isReferenceCountable(rhs);
-        return *this;
-    }
-    //! \brief Returns a copy of the wrapped device handle.
-    cl_type operator ()() const { return object_; }
-    //! \brief Returns a reference to the wrapped device handle; referenceCountable_ is not updated if it is overwritten.
-    cl_type& operator ()() { return object_; }
-    template<typename Func, typename U>
-    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
-    template<typename Func, typename U>
-    friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS<U>*, int, typename U::cl_type);
-    //! \brief Retains object_ when it is reference-countable; otherwise reports CL_SUCCESS.
-    cl_int retain() const
-    {
-        if (!referenceCountable_) {
-            return CL_SUCCESS;
-        }
-        return ReferenceHandler<cl_type>::retain(object_);
-    }
-    //! \brief Releases object_ when it is reference-countable; otherwise reports CL_SUCCESS.
-    cl_int release() const
-    {
-        if (!referenceCountable_) {
-            return CL_SUCCESS;
-        }
-        return ReferenceHandler<cl_type>::release(object_);
-    }
-} // namespace detail
-//! \endcond
-/*! \struct ImageFormat
- *  \brief Adds constructors and member functions for cl_image_format.
- *
- *  \see cl_image_format
- */
-struct ImageFormat : public cl_image_format
-    //! \brief Default constructor - performs no initialization.
-    ImageFormat(){}
-    //! \brief Initializing constructor.
-    ImageFormat(cl_channel_order order, cl_channel_type type)
-    {
-        image_channel_order = order;
-        image_channel_data_type = type;
-    }
-    //! \brief Copy assignment - copies the channel order and channel data type.
-    ImageFormat& operator = (const ImageFormat& rhs)
-    {
-        if (this == &rhs) {
-            return *this;
-        }
-        image_channel_data_type = rhs.image_channel_data_type;
-        image_channel_order     = rhs.image_channel_order;
-        return *this;
-    }
-/*! \brief Class interface for cl_device_id.
- *
- *  \note Copies of these objects are inexpensive, since they don't 'own'
- *        any underlying resources or data structures.
- *
- *  \see cl_device_id
- */
-class Device : public detail::Wrapper<cl_device_id>
-    //! \brief Default constructor - initializes to NULL.
-    Device() : detail::Wrapper<cl_type>() { }
-    /*! \brief Copy constructor.
-     * 
-     *  This simply copies the device ID value, which is an inexpensive operation.
-     */
-    Device(const Device& device) : detail::Wrapper<cl_type>(device) { }
-    /*! \brief Constructor from cl_device_id.
-     * 
-     *  This simply copies the device ID value, which is an inexpensive operation.
-     */
-    Device(const cl_device_id &device) : detail::Wrapper<cl_type>(device) { }
-    /*! \brief Returns the first device on the default context.
-     *
-     *  \see Context::getDefault()
-     */
-    static Device getDefault(cl_int * err = NULL);
-    /*! \brief Assignment operator from Device.
-     * 
-     *  This simply copies the device ID value, which is an inexpensive operation.
-     */
-    Device& operator = (const Device& rhs)
-    {
-        if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment operator from cl_device_id.
-     * 
-     *  This simply copies the device ID value, which is an inexpensive operation.
-     */
-    Device& operator = (const cl_device_id& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetDeviceInfo().
-    template <typename T>
-    cl_int getInfo(cl_device_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(&::clGetDeviceInfo, object_, name, param),
-            __GET_DEVICE_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetDeviceInfo() that returns by value.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_device_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_device_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-    /**
-     * CL 1.2 version
-     */
-#if defined(CL_VERSION_1_2)
-    /*! \brief Wrapper for clCreateSubDevices().
-     *
-     *  Queries how many sub-devices \a properties produces, creates them and
-     *  stores them in \a devices.
-     *
-     *  \param properties partition description forwarded to clCreateSubDevices().
-     *  \param devices    receives the created sub-devices; must not be NULL.
-     *
-     *  \return CL_SUCCESS, or the error passed through detail::errHandler()
-     *          (CL_INVALID_ARG_VALUE when \a devices is NULL).
-     */
-    cl_int createSubDevices(
-        const cl_device_partition_property * properties,
-        VECTOR_CLASS<Device>* devices)
-    {
-        // Reject a NULL output vector up front, as Platform::getDevices() does,
-        // instead of dereferencing it after the sub-devices were created.
-        if (devices == NULL) {
-            return detail::errHandler(CL_INVALID_ARG_VALUE, __CREATE_SUB_DEVICES);
-        }
-        cl_uint n = 0;
-        cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_SUB_DEVICES);
-        }
-        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
-        err = clCreateSubDevices(object_, properties, n, ids, NULL);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_SUB_DEVICES);
-        }
-        devices->assign(&ids[0], &ids[n]);
-        return CL_SUCCESS;
-    }
-#endif // #if defined(CL_VERSION_1_2)
-/**
- * CL 1.1 version that uses device fission.
- */
-#if defined(CL_VERSION_1_1)
-    /*! \brief Wrapper for the clCreateSubDevicesEXT() extension entry point.
-     *
-     *  \param properties partition description forwarded to the extension call.
-     *  \param devices    receives the created sub-devices; must not be NULL.
-     *
-     *  \return CL_SUCCESS, or the error passed through detail::errHandler()
-     *          (CL_INVALID_ARG_VALUE when \a devices is NULL).
-     */
-    cl_int createSubDevices(
-        const cl_device_partition_property_ext * properties,
-        VECTOR_CLASS<Device>* devices)
-    {
-        // Reject a NULL output vector up front, as Platform::getDevices() does,
-        // instead of dereferencing it after the sub-devices were created.
-        if (devices == NULL) {
-            return detail::errHandler(CL_INVALID_ARG_VALUE, __CREATE_SUB_DEVICES);
-        }
-        typedef CL_API_ENTRY cl_int 
-            ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
-                cl_device_id /*in_device*/,
-                const cl_device_partition_property_ext * /* properties */,
-                cl_uint /*num_entries*/,
-                cl_device_id * /*out_devices*/,
-                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
-        static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
-        __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
-        // First call only asks for the count, second call fills the array.
-        cl_uint n = 0;
-        cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_SUB_DEVICES);
-        }
-        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
-        err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_SUB_DEVICES);
-        }
-        devices->assign(&ids[0], &ids[n]);
-        return CL_SUCCESS;
-    }
-#endif // #if defined(USE_CL_DEVICE_FISSION)
-#endif // #if defined(CL_VERSION_1_1)
-/*! \brief Class interface for cl_platform_id.
- *
- *  \note Copies of these objects are inexpensive, since they don't 'own'
- *        any underlying resources or data structures.
- *
- *  \see cl_platform_id
- */
-class Platform : public detail::Wrapper<cl_platform_id>
-    //! \brief Default constructor - initializes to NULL.
-    Platform() : detail::Wrapper<cl_type>()  { }
-    /*! \brief Copy constructor.
-     * 
-     *  This simply copies the platform ID value, which is an inexpensive operation.
-     */
-    Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
-    /*! \brief Constructor from cl_platform_id.
-     * 
-     *  This simply copies the platform ID value, which is an inexpensive operation.
-     */
-    Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
-    /*! \brief Assignment operator from Platform.
-     * 
-     *  This simply copies the platform ID value, which is an inexpensive operation.
-     */
-    Platform& operator = (const Platform& rhs)
-    {
-        if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment operator from cl_platform_id.
-     * 
-     *  This simply copies the platform ID value, which is an inexpensive operation.
-     */
-    Platform& operator = (const cl_platform_id& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetPlatformInfo().
-    cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(&::clGetPlatformInfo, object_, name, param),
-            __GET_PLATFORM_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetPlatformInfo() that returns by value.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_platform_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_platform_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-    /*! \brief Lists this platform's devices of the requested type.
-     *
-     *  Wraps clGetDeviceIDs(): one call to obtain the count, a second call to
-     *  fetch the IDs, which then replace the contents of \a devices.
-     *
-     *  \return CL_SUCCESS, or the error passed through detail::errHandler()
-     *          (CL_INVALID_ARG_VALUE when \a devices is NULL).
-     */
-    cl_int getDevices(
-        cl_device_type type,
-        VECTOR_CLASS<Device>* devices) const
-    {
-        if (devices == NULL) {
-            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
-        }
-        cl_uint deviceCount = 0;
-        cl_int status = ::clGetDeviceIDs(object_, type, 0, NULL, &deviceCount);
-        if (status != CL_SUCCESS) {
-            return detail::errHandler(status, __GET_DEVICE_IDS_ERR);
-        }
-        cl_device_id* deviceIds =
-            (cl_device_id*) alloca(deviceCount * sizeof(cl_device_id));
-        status = ::clGetDeviceIDs(object_, type, deviceCount, deviceIds, NULL);
-        if (status != CL_SUCCESS) {
-            return detail::errHandler(status, __GET_DEVICE_IDS_ERR);
-        }
-        devices->assign(deviceIds, deviceIds + deviceCount);
-        return CL_SUCCESS;
-    }
-#if defined(USE_DX_INTEROP)
-   /*! \brief Get the list of available D3D10 devices.
-     *
-     *  \param d3d_device_source.
-     *
-     *  \param d3d_object.
-     *
-     *  \param d3d_device_set.
-     *
-     *  \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
-     *  values returned in devices can be used to identify a specific OpenCL
-     *  device. If \a devices argument is NULL, this argument is ignored.
-     *
-     *  \return One of the following values:
-     *    - CL_SUCCESS if the function is executed successfully.
-     *
-     *  The application can query specific capabilities of the OpenCL device(s)
-     *  returned by cl::getDevices. This can be used by the application to
-     *  determine which device(s) to use.
-     *
-     * \note In the case that exceptions are enabled and a return value
-     * other than CL_SUCCESS is generated, then cl::Error exception is
-     * generated.
-     */
-    cl_int getDevices(
-        cl_d3d10_device_source_khr d3d_device_source,
-        void *                     d3d_object,
-        cl_d3d10_device_set_khr    d3d_device_set,
-        VECTOR_CLASS<Device>* devices) const
-    {
-        typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
-            cl_platform_id platform, 
-            cl_d3d10_device_source_khr d3d_device_source, 
-            void * d3d_object,
-            cl_d3d10_device_set_khr d3d_device_set,
-            cl_uint num_entries,
-            cl_device_id * devices,
-            cl_uint* num_devices);
-        if( devices == NULL ) {
-            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
-        }
-        static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
-        __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR);
-        cl_uint n = 0;
-        cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
-            object_, 
-            d3d_device_source, 
-            d3d_object,
-            d3d_device_set, 
-            0, 
-            NULL, 
-            &n);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
-        }
-        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
-        err = pfn_clGetDeviceIDsFromD3D10KHR(
-            object_, 
-            d3d_device_source, 
-            d3d_object,
-            d3d_device_set,
-            n, 
-            ids, 
-            NULL);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
-        }
-        devices->assign(&ids[0], &ids[n]);
-        return CL_SUCCESS;
-    }
-    /*! \brief Lists every available platform.
-     *
-     *  Wraps clGetPlatformIDs(): one call to obtain the count, a second call
-     *  to fetch the IDs, which then replace the contents of \a platforms.
-     *
-     *  \return CL_SUCCESS, or the error passed through detail::errHandler()
-     *          (CL_INVALID_ARG_VALUE when \a platforms is NULL).
-     */
-    static cl_int get(
-        VECTOR_CLASS<Platform>* platforms)
-    {
-        if (platforms == NULL) {
-            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
-        }
-        cl_uint platformCount = 0;
-        cl_int status = ::clGetPlatformIDs(0, NULL, &platformCount);
-        if (status != CL_SUCCESS) {
-            return detail::errHandler(status, __GET_PLATFORM_IDS_ERR);
-        }
-        cl_platform_id* platformIds = (cl_platform_id*) alloca(
-            platformCount * sizeof(cl_platform_id));
-        status = ::clGetPlatformIDs(platformCount, platformIds, NULL);
-        if (status != CL_SUCCESS) {
-            return detail::errHandler(status, __GET_PLATFORM_IDS_ERR);
-        }
-        platforms->assign(platformIds, platformIds + platformCount);
-        return CL_SUCCESS;
-    }
-    /*! \brief Stores the first available platform in \a platform.
-     *
-     *  Wraps clGetPlatformIDs() and assigns the first ID it reports.
-     *
-     *  \return CL_SUCCESS, or the error passed through detail::errHandler()
-     *          (CL_INVALID_ARG_VALUE when \a platform is NULL).
-     */
-    static cl_int get(
-        Platform * platform)
-    {
-        if (platform == NULL) {
-            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
-        }
-        cl_uint platformCount = 0;
-        cl_int status = ::clGetPlatformIDs(0, NULL, &platformCount);
-        if (status != CL_SUCCESS) {
-            return detail::errHandler(status, __GET_PLATFORM_IDS_ERR);
-        }
-        cl_platform_id* platformIds = (cl_platform_id*) alloca(
-            platformCount * sizeof(cl_platform_id));
-        status = ::clGetPlatformIDs(platformCount, platformIds, NULL);
-        if (status != CL_SUCCESS) {
-            return detail::errHandler(status, __GET_PLATFORM_IDS_ERR);
-        }
-        *platform = platformIds[0];
-        return CL_SUCCESS;
-    }
-    /*! \brief Gets the first available platform, returning it by value.
-     *
-     *  Wraps clGetPlatformIDs(), returning the first result.
-     *
-     *  \param errResult if not NULL, receives the status of the last
-     *         clGetPlatformIDs() call that was made.
-     *
-     *  \return the first platform reported, or a default-constructed (NULL)
-     *          Platform when either clGetPlatformIDs() call fails.
-     */
-    static Platform get(
-        cl_int * errResult = NULL)
-    {
-        cl_uint n = 0;
-        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
-        if (err != CL_SUCCESS) {
-            detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
-            if (errResult != NULL) {
-                *errResult = err;
-            }
-            // Nothing was enumerated: do not fall through and read ids[0].
-            return Platform();
-        }
-        cl_platform_id* ids = (cl_platform_id*) alloca(
-            n * sizeof(cl_platform_id));
-        err = ::clGetPlatformIDs(n, ids, NULL);
-        if (err != CL_SUCCESS) {
-            detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
-            if (errResult != NULL) {
-                *errResult = err;
-            }
-            // ids[] was not filled in, so ids[0] would be uninitialized memory.
-            return Platform();
-        }
-        if (errResult != NULL) {
-            *errResult = err;
-        }
-        return Platform(ids[0]);
-    }
-    static Platform getDefault( 
-        cl_int *errResult = NULL )
-    {
-        return get(errResult);
-    }
-#if defined(CL_VERSION_1_2)
-    //! \brief Wrapper for clUnloadPlatformCompiler().
-    cl_int
-    unloadCompiler()
-    {
-        return ::clUnloadPlatformCompiler(object_);
-    }
-#endif // #if defined(CL_VERSION_1_2)
-}; // class Platform
-/**
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
-/**
- * Unload the OpenCL compiler.
- * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
- */
-inline cl_int
-    return ::clUnloadCompiler();
-#endif // #if defined(CL_VERSION_1_1)
-/*! \brief Class interface for cl_context.
- *
- *  \note Copies of these objects are shallow, meaning that the copy will refer
- *        to the same underlying cl_context as the original.  For details, see
- *        clRetainContext() and clReleaseContext().
- *
- *  \see cl_context
- */
-class Context 
-    : public detail::Wrapper<cl_context>
-    static volatile int default_initialized_;
-    static Context default_;
-    static volatile cl_int default_error_;
-    /*! \brief Destructor.
-     *
-     *  This calls clReleaseContext() on the value held by this instance.
-     */
-    ~Context() { }
-    /*! \brief Builds a context over an explicit list of devices.
-     *
-     *  Wraps clCreateContext().  The status is passed to detail::errHandler()
-     *  and, when \a err is not NULL, also stored in \a *err.
-     */
-    Context(
-        const VECTOR_CLASS<Device>& devices,
-        cl_context_properties* properties = NULL,
-        void (CL_CALLBACK * notifyFptr)(
-            const char *,
-            const void *,
-            ::size_t,
-            void *) = NULL,
-        void* data = NULL,
-        cl_int* err = NULL)
-    {
-        const ::size_t deviceCount = devices.size();
-        cl_device_id* rawDevices =
-            (cl_device_id*) alloca(deviceCount * sizeof(cl_device_id));
-        // Unwrap each Device into the plain handle array clCreateContext() takes.
-        for (::size_t i = 0; i != deviceCount; ++i) {
-            rawDevices[i] = devices[i]();
-        }
-        cl_int status;
-        object_ = ::clCreateContext(
-            properties,
-            (cl_uint) deviceCount,
-            rawDevices,
-            notifyFptr,
-            data,
-            &status);
-        detail::errHandler(status, __CREATE_CONTEXT_ERR);
-        if (err != NULL) {
-            *err = status;
-        }
-    }
-    Context(
-        const Device& device,
-        cl_context_properties* properties = NULL,
-        void (CL_CALLBACK * notifyFptr)(
-            const char *,
-            const void *,
-            ::size_t,
-            void *) = NULL,
-        void* data = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        cl_device_id deviceID = device();
-        object_ = ::clCreateContext(
-            properties, 1,
-            &deviceID,
-            notifyFptr, data, &error);
-        detail::errHandler(error, __CREATE_CONTEXT_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    /*! \brief Constructs a context including all or a subset of devices of a specified type.
-     *
-     *  Wraps clCreateContextFromType().
-     */
-    Context(
-        cl_device_type type,
-        cl_context_properties* properties = NULL,
-        void (CL_CALLBACK * notifyFptr)(
-            const char *,
-            const void *,
-            ::size_t,
-            void *) = NULL,
-        void* data = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-#if !defined(__APPLE__) || !defined(__MACOS)
-        cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };
-        if (properties == NULL) {
-            // Get a valid platform ID as we cannot send in a blank one
-            VECTOR_CLASS<Platform> platforms;
-            error = Platform::get(&platforms);
-            if (error != CL_SUCCESS) {
-                detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
-                if (err != NULL) {
-                    *err = error;
-                }
-                return;
-            }
-            // Check the platforms we found for a device of our specified type
-            cl_context_properties platform_id = 0;
-            for (unsigned int i = 0; i < platforms.size(); i++) {
-                VECTOR_CLASS<Device> devices;
-#if defined(__CL_ENABLE_EXCEPTIONS)
-                try {
-                    error = platforms[i].getDevices(type, &devices);
-#if defined(__CL_ENABLE_EXCEPTIONS)
-                } catch (Error) {}
-    // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type
-    // We do error checking next anyway, and can throw there if needed
-                // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND
-                if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) {
-                    detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
-                    if (err != NULL) {
-                        *err = error;
-                    }
-                }
-                if (devices.size() > 0) {
-                    platform_id = (cl_context_properties)platforms[i]();
-                    break;
-                }
-            }
-            if (platform_id == 0) {
-                detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR);
-                if (err != NULL) {
-                    *err = CL_DEVICE_NOT_FOUND;
-                }
-                return;
-            }
-            prop[1] = platform_id;
-            properties = &prop[0];
-        }
-        object_ = ::clCreateContextFromType(
-            properties, type, notifyFptr, data, &error);
-        detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
-     *
-     *  \note All calls to this function return the same cl_context as the first.
-     */
-    static Context getDefault(cl_int * err = NULL) 
-    {
-        int state = detail::compare_exchange(
-            &default_initialized_, 
-        if (state & __DEFAULT_INITIALIZED) {
-            if (err != NULL) {
-                *err = default_error_;
-            }
-            return default_;
-        }
-        if (state & __DEFAULT_BEING_INITIALIZED) {
-              // Assume writes will propagate eventually...
-              while(default_initialized_ != __DEFAULT_INITIALIZED) {
-                  detail::fence();
-              }
-            if (err != NULL) {
-                *err = default_error_;
-            }
-            return default_;
-        }
-        cl_int error;
-        default_ = Context(
-            NULL,
-            NULL,
-            NULL,
-            &error);
-        detail::fence();
-        default_error_ = error;
-        // Assume writes will propagate eventually...
-        default_initialized_ = __DEFAULT_INITIALIZED;
-        detail::fence();
-        if (err != NULL) {
-            *err = default_error_;
-        }
-        return default_;
-    }
-    //! \brief Default constructor - initializes to NULL.
-    Context() : detail::Wrapper<cl_type>() { }
-    /*! \brief Copy constructor.
-     * 
-     *  This calls clRetainContext() on the parameter's cl_context.
-     */
-    Context(const Context& context) : detail::Wrapper<cl_type>(context) { }
-    /*! \brief Constructor from cl_context - takes ownership.
-     * 
-     *  This effectively transfers ownership of a refcount on the cl_context
-     *  into the new Context object.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { }
-    /*! \brief Assignment operator from Context.
-     * 
-     *  This calls clRetainContext() on the parameter and clReleaseContext() on
-     *  the previous value held by this instance.
-     */
-    Context& operator = (const Context& rhs)
-    {
-        if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment operator from cl_context - takes ownership.
-     * 
-     *  This effectively transfers ownership of a refcount on the rhs and calls
-     *  clReleaseContext() on the value previously held by this instance.
-     */
-    Context& operator = (const cl_context& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetContextInfo().
-    template <typename T>
-    cl_int getInfo(cl_context_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(&::clGetContextInfo, object_, name, param),
-            __GET_CONTEXT_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetContextInfo() that returns by value.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_context_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_context_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-    /*! \brief Gets a list of supported image formats.
-     *  
-     *  Wraps clGetSupportedImageFormats(): one call to obtain the count, a
-     *  second call to fetch the formats, which then replace the contents of
-     *  \a formats.
-     *
-     *  \param flags   memory flags forwarded to clGetSupportedImageFormats().
-     *  \param type    image object type forwarded to clGetSupportedImageFormats().
-     *  \param formats receives the supported formats; must not be NULL.  It is
-     *                 cleared when no format is reported.
-     *
-     *  \return CL_SUCCESS, or the error passed through detail::errHandler()
-     *          (CL_INVALID_ARG_VALUE when \a formats is NULL).
-     */
-    cl_int getSupportedImageFormats(
-        cl_mem_flags flags,
-        cl_mem_object_type type,
-        VECTOR_CLASS<ImageFormat>* formats) const
-    {
-        // Reject a NULL output vector instead of dereferencing it below.
-        if (formats == NULL) {
-            return detail::errHandler(
-                CL_INVALID_ARG_VALUE, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
-        }
-        cl_uint numEntries = 0;
-        cl_int err = ::clGetSupportedImageFormats(
-           object_, 
-           flags,
-           type, 
-           0, 
-           NULL, 
-           &numEntries);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
-        }
-        // Asking for zero entries with a non-NULL buffer is an invalid call,
-        // so an empty result is reported directly rather than via a second
-        // query that would fail.
-        if (numEntries == 0) {
-            formats->clear();
-            return CL_SUCCESS;
-        }
-        ImageFormat* value = (ImageFormat*)
-            alloca(numEntries * sizeof(ImageFormat));
-        err = ::clGetSupportedImageFormats(
-            object_, 
-            flags, 
-            type, 
-            numEntries,
-            (cl_image_format*) value, 
-            NULL);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
-        }
-        formats->assign(&value[0], &value[numEntries]);
-        return CL_SUCCESS;
-    }
-inline Device Device::getDefault(cl_int * err)
-    cl_int error;
-    Device device;
-    Context context = Context::getDefault(&error);
-    detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-    if (error != CL_SUCCESS) {
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    else {
-        device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
-        if (err != NULL) {
-            *err = CL_SUCCESS;
-        }
-    }
-    return device;
-#ifdef _WIN32
-__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-__declspec(selectany) Context Context::default_;
-__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS;
-__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-__attribute__((weak)) Context Context::default_;
-__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS;
-/*! \brief Class interface for cl_event.
- *
- *  \note Copies of these objects are shallow, meaning that the copy will refer
- *        to the same underlying cl_event as the original.  For details, see
- *        clRetainEvent() and clReleaseEvent().
- *
- *  \see cl_event
- */
-class Event : public detail::Wrapper<cl_event>
-    /*! \brief Destructor.
-     *
-     *  This calls clReleaseEvent() on the value held by this instance.
-     */
-    ~Event() { }
-    //! \brief Default constructor - initializes to NULL.
-    Event() : detail::Wrapper<cl_type>() { }
-    /*! \brief Copy constructor.
-     * 
-     *  This calls clRetainEvent() on the parameter's cl_event.
-     */
-    Event(const Event& event) : detail::Wrapper<cl_type>(event) { }
-    /*! \brief Constructor from cl_event - takes ownership.
-     * 
-     *  This effectively transfers ownership of a refcount on the cl_event
-     *  into the new Event object.
-     */
-    Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { }
-    /*! \brief Assignment operator from Event.
-     *
-     *  This calls clRetainEvent() on the parameter and clReleaseEvent() on
-     *  the previous value held by this instance.
-     */
-    Event& operator = (const Event& rhs)
-    {
-        if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment operator from cl_event - takes ownership.
-     * 
-     *  This effectively transfers ownership of a refcount on the rhs and calls
-     *  clReleaseEvent() on the value previously held by this instance.
-     */
-    Event& operator = (const cl_event& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetEventInfo().
-    template <typename T>
-    cl_int getInfo(cl_event_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(&::clGetEventInfo, object_, name, param),
-            __GET_EVENT_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetEventInfo() that returns by value.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_event_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_event_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-    //! \brief Wrapper for clGetEventProfilingInfo().
-    template <typename T>
-    cl_int getProfilingInfo(cl_profiling_info name, T* param) const
-    {
-        return detail::errHandler(detail::getInfo(
-            &::clGetEventProfilingInfo, object_, name, param),
-    }
-    //! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_profiling_info, name>::param_type
-    getProfilingInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_profiling_info, name>::param_type param;
-        cl_int result = getProfilingInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-    /*! \brief Blocks the calling thread until this event completes.
-     * 
-     *  Wraps clWaitForEvents().
-     */
-    cl_int wait() const
-    {
-        return detail::errHandler(
-            ::clWaitForEvents(1, &object_),
-            __WAIT_FOR_EVENTS_ERR);
-    }
-#if defined(CL_VERSION_1_1)
-    /*! \brief Registers a user callback function for a specific command execution status.
-     *
-     *  Wraps clSetEventCallback().
-     */
-    cl_int setCallback(
-        cl_int type,
-        void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),		
-        void * user_data = NULL)
-    {
-        return detail::errHandler(
-            ::clSetEventCallback(
-                object_,
-                type,
-                pfn_notify,
-                user_data), 
-            __SET_EVENT_CALLBACK_ERR);
-    }
-    /*! \brief Blocks the calling thread until every event specified is complete.
-     * 
-     *  Wraps clWaitForEvents().
-     *
-     *  \param events events to wait on.  For an empty vector a NULL list and
-     *         a count of zero are passed to clWaitForEvents().
-     *
-     *  \return the clWaitForEvents() status, passed through detail::errHandler().
-     */
-    static cl_int
-    waitForEvents(const VECTOR_CLASS<Event>& events)
-    {
-        // front() on an empty vector is undefined; pass NULL instead and let
-        // clWaitForEvents() report the empty list.
-        return detail::errHandler(
-            ::clWaitForEvents(
-                (cl_uint) events.size(),
-                (events.size() > 0) ? (cl_event*)&events.front() : NULL),
-            __WAIT_FOR_EVENTS_ERR);
-    }
-#if defined(CL_VERSION_1_1)
-/*! \brief Class interface for user events (a subset of cl_event's).
- * 
- *  See Event for details about copy semantics, etc.
- */
-class UserEvent : public Event
-    /*! \brief Constructs a user event on a given context.
-     *
-     *  Wraps clCreateUserEvent().
-     */
-    UserEvent(
-        const Context& context,
-        cl_int * err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateUserEvent(
-            context(),
-            &error);
-        detail::errHandler(error, __CREATE_USER_EVENT_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    //! \brief Default constructor - initializes to NULL.
-    UserEvent() : Event() { }
-    //! \brief Copy constructor - performs shallow copy.
-    UserEvent(const UserEvent& event) : Event(event) { }
-    //! \brief Assignment Operator - performs shallow copy.
-    UserEvent& operator = (const UserEvent& rhs)
-    {
-        if (this != &rhs) {
-            Event::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Sets the execution status of a user event object.
-     *
-     *  Wraps clSetUserEventStatus().
-     */
-    cl_int setStatus(cl_int status)
-    {
-        return detail::errHandler(
-            ::clSetUserEventStatus(object_,status), 
-            __SET_USER_EVENT_STATUS_ERR);
-    }
-/*! \brief Blocks the calling thread until every event specified is complete.
- * 
- *  Wraps clWaitForEvents().
- *
- *  \param events events to wait on.  For an empty vector a NULL list and a
- *         count of zero are passed to clWaitForEvents().
- *
- *  \return the clWaitForEvents() status, passed through detail::errHandler().
- */
-inline static cl_int
-WaitForEvents(const VECTOR_CLASS<Event>& events)
-{
-    // front() on an empty vector is undefined; pass NULL instead and let
-    // clWaitForEvents() report the empty list.
-    return detail::errHandler(
-        ::clWaitForEvents(
-            (cl_uint) events.size(),
-            (events.size() > 0) ? (cl_event*)&events.front() : NULL),
-        __WAIT_FOR_EVENTS_ERR);
-}
-/*! \brief Class interface for cl_mem.
- *
- *  \note Copies of these objects are shallow, meaning that the copy will refer
- *        to the same underlying cl_mem as the original.  For details, see
- *        clRetainMemObject() and clReleaseMemObject().
- *
- *  \see cl_mem
- */
-class Memory : public detail::Wrapper<cl_mem>
-    /*! \brief Destructor.
-     *
-     *  This calls clReleaseMemObject() on the value held by this instance.
-     */
-    ~Memory() {}
-    //! \brief Default constructor - initializes to NULL.
-    Memory() : detail::Wrapper<cl_type>() { }
-    /*! \brief Copy constructor - performs shallow copy.
-     * 
-     *  This calls clRetainMemObject() on the parameter's cl_mem.
-     */
-    Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     * 
-     *  This effectively transfers ownership of a refcount on the cl_mem
-     *  into the new Memory object.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { }
-    /*! \brief Assignment operator from Memory.
-     * 
-     *  This calls clRetainMemObject() on the parameter and clReleaseMemObject()
-     *  on the previous value held by this instance.
-     */
-    Memory& operator = (const Memory& rhs)
-    {
-        if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment operator from cl_mem - takes ownership.
-     *
-     *  This effectively transfers ownership of a refcount on the rhs and calls
-     *  clReleaseMemObject() on the value previously held by this instance.
-     */
-    Memory& operator = (const cl_mem& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetMemObjectInfo().
-    template <typename T>
-    cl_int getInfo(cl_mem_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
-            __GET_MEM_OBJECT_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_mem_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_mem_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-#if defined(CL_VERSION_1_1)
-    /*! \brief Registers a callback function to be called when the memory object
-     *         is no longer needed.
-     *
-     *  Wraps clSetMemObjectDestructorCallback().
-     *
-     *  Repeated calls to this function, for a given cl_mem value, will append
-     *  to the list of functions called (in reverse order) when memory object's
-     *  resources are freed and the memory object is deleted.
-     *
-     *  \note
-     *  The registered callbacks are associated with the underlying cl_mem
-     *  value - not the Memory class instance.
-     */
-    cl_int setDestructorCallback(
-        void (CL_CALLBACK * pfn_notify)(cl_mem, void *),		
-        void * user_data = NULL)
-    {
-        return detail::errHandler(
-            ::clSetMemObjectDestructorCallback(
-                object_,
-                pfn_notify,
-                user_data), 
-    }
-// Pre-declare copy functions
-class Buffer;
-template< typename IteratorType >
-cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
-template< typename IteratorType >
-cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
-template< typename IteratorType >
-cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
-template< typename IteratorType >
-cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
-/*! \brief Class interface for Buffer Memory Objects.
- * 
- *  See Memory for details about copy semantics, etc.
- *
- *  \see Memory
- */
-class Buffer : public Memory
-    /*! \brief Constructs a Buffer in a specified context.
-     *
-     *  Wraps clCreateBuffer().
-     *
-     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
-     *                  specified.  Note alignment & exclusivity requirements.
-     */
-    Buffer(
-        const Context& context,
-        cl_mem_flags flags,
-        ::size_t size,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
-        detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    /*! \brief Constructs a Buffer in the default context.
-     *
-     *  Wraps clCreateBuffer().
-     *
-     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
-     *                  specified.  Note alignment & exclusivity requirements.
-     *
-     *  \see Context::getDefault()
-     */
-    Buffer(
-         cl_mem_flags flags,
-        ::size_t size,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        Context context = Context::getDefault(err);
-        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
-        detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    /*!
-     * \brief Construct a Buffer from a host container via iterators.
-     * IteratorType must be random access.
-     * If useHostPtr is specified iterators must represent contiguous data.
-     */
-    template< typename IteratorType >
-    Buffer(
-        IteratorType startIterator,
-        IteratorType endIterator,
-        bool readOnly,
-        bool useHostPtr = false,
-        cl_int* err = NULL)
-    {
-        typedef typename std::iterator_traits<IteratorType>::value_type DataType;
-        cl_int error;
-        cl_mem_flags flags = 0;
-        if( readOnly ) {
-            flags |= CL_MEM_READ_ONLY;
-        }
-        else {
-            flags |= CL_MEM_READ_WRITE;
-        }
-        if( useHostPtr ) {
-            flags |= CL_MEM_USE_HOST_PTR;
-        }
-        ::size_t size = sizeof(DataType)*(endIterator - startIterator);
-        Context context = Context::getDefault(err);
-        if( useHostPtr ) {
-            object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
-        } else {
-            object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
-        }
-        detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-        if( !useHostPtr ) {
-            error = cl::copy(startIterator, endIterator, *this);
-            detail::errHandler(error, __CREATE_BUFFER_ERR);
-            if (err != NULL) {
-                *err = error;
-            }
-        }
-    }
-    /*!
-     * \brief Construct a Buffer from a host container via iterators using a specified context.
-     * IteratorType must be random access.
-     * If useHostPtr is specified iterators must represent contiguous data.
-     */
-    template< typename IteratorType >
-    Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator,
-        bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
-    //! \brief Default constructor - initializes to NULL.
-    Buffer() : Memory() { }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Buffer(const Buffer& buffer) : Memory(buffer) { }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { }
-    /*! \brief Assignment from Buffer - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Buffer& operator = (const Buffer& rhs)
-    {
-        if (this != &rhs) {
-            Memory::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Buffer& operator = (const cl_mem& rhs)
-    {
-        Memory::operator=(rhs);
-        return *this;
-    }
-#if defined(CL_VERSION_1_1)
-    /*! \brief Creates a new buffer object from this.
-     *
-     *  Wraps clCreateSubBuffer().
-     */
-    Buffer createSubBuffer(
-        cl_mem_flags flags,
-        cl_buffer_create_type buffer_create_type,
-        const void * buffer_create_info,
-        cl_int * err = NULL)
-    {
-        Buffer result;
-        cl_int error;
-        result.object_ = ::clCreateSubBuffer(
-            object_, 
-            flags, 
-            buffer_create_type, 
-            buffer_create_info, 
-            &error);
-        detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-        return result;
-    }		
-#if defined (USE_DX_INTEROP)
-/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
- *
- *  This is provided to facilitate interoperability with Direct3D.
- * 
- *  See Memory for details about copy semantics, etc.
- *
- *  \see Memory
- */
-class BufferD3D10 : public Buffer
-    typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
-    cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,
-    cl_int* errcode_ret);
-    /*! \brief Constructs a BufferD3D10, in a specified context, from a
-     *         given ID3D10Buffer.
-     *
-     *  Wraps clCreateFromD3D10BufferKHR().
-     */
-    BufferD3D10(
-        const Context& context,
-        cl_mem_flags flags,
-        ID3D10Buffer* bufobj,
-        cl_int * err = NULL)
-    {
-        static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
-#if defined(CL_VERSION_1_2)
-        vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
-        cl_platform platform = -1;
-        for( int i = 0; i < props.size(); ++i ) {
-            if( props[i] == CL_CONTEXT_PLATFORM ) {
-                platform = props[i+1];
-            }
-        }
-        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
-#if defined(CL_VERSION_1_1)
-        __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
-        cl_int error;
-        object_ = pfn_clCreateFromD3D10BufferKHR(
-            context(),
-            flags,
-            bufobj,
-            &error);
-        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    //! \brief Default constructor - initializes to NULL.
-    BufferD3D10() : Buffer() { }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
-    /*! \brief Assignment from BufferD3D10 - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferD3D10& operator = (const BufferD3D10& rhs)
-    {
-        if (this != &rhs) {
-            Buffer::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferD3D10& operator = (const cl_mem& rhs)
-    {
-        Buffer::operator=(rhs);
-        return *this;
-    }
-/*! \brief Class interface for GL Buffer Memory Objects.
- *
- *  This is provided to facilitate interoperability with OpenGL.
- * 
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class BufferGL : public Buffer
-    /*! \brief Constructs a BufferGL in a specified context, from a given
-     *         GL buffer.
-     *
-     *  Wraps clCreateFromGLBuffer().
-     */
-    BufferGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLuint bufobj,
-        cl_int * err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateFromGLBuffer(
-            context(),
-            flags,
-            bufobj,
-            &error);
-        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    //! \brief Default constructor - initializes to NULL.
-    BufferGL() : Buffer() { }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
-    /*! \brief Assignment from BufferGL - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferGL& operator = (const BufferGL& rhs)
-    {
-        if (this != &rhs) {
-            Buffer::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferGL& operator = (const cl_mem& rhs)
-    {
-        Buffer::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetGLObjectInfo().
-    cl_int getObjectInfo(
-        cl_gl_object_type *type,
-        GLuint * gl_object_name)
-    {
-        return detail::errHandler(
-            ::clGetGLObjectInfo(object_,type,gl_object_name),
-            __GET_GL_OBJECT_INFO_ERR);
-    }
-/*! \brief Class interface for GL Render Buffer Memory Objects.
- *
- *  This is provided to facilitate interoperability with OpenGL.
- * 
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class BufferRenderGL : public Buffer
-    /*! \brief Constructs a BufferRenderGL in a specified context, from a given
-     *         GL Renderbuffer.
-     *
-     *  Wraps clCreateFromGLRenderbuffer().
-     */
-    BufferRenderGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLuint bufobj,
-        cl_int * err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateFromGLRenderbuffer(
-            context(),
-            flags,
-            bufobj,
-            &error);
-        detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    //! \brief Default constructor - initializes to NULL.
-    BufferRenderGL() : Buffer() { }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
-    /*! \brief Assignment from BufferGL - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferRenderGL& operator = (const BufferRenderGL& rhs)
-    {
-        if (this != &rhs) {
-            Buffer::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferRenderGL& operator = (const cl_mem& rhs)
-    {
-        Buffer::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetGLObjectInfo().
-    cl_int getObjectInfo(
-        cl_gl_object_type *type,
-        GLuint * gl_object_name)
-    {
-        return detail::errHandler(
-            ::clGetGLObjectInfo(object_,type,gl_object_name),
-            __GET_GL_OBJECT_INFO_ERR);
-    }
-/*! \brief C++ base class for Image Memory objects.
- *
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class Image : public Memory
-    //! \brief Default constructor - initializes to NULL.
-    Image() : Memory() { }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image(const Image& image) : Memory(image) { }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { }
-    /*! \brief Assignment from Image - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image& operator = (const Image& rhs)
-    {
-        if (this != &rhs) {
-            Memory::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image& operator = (const cl_mem& rhs)
-    {
-        Memory::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetImageInfo().
-    template <typename T>
-    cl_int getImageInfo(cl_image_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(&::clGetImageInfo, object_, name, param),
-            __GET_IMAGE_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetImageInfo() that returns by value.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_image_info, name>::param_type
-    getImageInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_image_info, name>::param_type param;
-        cl_int result = getImageInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-#if defined(CL_VERSION_1_2)
-/*! \brief Class interface for 1D Image Memory objects.
- *
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class Image1D : public Image
-    /*! \brief Constructs a 1D Image in a specified context.
-     *
-     *  Wraps clCreateImage().
-     */
-    Image1D(
-        const Context& context,
-        cl_mem_flags flags,
-        ImageFormat format,
-        ::size_t width,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        cl_image_desc desc =
-        {
-            CL_MEM_OBJECT_IMAGE1D,
-            width,
-            0, 0, 0, 0, 0, 0, 0, 0
-        };
-        object_ = ::clCreateImage(
-            context(), 
-            flags, 
-            &format, 
-            &desc, 
-            host_ptr, 
-            &error);
-        detail::errHandler(error, __CREATE_IMAGE_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    //! \brief Default constructor - initializes to NULL.
-    Image1D() { }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image1D(const Image1D& image1D) : Image(image1D) { }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { }
-    /*! \brief Assignment from Image1D - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image1D& operator = (const Image1D& rhs)
-    {
-        if (this != &rhs) {
-            Image::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image1D& operator = (const cl_mem& rhs)
-    {
-        Image::operator=(rhs);
-        return *this;
-    }
-/*! \class Image1DBuffer
- * \brief Image interface for 1D buffer images.
- */
-class Image1DBuffer : public Image
-    Image1DBuffer(
-        const Context& context,
-        cl_mem_flags flags,
-        ImageFormat format,
-        ::size_t width,
-        const Buffer &buffer,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        cl_image_desc desc =
-        {
-            width,
-            0, 0, 0, 0, 0, 0, 0,
-            buffer()
-        };
-        object_ = ::clCreateImage(
-            context(), 
-            flags, 
-            &format, 
-            &desc, 
-            NULL, 
-            &error);
-        detail::errHandler(error, __CREATE_IMAGE_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    Image1DBuffer() { }
-    Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { }
-    __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { }
-    Image1DBuffer& operator = (const Image1DBuffer& rhs)
-    {
-        if (this != &rhs) {
-            Image::operator=(rhs);
-        }
-        return *this;
-    }
-    Image1DBuffer& operator = (const cl_mem& rhs)
-    {
-        Image::operator=(rhs);
-        return *this;
-    }
-/*! \class Image1DArray
- * \brief Image interface for arrays of 1D images.
- */
-class Image1DArray : public Image
-    Image1DArray(
-        const Context& context,
-        cl_mem_flags flags,
-        ImageFormat format,
-        ::size_t arraySize,
-        ::size_t width,
-        ::size_t rowPitch,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        cl_image_desc desc =
-        {
-            width,
-            0, 0,  // height, depth (unused)
-            arraySize,
-            rowPitch,
-            0, 0, 0, 0
-        };
-        object_ = ::clCreateImage(
-            context(), 
-            flags, 
-            &format, 
-            &desc, 
-            host_ptr, 
-            &error);
-        detail::errHandler(error, __CREATE_IMAGE_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    Image1DArray() { }
-    Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { }
-    __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { }
-    Image1DArray& operator = (const Image1DArray& rhs)
-    {
-        if (this != &rhs) {
-            Image::operator=(rhs);
-        }
-        return *this;
-    }
-    Image1DArray& operator = (const cl_mem& rhs)
-    {
-        Image::operator=(rhs);
-        return *this;
-    }
-#endif // #if defined(CL_VERSION_1_2)
-/*! \brief Class interface for 2D Image Memory objects.
- *
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class Image2D : public Image
-    /*! \brief Constructs a 1D Image in a specified context.
-     *
-     *  Wraps clCreateImage().
-     */
-    Image2D(
-        const Context& context,
-        cl_mem_flags flags,
-        ImageFormat format,
-        ::size_t width,
-        ::size_t height,
-        ::size_t row_pitch = 0,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        bool useCreateImage;
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-        // Run-time decision based on the actual platform
-        {
-            cl_uint version = detail::getContextPlatformVersion(context());
-            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
-        }
-#elif defined(CL_VERSION_1_2)
-        useCreateImage = true;
-        useCreateImage = false;
-#if defined(CL_VERSION_1_2)
-        if (useCreateImage)
-        {
-            cl_image_desc desc =
-            {
-                CL_MEM_OBJECT_IMAGE2D,
-                width,
-                height,
-                0, 0, // depth, array size (unused)
-                row_pitch,
-                0, 0, 0, 0
-            };
-            object_ = ::clCreateImage(
-                context(),
-                flags,
-                &format,
-                &desc,
-                host_ptr,
-                &error);
-            detail::errHandler(error, __CREATE_IMAGE_ERR);
-            if (err != NULL) {
-                *err = error;
-            }
-        }
-#endif // #if defined(CL_VERSION_1_2)
-#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-        if (!useCreateImage)
-        {
-            object_ = ::clCreateImage2D(
-                context(), flags,&format, width, height, row_pitch, host_ptr, &error);
-            detail::errHandler(error, __CREATE_IMAGE2D_ERR);
-            if (err != NULL) {
-                *err = error;
-            }
-        }
-#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-    }
-    //! \brief Default constructor - initializes to NULL.
-    Image2D() { }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2D(const Image2D& image2D) : Image(image2D) { }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { }
-    /*! \brief Assignment from Image2D - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2D& operator = (const Image2D& rhs)
-    {
-        if (this != &rhs) {
-            Image::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2D& operator = (const cl_mem& rhs)
-    {
-        Image::operator=(rhs);
-        return *this;
-    }
-#if !defined(CL_VERSION_1_2)
-/*! \brief Class interface for GL 2D Image Memory objects.
- *
- *  This is provided to facilitate interoperability with OpenGL.
- * 
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- *  \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
- */
-    /*! \brief Constructs an Image2DGL in a specified context, from a given
-     *         GL Texture.
-     *
-     *  Wraps clCreateFromGLTexture2D().
-     */
-    Image2DGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLenum target,
-        GLint  miplevel,
-        GLuint texobj,
-        cl_int * err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateFromGLTexture2D(
-            context(),
-            flags,
-            target,
-            miplevel,
-            texobj,
-            &error);
-        detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    //! \brief Default constructor - initializes to NULL.
-    Image2DGL() : Image2D() { }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2DGL(const Image2DGL& image) : Image2D(image) { }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }
-    /*! \brief Assignment from Image2DGL - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2DGL& operator = (const Image2DGL& rhs)
-    {
-        if (this != &rhs) {
-            Image2D::operator=(rhs);
-        }
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2DGL& operator = (const cl_mem& rhs)
-    {
-        Image2D::operator=(rhs);
-        return *this;
-    }
-#endif // #if !defined(CL_VERSION_1_2)
-#if defined(CL_VERSION_1_2)
-/*! \class Image2DArray
- * \brief Image interface for arrays of 2D images.
- */
-class Image2DArray : public Image
-    Image2DArray(
-        const Context& context,
-        cl_mem_flags flags,
-        ImageFormat format,
-        ::size_t arraySize,
-        ::size_t width,
-        ::size_t height,
-        ::size_t rowPitch,
-        ::size_t slicePitch,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        cl_image_desc desc =
-        {
-            width,
-            height,
-            0,       // depth (unused)
-            arraySize,
-            rowPitch,
-            slicePitch,
-            0, 0, 0
-        };
-        object_ = ::clCreateImage(
-            context(), 
-            flags, 
-            &format, 
-            &desc, 
-            host_ptr, 
-            &error);
-        detail::errHandler(error, __CREATE_IMAGE_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-    Image2DArray() { }
-    Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { }
-    __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { }
-    Image2DArray& operator = (const Image2DArray& rhs)
-    {
-        if (this != &rhs) {
-            Image::operator=(rhs);
-        }
-        return *this;
-    }
-    Image2DArray& operator = (const cl_mem& rhs)
-    {
-        Image::operator=(rhs);
-        return *this;
-    }
-#endif // #if defined(CL_VERSION_1_2)
-/*! \brief Class interface for 3D Image Memory objects.
- *
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class Image3D : public Image
-    /*! \brief Constructs a 3D Image in a specified context.
-     *
-     *  Wraps clCreateImage().
-     */
-    Image3D(
-        const Context& context,
-        cl_mem_flags flags,
-        ImageFormat format,
-        ::size_t width,
-        ::size_t height,
-        ::size_t depth,
-        ::size_t row_pitch = 0,
-        ::size_t slice_pitch = 0,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        bool useCreateImage;
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-        // Run-time decision based on the actual platform
-        {
-            cl_uint version = detail::getContextPlatformVersion(context());
-            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
-        }
-#elif defined(CL_VERSION_1_2)
-        useCreateImage = true;
-        useCreateImage = false;
-#if defined(CL_VERSION_1_2)
-        if (useCreateImage)
-        {
-            cl_image_desc desc =
-            {
-                CL_MEM_OBJECT_IMAGE3D,
-                width,
-                height,
-                depth,
-                0,      // array size (unused)
-                row_pitch,
-                slice_pitch,
-                0, 0, 0
-            };
-            object_ = ::clCreateImage(
-                context(), 
-                flags, 
-                &format, 
-                &desc, 
-                host_ptr, 
-                &error);
-            detail::errHandler(error, __CREATE_IMAGE_ERR);
-            if (err != NULL) {
-                *err = error;
-            }
-        }
-#endif  // #if defined(CL_VERSION_1_2)
-#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-        if (!useCreateImage)
-        {
-            object_ = ::clCreateImage3D(
-                context(), flags, &format, width, height, depth, row_pitch,
-                slice_pitch, host_ptr, &error);
-            detail::errHandler(error, __CREATE_IMAGE3D_ERR);
-            if (err != NULL) {
-                *err = error;
-            }
-        }
-#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-    }
-    //! \brief Default constructor - the wrapped cl_mem starts out as NULL.
-    Image3D()
-    {
-    }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  The copy is delegated to the Image base class; see Memory for
-     *  further details.
-     */
-    Image3D(const Image3D& image3D)
-        : Image(image3D)
-    {
-    }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  The handle is handed straight to the Image base class; see Memory
-     *  for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D)
-        : Image(image3D)
-    {
-    }
-    /*! \brief Assignment from Image3D - performs shallow copy.
-     *
-     *  Self-assignment is a no-op.  See Memory for further details.
-     */
-    Image3D& operator = (const Image3D& rhs)
-    {
-        if (this == &rhs) {
-            return *this;
-        }
-        Image::operator=(rhs);
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  Forwards the raw handle to Image::operator=.  See Memory for
-     *  further details.
-     */
-    Image3D& operator = (const cl_mem& rhs)
-    {
-        Image::operator=(rhs);
-        return *this;
-    }
-#if !defined(CL_VERSION_1_2)
-/*! \brief Class interface for GL 3D Image Memory objects.
- *
- *  This is provided to facilitate interoperability with OpenGL.
- * 
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class Image3DGL : public Image3D
-    /*! \brief Constructs an Image3DGL in a specified context, from a given
-     *         GL Texture.
-     *
-     *  Wraps clCreateFromGLTexture3D().  The status code is passed to
-     *  detail::errHandler() and, when \a err is non-NULL, stored in *err.
-     */
-    Image3DGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLenum target,
-        GLint  miplevel,
-        GLuint texobj,
-        cl_int * err = NULL)
-    {
-        cl_int status;
-        object_ = ::clCreateFromGLTexture3D(
-            context(), flags, target, miplevel, texobj, &status);
-        detail::errHandler(status, __CREATE_GL_TEXTURE_3D_ERR);
-        if (err) {
-            *err = status;
-        }
-    }
-    //! \brief Default constructor - the wrapped cl_mem starts out as NULL.
-    Image3DGL()
-        : Image3D()
-    {
-    }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  The copy is delegated to the Image3D base class; see Memory for
-     *  further details.
-     */
-    Image3DGL(const Image3DGL& image)
-        : Image3D(image)
-    {
-    }
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  The handle is handed straight to the Image3D base class; see Memory
-     *  for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image)
-        : Image3D(image)
-    {
-    }
-    /*! \brief Assignment from Image3DGL - performs shallow copy.
-     *
-     *  Self-assignment is a no-op.  See Memory for further details.
-     */
-    Image3DGL& operator = (const Image3DGL& rhs)
-    {
-        if (this == &rhs) {
-            return *this;
-        }
-        Image3D::operator=(rhs);
-        return *this;
-    }
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  Forwards the raw handle to Image3D::operator=.  See Memory for
-     *  further details.
-     */
-    Image3DGL& operator = (const cl_mem& rhs)
-    {
-        Image3D::operator=(rhs);
-        return *this;
-    }
-#endif // #if !defined(CL_VERSION_1_2)
-#if defined(CL_VERSION_1_2)
-/*! \class ImageGL
- * \brief general image interface for GL interop.
- * We abstract the 2D and 3D GL images into a single instance here
- * that wraps all GL sourced images on the grounds that setup information
- * was performed by OpenCL anyway.
- */
-class ImageGL : public Image
-    /*! \brief Constructs an ImageGL in a specified context, from a given
-     *         GL texture.
-     *
-     *  Wraps clCreateFromGLTexture().  The status code is passed to
-     *  detail::errHandler() and, when \a err is non-NULL, stored in *err.
-     */
-    ImageGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLenum target,
-        GLint  miplevel,
-        GLuint texobj,
-        cl_int * err = NULL)
-    {
-        cl_int status;
-        object_ = ::clCreateFromGLTexture(
-            context(), flags, target, miplevel, texobj, &status);
-        detail::errHandler(status, __CREATE_GL_TEXTURE_ERR);
-        if (err) {
-            *err = status;
-        }
-    }
-    //! \brief Default constructor - delegates to the Image default constructor.
-    ImageGL()
-        : Image()
-    {
-    }
-    //! \brief Copy constructor - delegates to the Image copy constructor.
-    ImageGL(const ImageGL& image)
-        : Image(image)
-    {
-    }
-    //! \brief Constructor from cl_mem - forwards the handle to Image.
-    __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image)
-        : Image(image)
-    {
-    }
-    //! \brief Assignment from ImageGL; self-assignment is a no-op.
-    ImageGL& operator = (const ImageGL& rhs)
-    {
-        if (this == &rhs) {
-            return *this;
-        }
-        Image::operator=(rhs);
-        return *this;
-    }
-    //! \brief Assignment from cl_mem - forwards the handle to Image::operator=.
-    ImageGL& operator = (const cl_mem& rhs)
-    {
-        Image::operator=(rhs);
-        return *this;
-    }
-#endif // #if defined(CL_VERSION_1_2)
-/*! \brief Class interface for cl_sampler.
- *
- *  \note Copies of these objects are shallow, meaning that the copy will refer
- *        to the same underlying cl_sampler as the original.  For details, see
- *        clRetainSampler() and clReleaseSampler().
- *
- *  \see cl_sampler 
- */
-class Sampler : public detail::Wrapper<cl_sampler>
-    /*! \brief Destructor.
-     *
-     *  This calls clReleaseSampler() on the value held by this instance.
-     *  The body itself is empty.
-     */
-    ~Sampler()
-    {
-    }
-    //! \brief Default constructor - initializes to NULL.
-    Sampler()
-    {
-    }
-    /*! \brief Constructs a Sampler in a specified context.
-     *
-     *  Wraps clCreateSampler().  The status code is passed to
-     *  detail::errHandler() and, when \a err is non-NULL, stored in *err.
-     */
-    Sampler(
-        const Context& context,
-        cl_bool normalized_coords,
-        cl_addressing_mode addressing_mode,
-        cl_filter_mode filter_mode,
-        cl_int* err = NULL)
-    {
-        cl_int status;
-        object_ = ::clCreateSampler(
-            context(), normalized_coords, addressing_mode, filter_mode, &status);
-        detail::errHandler(status, __CREATE_SAMPLER_ERR);
-        if (err) {
-            *err = status;
-        }
-    }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  This calls clRetainSampler() on the parameter's cl_sampler.
-     */
-    Sampler(const Sampler& sampler)
-        : detail::Wrapper<cl_type>(sampler)
-    {
-    }
-    /*! \brief Constructor from cl_sampler - takes ownership.
-     *
-     *  This effectively transfers ownership of a refcount on the cl_sampler
-     *  into the new Sampler object.
-     */
-    Sampler(const cl_sampler& sampler)
-        : detail::Wrapper<cl_type>(sampler)
-    {
-    }
-    /*! \brief Assignment operator from Sampler.
-     *
-     *  This calls clRetainSampler() on the parameter and clReleaseSampler()
-     *  on the previous value held by this instance.  Self-assignment is a
-     *  no-op.
-     */
-    Sampler& operator = (const Sampler& rhs)
-    {
-        if (this == &rhs) {
-            return *this;
-        }
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    /*! \brief Assignment operator from cl_sampler - takes ownership.
-     *
-     *  This effectively transfers ownership of a refcount on the rhs and calls
-     *  clReleaseSampler() on the value previously held by this instance.
-     */
-    Sampler& operator = (const cl_sampler& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetSamplerInfo(); the result is written to *param.
-    template <typename T>
-    cl_int getInfo(cl_sampler_info name, T* param) const
-    {
-        const cl_int status =
-            detail::getInfo(&::clGetSamplerInfo, object_, name, param);
-        return detail::errHandler(status, __GET_SAMPLER_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetSamplerInfo() that returns by value.
-    //!        The status code is stored in *err when \a err is non-NULL.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_sampler_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typedef typename detail::param_traits<
-            detail::cl_sampler_info, name>::param_type info_type;
-        info_type value;
-        const cl_int status = getInfo(name, &value);
-        if (err) {
-            *err = status;
-        }
-        return value;
-    }
-class Program;
-class CommandQueue;
-class Kernel;
-//! \brief Class interface for specifying NDRange values.
-class NDRange
-    size_t<3> sizes_;
-    cl_uint dimensions_;
-    //! \brief Default constructor - resulting range has zero dimensions.
-    NDRange() : dimensions_(0) { }
-    //! \brief Constructs one-dimensional range.
-    NDRange(::size_t size0) : dimensions_(1)
-    {
-        sizes_[0] = size0;
-    }
-    //! \brief Constructs two-dimensional range.
-    NDRange(::size_t size0, ::size_t size1) : dimensions_(2)
-    {
-        sizes_[1] = size1;
-        sizes_[0] = size0;
-    }
-    //! \brief Constructs three-dimensional range.
-    NDRange(::size_t size0, ::size_t size1, ::size_t size2) : dimensions_(3)
-    {
-        sizes_[2] = size2;
-        sizes_[1] = size1;
-        sizes_[0] = size0;
-    }
-    /*! \brief Conversion operator to const ::size_t *.
-     *
-     *  \returns a pointer to the size of the first dimension.
-     */
-    operator const ::size_t*() const
-    {
-        return (const ::size_t*) sizes_;
-    }
-    //! \brief Queries the number of dimensions in the range.
-    ::size_t dimensions() const
-    {
-        return dimensions_;
-    }
-//! \brief A zero-dimensional range.
-static const NDRange NullRange;
-//! \brief Local address wrapper for use with Kernel::setArg
-struct LocalSpaceArg
-    ::size_t size_;  //!< Size reported by KernelArgumentHandler<LocalSpaceArg>::size().
-namespace detail {
-template <typename T>
-struct KernelArgumentHandler  // Generic case: the argument is described by the value itself.
-    static ::size_t size(const T&) { return sizeof(T); }  //!< Argument size is sizeof(T).
-    static T* ptr(T& value) { return &value; }  //!< Argument data is the address of the value.
-template <>
-struct KernelArgumentHandler<LocalSpaceArg>  // LocalSpaceArg case: a size paired with a NULL data pointer.
-    static ::size_t size(const LocalSpaceArg& value) { return value.size_; }  //!< Argument size is the stored size_.
-    static void* ptr(LocalSpaceArg&) { return NULL; }  //!< Always NULL.
-//! \endcond
-/*! __local
- * \brief Helper function for generating LocalSpaceArg objects.
- * Deprecated. Replaced with Local.
- *
- * \param size value stored in the size_ member of the returned object.
- * \returns a LocalSpaceArg initialised as { size }.
- */
-__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
-inline LocalSpaceArg
-__local(::size_t size)
-    LocalSpaceArg ret = { size };
-    return ret;
-/*! Local
- * \brief Helper function for generating LocalSpaceArg objects.
- *
- * \param size value stored in the size_ member of the returned object.
- * \returns a LocalSpaceArg initialised as { size }.
- */
-inline LocalSpaceArg
-Local(::size_t size)
-    LocalSpaceArg ret = { size };
-    return ret;
-//class KernelFunctor;
-/*! \brief Class interface for cl_kernel.
- *
- *  \note Copies of these objects are shallow, meaning that the copy will refer
- *        to the same underlying cl_kernel as the original.  For details, see
- *        clRetainKernel() and clReleaseKernel().
- *
- *  \see cl_kernel
- */
-class Kernel : public detail::Wrapper<cl_kernel>
-    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
-    /*! \brief Destructor.
-     *
-     *  This calls clReleaseKernel() on the value held by this instance.
-     *  The body itself is empty.
-     */
-    ~Kernel()
-    {
-    }
-    //! \brief Default constructor - initializes to NULL.
-    Kernel()
-    {
-    }
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  This calls clRetainKernel() on the parameter's cl_kernel.
-     */
-    Kernel(const Kernel& kernel)
-        : detail::Wrapper<cl_type>(kernel)
-    {
-    }
-    /*! \brief Constructor from cl_kernel - takes ownership.
-     *
-     *  This effectively transfers ownership of a refcount on the cl_kernel
-     *  into the new Kernel object.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel)
-        : detail::Wrapper<cl_type>(kernel)
-    {
-    }
-    /*! \brief Assignment operator from Kernel.
-     *
-     *  This calls clRetainKernel() on the parameter and clReleaseKernel()
-     *  on the previous value held by this instance.  Self-assignment is a
-     *  no-op.
-     */
-    Kernel& operator = (const Kernel& rhs)
-    {
-        if (this == &rhs) {
-            return *this;
-        }
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    /*! \brief Assignment operator from cl_kernel - takes ownership.
-     *
-     *  This effectively transfers ownership of a refcount on the rhs and calls
-     *  clReleaseKernel() on the value previously held by this instance.
-     */
-    Kernel& operator = (const cl_kernel& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    //! \brief Wrapper for clGetKernelInfo(); the result is written to *param.
-    template <typename T>
-    cl_int getInfo(cl_kernel_info name, T* param) const
-    {
-        const cl_int status =
-            detail::getInfo(&::clGetKernelInfo, object_, name, param);
-        return detail::errHandler(status, __GET_KERNEL_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetKernelInfo() that returns by value.
-    //!        The status code is stored in *err when \a err is non-NULL.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_kernel_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typedef typename detail::param_traits<
-            detail::cl_kernel_info, name>::param_type info_type;
-        info_type value;
-        const cl_int status = getInfo(name, &value);
-        if (err) {
-            *err = status;
-        }
-        return value;
-    }
-#if defined(CL_VERSION_1_2)
-    //! \brief Wrapper for clGetKernelArgInfo() for argument \a argIndex;
-    //!        the result is written to *param.
-    template <typename T>
-    cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
-    {
-        const cl_int status = detail::getInfo(
-            &::clGetKernelArgInfo, object_, argIndex, name, param);
-        return detail::errHandler(status, __GET_KERNEL_ARG_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetKernelArgInfo() that returns by value.
-    //!        The status code is stored in *err when \a err is non-NULL.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
-    getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
-    {
-        typedef typename detail::param_traits<
-            detail::cl_kernel_arg_info, name>::param_type info_type;
-        info_type value;
-        const cl_int status = getArgInfo(argIndex, name, &value);
-        if (err) {
-            *err = status;
-        }
-        return value;
-    }
-#endif // #if defined(CL_VERSION_1_2)
-    //! \brief Wrapper for clGetKernelWorkGroupInfo() for \a device;
-    //!        the result is written to *param.
-    template <typename T>
-    cl_int getWorkGroupInfo(
-        const Device& device, cl_kernel_work_group_info name, T* param) const
-    {
-        const cl_int status = detail::getInfo(
-            &::clGetKernelWorkGroupInfo, object_, device(), name, param);
-        return detail::errHandler(status, __GET_KERNEL_WORK_GROUP_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetKernelWorkGroupInfo() that returns by value.
-    //!        The status code is stored in *err when \a err is non-NULL.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
-        getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
-    {
-        typedef typename detail::param_traits<
-            detail::cl_kernel_work_group_info, name>::param_type info_type;
-        info_type value;
-        const cl_int status = getWorkGroupInfo(device, name, &value);
-        if (err) {
-            *err = status;
-        }
-        return value;
-    }
-    //! \brief Sets kernel argument \a index from a value; the size and data
-    //!        pointer are supplied by detail::KernelArgumentHandler<T>.
-    template <typename T>
-    cl_int setArg(cl_uint index, T value)
-    {
-        typedef detail::KernelArgumentHandler<T> handler;
-        const cl_int status = ::clSetKernelArg(
-            object_, index, handler::size(value), handler::ptr(value));
-        return detail::errHandler(status, __SET_KERNEL_ARGS_ERR);
-    }
-    //! \brief Sets kernel argument \a index from an explicit size and pointer.
-    cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
-    {
-        const cl_int status = ::clSetKernelArg(object_, index, size, argPtr);
-        return detail::errHandler(status, __SET_KERNEL_ARGS_ERR);
-    }
-/*! \class Program
- * \brief Program interface that implements cl_program.
- */
-class Program : public detail::Wrapper<cl_program>
-    typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
-    typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
-    /*! \brief Creates a program from a single source string in the default
-     *         context, optionally building it.
-     *
-     *  Wraps clCreateProgramWithSource() and, when \a build is true and
-     *  creation succeeded, clBuildProgram() with an empty option string.
-     *  The last status code is stored in *err when \a err is non-NULL.
-     */
-    Program(
-        const STRING_CLASS& source,
-        bool build = false,
-        cl_int* err = NULL)
-    {
-        cl_int status;
-        const char * text = source.c_str();
-        const ::size_t textLength = source.size();
-        Context context = Context::getDefault(err);
-        object_ = ::clCreateProgramWithSource(
-            context(), (cl_uint)1, &text, &textLength, &status);
-        detail::errHandler(status, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-        if (build && status == CL_SUCCESS) {
-            status = ::clBuildProgram(object_, 0, NULL, "", NULL, NULL);
-            detail::errHandler(status, __BUILD_PROGRAM_ERR);
-        }
-        if (err) {
-            *err = status;
-        }
-    }
-    /*! \brief Creates a program from a single source string in \a context,
-     *         optionally building it.
-     *
-     *  Wraps clCreateProgramWithSource() and, when \a build is true and
-     *  creation succeeded, clBuildProgram() with an empty option string.
-     *  The last status code is stored in *err when \a err is non-NULL.
-     */
-    Program(
-        const Context& context,
-        const STRING_CLASS& source,
-        bool build = false,
-        cl_int* err = NULL)
-    {
-        cl_int status;
-        const char * text = source.c_str();
-        const ::size_t textLength = source.size();
-        object_ = ::clCreateProgramWithSource(
-            context(), (cl_uint)1, &text, &textLength, &status);
-        detail::errHandler(status, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-        if (build && status == CL_SUCCESS) {
-            status = ::clBuildProgram(object_, 0, NULL, "", NULL, NULL);
-            detail::errHandler(status, __BUILD_PROGRAM_ERR);
-        }
-        if (err) {
-            *err = status;
-        }
-    }
-    /*! \brief Creates a program in \a context from a list of
-     *         (pointer, length) source pairs.
-     *
-     *  Wraps clCreateProgramWithSource().  The status code is stored in
-     *  *err when \a err is non-NULL.
-     */
-    Program(
-        const Context& context,
-        const Sources& sources,
-        cl_int* err = NULL)
-    {
-        cl_int status;
-        const ::size_t count = (::size_t)sources.size();
-        // Split the pairs into the two parallel arrays the C API expects.
-        ::size_t* sizes = (::size_t*) alloca(count * sizeof(::size_t));
-        const char** texts = (const char**) alloca(count * sizeof(const char*));
-        for (::size_t idx = 0; idx < count; ++idx) {
-            texts[idx] = sources[(int)idx].first;
-            sizes[idx] = sources[(int)idx].second;
-        }
-        object_ = ::clCreateProgramWithSource(
-            context(), (cl_uint)count, texts, sizes, &status);
-        detail::errHandler(status, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-        if (err) {
-            *err = status;
-        }
-    }
-    /**
-     * Construct a program object from a list of devices and a per-device list of binaries.
-     * \param context A valid OpenCL context in which to construct the program.
-     * \param devices A vector of OpenCL device objects for which the program will be created.
-     * \param binaries A vector of pairs of a pointer to a binary object and its length.
-     * \param binaryStatus An optional vector that on completion will be resized to
-     *   match the size of binaries and filled with values to specify if each binary
-     *   was successfully loaded.
-     *   Set to CL_SUCCESS if the binary was successfully loaded.
-     *   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
-     *   Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
-     * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
-     *   CL_INVALID_CONTEXT if context is not a valid context.
-     *   CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; 
-     *     or if any entry in binaries is NULL or has length 0.
-     *   CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
-     *   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
-     *   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
-     */
-    Program(
-        const Context& context,
-        const VECTOR_CLASS<Device>& devices,
-        const Binaries& binaries,
-        VECTOR_CLASS<cl_int>* binaryStatus = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        const ::size_t numDevices = devices.size();
-        // Catch size mismatch early and return
-        if(binaries.size() != numDevices) {
-            error = CL_INVALID_VALUE;
-            detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
-            if (err != NULL) {
-                *err = error;
-            }
-            return;
-        }
-        ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
-        // The array holds "const unsigned char*" elements, so that is the
-        // type whose size must be used for the allocation.
-        const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char*));
-        for (::size_t i = 0; i < numDevices; ++i) {
-            images[i] = (const unsigned char*)binaries[(int)i].first;
-            lengths[i] = binaries[(int)i].second;
-        }
-        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
-        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
-            deviceIDs[deviceIndex] = (devices[deviceIndex])();
-        }
-        // front() must not be called on an empty vector, so only take the
-        // address of the status storage when there is at least one device.
-        cl_int* statusPtr = NULL;
-        if (binaryStatus != NULL) {
-            binaryStatus->resize(numDevices);
-            if (numDevices > 0) {
-                statusPtr = &binaryStatus->front();
-            }
-        }
-        object_ = ::clCreateProgramWithBinary(
-            context(), (cl_uint) numDevices,
-            deviceIDs,
-            lengths, images, statusPtr, &error);
-        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-#if defined(CL_VERSION_1_2)
-    /**
-     * Create program using builtin kernels.
-     * Wraps clCreateProgramWithBuiltInKernels(); the status code is stored
-     * in *err when err is non-NULL.
-     * \param kernelNames Semi-colon separated list of builtin kernel names
-     */
-    Program(
-        const Context& context,
-        const VECTOR_CLASS<Device>& devices,
-        const STRING_CLASS& kernelNames,
-        cl_int* err = NULL)
-    {
-        cl_int status;
-        const ::size_t count = devices.size();
-        // Unwrap the Device objects into the raw id array the C API expects.
-        cl_device_id* ids = (cl_device_id*) alloca(count * sizeof(cl_device_id));
-        for (::size_t idx = 0; idx < count; ++idx) {
-            ids[idx] = (devices[idx])();
-        }
-        object_ = ::clCreateProgramWithBuiltInKernels(
-            context(), (cl_uint) devices.size(), ids, kernelNames.c_str(), &status);
-        detail::errHandler(status, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
-        if (err) {
-            *err = status;
-        }
-    }
-#endif // #if defined(CL_VERSION_1_2)
-    //! \brief Default constructor.
-    Program()
-    {
-    }
-    //! \brief Copy constructor - delegates to the Wrapper copy constructor.
-    Program(const Program& program)
-        : detail::Wrapper<cl_type>(program)
-    {
-    }
-    //! \brief Constructor from cl_program - forwards the handle to Wrapper.
-    __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program)
-        : detail::Wrapper<cl_type>(program)
-    {
-    }
-    //! \brief Assignment from Program; self-assignment is a no-op.
-    Program& operator = (const Program& rhs)
-    {
-        if (this == &rhs) {
-            return *this;
-        }
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    //! \brief Assignment from cl_program - forwards the handle to Wrapper.
-    Program& operator = (const cl_program& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    //! \brief Builds the program for the given devices; wraps clBuildProgram()
-    //!        and returns its status after detail::errHandler().
-    cl_int build(
-        const VECTOR_CLASS<Device>& devices,
-        const char* options = NULL,
-        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-        void* data = NULL) const
-    {
-        const ::size_t count = devices.size();
-        // Unwrap the Device objects into the raw id array the C API expects.
-        cl_device_id* ids = (cl_device_id*) alloca(count * sizeof(cl_device_id));
-        for (::size_t idx = 0; idx < count; ++idx) {
-            ids[idx] = (devices[idx])();
-        }
-        const cl_int status = ::clBuildProgram(
-            object_, (cl_uint) devices.size(), ids, options, notifyFptr, data);
-        return detail::errHandler(status, __BUILD_PROGRAM_ERR);
-    }
-    //! \brief Builds the program without an explicit device list; wraps
-    //!        clBuildProgram() and returns its status after detail::errHandler().
-    cl_int build(
-        const char* options = NULL,
-        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-        void* data = NULL) const
-    {
-        const cl_int status = ::clBuildProgram(
-            object_,
-            0, NULL,    // no device list
-            options,
-            notifyFptr,
-            data);
-        return detail::errHandler(status, __BUILD_PROGRAM_ERR);
-    }
-#if defined(CL_VERSION_1_2)
-	cl_int compile(
-        const char* options = NULL,
-        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-        void* data = NULL) const
-    {
-        // Wraps clCompileProgram(); neither a device list nor input headers
-        // are passed.
-        const cl_int status = ::clCompileProgram(
-            object_,
-            0, NULL,        // no device list
-            options,
-            0, NULL, NULL,  // no input headers / header include names
-            notifyFptr,
-            data);
-        return detail::errHandler(status, __COMPILE_PROGRAM_ERR);
-    }
-    //! \brief Wrapper for clGetProgramInfo(); the result is written to *param.
-    template <typename T>
-    cl_int getInfo(cl_program_info name, T* param) const
-    {
-        const cl_int status =
-            detail::getInfo(&::clGetProgramInfo, object_, name, param);
-        return detail::errHandler(status, __GET_PROGRAM_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetProgramInfo() that returns by value.
-    //!        The status code is stored in *err when \a err is non-NULL.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_program_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typedef typename detail::param_traits<
-            detail::cl_program_info, name>::param_type info_type;
-        info_type value;
-        const cl_int status = getInfo(name, &value);
-        if (err) {
-            *err = status;
-        }
-        return value;
-    }
-    //! \brief Wrapper for clGetProgramBuildInfo() for \a device;
-    //!        the result is written to *param.
-    template <typename T>
-    cl_int getBuildInfo(
-        const Device& device, cl_program_build_info name, T* param) const
-    {
-        const cl_int status = detail::getInfo(
-            &::clGetProgramBuildInfo, object_, device(), name, param);
-        return detail::errHandler(status, __GET_PROGRAM_BUILD_INFO_ERR);
-    }
-    //! \brief Wrapper for clGetProgramBuildInfo() that returns by value.
-    //!        The status code is stored in *err when \a err is non-NULL.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_program_build_info, name>::param_type
-    getBuildInfo(const Device& device, cl_int* err = NULL) const
-    {
-        typedef typename detail::param_traits<
-            detail::cl_program_build_info, name>::param_type info_type;
-        info_type value;
-        const cl_int status = getBuildInfo(device, name, &value);
-        if (err) {
-            *err = status;
-        }
-        return value;
-    }
-    /*! \brief Creates a Kernel for every kernel function in this program.
-     *
-     *  Wraps clCreateKernelsInProgram(): a first call queries the number of
-     *  kernels, a second call creates them.
-     *
-     *  \param kernels Receives the created kernels, replacing any previous
-     *                 contents.  Must not be NULL.
-     *  \returns CL_SUCCESS on success; otherwise CL_INVALID_VALUE when
-     *           \a kernels is NULL, or the error reported by
-     *           clCreateKernelsInProgram(), each passed through
-     *           detail::errHandler().
-     */
-    cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
-    {
-        if (kernels == NULL) {
-            return detail::errHandler(CL_INVALID_VALUE, __CREATE_KERNELS_IN_PROGRAM_ERR);
-        }
-        cl_uint numKernels;
-        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
-        }
-        // Receive the raw handles as cl_kernel.  Treating this storage as
-        // Kernel objects and copy-assigning from it would retain every
-        // handle once more without a matching release, leaking a reference.
-        cl_kernel* value = (cl_kernel*) alloca(numKernels * sizeof(cl_kernel));
-        err = ::clCreateKernelsInProgram(
-            object_, numKernels, value, NULL);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
-        }
-        kernels->clear();
-        for (cl_uint i = 0; i < numKernels; ++i) {
-            // The temporary takes ownership of the created reference; the
-            // copy stored in the vector retains it and the temporary then
-            // releases its own, leaving exactly one reference per kernel.
-            kernels->push_back(Kernel(value[i]));
-        }
-        return CL_SUCCESS;
-    }
-#if defined(CL_VERSION_1_2)
-inline Program linkProgram(  // Links two programs with clLinkProgram() and wraps the result in a Program.
-    Program input1,
-    Program input2,
-    const char* options = NULL,
-    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-    void* data = NULL,
-    cl_int* err = NULL)  // receives the clLinkProgram() status when non-NULL
-    cl_int err_local = CL_SUCCESS;
-    cl_program programs[2] = { input1(), input2() };
-    Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();  // link in the context input1 reports
-    cl_program prog = ::clLinkProgram(
-        ctx(),
-        0,  // no device list
-        NULL,
-        options,
-        2,
-        programs,
-        notifyFptr,
-        data,
-        &err_local);
-    detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);  // NOTE(review): reports the compile error string for a link failure - confirm intended
-    if (err != NULL) {
-        *err = err_local;
-    }
-    return Program(prog);
-inline Program linkProgram(  // Links a list of programs with clLinkProgram() and wraps the result in a Program.
-    VECTOR_CLASS<Program> inputPrograms,
-    const char* options = NULL,
-    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-    void* data = NULL,
-    cl_int* err = NULL)  // receives the clLinkProgram() status when non-NULL
-    cl_int err_local = CL_SUCCESS;
-    cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
-    if (programs != NULL) {
-        for (unsigned int i = 0; i < inputPrograms.size(); i++) {
-          programs[i] = inputPrograms[i]();  // unwrap to the raw cl_program handle
-        }
-    } 
-    cl_program prog = ::clLinkProgram(
-        Context::getDefault()(),  // NOTE(review): links in the default context, unlike the two-program overload which queries its input's context - confirm intended
-        0,  // no device list
-        NULL,
-        options,
-        (cl_uint)inputPrograms.size(),
-        programs,
-        notifyFptr,
-        data,
-        &err_local);
-    detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);  // NOTE(review): reports the compile error string for a link failure - confirm intended
-    if (err != NULL) {
-        *err = err_local;
-    }
-    return Program(prog);
-inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const  // Specialisation that allocates one buffer per binary before querying.
-    VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();  // status of this size query is not inspected
-    VECTOR_CLASS<char *> binaries;
-    for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s) 
-    {
-        char *ptr = NULL;  // stays NULL for a zero-sized binary
-        if (*s != 0) 
-            ptr = new char[*s];  // NOTE(review): nothing visible here frees these buffers; presumably the caller must delete[] them - confirm
-        binaries.push_back(ptr);
-    }
-    cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
-    if (err != NULL) {
-        *err = result;
-    }
-    return binaries;
-inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)  // Creates the kernel called name from program via clCreateKernel().
-    cl_int error;
-    object_ = ::clCreateKernel(program(), name, &error);
-    detail::errHandler(error, __CREATE_KERNEL_ERR);
-    if (err != NULL) {
-        *err = error;  // report the clCreateKernel() status to the caller
-    }
-/*! \class CommandQueue
- * \brief CommandQueue interface for cl_command_queue.
- */
-class CommandQueue : public detail::Wrapper<cl_command_queue>
-    static volatile int default_initialized_;
-    static CommandQueue default_;
-    static volatile cl_int default_error_;
-   CommandQueue(
-        cl_command_queue_properties properties,
-        cl_int* err = NULL)
-    {
-        // Creates a queue on the first device of the default context; wraps
-        // clCreateCommandQueue().  The status is stored in *err when non-NULL.
-        cl_int status;
-        Context context = Context::getDefault(&status);
-        detail::errHandler(status, __CREATE_COMMAND_QUEUE_ERR);
-        if (status != CL_SUCCESS) {
-            // No default context: report the failure and leave the queue unset.
-            if (err) {
-                *err = status;
-            }
-            return;
-        }
-        Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
-        object_ = ::clCreateCommandQueue(
-            context(), device(), properties, &status);
-        detail::errHandler(status, __CREATE_COMMAND_QUEUE_ERR);
-        if (err) {
-            *err = status;
-        }
-    }
-    /*!
-    * \brief Constructs a CommandQueue for an implementation defined device in the given context
-    *
-    * The queue is created on the first entry of the context's
-    * CL_CONTEXT_DEVICES list.  The status is stored in *err when non-NULL.
-    */
-    explicit CommandQueue(
-        const Context& context,
-        cl_command_queue_properties properties = 0,
-        cl_int* err = NULL)
-    {
-        cl_int status;
-        VECTOR_CLASS<cl::Device> devices;
-        status = context.getInfo(CL_CONTEXT_DEVICES, &devices);
-        detail::errHandler(status, __CREATE_COMMAND_QUEUE_ERR);
-        if (status == CL_SUCCESS) {
-            object_ = ::clCreateCommandQueue(context(), devices[0](), properties, &status);
-            detail::errHandler(status, __CREATE_COMMAND_QUEUE_ERR);
-        }
-        if (err) {
-            *err = status;
-        }
-    }
-    //! \brief Constructs a CommandQueue for \a device in \a context; wraps
-    //!        clCreateCommandQueue().  The status is stored in *err when non-NULL.
-    CommandQueue(
-        const Context& context,
-        const Device& device,
-        cl_command_queue_properties properties = 0,
-        cl_int* err = NULL)
-    {
-        cl_int status;
-        object_ = ::clCreateCommandQueue(
-            context(), device(), properties, &status);
-        detail::errHandler(status, __CREATE_COMMAND_QUEUE_ERR);
-        if (err) {
-            *err = status;
-        }
-    }
-    static CommandQueue getDefault(cl_int * err = NULL)  // Returns the shared default_ queue, creating it on the first call.
-    {
-        int state = detail::compare_exchange(
-            &default_initialized_,  // NOTE(review): the remaining arguments of this call are missing from this view
-        if (state & __DEFAULT_INITIALIZED) {
-            if (err != NULL) {
-                *err = default_error_;  // report the status recorded when default_ was set up
-            }
-            return default_;
-        }
-        if (state & __DEFAULT_BEING_INITIALIZED) {
-              // Assume writes will propagate eventually...
-              while(default_initialized_ != __DEFAULT_INITIALIZED) {
-                  detail::fence();  // spin until the flag reads __DEFAULT_INITIALIZED
-              }
-            if (err != NULL) {
-                *err = default_error_;
-            }
-            return default_;
-        }
-        cl_int error;
-        Context context = Context::getDefault(&error);
-        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-        if (error != CL_SUCCESS) {
-            if (err != NULL) {
-                *err = error;
-            }
-        }
-        else {
-            Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];  // first device of the default context
-            default_ = CommandQueue(context, device, 0, &error);
-            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-            if (err != NULL) {
-                *err = error;
-            }
-        }
-        detail::fence();
-        default_error_ = error;  // recorded so later calls can report the same status
-        // Assume writes will propagate eventually...
-        default_initialized_ = __DEFAULT_INITIALIZED;
-        detail::fence();
-        if (err != NULL) {
-            *err = default_error_;
-        }
-        return default_;
-    }
-    CommandQueue() { }
-    CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
-    CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
-    CommandQueue& operator = (const CommandQueue& rhs)
-    {
-        if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
-        }
-        return *this;
-    }
-    CommandQueue& operator = (const cl_command_queue& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-    template <typename T>
-    cl_int getInfo(cl_command_queue_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(
-                &::clGetCommandQueueInfo, object_, name, param),
-                __GET_COMMAND_QUEUE_INFO_ERR);
-    }
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_command_queue_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_command_queue_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-    cl_int enqueueReadBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        ::size_t offset,
-        ::size_t size,
-        void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueReadBuffer(
-                object_, buffer(), blocking, offset, size,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_READ_BUFFER_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueWriteBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        ::size_t offset,
-        ::size_t size,
-        const void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueWriteBuffer(
-                object_, buffer(), blocking, offset, size,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_WRITE_BUFFER_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueCopyBuffer(
-        const Buffer& src,
-        const Buffer& dst,
-        ::size_t src_offset,
-        ::size_t dst_offset,
-        ::size_t size,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueCopyBuffer(
-                object_, src(), dst(), src_offset, dst_offset, size,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQEUE_COPY_BUFFER_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueReadBufferRect(
-        const Buffer& buffer,
-        cl_bool blocking,
-        const size_t<3>& buffer_offset,
-        const size_t<3>& host_offset,
-        const size_t<3>& region,
-        ::size_t buffer_row_pitch,
-        ::size_t buffer_slice_pitch,
-        ::size_t host_row_pitch,
-        ::size_t host_slice_pitch,
-        void *ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueReadBufferRect(
-                object_, 
-                buffer(), 
-                blocking, 
-                (const ::size_t *)buffer_offset,
-                (const ::size_t *)host_offset,
-                (const ::size_t *)region,
-                buffer_row_pitch,
-                buffer_slice_pitch,
-                host_row_pitch,
-                host_slice_pitch,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_READ_BUFFER_RECT_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueWriteBufferRect(
-        const Buffer& buffer,
-        cl_bool blocking,
-        const size_t<3>& buffer_offset,
-        const size_t<3>& host_offset,
-        const size_t<3>& region,
-        ::size_t buffer_row_pitch,
-        ::size_t buffer_slice_pitch,
-        ::size_t host_row_pitch,
-        ::size_t host_slice_pitch,
-        void *ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueWriteBufferRect(
-                object_, 
-                buffer(), 
-                blocking, 
-                (const ::size_t *)buffer_offset,
-                (const ::size_t *)host_offset,
-                (const ::size_t *)region,
-                buffer_row_pitch,
-                buffer_slice_pitch,
-                host_row_pitch,
-                host_slice_pitch,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_WRITE_BUFFER_RECT_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueCopyBufferRect(
-        const Buffer& src,
-        const Buffer& dst,
-        const size_t<3>& src_origin,
-        const size_t<3>& dst_origin,
-        const size_t<3>& region,
-        ::size_t src_row_pitch,
-        ::size_t src_slice_pitch,
-        ::size_t dst_row_pitch,
-        ::size_t dst_slice_pitch,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueCopyBufferRect(
-                object_, 
-                src(), 
-                dst(), 
-                (const ::size_t *)src_origin, 
-                (const ::size_t *)dst_origin, 
-                (const ::size_t *)region,
-                src_row_pitch,
-                src_slice_pitch,
-                dst_row_pitch,
-                dst_slice_pitch,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-#if defined(CL_VERSION_1_2)
-    /**
-     * Enqueue a command to fill a buffer object with a pattern
-     * of a given size. The pattern is specified a as vector.
-     * \tparam PatternType The datatype of the pattern field. 
-     *     The pattern type must be an accepted OpenCL data type.
-     */
-    template<typename PatternType>
-    cl_int enqueueFillBuffer(
-        const Buffer& buffer,
-        PatternType pattern,
-        ::size_t offset,
-        ::size_t size,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueFillBuffer(
-                object_, 
-                buffer(),
-                static_cast<void*>(&pattern),
-                sizeof(PatternType), 
-                offset, 
-                size,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_FILL_BUFFER_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-#endif // #if defined(CL_VERSION_1_2)
-    cl_int enqueueReadImage(
-        const Image& image,
-        cl_bool blocking,
-        const size_t<3>& origin,
-        const size_t<3>& region,
-        ::size_t row_pitch,
-        ::size_t slice_pitch,
-        void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueReadImage(
-                object_, image(), blocking, (const ::size_t *) origin,
-                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_READ_IMAGE_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueWriteImage(
-        const Image& image,
-        cl_bool blocking,
-        const size_t<3>& origin,
-        const size_t<3>& region,
-        ::size_t row_pitch,
-        ::size_t slice_pitch,
-        void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueWriteImage(
-                object_, image(), blocking, (const ::size_t *) origin,
-                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_WRITE_IMAGE_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueCopyImage(
-        const Image& src,
-        const Image& dst,
-        const size_t<3>& src_origin,
-        const size_t<3>& dst_origin,
-        const size_t<3>& region,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueCopyImage(
-                object_, src(), dst(), (const ::size_t *) src_origin,
-                (const ::size_t *)dst_origin, (const ::size_t *) region,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_COPY_IMAGE_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-#if defined(CL_VERSION_1_2)
-    /**
-     * Enqueue a command to fill an image object with a specified color.
-     * \param fillColor is the color to use to fill the image.
-     *     This is a four component RGBA floating-point color value if
-     *     the image channel data type is not an unnormalized signed or
-     *     unsigned data type.
-     */
-    cl_int enqueueFillImage(
-        const Image& image,
-        cl_float4 fillColor,
-        const size_t<3>& origin,
-        const size_t<3>& region,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueFillImage(
-                object_, 
-                image(),
-                static_cast<void*>(&fillColor), 
-                (const ::size_t *) origin, 
-                (const ::size_t *) region,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_FILL_IMAGE_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    /**
-     * Enqueue a command to fill an image object with a specified color.
-     * \param fillColor is the color to use to fill the image.
-     *     This is a four component RGBA signed integer color value if
-     *     the image channel data type is an unnormalized signed integer
-     *     type.
-     */
-    cl_int enqueueFillImage(
-        const Image& image,
-        cl_int4 fillColor,
-        const size_t<3>& origin,
-        const size_t<3>& region,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueFillImage(
-                object_, 
-                image(),
-                static_cast<void*>(&fillColor), 
-                (const ::size_t *) origin, 
-                (const ::size_t *) region,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_FILL_IMAGE_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    /**
-     * Enqueue a command to fill an image object with a specified color.
-     * \param fillColor is the color to use to fill the image.
-     *     This is a four component RGBA unsigned integer color value if
-     *     the image channel data type is an unnormalized unsigned integer
-     *     type.
-     */
-    cl_int enqueueFillImage(
-        const Image& image,
-        cl_uint4 fillColor,
-        const size_t<3>& origin,
-        const size_t<3>& region,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueFillImage(
-                object_, 
-                image(),
-                static_cast<void*>(&fillColor), 
-                (const ::size_t *) origin, 
-                (const ::size_t *) region,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-                __ENQUEUE_FILL_IMAGE_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-#endif // #if defined(CL_VERSION_1_2)
-    cl_int enqueueCopyImageToBuffer(
-        const Image& src,
-        const Buffer& dst,
-        const size_t<3>& src_origin,
-        const size_t<3>& region,
-        ::size_t dst_offset,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueCopyImageToBuffer(
-                object_, src(), dst(), (const ::size_t *) src_origin,
-                (const ::size_t *) region, dst_offset,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueCopyBufferToImage(
-        const Buffer& src,
-        const Image& dst,
-        ::size_t src_offset,
-        const size_t<3>& dst_origin,
-        const size_t<3>& region,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueCopyBufferToImage(
-                object_, src(), dst(), src_offset,
-                (const ::size_t *) dst_origin, (const ::size_t *) region,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    void* enqueueMapBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        cl_map_flags flags,
-        ::size_t offset,
-        ::size_t size,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL,
-        cl_int* err = NULL) const
-    {
-        cl_int error;
-        void * result = ::clEnqueueMapBuffer(
-            object_, buffer(), blocking, flags, offset, size,
-            (events != NULL) ? (cl_uint) events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-            (cl_event*) event,
-            &error);
-        detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-        return result;
-    }
-    void* enqueueMapImage(
-        const Image& buffer,
-        cl_bool blocking,
-        cl_map_flags flags,
-        const size_t<3>& origin,
-        const size_t<3>& region,
-        ::size_t * row_pitch,
-        ::size_t * slice_pitch,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL,
-        cl_int* err = NULL) const
-    {
-        cl_int error;
-        void * result = ::clEnqueueMapImage(
-            object_, buffer(), blocking, flags,
-            (const ::size_t *) origin, (const ::size_t *) region,
-            row_pitch, slice_pitch,
-            (events != NULL) ? (cl_uint) events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-            (cl_event*) event,
-            &error);
-        detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
-        if (err != NULL) {
-              *err = error;
-        }
-        return result;
-    }
-    cl_int enqueueUnmapMemObject(
-        const Memory& memory,
-        void* mapped_ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueUnmapMemObject(
-                object_, memory(), mapped_ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-#if defined(CL_VERSION_1_2)
-    /**
-     * Enqueues a marker command which waits for either a list of events to complete, 
-     * or all previously enqueued commands to complete.
-     *
-     * Enqueues a marker command which waits for either a list of events to complete, 
-     * or if the list is empty it waits for all commands previously enqueued in command_queue 
-     * to complete before it completes. This command returns an event which can be waited on, 
-     * i.e. this event can be waited on to insure that all events either in the event_wait_list 
-     * or all previously enqueued commands, queued before this command to command_queue, 
-     * have completed.
-     */
-    cl_int enqueueMarkerWithWaitList(
-        const VECTOR_CLASS<Event> *events = 0,
-        Event *event = 0)
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueMarkerWithWaitList(
-                object_,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    /**
-     * A synchronization point that enqueues a barrier operation.
-     *
-     * Enqueues a barrier command which waits for either a list of events to complete, 
-     * or if the list is empty it waits for all commands previously enqueued in command_queue 
-     * to complete before it completes. This command blocks command execution, that is, any 
-     * following commands enqueued after it do not execute until it completes. This command 
-     * returns an event which can be waited on, i.e. this event can be waited on to insure that 
-     * all events either in the event_wait_list or all previously enqueued commands, queued 
-     * before this command to command_queue, have completed.
-     */
-    cl_int enqueueBarrierWithWaitList(
-        const VECTOR_CLASS<Event> *events = 0,
-        Event *event = 0)
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueBarrierWithWaitList(
-                object_,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    /**
-     * Enqueues a command to indicate with which device a set of memory objects
-     * should be associated.
-     */
-    cl_int enqueueMigrateMemObjects(
-        const VECTOR_CLASS<Memory> &memObjects,
-        cl_mem_migration_flags flags,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL
-        )
-    {
-        cl_event tmp;
-        cl_mem* localMemObjects = static_cast<cl_mem*>(alloca(memObjects.size() * sizeof(cl_mem)));
-        for( int i = 0; i < (int)memObjects.size(); ++i ) {
-            localMemObjects[i] = memObjects[i]();
-        }
-        cl_int err = detail::errHandler(
-            ::clEnqueueMigrateMemObjects(
-                object_, 
-                (cl_uint)memObjects.size(), 
-                static_cast<const cl_mem*>(localMemObjects),
-                flags,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-#endif // #if defined(CL_VERSION_1_2)
-    cl_int enqueueNDRangeKernel(
-        const Kernel& kernel,
-        const NDRange& offset,
-        const NDRange& global,
-        const NDRange& local = NullRange,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueNDRangeKernel(
-                object_, kernel(), (cl_uint) global.dimensions(),
-                offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
-                (const ::size_t*) global,
-                local.dimensions() != 0 ? (const ::size_t*) local : NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueTask(
-        const Kernel& kernel,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueTask(
-                object_, kernel(),
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_TASK_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
-    cl_int enqueueNativeKernel(
-        void (CL_CALLBACK *userFptr)(void *),
-        std::pair<void*, ::size_t> args,
-        const VECTOR_CLASS<Memory>* mem_objects = NULL,
-        const VECTOR_CLASS<const void*>* mem_locs = NULL,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) 
-            ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
-            : NULL;
-        if (mems != NULL) {
-            for (unsigned int i = 0; i < mem_objects->size(); i++) {
-                mems[i] = ((*mem_objects)[i])();
-            }
-        }
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueNativeKernel(
-                object_, userFptr, args.first, args.second,
-                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                mems,
-                (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_NATIVE_KERNEL);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
-    cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-    {
-        return detail::errHandler(
-            ::clEnqueueMarker(object_, (cl_event*) event),
-            __ENQUEUE_MARKER_ERR);
-    }
-    cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-    {
-        return detail::errHandler(
-            ::clEnqueueWaitForEvents(
-                object_,
-                (cl_uint) events.size(),
-                (const cl_event*) &events.front()),
-    }
-#endif // #if defined(CL_VERSION_1_1)
-    cl_int enqueueAcquireGLObjects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-     {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-             ::clEnqueueAcquireGLObjects(
-                 object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                 (event != NULL) ? &tmp : NULL),
-             __ENQUEUE_ACQUIRE_GL_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-     }
-    cl_int enqueueReleaseGLObjects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-     {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-             ::clEnqueueReleaseGLObjects(
-                 object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                 (event != NULL) ? &tmp : NULL),
-             __ENQUEUE_RELEASE_GL_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-     }
-#if defined (USE_DX_INTEROP)
-typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
-    cl_command_queue command_queue, cl_uint num_objects,
-    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
-    const cl_event* event_wait_list, cl_event* event);
-typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
-    cl_command_queue command_queue, cl_uint num_objects,
-    const cl_mem* mem_objects,  cl_uint num_events_in_wait_list,
-    const cl_event* event_wait_list, cl_event* event);
-    cl_int enqueueAcquireD3D10Objects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-    {
-        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
-#if defined(CL_VERSION_1_2)
-        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
-        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
-        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
-        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
-#if defined(CL_VERSION_1_1)
-        __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-             pfn_clEnqueueAcquireD3D10ObjectsKHR(
-                 object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL) ? (cl_event*) &events->front() : NULL,
-                 (event != NULL) ? &tmp : NULL),
-             __ENQUEUE_ACQUIRE_GL_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-     }
-    cl_int enqueueReleaseD3D10Objects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-    {
-        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
-#if defined(CL_VERSION_1_2)
-        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
-        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
-        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
-        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
-#endif // #if defined(CL_VERSION_1_2)
-#if defined(CL_VERSION_1_1)
-        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
-#endif // #if defined(CL_VERSION_1_1)
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            pfn_clEnqueueReleaseD3D10ObjectsKHR(
-                object_,
-                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_RELEASE_GL_ERR);
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-        return err;
-    }
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
-    cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-    {
-        return detail::errHandler(
-            ::clEnqueueBarrier(object_),
-            __ENQUEUE_BARRIER_ERR);
-    }
-#endif // #if defined(CL_VERSION_1_1)
-    cl_int flush() const
-    {
-        return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
-    }
-    cl_int finish() const
-    {
-        return detail::errHandler(::clFinish(object_), __FINISH_ERR);
-    }
-#ifdef _WIN32
-__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-__declspec(selectany) CommandQueue CommandQueue::default_;
-__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
-__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
-__attribute__((weak)) CommandQueue CommandQueue::default_;
-__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
-template< typename IteratorType >
-    const Context &context,
-    IteratorType startIterator,
-    IteratorType endIterator,
-    bool readOnly,
-    bool useHostPtr,
-    cl_int* err)
-    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
-    cl_int error;
-    cl_mem_flags flags = 0;
-    if( readOnly ) {
-        flags |= CL_MEM_READ_ONLY;
-    }
-    else {
-        flags |= CL_MEM_READ_WRITE;
-    }
-    if( useHostPtr ) {
-        flags |= CL_MEM_USE_HOST_PTR;
-    }
-    ::size_t size = sizeof(DataType)*(endIterator - startIterator);
-    if( useHostPtr ) {
-        object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
-    } else {
-        object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
-    }
-    detail::errHandler(error, __CREATE_BUFFER_ERR);
-    if (err != NULL) {
-        *err = error;
-    }
-    if( !useHostPtr ) {
-        CommandQueue queue(context, 0, &error);
-        detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-        error = cl::copy(queue, startIterator, endIterator, *this);
-        detail::errHandler(error, __CREATE_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-inline cl_int enqueueReadBuffer(
-    const Buffer& buffer,
-    cl_bool blocking,
-    ::size_t offset,
-    ::size_t size,
-    void* ptr,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
-inline cl_int enqueueWriteBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        ::size_t offset,
-        ::size_t size,
-        const void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
-inline void* enqueueMapBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        cl_map_flags flags,
-        ::size_t offset,
-        ::size_t size,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL,
-        cl_int* err = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-    if (err != NULL) {
-        *err = error;
-    }
-    void * result = ::clEnqueueMapBuffer(
-            queue(), buffer(), blocking, flags, offset, size,
-            (events != NULL) ? (cl_uint) events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-            (cl_event*) event,
-            &error);
-    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-    if (err != NULL) {
-        *err = error;
-    }
-    return result;
-inline cl_int enqueueUnmapMemObject(
-    const Memory& memory,
-    void* mapped_ptr,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    cl_event tmp;
-    cl_int err = detail::errHandler(
-        ::clEnqueueUnmapMemObject(
-            queue(), memory(), mapped_ptr,
-            (events != NULL) ? (cl_uint) events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-            (event != NULL) ? &tmp : NULL),
-    if (event != NULL && err == CL_SUCCESS)
-        *event = tmp;
-    return err;
-inline cl_int enqueueCopyBuffer(
-        const Buffer& src,
-        const Buffer& dst,
-        ::size_t src_offset,
-        ::size_t dst_offset,
-        ::size_t size,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
- * Blocking copy operation between iterators and a buffer.
- * Host to Device.
- * Uses default command queue.
- */
-template< typename IteratorType >
-inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS)
-        return error;
-    return cl::copy(queue, startIterator, endIterator, buffer);
- * Blocking copy operation between iterators and a buffer.
- * Device to Host.
- * Uses default command queue.
- */
-template< typename IteratorType >
-inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS)
-        return error;
-    return cl::copy(queue, buffer, startIterator, endIterator);
- * Blocking copy operation between iterators and a buffer.
- * Host to Device.
- * Uses specified queue.
- */
-template< typename IteratorType >
-inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
-    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
-    cl_int error;
-    ::size_t length = endIterator-startIterator;
-    ::size_t byteLength = length*sizeof(DataType);
-    DataType *pointer = 
-        static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
-    // if exceptions enabled, enqueueMapBuffer will throw
-    if( error != CL_SUCCESS ) {
-        return error;
-    }
-#if defined(_MSC_VER)
-    std::copy(
-        startIterator, 
-        endIterator, 
-        stdext::checked_array_iterator<DataType*>(
-            pointer, length));
-    std::copy(startIterator, endIterator, pointer);
-    Event endEvent;
-    error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
-    // if exceptions enabled, enqueueUnmapMemObject will throw
-    if( error != CL_SUCCESS ) { 
-        return error;
-    }
-    endEvent.wait();
-    return CL_SUCCESS;
- * Blocking copy operation between iterators and a buffer.
- * Device to Host.
- * Uses specified queue.
- */
-template< typename IteratorType >
-inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
-    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
-    cl_int error;
-    ::size_t length = endIterator-startIterator;
-    ::size_t byteLength = length*sizeof(DataType);
-    DataType *pointer = 
-        static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
-    // if exceptions enabled, enqueueMapBuffer will throw
-    if( error != CL_SUCCESS ) {
-        return error;
-    }
-    std::copy(pointer, pointer + length, startIterator);
-    Event endEvent;
-    error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
-    // if exceptions enabled, enqueueUnmapMemObject will throw
-    if( error != CL_SUCCESS ) { 
-        return error;
-    }
-    endEvent.wait();
-    return CL_SUCCESS;
-#if defined(CL_VERSION_1_1)
-inline cl_int enqueueReadBufferRect(
-    const Buffer& buffer,
-    cl_bool blocking,
-    const size_t<3>& buffer_offset,
-    const size_t<3>& host_offset,
-    const size_t<3>& region,
-    ::size_t buffer_row_pitch,
-    ::size_t buffer_slice_pitch,
-    ::size_t host_row_pitch,
-    ::size_t host_slice_pitch,
-    void *ptr,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueReadBufferRect(
-        buffer, 
-        blocking, 
-        buffer_offset, 
-        host_offset,
-        region,
-        buffer_row_pitch,
-        buffer_slice_pitch,
-        host_row_pitch,
-        host_slice_pitch,
-        ptr, 
-        events, 
-        event);
-inline cl_int enqueueWriteBufferRect(
-    const Buffer& buffer,
-    cl_bool blocking,
-    const size_t<3>& buffer_offset,
-    const size_t<3>& host_offset,
-    const size_t<3>& region,
-    ::size_t buffer_row_pitch,
-    ::size_t buffer_slice_pitch,
-    ::size_t host_row_pitch,
-    ::size_t host_slice_pitch,
-    void *ptr,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueWriteBufferRect(
-        buffer, 
-        blocking, 
-        buffer_offset, 
-        host_offset,
-        region,
-        buffer_row_pitch,
-        buffer_slice_pitch,
-        host_row_pitch,
-        host_slice_pitch,
-        ptr, 
-        events, 
-        event);
-inline cl_int enqueueCopyBufferRect(
-    const Buffer& src,
-    const Buffer& dst,
-    const size_t<3>& src_origin,
-    const size_t<3>& dst_origin,
-    const size_t<3>& region,
-    ::size_t src_row_pitch,
-    ::size_t src_slice_pitch,
-    ::size_t dst_row_pitch,
-    ::size_t dst_slice_pitch,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueCopyBufferRect(
-        src,
-        dst,
-        src_origin,
-        dst_origin,
-        region,
-        src_row_pitch,
-        src_slice_pitch,
-        dst_row_pitch,
-        dst_slice_pitch,
-        events, 
-        event);
-inline cl_int enqueueReadImage(
-    const Image& image,
-    cl_bool blocking,
-    const size_t<3>& origin,
-    const size_t<3>& region,
-    ::size_t row_pitch,
-    ::size_t slice_pitch,
-    void* ptr,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL) 
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueReadImage(
-        image,
-        blocking,
-        origin,
-        region,
-        row_pitch,
-        slice_pitch,
-        ptr,
-        events, 
-        event);
-inline cl_int enqueueWriteImage(
-    const Image& image,
-    cl_bool blocking,
-    const size_t<3>& origin,
-    const size_t<3>& region,
-    ::size_t row_pitch,
-    ::size_t slice_pitch,
-    void* ptr,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueWriteImage(
-        image,
-        blocking,
-        origin,
-        region,
-        row_pitch,
-        slice_pitch,
-        ptr,
-        events, 
-        event);
-inline cl_int enqueueCopyImage(
-    const Image& src,
-    const Image& dst,
-    const size_t<3>& src_origin,
-    const size_t<3>& dst_origin,
-    const size_t<3>& region,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueCopyImage(
-        src,
-        dst,
-        src_origin,
-        dst_origin,
-        region,
-        events,
-        event);
-inline cl_int enqueueCopyImageToBuffer(
-    const Image& src,
-    const Buffer& dst,
-    const size_t<3>& src_origin,
-    const size_t<3>& region,
-    ::size_t dst_offset,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueCopyImageToBuffer(
-        src,
-        dst,
-        src_origin,
-        region,
-        dst_offset,
-        events,
-        event);
-inline cl_int enqueueCopyBufferToImage(
-    const Buffer& src,
-    const Image& dst,
-    ::size_t src_offset,
-    const size_t<3>& dst_origin,
-    const size_t<3>& region,
-    const VECTOR_CLASS<Event>* events = NULL,
-    Event* event = NULL)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.enqueueCopyBufferToImage(
-        src,
-        dst,
-        src_offset,
-        dst_origin,
-        region,
-        events,
-        event);
-inline cl_int flush(void)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    }
-    return queue.flush();
-inline cl_int finish(void)
-    cl_int error;
-    CommandQueue queue = CommandQueue::getDefault(&error);
-    if (error != CL_SUCCESS) {
-        return error;
-    } 
-    return queue.finish();
-// Kernel Functor support
-// New interface as of September 2011
-// Requires the C++11 std::tr1::function (note do not support TR1)
-// Visual Studio 2010 and GCC 4.2
-struct EnqueueArgs
-    CommandQueue queue_;
-    const NDRange offset_;
-    const NDRange global_;
-    const NDRange local_;
-    VECTOR_CLASS<Event> events_;
-    EnqueueArgs(NDRange global) : 
-      queue_(CommandQueue::getDefault()),
-      offset_(NullRange), 
-      global_(global),
-      local_(NullRange)
-    {
-    }
-    EnqueueArgs(NDRange global, NDRange local) : 
-      queue_(CommandQueue::getDefault()),
-      offset_(NullRange), 
-      global_(global),
-      local_(local)
-    {
-    }
-    EnqueueArgs(NDRange offset, NDRange global, NDRange local) : 
-      queue_(CommandQueue::getDefault()),
-      offset_(offset), 
-      global_(global),
-      local_(local)
-    {
-    }
-    EnqueueArgs(Event e, NDRange global) : 
-      queue_(CommandQueue::getDefault()),
-      offset_(NullRange), 
-      global_(global),
-      local_(NullRange)
-    {
-        events_.push_back(e);
-    }
-    EnqueueArgs(Event e, NDRange global, NDRange local) : 
-      queue_(CommandQueue::getDefault()),
-      offset_(NullRange), 
-      global_(global),
-      local_(local)
-    {
-        events_.push_back(e);
-    }
-    EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : 
-      queue_(CommandQueue::getDefault()),
-      offset_(offset), 
-      global_(global),
-      local_(local)
-    {
-        events_.push_back(e);
-    }
-    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) : 
-      queue_(CommandQueue::getDefault()),
-      offset_(NullRange), 
-      global_(global),
-      local_(NullRange),
-      events_(events)
-    {
-    }
-    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : 
-      queue_(CommandQueue::getDefault()),
-      offset_(NullRange), 
-      global_(global),
-      local_(local),
-      events_(events)
-    {
-    }
-    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : 
-      queue_(CommandQueue::getDefault()),
-      offset_(offset), 
-      global_(global),
-      local_(local),
-      events_(events)
-    {
-    }
-    EnqueueArgs(CommandQueue &queue, NDRange global) : 
-      queue_(queue),
-      offset_(NullRange), 
-      global_(global),
-      local_(NullRange)
-    {
-    }
-    EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : 
-      queue_(queue),
-      offset_(NullRange), 
-      global_(global),
-      local_(local)
-    {
-    }
-    EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : 
-      queue_(queue),
-      offset_(offset), 
-      global_(global),
-      local_(local)
-    {
-    }
-    EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : 
-      queue_(queue),
-      offset_(NullRange), 
-      global_(global),
-      local_(NullRange)
-    {
-        events_.push_back(e);
-    }
-    EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : 
-      queue_(queue),
-      offset_(NullRange), 
-      global_(global),
-      local_(local)
-    {
-        events_.push_back(e);
-    }
-    EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : 
-      queue_(queue),
-      offset_(offset), 
-      global_(global),
-      local_(local)
-    {
-        events_.push_back(e);
-    }
-    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) : 
-      queue_(queue),
-      offset_(NullRange), 
-      global_(global),
-      local_(NullRange),
-      events_(events)
-    {
-    }
-    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : 
-      queue_(queue),
-      offset_(NullRange), 
-      global_(global),
-      local_(local),
-      events_(events)
-    {
-    }
-    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : 
-      queue_(queue),
-      offset_(offset), 
-      global_(global),
-      local_(local),
-      events_(events)
-    {
-    }
-namespace detail {
-class NullType {};
-template<int index, typename T0>
-struct SetArg
-    static void set (Kernel kernel, T0 arg)
-    {
-        kernel.setArg(index, arg);
-    }
-template<int index>
-struct SetArg<index, NullType>
-    static void set (Kernel, NullType)
-    { 
-    }
-template <
-   typename T0,   typename T1,   typename T2,   typename T3,
-   typename T4,   typename T5,   typename T6,   typename T7,
-   typename T8,   typename T9,   typename T10,   typename T11,
-   typename T12,   typename T13,   typename T14,   typename T15,
-   typename T16,   typename T17,   typename T18,   typename T19,
-   typename T20,   typename T21,   typename T22,   typename T23,
-   typename T24,   typename T25,   typename T26,   typename T27,
-   typename T28,   typename T29,   typename T30,   typename T31
-class KernelFunctorGlobal
-    Kernel kernel_;
-   KernelFunctorGlobal(
-        Kernel kernel) :
-            kernel_(kernel)
-    {}
-   KernelFunctorGlobal(
-        const Program& program,
-        const STRING_CLASS name,
-        cl_int * err = NULL) :
-            kernel_(program, name.c_str(), err)
-    {}
-    Event operator() (
-        const EnqueueArgs& args,
-        T0 t0,
-        T1 t1 = NullType(),
-        T2 t2 = NullType(),
-        T3 t3 = NullType(),
-        T4 t4 = NullType(),
-        T5 t5 = NullType(),
-        T6 t6 = NullType(),
-        T7 t7 = NullType(),
-        T8 t8 = NullType(),
-        T9 t9 = NullType(),
-        T10 t10 = NullType(),
-        T11 t11 = NullType(),
-        T12 t12 = NullType(),
-        T13 t13 = NullType(),
-        T14 t14 = NullType(),
-        T15 t15 = NullType(),
-        T16 t16 = NullType(),
-        T17 t17 = NullType(),
-        T18 t18 = NullType(),
-        T19 t19 = NullType(),
-        T20 t20 = NullType(),
-        T21 t21 = NullType(),
-        T22 t22 = NullType(),
-        T23 t23 = NullType(),
-        T24 t24 = NullType(),
-        T25 t25 = NullType(),
-        T26 t26 = NullType(),
-        T27 t27 = NullType(),
-        T28 t28 = NullType(),
-        T29 t29 = NullType(),
-        T30 t30 = NullType(),
-        T31 t31 = NullType()
-        )
-    {
-        Event event;
-        SetArg<0, T0>::set(kernel_, t0);
-        SetArg<1, T1>::set(kernel_, t1);
-        SetArg<2, T2>::set(kernel_, t2);
-        SetArg<3, T3>::set(kernel_, t3);
-        SetArg<4, T4>::set(kernel_, t4);
-        SetArg<5, T5>::set(kernel_, t5);
-        SetArg<6, T6>::set(kernel_, t6);
-        SetArg<7, T7>::set(kernel_, t7);
-        SetArg<8, T8>::set(kernel_, t8);
-        SetArg<9, T9>::set(kernel_, t9);
-        SetArg<10, T10>::set(kernel_, t10);
-        SetArg<11, T11>::set(kernel_, t11);
-        SetArg<12, T12>::set(kernel_, t12);
-        SetArg<13, T13>::set(kernel_, t13);
-        SetArg<14, T14>::set(kernel_, t14);
-        SetArg<15, T15>::set(kernel_, t15);
-        SetArg<16, T16>::set(kernel_, t16);
-        SetArg<17, T17>::set(kernel_, t17);
-        SetArg<18, T18>::set(kernel_, t18);
-        SetArg<19, T19>::set(kernel_, t19);
-        SetArg<20, T20>::set(kernel_, t20);
-        SetArg<21, T21>::set(kernel_, t21);
-        SetArg<22, T22>::set(kernel_, t22);
-        SetArg<23, T23>::set(kernel_, t23);
-        SetArg<24, T24>::set(kernel_, t24);
-        SetArg<25, T25>::set(kernel_, t25);
-        SetArg<26, T26>::set(kernel_, t26);
-        SetArg<27, T27>::set(kernel_, t27);
-        SetArg<28, T28>::set(kernel_, t28);
-        SetArg<29, T29>::set(kernel_, t29);
-        SetArg<30, T30>::set(kernel_, t30);
-        SetArg<31, T31>::set(kernel_, t31);
-        args.queue_.enqueueNDRangeKernel(
-            kernel_,
-            args.offset_,
-            args.global_,
-            args.local_,
-            &args.events_,
-            &event);
-        return event;
-    }
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22,
-	typename T23,
-	typename T24,
-	typename T25,
-	typename T26,
-	typename T27,
-	typename T28,
-	typename T29,
-	typename T30,
-	typename T31>
-struct functionImplementation_
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27,
-		T28,
-		T29,
-		T30,
-		T31> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27,
-		T28,
-		T29,
-		T30,
-		T31);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22,
-		T23 arg23,
-		T24 arg24,
-		T25 arg25,
-		T26 arg26,
-		T27 arg27,
-		T28 arg28,
-		T29 arg29,
-		T30 arg30,
-		T31 arg31)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22,
-			arg23,
-			arg24,
-			arg25,
-			arg26,
-			arg27,
-			arg28,
-			arg29,
-			arg30,
-			arg31);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22,
-	typename T23,
-	typename T24,
-	typename T25,
-	typename T26,
-	typename T27,
-	typename T28,
-	typename T29,
-	typename T30>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	T22,
-	T23,
-	T24,
-	T25,
-	T26,
-	T27,
-	T28,
-	T29,
-	T30,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27,
-		T28,
-		T29,
-		T30,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27,
-		T28,
-		T29,
-		T30);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22,
-		T23 arg23,
-		T24 arg24,
-		T25 arg25,
-		T26 arg26,
-		T27 arg27,
-		T28 arg28,
-		T29 arg29,
-		T30 arg30)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22,
-			arg23,
-			arg24,
-			arg25,
-			arg26,
-			arg27,
-			arg28,
-			arg29,
-			arg30);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22,
-	typename T23,
-	typename T24,
-	typename T25,
-	typename T26,
-	typename T27,
-	typename T28,
-	typename T29>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	T22,
-	T23,
-	T24,
-	T25,
-	T26,
-	T27,
-	T28,
-	T29,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27,
-		T28,
-		T29,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27,
-		T28,
-		T29);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22,
-		T23 arg23,
-		T24 arg24,
-		T25 arg25,
-		T26 arg26,
-		T27 arg27,
-		T28 arg28,
-		T29 arg29)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22,
-			arg23,
-			arg24,
-			arg25,
-			arg26,
-			arg27,
-			arg28,
-			arg29);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22,
-	typename T23,
-	typename T24,
-	typename T25,
-	typename T26,
-	typename T27,
-	typename T28>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	T22,
-	T23,
-	T24,
-	T25,
-	T26,
-	T27,
-	T28,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27,
-		T28,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27,
-		T28);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22,
-		T23 arg23,
-		T24 arg24,
-		T25 arg25,
-		T26 arg26,
-		T27 arg27,
-		T28 arg28)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22,
-			arg23,
-			arg24,
-			arg25,
-			arg26,
-			arg27,
-			arg28);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22,
-	typename T23,
-	typename T24,
-	typename T25,
-	typename T26,
-	typename T27>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	T22,
-	T23,
-	T24,
-	T25,
-	T26,
-	T27,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		T27);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22,
-		T23 arg23,
-		T24 arg24,
-		T25 arg25,
-		T26 arg26,
-		T27 arg27)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22,
-			arg23,
-			arg24,
-			arg25,
-			arg26,
-			arg27);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22,
-	typename T23,
-	typename T24,
-	typename T25,
-	typename T26>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	T22,
-	T23,
-	T24,
-	T25,
-	T26,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		T26);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22,
-		T23 arg23,
-		T24 arg24,
-		T25 arg25,
-		T26 arg26)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22,
-			arg23,
-			arg24,
-			arg25,
-			arg26);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22,
-	typename T23,
-	typename T24,
-	typename T25>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	T22,
-	T23,
-	T24,
-	T25,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		T25);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22,
-		T23 arg23,
-		T24 arg24,
-		T25 arg25)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22,
-			arg23,
-			arg24,
-			arg25);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22,
-	typename T23,
-	typename T24>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	T22,
-	T23,
-	T24,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		T24);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22,
-		T23 arg23,
-		T24 arg24)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22,
-			arg23,
-			arg24);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22,
-	typename T23>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	T22,
-	T23,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		T23);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22,
-		T23 arg23)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22,
-			arg23);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21,
-	typename T22>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	T22,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		T22);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21,
-		T22 arg22)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21,
-			arg22);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20,
-	typename T21>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	T21,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		T21);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20,
-		T21 arg21)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20,
-			arg21);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19,
-	typename T20>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	T20,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		T20);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19,
-		T20 arg20)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19,
-			arg20);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18,
-	typename T19>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	T19,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		T19);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18,
-		T19 arg19)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18,
-			arg19);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17,
-	typename T18>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	T18,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		T18);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17,
-		T18 arg18)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17,
-			arg18);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16,
-	typename T17>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	T17,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		T17);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16,
-		T17 arg17)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16,
-			arg17);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15,
-	typename T16>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	T16,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		T16);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15,
-		T16 arg16)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15,
-			arg16);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14,
-	typename T15>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	T15,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		T15);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14,
-		T15 arg15)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14,
-			arg15);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13,
-	typename T14>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	T14,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		T14);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13,
-		T14 arg14)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13,
-			arg14);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12,
-	typename T13>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	T13,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		T13);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12,
-		T13 arg13)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12,
-			arg13);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11,
-	typename T12>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	T12,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		T12);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11,
-		T12 arg12)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11,
-			arg12);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10,
-	typename T11>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	T11,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		T11);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10,
-		T11 arg11)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10,
-			arg11);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9,
-	typename T10>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	T10,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		T10);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9,
-		T10 arg10)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9,
-			arg10);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8,
-	typename T9>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	T9,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		T9);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8,
-		T9 arg9)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8,
-			arg9);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7,
-	typename T8>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	T8,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		T8);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7,
-		T8 arg8)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7,
-			arg8);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6,
-	typename T7>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	T7,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		T7);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6,
-		T7 arg7)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6,
-			arg7);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5,
-	typename T6>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	T6,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		T6);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5,
-		T6 arg6)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5,
-			arg6);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4,
-	typename T5>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	T5,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		T5);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4,
-		T5 arg5)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4,
-			arg5);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3,
-	typename T4>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	T4,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		T4,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3,
-		T4);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3,
-		T4 arg4)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3,
-			arg4);
-	}
-	typename T0,
-	typename T1,
-	typename T2,
-	typename T3>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	T3,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		T3,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2,
-		T3);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2,
-		T3 arg3)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2,
-			arg3);
-	}
-	typename T0,
-	typename T1,
-	typename T2>
-struct functionImplementation_
-<	T0,
-	T1,
-	T2,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		T2,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1,
-		T2);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1,
-		T2 arg2)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1,
-			arg2);
-	}
-	typename T0,
-	typename T1>
-struct functionImplementation_
-<	T0,
-	T1,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		T1,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0,
-		T1);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0,
-		T1 arg1)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0,
-			arg1);
-	}
-	typename T0>
-struct functionImplementation_
-<	T0,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType,
-	NullType>
-	typedef detail::KernelFunctorGlobal<
-		T0,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType,
-		NullType> FunctorType;
-    FunctorType functor_;
-    functionImplementation_(const FunctorType &functor) :
-        functor_(functor)
-    {
-        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1))
-        // Fail variadic expansion for dev11
-        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
-        #endif
-    }
-	//! \brief Return type of the functor
-	typedef Event result_type;
-	//! \brief Function signature of kernel functor with no event dependency.
-	typedef Event type_(
-		const EnqueueArgs&,
-		T0);
-	Event operator()(
-		const EnqueueArgs& enqueueArgs,
-		T0 arg0)
-	{
-		return functor_(
-			enqueueArgs,
-			arg0);
-	}
-} // namespace detail
-template <
-   typename T0,   typename T1 = detail::NullType,   typename T2 = detail::NullType,
-   typename T3 = detail::NullType,   typename T4 = detail::NullType,
-   typename T5 = detail::NullType,   typename T6 = detail::NullType,
-   typename T7 = detail::NullType,   typename T8 = detail::NullType,
-   typename T9 = detail::NullType,   typename T10 = detail::NullType,
-   typename T11 = detail::NullType,   typename T12 = detail::NullType,
-   typename T13 = detail::NullType,   typename T14 = detail::NullType,
-   typename T15 = detail::NullType,   typename T16 = detail::NullType,
-   typename T17 = detail::NullType,   typename T18 = detail::NullType,
-   typename T19 = detail::NullType,   typename T20 = detail::NullType,
-   typename T21 = detail::NullType,   typename T22 = detail::NullType,
-   typename T23 = detail::NullType,   typename T24 = detail::NullType,
-   typename T25 = detail::NullType,   typename T26 = detail::NullType,
-   typename T27 = detail::NullType,   typename T28 = detail::NullType,
-   typename T29 = detail::NullType,   typename T30 = detail::NullType,
-   typename T31 = detail::NullType
-struct make_kernel :
-    public detail::functionImplementation_<
-               T0,   T1,   T2,   T3,
-               T4,   T5,   T6,   T7,
-               T8,   T9,   T10,   T11,
-               T12,   T13,   T14,   T15,
-               T16,   T17,   T18,   T19,
-               T20,   T21,   T22,   T23,
-               T24,   T25,   T26,   T27,
-               T28,   T29,   T30,   T31
-    >
-	typedef detail::KernelFunctorGlobal<             
-		       T0,   T1,   T2,   T3,
-               T4,   T5,   T6,   T7,
-               T8,   T9,   T10,   T11,
-               T12,   T13,   T14,   T15,
-               T16,   T17,   T18,   T19,
-               T20,   T21,   T22,   T23,
-               T24,   T25,   T26,   T27,
-               T28,   T29,   T30,   T31
-    > FunctorType;
-    make_kernel(
-        const Program& program,
-        const STRING_CLASS name,
-        cl_int * err = NULL) :
-           detail::functionImplementation_<
-                    T0,   T1,   T2,   T3,
-                       T4,   T5,   T6,   T7,
-                       T8,   T9,   T10,   T11,
-                       T12,   T13,   T14,   T15,
-                       T16,   T17,   T18,   T19,
-                       T20,   T21,   T22,   T23,
-                       T24,   T25,   T26,   T27,
-                       T28,   T29,   T30,   T31
-           >(
-            FunctorType(program, name, err)) 
-    {}
-    make_kernel(
-        const Kernel kernel) :
-           detail::functionImplementation_<
-                    T0,   T1,   T2,   T3,
-                       T4,   T5,   T6,   T7,
-                       T8,   T9,   T10,   T11,
-                       T12,   T13,   T14,   T15,
-                       T16,   T17,   T18,   T19,
-                       T20,   T21,   T22,   T23,
-                       T24,   T25,   T26,   T27,
-                       T28,   T29,   T30,   T31
-           >(
-            FunctorType(kernel)) 
-    {}    
-#undef __ERR_STR
-// Extensions
- * Deprecated APIs for 1.2
- */
-#if defined(CL_VERSION_1_1)
-#endif // #if defined(CL_VERSION_1_1)
-} // namespace cl
-#ifdef _WIN32
-#pragma pop_macro("max")
-#endif // _WIN32
-#endif // CL_HPP_
diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h
index b6c90b3..d5960a4 100644
--- a/include/CL/cl_d3d10.h
+++ b/include/CL/cl_d3d10.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
diff --git a/include/CL/cl_d3d11.h b/include/CL/cl_d3d11.h
index 2e0a63f..39f9072 100644
--- a/include/CL/cl_d3d11.h
+++ b/include/CL/cl_d3d11.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
diff --git a/include/CL/cl_dx9_media_sharing.h b/include/CL/cl_dx9_media_sharing.h
index 23f1631..2729e8b 100644
--- a/include/CL/cl_dx9_media_sharing.h
+++ b/include/CL/cl_dx9_media_sharing.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
@@ -33,7 +38,7 @@
 extern "C" {
 /* cl_khr_dx9_media_sharing                                                   */
 #define cl_khr_dx9_media_sharing 1
diff --git a/include/CL/cl_egl.h b/include/CL/cl_egl.h
index 93e6c9c..a765bd5 100644
--- a/include/CL/cl_egl.h
+++ b/include/CL/cl_egl.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
@@ -28,8 +33,6 @@
 #include <CL/cl.h>
-#include <EGL/egl.h>
-#include <EGL/eglext.h>
 #ifdef __cplusplus
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 0a66d70..fa34cba 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008-2013 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
@@ -134,15 +139,15 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
  * cl_khr_initalize_memory extension *
-#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x200E
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030
  * cl_khr_terminate_context extension *
-#define CL_CONTEXT_TERMINATE_KHR                    0x2010
+#define CL_CONTEXT_TERMINATE_KHR                    0x2032
 #define cl_khr_terminate_context 1
 extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
@@ -384,7 +389,7 @@ typedef struct _cl_mem_ext_host_ptr
     /* Legal values will be defined in layered extensions. */
     cl_uint  allocation_type;
-	/* Host cache policy for this external memory allocation. */
+    /* Host cache policy for this external memory allocation. */
     cl_uint  host_cache_policy;
 } cl_mem_ext_host_ptr;
@@ -411,6 +416,40 @@ typedef struct _cl_mem_ion_host_ptr
 #endif /* CL_VERSION_1_1 */
+#ifdef CL_VERSION_2_0
+* cl_khr_sub_groups extension
+#define cl_khr_sub_groups 1
+typedef cl_uint  cl_kernel_sub_group_info;
+/* cl_khr_sub_group_info */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
+						   cl_device_id /*in_device*/,
+						   cl_kernel_sub_group_info /* param_name */,
+						   size_t /*input_value_size*/,
+						   const void * /*input_value*/,
+						   size_t /*param_value_size*/,
+						   void* /*param_value*/,
+						   size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0;
+typedef CL_API_ENTRY cl_int
+     ( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
+						      cl_device_id /*in_device*/,
+						      cl_kernel_sub_group_info /* param_name */,
+						      size_t /*input_value_size*/,
+						      const void * /*input_value*/,
+						      size_t /*param_value_size*/,
+						      void* /*param_value*/,
+						      size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
 #ifdef __cplusplus
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
index e52c1b6..945daa8 100644
--- a/include/CL/cl_gl.h
+++ b/include/CL/cl_gl.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
index 77d5353..e3c14c6 100644
--- a/include/CL/cl_gl_ext.h
+++ b/include/CL/cl_gl_ext.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
index 7f6f5e8..f157b63 100644
--- a/include/CL/cl_platform.h
+++ b/include/CL/cl_platform.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
@@ -45,6 +50,14 @@ extern "C" {
     #define CL_CALLBACK
+ * Deprecation flags refer to the last version of the header in which the
+ * feature was not deprecated.
+ *
+ * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
+ * deprecation but is deprecated in versions later than 1.1.
+ */
 #ifdef __APPLE__
     #define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))
     #define CL_API_SUFFIX__VERSION_1_0                  AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
@@ -75,6 +88,8 @@ extern "C" {
     #define CL_EXT_SUFFIX__VERSION_1_1
     #define CL_API_SUFFIX__VERSION_1_2
     #define CL_EXT_SUFFIX__VERSION_1_2
+    #define CL_API_SUFFIX__VERSION_2_0
+    #define CL_EXT_SUFFIX__VERSION_2_0
     #ifdef __GNUC__
@@ -92,9 +107,17 @@ extern "C" {
             #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
             #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated))
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+         #endif
     #elif _WIN32
-            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED    
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
             #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
             #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED 
@@ -108,12 +131,23 @@ extern "C" {
             #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED 
             #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)     
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated)
+        #endif
diff --git a/include/CL/opencl.h b/include/CL/opencl.h
index 3f00524..9855cd7 100644
--- a/include/CL/opencl.h
+++ b/include/CL/opencl.h
@@ -1,5 +1,5 @@
- * Copyright (c) 2008-2012 The Khronos Group Inc.
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -12,6 +12,11 @@
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Materials.
+ *    https://www.khronos.org/registry/
+ *
diff --git a/kernels/compiler_atomic_functions_20.cl b/kernels/compiler_atomic_functions_20.cl
new file mode 100644
index 0000000..cbca52e
--- /dev/null
+++ b/kernels/compiler_atomic_functions_20.cl
@@ -0,0 +1,53 @@
+__kernel void compiler_atomic_functions_20(__global int *dst, __local int *tmp, __global int *src) {
+  int lid = get_local_id(0);
+  int i = lid % 12;
+  atomic_int* p = (atomic_int*)tmp;
+  if(lid == 0) {
+    for(int j=0; j<12; j=j+1) {
+      atomic_exchange(&p[j], 0);
+    }
+    atomic_exchange(&p[4], -1);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  int compare = 0;
+  switch(i) {
+    case 0: atomic_inc(&tmp[i]); break;
+    case 1: atomic_dec(&tmp[i]); break;
+    case 2: atomic_fetch_add(&p[i], src[lid]); break;
+    case 3: atomic_fetch_sub(&p[i], src[lid]); break;
+    case 4: atomic_fetch_and(&p[i], ~(src[lid]<<(lid / 16))); break;
+    case 5: atomic_fetch_or (&p[i], src[lid]<<(lid / 16)); break;
+    case 6: atomic_fetch_xor(&p[i], src[lid]); break;
+    case 7: atomic_fetch_min(&p[i], -src[lid]); break;
+    case 8: atomic_fetch_max(&p[i], src[lid]); break;
+    case 9: atomic_fetch_min((atomic_uint*)&p[i], -src[lid]); break;
+    case 10: atomic_fetch_max((atomic_uint*)&p[i], src[lid]); break;
+    case 11: atomic_compare_exchange_strong(&p[i], &compare, src[10]); break;
+    default:  break;
+  }
+  atomic_int* d = (atomic_int*)dst;
+  switch(i) {
+    case 0: atomic_inc(&dst[i]); break;
+    case 1: atomic_dec(&dst[i]); break;
+    case 2: atomic_fetch_add(&d[i], src[lid]); break;
+    case 3: atomic_fetch_sub(&d[i], src[lid]); break;
+    case 4: atomic_fetch_and(&d[i], ~(src[lid]<<(lid / 16))); break;
+    case 5: atomic_fetch_or (&d[i], src[lid]<<(lid / 16)); break;
+    case 6: atomic_fetch_xor(&d[i], src[lid]); break;
+    case 7: atomic_fetch_min(&d[i], -src[lid]); break;
+    case 8: atomic_fetch_max(&d[i], src[lid]); break;
+    case 9: atomic_fetch_min((atomic_uint*)&d[i], -src[lid]); break;
+    case 10: atomic_fetch_max((atomic_uint*)&d[i], src[lid]); break;
+    case 11: atomic_compare_exchange_strong(&d[i], &compare, src[10]); break;
+    default:  break;
+  }
+  if(get_global_id(0) == 0) {
+    for(i=0; i<12; i=i+1)
+      atomic_xchg(&dst[i+12], tmp[i]);
+  }
diff --git a/kernels/compiler_ceil64.spir b/kernels/compiler_ceil64.spir
new file mode 100644
index 0000000..8357836
Binary files /dev/null and b/kernels/compiler_ceil64.spir differ
diff --git a/kernels/compiler_ctz.cl b/kernels/compiler_ctz.cl
new file mode 100644
index 0000000..8acdfb9
--- /dev/null
+++ b/kernels/compiler_ctz.cl
@@ -0,0 +1,16 @@
+    kernel void compiler_ctz_##TYPE(global TYPE* src, global TYPE* dst)   \
+{                                                \
+  __global TYPE* A = &src[get_global_id(0)];    \
+  __global TYPE* B = &dst[get_global_id(0)];    \
+  *B =  ctz(*A);   \
diff --git a/kernels/compiler_device_enqueue.cl b/kernels/compiler_device_enqueue.cl
new file mode 100644
index 0000000..cb20142
--- /dev/null
+++ b/kernels/compiler_device_enqueue.cl
@@ -0,0 +1,18 @@
+void block_fn(__global uint* val)
+  atomic_add(val, get_global_id(0));
+kernel void compiler_device_enqueue(uint glob_size_arr, __global uint* val)
+  size_t tid = get_global_id(0);
+  for(int i = 0; i < glob_size_arr; i++)
+  {
+    ndrange_t ndrange = ndrange_1D(glob_size_arr);
+    __global uint * v = val + tid;
+    void (^kernelBlock)(void) = ^{ block_fn(v); };
+    queue_t q = get_default_queue();
+    enqueue_kernel(q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);
+  }
diff --git a/kernels/compiler_generic_atomic.cl b/kernels/compiler_generic_atomic.cl
new file mode 100644
index 0000000..3db49e2
--- /dev/null
+++ b/kernels/compiler_generic_atomic.cl
@@ -0,0 +1,33 @@
+#define GENERIC_KERNEL(T)                                              \
+kernel void compiler_generic_atomic_##T(global T *src, global T *dst)  \
+{                                                                      \
+  size_t gid = get_global_id(0);                                       \
+  size_t lid = get_local_id(0);                                        \
+  private T pdata[16];                                                 \
+  local T ldata[16];                                                   \
+  generic T * p1 = &pdata[lid];                                        \
+  generic T * p2 = &ldata[lid];                                        \
+  generic T *p = (gid & 1) ? p1 : p2;                                  \
+  /* below expression is not supported by clang now   */               \
+  /* generic T *p = (gid & 1) ? p1 : (T *)&ldata[lid]; */              \
+  *p = src[gid];                                                       \
+  /* fill other data */                                                \
+  if(gid&1) {                                                          \
+    ldata[lid] = 20;                                                   \
+  } else {                                                             \
+    for (int i = 0; i < 16; i++) {                                     \
+      pdata[i] = src[lid];;                                            \
+    }                                                                  \
+  }                                                                    \
+  barrier(CLK_LOCAL_MEM_FENCE);                                        \
+                                                                       \
+  generic T * q1 = &pdata[lid];                                        \
+  generic T * q2 = &ldata[lid];                                        \
+  generic T *q = (gid & 1) ? q1 : q2;                                  \
+  atomic_fetch_add((atomic_int*)q , pdata[lid]);                       \
+  dst[gid] = *q;                                                       \
diff --git a/kernels/compiler_generic_pointer.cl b/kernels/compiler_generic_pointer.cl
new file mode 100644
index 0000000..a06b192
--- /dev/null
+++ b/kernels/compiler_generic_pointer.cl
@@ -0,0 +1,33 @@
+#define GENERIC_KERNEL(T)                                              \
+kernel void compiler_generic_pointer_##T(global T *src, global T *dst) \
+{                                                                      \
+  size_t gid = get_global_id(0);                                       \
+  size_t lid = get_local_id(0);                                        \
+  private T pdata[16];                                                 \
+  local T ldata[16];                                                   \
+  generic T * p1 = &pdata[lid];                                        \
+  generic T * p2 = &ldata[lid];                                        \
+  generic T *p = (gid & 1) ? p1 : p2;                                  \
+  /* below expression is not supported by clang now   */               \
+  /* generic T *p = (gid & 1) ? p1 : (T *)&ldata[lid]; */              \
+  *p = src[gid];                                                       \
+  /* fill other data */                                                \
+  if(gid&1) {                                                          \
+    ldata[lid] = 20;                                                   \
+  } else {                                                             \
+    for (int i = 0; i < 16; i++) {                                     \
+      pdata[i] = src[lid];;                                            \
+    }                                                                  \
+  }                                                                    \
+  barrier(CLK_LOCAL_MEM_FENCE);                                        \
+                                                                       \
+  generic T * q1 = &pdata[lid];                                        \
+  generic T * q2 = &ldata[lid];                                        \
+  generic T *q = (gid & 1) ? q1 : q2;                                  \
+  dst[gid] = *q + pdata[lid];                                          \
diff --git a/kernels/compiler_pipe_builtin.cl b/kernels/compiler_pipe_builtin.cl
new file mode 100644
index 0000000..4e8dcc4
--- /dev/null
+++ b/kernels/compiler_pipe_builtin.cl
@@ -0,0 +1,117 @@
+typedef struct{
+  int a;
+  int b;
+__kernel void compiler_pipe_convenience_write_int(write_only pipe int p, __global int *src)
+    int gid = get_global_id(0);
+    write_pipe(p, &src[gid]);
+__kernel void compiler_pipe_convenience_read_int(read_only pipe int p, __global int *dst)
+    int gid = get_global_id(0);
+    read_pipe(p, &dst[gid]);
+__kernel void compiler_pipe_convenience_write_mystruct(write_only pipe mystruct p, __global mystruct *src)
+    int gid = get_global_id(0);
+    write_pipe(p, &src[gid]);
+__kernel void compiler_pipe_convenience_read_mystruct(read_only pipe mystruct p, __global mystruct *dst)
+    int gid = get_global_id(0);
+    read_pipe(p, &dst[gid]);
+__kernel void compiler_pipe_reserve_write_int(write_only pipe int p, __global int *src)
+    int gid = get_global_id(0);
+    reserve_id_t res_id = reserve_write_pipe(p, 1);
+    if(is_valid_reserve_id(res_id))
+    {
+      write_pipe(p, res_id, 0, &src[gid]);
+      commit_write_pipe(p, res_id);
+    }
+__kernel void compiler_pipe_reserve_read_int(read_only pipe int p, __global int *dst)
+    int gid = get_global_id(0);
+    reserve_id_t res_id = reserve_read_pipe(p, 1);
+    if(is_valid_reserve_id(res_id))
+    {
+      read_pipe(p, res_id, 0, &dst[gid]);
+      commit_read_pipe(p, res_id);
+    }
+__kernel void compiler_pipe_reserve_write_mystruct(write_only pipe mystruct p, __global mystruct *src)
+    int gid = get_global_id(0);
+    reserve_id_t res_id = reserve_write_pipe(p, 1);
+    if(is_valid_reserve_id(res_id))
+    {
+      write_pipe(p, res_id, 0, &src[gid]);
+      commit_write_pipe(p, res_id);
+    }
+__kernel void compiler_pipe_reserve_read_mystruct(read_only pipe mystruct p, __global mystruct *dst)
+    int gid = get_global_id(0);
+    reserve_id_t res_id = reserve_read_pipe(p, 1);
+    if(is_valid_reserve_id(res_id))
+    {
+      read_pipe(p, res_id, 0, &dst[gid]);
+      commit_read_pipe(p, res_id);
+    }
+__kernel void compiler_pipe_workgroup_write_int(write_only pipe int p, __global int *src)
+    int gid = get_global_id(0);
+    reserve_id_t res_id = work_group_reserve_write_pipe(p, get_local_size(0));
+    if(is_valid_reserve_id(res_id))
+    {
+      write_pipe(p, res_id, get_local_id(0), &src[gid]);
+      work_group_commit_write_pipe(p, res_id);
+    }
+__kernel void compiler_pipe_workgroup_read_int(read_only pipe int p, __global int *dst)
+    int gid = get_global_id(0);
+    reserve_id_t res_id = work_group_reserve_read_pipe(p, get_local_size(0));
+    if(is_valid_reserve_id(res_id))
+    {
+      read_pipe(p, res_id, get_local_id(0), &dst[gid]);
+      work_group_commit_read_pipe(p, res_id);
+    }
+__kernel void compiler_pipe_workgroup_write_mystruct(write_only pipe mystruct p, __global mystruct *src)
+    int gid = get_global_id(0);
+    reserve_id_t res_id = work_group_reserve_write_pipe(p, get_local_size(0));
+    if(is_valid_reserve_id(res_id))
+    {
+      write_pipe(p, res_id, get_local_id(0), &src[gid]);
+      work_group_commit_write_pipe(p, res_id);
+    }
+__kernel void compiler_pipe_workgroup_read_mystruct(read_only pipe mystruct p, __global mystruct *dst)
+    int gid = get_global_id(0);
+    reserve_id_t res_id = work_group_reserve_read_pipe(p, get_local_size(0));
+    if(is_valid_reserve_id(res_id))
+    {
+      read_pipe(p, res_id, get_local_id(0), &dst[gid]);
+      work_group_commit_read_pipe(p, res_id);
+    }
+__kernel void compiler_pipe_query(write_only pipe int p, __global uint *src)
+    int gid = get_global_id(0);
+    write_pipe(p,&gid);
+    if(gid == 0) {
+      src[0] = get_pipe_num_packets(p);
+      src[1] = get_pipe_max_packets(p);
+    }
diff --git a/kernels/compiler_program_global.cl b/kernels/compiler_program_global.cl
new file mode 100644
index 0000000..fbe030f
--- /dev/null
+++ b/kernels/compiler_program_global.cl
@@ -0,0 +1,77 @@
+struct config{
+  int s0;
+  global short *s1;
+global int i = 5;
+global int bb = 4;
+global int *global p;
+/* array */
+global int ba[12];
+/* short/long data type */
+global short s;
+global short s2;
+global long l;
+/* pointer in constant AS to global */
+global int * constant px =&i;
+/* constant pointer relocation */
+constant int x = 2;
+constant int y =1;
+constant int *constant z[2] = {&x, &y};
+/* structure with pointer field */
+global struct config c[2] = {{1, &s}, {2, &s2} };
+global int a = 1;
+global int b = 2;
+global int * constant gArr[2]= {&a, &b};
+global int a_var[1] = {0};
+global int *p_var = a_var;
+__kernel void compiler_program_global0(const global int *src, int dynamic) {
+  size_t gid = get_global_id(0);
+  /* global read/write */
+  p = &i;
+  *p += 1;
+  /* pointer in struct memory access */
+  *c[gid&1].s1 += 2;
+  s = 2;
+  l = 3;
+  /* constant AS pointer (points to global) memory access */
+  *px += *z[dynamic];
+  p = &bb;
+  /* array */
+  if (gid < 11)
+    ba[gid] = src[gid];
+__kernel void compiler_program_global1(global int *dst, int dynamic) {
+  size_t gid = get_global_id(0);
+//  static global sg;
+  dst[11] = i;
+  dst[12] = *p;
+  dst[13] = s;
+  dst[14] = l;
+  if (p_var == a_var)
+    dst[15] = *gArr[dynamic];
+  if (gid < 11)
+    dst[gid] = ba[gid];
+__kernel void nouse(int dynamic) {
+  c[0].s1 = &s2;
+  p_var = a+dynamic;
diff --git a/kernels/compiler_sub_group_shuffle.cl b/kernels/compiler_sub_group_shuffle.cl
index 322da74..c771eea 100644
--- a/kernels/compiler_sub_group_shuffle.cl
+++ b/kernels/compiler_sub_group_shuffle.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_int(global int *dst, int c)
   int i = get_global_id(0);
   if (i == 0)
@@ -16,3 +16,23 @@ __kernel void compiler_sub_group_shuffle(global int *dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_short(global short *dst, int c)
+  short i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_max_sub_group_size();
+  dst++;
+  short from = i;
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  short o0 = get_sub_group_local_id();
+  short o1 = intel_sub_group_shuffle(from, c);
+  short o2 = intel_sub_group_shuffle(from, 5);
+  short o3 = intel_sub_group_shuffle(from, j);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
diff --git a/kernels/compiler_sub_group_shuffle_down.cl b/kernels/compiler_sub_group_shuffle_down.cl
index 769fc3f..40bac05 100644
--- a/kernels/compiler_sub_group_shuffle_down.cl
+++ b/kernels/compiler_sub_group_shuffle_down.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_down(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_down_int(global int *dst, int c)
   int i = get_global_id(0);
   if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_down(global int *dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_down_short(global short *dst, int c)
+  short i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_max_sub_group_size();
+  dst++;
+  short from = i;
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  int k = get_sub_group_local_id() + 1;
+  short o0 = intel_sub_group_shuffle_down((short)123, (short)456, c);
+  short o1 = intel_sub_group_shuffle_down((short)123, from, c);
+  short o2 = intel_sub_group_shuffle_down(from, (short)-from, k);
+  short o3 = intel_sub_group_shuffle_down(from, (short)321, j);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
diff --git a/kernels/compiler_sub_group_shuffle_up.cl b/kernels/compiler_sub_group_shuffle_up.cl
index 5c5cee1..fd287d5 100644
--- a/kernels/compiler_sub_group_shuffle_up.cl
+++ b/kernels/compiler_sub_group_shuffle_up.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_up(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_up_int(global int *dst, int c)
   int i = get_global_id(0);
   if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_up(global int *dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_up_short(global short *dst, int c)
+  short i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_max_sub_group_size();
+  dst++;
+  short from = i;
+  int j = get_sub_group_local_id() + 1;
+  int k = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  short o0 = intel_sub_group_shuffle_up((short)123, (short)456, c);
+  short o1 = intel_sub_group_shuffle_up((short)123, from, c);
+  short o2 = intel_sub_group_shuffle_up(from, (short)-from, k);
+  short o3 = intel_sub_group_shuffle_up(from, (short)321, j);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
diff --git a/kernels/compiler_sub_group_shuffle_xor.cl b/kernels/compiler_sub_group_shuffle_xor.cl
index 8bc15d3..df3dfe7 100644
--- a/kernels/compiler_sub_group_shuffle_xor.cl
+++ b/kernels/compiler_sub_group_shuffle_xor.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_xor(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_xor_int(global int *dst, int c)
   int i = get_global_id(0);
   if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_xor(global int *dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_xor_short(global short *dst, int c)
+  short i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_max_sub_group_size();
+  dst++;
+  short from = i;
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  int k = get_sub_group_local_id() + 1;
+  short o0 = get_sub_group_local_id();
+  short o1 = intel_sub_group_shuffle_xor(from, c);
+  short o2 = intel_sub_group_shuffle_xor(from, j);
+  short o3 = intel_sub_group_shuffle_xor(from, k);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
diff --git a/kernels/compiler_subgroup_broadcast.cl b/kernels/compiler_subgroup_broadcast.cl
index 96d38d9..3d16d67 100644
--- a/kernels/compiler_subgroup_broadcast.cl
+++ b/kernels/compiler_subgroup_broadcast.cl
@@ -1,7 +1,7 @@
  * Subgroup broadcast 1D functions
+#ifndef HALF
 kernel void compiler_subgroup_broadcast_imm_int(global int *src,
                                                 global int *dst,
                                                 uint simd_id)
@@ -32,3 +32,27 @@ kernel void compiler_subgroup_broadcast_long(global long *src,
   long broadcast_val = sub_group_broadcast(val, simd_id);
   dst[index] = broadcast_val;
+kernel void compiler_subgroup_broadcast_short(global short *src,
+                                                global short *dst,
+                                                uint simd_id)
+  uint index = get_global_id(0);
+  short val = src[index];
+  short broadcast_val = sub_group_broadcast(val, simd_id);
+  dst[index] = broadcast_val;
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_subgroup_broadcast_half(global half *src,
+                                                global half *dst,
+                                                uint simd_id)
+  uint index = get_global_id(0);
+  half val = src[index];
+  half broadcast_val = sub_group_broadcast(val, simd_id);
+  //printf("%d val %d is %d\n",index,as_ushort(val), as_ushort(broadcast_val));
+  dst[index] = broadcast_val;
diff --git a/kernels/compiler_subgroup_buffer_block_read.cl b/kernels/compiler_subgroup_buffer_block_read.cl
index 9edaa2e..4cbf894 100644
--- a/kernels/compiler_subgroup_buffer_block_read.cl
+++ b/kernels/compiler_subgroup_buffer_block_read.cl
@@ -1,31 +1,62 @@
-__kernel void compiler_subgroup_buffer_block_read1(global uint *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui1(global uint *src, global uint *dst)
   int id = get_global_id(0);
   global uint * p = src + get_sub_group_id() * get_max_sub_group_size();
-  uint tmp = intel_sub_group_block_read(p);
+  uint tmp = intel_sub_group_block_read_ui(p);
   dst[id] = tmp;
-__kernel void compiler_subgroup_buffer_block_read2(global uint *src, global uint2 *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui2(global uint *src, global uint2 *dst)
   int id = get_global_id(0);
   global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*2;
-  uint2 tmp = intel_sub_group_block_read2(p);
+  uint2 tmp = intel_sub_group_block_read_ui2(p);
   dst[id] = tmp;
-__kernel void compiler_subgroup_buffer_block_read4(global uint *src, global uint4 *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui4(global uint *src, global uint4 *dst)
   int id = get_global_id(0);
   global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*4;
-  uint4 tmp = intel_sub_group_block_read4(p);
+  uint4 tmp = intel_sub_group_block_read_ui4(p);
   dst[id] = tmp;
-__kernel void compiler_subgroup_buffer_block_read8(global uint *src, global uint8 *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui8(global uint *src, global uint8 *dst)
   int id = get_global_id(0);
   global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*8;
-  uint8 tmp = intel_sub_group_block_read8(p);
+  uint8 tmp = intel_sub_group_block_read_ui8(p);
   dst[id] = tmp;
+#ifdef SHORT
+__kernel void compiler_subgroup_buffer_block_read_us1(global ushort *src, global ushort *dst)
+  int id = get_global_id(0);
+  global ushort * p = src + get_sub_group_id() * get_max_sub_group_size();
+  ushort tmp = intel_sub_group_block_read_us(p);
+  dst[id] = tmp;
+__kernel void compiler_subgroup_buffer_block_read_us2(global ushort *src, global ushort2 *dst)
+  int id = get_global_id(0);
+  global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*2;
+  ushort2 tmp = intel_sub_group_block_read_us2(p);
+  dst[id] = tmp;
+__kernel void compiler_subgroup_buffer_block_read_us4(global ushort *src, global ushort4 *dst)
+  int id = get_global_id(0);
+  global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*4;
+  ushort4 tmp = intel_sub_group_block_read_us4(p);
+  dst[id] = tmp;
+__kernel void compiler_subgroup_buffer_block_read_us8(global ushort *src, global ushort8 *dst)
+  int id = get_global_id(0);
+  global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*8;
+  ushort8 tmp = intel_sub_group_block_read_us8(p);
+  dst[id] = tmp;
diff --git a/kernels/compiler_subgroup_buffer_block_write.cl b/kernels/compiler_subgroup_buffer_block_write.cl
index f735855..f452dcc 100644
--- a/kernels/compiler_subgroup_buffer_block_write.cl
+++ b/kernels/compiler_subgroup_buffer_block_write.cl
@@ -1,27 +1,55 @@
-__kernel void compiler_subgroup_buffer_block_write1(global uint *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui1(global uint *src, global uint *dst)
   int id = get_global_id(0);
   global uint * p = dst + get_sub_group_id() * get_max_sub_group_size();
-  intel_sub_group_block_write(p,src[id]);
+  intel_sub_group_block_write_ui(p,src[id]);
-__kernel void compiler_subgroup_buffer_block_write2(global uint2 *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui2(global uint2 *src, global uint *dst)
   int id = get_global_id(0);
   global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*2;
-  intel_sub_group_block_write2(p,src[id]);
+  intel_sub_group_block_write_ui2(p,src[id]);
-__kernel void compiler_subgroup_buffer_block_write4(global uint4 *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui4(global uint4 *src, global uint *dst)
   int id = get_global_id(0);
   global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*4;
-  intel_sub_group_block_write4(p,src[id]);
+  intel_sub_group_block_write_ui4(p,src[id]);
-__kernel void compiler_subgroup_buffer_block_write8(global uint8 *src, global uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui8(global uint8 *src, global uint *dst)
   int id = get_global_id(0);
   global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*8;
-  intel_sub_group_block_write8(p,src[id]);
+  intel_sub_group_block_write_ui8(p,src[id]);
+#ifdef SHORT
+__kernel void compiler_subgroup_buffer_block_write_us1(global ushort *src, global ushort *dst)
+  int id = get_global_id(0);
+  global ushort * p = dst + get_sub_group_id() * get_max_sub_group_size();
+  intel_sub_group_block_write_us(p,src[id]);
+__kernel void compiler_subgroup_buffer_block_write_us2(global ushort2 *src, global ushort *dst)
+  int id = get_global_id(0);
+  global ushort * p = dst + get_sub_group_id() * get_max_sub_group_size()*2;
+  intel_sub_group_block_write_us2(p,src[id]);
+__kernel void compiler_subgroup_buffer_block_write_us4(global ushort4 *src, global ushort *dst)
+  int id = get_global_id(0);
+  global ushort * p = dst + get_sub_group_id() * get_max_sub_group_size()*4;
+  intel_sub_group_block_write_us4(p,src[id]);
+__kernel void compiler_subgroup_buffer_block_write_us8(global ushort8 *src, global ushort *dst)
+  int id = get_global_id(0);
+  global ushort * p = dst + get_sub_group_id() * get_max_sub_group_size()*8;
+  intel_sub_group_block_write_us8(p,src[id]);
diff --git a/kernels/compiler_subgroup_image_block_read.cl b/kernels/compiler_subgroup_image_block_read.cl
index d5df6db..fa079b7 100644
--- a/kernels/compiler_subgroup_image_block_read.cl
+++ b/kernels/compiler_subgroup_image_block_read.cl
@@ -1,31 +1,64 @@
-__kernel void compiler_subgroup_image_block_read1(image2d_t src, global uint *dst)
+__kernel void compiler_subgroup_image_block_read_ui1(image2d_t src, global uint *dst)
   int id = get_global_id(0);
   int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
-  uint tmp = intel_sub_group_block_read(src,coord);
+  uint tmp = intel_sub_group_block_read_ui(src,coord);
   dst[id] = tmp;
-__kernel void compiler_subgroup_image_block_read2(image2d_t src, global uint2 *dst)
+__kernel void compiler_subgroup_image_block_read_ui2(image2d_t src, global uint2 *dst)
   int id = get_global_id(0);
   int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
-  uint2 tmp = intel_sub_group_block_read2(src,coord);
+  uint2 tmp = intel_sub_group_block_read_ui2(src,coord);
   dst[id] = tmp;
-__kernel void compiler_subgroup_image_block_read4(image2d_t src, global uint4 *dst)
+__kernel void compiler_subgroup_image_block_read_ui4(image2d_t src, global uint4 *dst)
   int id = get_global_id(0);
   int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
-  uint4 tmp = intel_sub_group_block_read4(src,coord);
+  uint4 tmp = intel_sub_group_block_read_ui4(src,coord);
   dst[id] = tmp;
-__kernel void compiler_subgroup_image_block_read8(image2d_t src, global uint8 *dst)
+__kernel void compiler_subgroup_image_block_read_ui8(image2d_t src, global uint8 *dst)
   int id = get_global_id(0);
   int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
-  uint8 tmp = intel_sub_group_block_read8(src,coord);
+  uint8 tmp = intel_sub_group_block_read_ui8(src,coord);
   dst[id] = tmp;
+#ifdef SHORT
+__kernel void compiler_subgroup_image_block_read_us1(image2d_t src, global ushort *dst)
+  int id = get_global_id(0);
+  int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+  ushort tmp = intel_sub_group_block_read_us(src,coord);
+  dst[id] = tmp;
+__kernel void compiler_subgroup_image_block_read_us2(image2d_t src, global ushort2 *dst)
+  int id = get_global_id(0);
+  int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+  ushort2 tmp = intel_sub_group_block_read_us2(src,coord);
+  dst[id] = tmp;
+__kernel void compiler_subgroup_image_block_read_us4(image2d_t src, global ushort4 *dst)
+  int id = get_global_id(0);
+  int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+  ushort4 tmp = intel_sub_group_block_read_us4(src,coord);
+  dst[id] = tmp;
+__kernel void compiler_subgroup_image_block_read_us8(image2d_t src, global ushort8 *dst)
+  int id = get_global_id(0);
+  int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+  ushort8 tmp = intel_sub_group_block_read_us8(src,coord);
+  dst[id] = tmp;
diff --git a/kernels/compiler_subgroup_image_block_write.cl b/kernels/compiler_subgroup_image_block_write.cl
index d9b3717..7d97c59 100644
--- a/kernels/compiler_subgroup_image_block_write.cl
+++ b/kernels/compiler_subgroup_image_block_write.cl
@@ -1,27 +1,55 @@
-__kernel void compiler_subgroup_image_block_write1(image2d_t dst, global uint *src)
+__kernel void compiler_subgroup_image_block_write_ui1(image2d_t dst, global uint *src)
   int id = get_global_id(0);
   int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
-  intel_sub_group_block_write(dst,coord, src[id]);
+  intel_sub_group_block_write_ui(dst,coord, src[id]);
-__kernel void compiler_subgroup_image_block_write2(image2d_t dst, global uint2 *src)
+__kernel void compiler_subgroup_image_block_write_ui2(image2d_t dst, global uint2 *src)
   int id = get_global_id(0);
   int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
-  intel_sub_group_block_write2(dst,coord, src[id]);
+  intel_sub_group_block_write_ui2(dst,coord, src[id]);
-__kernel void compiler_subgroup_image_block_write4(image2d_t dst, global uint4 *src)
+__kernel void compiler_subgroup_image_block_write_ui4(image2d_t dst, global uint4 *src)
   int id = get_global_id(0);
   int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
-  intel_sub_group_block_write4(dst,coord, src[id]);
+  intel_sub_group_block_write_ui4(dst,coord, src[id]);
-__kernel void compiler_subgroup_image_block_write8(image2d_t dst, global uint8 *src)
+__kernel void compiler_subgroup_image_block_write_ui8(image2d_t dst, global uint8 *src)
   int id = get_global_id(0);
   int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
-  intel_sub_group_block_write8(dst,coord, src[id]);
+  intel_sub_group_block_write_ui8(dst,coord, src[id]);
+#ifdef SHORT
+__kernel void compiler_subgroup_image_block_write_us1(image2d_t dst, global ushort *src)
+  int id = get_global_id(0);
+  int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+  intel_sub_group_block_write_us(dst,coord, src[id]);
+__kernel void compiler_subgroup_image_block_write_us2(image2d_t dst, global ushort2 *src)
+  int id = get_global_id(0);
+  int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+  intel_sub_group_block_write_us2(dst,coord, src[id]);
+__kernel void compiler_subgroup_image_block_write_us4(image2d_t dst, global ushort4 *src)
+  int id = get_global_id(0);
+  int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+  intel_sub_group_block_write_us4(dst,coord, src[id]);
+__kernel void compiler_subgroup_image_block_write_us8(image2d_t dst, global ushort8 *src)
+  int id = get_global_id(0);
+  int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(ushort),0);
+  intel_sub_group_block_write_us8(dst,coord, src[id]);
diff --git a/kernels/compiler_subgroup_reduce.cl b/kernels/compiler_subgroup_reduce.cl
index 77ffb07..79d8e7d 100644
--- a/kernels/compiler_subgroup_reduce.cl
+++ b/kernels/compiler_subgroup_reduce.cl
@@ -1,6 +1,7 @@
  * Subgroup any all functions
+#ifndef HALF
 kernel void compiler_subgroup_any(global int *src, global int *dst) {
   int val = src[get_global_id(0)];
   int predicate = sub_group_any(val);
@@ -72,6 +73,17 @@ kernel void compiler_subgroup_reduce_add_float(global float *src, global float *
  * Subgroup reduce max functions
+kernel void compiler_subgroup_reduce_max_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_reduce_max_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  //printf("src is %d\n",val);
+  ushort sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
 kernel void compiler_subgroup_reduce_max_int(global int *src, global int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_reduce_max(val);
@@ -105,6 +117,17 @@ kernel void compiler_subgroup_reduce_max_float(global float *src, global float *
  * Subgroup reduce min functions
+kernel void compiler_subgroup_reduce_min_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_reduce_min_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  //printf("src is %d\n",val);
+  ushort sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
 kernel void compiler_subgroup_reduce_min_int(global int *src, global int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_reduce_min(val);
@@ -134,3 +157,21 @@ kernel void compiler_subgroup_reduce_min_float(global float *src, global float *
   float sum = sub_group_reduce_min(val);
   dst[get_global_id(0)] = sum;
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_subgroup_reduce_add_half(global half *src, global half *dst) {
+  half val = src[get_global_id(0)];
+  half sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_reduce_max_half(global half *src, global half *dst) {
+  half val = src[get_global_id(0)];
+  half sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_reduce_min_half(global half *src, global half *dst) {
+  half val = src[get_global_id(0)];
+  half sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
diff --git a/kernels/compiler_subgroup_scan_exclusive.cl b/kernels/compiler_subgroup_scan_exclusive.cl
index afc00d0..2c4b928 100644
--- a/kernels/compiler_subgroup_scan_exclusive.cl
+++ b/kernels/compiler_subgroup_scan_exclusive.cl
@@ -1,6 +1,19 @@
  * Subgroup scan exclusive add functions
+#ifndef HALF
+kernel void compiler_subgroup_scan_exclusive_add_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_exclusive_add_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
 kernel void compiler_subgroup_scan_exclusive_add_int(global int *src, global int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_scan_exclusive_add(val);
@@ -34,6 +47,18 @@ kernel void compiler_subgroup_scan_exclusive_add_float(global float *src, global
  * Subgroup scan exclusive max functions
+kernel void compiler_subgroup_scan_exclusive_max_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_exclusive_max_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
 kernel void compiler_subgroup_scan_exclusive_max_int(global int *src, global int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_scan_exclusive_max(val);
@@ -67,6 +92,18 @@ kernel void compiler_subgroup_scan_exclusive_max_float(global float *src, global
  * Subgroup scan exclusive min functions
+kernel void compiler_subgroup_scan_exclusive_min_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_exclusive_min_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
 kernel void compiler_subgroup_scan_exclusive_min_int(global int *src, global int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_scan_exclusive_min(val);
@@ -96,3 +133,21 @@ kernel void compiler_subgroup_scan_exclusive_min_float(global float *src, global
   float sum = sub_group_scan_exclusive_min(val);
   dst[get_global_id(0)] = sum;
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_subgroup_scan_exclusive_add_half(global half *src, global half *dst) {
+  half val = src[get_global_id(0)];
+  half sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_exclusive_max_half(global half *src, global half *dst) {
+  half val = src[get_global_id(0)];
+  half sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_exclusive_min_half(global half *src, global half *dst) {
+  half val = src[get_global_id(0)];
+  half sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
diff --git a/kernels/compiler_subgroup_scan_inclusive.cl b/kernels/compiler_subgroup_scan_inclusive.cl
index da1a6e6..def941c 100644
--- a/kernels/compiler_subgroup_scan_inclusive.cl
+++ b/kernels/compiler_subgroup_scan_inclusive.cl
@@ -1,6 +1,19 @@
  * Subgroup scan inclusive add functions
+#ifndef HALF
+kernel void compiler_subgroup_scan_inclusive_add_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_scan_inclusive_add(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_inclusive_add_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_scan_inclusive_add(val);
+  dst[get_global_id(0)] = sum;
 kernel void compiler_subgroup_scan_inclusive_add_int(global int *src, global int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_scan_inclusive_add(val);
@@ -34,6 +47,18 @@ kernel void compiler_subgroup_scan_inclusive_add_float(global float *src, global
  * Subgroup scan inclusive max functions
+kernel void compiler_subgroup_scan_inclusive_max_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_scan_inclusive_max(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_inclusive_max_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_scan_inclusive_max(val);
+  dst[get_global_id(0)] = sum;
 kernel void compiler_subgroup_scan_inclusive_max_int(global int *src, global int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_scan_inclusive_max(val);
@@ -67,6 +92,18 @@ kernel void compiler_subgroup_scan_inclusive_max_float(global float *src, global
  * Subgroup scan inclusive min functions
+kernel void compiler_subgroup_scan_inclusive_min_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_scan_inclusive_min(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_inclusive_min_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_scan_inclusive_min(val);
+  dst[get_global_id(0)] = sum;
 kernel void compiler_subgroup_scan_inclusive_min_int(global int *src, global int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_scan_inclusive_min(val);
@@ -96,3 +133,21 @@ kernel void compiler_subgroup_scan_inclusive_min_float(global float *src, global
   float sum = sub_group_scan_inclusive_min(val);
   dst[get_global_id(0)] = sum;
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_subgroup_scan_inclusive_add_half(global half *src, global half *dst) {
+  half val = src[get_global_id(0)];
+  half sum = sub_group_scan_inclusive_add(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_inclusive_max_half(global half *src, global half *dst) {
+  half val = src[get_global_id(0)];
+  half sum = sub_group_scan_inclusive_max(val);
+  dst[get_global_id(0)] = sum;
+kernel void compiler_subgroup_scan_inclusive_min_half(global half *src, global half *dst) {
+  half val = src[get_global_id(0)];
+  half sum = sub_group_scan_inclusive_min(val);
+  dst[get_global_id(0)] = sum;
diff --git a/src/Android.mk b/src/Android.mk
index 9b63f7e..c195988 100644
--- a/src/Android.mk
+++ b/src/Android.mk
@@ -84,12 +84,23 @@ $(shell $(LOCAL_PATH)/git_sha1.sh $(LOCAL_PATH) ${GIT_SHA1})
     $(addprefix kernels/,$(addsuffix _str.c, $(KERNEL_NAMES))) \
     $(addprefix kernels/,$(addsuffix _str.c, $(BUILT_IN_NAME))) \
+    cl_base_object.c \
     cl_api.c \
+    cl_api_platform_id.c \
+    cl_api_device_id.c \
+    cl_api_mem.c \
+    cl_api_kernel.c \
+    cl_api_command_queue.c \
+    cl_api_event.c \
+    cl_api_context.c \
+    cl_api_sampler.c \
+    cl_api_program.c \
     cl_alloc.c \
     cl_kernel.c \
     cl_program.c \
     cl_gbe_loader.cpp \
     cl_sampler.c \
+    cl_accelerator_intel.c \
     cl_event.c \
     cl_enqueue.c \
     cl_image.c \
@@ -101,15 +112,16 @@ LOCAL_SRC_FILES:= \
     cl_command_queue.c \
     cl_command_queue.h \
     cl_command_queue_gen7.c \
-    cl_thread.c \
+    cl_command_queue_enqueue.c \
+    cl_device_enqueue.c \
+    cl_utils.c \
     cl_driver.h \
     cl_driver.cpp \
     cl_driver_defs.c \
     intel/intel_gpgpu.c \
     intel/intel_batchbuffer.c \
     intel/intel_driver.c \
-    performance.c \
-    cl_accelerator_intel.c
+    performance.c
 libgbe \
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 82be7ff..f3c4632 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,8 +3,9 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}
-                    ${MESA_SOURCE_INCLUDES}
-                    ${LLVM_INCLUDE_DIR})
+                    ${LLVM_INCLUDE_DIR}
+                    ${OPENGL_INCLUDE_DIRS}
+                    ${EGL_INCLUDE_DIRS})
 macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
 foreach (KF ${KERNEL_FILES})
@@ -65,7 +66,17 @@ MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${BUILT_IN_NAME}")
+    cl_base_object.c
+    cl_api_platform_id.c
+    cl_api_device_id.c
+    cl_api_mem.c
+    cl_api_kernel.c
+    cl_api_command_queue.c
+    cl_api_event.c
+    cl_api_context.c
+    cl_api_sampler.c
+    cl_api_program.c
@@ -82,8 +93,11 @@ set(OPENCL_SRC
+    cl_device_enqueue.c
+    cl_device_enqueue.h
-    cl_thread.c
+    cl_command_queue_enqueue.c
+    cl_utils.c
@@ -108,14 +122,11 @@ if (CMRT_FOUND)
   set(OPENCL_SRC ${OPENCL_SRC} cl_cmrt.cpp)
 endif (CMRT_FOUND)
-  set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
+  set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c )
 if (OCLIcd_FOUND)
   set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
@@ -148,6 +159,11 @@ if (HAVE_DRM_INTEL_MIN_EU_IN_POOL)
 set(GIT_SHA1 "git_sha1.h")
 add_custom_target(${GIT_SHA1} ALL
   COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh
@@ -156,7 +172,7 @@ add_custom_target(${GIT_SHA1} ALL
 SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic,--allow-shlib-undefined")
-link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
 add_library(cl SHARED ${OPENCL_SRC})
@@ -170,5 +186,5 @@ target_link_libraries(
-                      ${OPTIONAL_EGL_LIBRARY})
+                      ${EGL_LIBRARIES})
diff --git a/src/cl_accelerator_intel.c b/src/cl_accelerator_intel.c
index cda8963..ae08184 100644
--- a/src/cl_accelerator_intel.c
+++ b/src/cl_accelerator_intel.c
@@ -19,9 +19,7 @@ cl_accelerator_intel_new(cl_context ctx,
   /* Allocate and inialize the structure itself */
   TRY_ALLOC(accel, CALLOC(struct _cl_accelerator_intel));
-  SET_ICD(accel->dispatch)
-  accel->ref_n = 1;
@@ -37,12 +35,12 @@ cl_accelerator_intel_new(cl_context ctx,
   /* Append the accelerator_intel in the context accelerator_intel list */
   /* does this really needed? */
-  pthread_mutex_lock(&ctx->accelerator_intel_lock);
     accel->next = ctx->accels;
     if (ctx->accels != NULL)
       ctx->accels->prev = accel;
     ctx->accels = accel;
-  pthread_mutex_unlock(&ctx->accelerator_intel_lock);
   accel->ctx = ctx;
@@ -60,7 +58,7 @@ error:
 LOCAL void
 cl_accelerator_intel_add_ref(cl_accelerator_intel accel)
-  atomic_inc(&accel->ref_n);
+  CL_OBJECT_INC_REF(accel);
 LOCAL void
@@ -68,19 +66,20 @@ cl_accelerator_intel_delete(cl_accelerator_intel accel)
   if (UNLIKELY(accel == NULL))
-  if (atomic_dec(&accel->ref_n) > 1)
+  if (CL_OBJECT_DEC_REF(accel) > 1)
   /* Remove the accelerator_intel in the context accelerator_intel list */
-  pthread_mutex_lock(&accel->ctx->accelerator_intel_lock);
+  CL_OBJECT_LOCK(accel->ctx);
     if (accel->prev)
       accel->prev->next = accel->next;
     if (accel->next)
       accel->next->prev = accel->prev;
     if (accel->ctx->accels == accel)
       accel->ctx->accels = accel->next;
-  pthread_mutex_unlock(&accel->ctx->accelerator_intel_lock);
+  CL_OBJECT_UNLOCK(accel->ctx);
diff --git a/src/cl_accelerator_intel.h b/src/cl_accelerator_intel.h
index cecfd2a..435ae73 100644
--- a/src/cl_accelerator_intel.h
+++ b/src/cl_accelerator_intel.h
@@ -1,22 +1,25 @@
+#include "cl_base_object.h"
 #include "CL/cl.h"
 #include "CL/cl_ext.h"
 #include <stdint.h>
 struct _cl_accelerator_intel {
-  DEFINE_ICD(dispatch)
-  uint64_t magic;            /* To identify it as a accelerator_intel object */
-  volatile int ref_n;        /* This object is reference counted */
+  _cl_base_object base;
   cl_accelerator_intel prev, next;     /* We chain in the allocator, why chain? */
   cl_context ctx;            /* Context it belongs to */
   cl_accelerator_type_intel type;
   union {
     cl_motion_estimation_desc_intel me;
-  }desc;                     /* save desc before we decide how to handle it */
+  } desc;                     /* save desc before we decide how to handle it */
+#define CL_OBJECT_ACCELERATOR_INTEL_MAGIC 0x7e6a08c9a7ac3e3fLL
+    (((cl_base_object)obj)->magic == CL_OBJECT_ACCELERATOR_INTEL_MAGIC)
 cl_accelerator_intel cl_accelerator_intel_new(cl_context ctx,
                          cl_accelerator_type_intel accel_type,
                          size_t desc_sz,
diff --git a/src/cl_api.c b/src/cl_api.c
index a7c78f0..24b8b3d 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -67,117 +67,6 @@ typedef intptr_t cl_device_partition_property;
 	  return RET; \
 	} while(0)
-static inline cl_int
-handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
-              cl_event* event, enqueue_data* data, cl_command_type type)
-  cl_int status = cl_event_wait_events(num, wait_list, queue);
-  cl_event e = NULL;
-  if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
-    e = cl_event_new(queue->ctx, queue, type, event!=NULL);
-    /* if need profiling, add the submit timestamp here. */
-    if (e->type != CL_COMMAND_USER &&
-	    e->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-	cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED);
-	cl_event_get_queued_cpu_timestamp(e);
-    }
-    if(event != NULL)
-      *event = e;
-    if(status == CL_ENQUEUE_EXECUTE_DEFER) {
-      cl_event_new_enqueue_callback(e, data, num, wait_list);
-    }
-  }
-  set_current_event(queue, e);
-  return status;
-/* The following code checking overlap is from Appendix of openCL spec 1.1 */
-cl_bool check_copy_overlap(const size_t src_offset[3],
-                           const size_t dst_offset[3],
-                           const size_t region[3],
-                           size_t row_pitch, size_t slice_pitch)
-  const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
-  const size_t src_max[] = {src_offset[0] + region[0],
-                            src_offset[1] + region[1],
-                            src_offset[2] + region[2]};
-  const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
-  const size_t dst_max[] = {dst_offset[0] + region[0],
-                            dst_offset[1] + region[1],
-                            dst_offset[2] + region[2]};
-  // Check for overlap
-  cl_bool overlap = CL_TRUE;
-  unsigned i;
-  size_t dst_start = dst_offset[2] * slice_pitch +
-                     dst_offset[1] * row_pitch + dst_offset[0];
-  size_t dst_end = dst_start + (region[2] * slice_pitch +
-                   region[1] * row_pitch + region[0]);
-  size_t src_start = src_offset[2] * slice_pitch +
-                     src_offset[1] * row_pitch + src_offset[0];
-  size_t src_end = src_start + (region[2] * slice_pitch +
-                   region[1] * row_pitch + region[0]);
-  for (i=0; i != 3; ++i) {
-    overlap = overlap && (src_min[i] < dst_max[i])
-                      && (src_max[i] > dst_min[i]);
-  }
-  if (!overlap) {
-    size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
-                          src_offset[0] + region[0] - row_pitch : 0;
-    size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
-                          dst_offset[0] + region[0] - row_pitch : 0;
-    if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
-         (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
-      if ( (src_start <= dst_start && dst_start < src_end) ||
-           (dst_start <= src_start && src_start < dst_end) )
-        overlap = CL_TRUE;
-    }
-    if (region[2] > 1) {
-      size_t src_height = slice_pitch / row_pitch;
-      size_t dst_height = slice_pitch / row_pitch;
-      size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
-                            src_offset[1] + region[1] - src_height : 0;
-      size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
-                            dst_offset[1] + region[1] - dst_height : 0;
-      if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
-           (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
-        if ( (src_start <= dst_start && dst_start < src_end) ||
-             (dst_start <= src_start && src_start < dst_end) )
-          overlap = CL_TRUE;
-      }
-    }
-  }
-  return overlap;
-static cl_int
-cl_check_device_type(cl_device_type device_type)
-  const cl_device_type valid =  CL_DEVICE_TYPE_GPU
-                              | CL_DEVICE_TYPE_CPU
-                              | CL_DEVICE_TYPE_ACCELERATOR
-                              | CL_DEVICE_TYPE_DEFAULT
-                              | CL_DEVICE_TYPE_CUSTOM;
-  if( (device_type & valid) == 0) {
-  }
-  if(UNLIKELY(!(device_type & CL_DEVICE_TYPE_DEFAULT) && !(device_type & CL_DEVICE_TYPE_GPU)))
-    return CL_DEVICE_NOT_FOUND;
-  return CL_SUCCESS;
-static cl_int
-cl_device_id_is_ok(const cl_device_id device)
-  if(UNLIKELY(device == NULL)) return CL_FALSE;
-  return device != cl_get_gt_device() ? CL_FALSE : CL_TRUE;
 clGetPlatformIDs(cl_uint          num_entries,
                  cl_platform_id * platforms,
@@ -191,299 +80,6 @@ clGetPlatformIDs(cl_uint          num_entries,
   return cl_get_platform_ids(num_entries, platforms, num_platforms);
-clGetPlatformInfo(cl_platform_id    platform,
-                  cl_platform_info  param_name,
-                  size_t            param_value_size,
-                  void *            param_value,
-                  size_t *          param_value_size_ret)
-  /* Only one platform. This is easy */
-  if (UNLIKELY(platform != NULL && platform != cl_get_platform_default()))
-  return cl_get_platform_info(platform,
-                              param_name,
-                              param_value_size,
-                              param_value,
-                              param_value_size_ret);
-clGetDeviceIDs(cl_platform_id platform,
-               cl_device_type device_type,
-               cl_uint        num_entries,
-               cl_device_id * devices,
-               cl_uint *      num_devices)
-  cl_int err = CL_SUCCESS;
-  /* Check parameter consistency */
-  if (UNLIKELY(devices == NULL && num_devices == NULL))
-    return CL_INVALID_VALUE;
-  if (UNLIKELY(platform && platform != cl_get_platform_default()))
-  if (UNLIKELY(devices && num_entries == 0))
-    return CL_INVALID_VALUE;
-  err = cl_check_device_type(device_type);
-  if(err != CL_SUCCESS)
-    return err;
-  return cl_get_device_ids(platform,
-                           device_type,
-                           num_entries,
-                           devices,
-                           num_devices);
-clGetDeviceInfo(cl_device_id   device,
-                cl_device_info param_name,
-                size_t         param_value_size,
-                void *         param_value,
-                size_t *       param_value_size_ret)
-  return cl_get_device_info(device,
-                            param_name,
-                            param_value_size,
-                            param_value,
-                            param_value_size_ret);
-clCreateSubDevices(cl_device_id                         in_device,
-                   const cl_device_partition_property * properties,
-                   cl_uint                              num_devices,
-                   cl_device_id *                       out_devices,
-                   cl_uint *                            num_devices_ret)
-  /* Check parameter consistency */
-  if (UNLIKELY(out_devices == NULL && num_devices_ret == NULL))
-    return CL_INVALID_VALUE;
-  if (UNLIKELY(in_device == NULL && properties == NULL))
-    return CL_INVALID_VALUE;
-  *num_devices_ret = 0;
-clRetainDevice(cl_device_id device)
-  // XXX stub for C++ Bindings
-  return CL_SUCCESS;
-clReleaseDevice(cl_device_id device)
-#ifdef HAS_CMRT
-  if (device->cmrt_device != NULL)
-    cmrt_destroy_device(device);
-  // XXX stub for C++ Bindings
-  return CL_SUCCESS;
-clCreateContext(const cl_context_properties *  properties,
-                cl_uint                        num_devices,
-                const cl_device_id *           devices,
-                void (* pfn_notify) (const char*, const void*, size_t, void*),
-                void *                         user_data,
-                cl_int *                       errcode_ret)
-  cl_int err = CL_SUCCESS;
-  cl_context context = NULL;
-  /* Assert parameters correctness */
-  INVALID_VALUE_IF (devices == NULL);
-  INVALID_VALUE_IF (num_devices == 0);
-  INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
-  /* Now check if the user is asking for the right device */
-  INVALID_DEVICE_IF (cl_device_id_is_ok(*devices) == CL_FALSE);
-  context = cl_create_context(properties,
-                           num_devices,
-                           devices,
-                           pfn_notify,
-                           user_data,
-                           &err);
-  initialize_env_var();
-  if (errcode_ret)
-    *errcode_ret = err;
-  return context;
-clCreateContextFromType(const cl_context_properties *  properties,
-                        cl_device_type                 device_type,
-                        void (CL_CALLBACK *pfn_notify) (const char *, const void *, size_t, void *),
-                        void *                         user_data,
-                        cl_int *                       errcode_ret)
-  cl_context context = NULL;
-  cl_int err = CL_SUCCESS;
-  cl_device_id devices[1];
-  cl_uint num_devices = 1;
-  INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
-  err = cl_check_device_type(device_type);
-  if(err != CL_SUCCESS) {
-    goto error;
-  }
-  err = cl_get_device_ids(NULL,
-                          device_type,
-                          1,
-                          &devices[0],
-                          &num_devices);
-  if (err != CL_SUCCESS) {
-    goto error;
-  }
-  context = cl_create_context(properties,
-                              num_devices,
-                              devices,
-                              pfn_notify,
-                              user_data,
-                              &err);
-  if (errcode_ret)
-    *errcode_ret = err;
-  return context;
-clRetainContext(cl_context context)
-  cl_int err = CL_SUCCESS;
-  CHECK_CONTEXT (context);
-  cl_context_add_ref(context);
-  return err;
-clReleaseContext(cl_context context)
-  cl_int err = CL_SUCCESS;
-  CHECK_CONTEXT (context);
-  cl_context_delete(context);
-  return err;
-clGetContextInfo(cl_context      context,
-                 cl_context_info param_name,
-                 size_t          param_value_size,
-                 void *          param_value,
-                 size_t *        param_value_size_ret)
-  cl_int err = CL_SUCCESS;
-  CHECK_CONTEXT (context);
-  if (param_name == CL_CONTEXT_DEVICES) {
-    FILL_GETINFO_RET (cl_device_id, 1, &context->device, CL_SUCCESS);
-  } else if (param_name == CL_CONTEXT_NUM_DEVICES) {
-    cl_uint n = 1;
-    FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
-  } else if (param_name == CL_CONTEXT_REFERENCE_COUNT) {
-    cl_uint ref = context->ref_n;
-    FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
-  } else if (param_name == CL_CONTEXT_PROPERTIES) {
-    if(context->prop_len > 0) {
-      FILL_GETINFO_RET (cl_context_properties, context->prop_len, context->prop_user, CL_SUCCESS);
-    } else {
-      cl_context_properties n = 0;
-      FILL_GETINFO_RET (cl_context_properties, 1, &n, CL_SUCCESS);
-    }
-  } else {
-    return CL_INVALID_VALUE;
-  }
-  return err;
-clCreateCommandQueue(cl_context                   context,
-                     cl_device_id                 device,
-                     cl_command_queue_properties  properties,
-                     cl_int *                     errcode_ret)
-  cl_command_queue queue = NULL;
-  cl_int err = CL_SUCCESS;
-  CHECK_CONTEXT (context);
-  INVALID_DEVICE_IF (device != context->device);
-  if(properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {/*not supported now.*/
-    goto error;
-  }
-  queue = cl_context_create_queue(context, device, properties, &err);
-  if (errcode_ret)
-    *errcode_ret = err;
-  return queue;
-clRetainCommandQueue(cl_command_queue command_queue)
-  cl_int err = CL_SUCCESS;
-  CHECK_QUEUE (command_queue);
-  cl_command_queue_add_ref(command_queue);
-  return err;
-clReleaseCommandQueue(cl_command_queue command_queue)
-  cl_int err = CL_SUCCESS;
-  CHECK_QUEUE (command_queue);
-  cl_command_queue_delete(command_queue);
-  return err;
-clGetCommandQueueInfo(cl_command_queue       command_queue,
-                      cl_command_queue_info  param_name,
-                      size_t                 param_value_size,
-                      void *                 param_value,
-                      size_t *               param_value_size_ret)
-  cl_int err = CL_SUCCESS;
-  CHECK_QUEUE (command_queue);
-  if (param_name == CL_QUEUE_CONTEXT) {
-    FILL_GETINFO_RET (cl_context, 1, &command_queue->ctx, CL_SUCCESS);
-  } else if (param_name == CL_QUEUE_DEVICE) {
-    FILL_GETINFO_RET (cl_device_id, 1, &command_queue->ctx->device, CL_SUCCESS);
-  } else if (param_name == CL_QUEUE_REFERENCE_COUNT) {
-    cl_uint ref = command_queue->ref_n;
-    FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
-  } else if (param_name == CL_QUEUE_PROPERTIES) {
-    FILL_GETINFO_RET (cl_command_queue_properties, 1, &command_queue->props, CL_SUCCESS);
-  } else {
-    return CL_INVALID_VALUE;
-  }
-  return err;
 clCreateBuffer(cl_context    context,
                cl_mem_flags  flags,
@@ -538,7 +134,7 @@ clCreateImage(cl_context context,
     goto error;
   if (image_format->image_channel_order < CL_R ||
-          image_format->image_channel_order > CL_RGBx) {
+          image_format->image_channel_order > CL_sBGRA) {
     goto error;
@@ -583,249 +179,528 @@ error:
   return mem;
-clCreateImage2D(cl_context              context,
-                cl_mem_flags            flags,
-                const cl_image_format * image_format,
-                size_t                  image_width,
-                size_t                  image_height,
-                size_t                  image_row_pitch,
-                void *                  host_ptr,
-                cl_int *                errcode_ret)
+void *
+clSVMAlloc (cl_context context,
+            cl_svm_mem_flags flags,
+            size_t size,
+            unsigned int alignment)
-  cl_mem mem = NULL;
   cl_int err = CL_SUCCESS;
   CHECK_CONTEXT (context);
-  cl_image_desc image_desc;
-  memset(&image_desc, 0, sizeof(image_desc));
-  image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
-  image_desc.image_width = image_width;
-  image_desc.image_height = image_height;
-  image_desc.image_row_pitch = image_row_pitch;
-  mem = cl_mem_new_image(context,
-                         flags,
-                         image_format,
-                         &image_desc,
-                         host_ptr,
-                         &err);
+  (void) err;
+  return cl_mem_svm_allocate(context, flags, size, alignment);
-  if (errcode_ret)
-    *errcode_ret = err;
-  return mem;
+  return NULL;
-clCreateImage3D(cl_context              context,
-                cl_mem_flags            flags,
-                const cl_image_format * image_format,
-                size_t                  image_width,
-                size_t                  image_height,
-                size_t                  image_depth,
-                size_t                  image_row_pitch,
-                size_t                  image_slice_pitch,
-                void *                  host_ptr,
-                cl_int *                errcode_ret)
+clSVMFree (cl_context context, void* svm_pointer)
-  cl_mem mem = NULL;
   cl_int err = CL_SUCCESS;
   CHECK_CONTEXT (context);
-  cl_image_desc image_desc;
-  image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
-  image_desc.image_width = image_width;
-  image_desc.image_height = image_height;
-  image_desc.image_depth = image_depth;
-  image_desc.image_row_pitch = image_row_pitch;
-  image_desc.image_slice_pitch = image_slice_pitch;
-  mem = cl_mem_new_image(context,
-                         flags,
-                         image_format,
-                         &image_desc,
-                         host_ptr,
-                         &err);
+  (void) err;
+  return cl_mem_svm_delete(context, svm_pointer);
-  if (errcode_ret)
-    *errcode_ret = err;
-  return mem;
+  return;
-clRetainMemObject(cl_mem memobj)
+clEnqueueSVMFree (cl_command_queue command_queue,
+                  cl_uint num_svm_pointers,
+                  void *svm_pointers[],
+                  void (CL_CALLBACK *pfn_free_func)( cl_command_queue queue,
+                                                     cl_uint num_svm_pointers,
+                                                     void *svm_pointers[],
+                                                     void *user_data),
+                  void *user_data,
+                  cl_uint num_events_in_wait_list,
+                  const cl_event *event_wait_list,
+                  cl_event *event)
   cl_int err = CL_SUCCESS;
-  CHECK_MEM (memobj);
-  cl_mem_add_ref(memobj);
+  cl_int i = 0;
+  void** pointers = NULL;
+  cl_event e = NULL;
+  cl_int e_status;
+  enqueue_data *data;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break;
+    }
+    if(num_svm_pointers == 0 || svm_pointers == NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    for(i=0; i<num_svm_pointers; i++) {
+      if(svm_pointers[i] == NULL) {
+        err = CL_INVALID_VALUE;
+        break;
+      }
+    }
+    if(err != CL_SUCCESS)
+        break;
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_SVM_FREE, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      break;
+    }
+    pointers = malloc(num_svm_pointers * sizeof(void *));
+    if(UNLIKELY(pointers == NULL)) {
+      err = CL_OUT_OF_HOST_MEMORY;
+      break;
+    }
+    memcpy(pointers, svm_pointers, num_svm_pointers * sizeof(void *));
+    data = &e->exec_data;
+    data->type      = EnqueueSVMFree;
+    data->queue     = command_queue;
+    data->pointers  = pointers;
+    data->free_func = pfn_free_func;
+    data->size      = num_svm_pointers;
+    data->ptr       = user_data;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
   return err;
-clReleaseMemObject(cl_mem memobj)
+clEnqueueSVMMap (cl_command_queue command_queue,
+                 cl_bool blocking_map,
+                 cl_map_flags map_flags,
+                 void *svm_ptr,
+                 size_t size,
+                 cl_uint num_events_in_wait_list,
+                 const cl_event *event_wait_list,
+                 cl_event *event)
   cl_int err = CL_SUCCESS;
-  CHECK_MEM (memobj);
-  cl_mem_delete(memobj);
+  cl_mem buffer;
+  CHECK_QUEUE(command_queue);
+  buffer = cl_context_get_svm_from_ptr(command_queue->ctx, svm_ptr);
+  if(buffer == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  clEnqueueMapBuffer(command_queue, buffer, blocking_map, map_flags, 0, size,
+                     num_events_in_wait_list, event_wait_list, event, &err);
+  if(event)
+    (*event)->event_type = CL_COMMAND_SVM_MAP;
   return err;
-clGetSupportedImageFormats(cl_context         ctx,
-                           cl_mem_flags       flags,
-                           cl_mem_object_type image_type,
-                           cl_uint            num_entries,
-                           cl_image_format *  image_formats,
-                           cl_uint *          num_image_formats)
+clEnqueueSVMUnmap (cl_command_queue command_queue,
+                   void *svm_ptr,
+                   cl_uint num_events_in_wait_list,
+                   const cl_event *event_wait_list,
+                   cl_event *event)
   cl_int err = CL_SUCCESS;
-  if (UNLIKELY(num_entries == 0 && image_formats != NULL)) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D &&
-               image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
-               image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
-               image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY &&
-               image_type != CL_MEM_OBJECT_IMAGE2D &&
-               image_type != CL_MEM_OBJECT_IMAGE3D)) {
+  cl_mem buffer;
+  CHECK_QUEUE(command_queue);
+  buffer = cl_context_get_svm_from_ptr(command_queue->ctx, svm_ptr);
+  if(buffer == NULL) {
     err = CL_INVALID_VALUE;
     goto error;
-  err = cl_image_get_supported_fmt(ctx,
-                                   image_type,
-                                   num_entries,
-                                   image_formats,
-                                   num_image_formats);
+  err = clEnqueueUnmapMemObject(command_queue, buffer, svm_ptr,
+                                num_events_in_wait_list, event_wait_list, event);
+  if(event)
+    (*event)->event_type = CL_COMMAND_SVM_UNMAP;
   return err;
-clGetMemObjectInfo(cl_mem      memobj,
-                   cl_mem_info param_name,
-                   size_t      param_value_size,
-                   void *      param_value,
-                   size_t *    param_value_size_ret)
+cl_int clEnqueueSVMMemcpy (cl_command_queue command_queue,
+                           cl_bool blocking_copy,
+                           void *dst_ptr,
+                           const void *src_ptr,
+                           size_t size,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list,
+                           cl_event *event)
   cl_int err = CL_SUCCESS;
-  CHECK_MEM(memobj);
+  enqueue_data *data;
+  cl_int e_status;
+  cl_event e = NULL;
-  err = cl_get_mem_object_info(memobj,
-                               param_name,
-                               param_value_size,
-                               param_value,
-                               param_value_size_ret);
-  return err;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break;
+    }
-clGetImageInfo(cl_mem         mem,
-               cl_image_info  param_name,
-               size_t         param_value_size,
-               void *         param_value,
-               size_t *       param_value_size_ret)
-  return cl_get_image_info(mem,
-                           param_name,
-                           param_value_size,
-                           param_value,
-                           param_value_size_ret);
+    if(UNLIKELY(dst_ptr == NULL || src_ptr == NULL || size == 0 )) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if(((size_t)src_ptr < (size_t)dst_ptr && ((size_t)src_ptr + size > (size_t)dst_ptr)) ||
+       ((size_t)dst_ptr < (size_t)src_ptr && ((size_t)dst_ptr + size > (size_t)src_ptr))) {
+      err = CL_MEM_COPY_OVERLAP;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_SVM_MEMCPY, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (blocking_copy) {
+      err = cl_event_wait_for_event_ready(e);
+      if (err != CL_SUCCESS)
+        break;
+      /* Blocking call API is a sync point of flush. */
+      err = cl_command_queue_wait_flush(command_queue);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      break;
+    }
+    data = &e->exec_data;
+    data->type         = EnqueueSVMMemCopy;
+    data->queue        = command_queue;
+    data->ptr          = dst_ptr;
+    data->const_ptr    = src_ptr;
+    data->size         = size;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while(0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
-clSetMemObjectDestructorCallback(cl_mem  memobj,
-                                 void (CL_CALLBACK *pfn_notify) (cl_mem, void*),
-                                 void * user_data)
+cl_int clEnqueueSVMMemFill (cl_command_queue command_queue,
+                            void *svm_ptr,
+                            const void *pattern,
+                            size_t pattern_size,
+                            size_t size,
+                            cl_uint num_events_in_wait_list,
+                            const cl_event *event_wait_list,
+                            cl_event *event)
   cl_int err = CL_SUCCESS;
-  CHECK_MEM(memobj);
-  INVALID_VALUE_IF (pfn_notify == 0);
+  enqueue_data *data;
+  cl_int e_status;
+  cl_event e = NULL;
-  cl_mem_dstr_cb *cb = (cl_mem_dstr_cb*)malloc(sizeof(cl_mem_dstr_cb));
-  if (!cb) {
-    goto error;
-  }
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break;
+    }
-  memset(cb, 0, sizeof(cl_mem_dstr_cb));
-  cb->pfn_notify = pfn_notify;
-  cb->user_data = user_data;
-  cb->next = memobj->dstr_cb;
-  memobj->dstr_cb = cb;
+    if(UNLIKELY(svm_ptr == NULL ||
+               ((size_t)svm_ptr & (pattern_size - 1)) != 0)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if(UNLIKELY(pattern == NULL ||
+               (pattern_size & (pattern_size - 1)) != 0 ||
+                pattern_size > 128)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if(UNLIKELY(size == 0 ||
+               (size % pattern_size) != 0)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_SVM_MEMFILL, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      break;
+    }
+    data = &e->exec_data;
+    data->type         = EnqueueSVMMemFill;
+    data->queue        = command_queue;
+    data->ptr          = svm_ptr;
+    data->const_ptr    = pattern;
+    data->pattern_size = pattern_size;
+    data->size         = size;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while(0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
   return err;
-clCreateSampler(cl_context         context,
-                cl_bool            normalized,
-                cl_addressing_mode addressing,
-                cl_filter_mode     filter,
-                cl_int *           errcode_ret)
+clCreateImage2D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_row_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret)
-  cl_sampler sampler = NULL;
+  cl_mem mem = NULL;
   cl_int err = CL_SUCCESS;
   CHECK_CONTEXT (context);
-  sampler = cl_sampler_new(context, normalized, addressing, filter, &err);
+  cl_image_desc image_desc;
+  memset(&image_desc, 0, sizeof(image_desc));
+  image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  image_desc.image_width = image_width;
+  image_desc.image_height = image_height;
+  image_desc.image_row_pitch = image_row_pitch;
+  mem = cl_mem_new_image(context,
+                         flags,
+                         image_format,
+                         &image_desc,
+                         host_ptr,
+                         &err);
   if (errcode_ret)
     *errcode_ret = err;
-  return sampler;
+  return mem;
-clRetainSampler(cl_sampler sampler)
+clCreateImage3D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_depth,
+                size_t                  image_row_pitch,
+                size_t                  image_slice_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret)
+  cl_mem mem = NULL;
   cl_int err = CL_SUCCESS;
-  CHECK_SAMPLER (sampler);
-  cl_sampler_add_ref(sampler);
+  CHECK_CONTEXT (context);
+  cl_image_desc image_desc;
+  image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  image_desc.image_width = image_width;
+  image_desc.image_height = image_height;
+  image_desc.image_depth = image_depth;
+  image_desc.image_row_pitch = image_row_pitch;
+  image_desc.image_slice_pitch = image_slice_pitch;
+  mem = cl_mem_new_image(context,
+                         flags,
+                         image_format,
+                         &image_desc,
+                         host_ptr,
+                         &err);
-  return err;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
-clReleaseSampler(cl_sampler sampler)
+clGetSupportedImageFormats(cl_context         ctx,
+                           cl_mem_flags       flags,
+                           cl_mem_object_type image_type,
+                           cl_uint            num_entries,
+                           cl_image_format *  image_formats,
+                           cl_uint *          num_image_formats)
   cl_int err = CL_SUCCESS;
-  CHECK_SAMPLER (sampler);
-  cl_sampler_delete(sampler);
+  if (UNLIKELY(num_entries == 0 && image_formats != NULL)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D &&
+               image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
+               image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
+               image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY &&
+               image_type != CL_MEM_OBJECT_IMAGE2D &&
+               image_type != CL_MEM_OBJECT_IMAGE3D)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  err = cl_image_get_supported_fmt(ctx,
+                                   flags,
+                                   image_type,
+                                   num_entries,
+                                   image_formats,
+                                   num_image_formats);
   return err;
-clGetSamplerInfo(cl_sampler       sampler,
-                 cl_sampler_info  param_name,
-                 size_t           param_value_size,
-                 void *           param_value,
-                 size_t *         param_value_size_ret)
+clCreateSamplerWithProperties(cl_context                  context,
+                              const cl_sampler_properties *sampler_properties,
+                              cl_int *                    errcode_ret)
+  cl_sampler sampler = NULL;
   cl_int err = CL_SUCCESS;
-  CHECK_SAMPLER (sampler);
-  if (param_name == CL_SAMPLER_REFERENCE_COUNT) {
-    FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&sampler->ref_n, CL_SUCCESS);
-  } else if (param_name == CL_SAMPLER_CONTEXT) {
-    FILL_GETINFO_RET (cl_context, 1, &sampler->ctx, CL_SUCCESS);
-  } else if (param_name == CL_SAMPLER_NORMALIZED_COORDS) {
-    FILL_GETINFO_RET (cl_bool, 1, &sampler->normalized_coords, CL_SUCCESS);
-  } else if (param_name == CL_SAMPLER_ADDRESSING_MODE) {
-    FILL_GETINFO_RET (cl_addressing_mode, 1, &sampler->address, CL_SUCCESS);
-  } else if (param_name == CL_SAMPLER_FILTER_MODE ) {
-    FILL_GETINFO_RET (cl_filter_mode, 1, &sampler->filter, CL_SUCCESS);
-  } else{
-    return CL_INVALID_VALUE;
+  CHECK_CONTEXT (context);
+  cl_bool normalized = 0xFFFFFFFF;
+  cl_addressing_mode addressing = 0xFFFFFFFF;
+  cl_filter_mode filter = 0xFFFFFFFF;
+  if(sampler_properties)
+  {
+    cl_ulong sam_type;
+    cl_ulong sam_val;
+    cl_uint i;
+    for(i = 0;(sam_type = sampler_properties[i++])!=0;i++)
+    {
+      sam_val = sampler_properties[i];
+      switch(sam_type)
+      {
+          if(normalized != 0xFFFFFFFF)
+            err = CL_INVALID_VALUE;
+          else if(sam_val == CL_TRUE || sam_val == CL_FALSE)
+            normalized = sam_val;
+          else
+            err = CL_INVALID_VALUE;
+          break;
+          if(addressing != 0xFFFFFFFF)
+            err = CL_INVALID_VALUE;
+          else if(sam_val == CL_ADDRESS_MIRRORED_REPEAT || sam_val == CL_ADDRESS_REPEAT ||
+                  sam_val == CL_ADDRESS_CLAMP_TO_EDGE || sam_val == CL_ADDRESS_CLAMP ||
+                  sam_val == CL_ADDRESS_NONE)
+            addressing = sam_val;
+          else
+            err = CL_INVALID_VALUE;
+          break;
+          if(filter != 0xFFFFFFFF)
+            err = CL_INVALID_VALUE;
+          else if(sam_val == CL_FILTER_LINEAR || sam_val == CL_FILTER_NEAREST)
+            filter = sam_val;
+          else
+            err = CL_INVALID_VALUE;
+          break;
+        default:
+          err = CL_INVALID_VALUE;
+          break;
+      }
+    }
+  if(err)
+    goto error;
+  if(normalized == 0xFFFFFFFF) normalized = CL_TRUE;
+  if(addressing == 0xFFFFFFFF) addressing = CL_ADDRESS_CLAMP;
+  if(filter == 0xFFFFFFFF) filter = CL_FILTER_NEAREST;
+  sampler = cl_create_sampler(context, normalized, addressing, filter, &err);
-  return err;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return sampler;
@@ -946,7 +821,10 @@ clBuildProgram(cl_program            program,
   /* Everything is easy. We only support one device anyway */
   if (num_devices != 0) {
-    INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
+    err = cl_devices_list_include_check(program->ctx->device_num,
+                                        program->ctx->devices, num_devices, device_list);
+    if (err)
+      goto error;
   assert(program->source_type == FROM_LLVM ||
@@ -988,7 +866,10 @@ clCompileProgram(cl_program            program ,
   /* Everything is easy. We only support one device anyway */
   if (num_devices != 0) {
-    INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
+    err = cl_devices_list_include_check(program->ctx->device_num,
+                                        program->ctx->devices, num_devices, device_list);
+    if (err)
+      goto error;
   /* TODO support create program from binary */
@@ -1027,2232 +908,319 @@ clLinkProgram(cl_context            context,
   INVALID_VALUE_IF (pfn_notify  == 0 && user_data   != NULL);
   INVALID_VALUE_IF (num_input_programs == 0 && input_programs != NULL);
   INVALID_VALUE_IF (num_input_programs != 0 && input_programs == NULL);
-  INVALID_VALUE_IF (num_input_programs == 0 && input_programs == NULL);
-  program = cl_program_link(context, num_input_programs, input_programs, options, &err);
-  if(program) program->is_built = CL_TRUE;
-  if (pfn_notify) pfn_notify(program, user_data);
-  if (errcode_ret)
-    *errcode_ret = err;
-  return program;
-  return CL_SUCCESS;
-clUnloadPlatformCompiler(cl_platform_id platform)
-  return CL_SUCCESS;
-clGetProgramInfo(cl_program       program,
-                 cl_program_info  param_name,
-                 size_t           param_value_size,
-                 void *           param_value,
-                 size_t *         param_value_size_ret)
-  cl_int err = CL_SUCCESS;
-  char * ret_str = "";
-  CHECK_PROGRAM (program);
-  if (param_name == CL_PROGRAM_REFERENCE_COUNT) {
-    cl_uint ref = program->ref_n;
-    FILL_GETINFO_RET (cl_uint, 1, (&ref), CL_SUCCESS);
-  } else if (param_name == CL_PROGRAM_CONTEXT) {
-    cl_context context = program->ctx;
-    FILL_GETINFO_RET (cl_context, 1, &context, CL_SUCCESS);
-  } else if (param_name == CL_PROGRAM_NUM_DEVICES) {
-    cl_uint num_dev = 1; // Just 1 dev now.
-    FILL_GETINFO_RET (cl_uint, 1, &num_dev, CL_SUCCESS);
-  } else if (param_name == CL_PROGRAM_DEVICES) {
-    cl_device_id dev_id = program->ctx->device;
-    FILL_GETINFO_RET (cl_device_id, 1, &dev_id, CL_SUCCESS);
-  } else if (param_name == CL_PROGRAM_NUM_KERNELS) {
-    cl_uint kernels_num = program->ker_n;
-    FILL_GETINFO_RET (cl_uint, 1, &kernels_num, CL_SUCCESS);
-  } else if (param_name == CL_PROGRAM_SOURCE) {
-    if (!program->source)
-      FILL_GETINFO_RET (char, 1, &ret_str, CL_SUCCESS);
-    FILL_GETINFO_RET (char, (strlen(program->source) + 1),
-                   program->source, CL_SUCCESS);
-  } else if(param_name == CL_PROGRAM_KERNEL_NAMES) {
-    cl_program_get_kernel_names(program, param_value_size, (char *)param_value, param_value_size_ret);
-  } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
-    if (program->binary == NULL){
-      if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
-      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
-      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
-      }else{
-        return CL_INVALID_BINARY;
-      }
-    }
-    if (program->binary == NULL || program->binary_sz == 0) {
-      return CL_OUT_OF_RESOURCES;
-    }
-    FILL_GETINFO_RET (size_t, 1, (&program->binary_sz), CL_SUCCESS);
-  } else if (param_name == CL_PROGRAM_BINARIES) {
-    if (param_value_size_ret)
-      *param_value_size_ret = sizeof(void*);
-    if (!param_value)
-      return CL_SUCCESS;
-    /* param_value points to an array of n
-       pointers allocated by the caller */
-    if (program->binary == NULL) {
-      if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
-      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
-      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
-      }else{
-        return CL_INVALID_BINARY;
-      }
-    }
-    if (program->binary == NULL || program->binary_sz == 0) {
-      return CL_OUT_OF_RESOURCES;
-    }
-    memcpy(*((void **)param_value), program->binary, program->binary_sz);
-    return CL_SUCCESS;
-  } else {
-    return CL_INVALID_VALUE;
-  }
-    return err;
-clGetProgramBuildInfo(cl_program             program,
-                      cl_device_id           device,
-                      cl_program_build_info  param_name,
-                      size_t                 param_value_size,
-                      void *                 param_value,
-                      size_t *               param_value_size_ret)
-  cl_int err = CL_SUCCESS;
-  char * ret_str = "";
-  CHECK_PROGRAM (program);
-  INVALID_DEVICE_IF (device != program->ctx->device);
-  if (param_name == CL_PROGRAM_BUILD_STATUS) {
-    FILL_GETINFO_RET (cl_build_status, 1, &program->build_status, CL_SUCCESS);
-  } else if (param_name == CL_PROGRAM_BUILD_OPTIONS) {
-    if (program->is_built && program->build_opts)
-      ret_str = program->build_opts;
-    FILL_GETINFO_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
-  } else if (param_name == CL_PROGRAM_BUILD_LOG) {
-    FILL_GETINFO_RET (char, program->build_log_sz + 1, program->build_log, CL_SUCCESS);
-    if (param_value_size_ret)
-      *param_value_size_ret = program->build_log_sz + 1;
-  }else if (param_name == CL_PROGRAM_BINARY_TYPE){
-    FILL_GETINFO_RET (cl_uint, 1, &program->binary_type, CL_SUCCESS);
-  } else {
-    return CL_INVALID_VALUE;
-  }
-    return err;
-clCreateKernel(cl_program   program,
-               const char * kernel_name,
-               cl_int *     errcode_ret)
-  cl_kernel kernel = NULL;
-  cl_int err = CL_SUCCESS;
-  CHECK_PROGRAM (program);
-  if (program->ker_n <= 0) {
-    goto error;
-  }
-  INVALID_VALUE_IF (kernel_name == NULL);
-  kernel = cl_program_create_kernel(program, kernel_name, &err);
-  if (errcode_ret)
-    *errcode_ret = err;
-  return kernel;
-clCreateKernelsInProgram(cl_program      program,
-                         cl_uint         num_kernels,
-                         cl_kernel *     kernels,
-                         cl_uint *       num_kernels_ret)
-  cl_int err = CL_SUCCESS;
-  CHECK_PROGRAM (program);
-  if (program->ker_n <= 0) {
-    goto error;
-  }
-  if (kernels && num_kernels < program->ker_n) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if(num_kernels_ret)
-    *num_kernels_ret = program->ker_n;
-  if(kernels)
-    err = cl_program_create_kernels_in_program(program, kernels);
-  return err;
-clRetainKernel(cl_kernel kernel)
-  cl_int err = CL_SUCCESS;
-  CHECK_KERNEL(kernel);
-  cl_kernel_add_ref(kernel);
-  return err;
-clReleaseKernel(cl_kernel kernel)
-  cl_int err = CL_SUCCESS;
-  CHECK_KERNEL(kernel);
-  cl_kernel_delete(kernel);
-  return err;
-clSetKernelArg(cl_kernel     kernel,
-               cl_uint       arg_index,
-               size_t        arg_size,
-               const void *  arg_value)
-  cl_int err = CL_SUCCESS;
-  CHECK_KERNEL(kernel);
-#ifdef HAS_CMRT
-  if (kernel->cmrt_kernel != NULL)
-    err = cmrt_set_kernel_arg(kernel, arg_index, arg_size, arg_value);
-  else
-    err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
-  return err;
-cl_int clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_index, cl_kernel_arg_info param_name,
-        size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-  cl_int err = CL_SUCCESS;
-  CHECK_KERNEL(kernel);
-  if(kernel->program->build_opts == NULL ||
-        strstr(kernel->program->build_opts,"-cl-kernel-arg-info") == NULL ) {
-    goto error;
-  }
-          && param_name != CL_KERNEL_ARG_ACCESS_QUALIFIER
-          && param_name != CL_KERNEL_ARG_TYPE_NAME
-          && param_name != CL_KERNEL_ARG_TYPE_QUALIFIER
-          && param_name != CL_KERNEL_ARG_NAME) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (arg_index >= kernel->arg_n) {
-    goto error;
-  }
-  err = cl_get_kernel_arg_info(kernel, arg_index, param_name, param_value_size,
-          param_value, param_value_size_ret);
-  return err;
-clGetKernelInfo(cl_kernel        kernel,
-                cl_kernel_info   param_name,
-                size_t           param_value_size,
-                void *           param_value,
-                size_t *         param_value_size_ret)
-  cl_int err;
-  CHECK_KERNEL(kernel);
-  if (param_name == CL_KERNEL_CONTEXT) {
-    FILL_GETINFO_RET (cl_context, 1, &kernel->program->ctx, CL_SUCCESS);
-  } else if (param_name == CL_KERNEL_PROGRAM) {
-    FILL_GETINFO_RET (cl_program, 1, &kernel->program, CL_SUCCESS);
-  } else if (param_name == CL_KERNEL_NUM_ARGS) {
-    cl_uint n = kernel->arg_n;
-    FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
-  } else if (param_name == CL_KERNEL_REFERENCE_COUNT) {
-    cl_int ref = kernel->ref_n;
-    FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
-  } else if (param_name == CL_KERNEL_FUNCTION_NAME) {
-    const char * n = cl_kernel_get_name(kernel);
-    FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
-  } else if (param_name == CL_KERNEL_ATTRIBUTES) {
-    const char * n = cl_kernel_get_attributes(kernel);
-    FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
-  } else {
-    return CL_INVALID_VALUE;
-  }
-  return err;
-clGetKernelWorkGroupInfo(cl_kernel                   kernel,
-                         cl_device_id                device,
-                         cl_kernel_work_group_info   param_name,
-                         size_t                      param_value_size,
-                         void *                      param_value,
-                         size_t *                    param_value_size_ret)
-  return cl_get_kernel_workgroup_info(kernel,
-                                      device,
-                                      param_name,
-                                      param_value_size,
-                                      param_value,
-                                      param_value_size_ret);
-clGetKernelSubGroupInfoKHR(cl_kernel                   kernel,
-                          cl_device_id                device,
-                          cl_kernel_work_group_info   param_name,
-                          size_t                      input_value_size,
-                          const void *                input_value,
-                          size_t                      param_value_size,
-                          void *                      param_value,
-                          size_t *                    param_value_size_ret)
-  return cl_get_kernel_subgroup_info(kernel,
-                                     device,
-                                     param_name,
-                                     input_value_size,
-                                     input_value,
-                                     param_value_size,
-                                     param_value,
-                                     param_value_size_ret);
-clWaitForEvents(cl_uint          num_events,
-                const cl_event * event_list)
-  cl_int err = CL_SUCCESS;
-  cl_context ctx = NULL;
-  if(num_events > 0 && event_list)
-    ctx = event_list[0]->ctx;
-  TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx);
-  while(cl_event_wait_events(num_events, event_list, NULL) == CL_ENQUEUE_EXECUTE_DEFER) {
-    usleep(8000);       //sleep 8ms to wait other thread
-  }
-  return err;
-clGetEventInfo(cl_event      event,
-               cl_event_info param_name,
-               size_t        param_value_size,
-               void *        param_value,
-               size_t *      param_value_size_ret)
-  cl_int err = CL_SUCCESS;
-  CHECK_EVENT(event);
-  if (param_name == CL_EVENT_COMMAND_QUEUE) {
-    FILL_GETINFO_RET (cl_command_queue, 1, &event->queue, CL_SUCCESS);
-  } else if (param_name == CL_EVENT_CONTEXT) {
-    FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS);
-  } else if (param_name == CL_EVENT_COMMAND_TYPE) {
-    FILL_GETINFO_RET (cl_command_type, 1, &event->type, CL_SUCCESS);
-  } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
-    cl_event_update_status(event, 0);
-    FILL_GETINFO_RET (cl_int, 1, &event->status, CL_SUCCESS);
-  } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
-    cl_uint ref = event->ref_n;
-    FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
-  } else {
-    return CL_INVALID_VALUE;
-  }
-  return err;
-clCreateUserEvent(cl_context context,
-                  cl_int *   errcode_ret)
-  cl_int err = CL_SUCCESS;
-  cl_event event = NULL;
-  CHECK_CONTEXT(context);
-  TRY_ALLOC(event, cl_event_new(context, NULL, CL_COMMAND_USER, CL_TRUE));
-  if(errcode_ret)
-    *errcode_ret = err;
-  return event;
-clRetainEvent(cl_event  event)
-  cl_int err = CL_SUCCESS;
-  CHECK_EVENT(event);
-  cl_event_add_ref(event);
-  return err;
-clReleaseEvent(cl_event  event)
-  cl_int err = CL_SUCCESS;
-  CHECK_EVENT(event);
-  cl_event_delete(event);
-  return err;
-clSetUserEventStatus(cl_event    event,
-                     cl_int      execution_status)
-  cl_int err = CL_SUCCESS;
-  CHECK_EVENT(event);
-  if(execution_status > CL_COMPLETE) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if(event->status != CL_SUBMITTED) {
-    goto error;
-  }
-  cl_event_set_status(event, execution_status);
-  return err;
-clSetEventCallback(cl_event     event,
-                   cl_int       command_exec_callback_type,
-                   void (CL_CALLBACK * pfn_notify) (cl_event, cl_int, void *),
-                   void *       user_data)
-  cl_int err = CL_SUCCESS;
-  CHECK_EVENT(event);
-  if((pfn_notify == NULL) ||
-    (command_exec_callback_type > CL_SUBMITTED) ||
-    (command_exec_callback_type < CL_COMPLETE)) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data);
-  return err;
-clGetEventProfilingInfo(cl_event             event,
-                        cl_profiling_info    param_name,
-                        size_t               param_value_size,
-                        void *               param_value,
-                        size_t *             param_value_size_ret)
-  cl_int err = CL_SUCCESS;
-  cl_ulong ret_val;
-  CHECK_EVENT(event);
-  cl_event_update_status(event, 0);
-  if (event->type == CL_COMMAND_USER ||
-      !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
-          event->status != CL_COMPLETE) {
-    goto error;
-  }
-  if (param_value && param_value_size < sizeof(cl_ulong)) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (param_name == CL_PROFILING_COMMAND_QUEUED) {
-    ret_val = event->queued_timestamp;
-  } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) {
-    ret_val= event->queued_timestamp + cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[1]);
-  } else if (param_name == CL_PROFILING_COMMAND_START) {
-    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_START);
-    ret_val = event->queued_timestamp + cl_event_get_start_timestamp(event);
-  } else if (param_name == CL_PROFILING_COMMAND_END) {
-    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_END);
-    ret_val =  event->queued_timestamp + cl_event_get_end_timestamp(event);
-  } else {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (err == CL_SUCCESS) {
-    if (param_value)
-      *(cl_ulong*)param_value = ret_val;
-    if (param_value_size_ret)
-      *param_value_size_ret = sizeof(cl_ulong);
-  }
-  return err;
-clFlush(cl_command_queue command_queue)
-  /* have nothing to do now, as currently
-   * clEnqueueNDRangeKernel will flush at
-   * the end of each calling. we may need
-   * to optimize it latter.*/
-  return 0;
-clFinish(cl_command_queue command_queue)
-  cl_int err = CL_SUCCESS;
-  CHECK_QUEUE (command_queue);
-#ifdef HAS_CMRT
-  if (command_queue->cmrt_event != NULL)
-    return cmrt_wait_for_task_finished(command_queue);
-  err = cl_command_queue_finish(command_queue);
-  return err;
-clEnqueueReadBuffer(cl_command_queue command_queue,
-                    cl_mem           buffer,
-                    cl_bool          blocking_read,
-                    size_t           offset,
-                    size_t           size,
-                    void *           ptr,
-                    cl_uint          num_events_in_wait_list,
-                    const cl_event * event_wait_list,
-                    cl_event *       event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, defer_enqueue_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(buffer);
-  if (command_queue->ctx != buffer->ctx) {
-     goto error;
-  }
-  if (!ptr || !size || offset + size > buffer->size) {
-     err = CL_INVALID_VALUE;
-     goto error;
-  }
-  if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
-     goto error;
-  }
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-  data = &defer_enqueue_data;
-  data->type    = EnqueueReadBuffer;
-  data->mem_obj = buffer;
-  data->ptr     = ptr;
-  data->offset  = offset;
-  data->size    = size;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
-  }
-  return err;
-clEnqueueReadBufferRect(cl_command_queue command_queue,
-                        cl_mem           buffer,
-                        cl_bool          blocking_read,
-                        const size_t *   buffer_origin,
-                        const size_t *   host_origin,
-                        const size_t *   region,
-                        size_t           buffer_row_pitch,
-                        size_t           buffer_slice_pitch,
-                        size_t           host_row_pitch,
-                        size_t           host_slice_pitch,
-                        void *           ptr,
-                        cl_uint          num_events_in_wait_list,
-                        const cl_event * event_wait_list,
-                        cl_event *       event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(buffer);
-  if (command_queue->ctx != buffer->ctx) {
-    goto error;
-  }
-  if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
-     goto error;
-  }
-  if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if(buffer_row_pitch == 0)
-    buffer_row_pitch = region[0];
-  if(buffer_slice_pitch == 0)
-    buffer_slice_pitch = region[1] * buffer_row_pitch;
-  if(host_row_pitch == 0)
-    host_row_pitch = region[0];
-  if(host_slice_pitch == 0)
-    host_slice_pitch = region[1] * host_row_pitch;
-  if (buffer_row_pitch < region[0] ||
-      host_row_pitch < region[0]) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
-      (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
-         + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
-         + buffer_origin[0] + region[0] > buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-  data = &no_wait_data;
-  data->type        = EnqueueReadBufferRect;
-  data->mem_obj     = buffer;
-  data->ptr         = ptr;
-  data->origin[0]   = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
-  data->host_origin[0]  = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
-  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
-  data->row_pitch   = buffer_row_pitch;
-  data->slice_pitch = buffer_slice_pitch;
-  data->host_row_pitch   = host_row_pitch;
-  data->host_slice_pitch = host_slice_pitch;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
-  }
- error:
-  return err;
-clEnqueueWriteBuffer(cl_command_queue    command_queue,
-                     cl_mem              buffer,
-                     cl_bool             blocking_write,
-                     size_t              offset,
-                     size_t              size,
-                     const void *        ptr,
-                     cl_uint             num_events_in_wait_list,
-                     const cl_event *    event_wait_list,
-                     cl_event *          event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(buffer);
-  if (command_queue->ctx != buffer->ctx) {
-    goto error;
-  }
-  if (!ptr || !size || offset + size > buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
-    goto error;
-  }
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-  data = &no_wait_data;
-  data->type      = EnqueueWriteBuffer;
-  data->mem_obj   = buffer;
-  data->const_ptr = ptr;
-  data->offset    = offset;
-  data->size      = size;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
-  }
- error:
-  return err;
-clEnqueueWriteBufferRect(cl_command_queue     command_queue,
-                         cl_mem               buffer,
-                         cl_bool              blocking_write,
-                         const size_t *       buffer_origin,
-                         const size_t *       host_origin,
-                         const size_t *       region,
-                         size_t               buffer_row_pitch,
-                         size_t               buffer_slice_pitch,
-                         size_t               host_row_pitch,
-                         size_t               host_slice_pitch,
-                         const void *         ptr,
-                         cl_uint              num_events_in_wait_list,
-                         const cl_event *     event_wait_list,
-                         cl_event *           event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(buffer);
-  if (command_queue->ctx != buffer->ctx) {
-    goto error;
-  }
-  if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
-    goto error;
-  }
-  if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if(buffer_row_pitch == 0)
-    buffer_row_pitch = region[0];
-  if(buffer_slice_pitch == 0)
-    buffer_slice_pitch = region[1] * buffer_row_pitch;
-  if(host_row_pitch == 0)
-    host_row_pitch = region[0];
-  if(host_slice_pitch == 0)
-    host_slice_pitch = region[1] * host_row_pitch;
-  if (buffer_row_pitch < region[0] ||
-      host_row_pitch < region[0]) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
-      (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
-         + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
-         + buffer_origin[0] + region[0] > buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-  data = &no_wait_data;
-  data->type        = EnqueueWriteBufferRect;
-  data->mem_obj     = buffer;
-  data->const_ptr   = ptr;
-  data->origin[0]   = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
-  data->host_origin[0]  = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
-  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
-  data->row_pitch   = buffer_row_pitch;
-  data->slice_pitch = buffer_slice_pitch;
-  data->host_row_pitch   = host_row_pitch;
-  data->host_slice_pitch = host_slice_pitch;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
-  }
-  return err;
-clEnqueueFillImage(cl_command_queue   command_queue,
-                   cl_mem             image,
-                   const void *       fill_color,
-                   const size_t *     porigin,
-                   const size_t *     pregion,
-                   cl_uint            num_events_in_wait_list,
-                   const cl_event *   event_wait_list,
-                   cl_event *         event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_IMAGE(image, src_image);
-  FIXUP_IMAGE_REGION(src_image, pregion, region);
-  FIXUP_IMAGE_ORIGIN(src_image, porigin, origin);
-  if (command_queue->ctx != image->ctx) {
-    goto error;
-  }
-  if (fill_color == NULL) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (!origin || !region || origin[0] + region[0] > src_image->w || origin[1] + region[1] > src_image->h || origin[2] + region[2] > src_image->depth) {
-     err = CL_INVALID_VALUE;
-     goto error;
-  }
-  if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (origin[2] != 0 || region[2] != 1)){
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D && (origin[2] != 0 ||origin[1] != 0 || region[2] != 1 || region[1] != 1)){
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  err = cl_image_fill(command_queue, fill_color, src_image, origin, region);
-  if (err) {
-    goto error;
-  }
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
-  data = &no_wait_data;
-  data->type = EnqueueFillImage;
-  data->queue = command_queue;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    if (event && (*event)->type != CL_COMMAND_USER
-        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
-    }
-    err = cl_command_queue_flush(command_queue);
-  }
-  if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_image", "", command_queue);
-  return 0;
- error:
-  return err;
-clEnqueueFillBuffer(cl_command_queue   command_queue,
-                    cl_mem             buffer,
-                    const void *       pattern,
-                    size_t             pattern_size,
-                    size_t             offset,
-                    size_t             size,
-                    cl_uint            num_events_in_wait_list,
-                    const cl_event *   event_wait_list,
-                    cl_event *         event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
-  int i = 0;
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(buffer);
-  if (command_queue->ctx != buffer->ctx) {
-    goto error;
-  }
-  if (offset + size > buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (pattern == NULL) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  for (i = 0; i < sizeof(valid_sz) / sizeof(size_t); i++) {
-    if (valid_sz[i] == pattern_size)
-      break;
-  }
-  if (i == sizeof(valid_sz) / sizeof(size_t)) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (offset % pattern_size || size % pattern_size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
-  if (err) {
-    goto error;
-  }
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-  data = &no_wait_data;
-  data->type = EnqueueFillBuffer;
-  data->queue = command_queue;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    if (event && (*event)->type != CL_COMMAND_USER
-        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
-    }
-    err = cl_command_queue_flush(command_queue);
-  }
-  if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", "", command_queue);
-  return 0;
- error:
-  return err;
-clEnqueueCopyBuffer(cl_command_queue     command_queue,
-                    cl_mem               src_buffer,
-                    cl_mem               dst_buffer,
-                    size_t               src_offset,
-                    size_t               dst_offset,
-                    size_t               cb,
-                    cl_uint              num_events_in_wait_list,
-                    const cl_event *     event_wait_list,
-                    cl_event *           event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(src_buffer);
-  CHECK_MEM(dst_buffer);
-  if (command_queue->ctx != src_buffer->ctx) {
-    goto error;
-  }
-  if (command_queue->ctx != dst_buffer->ctx) {
-    goto error;
-  }
-  if (src_offset + cb > src_buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (dst_offset + cb > dst_buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  /* Check overlap */
-  if (src_buffer == dst_buffer
-         && (src_offset <= dst_offset && dst_offset <= src_offset + cb - 1)
-         && (dst_offset <= src_offset && src_offset <= dst_offset + cb - 1)) {
-    goto error;
-  }
-  /* Check sub overlap */
-  if (src_buffer->type == CL_MEM_SUBBUFFER_TYPE && dst_buffer->type == CL_MEM_SUBBUFFER_TYPE ) {
-    struct _cl_mem_buffer* src_b = (struct _cl_mem_buffer*)src_buffer;
-    struct _cl_mem_buffer* dst_b = (struct _cl_mem_buffer*)dst_buffer;
-    size_t src_sub_offset = src_b->sub_offset;
-    size_t dst_sub_offset = dst_b->sub_offset;
-    if ((src_offset + src_sub_offset <= dst_offset + dst_sub_offset
-          && dst_offset + dst_sub_offset <= src_offset + src_sub_offset + cb - 1)
-     && (dst_offset + dst_sub_offset <= src_offset + src_sub_offset
-          && src_offset + src_sub_offset <= dst_offset + dst_sub_offset + cb - 1)) {
-      err = CL_MEM_COPY_OVERLAP;
-      goto error;
-    }
-  }
-  err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb);
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
-  data = &no_wait_data;
-  data->type = EnqueueCopyBuffer;
-  data->queue = command_queue;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    if (event && (*event)->type != CL_COMMAND_USER
-            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
-    }
-    err = cl_command_queue_flush(command_queue);
-  }
-  if(b_output_kernel_perf)
-	  time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "", command_queue);
-  return 0;
-  return err;
-clEnqueueCopyBufferRect(cl_command_queue     command_queue,
-                        cl_mem               src_buffer,
-                        cl_mem               dst_buffer,
-                        const size_t *       src_origin,
-                        const size_t *       dst_origin,
-                        const size_t *       region,
-                        size_t               src_row_pitch,
-                        size_t               src_slice_pitch,
-                        size_t               dst_row_pitch,
-                        size_t               dst_slice_pitch,
-                        cl_uint              num_events_in_wait_list,
-                        const cl_event *     event_wait_list,
-                        cl_event *           event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(src_buffer);
-  CHECK_MEM(dst_buffer);
-  if ((command_queue->ctx != src_buffer->ctx) ||
-      (command_queue->ctx != dst_buffer->ctx)) {
-    goto error;
-  }
-  if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if(src_row_pitch == 0)
-    src_row_pitch = region[0];
-  if(src_slice_pitch == 0)
-    src_slice_pitch = region[1] * src_row_pitch;
-  if(dst_row_pitch == 0)
-    dst_row_pitch = region[0];
-  if(dst_slice_pitch == 0)
-    dst_slice_pitch = region[1] * dst_row_pitch;
-  if (src_row_pitch < region[0] ||
-      dst_row_pitch < region[0]) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
-      (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if ((src_origin[2] + region[2] - 1) * src_slice_pitch
-        + (src_origin[1] + region[1] - 1) * src_row_pitch
-        + src_origin[0] + region[0] > src_buffer->size
-      ||(dst_origin[2] + region[2] - 1) * dst_slice_pitch
-          + (dst_origin[1] + region[1] - 1) * dst_row_pitch
-          + dst_origin[0] + region[0] > dst_buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (src_buffer == dst_buffer &&
-      check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
-    goto error;
-  }
-  cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
-                          src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
-  data = &no_wait_data;
-  data->type = EnqueueCopyBufferRect;
-  data->queue = command_queue;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    if (event && (*event)->type != CL_COMMAND_USER
-            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
-    }
-    err = cl_command_queue_flush(command_queue);
-  }
-  if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "", command_queue);
-  return err;
-clEnqueueReadImage(cl_command_queue      command_queue,
-                   cl_mem                mem,
-                   cl_bool               blocking_read,
-                   const size_t *        porigin,
-                   const size_t *        pregion,
-                   size_t                row_pitch,
-                   size_t                slice_pitch,
-                   void *                ptr,
-                   cl_uint               num_events_in_wait_list,
-                   const cl_event *      event_wait_list,
-                   cl_event *            event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_IMAGE(mem, image);
-  FIXUP_IMAGE_REGION(image, pregion, region);
-  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
-  if (command_queue->ctx != mem->ctx) {
-     goto error;
-  }
-  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
-     err = CL_INVALID_VALUE;
-     goto error;
-  }
-  if (!row_pitch)
-    row_pitch = image->bpp*region[0];
-  else if (row_pitch < image->bpp*region[0]) {
-     err = CL_INVALID_VALUE;
-     goto error;
-  }
-  if (image->slice_pitch) {
-    if (!slice_pitch)
-      slice_pitch = row_pitch*region[1];
-    else if (slice_pitch < row_pitch*region[1]) {
-      err = CL_INVALID_VALUE;
-      goto error;
-    }
-  }
-  else if (slice_pitch) {
-     err = CL_INVALID_VALUE;
-     goto error;
-  }
-  if (!ptr) {
-     err = CL_INVALID_VALUE;
-     goto error;
-  }
-  if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
-     goto error;
-  }
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
-  data = &no_wait_data;
-  data->type        = EnqueueReadImage;
-  data->mem_obj     = mem;
-  data->ptr         = ptr;
-  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
-  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
-  data->row_pitch   = row_pitch;
-  data->slice_pitch = slice_pitch;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
-  }
-  return err;
-clEnqueueWriteImage(cl_command_queue     command_queue,
-                    cl_mem               mem,
-                    cl_bool              blocking_write,
-                    const size_t *       porigin,
-                    const size_t *       pregion,
-                    size_t               row_pitch,
-                    size_t               slice_pitch,
-                    const void *         ptr,
-                    cl_uint              num_events_in_wait_list,
-                    const cl_event *     event_wait_list,
-                    cl_event *           event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_IMAGE(mem, image);
-  FIXUP_IMAGE_REGION(image, pregion, region);
-  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
-  if (command_queue->ctx != mem->ctx) {
-    goto error;
-  }
-  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (!row_pitch)
-    row_pitch = image->bpp*region[0];
-  else if (row_pitch < image->bpp*region[0]) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (image->slice_pitch) {
-    if (!slice_pitch)
-      slice_pitch = row_pitch*region[1];
-    else if (slice_pitch < row_pitch*region[1]) {
-      err = CL_INVALID_VALUE;
-      goto error;
-    }
-  }
-  else if (slice_pitch) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (!ptr) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
-    goto error;
-  }
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
-  data = &no_wait_data;
-  data->type        = EnqueueWriteImage;
-  data->mem_obj     = mem;
-  data->const_ptr   = ptr;
-  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
-  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
-  data->row_pitch   = row_pitch;
-  data->slice_pitch = slice_pitch;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
-  }
-  return err;
-clEnqueueCopyImage(cl_command_queue      command_queue,
-                   cl_mem                src_mem,
-                   cl_mem                dst_mem,
-                   const size_t *        psrc_origin,
-                   const size_t *        pdst_origin,
-                   const size_t *        pregion,
-                   cl_uint               num_events_in_wait_list,
-                   const cl_event *      event_wait_list,
-                   cl_event *            event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  cl_bool overlap = CL_TRUE;
-  cl_int i = 0;
-  CHECK_QUEUE(command_queue);
-  CHECK_IMAGE(src_mem, src_image);
-  CHECK_IMAGE(dst_mem, dst_image);
-  FIXUP_IMAGE_REGION(src_image, pregion, region);
-  FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
-  FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
-  if (command_queue->ctx != src_mem->ctx ||
-      command_queue->ctx != dst_mem->ctx) {
-    goto error;
-  }
-  if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order ||
-      src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) {
-    goto error;
-  }
-  if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
-      src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
-      dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) ||
-      (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (src_image == dst_image) {
-    for(i = 0; i < 3; i++)
-      overlap = overlap && (src_origin[i] < dst_origin[i] + region[i])
-                        && (dst_origin[i] < src_origin[i] + region[i]);
-    if(overlap == CL_TRUE) {
-      err = CL_MEM_COPY_OVERLAP;
-      goto error;
-    }
-  }
-  cl_mem_kernel_copy_image(command_queue, src_image, dst_image, src_origin, dst_origin, region);
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
-  data = &no_wait_data;
-  data->type = EnqueueCopyImage;
-  data->queue = command_queue;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    if (event && (*event)->type != CL_COMMAND_USER
-            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
-    }
-    err = cl_command_queue_flush(command_queue);
-  }
-  if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "", command_queue);
-  return err;
-clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
-                           cl_mem            src_mem,
-                           cl_mem            dst_buffer,
-                           const size_t *    psrc_origin,
-                           const size_t *    pregion,
-                           size_t            dst_offset,
-                           cl_uint           num_events_in_wait_list,
-                           const cl_event *  event_wait_list,
-                           cl_event *        event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_IMAGE(src_mem, src_image);
-  CHECK_MEM(dst_buffer);
-  FIXUP_IMAGE_REGION(src_image, pregion, region);
-  FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
-  if (command_queue->ctx != src_mem->ctx ||
-      command_queue->ctx != dst_buffer->ctx) {
-    goto error;
-  }
-  if (dst_offset + region[0]*region[1]*region[2]*src_image->bpp > dst_buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
-      src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  cl_mem_copy_image_to_buffer(command_queue, src_image, dst_buffer, src_origin, dst_offset, region);
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
-  data = &no_wait_data;
-  data->type = EnqueueCopyImageToBuffer;
-  data->queue = command_queue;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    if (event && (*event)->type != CL_COMMAND_USER
-            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
-    }
-    err = cl_command_queue_flush(command_queue);
-  }
-  if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "", command_queue);
-  return err;
-clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
-                           cl_mem            src_buffer,
-                           cl_mem            dst_mem,
-                           size_t            src_offset,
-                           const size_t *    pdst_origin,
-                           const size_t *    pregion,
-                           cl_uint           num_events_in_wait_list,
-                           const cl_event *  event_wait_list,
-                           cl_event *        event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(src_buffer);
-  CHECK_IMAGE(dst_mem, dst_image);
-  FIXUP_IMAGE_REGION(dst_image, pregion, region);
-  FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
-  if (command_queue->ctx != src_buffer->ctx ||
-      command_queue->ctx != dst_mem->ctx) {
-    goto error;
-  }
-  if (src_offset + region[0]*region[1]*region[2]*dst_image->bpp > src_buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
-      dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  cl_mem_copy_buffer_to_image(command_queue, src_buffer, dst_image, src_offset, dst_origin, region);
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, dst_mem->ctx);
-  data = &no_wait_data;
-  data->type = EnqueueCopyBufferToImage;
-  data->queue = command_queue;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    if (event && (*event)->type != CL_COMMAND_USER
-            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
-    }
-    err = cl_command_queue_flush(command_queue);
-  }
-  if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "", command_queue);
-  return err;
-static cl_int _cl_map_mem(cl_mem mem, void *ptr, void **mem_ptr,
-                          size_t offset, size_t size,
-                          const size_t *origin, const size_t *region)
-  cl_int slot = -1;
-  int err = CL_SUCCESS;
-  size_t sub_offset = 0;
-  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
-    struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
-    sub_offset = buffer->sub_offset;
-  }
-  ptr = (char*)ptr + offset + sub_offset;
-  if(mem->flags & CL_MEM_USE_HOST_PTR) {
-    assert(mem->host_ptr);
-    //only calc ptr here, will do memcpy in enqueue
-    *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
-  } else {
-    *mem_ptr = ptr;
-  }
-  /* Record the mapped address. */
-  if (!mem->mapped_ptr_sz) {
-    mem->mapped_ptr_sz = 16;
-    mem->mapped_ptr = (cl_mapped_ptr *)malloc(
-          sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
-    if (!mem->mapped_ptr) {
-      cl_mem_unmap_auto(mem);
-      err = CL_OUT_OF_HOST_MEMORY;
-      goto error;
-    }
-    memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
-    slot = 0;
-  } else {
-   int i = 0;
-    for (; i < mem->mapped_ptr_sz; i++) {
-      if (mem->mapped_ptr[i].ptr == NULL) {
-        slot = i;
-        break;
-      }
-   }
-    if (i == mem->mapped_ptr_sz) {
-      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
-          sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
-      if (!new_ptr) {
-        cl_mem_unmap_auto(mem);
-        err = CL_OUT_OF_HOST_MEMORY;
-        goto error;
-      }
-      memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
-      memcpy(new_ptr, mem->mapped_ptr,
-             mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
-      slot = mem->mapped_ptr_sz;
-      mem->mapped_ptr_sz *= 2;
-      free(mem->mapped_ptr);
-      mem->mapped_ptr = new_ptr;
-    }
-  }
-  assert(slot != -1);
-  mem->mapped_ptr[slot].ptr = *mem_ptr;
-  mem->mapped_ptr[slot].v_ptr = ptr;
-  mem->mapped_ptr[slot].size = size;
-  if(origin) {
-    assert(region);
-    mem->mapped_ptr[slot].origin[0] = origin[0];
-    mem->mapped_ptr[slot].origin[1] = origin[1];
-    mem->mapped_ptr[slot].origin[2] = origin[2];
-    mem->mapped_ptr[slot].region[0] = region[0];
-    mem->mapped_ptr[slot].region[1] = region[1];
-    mem->mapped_ptr[slot].region[2] = region[2];
-  }
-  mem->map_ref++;
-  if (err != CL_SUCCESS)
-    *mem_ptr = NULL;
-  return err;
-void *
-clEnqueueMapBuffer(cl_command_queue  command_queue,
-                   cl_mem            buffer,
-                   cl_bool           blocking_map,
-                   cl_map_flags      map_flags,
-                   size_t            offset,
-                   size_t            size,
-                   cl_uint           num_events_in_wait_list,
-                   const cl_event *  event_wait_list,
-                   cl_event *        event,
-                   cl_int *          errcode_ret)
-  cl_int err = CL_SUCCESS;
-  void *ptr = NULL;
-  void *mem_ptr = NULL;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(buffer);
-  if (command_queue->ctx != buffer->ctx) {
-    goto error;
-  }
-  if (!size || offset + size > buffer->size) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if ((map_flags & CL_MAP_READ &&
-       buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
-       buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
-  {
-    goto error;
-  }
-#ifdef HAS_CMRT
-  if (command_queue->cmrt_event != NULL)
-    cmrt_wait_for_task_finished(command_queue);
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
-  data = &no_wait_data;
-  data->type        = EnqueueMapBuffer;
-  data->mem_obj     = buffer;
-  data->offset      = offset;
-  data->size        = size;
-  data->ptr         = ptr;
-  data->unsync_map  = 1;
-    data->write_map = 1;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    data->unsync_map = 0;
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if (err != CL_SUCCESS)
-      goto error;
-    ptr = data->ptr;
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
-  } else {
-    if (buffer->is_userptr)
-      ptr = buffer->host_ptr;
-    else {
-      if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
-        err = CL_MAP_FAILURE;
-        goto error;
-      }
-    }
-  }
-  err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
-  if (err != CL_SUCCESS)
-    goto error;
-  if (errcode_ret)
-    *errcode_ret = err;
-  return mem_ptr;
-void *
-clEnqueueMapImage(cl_command_queue   command_queue,
-                  cl_mem             mem,
-                  cl_bool            blocking_map,
-                  cl_map_flags       map_flags,
-                  const size_t *     porigin,
-                  const size_t *     pregion,
-                  size_t *           image_row_pitch,
-                  size_t *           image_slice_pitch,
-                  cl_uint            num_events_in_wait_list,
-                  const cl_event *   event_wait_list,
-                  cl_event *         event,
-                  cl_int *           errcode_ret)
-  cl_int err = CL_SUCCESS;
-  void *ptr  = NULL;
-  void *mem_ptr = NULL;
-  size_t offset = 0;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_IMAGE(mem, image);
-  FIXUP_IMAGE_REGION(image, pregion, region);
-  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
-  if (command_queue->ctx != mem->ctx) {
-    goto error;
-  }
-  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-  if ((map_flags & CL_MAP_READ &&
-       mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
-       mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
-  {
-    goto error;
-  }
-#ifdef HAS_CMRT
-  if (command_queue->cmrt_event != NULL)
-    cmrt_wait_for_task_finished(command_queue);
+  INVALID_VALUE_IF (num_input_programs == 0 && input_programs == NULL);
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
-  data = &no_wait_data;
-  data->type        = EnqueueMapImage;
-  data->mem_obj     = mem;
-  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
-  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
-  data->ptr         = ptr;
-  data->unsync_map  = 1;
-    data->write_map = 1;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    data->unsync_map = 0;
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if (err != CL_SUCCESS)
-      goto error;
-    ptr = data->ptr;
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
-  } else {
-    if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
-      err = CL_MAP_FAILURE;
-      goto error;
-    }
-  }
+  program = cl_program_link(context, num_input_programs, input_programs, options, &err);
-  if(mem->flags & CL_MEM_USE_HOST_PTR) {
-    if (image_slice_pitch)
-      *image_slice_pitch = image->host_slice_pitch;
-    *image_row_pitch = image->host_row_pitch;
+  if(program) program->is_built = CL_TRUE;
-    offset = image->bpp*origin[0] + image->host_row_pitch*origin[1] + image->host_slice_pitch*origin[2];
-  } else {
-    if (image_slice_pitch)
-      *image_slice_pitch = image->slice_pitch;
-    if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
-      *image_row_pitch = image->slice_pitch;
-    else
-      *image_row_pitch = image->row_pitch;
-    offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
-  }
-  err = _cl_map_mem(mem, ptr, &mem_ptr, offset, 0, origin, region);
+  if (pfn_notify) pfn_notify(program, user_data);
   if (errcode_ret)
     *errcode_ret = err;
-  return mem_ptr; //TODO: map and unmap first
+  return program;
-clEnqueueUnmapMemObject(cl_command_queue  command_queue,
-                        cl_mem            memobj,
-                        void *            mapped_ptr,
-                        cl_uint           num_events_in_wait_list,
-                        const cl_event *  event_wait_list,
-                        cl_event *        event)
-  cl_int err = CL_SUCCESS;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
-  CHECK_MEM(memobj);
-  if (command_queue->ctx != memobj->ctx) {
-    goto error;
-  }
+  return CL_SUCCESS;
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, memobj->ctx);
+clUnloadPlatformCompiler(cl_platform_id platform)
+  return CL_SUCCESS;
-  data = &no_wait_data;
-  data->type        = EnqueueUnmapMemObject;
-  data->mem_obj     = memobj;
-  data->ptr         = mapped_ptr;
+clCreateKernel(cl_program   program,
+               const char * kernel_name,
+               cl_int *     errcode_ret)
+  cl_kernel kernel = NULL;
+  cl_int err = CL_SUCCESS;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  CHECK_PROGRAM (program);
+  if (program->ker_n <= 0) {
+    goto error;
+  INVALID_VALUE_IF (kernel_name == NULL);
+  kernel = cl_program_create_kernel(program, kernel_name, &err);
-  return err;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return kernel;
-clEnqueueMigrateMemObjects(cl_command_queue        command_queue,
-                           cl_uint                 num_mem_objects,
-                           const cl_mem *          mem_objects,
-                           cl_mem_migration_flags  flags,
-                           cl_uint                 num_events_in_wait_list,
-                           const cl_event *        event_wait_list,
-                           cl_event *              event)
+clCreateKernelsInProgram(cl_program      program,
+                         cl_uint         num_kernels,
+                         cl_kernel *     kernels,
+                         cl_uint *       num_kernels_ret)
-  /* So far, we just support 1 device and no subdevice. So all the command queues
-     belong to the small context. There is no need to migrate the mem objects by now. */
   cl_int err = CL_SUCCESS;
-  cl_uint i = 0;
-  enqueue_data *data, defer_enqueue_data = { 0 };
-    CHECK_QUEUE(command_queue);
-  if (num_mem_objects == 0 || mem_objects == NULL) {
-    err = CL_INVALID_VALUE;
+  CHECK_PROGRAM (program);
+  if (program->ker_n <= 0) {
     goto error;
-  if (flags && flags & ~(CL_MIGRATE_MEM_OBJECT_HOST |
-                         CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)) {
+  if (kernels && num_kernels < program->ker_n) {
     err = CL_INVALID_VALUE;
     goto error;
-  for (i = 0; i < num_mem_objects; i++) {
-    CHECK_MEM(mem_objects[i]);
-    if (mem_objects[i]->ctx != command_queue->ctx) {
-      err = CL_INVALID_CONTEXT;
-      goto error;
-    }
-  }
-  /* really nothing to do, fill the event. */
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
-  data = &defer_enqueue_data;
-  data->type = EnqueueMigrateMemObj;
+  if(num_kernels_ret)
+    *num_kernels_ret = program->ker_n;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
-  }
+  if(kernels)
+    err = cl_program_create_kernels_in_program(program, kernels);
   return err;
-clEnqueueNDRangeKernel(cl_command_queue  command_queue,
-                       cl_kernel         kernel,
-                       cl_uint           work_dim,
-                       const size_t *    global_work_offset,
-                       const size_t *    global_work_size,
-                       const size_t *    local_work_size,
-                       cl_uint           num_events_in_wait_list,
-                       const cl_event *  event_wait_list,
-                       cl_event *        event)
+clRetainKernel(cl_kernel kernel)
-  size_t fixed_global_off[] = {0,0,0};
-  size_t fixed_global_sz[] = {1,1,1};
-  size_t fixed_local_sz[] = {1,1,1};
   cl_int err = CL_SUCCESS;
-  cl_uint i;
-  enqueue_data *data, no_wait_data = { 0 };
-  CHECK_QUEUE(command_queue);
+  cl_kernel_add_ref(kernel);
+  return err;
-  /* Check number of dimensions we have */
-  if (UNLIKELY(work_dim == 0 || work_dim > 3)) {
-    goto error;
-  }
-  /* We need a work size per dimension */
-  if (UNLIKELY(global_work_size == NULL)) {
-    goto error;
-  }
-  if (kernel->vme) {
-    if (work_dim != 2) {
-      goto error;
-    }
-    if (local_work_size != NULL) {
-      goto error;
-    }
-  }
-  if (global_work_offset != NULL)
-    for (i = 0; i < work_dim; ++i) {
-      if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
-        goto error;
-      }
-    }
-  /* Local sizes must be non-null and divide global sizes */
-  if (local_work_size != NULL)
-    for (i = 0; i < work_dim; ++i)
-      if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
-        goto error;
-      }
+clReleaseKernel(cl_kernel kernel)
+  cl_int err = CL_SUCCESS;
+  CHECK_KERNEL(kernel);
+  cl_kernel_delete(kernel);
+  return err;
-  /* Queue and kernel must share the same context */
-  assert(kernel->program);
-  if (command_queue->ctx != kernel->program->ctx) {
-    goto error;
-  }
+clSetKernelArg(cl_kernel     kernel,
+               cl_uint       arg_index,
+               size_t        arg_size,
+               const void *  arg_value)
+  cl_int err = CL_SUCCESS;
+  CHECK_KERNEL(kernel);
 #ifdef HAS_CMRT
-  if (kernel->cmrt_kernel != NULL) {
-    err = cmrt_enqueue(command_queue, kernel, global_work_size, local_work_size);
-    goto error;
-  }
+  if (kernel->cmrt_kernel != NULL)
+    err = cmrt_set_kernel_arg(kernel, arg_index, arg_size, arg_value);
+  else
-  /* XXX No event right now */
-  //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
-  //FATAL_IF(event_wait_list != NULL, "Events are not supported");
-  //FATAL_IF(event != NULL, "Events are not supported");
-  if (local_work_size != NULL) {
-    for (i = 0; i < work_dim; ++i)
-      fixed_local_sz[i] = local_work_size[i];
-  } else {
-    if (kernel->vme) {
-        fixed_local_sz[0] = 16;
-        fixed_local_sz[1] = 1;
-    } else {
-      uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
-      size_t realGroupSize = 1;
-      for (i = 0; i< work_dim; i++) {
-        for (j = maxDimSize; j > 1; j--) {
-          if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
-            fixed_local_sz[i] = j;
-            maxGroupSize = maxGroupSize /j;
-            maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
-            break;  //choose next work_dim
-          }
-        }
-        realGroupSize *= fixed_local_sz[i];
-      }
-      //in a loop of conformance test (such as test_api repeated_setup_cleanup), in each loop:
-      //create a new context, a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueu kernel
-      //it triggers the following message for many times.
-      //to avoid too many messages, only print it for the first time of the process.
-      //just use static variable since it doesn't matter to print a few times at multi-thread case.
-      static int warn_no_good_localsize = 1;
-      if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
-        warn_no_good_localsize = 0;
-        DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide local_work_size[] explicitly, you can find good values with trial-and-error method.");
-      }
-    }
-  }
-  if (kernel->vme) {
-    fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;
-    fixed_global_sz[1] = (global_work_size[1]+15) / 16;
-  } else {
-    for (i = 0; i < work_dim; ++i)
-      fixed_global_sz[i] = global_work_size[i];
-  }
-  if (global_work_offset != NULL)
-    for (i = 0; i < work_dim; ++i)
-      fixed_global_off[i] = global_work_offset[i];
-  if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
-    if (fixed_local_sz[0] != kernel->compile_wg_sz[0]
-        || fixed_local_sz[1] != kernel->compile_wg_sz[1]
-        || fixed_local_sz[2] != kernel->compile_wg_sz[2])
-    {
-        goto error;
-    }
-  }
-  /* Do device specific checks are enqueue the kernel */
-  err = cl_command_queue_ND_range(command_queue,
-                                  kernel,
-                                  work_dim,
-                                  fixed_global_off,
-                                  fixed_global_sz,
-                                  fixed_local_sz);
-  if(err != CL_SUCCESS)
-    goto error;
-  data = &no_wait_data;
-  data->type = EnqueueNDRangeKernel;
-  data->queue = command_queue;
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
-    if (event && (*event)->type != CL_COMMAND_USER
-            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
-    }
-    err = cl_command_queue_flush(command_queue);
-  }
+    err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
-  if(b_output_kernel_perf)
-  {
-    if(kernel->program->build_opts != NULL)
-      time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
-    else
-      time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue);
-  }
   return err;
-clEnqueueTask(cl_command_queue   command_queue,
-              cl_kernel          kernel,
-              cl_uint            num_events_in_wait_list,
-              const cl_event *   event_wait_list,
-              cl_event *         event)
+clSetKernelArgSVMPointer(cl_kernel kernel,
+                          cl_uint arg_index,
+                          const void *arg_value)
-  const size_t global_size[3] = {1, 0, 0};
-  const size_t local_size[3]  = {1, 0, 0};
+  cl_int err = CL_SUCCESS;
+  CHECK_KERNEL(kernel);
-  return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_size, local_size,
-                                num_events_in_wait_list, event_wait_list, event);
+  err = cl_kernel_set_arg_svm_pointer(kernel, arg_index, arg_value);
+  return err;
-clEnqueueNativeKernel(cl_command_queue   command_queue,
-                      void (*user_func)(void *),
-                      void *             args,
-                      size_t             cb_args,
-                      cl_uint            num_mem_objects,
-                      const cl_mem *     mem_list,
-                      const void **      args_mem_loc,
-                      cl_uint            num_events_in_wait_list,
-                      const cl_event *   event_wait_list,
-                      cl_event *         event)
+clSetKernelExecInfo(cl_kernel kernel,
+                     cl_kernel_exec_info  param_name,
+                     size_t  param_value_size,
+                     const void  *param_value)
   cl_int err = CL_SUCCESS;
-  void *new_args = NULL;
-  enqueue_data *data, no_wait_data = { 0 };
-  cl_int i;
-  if(user_func == NULL ||
-    (args == NULL && cb_args > 0) ||
-    (args == NULL && num_mem_objects ==0) ||
-    (args != NULL && cb_args == 0) ||
-    (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) ||
-    (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) {
+  CHECK_KERNEL(kernel);
+  if((param_name != CL_KERNEL_EXEC_INFO_SVM_PTRS &&
+     param_value == NULL || param_value_size == 0) {
     err = CL_INVALID_VALUE;
     goto error;
-  //Per spec, need copy args
-  if (cb_args)
-  {
-    new_args = malloc(cb_args);
-    if (!new_args)
-    {
-      err = CL_OUT_OF_HOST_MEMORY;
-      goto error;
-    }
-    memcpy(new_args, args, cb_args);
-    for (i=0; i<num_mem_objects; ++i)
-    {
-      CHECK_MEM(mem_list[i]);
-      args_mem_loc[i] = new_args + (args_mem_loc[i] - args);  //change to new args
-    }
-  }
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
-  data = &no_wait_data;
-  data->type        = EnqueueNativeKernel;
-  data->mem_list    = mem_list;
-  data->ptr         = new_args;
-  data->size        = cb_args;
-  data->offset      = (size_t)num_mem_objects;
-  data->const_ptr   = args_mem_loc;
-  data->user_func   = user_func;
-  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(event ? *event : NULL, data);
-    if(event) cl_event_set_status(*event, CL_COMPLETE);
+     *(cl_bool *)param_value == CL_TRUE) {
+    goto error;
+  err = cl_kernel_set_exec_info(kernel, param_value_size, param_value);
   return err;
-clEnqueueMarker(cl_command_queue command_queue,
-    cl_event *event)
+cl_int clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_index, cl_kernel_arg_info param_name,
+        size_t param_value_size, void *param_value, size_t *param_value_size_ret)
   cl_int err = CL_SUCCESS;
-  CHECK_QUEUE(command_queue);
-  if(event == NULL) {
+  CHECK_KERNEL(kernel);
+  if(kernel->program->build_opts == NULL ||
+        strstr(kernel->program->build_opts,"-cl-kernel-arg-info") == NULL ) {
+    goto error;
+  }
+          && param_name != CL_KERNEL_ARG_ACCESS_QUALIFIER
+          && param_name != CL_KERNEL_ARG_TYPE_NAME
+          && param_name != CL_KERNEL_ARG_TYPE_QUALIFIER
+          && param_name != CL_KERNEL_ARG_NAME) {
     err = CL_INVALID_VALUE;
     goto error;
-  cl_event_marker_with_wait_list(command_queue, 0, NULL, event);
+  if (arg_index >= kernel->arg_n) {
+    goto error;
+  }
+  err = cl_get_kernel_arg_info(kernel, arg_index, param_name, param_value_size,
+          param_value, param_value_size_ret);
   return err;
-clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
-    cl_uint num_events_in_wait_list,
-    const cl_event *event_wait_list,
-    cl_event *event)
+clGetKernelWorkGroupInfo(cl_kernel                   kernel,
+                         cl_device_id                device,
+                         cl_kernel_work_group_info   param_name,
+                         size_t                      param_value_size,
+                         void *                      param_value,
+                         size_t *                    param_value_size_ret)
+  return cl_get_kernel_workgroup_info(kernel,
+                                      device,
+                                      param_name,
+                                      param_value_size,
+                                      param_value,
+                                      param_value_size_ret);
+clGetKernelSubGroupInfoKHR(cl_kernel                   kernel,
+                          cl_device_id                device,
+                          cl_kernel_work_group_info   param_name,
+                          size_t                      input_value_size,
+                          const void *                input_value,
+                          size_t                      param_value_size,
+                          void *                      param_value,
+                          size_t *                    param_value_size_ret)
+  return cl_get_kernel_subgroup_info(kernel,
+                                     device,
+                                     param_name,
+                                     input_value_size,
+                                     input_value,
+                                     param_value_size,
+                                     param_value,
+                                     param_value_size_ret);
+clRetainEvent(cl_event  event)
   cl_int err = CL_SUCCESS;
-  CHECK_QUEUE(command_queue);
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+  CHECK_EVENT(event);
+  cl_event_add_ref(event);
-  cl_event_marker_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
   return err;
-clEnqueueWaitForEvents(cl_command_queue  command_queue,
-                       cl_uint           num_events,
-                       const cl_event *  event_list)
+clReleaseEvent(cl_event  event)
   cl_int err = CL_SUCCESS;
-  CHECK_QUEUE(command_queue);
-  err = clWaitForEvents(num_events, event_list);
+  CHECK_EVENT(event);
+  cl_event_delete(event);
   return err;
-clEnqueueBarrier(cl_command_queue  command_queue)
+cl_mem clCreatePipe (cl_context context,
+                     cl_mem_flags flags,
+                     cl_uint pipe_packet_size,
+                     cl_uint pipe_max_packets,
+                     const cl_pipe_properties *properties,
+                     cl_int *errcode_ret)
+  cl_mem mem = NULL;
   cl_int err = CL_SUCCESS;
-  CHECK_QUEUE(command_queue);
+  cl_uint device_max_size = 0;
+  CHECK_CONTEXT (context);
+  if(UNLIKELY((flags & ~(CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS)) != 0)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if(UNLIKELY(properties != NULL)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
-  cl_event_barrier_with_wait_list(command_queue, 0, NULL, NULL);
+  if(UNLIKELY(pipe_packet_size == 0 || pipe_max_packets == 0)) {
+    goto error;
+  }
+  if ((err = cl_get_device_info(context->devices[0],
+                                CL_DEVICE_PIPE_MAX_PACKET_SIZE,
+                                sizeof(device_max_size),
+                                &device_max_size,
+                                NULL)) != CL_SUCCESS) {
+    goto error;
+  }
+  if(UNLIKELY(pipe_packet_size > device_max_size)) {
+    goto error;
+  }
+  if(flags == 0)
+  mem = cl_mem_new_pipe(context, flags, pipe_packet_size, pipe_max_packets, &err);
-  return err;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
-clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
-    cl_uint num_events_in_wait_list,
-    const cl_event *event_wait_list,
-    cl_event *event)
+cl_int clGetPipeInfo (cl_mem pipe,
+                      cl_pipe_info param_name,
+                      size_t param_value_size,
+                      void *param_value,
+                      size_t *param_value_size_ret)
   cl_int err = CL_SUCCESS;
-  CHECK_QUEUE(command_queue);
+  CHECK_MEM(pipe);
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+  err = cl_get_pipe_info(pipe,
+                         param_name,
+                         param_value_size,
+                         param_value,
+                         param_value_size_ret);
-  cl_event_barrier_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
   return err;
@@ -3566,7 +1534,8 @@ clGetAcceleratorInfoINTEL(cl_accelerator_intel           accel,
-    FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&accel->ref_n, CL_SUCCESS);
+    cl_uint ref = CL_OBJECT_GET_REF(accel);
+    FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
   } else if (param_name == CL_ACCELERATOR_CONTEXT_INTEL) {
     FILL_GETINFO_RET (cl_context, 1, &accel->ctx, CL_SUCCESS);
   } else if (param_name == CL_ACCELERATOR_TYPE_INTEL) {
diff --git a/src/cl_api_command_queue.c b/src/cl_api_command_queue.c
new file mode 100644
index 0000000..b1aee12
--- /dev/null
+++ b/src/cl_api_command_queue.c
@@ -0,0 +1,233 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_command_queue.h"
+#include "cl_device_id.h"
+#include "CL/cl.h"
+#include <stdio.h>
+/* Deprecated in 2.0 and later */
+clCreateCommandQueue(cl_context context,
+                     cl_device_id device,
+                     cl_command_queue_properties properties,
+                     cl_int *errcode_ret)
+  cl_command_queue queue = NULL;
+  cl_int err = CL_SUCCESS;
+  do {
+    if (!CL_OBJECT_IS_CONTEXT(context)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    err = cl_devices_list_include_check(context->device_num, context->devices, 1, &device);
+    if (err)
+      break;
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { /*not supported now.*/
+      break;
+    }
+    queue = cl_create_command_queue(context, device, properties, 0, &err);
+  } while (0);
+  if (errcode_ret)
+    *errcode_ret = err;
+  return queue;
+/* New in 2.0: API for creating a command queue. */
+clCreateCommandQueueWithProperties(cl_context context,
+                                   cl_device_id device,
+                                   const cl_queue_properties *properties,
+                                   cl_int *errcode_ret)
+  cl_command_queue queue = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_command_queue_properties prop = 0xFFFFFFFF;
+  cl_uint queue_sz = 0xFFFFFFFF;
+  do {
+    if (!CL_OBJECT_IS_CONTEXT(context)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    err = cl_devices_list_include_check(context->device_num, context->devices, 1, &device);
+    if (err)
+      break;
+    if (properties) {
+      cl_ulong que_type;
+      cl_ulong que_val;
+      cl_uint i;
+      for (i = 0; (que_type = properties[i++]) != 0; i++) {
+        que_val = properties[i];
+        switch (que_type) {
+        case CL_QUEUE_PROPERTIES:
+          if (prop != 0xFFFFFFFF)
+            err = CL_INVALID_VALUE;
+          else {
+            switch (que_val) {
+            case 0:
+            case CL_QUEUE_PROFILING_ENABLE:
+            case CL_QUEUE_PROFILING_ENABLE |
+            case CL_QUEUE_PROFILING_ENABLE |
+            case CL_QUEUE_PROFILING_ENABLE |
+              prop = que_val;
+              break;
+            default:
+              err = CL_INVALID_VALUE;
+              break;
+            }
+          }
+          break;
+        case CL_QUEUE_SIZE:
+          queue_sz = que_val;
+          break;
+        default:
+          err = CL_INVALID_VALUE;
+          break;
+        }
+      }
+      if (err) /* break the while and return some err. */
+        break;
+    }
+    /* Set some paramters to default val. */
+    if (prop == 0xFFFFFFFF)
+      prop = 0;
+    if (queue_sz != 0xFFFFFFFF)
+      if (!(prop & CL_QUEUE_ON_DEVICE)) {
+        err = CL_INVALID_VALUE;
+        break;
+      }
+    if (queue_sz == 0xFFFFFFFF)
+      queue_sz = device->queue_on_device_preferred_size;
+    if (queue_sz > device->queue_on_device_max_size) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    queue = cl_create_command_queue(context, device, prop, queue_sz, &err);
+  } while (0);
+  if (errcode_ret)
+    *errcode_ret = err;
+  return queue;
+clGetCommandQueueInfo(cl_command_queue command_queue,
+                      cl_command_queue_info param_name,
+                      size_t param_value_size,
+                      void *param_value,
+                      size_t *param_value_size_ret)
+  const void *src_ptr = NULL;
+  size_t src_size = 0;
+  cl_int ref;
+  if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+  }
+  if (param_name == CL_QUEUE_CONTEXT) {
+    src_ptr = &command_queue->ctx;
+    src_size = sizeof(cl_context);
+  } else if (param_name == CL_QUEUE_DEVICE) {
+    src_ptr = &command_queue->device;
+    src_size = sizeof(cl_device_id);
+  } else if (param_name == CL_QUEUE_REFERENCE_COUNT) {
+    ref = CL_OBJECT_GET_REF(command_queue);
+    src_ptr = &ref;
+    src_size = sizeof(cl_int);
+  } else if (param_name == CL_QUEUE_PROPERTIES) {
+    src_ptr = &command_queue->props;
+    src_size = sizeof(cl_command_queue_properties);
+  } else if (param_name == CL_QUEUE_SIZE) {
+    src_ptr = &command_queue->size;
+    src_size = sizeof(command_queue->size);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
+clFlush(cl_command_queue command_queue)
+  if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+  }
+  return cl_command_queue_wait_flush(command_queue);
+clFinish(cl_command_queue command_queue)
+  if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+  }
+  return cl_command_queue_wait_finish(command_queue);
+clRetainCommandQueue(cl_command_queue command_queue)
+  if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+  }
+  cl_command_queue_add_ref(command_queue);
+  return CL_SUCCESS;
+clReleaseCommandQueue(cl_command_queue command_queue)
+  if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+  }
+  cl_command_queue_wait_flush(command_queue);
+  cl_command_queue_delete(command_queue);
+  return CL_SUCCESS;
diff --git a/src/cl_api_context.c b/src/cl_api_context.c
new file mode 100644
index 0000000..e8184b1
--- /dev/null
+++ b/src/cl_api_context.c
@@ -0,0 +1,174 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_context.h"
+#include "cl_device_id.h"
+#include "cl_alloc.h"
+clCreateContext(const cl_context_properties *properties,
+                cl_uint num_devices,
+                const cl_device_id *devices,
+                void (*pfn_notify)(const char *, const void *, size_t, void *),
+                void *user_data,
+                cl_int *errcode_ret)
+  cl_int err = CL_SUCCESS;
+  cl_context context = NULL;
+  do {
+    /* Assure parameters correctness */
+    if (devices == NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (num_devices == 0) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (pfn_notify == NULL && user_data != NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    err = cl_devices_list_check(num_devices, devices);
+    if (err != CL_SUCCESS)
+      break;
+    context = cl_create_context(properties, num_devices, devices, pfn_notify, user_data, &err);
+  } while (0);
+  if (errcode_ret)
+    *errcode_ret = err;
+  return context;
+clCreateContextFromType(const cl_context_properties *properties,
+                        cl_device_type device_type,
+                        void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+                        void *user_data,
+                        cl_int *errcode_ret)
+  cl_context context = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_device_id *devices = NULL;
+  cl_uint num_devices = 0;
+  const cl_device_type valid_type = CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_ACCELERATOR |
+                                    CL_DEVICE_TYPE_DEFAULT | CL_DEVICE_TYPE_CUSTOM;
+  do {
+    /* Assure parameters correctness */
+    if (pfn_notify == NULL && user_data != NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if ((device_type & valid_type) == 0) {
+      break;
+    }
+    /* Get the devices num first. */
+    err = cl_get_device_ids(NULL, device_type, 0, NULL, &num_devices);
+    if (err != CL_SUCCESS)
+      break;
+    assert(num_devices > 0);
+    devices = cl_malloc(num_devices * sizeof(cl_device_id));
+    err = cl_get_device_ids(NULL, device_type, num_devices, &devices[0], &num_devices);
+    if (err != CL_SUCCESS)
+      break;
+    context = cl_create_context(properties, num_devices, devices, pfn_notify, user_data, &err);
+  } while (0);
+  if (devices)
+    cl_free(devices);
+  if (errcode_ret)
+    *errcode_ret = err;
+  return context;
+clRetainContext(cl_context context)
+  if (!CL_OBJECT_IS_CONTEXT(context)) {
+  }
+  cl_context_add_ref(context);
+  return CL_SUCCESS;
+clReleaseContext(cl_context context)
+  if (!CL_OBJECT_IS_CONTEXT(context)) {
+  }
+  cl_context_delete(context);
+  return CL_SUCCESS;
+clGetContextInfo(cl_context context,
+                 cl_context_info param_name,
+                 size_t param_value_size,
+                 void *param_value,
+                 size_t *param_value_size_ret)
+  const void *src_ptr = NULL;
+  size_t src_size = 0;
+  cl_uint n, ref;
+  cl_context_properties p;
+  if (!CL_OBJECT_IS_CONTEXT(context)) {
+  }
+  if (param_name == CL_CONTEXT_DEVICES) {
+    src_ptr = context->devices;
+    src_size = sizeof(cl_device_id) * context->device_num;
+  } else if (param_name == CL_CONTEXT_NUM_DEVICES) {
+    n = context->device_num;
+    src_ptr = &n;
+    src_size = sizeof(cl_uint);
+  } else if (param_name == CL_CONTEXT_REFERENCE_COUNT) {
+    ref = CL_OBJECT_GET_REF(context);
+    src_ptr = &ref;
+    src_size = sizeof(cl_uint);
+  } else if (param_name == CL_CONTEXT_PROPERTIES) {
+    if (context->prop_len > 0) {
+      src_ptr = context->prop_user;
+      src_size = sizeof(cl_context_properties) * context->prop_len;
+    } else {
+      p = 0;
+      src_ptr = &p;
+      src_size = sizeof(cl_context_properties);
+    }
+  } else {
+    return CL_INVALID_VALUE;
+  }
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
diff --git a/src/cl_api_device_id.c b/src/cl_api_device_id.c
new file mode 100644
index 0000000..4ffef78
--- /dev/null
+++ b/src/cl_api_device_id.c
@@ -0,0 +1,90 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_device_id.h"
+#include "cl_platform_id.h"
+clGetDeviceIDs(cl_platform_id platform,
+               cl_device_type device_type,
+               cl_uint num_entries,
+               cl_device_id *devices,
+               cl_uint *num_devices)
+  const cl_device_type valid_type = CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU |
+                                    CL_DEVICE_TYPE_ACCELERATOR | CL_DEVICE_TYPE_DEFAULT |
+                                    CL_DEVICE_TYPE_CUSTOM;
+  /* Check parameter consistency */
+  if (UNLIKELY(devices == NULL && num_devices == NULL))
+    return CL_INVALID_VALUE;
+  if (UNLIKELY(platform && platform != cl_get_platform_default()))
+  if (UNLIKELY(devices && num_entries == 0))
+    return CL_INVALID_VALUE;
+  if ((device_type & valid_type) == 0)
+  return cl_get_device_ids(platform, device_type, num_entries, devices, num_devices);
+clGetDeviceInfo(cl_device_id device,
+                cl_device_info param_name,
+                size_t param_value_size,
+                void *param_value,
+                size_t *param_value_size_ret)
+  if (!CL_OBJECT_IS_DEVICE(device)) {
+    return CL_INVALID_DEVICE;
+  }
+  return cl_get_device_info(device, param_name, param_value_size,
+                            param_value, param_value_size_ret);
+clRetainDevice(cl_device_id device)
+  // XXX stub for C++ Bindings
+  return CL_SUCCESS;
+clReleaseDevice(cl_device_id device)
+  // XXX stub for C++ Bindings
+  return CL_SUCCESS;
+clCreateSubDevices(cl_device_id in_device,
+                   const cl_device_partition_property *properties,
+                   cl_uint num_devices,
+                   cl_device_id *out_devices,
+                   cl_uint *num_devices_ret)
+  /* Check parameter consistency */
+  if (UNLIKELY(out_devices == NULL && num_devices_ret == NULL))
+    return CL_INVALID_VALUE;
+  if (UNLIKELY(in_device == NULL && properties == NULL))
+    return CL_INVALID_VALUE;
+  *num_devices_ret = 0;
diff --git a/src/cl_api_event.c b/src/cl_api_event.c
new file mode 100644
index 0000000..5f3a116
--- /dev/null
+++ b/src/cl_api_event.c
@@ -0,0 +1,330 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_event.h"
+#include "cl_context.h"
+#include "cl_command_queue.h"
+#include "CL/cl.h"
+#include <stdio.h>
+clCreateUserEvent(cl_context context,
+                  cl_int *errcode_ret)
+  cl_int err = CL_SUCCESS;
+  cl_event event = NULL;
+  do {
+    if (!CL_OBJECT_IS_CONTEXT(context)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    event = cl_event_create(context, NULL, 0, NULL, CL_COMMAND_USER, &err);
+  } while (0);
+  if (errcode_ret)
+    *errcode_ret = err;
+  return event;
+clSetUserEventStatus(cl_event event,
+                     cl_int execution_status)
+  cl_int err = CL_SUCCESS;
+  if (!CL_OBJECT_IS_EVENT(event)) {
+    return CL_INVALID_EVENT;
+  }
+  if (execution_status > CL_COMPLETE) {
+    return CL_INVALID_VALUE;
+  }
+  err = cl_event_set_status(event, execution_status);
+  return err;
+/* 1.1 API, deprecated */
+clEnqueueMarker(cl_command_queue command_queue,
+                cl_event *event)
+  return clEnqueueMarkerWithWaitList(command_queue, 0, NULL, event);
+clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+                            cl_uint num_events_in_wait_list,
+                            const cl_event *event_wait_list,
+                            cl_event *event)
+  cl_int err = CL_SUCCESS;
+  cl_event e = NULL;
+  cl_int e_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (event == NULL) { /* Create a anonymous event, it can not be waited on and useless. */
+      return CL_SUCCESS;
+    }
+    e = cl_event_create_marker_or_barrier(command_queue, num_events_in_wait_list,
+                                          event_wait_list, CL_FALSE, &err);
+    if (err != CL_SUCCESS) {
+      return err;
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happend, cancel.
+      break;
+    } else if (e_status == CL_COMPLETE) {
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+/* 1.1 API, deprecated */
+clEnqueueBarrier(cl_command_queue command_queue)
+  return clEnqueueBarrierWithWaitList(command_queue, 0, NULL, NULL);
+clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
+                             cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list,
+                             cl_event *event)
+  cl_int err = CL_SUCCESS;
+  cl_event e = NULL;
+  cl_int e_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create_marker_or_barrier(command_queue, num_events_in_wait_list,
+                                          event_wait_list, CL_TRUE, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happend, cancel.
+      break;
+    } else if (e_status == CL_COMPLETE) {
+      cl_command_queue_insert_barrier_event(command_queue, e);
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      /* Already a completed barrier, no need to insert to queue. */
+    } else {
+      cl_command_queue_insert_barrier_event(command_queue, e);
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+clWaitForEvents(cl_uint num_events,
+                const cl_event *event_list)
+  cl_int err = CL_SUCCESS;
+  cl_uint i;
+  if (num_events == 0 || event_list == NULL) {
+    return CL_INVALID_VALUE;
+  }
+  err = cl_event_check_waitlist(num_events, event_list, NULL, NULL);
+  if (err != CL_SUCCESS) {
+    return err;
+  }
+  for (i = 0; i < num_events; i++) {
+    if (cl_event_get_status(event_list[i]) < CL_COMPLETE) {
+      return err;
+    }
+  }
+  err = cl_event_wait_for_events_list(num_events, event_list);
+  return err;
+/* 1.1 API, deprecated */
+clEnqueueWaitForEvents(cl_command_queue command_queue,
+                       cl_uint num_events,
+                       const cl_event *event_list)
+  cl_int err = CL_SUCCESS;
+  if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+  }
+  err = clWaitForEvents(num_events, event_list);
+  return err;
+clSetEventCallback(cl_event event,
+                   cl_int command_exec_callback_type,
+                   void(CL_CALLBACK *pfn_notify)(cl_event, cl_int, void *),
+                   void *user_data)
+  cl_int err = CL_SUCCESS;
+  if (!CL_OBJECT_IS_EVENT(event)) {
+    return CL_INVALID_EVENT;
+  }
+  if ((pfn_notify == NULL) ||
+      (command_exec_callback_type > CL_SUBMITTED) ||
+      (command_exec_callback_type < CL_COMPLETE)) {
+    return CL_INVALID_VALUE;
+  }
+  err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data);
+  return err;
+clGetEventInfo(cl_event event,
+               cl_event_info param_name,
+               size_t param_value_size,
+               void *param_value,
+               size_t *param_value_size_ret)
+  void *src_ptr = NULL;
+  size_t src_size = 0;
+  cl_uint ref;
+  cl_int status;
+  if (!CL_OBJECT_IS_EVENT(event)) {
+    return CL_INVALID_EVENT;
+  }
+  if (param_name == CL_EVENT_COMMAND_QUEUE) {
+    src_ptr = &event->queue;
+    src_size = sizeof(cl_command_queue);
+  } else if (param_name == CL_EVENT_CONTEXT) {
+    src_ptr = &event->ctx;
+    src_size = sizeof(cl_context);
+  } else if (param_name == CL_EVENT_COMMAND_TYPE) {
+    src_ptr = &event->event_type;
+    src_size = sizeof(cl_command_type);
+  } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
+    status = cl_event_get_status(event);
+    src_ptr = &status;
+    src_size = sizeof(cl_int);
+  } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
+    ref = CL_OBJECT_GET_REF(event);
+    src_ptr = &ref;
+    src_size = sizeof(cl_int);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
+clGetEventProfilingInfo(cl_event event,
+                        cl_profiling_info param_name,
+                        size_t param_value_size,
+                        void *param_value,
+                        size_t *param_value_size_ret)
+  cl_ulong ret_val;
+  if (!CL_OBJECT_IS_EVENT(event)) {
+    return CL_INVALID_EVENT;
+  }
+  assert(event->event_type == CL_COMMAND_USER || event->queue != NULL);
+  if (event->event_type == CL_COMMAND_USER ||
+      !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
+      cl_event_get_status(event) != CL_COMPLETE) {
+  }
+  if (param_value && param_value_size < sizeof(cl_ulong)) {
+    return CL_INVALID_VALUE;
+  }
+  if (param_name < CL_PROFILING_COMMAND_QUEUED ||
+      param_name > CL_PROFILING_COMMAND_COMPLETE) {
+    return CL_INVALID_VALUE;
+  }
+  ret_val = event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED];
+  if (ret_val == CL_EVENT_INVALID_TIMESTAMP) {
+    return CL_INVALID_VALUE;
+  }
+  if (param_value)
+    *(cl_ulong *)param_value = ret_val;
+  if (param_value_size_ret)
+    *param_value_size_ret = sizeof(cl_ulong);
+  return CL_SUCCESS;
diff --git a/src/cl_api_kernel.c b/src/cl_api_kernel.c
new file mode 100644
index 0000000..13ea8c0
--- /dev/null
+++ b/src/cl_api_kernel.c
@@ -0,0 +1,422 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_mem.h"
+#include "cl_kernel.h"
+#include "cl_enqueue.h"
+#include "cl_command_queue.h"
+#include "cl_event.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_alloc.h"
+#include "CL/cl.h"
+#include <stdio.h>
+#include <string.h>
+clGetKernelInfo(cl_kernel kernel, /* Query one property of a kernel object, selected by param_name. */
+                cl_kernel_info param_name,
+                size_t param_value_size,
+                void *param_value,
+                size_t *param_value_size_ret)
+  const void *src_ptr = NULL; /* address of the value handed to cl_get_info_helper */
+  size_t src_size = 0; /* size in bytes of that value */
+  const char *str = NULL;
+  cl_int ref;
+  cl_uint n;
+  if (!CL_OBJECT_IS_KERNEL(kernel)) {
+    return CL_INVALID_KERNEL;
+  }
+  if (param_name == CL_KERNEL_CONTEXT) {
+    src_ptr = &kernel->program->ctx; /* the context is read through the owning program */
+    src_size = sizeof(cl_context);
+  } else if (param_name == CL_KERNEL_PROGRAM) {
+    src_ptr = &kernel->program;
+    src_size = sizeof(cl_program);
+  } else if (param_name == CL_KERNEL_NUM_ARGS) {
+    n = kernel->arg_n; /* copied into a cl_uint local so the reported size is sizeof(cl_uint) */
+    src_ptr = &n;
+    src_size = sizeof(cl_uint);
+  } else if (param_name == CL_KERNEL_REFERENCE_COUNT) {
+    ref = CL_OBJECT_GET_REF(kernel);
+    src_ptr = &ref;
+    src_size = sizeof(cl_int);
+  } else if (param_name == CL_KERNEL_FUNCTION_NAME) {
+    str = cl_kernel_get_name(kernel); /* NOTE(review): passed to strlen unchecked - confirm this never returns NULL */
+    src_ptr = str;
+    src_size = strlen(str) + 1; /* +1 so the terminating NUL is included */
+  } else if (param_name == CL_KERNEL_ATTRIBUTES) {
+    str = cl_kernel_get_attributes(kernel); /* NOTE(review): passed to strlen unchecked - confirm this never returns NULL */
+    src_ptr = str;
+    src_size = strlen(str) + 1; /* +1 so the terminating NUL is included */
+  } else {
+    return CL_INVALID_VALUE; /* param_name matched none of the handled queries */
+  }
+  return cl_get_info_helper(src_ptr, src_size, /* helper gets the source value plus the caller's output arguments */
+                            param_value, param_value_size, param_value_size_ret);
+clEnqueueNDRangeKernel(cl_command_queue command_queue, /* Validate an ND-range launch and enqueue it as one or more sub-ranges. */
+                       cl_kernel kernel,
+                       cl_uint work_dim,
+                       const size_t *global_work_offset,
+                       const size_t *global_work_size,
+                       const size_t *local_work_size,
+                       cl_uint num_events_in_wait_list,
+                       const cl_event *event_wait_list,
+                       cl_event *event)
+  size_t fixed_global_off[] = {0, 0, 0}; /* 3-entry copies of the caller's arrays; entries past work_dim keep these defaults */
+  size_t fixed_global_sz[] = {1, 1, 1};
+  size_t fixed_local_sz[] = {1, 1, 1};
+  cl_int err = CL_SUCCESS;
+  cl_uint i;
+  cl_event e = NULL;
+  cl_int event_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break; /* NOTE(review): no error code is assigned before this break in the text shown - confirm against upstream */
+    }
+    if (!CL_OBJECT_IS_KERNEL(kernel)) {
+      err = CL_INVALID_KERNEL;
+      break;
+    }
+    /* Check number of dimensions we have */
+    if (UNLIKELY(work_dim == 0 || work_dim > 3)) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    /* We need a work size per dimension */
+    if (UNLIKELY(global_work_size == NULL)) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    if (kernel->vme) { /* kernels flagged vme must be 2-D and must not pass a local size */
+      if (work_dim != 2) {
+        break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+      }
+      if (local_work_size != NULL) {
+        break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+      }
+    }
+    if (global_work_offset != NULL) {
+      for (i = 0; i < work_dim; ++i) {
+        if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) { /* NOTE(review): the sum is computed in size_t, so it cannot exceed (size_t)-1; this test looks always false */
+          err = CL_INVALID_GLOBAL_OFFSET;
+          break; /* NOTE(review): leaves only the for loop; err may be overwritten by later checks */
+        }
+      }
+    }
+    /* Queue and kernel must share the same context */
+    assert(kernel->program);
+    if (command_queue->ctx != kernel->program->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (local_work_size != NULL) {
+      for (i = 0; i < work_dim; ++i)
+        fixed_local_sz[i] = local_work_size[i];
+    } else {
+      if (kernel->vme) {
+        fixed_local_sz[0] = 16;
+        fixed_local_sz[1] = 1;
+      } else {
+        uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may be too large
+        size_t realGroupSize = 1;
+        for (i = 0; i < work_dim; i++) { /* per dimension: pick the largest divisor of the global size that fits both limits */
+          for (j = maxDimSize; j > 1; j--) {
+            if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+              fixed_local_sz[i] = j;
+              maxGroupSize = maxGroupSize / j; /* remaining budget for the following dimensions */
+              maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+              break; //choose next work_dim
+            }
+          }
+          realGroupSize *= fixed_local_sz[i];
+        }
+        //in a loop of conformance test (such as test_api repeated_setup_cleanup), in each loop:
+        //create a new context, a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueue kernel
+        //it triggers the following message for many times.
+        //to avoid too many messages, only print it for the first time of the process.
+        //just use static variable since it doesn't matter to print a few times at multi-thread case.
+        static int warn_no_good_localsize = 1;
+        if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
+          warn_no_good_localsize = 0;
+          DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide\n"
+                             " local_work_size[] explicitly, you can find good values with\n"
+                             " trial-and-error method.");
+        }
+      }
+    }
+    if (kernel->vme) {
+      fixed_global_sz[0] = (global_work_size[0] + 15) / 16 * 16; /* rounded up to a multiple of 16 */
+      fixed_global_sz[1] = (global_work_size[1] + 15) / 16; /* number of 16-wide units, rounded up */
+    } else {
+      for (i = 0; i < work_dim; ++i)
+        fixed_global_sz[i] = global_work_size[i];
+    }
+    if (global_work_offset != NULL)
+      for (i = 0; i < work_dim; ++i)
+        fixed_global_off[i] = global_work_offset[i];
+    if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) { /* when the kernel carries a non-zero compile_wg_sz, the local size must match it exactly */
+      if (fixed_local_sz[0] != kernel->compile_wg_sz[0] ||
+          fixed_local_sz[1] != kernel->compile_wg_sz[1] ||
+          fixed_local_sz[2] != kernel->compile_wg_sz[2]) {
+        break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+      }
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    int i, j, k; /* this int i hides the function-level cl_uint i for the rest of the do block */
+    const size_t global_wk_sz_div[3] = { /* per dimension: the part of the global size that is a whole multiple of the local size */
+      fixed_global_sz[0] / fixed_local_sz[0] * fixed_local_sz[0],
+      fixed_global_sz[1] / fixed_local_sz[1] * fixed_local_sz[1],
+      fixed_global_sz[2] / fixed_local_sz[2] * fixed_local_sz[2]};
+    const size_t global_wk_sz_rem[3] = { /* per dimension: the left-over work items */
+      fixed_global_sz[0] % fixed_local_sz[0],
+      fixed_global_sz[1] % fixed_local_sz[1],
+      fixed_global_sz[2] % fixed_local_sz[2]};
+    cl_uint count; /* number of sub-enqueues that will be issued: doubles for each dimension with a remainder */
+    count = global_wk_sz_rem[0] ? 2 : 1;
+    count *= global_wk_sz_rem[1] ? 2 : 1;
+    count *= global_wk_sz_rem[2] ? 2 : 1;
+    const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem}; /* index 0 selects the divisible part, 1 the remainder */
+    /* Go through the at most 8 cases and enqueue if there are work items left */
+    for (i = 0; i < 2; i++) {
+      for (j = 0; j < 2; j++) {
+        for (k = 0; k < 2; k++) {
+          size_t global_wk_sz_use[3] = {global_wk_all[k][0], global_wk_all[j][1], global_wk_all[i][2]};
+          size_t global_dim_off[3] = { /* offset of this piece, counted in work-groups of the divisible part */
+            k * global_wk_sz_div[0] / fixed_local_sz[0],
+            j * global_wk_sz_div[1] / fixed_local_sz[1],
+            i * global_wk_sz_div[2] / fixed_local_sz[2]};
+          size_t local_wk_sz_use[3] = { /* remainder pieces use the remainder itself as their local size */
+            k ? global_wk_sz_rem[0] : fixed_local_sz[0],
+            j ? global_wk_sz_rem[1] : fixed_local_sz[1],
+            i ? global_wk_sz_rem[2] : fixed_local_sz[2]};
+          if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0)
+            continue; /* no remainder in some dimension: this combination is empty */
+          e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                              event_wait_list, CL_COMMAND_NDRANGE_KERNEL, &err);
+          if (err != CL_SUCCESS) {
+            break;
+          }
+          /* Do device specific checks and enqueue the kernel */
+          err = cl_command_queue_ND_range(command_queue, kernel, e, work_dim,
+                                          fixed_global_off, global_dim_off, fixed_global_sz,
+                                          global_wk_sz_use, fixed_local_sz, local_wk_sz_use);
+          if (err != CL_SUCCESS) {
+            break;
+          }
+          e->exec_data.mid_event_of_enq = (count > 1); /* every piece except the last one issued is flagged as a mid event */
+          count--;
+          /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+             The finish or Complete status will always be done in queue list. */
+          event_status = cl_event_is_ready(e);
+          if (event_status < CL_COMPLETE) { // Error happened, cancel.
+            break; /* NOTE(review): err is still CL_SUCCESS here as shown, so the outer loops keep running */
+          }
+          err = cl_event_exec(e, (event_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
+          if (err != CL_SUCCESS) {
+            break;
+          }
+          cl_command_queue_enqueue_event(command_queue, e);
+          if (e->exec_data.mid_event_of_enq)
+            cl_event_delete(e); /* mid events are released here; only the last event survives the loops */
+        }
+        if (err != CL_SUCCESS) {
+          break; /* propagate an inner-loop failure outwards */
+        }
+      }
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e; /* hand the last created event to the caller */
+  } else {
+    cl_event_delete(e); /* error, or caller did not ask for an event: drop this function's reference */
+  }
+  return err;
+clEnqueueTask(cl_command_queue command_queue, /* Enqueue a kernel as a single work-item by forwarding to clEnqueueNDRangeKernel. */
+              cl_kernel kernel,
+              cl_uint num_events_in_wait_list,
+              const cl_event *event_wait_list,
+              cl_event *event)
+  const size_t global_size[3] = {1, 0, 0}; /* only entry 0 is meaningful: work_dim passed below is 1 */
+  const size_t local_size[3] = {1, 0, 0};
+  return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, /* work_dim = 1, no global offset */
+                                global_size, local_size,
+                                num_events_in_wait_list, event_wait_list, event);
+/*
+ * clEnqueueNativeKernel: enqueue a host function to run as a command.
+ *
+ * command_queue   - queue the command is attached to.
+ * user_func       - host callback; must not be NULL.
+ * args / cb_args  - argument block and its size in bytes; copied here, so
+ *                   the caller may reuse its block after this call returns.
+ * num_mem_objects, mem_list, args_mem_loc
+ *                 - memory objects referenced by the argument block, and the
+ *                   locations inside `args` that refer to them. Each location
+ *                   is rebased onto the private copy of the block.
+ * num_events_in_wait_list, event_wait_list
+ *                 - events checked by cl_event_check_waitlist and attached
+ *                   to the new event.
+ * event           - optional; receives the event created for this command.
+ *
+ * Returns CL_SUCCESS, CL_INVALID_COMMAND_QUEUE, CL_INVALID_VALUE,
+ * CL_INVALID_MEM_OBJECT, CL_OUT_OF_HOST_MEMORY,
+ * CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, or whatever the wait-list
+ * check, event creation or event execution reports.
+ *
+ * Ownership: once the copies are stored in the event's exec_data the local
+ * pointers are cleared, because deleting the event frees them.
+ */
+cl_int
+clEnqueueNativeKernel(cl_command_queue command_queue,
+                      void (*user_func)(void *),
+                      void *args,
+                      size_t cb_args,
+                      cl_uint num_mem_objects,
+                      const cl_mem *mem_list,
+                      const void **args_mem_loc,
+                      cl_uint num_events_in_wait_list,
+                      const cl_event *event_wait_list,
+                      cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  void *new_args = NULL;
+  void **new_args_mem_loc = NULL;
+  cl_mem *new_mem_list = NULL;
+  cl_uint i;
+  cl_int e_status;
+  cl_event e = NULL;
+  enqueue_data *data = NULL;
+
+  do {
+    /* command_queue->ctx is dereferenced below, so validate it first. */
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+
+    if (user_func == NULL ||
+        (args == NULL && cb_args > 0) ||
+        (args == NULL && num_mem_objects > 0) ||
+        (args != NULL && cb_args == 0) ||
+        (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) ||
+        (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    //Per spec, need copy args
+    if (cb_args) {
+      new_args = cl_malloc(cb_args);
+      if (new_args == NULL) {
+        err = CL_OUT_OF_HOST_MEMORY;
+        break;
+      }
+      memcpy(new_args, args, cb_args);
+
+      /* The two side tables exist only when memory objects are passed;
+         a NULL table with num_mem_objects == 0 is not an allocation failure. */
+      if (num_mem_objects) {
+        new_args_mem_loc = cl_malloc(sizeof(void *) * num_mem_objects);
+        new_mem_list = cl_malloc(sizeof(cl_mem) * num_mem_objects);
+        if (new_args_mem_loc == NULL || new_mem_list == NULL) {
+          err = CL_OUT_OF_HOST_MEMORY;
+          break;
+        }
+        memcpy(new_mem_list, mem_list, sizeof(cl_mem) * num_mem_objects);
+
+        for (i = 0; i < num_mem_objects; ++i) {
+          if (!CL_OBJECT_IS_MEM(mem_list[i])) {
+            err = CL_INVALID_MEM_OBJECT;
+            break;
+          }
+          /* Keep the same byte offset, but inside the copied block. */
+          new_args_mem_loc[i] = (char *)new_args +
+                                ((const char *)args_mem_loc[i] - (const char *)args);
+        }
+        /* The break above only leaves the for loop; stop here so the
+           error is not overwritten by the wait-list check. */
+        if (err != CL_SUCCESS) {
+          break;
+        }
+      }
+    }
+
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_NATIVE_KERNEL, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      /* Report the failure so the copies are freed below and the caller
+         does not receive an event that was never executed. */
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+
+    data = &e->exec_data;
+    data->type = EnqueueNativeKernel;
+    data->mem_list = new_mem_list;
+    data->ptr = new_args;
+    data->size = cb_args;
+    data->offset = (size_t)num_mem_objects;
+    data->const_ptr = new_args_mem_loc;
+    data->user_func = user_func;
+    new_args = NULL;
+    new_mem_list = NULL;
+    new_args_mem_loc = NULL; // Event delete will free them.
+
+    err = cl_event_exec(e, (e_status == CL_COMPLETE ? CL_COMPLETE : CL_QUEUED), CL_FALSE);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    if (e_status != CL_COMPLETE)
+      cl_command_queue_enqueue_event(command_queue, e);
+  } while (0);
+
+  /* Free whatever is still owned locally, i.e. not handed to the event. */
+  if (err != CL_SUCCESS) {
+    if (new_args)
+      cl_free(new_args);
+    if (new_mem_list)
+      cl_free(new_mem_list);
+    if (new_args_mem_loc)
+      cl_free(new_args_mem_loc);
+  }
+
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
new file mode 100644
index 0000000..0d19bf8
--- /dev/null
+++ b/src/cl_api_mem.c
@@ -0,0 +1,2435 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_mem.h"
+#include "cl_enqueue.h"
+#include "cl_command_queue.h"
+#include "cl_event.h"
+#include "CL/cl.h"
+/*
+ * clSetMemObjectDestructorCallback: register pfn_notify on a memory object
+ * through cl_mem_set_destructor_callback.
+ *
+ * memobj     - memory object; must pass CL_OBJECT_IS_MEM.
+ * pfn_notify - callback to register; must not be NULL.
+ * user_data  - opaque pointer forwarded with the callback.
+ *
+ * Returns CL_INVALID_MEM_OBJECT for an invalid memobj, CL_INVALID_VALUE for
+ * a NULL callback, otherwise the result of cl_mem_set_destructor_callback.
+ */
+cl_int
+clSetMemObjectDestructorCallback(cl_mem memobj,
+                                 void(CL_CALLBACK *pfn_notify)(cl_mem, void *),
+                                 void *user_data)
+{
+  /* Braced on purpose: an unbraced, statement-less if would swallow the
+     next check and let an invalid memobj reach the registration call. */
+  if (!CL_OBJECT_IS_MEM(memobj)) {
+    return CL_INVALID_MEM_OBJECT;
+  }
+
+  if (pfn_notify == NULL) {
+    return CL_INVALID_VALUE;
+  }
+
+  return cl_mem_set_destructor_callback(memobj, pfn_notify, user_data);
+}
+clGetMemObjectInfo(cl_mem memobj, /* Query one property of a memory object, selected by param_name. */
+                   cl_mem_info param_name,
+                   size_t param_value_size,
+                   void *param_value,
+                   size_t *param_value_size_ret)
+  const void *src_ptr = NULL; /* address of the value handed to cl_get_info_helper */
+  size_t src_size = 0; /* size in bytes of that value */
+  cl_mem_object_type type;
+  size_t ptr, offset;
+  cl_int ref;
+  cl_mem parent;
+  if (!CL_OBJECT_IS_MEM(memobj)) { /* NOTE(review): body is empty in the text shown, so an invalid memobj is not rejected - confirm against upstream */
+  }
+  switch (param_name) {
+  case CL_MEM_TYPE: {
+    type = cl_get_mem_object_type(memobj);
+    src_ptr = &type;
+    src_size = sizeof(cl_mem_object_type);
+    break;
+  }
+  case CL_MEM_FLAGS:
+    src_ptr = &memobj->flags;
+    src_size = sizeof(cl_mem_flags);
+    break;
+  case CL_MEM_SIZE:
+    src_ptr = &memobj->size;
+    src_size = sizeof(size_t);
+    break;
+  case CL_MEM_HOST_PTR: {
+    ptr = 0;
+    if (memobj->type == CL_MEM_IMAGE_TYPE) {
+      ptr = (size_t)memobj->host_ptr;
+    } else {
+      struct _cl_mem_buffer *buf = (struct _cl_mem_buffer *)memobj;
+      ptr = (size_t)memobj->host_ptr + buf->sub_offset; /* non-image objects add the buffer's sub_offset */
+    }
+    src_ptr = &ptr;
+    src_size = sizeof(size_t);
+    break;
+  }
+    src_ptr = &memobj->is_svm; /* NOTE(review): the case label for this arm is not present in the text shown */
+    src_size = sizeof(memobj->is_svm);
+    break;
+  }
+    src_ptr = &memobj->map_ref; /* NOTE(review): the case label for this arm is not present in the text shown */
+    src_size = sizeof(cl_uint);
+    break;
+    ref = CL_OBJECT_GET_REF(memobj); /* NOTE(review): the case label for this arm is not present in the text shown */
+    src_ptr = &ref;
+    src_size = sizeof(cl_int);
+    break;
+  }
+    src_ptr = &memobj->ctx; /* NOTE(review): the case label for this arm is not present in the text shown */
+    src_size = sizeof(cl_context);
+    break;
+    parent = NULL; /* NOTE(review): the case label for this arm is not present in the text shown */
+    if (memobj->type == CL_MEM_SUBBUFFER_TYPE) {
+      struct _cl_mem_buffer *buf = (struct _cl_mem_buffer *)memobj;
+      parent = (cl_mem)(buf->parent);
+    } else if (memobj->type == CL_MEM_IMAGE_TYPE) {
+      parent = memobj; /* an image object is reported as its own parent here */
+    } else if (memobj->type == CL_MEM_BUFFER1D_IMAGE_TYPE) {
+      struct _cl_mem_buffer1d_image *image_buffer = (struct _cl_mem_buffer1d_image *)memobj;
+      parent = image_buffer->descbuffer;
+    } else
+      parent = NULL;
+    src_ptr = &parent;
+    src_size = sizeof(cl_mem);
+    break;
+  }
+  case CL_MEM_OFFSET: {
+    offset = 0; /* stays 0 unless the object is a sub-buffer */
+    if (memobj->type == CL_MEM_SUBBUFFER_TYPE) {
+      struct _cl_mem_buffer *buf = (struct _cl_mem_buffer *)memobj;
+      offset = buf->sub_offset;
+    }
+    src_ptr = &offset;
+    src_size = sizeof(size_t);
+    break;
+  }
+  default:
+    return CL_INVALID_VALUE; /* param_name matched none of the handled queries */
+  }
+  return cl_get_info_helper(src_ptr, src_size, /* helper gets the source value plus the caller's output arguments */
+                            param_value, param_value_size, param_value_size_ret);
+clGetImageInfo(cl_mem memobj, /* Query one property of an image object, selected by param_name. */
+               cl_image_info param_name,
+               size_t param_value_size,
+               void *param_value,
+               size_t *param_value_size_ret)
+  const void *src_ptr = NULL; /* address of the value handed to cl_get_info_helper */
+  size_t src_size = 0; /* size in bytes of that value */
+  struct _cl_mem_image *image;
+  size_t height, depth, array_sz;
+  cl_uint value;
+  if (!CL_OBJECT_IS_MEM(memobj)) { /* NOTE(review): body is empty in the text shown, so an invalid memobj is not rejected - confirm against upstream */
+  }
+  image = cl_mem_image(memobj);
+  switch (param_name) {
+    src_ptr = &image->fmt; /* NOTE(review): the case label for this arm is not present in the text shown */
+    src_size = sizeof(cl_image_format);
+    break;
+    src_ptr = &image->bpp; /* NOTE(review): the case label for this arm is not present in the text shown */
+    src_size = sizeof(size_t);
+    break;
+    src_ptr = &image->row_pitch; /* NOTE(review): the case label for this arm is not present in the text shown */
+    src_size = sizeof(size_t);
+    break;
+    src_ptr = &image->slice_pitch; /* NOTE(review): the case label for this arm is not present in the text shown */
+    src_size = sizeof(size_t);
+    break;
+    if (memobj->type == CL_MEM_BUFFER1D_IMAGE_TYPE) { /* NOTE(review): the case label for this arm is not present in the text shown */
+      struct _cl_mem_buffer1d_image *buffer1d_image = (struct _cl_mem_buffer1d_image *)image;
+      src_ptr = &buffer1d_image->size; /* buffer-backed 1D images report their size field instead of w */
+    } else {
+      src_ptr = &image->w;
+    }
+    src_size = sizeof(size_t);
+    break;
+  case CL_IMAGE_HEIGHT: {
+    height = 0; /* stays 0 for buffer-backed 1D images and for 1D images */
+    if (memobj->type != CL_MEM_BUFFER1D_IMAGE_TYPE) {
+      height = IS_1D_IMAGE(image) ? 0 : image->h;
+    }
+    src_ptr = &height;
+    src_size = sizeof(size_t);
+    break;
+  }
+  case CL_IMAGE_DEPTH: {
+    depth = 0;
+    depth = IS_3D_IMAGE(image) ? image->depth : 0; /* only 3D images report a non-zero depth */
+    src_ptr = &depth;
+    src_size = sizeof(size_t);
+    break;
+  }
+    array_sz = 0; /* NOTE(review): the case label for this arm is not present in the text shown */
+    array_sz = IS_IMAGE_ARRAY(image) ? image->depth : 0; /* image arrays report image->depth as the array size */
+    src_ptr = &array_sz;
+    src_size = sizeof(size_t);
+    break;
+  }
+    src_ptr = &image->buffer_1d; /* NOTE(review): the case label for this arm is not present in the text shown */
+    src_size = sizeof(cl_mem);
+    break;
+    value = 0; /* NOTE(review): the case label(s) for this arm are not present in the text shown; it always reports 0 */
+    src_ptr = &value;
+    src_size = sizeof(cl_uint);
+    break;
+  }
+  default:
+    return CL_INVALID_VALUE; /* param_name matched none of the handled queries */
+  }
+  return cl_get_info_helper(src_ptr, src_size, /* helper gets the source value plus the caller's output arguments */
+                            param_value, param_value_size, param_value_size_ret);
+void * /* returns the pointer recorded by cl_mem_record_map_mem, or NULL when any step failed */
+clEnqueueMapBuffer(cl_command_queue command_queue, /* Enqueue a map of [offset, offset + size) of a buffer. */
+                   cl_mem buffer,
+                   cl_bool blocking_map,
+                   cl_map_flags map_flags,
+                   size_t offset,
+                   size_t size,
+                   cl_uint num_events_in_wait_list,
+                   const cl_event *event_wait_list,
+                   cl_event *event,
+                   cl_int *errcode_ret)
+  cl_int err = CL_SUCCESS;
+  void *ptr = NULL;
+  void *mem_ptr = NULL; /* return value; only written through cl_mem_record_map_mem below */
+  cl_event e = NULL;
+  cl_int e_status;
+  enqueue_data *data = NULL;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break; /* NOTE(review): no error code is assigned before this break in the text shown - confirm against upstream */
+    }
+    if (!CL_OBJECT_IS_BUFFER(buffer)) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    if (command_queue->ctx != buffer->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (!size || offset + size > buffer->size) { /* NOTE(review): offset + size can wrap in size_t and pass this test */
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if ((map_flags & CL_MAP_READ && /* requested map access must not contradict the buffer's host-access flags */
+         buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+        (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+         buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_MAP_BUFFER, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (blocking_map) {
+      err = cl_event_wait_for_event_ready(e);
+      if (err != CL_SUCCESS)
+        break;
+      /* Blocking call API is a sync point of flush. */
+      err = cl_command_queue_wait_flush(command_queue);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    data = &e->exec_data; /* describe the map request in the event's exec_data */
+    data->type = EnqueueMapBuffer;
+    data->mem_obj = buffer;
+    data->offset = offset;
+    data->size = size;
+    data->ptr = NULL;
+    data->unsync_map = 0;
+      data->write_map = 1; /* NOTE(review): indented like the body of a conditional whose header is not present here; as shown it runs unconditionally */
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_SUBMITTED, CL_TRUE); // Submit to get the address.
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+    ptr = data->ptr; /* expected to have been filled in by cl_event_exec (asserted on the next line) */
+    assert(ptr);
+    err = cl_mem_record_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
+    assert(err == CL_SUCCESS);
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e; /* hand the event to the caller */
+  } else {
+    cl_event_delete(e); /* error, or caller did not ask for an event: drop this function's reference */
+  }
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem_ptr;
+clEnqueueUnmapMemObject(cl_command_queue command_queue, /* Enqueue the unmap of a previously mapped pointer of a memory object. */
+                        cl_mem memobj,
+                        void *mapped_ptr,
+                        cl_uint num_events_in_wait_list,
+                        const cl_event *event_wait_list,
+                        cl_event *event)
+  cl_int err = CL_SUCCESS;
+  cl_int e_status;
+  enqueue_data *data = NULL;
+  cl_event e = NULL;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break; /* NOTE(review): no error code is assigned before this break in the text shown - confirm against upstream */
+    }
+    if (!CL_OBJECT_IS_MEM(memobj)) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    if (command_queue->ctx != memobj->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_UNMAP_MEM_OBJECT, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    data = &e->exec_data; /* describe the unmap request in the event's exec_data */
+    data->type = EnqueueUnmapMemObject;
+    data->mem_obj = memobj;
+    data->ptr = mapped_ptr;
+    if (e_status == CL_COMPLETE) { // No need to wait
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else { // May need to wait some event to complete.
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e; /* hand the event to the caller */
+  } else {
+    cl_event_delete(e); /* error, or caller did not ask for an event: drop this function's reference */
+  }
+  return err;
+clEnqueueReadBuffer(cl_command_queue command_queue, /* Enqueue a read of [offset, offset + size) of a buffer into host memory at ptr. */
+                    cl_mem buffer,
+                    cl_bool blocking_read,
+                    size_t offset,
+                    size_t size,
+                    void *ptr,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event *event_wait_list,
+                    cl_event *event)
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data = NULL;
+  cl_int e_status;
+  cl_event e = NULL;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break; /* NOTE(review): no error code is assigned before this break in the text shown - confirm against upstream */
+    }
+    if (!CL_OBJECT_IS_BUFFER(buffer)) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    if (command_queue->ctx != buffer->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (!ptr || !size || offset + size > buffer->size) { /* NOTE(review): offset + size can wrap in size_t and pass this test */
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { /* buffers flagged host-write-only or host-no-access are not read */
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_READ_BUFFER, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (blocking_read) {
+      err = cl_event_wait_for_event_ready(e);
+      if (err != CL_SUCCESS)
+        break;
+      /* Blocking call API is a sync point of flush. */
+      err = cl_command_queue_wait_flush(command_queue);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    data = &e->exec_data; /* describe the read request in the event's exec_data */
+    data->type = EnqueueReadBuffer;
+    data->mem_obj = buffer;
+    data->ptr = ptr;
+    data->offset = offset;
+    data->size = size;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e; /* hand the event to the caller */
+  } else {
+    cl_event_delete(e); /* error, or caller did not ask for an event: drop this function's reference */
+  }
+  return err;
+clEnqueueWriteBuffer(cl_command_queue command_queue, /* Enqueue a write of size bytes from host memory at ptr into a buffer at offset. */
+                     cl_mem buffer,
+                     cl_bool blocking_write,
+                     size_t offset,
+                     size_t size,
+                     const void *ptr,
+                     cl_uint num_events_in_wait_list,
+                     const cl_event *event_wait_list,
+                     cl_event *event)
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data = NULL;
+  cl_int e_status;
+  cl_event e = NULL;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break; /* NOTE(review): no error code is assigned before this break in the text shown - confirm against upstream */
+    }
+    if (!CL_OBJECT_IS_BUFFER(buffer)) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    if (command_queue->ctx != buffer->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (!ptr || !size || offset + size > buffer->size) { /* NOTE(review): offset + size can wrap in size_t and pass this test */
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) { /* buffers flagged host-read-only or host-no-access are not written */
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_WRITE_BUFFER, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (blocking_write) {
+      err = cl_event_wait_for_event_ready(e);
+      if (err != CL_SUCCESS)
+        break;
+      /* Blocking call API is a sync point of flush. */
+      err = cl_command_queue_wait_flush(command_queue);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    data = &e->exec_data; /* describe the write request in the event's exec_data */
+    data->type = EnqueueWriteBuffer;
+    data->mem_obj = buffer;
+    data->const_ptr = ptr; /* the source is read-only host memory, so it goes in const_ptr */
+    data->offset = offset;
+    data->size = size;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e; /* hand the event to the caller */
+  } else {
+    cl_event_delete(e); /* error, or caller did not ask for an event: drop this function's reference */
+  }
+  return err;
+clEnqueueReadBufferRect(cl_command_queue command_queue, /* Enqueue a read of a 3D region of a buffer into host memory at ptr. */
+                        cl_mem buffer,
+                        cl_bool blocking_read,
+                        const size_t *buffer_origin,
+                        const size_t *host_origin,
+                        const size_t *region,
+                        size_t buffer_row_pitch,
+                        size_t buffer_slice_pitch,
+                        size_t host_row_pitch,
+                        size_t host_slice_pitch,
+                        void *ptr,
+                        cl_uint num_events_in_wait_list,
+                        const cl_event *event_wait_list,
+                        cl_event *event)
+  cl_int err = CL_SUCCESS;
+  size_t total_size = 0;
+  enqueue_data *data = NULL;
+  cl_int e_status;
+  cl_event e = NULL;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      break; /* NOTE(review): no error code is assigned before this break in the text shown - confirm against upstream */
+    }
+    if (!CL_OBJECT_IS_BUFFER(buffer)) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    if (command_queue->ctx != buffer->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { /* buffers flagged host-write-only or host-no-access are not read */
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) { /* NOTE(review): buffer_origin and host_origin are dereferenced below without a NULL check */
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (buffer_row_pitch == 0) /* a pitch of 0 means: derive it from the region */
+      buffer_row_pitch = region[0];
+    if (buffer_slice_pitch == 0)
+      buffer_slice_pitch = region[1] * buffer_row_pitch;
+    if (host_row_pitch == 0)
+      host_row_pitch = region[0];
+    if (host_slice_pitch == 0)
+      host_slice_pitch = region[1] * host_row_pitch;
+    if (buffer_row_pitch < region[0] || /* a row pitch must cover at least one row of the region */
+        host_row_pitch < region[0]) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0) || /* a slice pitch must cover the region's rows and be a multiple of the row pitch */
+        (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    total_size = (buffer_origin[2] + region[2] - 1) * buffer_slice_pitch + /* byte offset just past the last element of the region inside the buffer */
+                 (buffer_origin[1] + region[1] - 1) * buffer_row_pitch + buffer_origin[0] + region[0];
+    if (total_size > buffer->size) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_READ_BUFFER_RECT, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (blocking_read) {
+      err = cl_event_wait_for_event_ready(e);
+      if (err != CL_SUCCESS)
+        break;
+      /* Blocking call API is a sync point of flush. */
+      err = cl_command_queue_wait_flush(command_queue);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      break; /* NOTE(review): err is still CL_SUCCESS here as shown */
+    }
+    data = &e->exec_data; /* describe the rectangular read in the event's exec_data */
+    data->type = EnqueueReadBufferRect;
+    data->mem_obj = buffer;
+    data->ptr = ptr;
+    data->origin[0] = buffer_origin[0];
+    data->origin[1] = buffer_origin[1];
+    data->origin[2] = buffer_origin[2];
+    data->host_origin[0] = host_origin[0];
+    data->host_origin[1] = host_origin[1];
+    data->host_origin[2] = host_origin[2];
+    data->region[0] = region[0];
+    data->region[1] = region[1];
+    data->region[2] = region[2];
+    data->row_pitch = buffer_row_pitch; /* pitches stored are the effective ones, after the zero defaults above */
+    data->slice_pitch = buffer_slice_pitch;
+    data->host_row_pitch = host_row_pitch;
+    data->host_slice_pitch = host_slice_pitch;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e; /* hand the event to the caller */
+  } else {
+    cl_event_delete(e); /* error, or caller did not ask for an event: drop this function's reference */
+  }
+  return err;
+/**
+ * Enqueue a write of a rectangular (2D/3D) region from host memory into a buffer.
+ *
+ * A pitch passed as 0 is replaced by the tightly packed value derived from
+ * region. When blocking_write is set, the call goes through
+ * cl_event_wait_for_event_ready() and cl_command_queue_wait_flush() before the
+ * command is executed.
+ *
+ * @return CL_SUCCESS, or the error code of the first check that failed.
+ *         On success *event (when event is non-NULL) receives the new event;
+ *         in every other case the event is released with cl_event_delete().
+ */
+cl_int
+clEnqueueWriteBufferRect(cl_command_queue command_queue,
+                         cl_mem buffer,
+                         cl_bool blocking_write,
+                         const size_t *buffer_origin,
+                         const size_t *host_origin,
+                         const size_t *region,
+                         size_t buffer_row_pitch,
+                         size_t buffer_slice_pitch,
+                         size_t host_row_pitch,
+                         size_t host_slice_pitch,
+                         const void *ptr,
+                         cl_uint num_events_in_wait_list,
+                         const cl_event *event_wait_list,
+                         cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  size_t total_size = 0;
+  enqueue_data *data = NULL;
+  cl_int e_status;
+  cl_event e = NULL;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_BUFFER(buffer)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    if (command_queue->ctx != buffer->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    /* The host may not write a buffer created read-only or inaccessible for the host. */
+    if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+      err = CL_INVALID_OPERATION;
+      break;
+    }
+    /* Both origins are dereferenced below, so reject NULL here as well. */
+    if (!ptr || !buffer_origin || !host_origin || !region ||
+        region[0] == 0 || region[1] == 0 || region[2] == 0) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A zero pitch means "tightly packed". */
+    if (buffer_row_pitch == 0)
+      buffer_row_pitch = region[0];
+    if (buffer_slice_pitch == 0)
+      buffer_slice_pitch = region[1] * buffer_row_pitch;
+    if (host_row_pitch == 0)
+      host_row_pitch = region[0];
+    if (host_slice_pitch == 0)
+      host_slice_pitch = region[1] * host_row_pitch;
+    if (buffer_row_pitch < region[0] ||
+        host_row_pitch < region[0]) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A slice pitch must hold region[1] rows and be a whole number of rows. */
+    if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0) ||
+        (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* Offset one past the last byte touched inside the buffer. */
+    total_size = (buffer_origin[2] + region[2] - 1) * buffer_slice_pitch +
+                 (buffer_origin[1] + region[1] - 1) * buffer_row_pitch +
+                 buffer_origin[0] + region[0];
+    if (total_size > buffer->size) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_WRITE_BUFFER_RECT, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (blocking_write) {
+      err = cl_event_wait_for_event_ready(e);
+      if (err != CL_SUCCESS)
+        break;
+      /* Blocking call API is a sync point of flush. */
+      err = cl_command_queue_wait_flush(command_queue);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      /* A negative status must not be reported to the caller as success. */
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    data = &e->exec_data;
+    data->type = EnqueueWriteBufferRect;
+    data->mem_obj = buffer;
+    data->const_ptr = ptr;
+    data->origin[0] = buffer_origin[0];
+    data->origin[1] = buffer_origin[1];
+    data->origin[2] = buffer_origin[2];
+    data->host_origin[0] = host_origin[0];
+    data->host_origin[1] = host_origin[1];
+    data->host_origin[2] = host_origin[2];
+    data->region[0] = region[0];
+    data->region[1] = region[1];
+    data->region[2] = region[2];
+    data->row_pitch = buffer_row_pitch;
+    data->slice_pitch = buffer_slice_pitch;
+    data->host_row_pitch = host_row_pitch;
+    data->host_slice_pitch = host_slice_pitch;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/**
+ * Enqueue a copy of cb bytes from src_buffer (starting at src_offset) to
+ * dst_buffer (starting at dst_offset).
+ *
+ * @return CL_SUCCESS, or the error code of the first check that failed;
+ *         CL_MEM_COPY_OVERLAP when source and destination are the same object
+ *         and the two byte ranges intersect.
+ *         On success *event (when event is non-NULL) receives the new event;
+ *         in every other case the event is released with cl_event_delete().
+ */
+cl_int
+clEnqueueCopyBuffer(cl_command_queue command_queue,
+                    cl_mem src_buffer,
+                    cl_mem dst_buffer,
+                    size_t src_offset,
+                    size_t dst_offset,
+                    size_t cb,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event *event_wait_list,
+                    cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  cl_event e = NULL;
+  cl_int e_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_MEM(src_buffer)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    if (!CL_OBJECT_IS_MEM(dst_buffer)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    if (command_queue->ctx != src_buffer->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (command_queue->ctx != dst_buffer->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    /* Bounds checks are phrased so that offset + cb cannot wrap around. */
+    if (cb > src_buffer->size || src_offset > src_buffer->size - cb) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (cb > dst_buffer->size || dst_offset > dst_buffer->size - cb) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* Check overlap: [src_offset, src_offset + cb) and [dst_offset, dst_offset + cb)
+       intersect exactly when each range starts before the other one ends. */
+    if (src_buffer == dst_buffer &&
+        src_offset < dst_offset + cb && dst_offset < src_offset + cb) {
+      err = CL_MEM_COPY_OVERLAP;
+      break;
+    }
+    /* Check sub overlap.
+       NOTE(review): the two halves below are joined with &&, so this only fires
+       when both absolute start offsets are equal, and it does not confirm that
+       the two sub-buffers share a parent. Left unchanged until the parent
+       relationship can be checked -- TODO confirm. */
+    if (src_buffer->type == CL_MEM_SUBBUFFER_TYPE && dst_buffer->type == CL_MEM_SUBBUFFER_TYPE) {
+      struct _cl_mem_buffer *src_b = (struct _cl_mem_buffer *)src_buffer;
+      struct _cl_mem_buffer *dst_b = (struct _cl_mem_buffer *)dst_buffer;
+      size_t src_sub_offset = src_b->sub_offset;
+      size_t dst_sub_offset = dst_b->sub_offset;
+      if ((src_offset + src_sub_offset <= dst_offset + dst_sub_offset &&
+           dst_offset + dst_sub_offset <= src_offset + src_sub_offset + cb - 1) &&
+          (dst_offset + dst_sub_offset <= src_offset + src_sub_offset &&
+           src_offset + src_sub_offset <= dst_offset + dst_sub_offset + cb - 1)) {
+        err = CL_MEM_COPY_OVERLAP;
+        break;
+      }
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_COPY_BUFFER, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = cl_mem_copy(command_queue, e, src_buffer, dst_buffer, src_offset, dst_offset, cb);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+       The finish or Complete status will always be done in queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happened, cancel and report it.
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    cl_command_queue_enqueue_event(command_queue, e);
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/* Overlap test for a rectangular copy inside one buffer. The algorithm follows
+   the sample in the appendix of the OpenCL 1.1 specification: first compare the
+   two boxes axis by axis, then look for rows/slices that run past the pitch and
+   therefore collide in the flattened byte range. Returns CL_TRUE on overlap. */
+static cl_bool
+check_copy_overlap(const size_t src_offset[3],
+                   const size_t dst_offset[3],
+                   const size_t region[3],
+                   size_t row_pitch, size_t slice_pitch)
+{
+  /* Flattened byte ranges of both regions; "end" is start plus the same span. */
+  const size_t span = region[2] * slice_pitch + region[1] * row_pitch + region[0];
+  const size_t src_first = src_offset[2] * slice_pitch +
+                           src_offset[1] * row_pitch + src_offset[0];
+  const size_t dst_first = dst_offset[2] * slice_pitch +
+                           dst_offset[1] * row_pitch + dst_offset[0];
+  const size_t src_last = src_first + span;
+  const size_t dst_last = dst_first + span;
+  const cl_bool linear_hit = (src_first <= dst_first && dst_first < src_last) ||
+                             (dst_first <= src_first && src_first < dst_last);
+  /* How far each row runs past row_pitch (0 when it fits). */
+  const size_t src_x_end = src_offset[0] + region[0];
+  const size_t dst_x_end = dst_offset[0] + region[0];
+  const size_t src_x_spill = (src_x_end > row_pitch) ? src_x_end - row_pitch : 0;
+  const size_t dst_x_spill = (dst_x_end > row_pitch) ? dst_x_end - row_pitch : 0;
+  cl_bool wrapped_hit = CL_FALSE;
+  unsigned axis;
+  /* The boxes intersect only if their extents intersect on all three axes. */
+  for (axis = 0; axis < 3; ++axis) {
+    if (src_offset[axis] >= dst_offset[axis] + region[axis] ||
+        src_offset[axis] + region[axis] <= dst_offset[axis])
+      break;
+  }
+  if (axis == 3)
+    return CL_TRUE;
+  /* Sizes are unsigned, so "spill > offset" already implies "spill > 0". */
+  if ((src_x_spill > dst_offset[0] || dst_x_spill > src_offset[0]) && linear_hit)
+    wrapped_hit = CL_TRUE;
+  if (region[2] > 1) {
+    /* Same test one level up: rows that run past the slice height. */
+    const size_t rows_per_slice = slice_pitch / row_pitch;
+    const size_t src_y_end = src_offset[1] + region[1];
+    const size_t dst_y_end = dst_offset[1] + region[1];
+    const size_t src_y_spill = (src_y_end > rows_per_slice) ? src_y_end - rows_per_slice : 0;
+    const size_t dst_y_spill = (dst_y_end > rows_per_slice) ? dst_y_end - rows_per_slice : 0;
+    if ((src_y_spill > dst_offset[1] || dst_y_spill > src_offset[1]) && linear_hit)
+      wrapped_hit = CL_TRUE;
+  }
+  return wrapped_hit;
+}
+/**
+ * Enqueue a copy of a rectangular (2D/3D) region from src_buffer to dst_buffer.
+ *
+ * A pitch passed as 0 is replaced by the tightly packed value derived from
+ * region. When both arguments are the same object the pitches must match and
+ * the regions must not overlap (see check_copy_overlap()).
+ *
+ * @return CL_SUCCESS, or the error code of the first check that failed.
+ *         On success *event (when event is non-NULL) receives the new event;
+ *         in every other case the event is released with cl_event_delete().
+ */
+cl_int
+clEnqueueCopyBufferRect(cl_command_queue command_queue,
+                        cl_mem src_buffer,
+                        cl_mem dst_buffer,
+                        const size_t *src_origin,
+                        const size_t *dst_origin,
+                        const size_t *region,
+                        size_t src_row_pitch,
+                        size_t src_slice_pitch,
+                        size_t dst_row_pitch,
+                        size_t dst_slice_pitch,
+                        cl_uint num_events_in_wait_list,
+                        const cl_event *event_wait_list,
+                        cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  cl_event e = NULL;
+  size_t total_size = 0;
+  cl_int e_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_MEM(src_buffer)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    if (!CL_OBJECT_IS_MEM(dst_buffer)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    if ((command_queue->ctx != src_buffer->ctx) ||
+        (command_queue->ctx != dst_buffer->ctx)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    /* Both origins are dereferenced below, so reject NULL here as well. */
+    if (!src_origin || !dst_origin || !region ||
+        region[0] == 0 || region[1] == 0 || region[2] == 0) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A zero pitch means "tightly packed". */
+    if (src_row_pitch == 0)
+      src_row_pitch = region[0];
+    if (src_slice_pitch == 0)
+      src_slice_pitch = region[1] * src_row_pitch;
+    if (dst_row_pitch == 0)
+      dst_row_pitch = region[0];
+    if (dst_slice_pitch == 0)
+      dst_slice_pitch = region[1] * dst_row_pitch;
+    if (src_row_pitch < region[0] ||
+        dst_row_pitch < region[0]) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A slice pitch must hold region[1] rows and be a whole number of rows. */
+    if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0) ||
+        (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* Offset one past the last byte touched in the source ... */
+    total_size = (src_origin[2] + region[2] - 1) * src_slice_pitch +
+                 (src_origin[1] + region[1] - 1) * src_row_pitch + src_origin[0] + region[0];
+    if (total_size > src_buffer->size) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* ... and in the destination. */
+    total_size = (dst_origin[2] + region[2] - 1) * dst_slice_pitch +
+                 (dst_origin[1] + region[1] - 1) * dst_row_pitch + dst_origin[0] + region[0];
+    if (total_size > dst_buffer->size) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (src_buffer == dst_buffer &&
+        (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (src_buffer == dst_buffer &&
+        check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
+      err = CL_MEM_COPY_OVERLAP;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_COPY_BUFFER_RECT, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = cl_mem_copy_buffer_rect(command_queue, e, src_buffer, dst_buffer, src_origin, dst_origin, region,
+                                  src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+       The finish or Complete status will always be done in queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happened, cancel and report it.
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    } else if (e_status == CL_COMPLETE) {
+      err = cl_event_exec(e, CL_SUBMITTED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    cl_command_queue_enqueue_event(command_queue, e);
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/**
+ * Enqueue a fill of size bytes of buffer, starting at offset, with a repeated
+ * pattern of pattern_size bytes.
+ *
+ * pattern_size must be one of 1, 2, 4, 8, 16, 32, 64 or 128, and both offset
+ * and size must be multiples of it.
+ *
+ * @return CL_SUCCESS, or the error code of the first check that failed.
+ *         On success *event (when event is non-NULL) receives the new event;
+ *         in every other case the event is released with cl_event_delete().
+ */
+cl_int
+clEnqueueFillBuffer(cl_command_queue command_queue,
+                    cl_mem buffer,
+                    const void *pattern,
+                    size_t pattern_size,
+                    size_t offset,
+                    size_t size,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event *event_wait_list,
+                    cl_event *event)
+{
+  static const size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+  const size_t valid_cnt = sizeof(valid_sz) / sizeof(valid_sz[0]);
+  cl_int err = CL_SUCCESS;
+  size_t i = 0;
+  cl_event e = NULL;
+  cl_int e_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_BUFFER(buffer)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    if (command_queue->ctx != buffer->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    /* Phrased so that offset + size cannot wrap around. */
+    if (size > buffer->size || offset > buffer->size - size) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (pattern == NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* pattern_size must match one of the supported sizes; this also rules out
+       0 before it is used as a divisor below. */
+    for (i = 0; i < valid_cnt; i++) {
+      if (valid_sz[i] == pattern_size)
+        break;
+    }
+    if (i == valid_cnt) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (offset % pattern_size || size % pattern_size) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_FILL_BUFFER, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = cl_mem_fill(command_queue, e, pattern, pattern_size, buffer, offset, size);
+    if (err) {
+      break;
+    }
+    /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+       The finish or Complete status will always be done in queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happened, cancel and report it.
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    cl_command_queue_enqueue_event(command_queue, e);
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/**
+ * Enqueue a migration of mem_objects to the device behind command_queue.
+ *
+ * No data is actually moved: after validating the arguments the function only
+ * creates an event of type EnqueueMigrateMemObj and hands it to the queue.
+ *
+ * @return CL_SUCCESS, or the error code of the first check that failed.
+ *         On success *event (when event is non-NULL) receives the new event;
+ *         in every other case the event is released with cl_event_delete().
+ */
+cl_int
+clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+                           cl_uint num_mem_objects,
+                           const cl_mem *mem_objects,
+                           cl_mem_migration_flags flags,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list,
+                           cl_event *event)
+{
+  /* So far, we just support 1 device and no subdevice. So all the command queues
+     belong to the small context. There is no need to migrate the mem objects by now. */
+  cl_int err = CL_SUCCESS;
+  cl_event e = NULL;
+  cl_int e_status;
+  cl_uint i = 0;
+  do {
+    /* command_queue->ctx is read below for every flag combination, so the
+       queue has to be validated unconditionally. */
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (num_mem_objects == 0 || mem_objects == NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* Only the two migration flags defined by OpenCL (or 0) are accepted. */
+    if (flags & ~(CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    for (i = 0; i < num_mem_objects; i++) {
+      if (!CL_OBJECT_IS_MEM(mem_objects[i])) {
+        err = CL_INVALID_MEM_OBJECT;
+        break;
+      }
+      if (mem_objects[i]->ctx != command_queue->ctx) {
+        err = CL_INVALID_CONTEXT;
+        break;
+      }
+    }
+    /* The breaks above only leave the for loop; propagate the failure. */
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_MIGRATE_MEM_OBJECTS, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    /* Nothing to do now, just enqueue an event. */
+    e->exec_data.type = EnqueueMigrateMemObj;
+    /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+       The finish or Complete status will always be done in queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happened, cancel and report it.
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    cl_command_queue_enqueue_event(command_queue, e);
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/************************************ Images *********************************************/
+/* Expand the caller-supplied region into a full three-component region.
+   For a 1D image array the second input component becomes region[2] and
+   region[1] is fixed to 1; every other image type copies all three components.
+   Returns CL_INVALID_VALUE when pregion is NULL or any resulting component is
+   zero, CL_SUCCESS otherwise. */
+static cl_int
+check_image_region(struct _cl_mem_image *image, const size_t *pregion, size_t *region)
+{
+  int is_1d_array;
+  if (!pregion)
+    return CL_INVALID_VALUE;
+  is_1d_array = (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY);
+  region[0] = pregion[0];
+  region[1] = is_1d_array ? 1 : pregion[1];
+  region[2] = is_1d_array ? pregion[1] : pregion[2];
+  if (region[0] != 0 && region[1] != 0 && region[2] != 0)
+    return CL_SUCCESS;
+  return CL_INVALID_VALUE;
+}
+/* Expand the caller-supplied origin into a full three-component origin.
+   For a 1D image array the second input component becomes origin[2] and
+   origin[1] is fixed to 0; every other image type copies all three components.
+   Returns CL_INVALID_VALUE when porigin is NULL, CL_SUCCESS otherwise. */
+static cl_int
+check_image_origin(struct _cl_mem_image *image, const size_t *porigin, size_t *origin)
+{
+  int is_1d_array;
+  if (!porigin)
+    return CL_INVALID_VALUE;
+  is_1d_array = (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY);
+  origin[0] = porigin[0];
+  origin[1] = is_1d_array ? 0 : porigin[1];
+  origin[2] = is_1d_array ? porigin[1] : porigin[2];
+  return CL_SUCCESS;
+}
+/**
+ * Enqueue a mapping of an image region into host address space.
+ *
+ * On success the row pitch (and, when image_slice_pitch is non-NULL, the slice
+ * pitch) of the mapping is written back: the host pitches for images created
+ * with CL_MEM_USE_HOST_PTR, the image's own pitches otherwise.
+ *
+ * @return the pointer produced by cl_mem_record_map_mem(), or NULL when any
+ *         check fails. *errcode_ret (when non-NULL) receives the status code.
+ *         On success *event (when event is non-NULL) receives the new event;
+ *         in every other case the event is released with cl_event_delete().
+ */
+void *
+clEnqueueMapImage(cl_command_queue command_queue,
+                  cl_mem mem,
+                  cl_bool blocking_map,
+                  cl_map_flags map_flags,
+                  const size_t *porigin,
+                  const size_t *pregion,
+                  size_t *image_row_pitch,
+                  size_t *image_slice_pitch,
+                  cl_uint num_events_in_wait_list,
+                  const cl_event *event_wait_list,
+                  cl_event *event,
+                  cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  void *ptr = NULL;
+  void *mem_ptr = NULL;
+  size_t offset = 0;
+  struct _cl_mem_image *image = NULL;
+  cl_int e_status;
+  enqueue_data *data = NULL;
+  size_t region[3];
+  size_t origin[3];
+  cl_event e = NULL;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_IMAGE(mem)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    image = cl_mem_image(mem);
+    err = check_image_region(image, pregion, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = check_image_origin(image, porigin, origin);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (command_queue->ctx != mem->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (origin[0] + region[0] > image->w ||
+        origin[1] + region[1] > image->h ||
+        origin[2] + region[2] > image->depth) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* The row pitch is always written back; the slice pitch only has to be
+       provided when the image has a non-zero slice pitch. */
+    if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* The requested access must be allowed by the image's host-access flags. */
+    if ((map_flags & CL_MAP_READ &&
+         mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+        (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+         mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) {
+      err = CL_INVALID_OPERATION;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_MAP_IMAGE, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (blocking_map) {
+      err = cl_event_wait_for_event_ready(e);
+      if (err != CL_SUCCESS)
+        break;
+      /* Blocking call API is a sync point of flush. */
+      err = cl_command_queue_wait_flush(command_queue);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      /* A negative status must not be reported to the caller as success. */
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    data = &e->exec_data;
+    data->type = EnqueueMapImage;
+    data->mem_obj = mem;
+    data->origin[0] = origin[0];
+    data->origin[1] = origin[1];
+    data->origin[2] = origin[2];
+    data->region[0] = region[0];
+    data->region[1] = region[1];
+    data->region[2] = region[2];
+    data->ptr = ptr;
+    data->unsync_map = 1;
+    /* Only mappings that request write access are flagged as write maps. */
+    if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
+      data->write_map = 1;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_SUBMITTED, CL_TRUE); // Submit to get the address.
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+    ptr = data->ptr;
+    assert(ptr);
+    /* Store and write back map info. */
+    if (mem->flags & CL_MEM_USE_HOST_PTR) {
+      if (image_slice_pitch)
+        *image_slice_pitch = image->host_slice_pitch;
+      *image_row_pitch = image->host_row_pitch;
+      offset = image->bpp * origin[0] + image->host_row_pitch * origin[1] +
+               image->host_slice_pitch * origin[2];
+    } else {
+      if (image_slice_pitch)
+        *image_slice_pitch = image->slice_pitch;
+      /* A 1D image array reports its slice pitch as the row pitch. */
+      if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        *image_row_pitch = image->slice_pitch;
+      else
+        *image_row_pitch = image->row_pitch;
+      offset = image->bpp * origin[0] + image->row_pitch * origin[1] + image->slice_pitch * origin[2];
+    }
+    err = cl_mem_record_map_mem(mem, ptr, &mem_ptr, offset, 0, origin, region);
+    assert(err == CL_SUCCESS); // Easy way, do not use unmap to handle error.
+  } while (0);
+  if (err != CL_SUCCESS) {
+    if (e) {
+      cl_event_delete(e);
+      e = NULL;
+    }
+    assert(ptr == NULL);
+  }
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem_ptr;
+}
+/**
+ * Enqueue a read of an image region into host memory at ptr.
+ *
+ * A row_pitch of 0 is replaced by image->bpp * region[0]. A slice pitch is
+ * only accepted when the image itself has a non-zero slice pitch; 0 is then
+ * replaced by row_pitch * region[1]. When blocking_read is set, the call goes
+ * through cl_event_wait_for_event_ready() and cl_command_queue_wait_flush()
+ * before the command is executed.
+ *
+ * @return CL_SUCCESS, or the error code of the first check that failed.
+ *         On success *event (when event is non-NULL) receives the new event;
+ *         in every other case the event is released with cl_event_delete().
+ */
+cl_int
+clEnqueueReadImage(cl_command_queue command_queue,
+                   cl_mem mem,
+                   cl_bool blocking_read,
+                   const size_t *porigin,
+                   const size_t *pregion,
+                   size_t row_pitch,
+                   size_t slice_pitch,
+                   void *ptr,
+                   cl_uint num_events_in_wait_list,
+                   const cl_event *event_wait_list,
+                   cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  enqueue_data *data = NULL;
+  cl_int e_status;
+  size_t region[3];
+  size_t origin[3];
+  cl_event e = NULL;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_IMAGE(mem)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    image = cl_mem_image(mem);
+    err = check_image_region(image, pregion, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = check_image_origin(image, porigin, origin);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (command_queue->ctx != mem->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (origin[0] + region[0] > image->w ||
+        origin[1] + region[1] > image->h ||
+        origin[2] + region[2] > image->depth) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A zero row pitch means "tightly packed". */
+    if (!row_pitch) {
+      row_pitch = image->bpp * region[0];
+    } else if (row_pitch < image->bpp * region[0]) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A slice pitch is only meaningful for images that have one. */
+    if (image->slice_pitch) {
+      if (!slice_pitch) {
+        slice_pitch = row_pitch * region[1];
+      } else if (slice_pitch < row_pitch * region[1]) {
+        err = CL_INVALID_VALUE;
+        break;
+      }
+    } else if (slice_pitch) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (!ptr) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* The host may not read an image created write-only or inaccessible for the host. */
+    if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+      err = CL_INVALID_OPERATION;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_READ_IMAGE, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (blocking_read) {
+      err = cl_event_wait_for_event_ready(e);
+      if (err != CL_SUCCESS)
+        break;
+      /* Blocking call API is a sync point of flush. */
+      err = cl_command_queue_wait_flush(command_queue);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      /* A negative status must not be reported to the caller as success. */
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    data = &e->exec_data;
+    data->type = EnqueueReadImage;
+    data->mem_obj = mem;
+    data->ptr = ptr;
+    data->origin[0] = origin[0];
+    data->origin[1] = origin[1];
+    data->origin[2] = origin[2];
+    data->region[0] = region[0];
+    data->region[1] = region[1];
+    data->region[2] = region[2];
+    data->row_pitch = row_pitch;
+    data->slice_pitch = slice_pitch;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/**
+ * Enqueue a write of host memory at ptr into an image region.
+ *
+ * A row_pitch of 0 is replaced by image->bpp * region[0]. A slice pitch is
+ * only accepted when the image itself has a non-zero slice pitch; 0 is then
+ * replaced by row_pitch * region[1]. When blocking_write is set, the call goes
+ * through cl_event_wait_for_event_ready() and cl_command_queue_wait_flush()
+ * before the command is executed.
+ *
+ * @return CL_SUCCESS, or the error code of the first check that failed.
+ *         On success *event (when event is non-NULL) receives the new event;
+ *         in every other case the event is released with cl_event_delete().
+ */
+cl_int
+clEnqueueWriteImage(cl_command_queue command_queue,
+                    cl_mem mem,
+                    cl_bool blocking_write,
+                    const size_t *porigin,
+                    const size_t *pregion,
+                    size_t row_pitch,
+                    size_t slice_pitch,
+                    const void *ptr,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event *event_wait_list,
+                    cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  enqueue_data *data = NULL;
+  cl_int e_status;
+  size_t region[3];
+  size_t origin[3];
+  cl_event e = NULL;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_IMAGE(mem)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    image = cl_mem_image(mem);
+    err = check_image_region(image, pregion, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = check_image_origin(image, porigin, origin);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (command_queue->ctx != mem->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (origin[0] + region[0] > image->w ||
+        origin[1] + region[1] > image->h ||
+        origin[2] + region[2] > image->depth) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A zero row pitch means "tightly packed". */
+    if (!row_pitch) {
+      row_pitch = image->bpp * region[0];
+    } else if (row_pitch < image->bpp * region[0]) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A slice pitch is only meaningful for images that have one. */
+    if (image->slice_pitch) {
+      if (!slice_pitch) {
+        slice_pitch = row_pitch * region[1];
+      } else if (slice_pitch < row_pitch * region[1]) {
+        err = CL_INVALID_VALUE;
+        break;
+      }
+    } else if (slice_pitch) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (!ptr) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* The host may not write an image created read-only or inaccessible for the host. */
+    if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+      err = CL_INVALID_OPERATION;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_WRITE_IMAGE, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (blocking_write) {
+      err = cl_event_wait_for_event_ready(e);
+      if (err != CL_SUCCESS)
+        break;
+      /* Blocking call API is a sync point of flush. */
+      err = cl_command_queue_wait_flush(command_queue);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    }
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) {
+      /* A negative status must not be reported to the caller as success. */
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    data = &e->exec_data;
+    data->type = EnqueueWriteImage;
+    data->mem_obj = mem;
+    data->const_ptr = ptr;
+    data->origin[0] = origin[0];
+    data->origin[1] = origin[1];
+    data->origin[2] = origin[2];
+    data->region[0] = region[0];
+    data->region[1] = region[1];
+    data->region[2] = region[2];
+    data->row_pitch = row_pitch;
+    data->slice_pitch = slice_pitch;
+    if (e_status == CL_COMPLETE) {
+      // Sync mode, no need to queue event.
+      err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+    } else {
+      err = cl_event_exec(e, CL_QUEUED, CL_FALSE);
+      if (err != CL_SUCCESS) {
+        break;
+      }
+      cl_command_queue_enqueue_event(command_queue, e);
+    }
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/* Enqueue a command that copies a 3D region from src_mem to dst_mem.
+ *
+ * Validation order: command queue, both image handles, region and origins
+ * (check_image_region / check_image_origin), context match, image format
+ * match, bounds of the region inside both images, 2D-image constraints and,
+ * when source and destination are the same image, region overlap.
+ *
+ * After validation an event is created, the copy is prepared with
+ * cl_mem_kernel_copy_image() and the event is handed to the command queue.
+ *
+ * Returns CL_SUCCESS or an OpenCL error code. On success, if `event` is not
+ * NULL, *event receives the event of this command; in every other case the
+ * local event reference is dropped with cl_event_delete().
+ */
+cl_int
+clEnqueueCopyImage(cl_command_queue command_queue,
+                   cl_mem src_mem,
+                   cl_mem dst_mem,
+                   const size_t *psrc_origin,
+                   const size_t *pdst_origin,
+                   const size_t *pregion,
+                   cl_uint num_events_in_wait_list,
+                   const cl_event *event_wait_list,
+                   cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  cl_bool overlap = CL_TRUE;
+  cl_int i = 0;
+  cl_event e = NULL;
+  struct _cl_mem_image *src_image = NULL;
+  struct _cl_mem_image *dst_image = NULL;
+  size_t region[3];
+  size_t src_origin[3];
+  size_t dst_origin[3];
+  cl_int e_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_IMAGE(src_mem)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    if (!CL_OBJECT_IS_IMAGE(dst_mem)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    src_image = cl_mem_image(src_mem);
+    dst_image = cl_mem_image(dst_mem);
+    err = check_image_region(src_image, pregion, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = check_image_origin(src_image, psrc_origin, src_origin);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = check_image_origin(dst_image, pdst_origin, dst_origin);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (command_queue->ctx != src_mem->ctx ||
+        command_queue->ctx != dst_mem->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    /* Source and destination must share channel order and data type. */
+    if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order ||
+        src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) {
+      err = CL_IMAGE_FORMAT_MISMATCH;
+      break;
+    }
+    if (src_origin[0] + region[0] > src_image->w ||
+        src_origin[1] + region[1] > src_image->h ||
+        src_origin[2] + region[2] > src_image->depth) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (dst_origin[0] + region[0] > dst_image->w ||
+        dst_origin[1] + region[1] > dst_image->h ||
+        dst_origin[2] + region[2] > dst_image->depth) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A 2D image has a single slice: z origin must be 0 and depth 1. */
+    if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) ||
+        (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (src_image == dst_image) {
+      /* The boxes overlap only if their extents intersect on all three axes. */
+      for (i = 0; i < 3; i++) {
+        overlap = overlap && (src_origin[i] < dst_origin[i] + region[i]) &&
+                  (dst_origin[i] < src_origin[i] + region[i]);
+      }
+      if (overlap == CL_TRUE) {
+        err = CL_MEM_COPY_OVERLAP;
+        break;
+      }
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_COPY_IMAGE, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = cl_mem_kernel_copy_image(command_queue, e, src_image, dst_image,
+                                   src_origin, dst_origin, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+       The finish or Complete status will always be done in queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happened, cancel.
+      /* Report the failure instead of returning CL_SUCCESS for a cancelled command. */
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    cl_command_queue_enqueue_event(command_queue, e);
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/* Enqueue a command that copies a region of image src_mem into dst_buffer,
+ * starting at byte offset dst_offset.
+ *
+ * Validates the command queue, the image and buffer handles, the region and
+ * origin, the context match, that the copied bytes
+ * (region[0] * region[1] * region[2] * bpp) fit into the buffer after
+ * dst_offset, the image bounds and the 2D-image constraints. Then an event is
+ * created, the copy is prepared with cl_mem_copy_image_to_buffer() and the
+ * event is handed to the command queue.
+ *
+ * Returns CL_SUCCESS or an OpenCL error code. On success, if `event` is not
+ * NULL, *event receives the event of this command; otherwise the local event
+ * reference is dropped with cl_event_delete().
+ */
+cl_int
+clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
+                           cl_mem src_mem,
+                           cl_mem dst_buffer,
+                           const size_t *psrc_origin,
+                           const size_t *pregion,
+                           size_t dst_offset,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list,
+                           cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *src_image = NULL;
+  size_t region[3];
+  size_t src_origin[3];
+  cl_event e = NULL;
+  cl_int e_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_IMAGE(src_mem)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    if (!CL_OBJECT_IS_BUFFER(dst_buffer)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    src_image = cl_mem_image(src_mem);
+    err = check_image_region(src_image, pregion, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = check_image_origin(src_image, psrc_origin, src_origin);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (command_queue->ctx != src_mem->ctx ||
+        command_queue->ctx != dst_buffer->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    /* The tightly packed region must fit in the buffer behind dst_offset. */
+    if (dst_offset + region[0] * region[1] * region[2] * src_image->bpp > dst_buffer->size) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (src_origin[0] + region[0] > src_image->w ||
+        src_origin[1] + region[1] > src_image->h ||
+        src_origin[2] + region[2] > src_image->depth) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A 2D image has a single slice: z origin must be 0 and depth 1. */
+    if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_COPY_IMAGE_TO_BUFFER, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = cl_mem_copy_image_to_buffer(command_queue, e, src_image, dst_buffer,
+                                      src_origin, dst_offset, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+       The finish or Complete status will always be done in queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happened, cancel.
+      /* Report the failure instead of returning CL_SUCCESS for a cancelled command. */
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    cl_command_queue_enqueue_event(command_queue, e);
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/* Enqueue a command that copies bytes from src_buffer, starting at byte
+ * offset src_offset, into a region of image dst_mem.
+ *
+ * Validates the command queue, the buffer and image handles, the region and
+ * origin, the context match, that the bytes to read
+ * (region[0] * region[1] * region[2] * bpp) are available in the buffer after
+ * src_offset, the image bounds and the 2D-image constraints. Then an event is
+ * created, the copy is prepared with cl_mem_copy_buffer_to_image() and the
+ * event is handed to the command queue.
+ *
+ * Returns CL_SUCCESS or an OpenCL error code. On success, if `event` is not
+ * NULL, *event receives the event of this command; otherwise the local event
+ * reference is dropped with cl_event_delete().
+ */
+cl_int
+clEnqueueCopyBufferToImage(cl_command_queue command_queue,
+                           cl_mem src_buffer,
+                           cl_mem dst_mem,
+                           size_t src_offset,
+                           const size_t *pdst_origin,
+                           const size_t *pregion,
+                           cl_uint num_events_in_wait_list,
+                           const cl_event *event_wait_list,
+                           cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *dst_image = NULL;
+  size_t region[3];
+  size_t dst_origin[3];
+  cl_event e = NULL;
+  cl_int e_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_BUFFER(src_buffer)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    if (!CL_OBJECT_IS_IMAGE(dst_mem)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    dst_image = cl_mem_image(dst_mem);
+    err = check_image_region(dst_image, pregion, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = check_image_origin(dst_image, pdst_origin, dst_origin);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (command_queue->ctx != src_buffer->ctx ||
+        command_queue->ctx != dst_mem->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    /* The tightly packed region must be fully available behind src_offset. */
+    if (src_offset + region[0] * region[1] * region[2] * dst_image->bpp > src_buffer->size) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (dst_origin[0] + region[0] > dst_image->w ||
+        dst_origin[1] + region[1] > dst_image->h ||
+        dst_origin[2] + region[2] > dst_image->depth) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A 2D image has a single slice: z origin must be 0 and depth 1. */
+    if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_COPY_BUFFER_TO_IMAGE, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = cl_mem_copy_buffer_to_image(command_queue, e, src_buffer, dst_image,
+                                      src_offset, dst_origin, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+       The finish or Complete status will always be done in queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happened, cancel.
+      /* Report the failure instead of returning CL_SUCCESS for a cancelled command. */
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    cl_command_queue_enqueue_event(command_queue, e);
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/* Enqueue a command that fills a region of image `mem` with fill_color.
+ *
+ * Validates the command queue, the image handle, the region and origin, the
+ * context match, that fill_color is not NULL, the image bounds and the
+ * 1D/2D-image constraints. Then an event is created, the fill is prepared
+ * with cl_image_fill() and the event is handed to the command queue.
+ *
+ * Returns CL_SUCCESS or an OpenCL error code. On success, if `event` is not
+ * NULL, *event receives the event of this command; otherwise the local event
+ * reference is dropped with cl_event_delete().
+ */
+cl_int
+clEnqueueFillImage(cl_command_queue command_queue,
+                   cl_mem mem,
+                   const void *fill_color,
+                   const size_t *porigin,
+                   const size_t *pregion,
+                   cl_uint num_events_in_wait_list,
+                   const cl_event *event_wait_list,
+                   cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  size_t region[3];
+  size_t origin[3];
+  cl_event e = NULL;
+  struct _cl_mem_image *image = NULL;
+  cl_int e_status;
+  do {
+    if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) {
+      err = CL_INVALID_COMMAND_QUEUE;
+      break;
+    }
+    if (!CL_OBJECT_IS_IMAGE(mem)) {
+      err = CL_INVALID_MEM_OBJECT;
+      break;
+    }
+    image = cl_mem_image(mem);
+    err = check_image_region(image, pregion, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = check_image_origin(image, porigin, origin);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    if (command_queue->ctx != mem->ctx) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (fill_color == NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (origin[0] + region[0] > image->w ||
+        origin[1] + region[1] > image->h ||
+        origin[2] + region[2] > image->depth) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A 2D image has a single slice: z origin must be 0 and depth 1. */
+    if (image->image_type == CL_MEM_OBJECT_IMAGE2D && (origin[2] != 0 || region[2] != 1)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* A 1D image additionally has a single row: y origin 0 and height 1. */
+    if (image->image_type == CL_MEM_OBJECT_IMAGE1D && (origin[2] != 0 || origin[1] != 0 ||
+                                                       region[2] != 1 || region[1] != 1)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
+                                  event, command_queue->ctx);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_FILL_IMAGE, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    err = cl_image_fill(command_queue, e, fill_color, image, origin, region);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+       The finish or Complete status will always be done in queue list. */
+    e_status = cl_event_is_ready(e);
+    if (e_status < CL_COMPLETE) { // Error happened, cancel.
+      /* Report the failure instead of returning CL_SUCCESS for a cancelled command. */
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
+    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    if (err != CL_SUCCESS) {
+      break;
+    }
+    cl_command_queue_enqueue_event(command_queue, e);
+  } while (0);
+  if (err == CL_SUCCESS && event) {
+    *event = e;
+  } else {
+    cl_event_delete(e);
+  }
+  return err;
+}
+/* Increment the reference count of memobj via cl_mem_add_ref().
+ * Returns CL_INVALID_MEM_OBJECT if memobj is not a valid memory object,
+ * CL_SUCCESS otherwise. */
+cl_int
+clRetainMemObject(cl_mem memobj)
+{
+  if (!CL_OBJECT_IS_MEM(memobj)) {
+    /* Never touch the reference count of an invalid handle. */
+    return CL_INVALID_MEM_OBJECT;
+  }
+  cl_mem_add_ref(memobj);
+  return CL_SUCCESS;
+}
+/* Drop one reference of memobj via cl_mem_delete().
+ * Returns CL_INVALID_MEM_OBJECT if memobj is not a valid memory object,
+ * CL_SUCCESS otherwise. */
+cl_int
+clReleaseMemObject(cl_mem memobj)
+{
+  if (!CL_OBJECT_IS_MEM(memobj)) {
+    /* Never pass an invalid handle to cl_mem_delete(). */
+    return CL_INVALID_MEM_OBJECT;
+  }
+  cl_mem_delete(memobj);
+  return CL_SUCCESS;
+}
diff --git a/src/cl_api_platform_id.c b/src/cl_api_platform_id.c
new file mode 100644
index 0000000..10d8894
--- /dev/null
+++ b/src/cl_api_platform_id.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_platform_id.h"
+#include "CL/cl_ext.h"
+/* Query a string property of an OpenCL platform.
+ *
+ * platform must be a valid platform object and must be the default platform
+ * returned by cl_get_platform_default(); otherwise CL_INVALID_PLATFORM is
+ * returned. An unknown param_name yields CL_INVALID_VALUE. The selected
+ * pointer/size pair is handed to cl_get_info_helper(), whose result is
+ * returned.
+ */
+cl_int
+clGetPlatformInfo(cl_platform_id platform,
+                  cl_platform_info param_name,
+                  size_t param_value_size,
+                  void *param_value,
+                  size_t *param_value_size_ret)
+{
+  const void *src_ptr = NULL;
+  size_t src_size = 0;
+  if (!CL_OBJECT_IS_PLATFORM(platform)) {
+    return CL_INVALID_PLATFORM;
+  }
+  /* Only one platform now. */
+  if (platform != cl_get_platform_default()) {
+    return CL_INVALID_PLATFORM;
+  }
+  if (param_name == CL_PLATFORM_PROFILE) {
+    src_ptr = platform->profile;
+    src_size = platform->profile_sz;
+  } else if (param_name == CL_PLATFORM_VERSION) {
+    src_ptr = platform->version;
+    src_size = platform->version_sz;
+  } else if (param_name == CL_PLATFORM_NAME) {
+    src_ptr = platform->name;
+    src_size = platform->name_sz;
+  } else if (param_name == CL_PLATFORM_VENDOR) {
+    src_ptr = platform->vendor;
+    src_size = platform->vendor_sz;
+  } else if (param_name == CL_PLATFORM_EXTENSIONS) {
+    src_ptr = platform->extensions;
+    src_size = platform->extensions_sz;
+  } else if (param_name == CL_PLATFORM_ICD_SUFFIX_KHR) {
+    src_ptr = platform->icd_suffix_khr;
+    src_size = platform->icd_suffix_khr_sz;
+  } else {
+    return CL_INVALID_VALUE;
+  }
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
+}
diff --git a/src/cl_api_program.c b/src/cl_api_program.c
new file mode 100644
index 0000000..d68f29f
--- /dev/null
+++ b/src/cl_api_program.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_device_id.h"
+#include <string.h>
+/* Make sure program->binary / program->binary_sz hold the serialized program.
+ *
+ * If no binary is cached yet, the program is serialized with
+ * compiler_program_serialize_to_binary(); its last argument is 0 for an
+ * executable, 1 for a compiled object and 2 for a library.
+ *
+ * Returns CL_SUCCESS, CL_INVALID_BINARY for an unknown binary type, or
+ * CL_OUT_OF_RESOURCES when no binary is available after serialization.
+ */
+static cl_int
+cl_program_info_ensure_binary(cl_program program)
+{
+  int binary_kind;
+  if (program->binary == NULL) {
+    if (program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
+      binary_kind = 0;
+    } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+      binary_kind = 1;
+    } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
+      binary_kind = 2;
+    } else {
+      return CL_INVALID_BINARY;
+    }
+    program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, binary_kind);
+  }
+  if (program->binary == NULL || program->binary_sz == 0) {
+    return CL_OUT_OF_RESOURCES;
+  }
+  return CL_SUCCESS;
+}
+/* Query information about a program object.
+ *
+ * Returns CL_INVALID_PROGRAM if program is not a valid program object and
+ * CL_INVALID_VALUE for an unknown param_name. Most queries select a
+ * pointer/size pair and return the result of cl_get_info_helper().
+ * CL_PROGRAM_KERNEL_NAMES is delegated to cl_program_get_kernel_names().
+ * For CL_PROGRAM_BINARIES, param_value is an array with one caller-allocated
+ * destination pointer; the serialized binary is copied to that destination.
+ */
+cl_int
+clGetProgramInfo(cl_program program,
+                 cl_program_info param_name,
+                 size_t param_value_size,
+                 void *param_value,
+                 size_t *param_value_size_ret)
+{
+  const void *src_ptr = NULL;
+  size_t src_size = 0;
+  const char *ret_str = "";
+  cl_int ref;
+  cl_int err;
+  cl_uint num_dev, kernels_num;
+  if (!CL_OBJECT_IS_PROGRAM(program)) {
+    return CL_INVALID_PROGRAM;
+  }
+  if (param_name == CL_PROGRAM_REFERENCE_COUNT) {
+    ref = CL_OBJECT_GET_REF(program);
+    src_ptr = &ref;
+    src_size = sizeof(cl_int);
+  } else if (param_name == CL_PROGRAM_CONTEXT) {
+    src_ptr = &program->ctx;
+    src_size = sizeof(cl_context);
+  } else if (param_name == CL_PROGRAM_NUM_DEVICES) {
+    num_dev = program->ctx->device_num; // Just 1 dev now.
+    src_ptr = &num_dev;
+    src_size = sizeof(cl_uint);
+  } else if (param_name == CL_PROGRAM_DEVICES) {
+    src_ptr = program->ctx->devices;
+    src_size = program->ctx->device_num * sizeof(cl_device_id);
+  } else if (param_name == CL_PROGRAM_NUM_KERNELS) {
+    kernels_num = program->ker_n;
+    src_ptr = &kernels_num;
+    src_size = sizeof(cl_uint);
+  } else if (param_name == CL_PROGRAM_SOURCE) {
+    /* A program without source reports an empty string. */
+    if (!program->source) {
+      src_ptr = ret_str;
+      src_size = 1;
+    } else {
+      src_ptr = program->source;
+      src_size = strlen(program->source) + 1;
+    }
+  } else if (param_name == CL_PROGRAM_KERNEL_NAMES) {
+    // TODO: need to refine this.
+    cl_program_get_kernel_names(program, param_value_size, (char *)param_value, param_value_size_ret);
+    return CL_SUCCESS;
+  } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
+    err = cl_program_info_ensure_binary(program);
+    if (err != CL_SUCCESS) {
+      return err;
+    }
+    src_ptr = &program->binary_sz;
+    src_size = sizeof(size_t);
+  } else if (param_name == CL_PROGRAM_BINARIES) {
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(void *);
+    if (!param_value)
+      return CL_SUCCESS;
+    /* param_value points to an array of n
+       pointers allocated by the caller */
+    if (param_value_size < sizeof(void *)) {
+      /* Too small to hold even one destination pointer: do not read it. */
+      return CL_INVALID_VALUE;
+    }
+    err = cl_program_info_ensure_binary(program);
+    if (err != CL_SUCCESS) {
+      return err;
+    }
+    /* A NULL entry means the caller does not want this binary copied. */
+    if (*((void **)param_value) != NULL) {
+      memcpy(*((void **)param_value), program->binary, program->binary_sz);
+    }
+    return CL_SUCCESS;
+  } else {
+    return CL_INVALID_VALUE;
+  }
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
+}
+/* Query build information of a program for one device.
+ *
+ * Returns CL_INVALID_PROGRAM if program is not a valid program object, the
+ * error reported by cl_devices_list_include_check() if device is not among
+ * the devices of the program's context, and CL_INVALID_VALUE for an unknown
+ * param_name. Otherwise the selected pointer/size pair is handed to
+ * cl_get_info_helper(), whose result is returned.
+ */
+cl_int
+clGetProgramBuildInfo(cl_program program,
+                      cl_device_id device,
+                      cl_program_build_info param_name,
+                      size_t param_value_size,
+                      void *param_value,
+                      size_t *param_value_size_ret)
+{
+  const void *src_ptr = NULL;
+  size_t src_size = 0;
+  const char *ret_str = "";
+  size_t global_size;
+  if (!CL_OBJECT_IS_PROGRAM(program)) {
+    /* Must bail out here: program->ctx is dereferenced right below. */
+    return CL_INVALID_PROGRAM;
+  }
+  cl_int err = cl_devices_list_include_check(program->ctx->device_num,
+                                             program->ctx->devices, 1, &device);
+  if (err != CL_SUCCESS)
+    return err;
+  if (param_name == CL_PROGRAM_BUILD_STATUS) {
+    src_ptr = &program->build_status;
+    src_size = sizeof(cl_build_status);
+  } else if (param_name == CL_PROGRAM_BUILD_OPTIONS) {
+    /* Options are reported only for a built program; otherwise "". */
+    if (program->is_built && program->build_opts) {
+      ret_str = program->build_opts;
+    }
+    src_ptr = ret_str;
+    src_size = strlen(ret_str) + 1;
+  } else if (param_name == CL_PROGRAM_BUILD_LOG) {
+    src_ptr = program->build_log;
+    src_size = program->build_log_sz + 1;
+  } else if (param_name == CL_PROGRAM_BINARY_TYPE) {
+    src_ptr = &program->binary_type;
+    src_size = sizeof(cl_uint);
+  } else if (param_name == CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE) {
+    global_size = 0;
+    if (program->is_built)
+      global_size = cl_program_get_global_variable_size(program);
+    src_ptr = &global_size;
+    src_size = sizeof(global_size);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
+}
diff --git a/src/cl_api_sampler.c b/src/cl_api_sampler.c
new file mode 100644
index 0000000..d8dba29
--- /dev/null
+++ b/src/cl_api_sampler.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_sampler.h"
+#include "cl_context.h"
+#include "cl_device_id.h"
+/* Create a sampler object in `context`.
+ *
+ * Fails with CL_INVALID_CONTEXT for an invalid context, CL_INVALID_VALUE for
+ * an addressing or filter mode outside the accepted range, and
+ * CL_INVALID_OPERATION if any device of the context reports
+ * image_support == CL_FALSE. Otherwise the sampler is created by
+ * cl_create_sampler(), which also sets the error code.
+ *
+ * Returns the new sampler, or NULL if validation failed. If errcode_ret is
+ * not NULL it receives the error code.
+ */
+cl_sampler
+clCreateSampler(cl_context context,
+                cl_bool normalized,
+                cl_addressing_mode addressing,
+                cl_filter_mode filter,
+                cl_int *errcode_ret)
+{
+  cl_sampler sampler = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_uint i;
+  do {
+    if (!CL_OBJECT_IS_CONTEXT(context)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+    if (addressing < CL_ADDRESS_NONE || addressing > CL_ADDRESS_MIRRORED_REPEAT) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    if (filter < CL_FILTER_NEAREST || filter > CL_FILTER_LINEAR) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+    /* Check if images are not supported by any device associated with context */
+    for (i = 0; i < context->device_num; i++) {
+      if (context->devices[i]->image_support == CL_FALSE) {
+        err = CL_INVALID_OPERATION;
+        break; /* leaves only the for loop; err is re-checked below */
+      }
+    }
+    if (err != CL_SUCCESS)
+      break;
+    sampler = cl_create_sampler(context, normalized, addressing, filter, &err);
+  } while (0);
+  if (errcode_ret)
+    *errcode_ret = err;
+  return sampler;
+}
+/* Query information about a sampler object.
+ *
+ * Returns CL_INVALID_SAMPLER if sampler is not a valid sampler object and
+ * CL_INVALID_VALUE for an unknown param_name. Otherwise the selected
+ * pointer/size pair is handed to cl_get_info_helper(), whose result is
+ * returned.
+ */
+cl_int
+clGetSamplerInfo(cl_sampler sampler,
+                 cl_sampler_info param_name,
+                 size_t param_value_size,
+                 void *param_value,
+                 size_t *param_value_size_ret)
+{
+  const void *src_ptr = NULL;
+  size_t src_size = 0;
+  cl_int ref;
+  if (!CL_OBJECT_IS_SAMPLER(sampler)) {
+    return CL_INVALID_SAMPLER;
+  }
+  if (param_name == CL_SAMPLER_REFERENCE_COUNT) {
+    ref = CL_OBJECT_GET_REF(sampler);
+    src_ptr = &ref;
+    src_size = sizeof(cl_int);
+  } else if (param_name == CL_SAMPLER_CONTEXT) {
+    src_ptr = &sampler->ctx;
+    src_size = sizeof(cl_context);
+  } else if (param_name == CL_SAMPLER_NORMALIZED_COORDS) {
+    src_ptr = &sampler->normalized_coords;
+    src_size = sizeof(cl_bool);
+  } else if (param_name == CL_SAMPLER_ADDRESSING_MODE) {
+    src_ptr = &sampler->address;
+    src_size = sizeof(cl_addressing_mode);
+  } else if (param_name == CL_SAMPLER_FILTER_MODE) {
+    src_ptr = &sampler->filter;
+    src_size = sizeof(cl_filter_mode);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
+}
+/* Increment the reference count of sampler via cl_sampler_add_ref().
+ * Returns CL_INVALID_SAMPLER if sampler is not a valid sampler object,
+ * CL_SUCCESS otherwise. */
+cl_int
+clRetainSampler(cl_sampler sampler)
+{
+  if (!CL_OBJECT_IS_SAMPLER(sampler)) {
+    /* Never touch the reference count of an invalid handle. */
+    return CL_INVALID_SAMPLER;
+  }
+  cl_sampler_add_ref(sampler);
+  return CL_SUCCESS;
+}
+/* Drop one reference of sampler via cl_sampler_delete().
+ * Returns CL_INVALID_SAMPLER if sampler is not a valid sampler object,
+ * CL_SUCCESS otherwise. */
+cl_int
+clReleaseSampler(cl_sampler sampler)
+{
+  if (!CL_OBJECT_IS_SAMPLER(sampler)) {
+    /* Never pass an invalid handle to cl_sampler_delete(). */
+    return CL_INVALID_SAMPLER;
+  }
+  cl_sampler_delete(sampler);
+  return CL_SUCCESS;
+}
diff --git a/src/cl_base_object.c b/src/cl_base_object.c
new file mode 100644
index 0000000..5578bdc
--- /dev/null
+++ b/src/cl_base_object.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <stdio.h>
+#include "cl_base_object.h"
+static pthread_t invalid_thread_id = -1; /* Sentinel stored in obj->owner while no thread owns the object. */
+/* Initialize the common header of a CL object: set its magic number, start
+ * the reference count at 1, install the ICD dispatch table, create the mutex
+ * and condition variable with default attributes, mark the object as unowned
+ * and initialize its list node. */
+LOCAL void
+cl_object_init_base(cl_base_object obj, cl_ulong magic)
+{
+  obj->magic = magic;
+  obj->ref = 1;
+  SET_ICD(obj->dispatch);
+  pthread_mutex_init(&obj->mutex, NULL);
+  pthread_cond_init(&obj->cond, NULL);
+  obj->owner = invalid_thread_id;
+  list_node_init(&obj->node);
+}
+/* Tear down the common header of a CL object.
+ *
+ * Logs an error and asserts if the object still has references, is already
+ * marked invalid, is still owned by a thread, or is still linked into a
+ * list. Then the magic is set to CL_OBJECT_INVALID_MAGIC and the mutex and
+ * condition variable are destroyed. */
+LOCAL void
+cl_object_destroy_base(cl_base_object obj)
+{
+  int ref = CL_OBJECT_GET_REF(obj);
+  if (ref != 0) {
+    DEBUGP(DL_ERROR, "CL object %p, call destroy with a reference %d", obj,
+           ref);
+    assert(0);
+  }
+  if (!CL_OBJECT_IS_VALID(obj)) {
+    DEBUGP(DL_ERROR,
+           "CL object %p, call destroy while it is already a dead object", obj);
+    assert(0);
+  }
+  if (obj->owner != invalid_thread_id) {
+    DEBUGP(DL_ERROR, "CL object %p, call destroy while still has a owener %d",
+           obj, (int)obj->owner);
+    assert(0);
+  }
+  if (!list_node_out_of_list(&obj->node)) {
+    DEBUGP(DL_ERROR, "CL object %p, call destroy while still belong to some object %p",
+           obj, obj->node.p);
+    assert(0);
+  }
+  /* Mark the object dead so later CL_OBJECT_IS_VALID() checks fail. */
+  obj->magic = CL_OBJECT_INVALID_MAGIC;
+  pthread_mutex_destroy(&obj->mutex);
+  pthread_cond_destroy(&obj->cond);
+}
+/* Try to make the calling thread the owner of obj.
+ *
+ * obj      - object to own; must be valid (asserted).
+ * wait     - if 0, give up immediately when another thread owns the object;
+ *            otherwise block on obj->cond until the object is unowned.
+ * withlock - if CL_FALSE, obj->mutex is locked and unlocked by this
+ *            function. Otherwise no locking is done here; pthread_cond_wait()
+ *            then requires that the caller already holds obj->mutex.
+ *
+ * Returns 1 if the calling thread owns the object on return (including the
+ * case where it already owned it), 0 if wait was 0 and another thread owns it.
+ */
+LOCAL cl_int
+cl_object_take_ownership(cl_base_object obj, cl_int wait, cl_bool withlock)
+{
+  pthread_t self;
+  cl_int owned = 1;
+  assert(CL_OBJECT_IS_VALID(obj));
+  self = pthread_self();
+  if (withlock == CL_FALSE)
+    pthread_mutex_lock(&obj->mutex);
+  if (pthread_equal(obj->owner, self)) {
+    /* Already owned by this thread: nothing to do. */
+  } else if (wait == 0 && !pthread_equal(obj->owner, invalid_thread_id)) {
+    /* Owned by another thread and the caller does not want to block. */
+    owned = 0;
+  } else {
+    /* Either unowned (loop body never runs) or we wait for a release. */
+    while (!pthread_equal(obj->owner, invalid_thread_id)) {
+      pthread_cond_wait(&obj->cond, &obj->mutex);
+    }
+    obj->owner = self;
+  }
+  if (withlock == CL_FALSE)
+    pthread_mutex_unlock(&obj->mutex);
+  return owned;
+}
+/* Give up ownership of obj and wake every thread waiting on obj->cond.
+ *
+ * The calling thread must be the current owner (asserted). If withlock is
+ * CL_FALSE, obj->mutex is locked and unlocked by this function; otherwise no
+ * locking is done here. */
+LOCAL void
+cl_object_release_ownership(cl_base_object obj, cl_bool withlock)
+{
+  assert(CL_OBJECT_IS_VALID(obj));
+  if (withlock == CL_FALSE)
+    pthread_mutex_lock(&obj->mutex);
+  assert(pthread_equal(pthread_self(), obj->owner));
+  obj->owner = invalid_thread_id;
+  pthread_cond_broadcast(&obj->cond);
+  if (withlock == CL_FALSE)
+    pthread_mutex_unlock(&obj->mutex);
+}
+/* Block on obj->cond using obj->mutex. pthread_cond_wait() requires that the
+ * calling thread holds obj->mutex when this is called. */
+LOCAL void
+cl_object_wait_on_cond(cl_base_object obj)
+{
+  assert(CL_OBJECT_IS_VALID(obj));
+  pthread_cond_wait(&obj->cond, &obj->mutex);
+}
+/* Wake every thread currently waiting on obj->cond. */
+LOCAL void
+cl_object_notify_cond(cl_base_object obj)
+{
+  assert(CL_OBJECT_IS_VALID(obj));
+  pthread_cond_broadcast(&obj->cond);
+}
diff --git a/src/cl_base_object.h b/src/cl_base_object.h
new file mode 100644
index 0000000..186b149
--- /dev/null
+++ b/src/cl_base_object.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __CL_BASE_OBJECT_H__
+#define __CL_BASE_OBJECT_H__
+#include "cl_utils.h"
+#include "cl_khr_icd.h"
+#include "CL/cl.h"
+#include <pthread.h>
+#include <assert.h>
+/*
+  Every CL object should have:
+    ICD dispatcher: Holds the ICD function table pointer.
+    Reference: Maintains its lifetime. The CL retain/release APIs will
+    change its value. We destroy the object when the count reaches 0.
+    Magic: Just a number to represent each CL object. We use it
+    to check whether it is the object we want.
+    Mutex & Cond: Used to keep the CL objects MT safe. A lock/unlock
+    critical region should be short and should not contain any blocking
+    function call. take_ownership/release_ownership can own the object
+    for a long time. take_ownership will not hold the lock and so will
+    not cause deadlock problems. We can wait on the cond to get the
+    ownership.
+*/
+typedef struct _cl_base_object {
+  DEFINE_ICD(dispatch);  /* Dispatch function table for icd */
+  cl_ulong magic;        /* Magic number identifying the kind of CL object */
+  atomic_t ref;          /* Reference count of the CL object */
+  list_node node;        /* List node used when the object belongs to a container */
+  pthread_mutex_t mutex; /* The mutex that keeps this object MT safe */
+  pthread_cond_t cond;   /* Condition to wait on for getting the object */
+  pthread_t owner;       /* The thread which owns this object */
+} _cl_base_object;
+typedef struct _cl_base_object *cl_base_object;
+#define CL_OBJECT_IS_VALID(obj) (((cl_base_object)obj)->magic != CL_OBJECT_INVALID_MAGIC) /* true until the magic is set to the invalid value */
+#define CL_OBJECT_INC_REF(obj) (atomic_inc(&((cl_base_object)obj)->ref))
+#define CL_OBJECT_DEC_REF(obj) (atomic_dec(&((cl_base_object)obj)->ref))
+#define CL_OBJECT_GET_REF(obj) (atomic_read(&((cl_base_object)obj)->ref))
+#define CL_OBJECT_LOCK(obj) (pthread_mutex_lock(&((cl_base_object)obj)->mutex))
+#define CL_OBJECT_UNLOCK(obj) (pthread_mutex_unlock(&((cl_base_object)obj)->mutex))
+extern void cl_object_init_base(cl_base_object obj, cl_ulong magic);
+extern void cl_object_destroy_base(cl_base_object obj);
+extern cl_int cl_object_take_ownership(cl_base_object obj, cl_int wait, cl_bool withlock); /* returns 1 when owned, 0 when wait == 0 and owned by another thread */
+extern void cl_object_release_ownership(cl_base_object obj, cl_bool withlock);
+extern void cl_object_wait_on_cond(cl_base_object obj);
+extern void cl_object_notify_cond(cl_base_object obj);
+#define CL_OBJECT_INIT_BASE(obj, magic) (cl_object_init_base((cl_base_object)obj, magic))
+#define CL_OBJECT_DESTROY_BASE(obj) (cl_object_destroy_base((cl_base_object)obj))
+#define CL_OBJECT_TAKE_OWNERSHIP(obj, wait) (cl_object_take_ownership((cl_base_object)obj, wait, CL_FALSE)) /* function locks obj->mutex itself */
+#define CL_OBJECT_RELEASE_OWNERSHIP(obj) (cl_object_release_ownership((cl_base_object)obj, CL_FALSE)) /* function locks obj->mutex itself */
+#define CL_OBJECT_TAKE_OWNERSHIP_WITHLOCK(obj, wait) (cl_object_take_ownership((cl_base_object)obj, wait, CL_TRUE)) /* no locking inside; presumably the caller holds obj->mutex */
+#define CL_OBJECT_RELEASE_OWNERSHIP_WITHLOCK(obj) (cl_object_release_ownership((cl_base_object)obj, CL_TRUE)) /* no locking inside; presumably the caller holds obj->mutex */
+#define CL_OBJECT_WAIT_ON_COND(obj) (cl_object_wait_on_cond((cl_base_object)obj))
+#define CL_OBJECT_NOTIFY_COND(obj) (cl_object_notify_cond((cl_base_object)obj))
+#endif /* __CL_BASE_OBJECT_H__ */
diff --git a/src/cl_cmrt.cpp b/src/cl_cmrt.cpp
index 25e4d82..f653844 100644
--- a/src/cl_cmrt.cpp
+++ b/src/cl_cmrt.cpp
@@ -256,7 +256,7 @@ cl_int cmrt_set_kernel_arg(cl_kernel k, cl_uint index, size_t sz, const void *va
     result = cmrt_kernel->SetKernelArg(index, sz, value);
   else {
     cl_mem mem = *(cl_mem*)value;
-    if (mem->magic == CL_MAGIC_MEM_HEADER) {
+    if (((cl_base_object)mem)->magic == CL_MAGIC_MEM_HEADER) {
       if (!CreateCmrtMemory(mem))
         return CL_INVALID_ARG_VALUE;
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index b66928f..55b1a23 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -25,7 +25,6 @@
 #include "cl_device_id.h"
 #include "cl_mem.h"
 #include "cl_utils.h"
-#include "cl_thread.h"
 #include "cl_alloc.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
@@ -37,80 +36,71 @@
 #include <stdio.h>
 #include <string.h>
-LOCAL cl_command_queue
+static cl_command_queue
 cl_command_queue_new(cl_context ctx)
   cl_command_queue queue = NULL;
-  TRY_ALLOC_NO_ERR (queue, CALLOC(struct _cl_command_queue));
-  SET_ICD(queue->dispatch)
-  queue->magic = CL_MAGIC_QUEUE_HEADER;
-  queue->ref_n = 1;
-  queue->ctx = ctx;
-  queue->cmrt_event = NULL;
-  if ((queue->thread_data = cl_thread_data_create()) == NULL) {
-    goto error;
+  queue = cl_calloc(1, sizeof(_cl_command_queue));
+  if (queue == NULL)
+    return NULL;
+  if (cl_command_queue_init_enqueue(queue) != CL_SUCCESS) {
+    cl_free(queue);
+    return NULL;
   /* Append the command queue in the list */
-  pthread_mutex_lock(&ctx->queue_lock);
-    queue->next = ctx->queues;
-    if (ctx->queues != NULL)
-      ctx->queues->prev = queue;
-    ctx->queues = queue;
-  pthread_mutex_unlock(&ctx->queue_lock);
+  cl_context_add_queue(ctx, queue);
+  return queue;
-  /* The queue also belongs to its context */
-  cl_context_add_ref(ctx);
+LOCAL cl_command_queue
+cl_create_command_queue(cl_context ctx, cl_device_id device, cl_command_queue_properties properties,
+                        cl_uint queue_size, cl_int *errcode_ret)
+  cl_command_queue queue = cl_command_queue_new(ctx);
+  if (queue == NULL) {
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
+  }
+  queue->props = properties;
+  queue->device = device;
+  queue->size = queue_size;
+  *errcode_ret = CL_SUCCESS;
   return queue;
-  cl_command_queue_delete(queue);
-  queue = NULL;
-  goto exit;
 LOCAL void
 cl_command_queue_delete(cl_command_queue queue)
-  if (atomic_dec(&queue->ref_n) != 1) return;
-#ifdef HAS_CMRT
-  if (queue->cmrt_event != NULL)
-    cmrt_destroy_event(queue);
-  // If there is a list of valid events, we need to give them
-  // a chance to call the call-back function.
-  cl_event_update_last_events(queue,1);
-  /* Remove it from the list */
-  assert(queue->ctx);
-  pthread_mutex_lock(&queue->ctx->queue_lock);
-    if (queue->prev)
-      queue->prev->next = queue->next;
-    if (queue->next)
-      queue->next->prev = queue->prev;
-    if (queue->ctx->queues == queue)
-      queue->ctx->queues = queue->next;
-  pthread_mutex_unlock(&queue->ctx->queue_lock);
-  cl_thread_data_destroy(queue);
-  queue->thread_data = NULL;
+  if (CL_OBJECT_DEC_REF(queue) > 1)
+    return;
+  /* Before we destroy the queue, we should make sure all
+     the commands in the queue are finished. */
+  cl_command_queue_wait_finish(queue);
+  cl_context_remove_queue(queue->ctx, queue);
+  cl_command_queue_destroy_enqueue(queue);
-  cl_context_delete(queue->ctx);
-  cl_free(queue->wait_events);
-  cl_free(queue->barrier_events);
-  queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+  if (queue->barrier_events) {
+    cl_free(queue->barrier_events);
+  }
 LOCAL void
 cl_command_queue_add_ref(cl_command_queue queue)
-  atomic_inc(&queue->ref_n);
+  CL_OBJECT_INC_REF(queue);
 static void
@@ -131,10 +121,9 @@ set_image_info(char *curbe,
 LOCAL cl_int
-cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
+cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, uint32_t *max_bti)
   uint32_t i;
   for (i = 0; i < k->image_sz; i++) {
     int id = k->images[i].arg_idx;
@@ -143,6 +132,8 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
+    if(*max_bti < k->images[i].idx)
+      *max_bti = k->images[i].idx;
       if( (image->fmt.image_channel_order != CL_R) || (image->fmt.image_channel_data_type != CL_UNORM_INT8) )
@@ -168,33 +159,71 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
 LOCAL cl_int
-cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
+cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, uint32_t *max_bti)
   /* Bind all user buffers (given by clSetKernelArg) */
-  uint32_t i;
+  uint32_t i, bti;
+  uint32_t ocl_version = interp_kernel_get_ocl_version(k->opaque);
   enum gbe_arg_type arg_type; /* kind of argument */
   for (i = 0; i < k->arg_n; ++i) {
     int32_t offset; // location of the address in the curbe
     arg_type = interp_kernel_get_arg_type(k->opaque, i);
-    if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
+    if (!(arg_type == GBE_ARG_GLOBAL_PTR ||
+          (arg_type == GBE_ARG_CONSTANT_PTR && ocl_version >= 200) ||
+          arg_type == GBE_ARG_PIPE) ||
+        !k->args[i].mem)
     offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
     if (offset < 0)
+    bti = interp_kernel_get_arg_bti(k->opaque, i);
+    if(*max_bti < bti)
+      *max_bti = bti;
     if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
       struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + buffer->sub_offset, k->args[i].mem->size, bti);
     } else {
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+      size_t mem_offset = 0; //
+      if(k->args[i].is_svm) {
+        mem_offset = (size_t)k->args[i].ptr - (size_t)k->args[i].mem->host_ptr;
+      }
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + mem_offset, k->args[i].mem->size, bti);
+  return CL_SUCCESS;
+LOCAL cl_int
+cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, uint32_t *max_bti)
+  uint32_t i;
+  size_t mem_offset, bti = *max_bti;
+  cl_mem mem;
+  int32_t offset = interp_kernel_get_curbe_size(k->opaque);
+  for (i = 0; i < k->exec_info_n; i++) {
+    void *ptr = k->exec_info[i];
+    mem = cl_context_get_svm_from_ptr(k->program->ctx, ptr);
+    if(mem == NULL)
+      mem = cl_context_get_mem_from_ptr(k->program->ctx, ptr);
+    if (mem) {
+      mem_offset = (size_t)ptr - (size_t)mem->host_ptr;
+      /* only need realloc in surface state, don't need realloc in curbe */
+      cl_gpgpu_bind_buf(gpgpu, mem->bo, offset + i * sizeof(ptr), mem->offset + mem_offset, mem->size, bti++);
+        bti = *max_bti + BTI_WORKAROUND_IMAGE_OFFSET;
+      assert(bti < BTI_MAX_ID);
+    }
+  }
+  *max_bti = bti;
   return CL_SUCCESS;
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_event, 
+                                             uint32_t, const size_t *, const size_t *,const size_t *,
+                                             const size_t *, const size_t *, const size_t *);
 static cl_int
 cl_kernel_check_args(cl_kernel k)
@@ -209,10 +238,14 @@ cl_kernel_check_args(cl_kernel k)
 LOCAL cl_int
 cl_command_queue_ND_range(cl_command_queue queue,
                           cl_kernel k,
+                          cl_event event,
                           const uint32_t work_dim,
                           const size_t *global_wk_off,
+                          const size_t *global_dim_off,
                           const size_t *global_wk_sz,
-                          const size_t *local_wk_sz)
+                          const size_t *global_wk_sz_use,
+                          const size_t *local_wk_sz,
+                          const size_t *local_wk_sz_use)
     time_start(queue->ctx, cl_kernel_get_name(k), queue);
@@ -222,8 +255,13 @@ cl_command_queue_ND_range(cl_command_queue queue,
   /* Check that the user did not forget any argument */
   TRY (cl_kernel_check_args, k);
   if (ver == 7 || ver == 75 || ver == 8 || ver == 9)
-    TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+    //TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+    TRY (cl_command_queue_ND_range_gen7, queue, k, event, work_dim,
+                                global_wk_off, global_dim_off, global_wk_sz,
+                                global_wk_sz_use, local_wk_sz, local_wk_sz_use);
     FATAL ("Unknown Gen Device");
@@ -232,7 +270,7 @@ error:
 LOCAL int
-cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
+cl_command_queue_flush_gpgpu(cl_gpgpu gpgpu)
   void* printf_info = cl_gpgpu_get_printf_info(gpgpu);
   void* profiling_info;
@@ -257,171 +295,73 @@ cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
     interp_output_profiling(profiling_info, cl_gpgpu_map_profiling_buffer(gpgpu));
-  return CL_SUCCESS;
-LOCAL cl_int
-cl_command_queue_flush(cl_command_queue queue)
-  int err;
-  err = cl_command_queue_flush_gpgpu(queue, gpgpu);
-  // We now keep a list of uncompleted events and check if they compelte
-  // every flush. This can make sure all events created have chance to be
-  // update status, so the callback functions or reference can be handled.
-  cl_event_update_last_events(queue,0);
-  cl_event current_event = get_current_event(queue);
-  if (current_event && err == CL_SUCCESS) {
-    err = cl_event_flush(current_event);
-    set_current_event(queue, NULL);
-  }
-  cl_invalid_thread_gpgpu(queue);
-  return err;
-LOCAL cl_int
-cl_command_queue_finish(cl_command_queue queue)
-  cl_gpgpu_sync(cl_get_thread_batch_buf(queue));
-  cl_event_update_last_events(queue,1);
   return CL_SUCCESS;
-LOCAL void
-cl_command_queue_insert_event(cl_command_queue queue, cl_event event)
-  cl_int i=0;
-  cl_event *new_list;
-  assert(queue != NULL);
-  if(queue->wait_events == NULL) {
-    queue->wait_events_size = DEFAULT_WAIT_EVENTS_SIZE;
-    TRY_ALLOC_NO_ERR (queue->wait_events, CALLOC_ARRAY(cl_event, queue->wait_events_size));
-  }
-  for(i=0; i<queue->wait_events_num; i++) {
-    if(queue->wait_events[i] == event)
-      return;   //is in the wait_events, need to insert
-  }
-  if(queue->wait_events_num < queue->wait_events_size) {
-    queue->wait_events[queue->wait_events_num++] = event;
-    return;
-  }
-  //wait_events_num == wait_events_size, array is full
-  queue->wait_events_size *= 2;
-  TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->wait_events_size));
-  memcpy(new_list, queue->wait_events, sizeof(cl_event)*queue->wait_events_num);
-  cl_free(queue->wait_events);
-  queue->wait_events = new_list;
-  queue->wait_events[queue->wait_events_num++] = event;
-  return;
-  return;
-  if(queue->wait_events)
-    cl_free(queue->wait_events);
-  queue->wait_events = NULL;
-  queue->wait_events_size = 0;
-  queue->wait_events_num = 0;
-  goto exit;
-LOCAL void
-cl_command_queue_remove_event(cl_command_queue queue, cl_event event)
-  cl_int i=0;
-  assert(queue->wait_events);
-  for(i=0; i<queue->wait_events_num; i++) {
-    if(queue->wait_events[i] == event)
-      break;
-  }
-  if(i == queue->wait_events_num)
-    return;
-  if(i == queue->wait_events_num - 1) {
-    queue->wait_events[i] = NULL;
-  } else {
-    for(; i<queue->wait_events_num-1; i++) {
-      queue->wait_events[i] = queue->wait_events[i+1];
-    }
-  }
-  queue->wait_events_num -= 1;
 LOCAL void
 cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event)
-  cl_int i=0;
-  cl_event *new_list;
+  cl_int i = 0;
+  cl_event_add_ref(event);
   assert(queue != NULL);
-  if(queue->barrier_events == NULL) {
-    queue->barrier_events_size = DEFAULT_WAIT_EVENTS_SIZE;
-    TRY_ALLOC_NO_ERR (queue->barrier_events, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
+  CL_OBJECT_LOCK(queue);
+  if (queue->barrier_events == NULL) {
+    queue->barrier_events_size = 4;
+    queue->barrier_events = cl_calloc(queue->barrier_events_size, sizeof(cl_event));
+    assert(queue->barrier_events);
-  for(i=0; i<queue->barrier_events_num; i++) {
-    if(queue->barrier_events[i] == event)
-      return;   //is in the barrier_events, need to insert
+  for (i = 0; i<queue->barrier_events_num; i++) {
+    assert(queue->barrier_events[i] != event);
   if(queue->barrier_events_num < queue->barrier_events_size) {
     queue->barrier_events[queue->barrier_events_num++] = event;
+    CL_OBJECT_UNLOCK(queue);
-  //barrier_events_num == barrier_events_size, array is full
+  /* The array is full; double its capacity. */
   queue->barrier_events_size *= 2;
-  TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
-  memcpy(new_list, queue->barrier_events, sizeof(cl_event)*queue->barrier_events_num);
-  cl_free(queue->barrier_events);
-  queue->barrier_events = new_list;
-  queue->barrier_events[queue->barrier_events_num++] = event;
-  return;
+  queue->barrier_events = cl_realloc(queue->barrier_events,
+                                     queue->barrier_events_size * sizeof(cl_event));
+  assert(queue->barrier_events);
+  queue->barrier_events[queue->barrier_events_num++] = event;
-  if(queue->barrier_events)
-    cl_free(queue->barrier_events);
-  queue->barrier_events = NULL;
-  queue->barrier_events_size = 0;
-  queue->barrier_events_num = 0;
-  goto exit;
 LOCAL void
 cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event)
-  cl_int i=0;
+  cl_int i = 0;
+  assert(queue != NULL);
-  if(queue->barrier_events_num == 0)
-    return;
+  CL_OBJECT_LOCK(queue);
+  assert(queue->barrier_events_num > 0);
+  assert(queue->barrier_events);
-  for(i=0; i<queue->barrier_events_num; i++) {
+  for(i = 0; i < queue->barrier_events_num; i++) {
     if(queue->barrier_events[i] == event)
+  assert(i < queue->barrier_events_num); // Must find it.
-  if(i == queue->barrier_events_num)
-    return;
-  if(i == queue->barrier_events_num - 1) {
+  if(i == queue->barrier_events_num - 1) { // The last one.
     queue->barrier_events[i] = NULL;
   } else {
-    for(; i<queue->barrier_events_num-1; i++) {
+    for(; i < queue->barrier_events_num - 1; i++) { // Move forward.
       queue->barrier_events[i] = queue->barrier_events[i+1];
   queue->barrier_events_num -= 1;
+  cl_event_delete(event);
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index d1b8c44..9f6ff39 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -22,84 +22,87 @@
 #include "cl_internals.h"
 #include "cl_driver.h"
-#include "cl_thread.h"
+#include "cl_base_object.h"
 #include "CL/cl.h"
 #include <stdint.h>
 struct intel_gpgpu;
+typedef struct _cl_command_queue_enqueue_worker {
+  cl_command_queue queue;
+  pthread_t tid;
+  cl_uint cookie;
+  cl_bool quit;
+  list_head enqueued_events;
+  cl_uint in_exec_status; // Same value as CL_COMPLETE, CL_SUBMITTED ...
+} _cl_command_queue_enqueue_worker;
+typedef _cl_command_queue_enqueue_worker *cl_command_queue_enqueue_worker;
 /* Basically, this is a (kind-of) batch buffer */
-struct _cl_command_queue {
-  DEFINE_ICD(dispatch)
-  uint64_t magic;                      /* To identify it as a command queue */
-  volatile int ref_n;                  /* We reference count this object */
+typedef struct _cl_command_queue {
+  _cl_base_object base;
+  _cl_command_queue_enqueue_worker worker;
   cl_context ctx;                      /* Its parent context */
-  cl_event* barrier_events;               /* Point to array of non-complete user events that block this command queue */
-  cl_int    barrier_events_num;           /* Number of Non-complete user events */
-  cl_int    barrier_events_size;          /* The size of array that wait_events point to */
-  cl_event* wait_events;               /* Point to array of non-complete user events that block this command queue */
-  cl_int    wait_events_num;           /* Number of Non-complete user events */
-  cl_int    wait_events_size;          /* The size of array that wait_events point to */
-  cl_command_queue_properties  props;  /* Queue properties */
-  cl_command_queue prev, next;         /* We chain the command queues together */
-  void *thread_data;                   /* Used to store thread context data */
+  cl_device_id device;                 /* Its device */
+  cl_event* barrier_events;            /* Point to array of non-complete user events that block this command queue */
+  cl_int barrier_events_num;           /* Number of Non-complete user events */
+  cl_int barrier_events_size;          /* The size of the array that barrier_events points to */
+  cl_command_queue_properties props;   /* Queue properties */
   cl_mem perf;                         /* Where to put the perf counters */
+  cl_uint size;                        /* Stores the size specified for the queue */
+} _cl_command_queue;;
-  void* cmrt_event;                    /* the latest CmEvent* of the command queue */
-/* The macro to get the thread specified gpgpu struct. */
-#define GET_QUEUE_THREAD_GPGPU(queue) \
-	cl_gpgpu gpgpu = queue ? cl_get_thread_gpgpu(queue) : NULL;  \
-	if (queue) \
-	  assert(gpgpu);
+#define CL_OBJECT_COMMAND_QUEUE_MAGIC 0x83650a12b79ce4efLL
+#define CL_OBJECT_IS_COMMAND_QUEUE(obj) ((obj &&                           \
+         ((cl_base_object)obj)->magic == CL_OBJECT_COMMAND_QUEUE_MAGIC &&  \
+         CL_OBJECT_GET_REF(obj) >= 1))
 /* Allocate and initialize a new command queue. Also insert it in the list of
- * command queue in the associated context
- */
-extern cl_command_queue cl_command_queue_new(cl_context);
+ * command queue in the associated context */
+extern cl_command_queue cl_create_command_queue(cl_context, cl_device_id,
+                                                cl_command_queue_properties, cl_uint, cl_int*);
 /* Destroy and deallocate the command queue */
 extern void cl_command_queue_delete(cl_command_queue);
 /* Keep one more reference on the queue */
 extern void cl_command_queue_add_ref(cl_command_queue);
 /* Map ND range kernel from OCL API */
 extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
                                         cl_kernel ker,
+                                        cl_event event,
                                         const uint32_t work_dim,
-                                        const size_t *global_work_offset,
-                                        const size_t *global_work_size,
-                                        const size_t *local_work_size);
+                                        const size_t *global_wk_off,
+                                        const size_t *global_dim_off,
+                                        const size_t *global_wk_sz,
+                                        const size_t *global_wk_sz_use,
+                                        const size_t *local_wk_sz,
+                                        const size_t *local_wk_sz_use);
 /* The memory object where to report the performance */
 extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
-/* Flush for the command queue */
-extern cl_int cl_command_queue_flush(cl_command_queue);
 /* Flush for the specified gpgpu */
-extern int cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
-/* Wait for the completion of the command queue */
-extern cl_int cl_command_queue_finish(cl_command_queue);
+extern int cl_command_queue_flush_gpgpu(cl_gpgpu);
 /* Bind all the surfaces in the GPGPU state */
-extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
+extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel, cl_gpgpu, uint32_t *);
 /* Bind all the image surfaces in the GPGPU state */
-extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
+extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel, cl_gpgpu, uint32_t *);
+/* Bind all exec info to bind table */
+extern cl_int cl_command_queue_bind_exec_info(cl_command_queue, cl_kernel, cl_gpgpu, uint32_t *);
 /* Insert a user event to command's wait_events */
 extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
 /* Remove a user event from command's wait_events */
 extern void cl_command_queue_remove_event(cl_command_queue, cl_event);
 extern void cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event);
 extern void cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event);
+extern void cl_command_queue_notify(cl_command_queue queue);
+extern void cl_command_queue_enqueue_event(cl_command_queue queue, cl_event event);
+extern cl_int cl_command_queue_init_enqueue(cl_command_queue queue);
+extern void cl_command_queue_destroy_enqueue(cl_command_queue queue);
+extern cl_int cl_command_queue_wait_finish(cl_command_queue queue);
+extern cl_int cl_command_queue_wait_flush(cl_command_queue queue);
+/* Note: this function must be called with the queue's lock held. */
+extern cl_event *cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_num);
 #endif /* __CL_COMMAND_QUEUE_H__ */
diff --git a/src/cl_command_queue_enqueue.c b/src/cl_command_queue_enqueue.c
new file mode 100644
index 0000000..44a0761
--- /dev/null
+++ b/src/cl_command_queue_enqueue.c
@@ -0,0 +1,330 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: He Junyan <junyan.he at intel.com>
+ */
+#include "cl_command_queue.h"
+#include "cl_event.h"
+#include "cl_alloc.h"
+#include <stdio.h>
+static void *
+worker_thread_function(void *Arg)
+  cl_command_queue_enqueue_worker worker = (cl_command_queue_enqueue_worker)Arg;
+  cl_command_queue queue = worker->queue;
+  cl_event e;
+  cl_uint cookie = -1;
+  list_node *pos;
+  list_node *n;
+  list_head ready_list;
+  cl_int exec_status;
+  CL_OBJECT_LOCK(queue);
+  while (1) {
+    /* Must have locked here. */
+    if (worker->quit == CL_TRUE) {
+      CL_OBJECT_UNLOCK(queue);
+      return NULL;
+    }
+    if (list_empty(&worker->enqueued_events)) {
+      CL_OBJECT_WAIT_ON_COND(queue);
+      continue;
+    }
+    /* The cookie changes when an event's status changes or something happens to
+       this command queue. If we have already checked the event list and found
+       nothing to execute, we need to wait for a cookie update to avoid looping forever. */
+    if (cookie == worker->cookie) {
+      CL_OBJECT_WAIT_ON_COND(queue);
+      continue;
+    }
+    /* Here we hold the lock while checking event status, to avoid missing a status notification. */
+    list_init(&ready_list);
+    list_for_each_safe(pos, n, &worker->enqueued_events)
+    {
+      e = list_entry(pos, _cl_event, enqueue_node);
+      if (cl_event_is_ready(e) <= CL_COMPLETE) {
+        list_node_del(&e->enqueue_node);
+        list_add_tail(&ready_list, &e->enqueue_node);
+      }
+    }
+    if (list_empty(&ready_list)) { /* Nothing to do, just wait. */
+      cookie = worker->cookie;
+      continue;
+    }
+    /* Notify waiters, we change the event list. */
+    worker->in_exec_status = CL_QUEUED;
+    CL_OBJECT_UNLOCK(queue);
+    /* Do the real work without the lock. */
+    exec_status = CL_SUBMITTED;
+    list_for_each_safe(pos, n, &ready_list)
+    {
+      e = list_entry(pos, _cl_event, enqueue_node);
+      cl_event_exec(e, exec_status, CL_FALSE);
+    }
+    /* Notify all waiting for flush. */
+    CL_OBJECT_LOCK(queue);
+    worker->in_exec_status = CL_SUBMITTED;
+    CL_OBJECT_UNLOCK(queue);
+    list_for_each_safe(pos, n, &ready_list)
+    {
+      e = list_entry(pos, _cl_event, enqueue_node);
+      cl_event_exec(e, CL_COMPLETE, CL_FALSE);
+    }
+    /* Clear and delete all the events. */
+    list_for_each_safe(pos, n, &ready_list)
+    {
+      e = list_entry(pos, _cl_event, enqueue_node);
+      list_node_del(&e->enqueue_node);
+      cl_event_delete(e);
+    }
+    CL_OBJECT_LOCK(queue);
+    worker->in_exec_status = CL_COMPLETE;
+    /* Notify finish waiters: we have finished all the ready events. */
+  }
+LOCAL void
+cl_command_queue_notify(cl_command_queue queue)
+  if (CL_OBJECT_GET_REF(queue) < 1) {
+    return;
+  }
+  assert(queue && (((cl_base_object)queue)->magic == CL_OBJECT_COMMAND_QUEUE_MAGIC));
+  CL_OBJECT_LOCK(queue);
+  queue->worker.cookie++;
+LOCAL void
+cl_command_queue_enqueue_event(cl_command_queue queue, cl_event event)
+  CL_OBJECT_INC_REF(event);
+  assert(CL_OBJECT_IS_COMMAND_QUEUE(queue));
+  CL_OBJECT_LOCK(queue);
+  assert(queue->worker.quit == CL_FALSE);
+  assert(list_node_out_of_list(&event->enqueue_node));
+  list_add_tail(&queue->worker.enqueued_events, &event->enqueue_node);
+  queue->worker.cookie++;
+LOCAL cl_int
+cl_command_queue_init_enqueue(cl_command_queue queue)
+  cl_command_queue_enqueue_worker worker = &queue->worker;
+  worker->queue = queue;
+  worker->quit = CL_FALSE;
+  worker->in_exec_status = CL_COMPLETE;
+  worker->cookie = 8;
+  list_init(&worker->enqueued_events);
+  if (pthread_create(&worker->tid, NULL, worker_thread_function, worker)) {
+    DEBUGP(DL_ERROR, "Can not create worker thread for queue %p...\n", queue);
+    return CL_OUT_OF_RESOURCES;
+  }
+  return CL_SUCCESS;
+LOCAL void
+cl_command_queue_destroy_enqueue(cl_command_queue queue)
+  cl_command_queue_enqueue_worker worker = &queue->worker;
+  list_node *pos;
+  list_node *n;
+  cl_event e;
+  assert(worker->queue == queue);
+  assert(worker->quit == CL_FALSE);
+  CL_OBJECT_LOCK(queue);
+  worker->quit = 1;
+  pthread_join(worker->tid, NULL);
+  /* We will wait for finish before destroy the command queue. */
+  if (!list_empty(&worker->enqueued_events)) {
+    DEBUGP(DL_WARNING, "There are still some enqueued works in the queue %p when this"
+                       " queue is destroyed, this may cause very serious problems.\n",
+           queue);
+    list_for_each_safe(pos, n, &worker->enqueued_events)
+    {
+      e = list_entry(pos, _cl_event, enqueue_node);
+      list_node_del(&e->enqueue_node);
+      cl_event_set_status(e, -1); // Give waiters a chance to wakeup.
+      cl_event_delete(e);
+    }
+  }
+/* Note: this function must be called with the queue's lock held. */
+LOCAL cl_event *
+cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_num)
+  int event_num = 0;
+  list_node *pos;
+  cl_command_queue_enqueue_worker worker = &queue->worker;
+  cl_event *enqueued_list = NULL;
+  int i;
+  cl_event tmp_e = NULL;
+  list_for_each(pos, &worker->enqueued_events)
+  {
+    event_num++;
+  }
+  assert(event_num > 0);
+  enqueued_list = cl_calloc(event_num, sizeof(cl_event));
+  assert(enqueued_list);
+  i = 0;
+  list_for_each(pos, &worker->enqueued_events)
+  {
+    tmp_e = list_entry(pos, _cl_event, enqueue_node);
+    cl_event_add_ref(tmp_e); // Take a temporary reference so the event is not deleted.
+    enqueued_list[i] = tmp_e;
+    i++;
+  }
+  assert(i == event_num);
+  *list_num = event_num;
+  return enqueued_list;
+LOCAL cl_int
+cl_command_queue_wait_flush(cl_command_queue queue)
+  cl_command_queue_enqueue_worker worker = &queue->worker;
+  cl_event *enqueued_list = NULL;
+  cl_uint enqueued_num = 0;
+  int i;
+  CL_OBJECT_LOCK(queue);
+  if (worker->quit) { // already destroy the queue?
+    CL_OBJECT_UNLOCK(queue);
+  }
+  if (!list_empty(&worker->enqueued_events)) {
+    enqueued_list = cl_command_queue_record_in_queue_events(queue, &enqueued_num);
+    assert(enqueued_num > 0);
+    assert(enqueued_list);
+  }
+  while (worker->in_exec_status == CL_QUEUED) {
+    if (worker->quit) { // already destroy the queue?
+      CL_OBJECT_UNLOCK(queue);
+    }
+  }
+  /* Wait for all events to enter the submitted status. */
+  for (i = 0; i < enqueued_num; i++) {
+    CL_OBJECT_LOCK(enqueued_list[i]);
+    while (enqueued_list[i]->status > CL_SUBMITTED) {
+      CL_OBJECT_WAIT_ON_COND(enqueued_list[i]);
+    }
+    CL_OBJECT_UNLOCK(enqueued_list[i]);
+  }
+  for (i = 0; i < enqueued_num; i++) {
+    cl_event_delete(enqueued_list[i]);
+  }
+  if (enqueued_list)
+    cl_free(enqueued_list);
+  return CL_SUCCESS;
+LOCAL cl_int
+cl_command_queue_wait_finish(cl_command_queue queue)
+  cl_command_queue_enqueue_worker worker = &queue->worker;
+  cl_event *enqueued_list = NULL;
+  cl_uint enqueued_num = 0;
+  int i;
+  CL_OBJECT_LOCK(queue);
+  if (worker->quit) { // already destroy the queue?
+    CL_OBJECT_UNLOCK(queue);
+  }
+  if (!list_empty(&worker->enqueued_events)) {
+    enqueued_list = cl_command_queue_record_in_queue_events(queue, &enqueued_num);
+    assert(enqueued_num > 0);
+    assert(enqueued_list);
+  }
+  while (worker->in_exec_status > CL_COMPLETE) {
+    if (worker->quit) { // already destroy the queue?
+      CL_OBJECT_UNLOCK(queue);
+    }
+  }
+  /* Wait for all events to enter the complete status. */
+  for (i = 0; i < enqueued_num; i++) {
+    CL_OBJECT_LOCK(enqueued_list[i]);
+    while (enqueued_list[i]->status > CL_COMPLETE) {
+      CL_OBJECT_WAIT_ON_COND(enqueued_list[i]);
+    }
+    CL_OBJECT_UNLOCK(enqueued_list[i]);
+  }
+  for (i = 0; i < enqueued_num; i++) {
+    cl_event_delete(enqueued_list[i]);
+  }
+  if (enqueued_list)
+    cl_free(enqueued_list);
+  return CL_SUCCESS;
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index a7b967d..dd82a44 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -23,12 +23,15 @@
 #include "cl_kernel.h"
 #include "cl_device_id.h"
 #include "cl_mem.h"
+#include "cl_event.h"
 #include "cl_utils.h"
 #include "cl_alloc.h"
+#include "cl_device_enqueue.h"
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
+#include <unistd.h>
 static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+256; }
@@ -123,12 +126,24 @@ error:
 static int
-cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
+cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker, cl_gpgpu gpgpu)
-  /* calculate constant buffer size
-   * we need raw_size & aligned_size
-   */
+  if (interp_kernel_get_ocl_version(ker->opaque) >= 200) {
+    // pass the start of the constant address space
+    int32_t constant_addrspace = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_CONSTANT_ADDRSPACE, 0);
+    if (constant_addrspace >= 0) {
+      size_t global_const_size = interp_program_get_global_constant_size(ker->program->opaque);
+      if (global_const_size > 0) {
+        *(char **)(ker->curbe + constant_addrspace) = ker->program->global_data_ptr;
+        cl_gpgpu_bind_buf(gpgpu, ker->program->global_data, constant_addrspace, 0, ALIGN(global_const_size, getpagesize()), BTI_CONSTANT);
+      }
+    }
+    return 0;
+  }
+  // TODO: this is only valid for OpenCL 1.2.
+  // Under OpenCL 1.2 we gather all constants into one dedicated surface,
+  // but in 2.0 we put program globals into one surface, while constants
+  // are passed as kernel arguments, each in a separate buffer.
   int32_t arg;
   size_t offset = 0;
   uint32_t raw_size = 0, aligned_size =0;
@@ -207,6 +222,7 @@ cl_curbe_fill(cl_kernel ker,
               const size_t *global_wk_off,
               const size_t *global_wk_sz,
               const size_t *local_wk_sz,
+              const size_t *enqueued_local_wk_sz,
               size_t thread_n)
   int32_t offset;
@@ -216,15 +232,18 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
   UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
   UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
+  UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_X, enqueued_local_wk_sz[0]);
+  UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y, enqueued_local_wk_sz[1]);
+  UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z, enqueued_local_wk_sz[2]);
   UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
   UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
   UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]);
   UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
   UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
   UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
-  UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0]/local_wk_sz[0]);
-  UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
-  UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
+  UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0] / enqueued_local_wk_sz[0] + (global_wk_sz[0]%enqueued_local_wk_sz[0]?1:0));
+  UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1] / enqueued_local_wk_sz[1] + (global_wk_sz[1]%enqueued_local_wk_sz[1]?1:0));
+  UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2] / enqueued_local_wk_sz[2] + (global_wk_sz[2]%enqueued_local_wk_sz[2]?1:0));
 #undef UPLOAD
@@ -255,11 +274,11 @@ static void
 cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   cl_context ctx = ker->program->ctx;
-  cl_device_id device = ctx->device;
+  cl_device_id device = ctx->devices[0];
   const int32_t per_lane_stack_sz = ker->stack_size;
   const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
   const int32_t sub_value = GBE_STACK_BUFFER;
-  const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
+  const int32_t offset_stack_buffer = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
   int32_t stack_sz = per_lane_stack_sz;
   /* No stack required for this kernel */
@@ -269,9 +288,9 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   /* The stack size is given for *each* SIMD lane. So, we accordingly compute
    * the size we need for the complete machine
-  assert(offset >= 0);
+  assert(offset_stack_buffer >= 0);
   stack_sz *= interp_kernel_get_simd_width(ker->opaque);
-  stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
+  stack_sz *= device->max_compute_unit * ctx->devices[0]->max_thread_per_unit;
   /* for some hardware, part of EUs are disabled with EU id reserved,
    * it makes the active EU id larger than count of EUs within a subslice,
@@ -279,7 +298,12 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   cl_driver_enlarge_stack_size(ctx->drv, &stack_sz);
-  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
+  const int32_t offset_stack_size = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_SIZE, 0);
+  if (offset_stack_size >= 0) {
+    *(uint64_t *)(ker->curbe + offset_stack_size) = stack_sz;
+  }
+  cl_gpgpu_set_stack(gpgpu, offset_stack_buffer, stack_sz, BTI_PRIVATE);
 static int
@@ -331,24 +355,36 @@ cl_alloc_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num
 LOCAL cl_int
 cl_command_queue_ND_range_gen7(cl_command_queue queue,
                                cl_kernel ker,
+                               cl_event event,
                                const uint32_t work_dim,
                                const size_t *global_wk_off,
+                               const size_t *global_dim_off,
                                const size_t *global_wk_sz,
-                               const size_t *local_wk_sz)
+                               const size_t *global_wk_sz_use,
+                               const size_t *local_wk_sz,
+                               const size_t *local_wk_sz_use)
+  cl_gpgpu gpgpu = cl_gpgpu_new(queue->ctx->drv);
   cl_context ctx = queue->ctx;
   char *final_curbe = NULL;  /* Includes them and one sub-buffer per group */
   cl_gpgpu_kernel kernel;
   const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
   size_t i, batch_sz = 0u, local_sz = 0u;
-  size_t cst_sz = ker->curbe_sz= interp_kernel_get_curbe_size(ker->opaque);
+  size_t cst_sz = interp_kernel_get_curbe_size(ker->opaque);
   int32_t scratch_sz = interp_kernel_get_scratch_size(ker->opaque);
   size_t thread_n = 0u;
   int printf_num = 0;
   cl_int err = CL_SUCCESS;
   size_t global_size = global_wk_sz[0] * global_wk_sz[1] * global_wk_sz[2];
   void* printf_info = NULL;
+  uint32_t max_bti = 0;
+  if (ker->exec_info_n > 0) {
+    cst_sz += ker->exec_info_n * sizeof(void *);
+    cst_sz = (cst_sz + 31) / 32 * 32;   //align to register size, hard code here.
+    ker->curbe = cl_realloc(ker->curbe, cst_sz);
+  }
+  ker->curbe_sz = cst_sz;
   /* Setup kernel */
   kernel.name = interp_kernel_get_name(ker->opaque);
@@ -359,21 +395,21 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   kernel.use_slm = interp_kernel_use_slm(ker->opaque);
   /* Compute the number of HW threads we need */
-  if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz) != CL_SUCCESS)) {
+  /* Parenthesise the assignment so err holds the returned cl_int rather than
+   * the 0/1 result of the comparison; err is what gets returned below. */
+  if(UNLIKELY((err = cl_kernel_work_group_sz(ker, local_wk_sz_use, 3, &local_sz)) != CL_SUCCESS)) {
     DEBUGP(DL_ERROR, "Work group size exceed Kernel's work group size.");
     return err;
   kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
   kernel.curbe_sz = cst_sz;
-  if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
+  if (scratch_sz > ker->program->ctx->devices[0]->scratch_mem_size) {
     DEBUGP(DL_ERROR, "Out of scratch memory %d.", scratch_sz);
     return CL_OUT_OF_RESOURCES;
   /* Curbe step 1: fill the constant urb buffer data shared by all threads */
   if (ker->curbe) {
-    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
-    if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
+    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz,local_wk_sz_use ,local_wk_sz, thread_n);
+    if (kernel.slm_sz > ker->program->ctx->devices[0]->local_mem_size) {
       DEBUGP(DL_ERROR, "Out of shared local memory %d.", kernel.slm_sz);
       return CL_OUT_OF_RESOURCES;
@@ -384,9 +420,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Setup the kernel */
   if (queue->props & CL_QUEUE_PROFILING_ENABLE)
-    err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 1);
+    err = cl_gpgpu_state_init(gpgpu, ctx->devices[0]->max_compute_unit * ctx->devices[0]->max_thread_per_unit, cst_sz / 32, 1);
-    err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 0);
+    err = cl_gpgpu_state_init(gpgpu, ctx->devices[0]->max_compute_unit * ctx->devices[0]->max_thread_per_unit, cst_sz / 32, 0);
   if (err != 0)
     goto error;
   printf_num = interp_get_printf_num(printf_info);
@@ -403,10 +439,14 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind user buffers */
-  cl_command_queue_bind_surface(queue, ker);
+  cl_command_queue_bind_surface(queue, ker, gpgpu, &max_bti);
   /* Bind user images */
-  if(UNLIKELY(err = cl_command_queue_bind_image(queue, ker) != CL_SUCCESS))
+  /* Parenthesise the assignment so err holds the returned cl_int rather than
+   * the 0/1 result of the comparison; err is what gets returned below. */
+  if(UNLIKELY((err = cl_command_queue_bind_image(queue, ker, gpgpu, &max_bti)) != CL_SUCCESS))
     return err;
+  /* Bind all exec infos */
+  cl_command_queue_bind_exec_info(queue, ker, gpgpu, &max_bti);
+  /* Bind device enqueue buffer */
+  cl_device_enqueue_bind_buffer(gpgpu, ker, &max_bti, &kernel);
   /* Bind all samplers */
   if (ker->vme)
     cl_gpgpu_bind_vme_state(gpgpu, ker->accel);
@@ -419,7 +459,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
-  if (cl_upload_constant_buffer(queue, ker) != 0)
+  if (cl_upload_constant_buffer(queue, ker, gpgpu) != 0)
     goto error;
   cl_gpgpu_states_setup(gpgpu, &kernel);
@@ -431,7 +471,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
     for (i = 0; i < thread_n; ++i) {
         memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
-    TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
+    TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz_use, simd_sz, cst_sz, thread_n);
     if (cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz) != 0)
       goto error;
@@ -440,14 +480,19 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   batch_sz = cl_kernel_compute_batch_sz(ker);
   if (cl_gpgpu_batch_reset(gpgpu, batch_sz) != 0)
     goto error;
-  cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
+  //cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
   /* Issue the GPGPU_WALKER command */
-  cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+  cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off,global_dim_off, global_wk_sz_use, local_wk_sz_use);
   /* Close the batch buffer and submit it */
   cl_gpgpu_batch_end(gpgpu, 0);
+  event->exec_data.queue = queue;
+  event->exec_data.gpgpu = gpgpu;
+  event->exec_data.type = EnqueueNDRangeKernel;
   return CL_SUCCESS;
diff --git a/src/cl_context.c b/src/cl_context.c
index a6bde7d..3f2e757 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -22,6 +22,8 @@
 #include "cl_context.h"
 #include "cl_command_queue.h"
 #include "cl_mem.h"
+#include "cl_sampler.h"
+#include "cl_event.h"
 #include "cl_alloc.h"
 #include "cl_utils.h"
 #include "cl_driver.h"
@@ -38,6 +40,139 @@
 #include <assert.h>
 #include <string.h>
+/* Register a command queue with a context. The queue must not belong to any
+ * context yet (asserted). Calls cl_context_add_ref(ctx), appends the queue's
+ * node to ctx->queues, increments ctx->queue_num and stores ctx in
+ * queue->ctx.
+ * NOTE(review): the loop on ctx->queue_modify_disable has no visible body
+ * here -- confirm how the wait and any locking are done in the full source. */
+LOCAL void
+cl_context_add_queue(cl_context ctx, cl_command_queue queue) {
+  assert(queue->ctx == NULL);
+  cl_context_add_ref(ctx);
+  while (ctx->queue_modify_disable) {
+  }
+  list_add_tail(&ctx->queues, &queue->base.node);
+  ctx->queue_num++;
+  queue->ctx = ctx;
+/* Detach a command queue from the context it belongs to (asserted to be
+ * ctx). Unlinks the queue's list node, decrements ctx->queue_num, calls
+ * cl_context_delete(ctx) and clears queue->ctx.
+ * NOTE(review): the loop on ctx->queue_modify_disable has no visible body
+ * here -- confirm how the wait and any locking are done in the full source. */
+LOCAL void
+cl_context_remove_queue(cl_context ctx, cl_command_queue queue) {
+  assert(queue->ctx == ctx);
+  while (ctx->queue_modify_disable) {
+  }
+  list_node_del(&queue->base.node);
+  ctx->queue_num--;
+  cl_context_delete(ctx);
+  queue->ctx = NULL;
+/* Register a memory object with a context. The object must not belong to any
+ * context yet (asserted). Calls cl_context_add_ref(ctx), appends the object's
+ * node to ctx->mem_objects, increments ctx->mem_object_num and stores ctx in
+ * mem->ctx. */
+LOCAL void
+cl_context_add_mem(cl_context ctx, cl_mem mem) {
+  assert(mem->ctx == NULL);
+  cl_context_add_ref(ctx);
+  list_add_tail(&ctx->mem_objects, &mem->base.node);
+  ctx->mem_object_num++;
+  mem->ctx = ctx;
+/* Detach a memory object from the context it belongs to (asserted to be
+ * ctx). Unlinks the object's list node, decrements ctx->mem_object_num,
+ * calls cl_context_delete(ctx) and clears mem->ctx. */
+LOCAL void
+cl_context_remove_mem(cl_context ctx, cl_mem mem) {
+  assert(mem->ctx == ctx);
+  list_node_del(&mem->base.node);
+  ctx->mem_object_num--;
+  cl_context_delete(ctx);
+  mem->ctx = NULL;
+/* Register a sampler with a context. The sampler must not belong to any
+ * context yet (asserted). Calls cl_context_add_ref(ctx), appends the
+ * sampler's node to ctx->samplers, increments ctx->sampler_num and stores
+ * ctx in sampler->ctx. */
+LOCAL void
+cl_context_add_sampler(cl_context ctx, cl_sampler sampler) {
+  assert(sampler->ctx == NULL);
+  cl_context_add_ref(ctx);
+  list_add_tail(&ctx->samplers, &sampler->base.node);
+  ctx->sampler_num++;
+  sampler->ctx = ctx;
+/* Detach a sampler from the context it belongs to (asserted to be ctx).
+ * Unlinks the sampler's list node, decrements ctx->sampler_num, calls
+ * cl_context_delete(ctx) and clears sampler->ctx. */
+LOCAL void
+cl_context_remove_sampler(cl_context ctx, cl_sampler sampler) {
+  assert(sampler->ctx == ctx);
+  list_node_del(&sampler->base.node);
+  ctx->sampler_num--;
+  cl_context_delete(ctx);
+  sampler->ctx = NULL;
+/* Register an event with a context. The event must not belong to any context
+ * yet (asserted). Calls cl_context_add_ref(ctx), appends the event's node to
+ * ctx->events, increments ctx->event_num and stores ctx in event->ctx. */
+LOCAL void
+cl_context_add_event(cl_context ctx, cl_event event) {
+  assert(event->ctx == NULL);
+  cl_context_add_ref(ctx);
+  list_add_tail(&ctx->events, &event->base.node);
+  ctx->event_num++;
+  event->ctx = ctx;
+/* Detach an event from the context it belongs to (asserted to be ctx).
+ * Unlinks the event's list node, decrements ctx->event_num, calls
+ * cl_context_delete(ctx) and clears event->ctx. */
+LOCAL void
+cl_context_remove_event(cl_context ctx, cl_event event) {
+  assert(event->ctx == ctx);
+  list_node_del(&event->base.node);
+  ctx->event_num--;
+  cl_context_delete(ctx);
+  event->ctx = NULL;
+/* Register a program with a context. The program must not belong to any
+ * context yet (asserted). Calls cl_context_add_ref(ctx), appends the
+ * program's node to ctx->programs, increments ctx->program_num and stores
+ * ctx in program->ctx. */
+LOCAL void
+cl_context_add_program(cl_context ctx, cl_program program) {
+  assert(program->ctx == NULL);
+  cl_context_add_ref(ctx);
+  list_add_tail(&ctx->programs, &program->base.node);
+  ctx->program_num++;
+  program->ctx = ctx;
+/* Detach a program from the context it belongs to (asserted to be ctx).
+ * Unlinks the program's list node, decrements ctx->program_num, calls
+ * cl_context_delete(ctx) and clears program->ctx. */
+LOCAL void
+cl_context_remove_program(cl_context ctx, cl_program program) {
+  assert(program->ctx == ctx);
+  list_node_del(&program->base.node);
+  ctx->program_num--;
+  cl_context_delete(ctx);
+  program->ctx = NULL;
 #define CHECK(var) \
   if (var) \
     return CL_INVALID_PROPERTY; \
@@ -125,6 +260,10 @@ cl_create_context(const cl_context_properties *  properties,
   cl_context ctx = NULL;
   cl_int err = CL_SUCCESS;
   cl_uint prop_len = 0;
+  cl_uint dev_num = 0;
+  cl_device_id* all_dev = NULL;
+  cl_uint i, j;
   /* XXX */
   FATAL_IF (num_devices != 1, "Only one device is supported");
@@ -132,8 +271,32 @@ cl_create_context(const cl_context_properties *  properties,
   if (UNLIKELY(((err = cl_context_properties_process(properties, &props, &prop_len)) != CL_SUCCESS)))
     goto error;
+  /* Filter out repeated devices: copy each distinct entry of devices[] into
+   * all_dev[] in first-seen order and count them in dev_num. */
+  assert(num_devices > 0);
+  all_dev = cl_calloc(num_devices, sizeof(cl_device_id));
+  if (all_dev == NULL) {
+    /* errcode_ret is optional (it is NULL-checked further down), so it must
+     * not be written through unconditionally. */
+    if (errcode_ret != NULL)
+      *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
+  }
+  for (i = 0; i < num_devices; i++) {
+    /* Look for an earlier entry equal to devices[i]. */
+    for (j = 0; j < i; j++) {
+      if (devices[j] == devices[i]) {
+        break;
+      }
+    }
+    if (j != i) { // Found a duplicate; skip it.
+      continue;
+    }
+    all_dev[dev_num] = devices[i];
+    dev_num++;
+  }
+  assert(dev_num == 1); // TODO: multi devices later.
   /* We are good */
-  if (UNLIKELY((ctx = cl_context_new(&props)) == NULL)) {
+  if (UNLIKELY((ctx = cl_context_new(&props, dev_num, all_dev)) == NULL)) {
+    cl_free(all_dev);
     goto error;
@@ -143,13 +306,13 @@ cl_create_context(const cl_context_properties *  properties,
     memcpy(ctx->prop_user, properties, sizeof(cl_context_properties)*prop_len);
   ctx->prop_len = prop_len;
-  /* Attach the device to the context */
-  ctx->device = *devices;
+  /* cl_context_new will use all_dev. */
+  all_dev = NULL;
   /* Save the user callback and user data*/
   ctx->pfn_notify = pfn_notify;
   ctx->user_data = user_data;
-  cl_driver_set_atomic_flag(ctx->drv, ctx->device->atomic_test_result);
+  cl_driver_set_atomic_flag(ctx->drv, ctx->devices[0]->atomic_test_result);
   if (errcode_ret != NULL)
@@ -162,22 +325,23 @@ error:
 LOCAL cl_context
-cl_context_new(struct _cl_context_prop *props)
+cl_context_new(struct _cl_context_prop *props, cl_uint dev_num, cl_device_id* all_dev)
   cl_context ctx = NULL;
   TRY_ALLOC_NO_ERR (ctx, CALLOC(struct _cl_context));
+  ctx->devices = all_dev;
+  ctx->device_num = dev_num;
+  list_init(&ctx->queues);
+  list_init(&ctx->mem_objects);
+  list_init(&ctx->samplers);
+  list_init(&ctx->events);
+  list_init(&ctx->programs);
+  ctx->queue_modify_disable = CL_FALSE;
   TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new(props));
-  SET_ICD(ctx->dispatch)
   ctx->props = *props;
-  ctx->magic = CL_MAGIC_CONTEXT_HEADER;
-  ctx->ref_n = 1;
   ctx->ver = cl_driver_get_ver(ctx->drv);
-  pthread_mutex_init(&ctx->program_lock, NULL);
-  pthread_mutex_init(&ctx->queue_lock, NULL);
-  pthread_mutex_init(&ctx->buffer_lock, NULL);
-  pthread_mutex_init(&ctx->sampler_lock, NULL);
-  pthread_mutex_init(&ctx->accelerator_intel_lock, NULL);
   return ctx;
@@ -195,7 +359,7 @@ cl_context_delete(cl_context ctx)
   /* We are not done yet */
-  if (atomic_dec(&ctx->ref_n) > 1)
+  if (CL_OBJECT_DEC_REF(ctx) > 1)
   /* delete the internal programs. */
@@ -218,16 +382,9 @@ cl_context_delete(cl_context ctx)
   ctx->built_in_prgs = NULL;
-  /* All object lists should have been freed. Otherwise, the reference counter
-   * of the context cannot be 0
-   */
-  assert(ctx->queues == NULL);
-  assert(ctx->programs == NULL);
-  assert(ctx->buffers == NULL);
-  assert(ctx->drv);
-  ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
@@ -235,32 +392,7 @@ LOCAL void
 cl_context_add_ref(cl_context ctx)
-  atomic_inc(&ctx->ref_n);
-LOCAL cl_command_queue
-cl_context_create_queue(cl_context ctx,
-                        cl_device_id device,
-                        cl_command_queue_properties properties, /* XXX */
-                        cl_int *errcode_ret)
-  cl_command_queue queue = NULL;
-  cl_int err = CL_SUCCESS;
-  /* We create the command queue and store it in the context list of queues */
-  TRY_ALLOC (queue, cl_command_queue_new(ctx));
-  queue->props = properties;
-  if (errcode_ret)
-    *errcode_ret = err;
-  return queue;
-  cl_command_queue_delete(queue);
-  queue = NULL;
-  goto exit;
@@ -276,9 +408,10 @@ cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
   cl_int ret;
   cl_int binary_status = CL_SUCCESS;
   cl_kernel ker;
-  pthread_mutex_lock(&ctx->program_lock);
   if (ctx->internal_prgs[index] == NULL) {
-    ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
+    ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->devices[0],
       &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
     if (!ctx->internal_prgs[index]) {
@@ -326,6 +459,41 @@ cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
   ker = ctx->internal_kernels[index];
-  pthread_mutex_unlock(&ctx->program_lock);
   return cl_kernel_dup(ker);
+/* Find the SVM memory object whose host range contains p.
+ * Walks ctx->mem_objects and returns the first object that has a non-NULL
+ * host_ptr, has is_svm set, has type CL_MEM_SVM_TYPE and satisfies
+ * host_ptr <= p < host_ptr + size. Returns NULL when no object matches. */
+cl_context_get_svm_from_ptr(cl_context ctx, const void * p)
+  struct list_node *pos;
+  cl_mem buf;
+  list_for_each (pos, (&ctx->mem_objects)) {
+    buf = (cl_mem)list_entry(pos, _cl_base_object, node);
+    /* Skip anything that is not an SVM allocation with a host address. */
+    if(buf->host_ptr == NULL) continue;
+    if(buf->is_svm == 0) continue;
+    if(buf->type != CL_MEM_SVM_TYPE) continue;
+    /* Half-open range test: [host_ptr, host_ptr + size). */
+    if((size_t)buf->host_ptr <= (size_t)p &&
+       (size_t)p < ((size_t)buf->host_ptr + buf->size))
+      return buf;
+  }
+  return NULL;
+/* Find the memory object whose host range contains p.
+ * Walks ctx->mem_objects and returns the first object that has a non-NULL
+ * host_ptr and satisfies host_ptr <= p < host_ptr + size, whatever its type.
+ * Returns NULL when no object matches. */
+cl_context_get_mem_from_ptr(cl_context ctx, const void * p)
+  struct list_node *pos;
+  cl_mem buf;
+  list_for_each (pos, (&ctx->mem_objects)) {
+    buf = (cl_mem)list_entry(pos, _cl_base_object, node);
+    /* Objects without a host address cannot contain p. */
+    if(buf->host_ptr == NULL) continue;
+    /* Half-open range test: [host_ptr, host_ptr + size). */
+    if((size_t)buf->host_ptr <= (size_t)p &&
+       (size_t)p < ((size_t)buf->host_ptr + buf->size))
+      return buf;
+  }
+  return NULL;
diff --git a/src/cl_context.h b/src/cl_context.h
index 489e5d7..4812afd 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -24,7 +24,7 @@
 #include "CL/cl_ext.h"
 #include "cl_internals.h"
 #include "cl_driver.h"
-#include "cl_khr_icd.h"
+#include "cl_base_object.h"
 #include <stdint.h>
 #include <pthread.h>
@@ -99,23 +99,23 @@ struct _cl_context_prop {
 #define EGL_CTX(ctx)    (EGLContext)(ctx->props.gl_context)
 /* Encapsulate the whole device */
 struct _cl_context {
-  DEFINE_ICD(dispatch)
-  uint64_t magic;                   /* To identify it as a context */
-  volatile int ref_n;               /* We reference count this object */
+  _cl_base_object base;
   cl_driver drv;                    /* Handles HW or simulator */
-  cl_device_id device;              /* All information about the GPU device */
-  cl_command_queue queues;          /* All command queues currently allocated */
-  cl_program programs;              /* All programs currently allocated */
-  cl_mem buffers;                   /* All memory object currently allocated */
-  cl_sampler samplers;              /* All sampler object currently allocated */
+  cl_device_id* devices;            /* All devices belong to this context */
+  cl_uint device_num;               /* Devices number of this context */
+  list_head queues;                 /* All command queues currently allocated */
+  cl_uint queue_num;                /* All queue number currently allocated */
+  cl_uint queue_modify_disable;     /* Temp disable queue list change. */
+  list_head mem_objects;            /* All memory object currently allocated */
+  cl_uint mem_object_num;           /* All memory number currently allocated */
+  list_head samplers;               /* All sampler object currently allocated */
+  cl_uint sampler_num;              /* All sampler number currently allocated */
+  list_head events;                 /* All event object currently allocated */
+  cl_uint event_num;                /* All event number currently allocated */
+  list_head programs;               /* All programs currently allocated */
+  cl_uint program_num;              /* All program number currently allocated */
   cl_accelerator_intel accels;      /* All accelerator_intel object currently allocated */
-  cl_event   events;                /* All event object currently allocated */
-  pthread_mutex_t queue_lock;       /* To allocate and deallocate queues */
-  pthread_mutex_t program_lock;     /* To allocate and deallocate programs */
-  pthread_mutex_t buffer_lock;      /* To allocate and deallocate buffers */
-  pthread_mutex_t sampler_lock;     /* To allocate and deallocate samplers */
-  pthread_mutex_t accelerator_intel_lock;     /* To allocate and deallocate accelerator_intel */
-  pthread_mutex_t event_lock;       /* To allocate and deallocate events */
   cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
                                     /* All programs internal used, for example clEnqueuexxx api use */
   cl_kernel  internal_kernels[CL_INTERNAL_KERNEL_MAX];
@@ -132,6 +132,22 @@ struct _cl_context {
+/* True when obj is non-NULL, carries CL_OBJECT_CONTEXT_MAGIC in its base
+ * object and has a reference count of at least 1. The argument is
+ * parenthesised so that expression arguments expand correctly; note that it
+ * is still evaluated more than once. */
+#define CL_OBJECT_IS_CONTEXT(obj) (((obj) &&                         \
+         ((cl_base_object)(obj))->magic == CL_OBJECT_CONTEXT_MAGIC && \
+         CL_OBJECT_GET_REF(obj) >= 1))
+/* Attach an object to / detach an object from a context's per-type list.
+ * Each add takes a reference on the context via cl_context_add_ref(); each
+ * remove calls cl_context_delete() on it. */
+extern void cl_context_add_queue(cl_context ctx, cl_command_queue queue);
+extern void cl_context_remove_queue(cl_context ctx, cl_command_queue queue);
+extern void cl_context_add_mem(cl_context ctx, cl_mem mem);
+extern void cl_context_remove_mem(cl_context ctx, cl_mem mem);
+extern void cl_context_add_sampler(cl_context ctx, cl_sampler sampler);
+extern void cl_context_remove_sampler(cl_context ctx, cl_sampler sampler);
+extern void cl_context_add_event(cl_context ctx, cl_event event);
+extern void cl_context_remove_event(cl_context ctx, cl_event event);
+extern void cl_context_add_program(cl_context ctx, cl_program program);
+extern void cl_context_remove_program(cl_context ctx, cl_program program);
 /* Implement OpenCL function */
 extern cl_context cl_create_context(const cl_context_properties*,
@@ -141,7 +157,7 @@ extern cl_context cl_create_context(const cl_context_properties*,
 /* Allocate and initialize a context */
-extern cl_context cl_context_new(struct _cl_context_prop *);
+extern cl_context cl_context_new(struct _cl_context_prop *prop, cl_uint dev_num, cl_device_id* all_dev);
 /* Destroy and deallocate a context */
 extern void cl_context_delete(cl_context);
@@ -149,12 +165,6 @@ extern void cl_context_delete(cl_context);
 /* Increment the context reference counter */
 extern void cl_context_add_ref(cl_context);
-/* Create the command queue from the given context and device */
-extern cl_command_queue cl_context_create_queue(cl_context,
-                                                cl_device_id,
-                                                cl_command_queue_properties,
-                                                cl_int*);
 /* Enqueue a ND Range kernel */
 extern cl_int cl_context_ND_kernel(cl_context,
@@ -171,5 +181,10 @@ extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
 extern cl_kernel cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
                   const char * str_kernel, size_t size, const char * str_option);
+/* Get the SVM from pointer, return NULL if pointer is not from SVM */
+extern cl_mem cl_context_get_svm_from_ptr(cl_context ctx, const void *p);
+/* Get the mem from pointer, return NULL if pointer is not from mem*/
+extern cl_mem cl_context_get_mem_from_ptr(cl_context ctx, const void *p);
 #endif /* __CL_CONTEXT_H__ */
diff --git a/src/cl_device_enqueue.c b/src/cl_device_enqueue.c
new file mode 100644
index 0000000..b6932df
--- /dev/null
+++ b/src/cl_device_enqueue.c
@@ -0,0 +1,201 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang<rong.r.yang at intel.com>
+ */
+#include "cl_device_enqueue.h"
+#include "cl_mem.h"
+#include "cl_utils.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_alloc.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
+#include "cl_event.h"
+/* Collect one address per buffer argument of ker into
+ * ker->device_enqueue_infos, advancing ker->device_enqueue_info_n.
+ *
+ * Only arguments of type GBE_ARG_GLOBAL_PTR or GBE_ARG_CONSTANT_PTR that
+ * have a mem object bound are considered. For a non-SVM argument the buffer
+ * is mapped, the mapped address is handed to cl_buffer_set_softpin_offset()
+ * and stored in mem->host_ptr, and that address is recorded. For an SVM
+ * argument the existing host_ptr is recorded. Always returns 0. */
+LOCAL cl_int
+cl_device_enqueue_fix_offset(cl_kernel ker) {
+  uint32_t i;
+  void *ptr;
+  cl_mem mem;
+  enum gbe_arg_type arg_type; /* kind of argument */
+  for (i = 0; i < ker->arg_n; ++i) {
+    arg_type = interp_kernel_get_arg_type(ker->opaque, i);
+    // TODO: how about image arguments?
+    if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) || !ker->args[i].mem)
+      continue;
+    if(!ker->args[i].is_svm) {
+      mem = ker->args[i].mem;
+      ptr = cl_mem_map(mem, 0);
+      // presumably pins the buffer's GPU address to the mapped CPU address -- confirm
+      cl_buffer_set_softpin_offset(mem->bo, (size_t)ptr);
+      cl_buffer_set_bo_use_full_range(mem->bo, 1);
+      cl_buffer_disable_reuse(mem->bo);
+      mem->host_ptr = ptr;
+      cl_mem_unmap(mem);
+      ker->device_enqueue_infos[ker->device_enqueue_info_n++] = ptr;
+    } else {
+      ker->device_enqueue_infos[ker->device_enqueue_info_n++] = ker->args[i].mem->host_ptr;
+    }
+  }
+  return 0;
+/* Bind the device-enqueue buffer of ker to gpgpu when the kernel has a curbe
+ * slot for it (offset > 0); otherwise do nothing. Always returns 0.
+ *
+ * On the first use for a kernel (useDeviceEnqueue still false) this
+ * allocates the 32 MiB SVM buffer and the device_enqueue_infos array if they
+ * do not exist yet, resets device_enqueue_info_n, marks the kernel as using
+ * device enqueue, fills the info array via cl_device_enqueue_fix_offset()
+ * and takes a reference on the kernel. The buffer is then bound at *max_bti
+ * and the kernel is attached to gpgpu with cl_gpgpu_set_kernel().
+ * The `kernel` parameter is not used in the visible lines.
+ * NOTE(review): `value` is not declared in the visible lines -- confirm its
+ * definition in the full source. */
+LOCAL cl_int
+cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker, uint32_t *max_bti, cl_gpgpu_kernel *kernel)
+  int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
+  size_t buf_size = 32 * 1024 * 1024;  // fixed at 32 MiB
+  cl_mem mem;
+  if(offset > 0) {
+    if(ker->useDeviceEnqueue == false) {
+      if(ker->device_enqueue_ptr == NULL)
+        ker->device_enqueue_ptr = cl_mem_svm_allocate(ker->program->ctx, 0, buf_size, 0);
+      if(ker->device_enqueue_infos == NULL)
+        ker->device_enqueue_infos = cl_calloc(ker->arg_n, sizeof(void *));
+      ker->device_enqueue_info_n = 0;
+      ker->useDeviceEnqueue = CL_TRUE;
+      cl_device_enqueue_fix_offset(ker);
+      // presumably released by cl_kernel_delete(ker) in cl_device_enqueue_parse_result -- confirm
+      cl_kernel_add_ref(ker);
+    }
+    // look up the cl_mem that backs the SVM pointer so its bo can be bound
+    mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
+    assert(mem);
+    cl_gpgpu_bind_buf(gpgpu, mem->bo, offset, 0, buf_size, *max_bti);
+    cl_gpgpu_set_kernel(gpgpu, ker);
+  }
+  return 0;
+/* ND-range description read back from the device-enqueue buffer by
+ * cl_device_enqueue_parse_result(). */
+typedef struct ndrange_info_t {
+  /* Bits 4-7: work dimension minus one (0..2). Bits 0-3: a kind value; the
+   * local size is used when it is > 1 and the global offset when it is > 2. */
+  int type;
+  int global_work_size[3];
+  int local_work_size[3];
+  int global_work_offset[3];
+} ndrange_info_t;
+/* Block record read back from the device-enqueue buffer by
+ * cl_device_enqueue_parse_result(); it follows an ndrange_info_t. */
+typedef struct Block_literal {
+  void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
+  int flags;
+  int reserved;
+  int index;             // passed to interp_program_get_device_enqueue_kernel_name() to find the child kernel
+  struct Block_descriptor_1 {
+    unsigned long int slm_size;         // byte size of the int array that follows the block in the buffer
+    unsigned long int size;         // byte size of the block record; the parser advances by this amount
+    // optional helper functions
+    void *copy_helper;     // IFF (1<<25)
+    void *dispose_helper;             // IFF (1<<25)
+    // required ABI.2010.3.16
+    const char *signature;                         // IFF (1<<30)
+  } *descriptor;
+  // imported variables
+} Block_literal;
+/* Launch the child kernels that a device-enqueue kernel recorded.
+ *
+ * Returns 0 immediately when gpgpu has no kernel attached or that kernel
+ * does not use device enqueue, and -1 when no SVM object in the context
+ * covers ker->device_enqueue_ptr. Otherwise it waits for gpgpu's batch
+ * buffer, maps the enqueue buffer and walks it: the leading int is the
+ * number of record bytes that follow, and each record is an ndrange_info_t,
+ * then a Block_literal (block->descriptor->size bytes), then
+ * block->descriptor->slm_size bytes of int sizes. Each record is enqueued on
+ * `queue` with clEnqueueNDRangeKernel(). Finally the function waits for the
+ * last child launch, unmaps the buffer, calls cl_kernel_delete(ker) and
+ * returns 0. */
+LOCAL cl_int
+cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu)
+  cl_mem mem;
+  int size, type, dim, i;
+  const char * kernel_name;
+  cl_kernel child_ker;
+  cl_event evt = NULL;
+  cl_kernel ker = cl_gpgpu_get_kernel(gpgpu);
+  if(ker == NULL || ker->useDeviceEnqueue == CL_FALSE)
+    return 0;
+  void *buf = cl_gpgpu_ref_batch_buf(gpgpu);
+  // Wait for this gpgpu's batch buffer to finish; the gpgpu held by the queue
+  // may not be the same as the gpgpu parameter, e.g. when flushing an event.
+  cl_gpgpu_sync(buf);
+  cl_gpgpu_unref_batch_buf(buf);
+  mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
+  if(mem == NULL) return -1;
+  char *ptr = (char *)cl_mem_map(mem, 0);
+  // The first 4 bytes hold the total size of the records that follow.
+  size =  *(int *)ptr;
+  ptr += 4;
+  while(size > 0) {
+    // Defaults for the dimensions the record does not specify.
+    size_t fixed_global_off[] = {0,0,0};
+    size_t fixed_global_sz[] = {1,1,1};
+    size_t fixed_local_sz[] = {1,1,1};
+    ndrange_info_t* ndrange_info = (ndrange_info_t *)ptr;
+    size -= sizeof(ndrange_info_t);
+    ptr += sizeof(ndrange_info_t);
+    Block_literal *block = (Block_literal *)ptr;
+    size -=  block->descriptor->size;
+    ptr += block->descriptor->size;
+    // High nibble: work dimension minus one; low nibble: which fields are valid.
+    type = ndrange_info->type;
+    dim = (type & 0xf0) >> 4;
+    type = type & 0xf;
+    assert(dim <= 2);
+    for(i = 0; i <= dim; i++) {
+      fixed_global_sz[i] = ndrange_info->global_work_size[i];
+      if(type > 1)
+        fixed_local_sz[i] = ndrange_info->local_work_size[i];
+      if(type > 2)
+        fixed_global_off[i] = ndrange_info->global_work_offset[i];
+    }
+    // An array of slm_size bytes of int sizes follows the block.
+    int *slm_sizes = (int *)ptr;
+    int slm_size = block->descriptor->slm_size;
+    size -= slm_size;
+    ptr += slm_size;
+    kernel_name = interp_program_get_device_enqueue_kernel_name(ker->program->opaque, block->index);
+    child_ker = cl_program_create_kernel(ker->program, kernel_name, NULL);
+    assert(child_ker);
+    // Argument 0 is the block itself; arguments 1.. take each recorded size
+    // with a NULL value.
+    cl_kernel_set_arg_svm_pointer(child_ker, 0, block);
+    int index = 1;
+    for(i=0; i<slm_size/sizeof(int); i++, index++) {
+      cl_kernel_set_arg(child_ker, index, slm_sizes[i], NULL);
+    }
+    cl_kernel_set_exec_info(child_ker, ker->device_enqueue_info_n * sizeof(void *),
+                            ker->device_enqueue_infos);
+    // Only the event of the most recent launch is kept.
+    if (evt != NULL) {
+      clReleaseEvent(evt);
+      evt = NULL;
+    }
+    clEnqueueNDRangeKernel(queue, child_ker, dim + 1, fixed_global_off,
+                           fixed_global_sz, fixed_local_sz, 0, NULL, &evt);
+    cl_command_queue_flush_gpgpu(gpgpu);
+    cl_kernel_delete(child_ker);
+  }
+  if (evt != NULL) {
+    // Can't call clWaitForEvents here; it may cause a deadlock.
+    // If evt->exec_data.gpgpu is NULL, evt has already finished.
+    if (evt->exec_data.gpgpu) {
+      buf = cl_gpgpu_ref_batch_buf(evt->exec_data.gpgpu);
+      // Wait for that gpgpu's batch buffer to finish; the gpgpu held by the
+      // queue may not be the same as the gpgpu parameter, e.g. when flushing
+      // an event.
+      cl_gpgpu_sync(buf);
+      cl_gpgpu_unref_batch_buf(buf);
+    }
+    clReleaseEvent(evt);
+    evt = NULL;
+  }
+  cl_mem_unmap_auto(mem);
+  // presumably drops the reference taken in cl_device_enqueue_bind_buffer -- confirm
+  cl_kernel_delete(ker);
+  return 0;
diff --git a/backend/src/libocl/src/ocl_sync.cl b/src/cl_device_enqueue.h
similarity index 56%
copy from backend/src/libocl/src/ocl_sync.cl
copy to src/cl_device_enqueue.h
index b6efef8..17fc6c7 100644
--- a/backend/src/libocl/src/ocl_sync.cl
+++ b/src/cl_device_enqueue.h
@@ -1,5 +1,5 @@
- * Copyright © 2012 - 2014 Intel Corporation
+ * Copyright © 2012 Intel Corporation
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -14,19 +14,18 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ * Author: Rong Yang<rong.r.yang at intel.com>
-#include "ocl_sync.h"
-void __gen_ocl_barrier_local(void);
-void __gen_ocl_barrier_global(void);
-void __gen_ocl_barrier_local_and_global(void);
-void __gen_ocl_debugwait(void);
+#ifndef __CL_DEVICE_ENQUEUE_H__
+#define __CL_DEVICE_ENQUEUE_H__
-OVERLOADABLE void mem_fence(cl_mem_fence_flags flags) {
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "CL/cl.h"
+#include <stdint.h>
-OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags) {
-OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags) {
+extern cl_int cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker,
+                                                     uint32_t *max_bti, cl_gpgpu_kernel *kernel);
+extern cl_int cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu);
+#endif /* __CL_DEVICE_ENQUEUE_H__ */
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index ded2f1e..31f8616 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -24,7 +24,6 @@
 #include "cl_driver.h"
 #include "cl_device_data.h"
 #include "cl_khr_icd.h"
-#include "cl_thread.h"
 #include "CL/cl.h"
 #include "CL/cl_ext.h"
 #include "CL/cl_intel.h"
@@ -42,7 +41,6 @@
 static struct _cl_device_id intel_ivb_gt2_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 16,
   .max_thread_per_unit = 8,
   .sub_slice_count = 2,
@@ -53,7 +51,6 @@ static struct _cl_device_id intel_ivb_gt2_device = {
 static struct _cl_device_id intel_ivb_gt1_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 6,
   .max_thread_per_unit = 6,
   .sub_slice_count = 1,
@@ -64,7 +61,6 @@ static struct _cl_device_id intel_ivb_gt1_device = {
 static struct _cl_device_id intel_baytrail_t_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 4,
   .max_thread_per_unit = 8,
   .sub_slice_count = 1,
@@ -76,7 +72,6 @@ static struct _cl_device_id intel_baytrail_t_device = {
 /* XXX we clone IVB for HSW now */
 static struct _cl_device_id intel_hsw_gt1_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 10,
   .max_thread_per_unit = 7,
   .sub_slice_count = 1,
@@ -87,7 +82,6 @@ static struct _cl_device_id intel_hsw_gt1_device = {
 static struct _cl_device_id intel_hsw_gt2_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 20,
   .max_thread_per_unit = 7,
   .sub_slice_count = 2,
@@ -98,7 +92,6 @@ static struct _cl_device_id intel_hsw_gt2_device = {
 static struct _cl_device_id intel_hsw_gt3_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 40,
   .max_thread_per_unit = 7,
   .sub_slice_count = 4,
@@ -110,7 +103,6 @@ static struct _cl_device_id intel_hsw_gt3_device = {
 /* XXX we clone IVB for HSW now */
 static struct _cl_device_id intel_brw_gt1_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 12,
   .max_thread_per_unit = 7,
   .sub_slice_count = 2,
@@ -121,7 +113,6 @@ static struct _cl_device_id intel_brw_gt1_device = {
 static struct _cl_device_id intel_brw_gt2_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 24,
   .max_thread_per_unit = 7,
   .sub_slice_count = 3,
@@ -132,7 +123,6 @@ static struct _cl_device_id intel_brw_gt2_device = {
 static struct _cl_device_id intel_brw_gt3_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 48,
   .max_thread_per_unit = 7,
   .sub_slice_count = 6,
@@ -144,7 +134,6 @@ static struct _cl_device_id intel_brw_gt3_device = {
 //Cherryview has the same pciid, must get the max_compute_unit and max_thread_per_unit from drm
 static struct _cl_device_id intel_chv_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 8,
   .max_thread_per_unit = 7,
   .sub_slice_count = 2,
@@ -156,7 +145,6 @@ static struct _cl_device_id intel_chv_device = {
 /* XXX we clone brw now */
 static struct _cl_device_id intel_skl_gt1_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 6,
   .max_thread_per_unit = 7,
   .sub_slice_count = 2,
@@ -167,7 +155,6 @@ static struct _cl_device_id intel_skl_gt1_device = {
 static struct _cl_device_id intel_skl_gt2_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 24,
   .max_thread_per_unit = 7,
   .sub_slice_count = 3,
@@ -178,7 +165,6 @@ static struct _cl_device_id intel_skl_gt2_device = {
 static struct _cl_device_id intel_skl_gt3_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 48,
   .max_thread_per_unit = 7,
   .sub_slice_count = 6,
@@ -189,7 +175,6 @@ static struct _cl_device_id intel_skl_gt3_device = {
 static struct _cl_device_id intel_skl_gt4_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 72,
   .max_thread_per_unit = 7,
   .sub_slice_count = 9,
@@ -200,7 +185,6 @@ static struct _cl_device_id intel_skl_gt4_device = {
 static struct _cl_device_id intel_bxt18eu_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 18,
   .max_thread_per_unit = 6,
   .sub_slice_count = 3,
@@ -221,7 +205,6 @@ static struct _cl_device_id intel_bxt12eu_device = {
 static struct _cl_device_id intel_kbl_gt1_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 12,
   .max_thread_per_unit = 7,
   .sub_slice_count = 2,
@@ -232,7 +215,6 @@ static struct _cl_device_id intel_kbl_gt1_device = {
 static struct _cl_device_id intel_kbl_gt15_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 18,
   .max_thread_per_unit = 7,
   .sub_slice_count = 3,
@@ -243,7 +225,6 @@ static struct _cl_device_id intel_kbl_gt15_device = {
 static struct _cl_device_id intel_kbl_gt2_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 24,
   .max_thread_per_unit = 7,
   .sub_slice_count = 3,
@@ -254,7 +235,6 @@ static struct _cl_device_id intel_kbl_gt2_device = {
 static struct _cl_device_id intel_kbl_gt3_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 48,
   .max_thread_per_unit = 7,
   .sub_slice_count = 6,
@@ -265,7 +245,6 @@ static struct _cl_device_id intel_kbl_gt3_device = {
 static struct _cl_device_id intel_kbl_gt4_device = {
-  INIT_ICD(dispatch)
   .max_compute_unit = 72,
   .max_thread_per_unit = 7,
   .sub_slice_count = 9,
@@ -774,6 +753,7 @@ kbl_gt4_break:
   if (ret == NULL)
     return NULL;
   if (!CompilerSupported()) {
     ret->compiler_available = CL_FALSE;
     //ret->linker_available = CL_FALSE;
@@ -840,7 +820,7 @@ cl_self_test(cl_device_id device, cl_self_test_res atomic_in_l3_flag)
     return ret;
   cl_driver_set_atomic_flag(ctx->drv, atomic_in_l3_flag);
   if (status == CL_SUCCESS) {
-    queue = clCreateCommandQueue(ctx, device, 0, &status);
+    queue = clCreateCommandQueueWithProperties(ctx, device, 0, &status);
     if (status == CL_SUCCESS) {
       program = clCreateProgramWithSource(ctx, 1, &kernel_source, NULL, &status);
       if (status == CL_SUCCESS) {
@@ -937,30 +917,6 @@ cl_get_device_ids(cl_platform_id    platform,
-#define DECL_FIELD(CASE,FIELD)                                      \
-  case JOIN(CL_DEVICE_,CASE):                                       \
-    if (param_value_size_ret) {                                     \
-      *param_value_size_ret = sizeof device->FIELD;                 \
-      if (!param_value)                                             \
-        return CL_SUCCESS;                                          \
-    }                                                               \
-    if (param_value_size < sizeof device->FIELD)                    \
-      return CL_INVALID_VALUE;                                      \
-    memcpy(param_value, &device->FIELD, sizeof device->FIELD);      \
-    return CL_SUCCESS;
-#define DECL_STRING_FIELD(CASE,FIELD)                               \
-  case JOIN(CL_DEVICE_,CASE):                                       \
-    if (param_value_size_ret) {                                     \
-      *param_value_size_ret = device->JOIN(FIELD,_sz);              \
-      if (!param_value)                                             \
-        return CL_SUCCESS;                                          \
-    }                                                               \
-    if (param_value_size < device->JOIN(FIELD,_sz))                 \
-      return CL_INVALID_VALUE;                                      \
-    memcpy(param_value, device->FIELD, device->JOIN(FIELD,_sz));    \
-    return CL_SUCCESS;
 LOCAL cl_bool is_gen_device(cl_device_id device) {
   return device == &intel_ivb_gt1_device ||
          device == &intel_ivb_gt2_device ||
@@ -992,101 +948,394 @@ cl_get_device_info(cl_device_id     device,
                    void *           param_value,
                    size_t *         param_value_size_ret)
+  const void *src_ptr = NULL;
+  size_t src_size = 0;
+  cl_int dev_ref;
+  // Only Gen devices are supported for now.
   if (UNLIKELY(is_gen_device(device) == CL_FALSE))
     return CL_INVALID_DEVICE;
   /* Find the correct parameter */
   switch (param_name) {
-    DECL_FIELD(TYPE, device_type)
-    DECL_FIELD(VENDOR_ID, vendor_id)
-    DECL_FIELD(MAX_COMPUTE_UNITS, max_compute_unit)
-    DECL_FIELD(MAX_WORK_ITEM_DIMENSIONS, max_work_item_dimensions)
-    DECL_FIELD(MAX_WORK_ITEM_SIZES, max_work_item_sizes)
-    DECL_FIELD(MAX_WORK_GROUP_SIZE, max_work_group_size)
-    DECL_FIELD(PREFERRED_VECTOR_WIDTH_CHAR, preferred_vector_width_char)
-    DECL_FIELD(PREFERRED_VECTOR_WIDTH_SHORT, preferred_vector_width_short)
-    DECL_FIELD(PREFERRED_VECTOR_WIDTH_INT, preferred_vector_width_int)
-    DECL_FIELD(PREFERRED_VECTOR_WIDTH_LONG, preferred_vector_width_long)
-    DECL_FIELD(PREFERRED_VECTOR_WIDTH_FLOAT, preferred_vector_width_float)
-    DECL_FIELD(PREFERRED_VECTOR_WIDTH_DOUBLE, preferred_vector_width_double)
-    DECL_FIELD(PREFERRED_VECTOR_WIDTH_HALF, preferred_vector_width_half)
-    DECL_FIELD(NATIVE_VECTOR_WIDTH_CHAR, native_vector_width_char)
-    DECL_FIELD(NATIVE_VECTOR_WIDTH_SHORT, native_vector_width_short)
-    DECL_FIELD(NATIVE_VECTOR_WIDTH_INT, native_vector_width_int)
-    DECL_FIELD(NATIVE_VECTOR_WIDTH_LONG, native_vector_width_long)
-    DECL_FIELD(NATIVE_VECTOR_WIDTH_FLOAT, native_vector_width_float)
-    DECL_FIELD(NATIVE_VECTOR_WIDTH_DOUBLE, native_vector_width_double)
-    DECL_FIELD(NATIVE_VECTOR_WIDTH_HALF, native_vector_width_half)
-    DECL_FIELD(MAX_CLOCK_FREQUENCY, max_clock_frequency)
-    DECL_FIELD(ADDRESS_BITS, address_bits)
-    DECL_FIELD(MAX_MEM_ALLOC_SIZE, max_mem_alloc_size)
-    DECL_FIELD(IMAGE_SUPPORT, image_support)
-    DECL_FIELD(MAX_READ_IMAGE_ARGS, max_read_image_args)
-    DECL_FIELD(MAX_WRITE_IMAGE_ARGS, max_write_image_args)
-    DECL_FIELD(IMAGE_MAX_ARRAY_SIZE, image_max_array_size)
-    DECL_FIELD(IMAGE2D_MAX_WIDTH, image2d_max_width)
-    DECL_FIELD(IMAGE2D_MAX_HEIGHT, image2d_max_height)
-    DECL_FIELD(IMAGE3D_MAX_WIDTH, image3d_max_width)
-    DECL_FIELD(IMAGE3D_MAX_HEIGHT, image3d_max_height)
-    DECL_FIELD(IMAGE3D_MAX_DEPTH, image3d_max_depth)
-    DECL_FIELD(MAX_SAMPLERS, max_samplers)
-    DECL_FIELD(MAX_PARAMETER_SIZE, max_parameter_size)
-    DECL_FIELD(MEM_BASE_ADDR_ALIGN, mem_base_addr_align)
-    DECL_FIELD(MIN_DATA_TYPE_ALIGN_SIZE, min_data_type_align_size)
-    DECL_FIELD(SINGLE_FP_CONFIG, single_fp_config)
-    DECL_FIELD(HALF_FP_CONFIG, half_fp_config)
-    DECL_FIELD(DOUBLE_FP_CONFIG, double_fp_config)
-    DECL_FIELD(GLOBAL_MEM_CACHE_TYPE, global_mem_cache_type)
-    DECL_FIELD(GLOBAL_MEM_CACHELINE_SIZE, global_mem_cache_line_size)
-    DECL_FIELD(GLOBAL_MEM_CACHE_SIZE, global_mem_cache_size)
-    DECL_FIELD(GLOBAL_MEM_SIZE, global_mem_size)
-    DECL_FIELD(MAX_CONSTANT_BUFFER_SIZE, max_constant_buffer_size)
-    DECL_FIELD(IMAGE_MAX_BUFFER_SIZE, image_mem_size)
-    DECL_FIELD(MAX_CONSTANT_ARGS, max_constant_args)
-    DECL_FIELD(LOCAL_MEM_TYPE, local_mem_type)
-    DECL_FIELD(LOCAL_MEM_SIZE, local_mem_size)
-    DECL_FIELD(ERROR_CORRECTION_SUPPORT, error_correction_support)
-    DECL_FIELD(HOST_UNIFIED_MEMORY, host_unified_memory)
-    DECL_FIELD(PROFILING_TIMER_RESOLUTION, profiling_timer_resolution)
-    DECL_FIELD(ENDIAN_LITTLE, endian_little)
-    DECL_FIELD(AVAILABLE, available)
-    DECL_FIELD(COMPILER_AVAILABLE, compiler_available)
-    DECL_FIELD(LINKER_AVAILABLE, linker_available)
-    DECL_FIELD(EXECUTION_CAPABILITIES, execution_capabilities)
-    DECL_FIELD(QUEUE_PROPERTIES, queue_properties)
-    DECL_FIELD(PLATFORM, platform)
-    DECL_FIELD(PRINTF_BUFFER_SIZE, printf_buffer_size)
-    DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version)
-    DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels)
-    DECL_FIELD(PARENT_DEVICE, parent_device)
-    DECL_FIELD(PARTITION_MAX_SUB_DEVICES, partition_max_sub_device)
-    DECL_FIELD(PARTITION_PROPERTIES, partition_property)
-    DECL_FIELD(PARTITION_TYPE, partition_type)
-    DECL_FIELD(REFERENCE_COUNT, device_reference_count)
-    DECL_FIELD(IMAGE_PITCH_ALIGNMENT, image_pitch_alignment)
-    DECL_FIELD(IMAGE_BASE_ADDRESS_ALIGNMENT, image_base_address_alignment)
-      if (param_value_size_ret) {
-        *param_value_size_ret = device->driver_version_sz;
-        if (!param_value)
-          return CL_SUCCESS;
+    case CL_DEVICE_TYPE:
+      src_ptr = &device->device_type;
+      src_size = sizeof(device->device_type);
+      break;
+      src_ptr = &device->vendor_id;
+      src_size = sizeof(device->vendor_id);
+      break;
+      src_ptr = &device->max_compute_unit;
+      src_size = sizeof(device->max_compute_unit);
+      break;
+      src_ptr = &device->max_work_item_dimensions;
+      src_size = sizeof(device->max_work_item_dimensions);
+      break;
+      src_ptr = &device->max_work_item_sizes;
+      src_size = sizeof(device->max_work_item_sizes);
+      break;
+      src_ptr = &device->max_work_group_size;
+      src_size = sizeof(device->max_work_group_size);
+      break;
+      src_ptr = &device->preferred_vector_width_char;
+      src_size = sizeof(device->preferred_vector_width_char);
+      break;
+      src_ptr = &device->preferred_vector_width_short;
+      src_size = sizeof(device->preferred_vector_width_short);
+      break;
+      src_ptr = &device->preferred_vector_width_int;
+      src_size = sizeof(device->preferred_vector_width_int);
+      break;
+      src_ptr = &device->preferred_vector_width_long;
+      src_size = sizeof(device->preferred_vector_width_long);
+      break;
+      src_ptr = &device->preferred_vector_width_float;
+      src_size = sizeof(device->preferred_vector_width_float);
+      break;
+      src_ptr = &device->preferred_vector_width_double;
+      src_size = sizeof(device->preferred_vector_width_double);
+      break;
+      src_ptr = &device->preferred_vector_width_half;
+      src_size = sizeof(device->preferred_vector_width_half);
+      break;
+      src_ptr = &device->native_vector_width_char;
+      src_size = sizeof(device->native_vector_width_char);
+      break;
+      src_ptr = &device->native_vector_width_short;
+      src_size = sizeof(device->native_vector_width_short);
+      break;
+      src_ptr = &device->native_vector_width_int;
+      src_size = sizeof(device->native_vector_width_int);
+      break;
+      src_ptr = &device->native_vector_width_long;
+      src_size = sizeof(device->native_vector_width_long);
+      break;
+      src_ptr = &device->native_vector_width_float;
+      src_size = sizeof(device->native_vector_width_float);
+      break;
+      src_ptr = &device->native_vector_width_double;
+      src_size = sizeof(device->native_vector_width_double);
+      break;
+      src_ptr = &device->native_vector_width_half;
+      src_size = sizeof(device->native_vector_width_half);
+      break;
+      src_ptr = &device->max_clock_frequency;
+      src_size = sizeof(device->max_clock_frequency);
+      break;
+      src_ptr = &device->address_bits;
+      src_size = sizeof(device->address_bits);
+      break;
+      src_ptr = &device->max_mem_alloc_size;
+      src_size = sizeof(device->max_mem_alloc_size);
+      break;
+      src_ptr = &device->image_support;
+      src_size = sizeof(device->image_support);
+      break;
+      src_ptr = &device->max_read_image_args;
+      src_size = sizeof(device->max_read_image_args);
+      break;
+      src_ptr = &device->max_write_image_args;
+      src_size = sizeof(device->max_write_image_args);
+      break;
+      src_ptr = &device->max_read_write_image_args;
+      src_size = sizeof(device->max_read_write_image_args);
+      break;
+      src_ptr = &device->image_max_array_size;
+      src_size = sizeof(device->image_max_array_size);
+      break;
+      src_ptr = &device->image2d_max_width;
+      src_size = sizeof(device->image2d_max_width);
+      break;
+      src_ptr = &device->image2d_max_height;
+      src_size = sizeof(device->image2d_max_height);
+      break;
+      src_ptr = &device->image3d_max_width;
+      src_size = sizeof(device->image3d_max_width);
+      break;
+      src_ptr = &device->image3d_max_height;
+      src_size = sizeof(device->image3d_max_height);
+      break;
+      src_ptr = &device->image3d_max_depth;
+      src_size = sizeof(device->image3d_max_depth);
+      break;
+      src_ptr = &device->max_samplers;
+      src_size = sizeof(device->max_samplers);
+      break;
+      src_ptr = &device->max_parameter_size;
+      src_size = sizeof(device->max_parameter_size);
+      break;
+      src_ptr = &device->mem_base_addr_align;
+      src_size = sizeof(device->mem_base_addr_align);
+      break;
+      src_ptr = &device->min_data_type_align_size;
+      src_size = sizeof(device->min_data_type_align_size);
+      break;
+      src_ptr = &device->max_pipe_args;
+      src_size = sizeof(device->max_pipe_args);
+      break;
+      src_ptr = &device->pipe_max_active_reservations;
+      src_size = sizeof(device->pipe_max_active_reservations);
+      break;
+      src_ptr = &device->pipe_max_packet_siz;
+      src_size = sizeof(device->pipe_max_packet_siz);
+      break;
+      src_ptr = &device->single_fp_config;
+      src_size = sizeof(device->single_fp_config);
+      break;
+      src_ptr = &device->half_fp_config;
+      src_size = sizeof(device->half_fp_config);
+      break;
+      src_ptr = &device->double_fp_config;
+      src_size = sizeof(device->double_fp_config);
+      break;
+      src_ptr = &device->global_mem_cache_type;
+      src_size = sizeof(device->global_mem_cache_type);
+      break;
+      src_ptr = &device->global_mem_cache_line_size;
+      src_size = sizeof(device->global_mem_cache_line_size);
+      break;
+      src_ptr = &device->global_mem_cache_size;
+      src_size = sizeof(device->global_mem_cache_size);
+      break;
+      src_ptr = &device->global_mem_size;
+      src_size = sizeof(device->global_mem_size);
+      break;
+      src_ptr = &device->max_constant_buffer_size;
+      src_size = sizeof(device->max_constant_buffer_size);
+      break;
+      src_ptr = &device->image_mem_size;
+      src_size = sizeof(device->image_mem_size);
+      break;
+      src_ptr = &device->max_constant_args;
+      src_size = sizeof(device->max_constant_args);
+      break;
+      src_ptr = &device->max_global_variable_size;
+      src_size = sizeof(device->max_global_variable_size);
+      break;
+      src_ptr = &device->global_variable_preferred_total_size;
+      src_size = sizeof(device->global_variable_preferred_total_size);
+      break;
+      src_ptr = &device->local_mem_type;
+      src_size = sizeof(device->local_mem_type);
+      break;
+      src_ptr = &device->local_mem_size;
+      src_size = sizeof(device->local_mem_size);
+      break;
+      src_ptr = &device->error_correction_support;
+      src_size = sizeof(device->error_correction_support);
+      break;
+      src_ptr = &device->host_unified_memory;
+      src_size = sizeof(device->host_unified_memory);
+      break;
+      src_ptr = &device->profiling_timer_resolution;
+      src_size = sizeof(device->profiling_timer_resolution);
+      break;
+      src_ptr = &device->endian_little;
+      src_size = sizeof(device->endian_little);
+      break;
+      src_ptr = &device->available;
+      src_size = sizeof(device->available);
+      break;
+      src_ptr = &device->compiler_available;
+      src_size = sizeof(device->compiler_available);
+      break;
+      src_ptr = &device->linker_available;
+      src_size = sizeof(device->linker_available);
+      break;
+      src_ptr = &device->execution_capabilities;
+      src_size = sizeof(device->execution_capabilities);
+      break;
+      src_ptr = &device->queue_properties;
+      src_size = sizeof(device->queue_properties);
+      break;
+      src_ptr = &device->queue_on_device_properties;
+      src_size = sizeof(device->queue_on_device_properties);
+      break;
+      src_ptr = &device->queue_on_device_preferred_size;
+      src_size = sizeof(device->queue_on_device_preferred_size);
+      break;
+      src_ptr = &device->queue_on_device_max_size;
+      src_size = sizeof(device->queue_on_device_max_size);
+      break;
+      src_ptr = &device->max_on_device_queues;
+      src_size = sizeof(device->max_on_device_queues);
+      break;
+      src_ptr = &device->max_on_device_events;
+      src_size = sizeof(device->max_on_device_events);
+      break;
+      src_ptr = &device->platform;
+      src_size = sizeof(device->platform);
+      break;
+      src_ptr = &device->printf_buffer_size;
+      src_size = sizeof(device->printf_buffer_size);
+      break;
+      src_ptr = &device->interop_user_sync;
+      src_size = sizeof(device->interop_user_sync);
+      break;
+    case CL_DEVICE_NAME:
+      src_ptr = device->name;
+      src_size = device->name_sz;
+      break;
+      src_ptr = device->vendor;
+      src_size = device->vendor_sz;
+      break;
+      src_ptr = device->version;
+      src_size = device->version_sz;
+      break;
+      src_ptr = device->profile;
+      src_size = device->profile_sz;
+      break;
+      src_ptr = device->opencl_c_version;
+      src_size = device->opencl_c_version_sz;
+      break;
+      src_ptr = device->spir_versions;
+      src_size = device->spir_versions_sz;
+      break;
+      src_ptr = device->extensions;
+      src_size = device->extensions_sz;
+      break;
+      src_ptr = device->built_in_kernels;
+      src_size = device->built_in_kernels_sz;
+      break;
+      src_ptr = &device->parent_device;
+      src_size = sizeof(device->parent_device);
+      break;
+      src_ptr = &device->partition_max_sub_device;
+      src_size = sizeof(device->partition_max_sub_device);
+      break;
+      src_ptr = &device->partition_property;
+      src_size = sizeof(device->partition_property);
+      break;
+      src_ptr = &device->affinity_domain;
+      src_size = sizeof(device->affinity_domain);
+      break;
+      src_ptr = &device->partition_type;
+      src_size = sizeof(device->partition_type);
+      break;
+      src_ptr = &device->preferred_platform_atomic_alignment;
+      src_size = sizeof(device->preferred_platform_atomic_alignment);
+      break;
+      src_ptr = &device->preferred_global_atomic_alignment;
+      src_size = sizeof(device->preferred_global_atomic_alignment);
+      break;
+      src_ptr = &device->preferred_local_atomic_alignment;
+      src_size = sizeof(device->preferred_local_atomic_alignment);
+      break;
+      src_ptr = &device->image_pitch_alignment;
+      src_size = sizeof(device->image_pitch_alignment);
+      break;
+      src_ptr = &device->image_base_address_alignment;
+      src_size = sizeof(device->image_base_address_alignment);
+      break;
+      src_ptr = &device->svm_capabilities;
+      src_size = sizeof(device->svm_capabilities);
+      break;
+      {
+        dev_ref = CL_OBJECT_GET_REF(device);
+        src_ptr = &dev_ref;
+        src_size = sizeof(cl_int);
+        break;
-      if (param_value_size < device->driver_version_sz)
-        return CL_INVALID_VALUE;
-      memcpy(param_value, device->driver_version, device->driver_version_sz);
-      return CL_SUCCESS;
+      src_ptr = device->driver_version;
+      src_size = device->driver_version_sz;
+      break;
-    default: return CL_INVALID_VALUE;
-  };
+    default:
+      return CL_INVALID_VALUE;
+  }
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
 LOCAL cl_int
@@ -1140,7 +1389,7 @@ cl_check_builtin_kernel_dimension(cl_kernel kernel, cl_device_id device)
   const char * n = cl_kernel_get_name(kernel);
   const char * builtin_kernels_2d = "__cl_copy_image_2d_to_2d;__cl_copy_image_2d_to_buffer;__cl_copy_buffer_to_image_2d;__cl_fill_image_2d;__cl_fill_image_2d_array;";
   const char * builtin_kernels_3d = "__cl_copy_image_3d_to_2d;__cl_copy_image_2d_to_3d;__cl_copy_image_3d_to_3d;__cl_copy_image_3d_to_buffer;__cl_copy_buffer_to_image_3d;__cl_fill_image_3d";
-    if (!strstr(device->built_in_kernels, n)){
+    if (n == NULL || !strstr(device->built_in_kernels, n)){
       return 0;
     }else if(strstr(builtin_kernels_2d, n)){
       return 2;
@@ -1156,22 +1405,22 @@ cl_get_kernel_max_wg_sz(cl_kernel kernel)
   size_t work_group_size, thread_cnt;
   int simd_width = interp_kernel_get_simd_width(kernel->opaque);
-  int device_id = kernel->program->ctx->device->device_id;
+  int device_id = kernel->program->ctx->devices[0]->device_id;
   if (!interp_kernel_use_slm(kernel->opaque)) {
     if (!IS_BAYTRAIL_T(device_id) || simd_width == 16)
       work_group_size = simd_width * 64;
-      work_group_size = kernel->program->ctx->device->max_compute_unit *
-                        kernel->program->ctx->device->max_thread_per_unit * simd_width;
+      work_group_size = kernel->program->ctx->devices[0]->max_compute_unit *
+                        kernel->program->ctx->devices[0]->max_thread_per_unit * simd_width;
   } else {
-    thread_cnt = kernel->program->ctx->device->max_compute_unit *
-                 kernel->program->ctx->device->max_thread_per_unit / kernel->program->ctx->device->sub_slice_count;
+    thread_cnt = kernel->program->ctx->devices[0]->max_compute_unit *
+                 kernel->program->ctx->devices[0]->max_thread_per_unit / kernel->program->ctx->devices[0]->sub_slice_count;
     if(thread_cnt > 64)
       thread_cnt = 64;
     work_group_size = thread_cnt * simd_width;
-  if(work_group_size > kernel->program->ctx->device->max_work_group_size)
-    work_group_size = kernel->program->ctx->device->max_work_group_size;
+  if(work_group_size > kernel->program->ctx->devices[0]->max_work_group_size)
+    work_group_size = kernel->program->ctx->devices[0]->max_work_group_size;
   return work_group_size;
@@ -1187,7 +1436,7 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
   int dimension = 0;
   if (device == NULL)
-    device = kernel->program->ctx->device;
+    device = kernel->program->ctx->devices[0];
   if (UNLIKELY(is_gen_device(device) == CL_FALSE))
     return CL_INVALID_DEVICE;
@@ -1259,7 +1508,7 @@ cl_get_kernel_subgroup_info(cl_kernel kernel,
   int err = CL_SUCCESS;
   if(device != NULL)
-    if (kernel->program->ctx->device != device)
+    if (kernel->program->ctx->devices[0] != device)
       return CL_INVALID_DEVICE;
@@ -1329,3 +1578,54 @@ cl_get_kernel_subgroup_info(cl_kernel kernel,
   return err;
+LOCAL cl_int
+cl_devices_list_check(cl_uint num_devices, const cl_device_id *devices)
+  cl_uint i;
+  if (devices == NULL)
+    return CL_INVALID_DEVICE;
+  assert(num_devices > 0);
+  for (i = 0; i < num_devices; i++) {
+    if (!CL_OBJECT_IS_DEVICE(devices[i])) {
+      return CL_INVALID_DEVICE;
+    }
+    if (devices[i]->available == CL_FALSE) {
+    }
+    // We now just support one platform.
+    if (devices[i]->platform != cl_get_platform_default()) {
+      return CL_INVALID_DEVICE;
+    }
+    // TODO: Only the Gen device is supported for now.
+    if (devices[i] != cl_get_gt_device()) {
+      return CL_INVALID_DEVICE;
+    }
+  }
+  return CL_SUCCESS;
+LOCAL cl_int
+cl_devices_list_include_check(cl_uint num_devices, const cl_device_id *devices,
+                              cl_uint num_to_check, const cl_device_id *devices_to_check)
+  cl_uint i, j;
+  for (i = 0; i < num_to_check; i++) {
+    for (j = 0; j < num_devices; j++) {
+      if (devices_to_check[i] == devices[j])
+        break;
+    }
+    if (j == num_devices)
+      return CL_INVALID_DEVICE;
+  }
+  return CL_SUCCESS;
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 7db125b..9d8b512 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -22,10 +22,10 @@
-#include "cl_khr_icd.h"
+#include "cl_base_object.h"
 /* Store complete information about the device */
 struct _cl_device_id {
-  DEFINE_ICD(dispatch)
+  _cl_base_object base;
   cl_device_type device_type;
   cl_uint  device_id;
   cl_uint  vendor_id;
@@ -55,9 +55,14 @@ struct _cl_device_id {
   cl_uint  max_clock_frequency;
   cl_uint  address_bits;
   cl_ulong max_mem_alloc_size;
+  cl_device_svm_capabilities  svm_capabilities;
+  cl_uint preferred_platform_atomic_alignment;
+  cl_uint preferred_global_atomic_alignment;
+  cl_uint preferred_local_atomic_alignment;
   cl_bool  image_support;
   cl_uint  max_read_image_args;
   cl_uint  max_write_image_args;
+  cl_uint  max_read_write_image_args;
   size_t   image2d_max_width;
   size_t   image_max_array_size;
   size_t   image2d_max_height;
@@ -69,6 +74,9 @@ struct _cl_device_id {
   size_t   max_parameter_size;
   cl_uint  mem_base_addr_align;
   cl_uint  min_data_type_align_size;
+  cl_uint  max_pipe_args;
+  cl_uint  pipe_max_active_reservations;
+  cl_uint  pipe_max_packet_siz;
   cl_device_fp_config single_fp_config;
   cl_device_fp_config half_fp_config;
   cl_device_fp_config double_fp_config;
@@ -78,6 +86,8 @@ struct _cl_device_id {
   cl_ulong global_mem_size;
   cl_ulong max_constant_buffer_size;
   cl_uint  max_constant_args;
+  size_t  max_global_variable_size;
+  size_t  global_variable_preferred_total_size;
   cl_device_local_mem_type local_mem_type;
   cl_ulong local_mem_size;
   cl_ulong scratch_mem_size;
@@ -90,6 +100,12 @@ struct _cl_device_id {
   cl_bool  linker_available;
   cl_device_exec_capabilities execution_capabilities;
   cl_command_queue_properties queue_properties;
+  cl_command_queue_properties queue_on_host_properties;
+  cl_command_queue_properties queue_on_device_properties;
+  cl_uint queue_on_device_preferred_size;
+  cl_uint queue_on_device_max_size;
+  cl_uint max_on_device_queues;
+  cl_uint max_on_device_events;
   cl_platform_id platform;
   size_t printf_buffer_size;
   cl_bool interop_user_sync;
@@ -117,15 +133,19 @@ struct _cl_device_id {
   cl_device_partition_property partition_property[3];
   cl_device_affinity_domain    affinity_domain;
   cl_device_partition_property partition_type[3];
-  cl_uint      device_reference_count;
   uint32_t atomic_test_result;
-  uint32_t image_pitch_alignment;
-  uint32_t image_base_address_alignment;
+  cl_uint image_pitch_alignment;
+  cl_uint image_base_address_alignment;
   //inited as NULL, created only when cmrt kernel is used
   void* cmrt_device;  //realtype: CmDevice*
+#define CL_OBJECT_DEVICE_MAGIC 0x2acaddcca8853c52LL
+#define CL_OBJECT_IS_DEVICE(obj) ((obj &&                           \
+         ((cl_base_object)obj)->magic == CL_OBJECT_DEVICE_MAGIC &&  \
+         CL_OBJECT_GET_REF(obj) >= 1))
 /* Get a device from the given platform */
 extern cl_int cl_get_device_ids(cl_platform_id    platform,
                                 cl_device_type    device_type,
@@ -162,5 +182,9 @@ extern cl_int cl_get_kernel_subgroup_info(cl_kernel kernel,
 extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
 extern size_t cl_get_kernel_max_wg_sz(cl_kernel);
+extern cl_int cl_devices_list_check(cl_uint num_devices, const cl_device_id *devices);
+extern cl_int cl_devices_list_include_check(cl_uint num_devices, const cl_device_id *devices,
+                                        cl_uint num_to_check, const cl_device_id *devices_to_check);
 #endif /* __CL_DEVICE_ID_H__ */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 584be9d..07c5f7f 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -123,7 +123,7 @@ typedef enum gpu_command_status {
 typedef struct cl_gpgpu_kernel {
   const char *name;        /* kernel name and bo name */
   uint32_t grf_blocks;     /* register blocks kernel wants (in 8 reg blocks) */
-  uint32_t curbe_sz;         /* total size of all curbes */
+  uint32_t curbe_sz;       /* total size of all curbes */
   cl_buffer bo;            /* kernel code in the proper addr space */
   int32_t barrierID;       /* barrierID for _this_ kernel */
   uint32_t use_slm:1;      /* For gen7 (automatic barrier management) */
@@ -147,6 +147,12 @@ extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
 typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, size_t size, uint8_t bti);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
+typedef void (cl_gpgpu_set_kernel_cb)(cl_gpgpu, void *);
+extern cl_gpgpu_set_kernel_cb *cl_gpgpu_set_kernel;
+typedef void* (cl_gpgpu_get_kernel_cb)(cl_gpgpu);
+extern cl_gpgpu_get_kernel_cb *cl_gpgpu_get_kernel;
 /* bind samplers defined in both kernel and kernel args. */
 typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
 extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
@@ -262,11 +268,11 @@ typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
 extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
 /* Get a event time stamp */
-typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, cl_gpgpu_event, int, uint64_t*);
+typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, int, uint64_t*);
 extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
 /* Get current GPU time stamp */
-typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*);
+typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_driver, uint64_t*);
 extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp;
 /* Get current batch buffer handle */
@@ -326,10 +332,10 @@ typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
                                   uint32_t simd_sz,
                                   uint32_t thread_n,
                                   const size_t global_wk_off[3],
+                                  const size_t global_dim_off[3],
                                   const size_t global_wk_sz[3],
                                   const size_t local_wk_sz[3]);
 extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
  * Buffer
@@ -340,8 +346,17 @@ extern cl_buffer_alloc_cb *cl_buffer_alloc;
 typedef cl_buffer (cl_buffer_alloc_userptr_cb)(cl_buffer_mgr, const char*, void *, size_t, unsigned long);
 extern cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr;
+typedef int (cl_buffer_set_softpin_offset_cb)(cl_buffer, uint64_t);
+extern cl_buffer_set_softpin_offset_cb *cl_buffer_set_softpin_offset;
+typedef int (cl_buffer_set_bo_use_full_range_cb)(cl_buffer, uint32_t);
+extern cl_buffer_set_bo_use_full_range_cb *cl_buffer_set_bo_use_full_range;
+typedef int (cl_buffer_disable_reuse_cb)(cl_buffer);
+extern cl_buffer_disable_reuse_cb *cl_buffer_disable_reuse;
 /* Set a buffer's tiling mode */
-typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
+typedef int (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
 extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
 #include "cl_context.h"
@@ -351,7 +366,7 @@ typedef cl_buffer (cl_buffer_alloc_from_texture_cb)(cl_context, unsigned int, in
                                                     struct _cl_mem_image *gl_image);
 extern cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture;
-typedef void (cl_buffer_release_from_texture_cb)(cl_context, unsigned int, int, unsigned int);
+typedef void (cl_buffer_release_from_texture_cb)(cl_context, struct _cl_mem_gl_image *);
 extern cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture;
 typedef cl_buffer (cl_buffer_get_buffer_from_libva_cb)(cl_context ctx, unsigned int bo_name, size_t *sz);
@@ -436,35 +451,5 @@ extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
 typedef void (cl_driver_update_device_info_cb)(cl_device_id device);
 extern cl_driver_update_device_info_cb *cl_driver_update_device_info;
- * cl_khr_gl_sharing.
- **************************************************************************/
-typedef int (cl_gl_acquire_texture_cb)(void *driver, void *ctx, int target,
-                                       int level, int texture, void*user_data);
-extern cl_gl_acquire_texture_cb *cl_gl_acquire_texture;
-typedef int (cl_gl_release_texture_cb)(void *driver, void *ctx, int target,
-                                       int level, int texture);
-extern cl_gl_release_texture_cb *cl_gl_release_texture;
-typedef int (cl_gl_acquire_buffer_object_cb)(void *driver, void *ctx,
-                                             int bufobj, void* user_data);
-extern cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object;
-typedef int (cl_gl_release_buffer_object_cb)(void *driver, void *ctx, int bufobj);
-extern cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object;
-typedef int (cl_gl_acquire_render_buffer_cb)(void *driver, void *ctx,
-                                             int rb, void* user_data);
-extern cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer;
-typedef int (cl_gl_release_render_buffer_cb)(void *driver, void *ctx, int rb);
-extern cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer;
-/* this is normally defined in Mesa/configs/default with DRI_DRIVER_SEARCH_PATH */
-#define DEFAULT_DRIVER_DIR "/usr/local/lib/dri"
 #endif /* __CL_DRIVER_H__ */
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index ea4e90a..18ab473 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -33,6 +33,9 @@ LOCAL cl_driver_update_device_info_cb *cl_driver_update_device_info = NULL;
 /* Buffer */
 LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
 LOCAL cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr = NULL;
+LOCAL cl_buffer_set_softpin_offset_cb *cl_buffer_set_softpin_offset = NULL;
+LOCAL cl_buffer_set_bo_use_full_range_cb *cl_buffer_set_bo_use_full_range = NULL;
+LOCAL cl_buffer_disable_reuse_cb *cl_buffer_disable_reuse = NULL;
 LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
 LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
 LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
@@ -57,13 +60,6 @@ LOCAL cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align = NULL;
 LOCAL cl_buffer_get_buffer_from_fd_cb *cl_buffer_get_buffer_from_fd = NULL;
 LOCAL cl_buffer_get_image_from_fd_cb *cl_buffer_get_image_from_fd = NULL;
-/* cl_khr_gl_sharing */
-LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
-LOCAL cl_gl_release_texture_cb *cl_gl_release_texture = NULL;
-LOCAL cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object = NULL;
-LOCAL cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object = NULL;
-LOCAL cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer = NULL;
-LOCAL cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer = NULL;
 /* GPGPU */
 LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
 LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
@@ -107,4 +103,6 @@ LOCAL cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer = NULL;
 LOCAL cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info = NULL;
 LOCAL cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info = NULL;
 LOCAL cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer = NULL;
+LOCAL cl_gpgpu_set_kernel_cb *cl_gpgpu_set_kernel = NULL;
+LOCAL cl_gpgpu_get_kernel_cb *cl_gpgpu_get_kernel = NULL;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 54c0ffa..8350089 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -16,93 +16,102 @@
  * Author: Rong Yang <rong.r.yang at intel.com>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <pthread.h>
+//#include "cl_image.h"
 #include "cl_enqueue.h"
-#include "cl_image.h"
 #include "cl_driver.h"
 #include "cl_event.h"
 #include "cl_command_queue.h"
 #include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_device_enqueue.h"
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
-cl_int cl_enqueue_read_buffer(enqueue_data* data)
+static cl_int
+cl_enqueue_read_buffer(enqueue_data *data, cl_int status)
   cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
+  if (status != CL_COMPLETE)
+    return err;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
          mem->type == CL_MEM_SUBBUFFER_TYPE);
-  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
   //cl_buffer_get_subdata sometime is very very very slow in linux kernel, in skl and chv,
   //and it is randomly. So temporary disable it, use map/copy/unmap to read.
   //Should re-enable it after find root cause.
   if (0 && !mem->is_userptr) {
     if (cl_buffer_get_subdata(mem->bo, data->offset + buffer->sub_offset,
-			       data->size, data->ptr) != 0)
+                              data->size, data->ptr) != 0)
       err = CL_MAP_FAILURE;
   } else {
-    void* src_ptr = cl_mem_map_auto(mem, 0);
+    void *src_ptr = cl_mem_map_auto(mem, 0);
     if (src_ptr == NULL)
       err = CL_MAP_FAILURE;
     else {
       //sometimes, application invokes read buffer, instead of map buffer, even if userptr is enabled
       //memcpy is not necessary for this case
-      if (data->ptr != (char*)src_ptr + data->offset + buffer->sub_offset)
-        memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
+      if (data->ptr != (char *)src_ptr + data->offset + buffer->sub_offset)
+        memcpy(data->ptr, (char *)src_ptr + data->offset + buffer->sub_offset, data->size);
   return err;
-cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
+static cl_int
+cl_enqueue_read_buffer_rect(enqueue_data *data, cl_int status)
   cl_int err = CL_SUCCESS;
-  void* src_ptr;
-  void* dst_ptr;
+  void *src_ptr;
+  void *dst_ptr;
-  const size_t* origin = data->origin;
-  const size_t* host_origin = data->host_origin;
-  const size_t* region = data->region;
+  const size_t *origin = data->origin;
+  const size_t *host_origin = data->host_origin;
+  const size_t *region = data->region;
   cl_mem mem = data->mem_obj;
+  if (status != CL_COMPLETE)
+    return err;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
          mem->type == CL_MEM_SUBBUFFER_TYPE);
-  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
   if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
     err = CL_MAP_FAILURE;
     goto error;
-   size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
-   src_ptr = (char*)src_ptr + offset +  buffer->sub_offset;
-   offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
-   dst_ptr = (char *)data->ptr + offset;
-   if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
-       (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
-   {
-     memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
-   }
-   else {
-     cl_uint y, z;
-     for (z = 0; z < region[2]; z++) {
-       const char* src = src_ptr;
-       char* dst = dst_ptr;
-       for (y = 0; y < region[1]; y++) {
-         memcpy(dst, src, region[0]);
-         src += data->row_pitch;
-         dst += data->host_row_pitch;
-       }
-       src_ptr = (char*)src_ptr + data->slice_pitch;
-       dst_ptr = (char*)dst_ptr + data->host_slice_pitch;
-     }
-   }
+  size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2];
+  src_ptr = (char *)src_ptr + offset + buffer->sub_offset;
+  offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2];
+  dst_ptr = (char *)data->ptr + offset;
+  if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
+      (region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) {
+    memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
+  } else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char *src = src_ptr;
+      char *dst = dst_ptr;
+      for (y = 0; y < region[1]; y++) {
+        memcpy(dst, src, region[0]);
+        src += data->row_pitch;
+        dst += data->host_row_pitch;
+      }
+      src_ptr = (char *)src_ptr + data->slice_pitch;
+      dst_ptr = (char *)dst_ptr + data->host_slice_pitch;
+    }
+  }
   err = cl_mem_unmap_auto(mem);
@@ -110,75 +119,80 @@ error:
   return err;
-cl_int cl_enqueue_write_buffer(enqueue_data *data)
+static cl_int
+cl_enqueue_write_buffer(enqueue_data *data, cl_int status)
   cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
          mem->type == CL_MEM_SUBBUFFER_TYPE);
-  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
+  if (status != CL_COMPLETE)
+    return err;
   if (mem->is_userptr) {
-    void* dst_ptr = cl_mem_map_auto(mem, 1);
+    void *dst_ptr = cl_mem_map_auto(mem, 1);
     if (dst_ptr == NULL)
       err = CL_MAP_FAILURE;
     else {
-      memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
+      memcpy((char *)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
-  }
-  else {
+  } else {
     if (cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset,
-			   data->size, data->const_ptr) != 0)
+                          data->size, data->const_ptr) != 0)
       err = CL_MAP_FAILURE;
   return err;
-cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
+static cl_int
+cl_enqueue_write_buffer_rect(enqueue_data *data, cl_int status)
   cl_int err = CL_SUCCESS;
-  void* src_ptr;
-  void* dst_ptr;
+  void *src_ptr;
+  void *dst_ptr;
-  const size_t* origin = data->origin;
-  const size_t* host_origin = data->host_origin;
-  const size_t* region = data->region;
+  const size_t *origin = data->origin;
+  const size_t *host_origin = data->host_origin;
+  const size_t *region = data->region;
   cl_mem mem = data->mem_obj;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
          mem->type == CL_MEM_SUBBUFFER_TYPE);
-  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
+  if (status != CL_COMPLETE)
+    return err;
   if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
     err = CL_MAP_FAILURE;
     goto error;
-  size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
+  size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2];
   dst_ptr = (char *)dst_ptr + offset + buffer->sub_offset;
-  offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
-  src_ptr = (char*)data->const_ptr + offset;
+  offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2];
+  src_ptr = (char *)data->const_ptr + offset;
   if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
-      (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
-  {
-    memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
-  }
-  else {
+      (region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) {
+    memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
+  } else {
     cl_uint y, z;
     for (z = 0; z < region[2]; z++) {
-      const char* src = src_ptr;
-      char* dst = dst_ptr;
+      const char *src = src_ptr;
+      char *dst = dst_ptr;
       for (y = 0; y < region[1]; y++) {
         memcpy(dst, src, region[0]);
         src += data->host_row_pitch;
         dst += data->row_pitch;
-      src_ptr = (char*)src_ptr + data->host_slice_pitch;
-      dst_ptr = (char*)dst_ptr + data->slice_pitch;
+      src_ptr = (char *)src_ptr + data->host_slice_pitch;
+      dst_ptr = (char *)dst_ptr + data->slice_pitch;
@@ -188,16 +202,19 @@ error:
   return err;
-cl_int cl_enqueue_read_image(enqueue_data *data)
+static cl_int
+cl_enqueue_read_image(enqueue_data *data, cl_int status)
   cl_int err = CL_SUCCESS;
-  void* src_ptr;
+  void *src_ptr;
   cl_mem mem = data->mem_obj;
   CHECK_IMAGE(mem, image);
-  const size_t* origin = data->origin;
-  const size_t* region = data->region;
+  const size_t *origin = data->origin;
+  const size_t *region = data->region;
+  if (status != CL_COMPLETE)
+    return err;
   if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
     err = CL_MAP_FAILURE;
@@ -208,40 +225,42 @@ cl_int cl_enqueue_read_image(enqueue_data *data)
   src_ptr = (char*)src_ptr + offset;
   if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
-      (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
-  {
-    memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
-  }
-  else {
+      (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch))) {
+    memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
+  } else {
     cl_uint y, z;
     for (z = 0; z < region[2]; z++) {
-      const char* src = src_ptr;
-      char* dst = data->ptr;
+      const char *src = src_ptr;
+      char *dst = data->ptr;
       for (y = 0; y < region[1]; y++) {
-        memcpy(dst, src, image->bpp*region[0]);
+        memcpy(dst, src, image->bpp * region[0]);
         src += image->row_pitch;
         dst += data->row_pitch;
-      src_ptr = (char*)src_ptr + image->slice_pitch;
-      data->ptr = (char*)data->ptr + data->slice_pitch;
+      src_ptr = (char *)src_ptr + image->slice_pitch;
+      data->ptr = (char *)data->ptr + data->slice_pitch;
- err = cl_mem_unmap_auto(mem);
+  err = cl_mem_unmap_auto(mem);
   return err;
-cl_int cl_enqueue_write_image(enqueue_data *data)
+static cl_int
+cl_enqueue_write_image(enqueue_data *data, cl_int status)
   cl_int err = CL_SUCCESS;
-  void* dst_ptr;
+  void *dst_ptr;
   cl_mem mem = data->mem_obj;
   CHECK_IMAGE(mem, image);
+  if (status != CL_COMPLETE)
+    return err;
   if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
     err = CL_MAP_FAILURE;
     goto error;
@@ -255,45 +274,58 @@ cl_int cl_enqueue_write_image(enqueue_data *data)
   return err;
-cl_int cl_enqueue_map_buffer(enqueue_data *data)
+static cl_int
+cl_enqueue_map_buffer(enqueue_data *data, cl_int status)
   void *ptr = NULL;
   cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
-         mem->type == CL_MEM_SUBBUFFER_TYPE);
-  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+         mem->type == CL_MEM_SUBBUFFER_TYPE ||
+         mem->type == CL_MEM_SVM_TYPE);
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer *)mem;
-  if (mem->is_userptr)
-    ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
-  else {
-    if(data->unsync_map == 1)
-      //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
-      ptr = cl_mem_map_gtt(mem);
-    else
+  if (status == CL_SUBMITTED) {
+    if (buffer->base.is_userptr) {
+      ptr = buffer->base.host_ptr;
+    } else {
+      if ((ptr = cl_mem_map_gtt_unsync(&buffer->base)) == NULL) {
+        err = CL_MAP_FAILURE;
+        return err;
+      }
+    }
+    data->ptr = ptr;
+  } else if (status == CL_COMPLETE) {
+    if (mem->is_userptr)
       ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
-  }
+    else {
+      if (data->unsync_map == 1)
+        //clEnqueueMapBuffer already used an unsync map, so force map_gtt here
+        ptr = cl_mem_map_gtt(mem);
+      else
+        ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
+    }
-  if (ptr == NULL) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-  data->ptr = ptr;
+    if (ptr == NULL) {
+      err = CL_MAP_FAILURE;
+      return err;
+    }
+    data->ptr = ptr;
-  if((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) {
-    assert(mem->host_ptr);
-    ptr = (char*)ptr + data->offset + buffer->sub_offset;
-    memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
+    if ((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) {
+      assert(mem->host_ptr);
+      ptr = (char *)ptr + data->offset + buffer->sub_offset;
+      memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
+    }
   return err;
-cl_int cl_enqueue_map_image(enqueue_data *data)
+static cl_int
+cl_enqueue_map_image(enqueue_data *data, cl_int status)
   cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
@@ -301,46 +333,59 @@ cl_int cl_enqueue_map_image(enqueue_data *data)
   size_t row_pitch = 0;
   CHECK_IMAGE(mem, image);
-  if(data->unsync_map == 1)
-    //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
-    ptr = cl_mem_map_gtt(mem);
-  else
-    ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
+  if (status == CL_SUBMITTED) {
+    if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
+      err = CL_MAP_FAILURE;
+      goto error;
+    }
+    data->ptr = ptr;
+  } else if (status == CL_COMPLETE) {
+    if (data->unsync_map == 1)
+      //the enqueue call already used an unsync map, so force map_gtt here
+      ptr = cl_mem_map_gtt(mem);
+    else
+      ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
-  if (ptr == NULL) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-  data->ptr = (char*)ptr + image->offset;
-  if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
-    row_pitch = image->slice_pitch;
-  else
-    row_pitch = image->row_pitch;
-  if(mem->flags & CL_MEM_USE_HOST_PTR) {
-    assert(mem->host_ptr);
-    if (!mem->is_userptr)
-      //src and dst need add offset in function cl_mem_copy_image_region
-      cl_mem_copy_image_region(data->origin, data->region,
-                             mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
-                             data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
+    if (ptr == NULL) {
+      err = CL_MAP_FAILURE;
+      goto error;
+    }
+    data->ptr = (char*)ptr + image->offset;
+    if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      row_pitch = image->slice_pitch;
+    else
+      row_pitch = image->row_pitch;
+    if(mem->flags & CL_MEM_USE_HOST_PTR) {
+      assert(mem->host_ptr);
+      if (!mem->is_userptr)
+        //src and dst need add offset in function cl_mem_copy_image_region
+        cl_mem_copy_image_region(data->origin, data->region,
+                                 mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+                                 data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
+    }
   return err;
-cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
+static cl_int
+cl_enqueue_unmap_mem_object(enqueue_data *data, cl_int status)
   cl_int err = CL_SUCCESS;
   int i, j;
   size_t mapped_size = 0;
   size_t origin[3], region[3];
-  void * v_ptr = NULL;
-  void * mapped_ptr = data->ptr;
+  void *v_ptr = NULL;
+  void *mapped_ptr = data->ptr;
   cl_mem memobj = data->mem_obj;
   size_t row_pitch = 0;
+  if (status != CL_COMPLETE)
+    return err;
   assert(memobj->mapped_ptr_sz >= memobj->map_ref);
   for (i = 0; i < memobj->mapped_ptr_sz; i++) {
@@ -348,7 +393,7 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
       memobj->mapped_ptr[i].ptr = NULL;
       mapped_size = memobj->mapped_ptr[i].size;
       v_ptr = memobj->mapped_ptr[i].v_ptr;
-      for(j=0; j<3; j++) {
+      for (j = 0; j < 3; j++) {
         region[j] = memobj->mapped_ptr[i].region[j];
         origin[j] = memobj->mapped_ptr[i].origin[j];
         memobj->mapped_ptr[i].region[j] = 0;
@@ -364,10 +409,11 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
   INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
   if (memobj->flags & CL_MEM_USE_HOST_PTR) {
-    if(memobj->type == CL_MEM_BUFFER_TYPE ||
-       memobj->type == CL_MEM_SUBBUFFER_TYPE) {
+    if (memobj->type == CL_MEM_BUFFER_TYPE ||
+        memobj->type == CL_MEM_SUBBUFFER_TYPE ||
+        memobj->type == CL_MEM_SVM_TYPE) {
       assert(mapped_ptr >= memobj->host_ptr &&
-        mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
+             mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
       /* Sync the data. */
       if (!memobj->is_userptr)
         memcpy(v_ptr, mapped_ptr, mapped_size);
@@ -381,8 +427,8 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
       if (!memobj->is_userptr)
         //v_ptr have added offset, host_ptr have not added offset.
         cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
-                               memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
-                               image, CL_FALSE, CL_TRUE);
+                                 memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+                                 image, CL_FALSE, CL_TRUE);
   } else {
     assert(v_ptr == mapped_ptr);
@@ -391,24 +437,24 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
   /* shrink the mapped slot. */
-  if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
+  if (memobj->mapped_ptr_sz / 2 > memobj->map_ref) {
     int j = 0;
     cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
-                             sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
+      sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz / 2));
     if (!new_ptr) {
       /* Just do nothing. */
       goto error;
-    memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
+    memset(new_ptr, 0, (memobj->mapped_ptr_sz / 2) * sizeof(cl_mapped_ptr));
     for (i = 0; i < memobj->mapped_ptr_sz; i++) {
       if (memobj->mapped_ptr[i].ptr) {
         new_ptr[j] = memobj->mapped_ptr[i];
-        assert(j < memobj->mapped_ptr_sz/2);
+        assert(j < memobj->mapped_ptr_sz / 2);
-    memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
+    memobj->mapped_ptr_sz = memobj->mapped_ptr_sz / 2;
     memobj->mapped_ptr = new_ptr;
@@ -417,7 +463,8 @@ error:
   return err;
-cl_int cl_enqueue_native_kernel(enqueue_data *data)
+static cl_int
+cl_enqueue_native_kernel(enqueue_data *data, cl_int status)
   cl_int err = CL_SUCCESS;
   cl_uint num_mem_objects = (cl_uint)data->offset;
@@ -425,65 +472,208 @@ cl_int cl_enqueue_native_kernel(enqueue_data *data)
   const void **args_mem_loc = (const void **)data->const_ptr;
   cl_uint i;
-  for (i=0; i<num_mem_objects; ++i)
-  {
-      const cl_mem buffer = mem_list[i];
-      CHECK_MEM(buffer);
+  if (status != CL_COMPLETE)
+    return err;
+  for (i = 0; i < num_mem_objects; ++i) {
+    const cl_mem buffer = mem_list[i];
+    CHECK_MEM(buffer);
-      *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0);
+    *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0);
-  for (i=0; i<num_mem_objects; ++i)
-  {
-      cl_mem_unmap_auto(mem_list[i]);
+  for (i = 0; i < num_mem_objects; ++i) {
+    cl_mem_unmap_auto(mem_list[i]);
-  free(data->ptr);
   return err;
-cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
+cl_int cl_enqueue_svm_free(enqueue_data *data, cl_int status) {
+  int i;
+  void **pointers = data->pointers;
+  uint num_svm_ptrs = data->size;
+  cl_int err = CL_SUCCESS;
+  if (status != CL_COMPLETE)
+    return err;
+  if(data->free_func) {
+    data->free_func(data->queue, num_svm_ptrs, pointers, data->ptr);
+  } else {
+    for(i=0; i<num_svm_ptrs; i++)
+      cl_mem_svm_delete(data->queue->ctx, pointers[i]);
+  }
+  free(pointers);
+  return CL_SUCCESS;
+cl_int cl_enqueue_svm_mem_copy(enqueue_data *data, cl_int status) {
+  cl_mem mem;
+  size_t size = data->size;
+  const char* src_ptr = (const char *)data->const_ptr;
+  char *dst_ptr = (char *)data->ptr;
+  cl_int err = CL_SUCCESS;
+  int i;
+  if (status != CL_COMPLETE)
+    return err;
+  if((mem = cl_context_get_svm_from_ptr(data->queue->ctx, data->ptr)) != NULL) {
+      dst_ptr = (char *)cl_mem_map_auto(mem, 1);
+  }
+  if((mem = cl_context_get_svm_from_ptr(data->queue->ctx, data->const_ptr)) != NULL) {
+      src_ptr = (const char *)cl_mem_map_auto(mem, 0);
+  }
+  for(i=0; i<size; i++) {
+    dst_ptr[i] = src_ptr[i];
+  }
+  return CL_SUCCESS;
+cl_int cl_enqueue_svm_mem_fill(enqueue_data *data, cl_int status) {
+  cl_mem mem;
+  size_t size = data->size;
+  size_t pattern_size = data->pattern_size;
+  const char* pattern = (const char *)data->const_ptr;
+  char *ptr = (char *)data->ptr;
+  cl_int err = CL_SUCCESS;
+  int i, j;
+  if (status != CL_COMPLETE)
+    return err;
+  if((mem = cl_context_get_svm_from_ptr(data->queue->ctx, data->ptr)) != NULL) {
+      ptr = (char *)cl_mem_map_auto(mem, 1);
+  }
+  for(i=0; i<size; ) {
+    for(j=0; j<pattern_size; j++) {
+      ptr[i++] = pattern[j];
+    }
+  }
+  return CL_SUCCESS;
+static cl_int
+cl_enqueue_ndrange(enqueue_data *data, cl_int status)
+  cl_int err = CL_SUCCESS;
+  if (status == CL_SUBMITTED) {
+    err = cl_command_queue_flush_gpgpu(data->gpgpu);
+    //if this is the last ndrange of a CL enqueue API call,
+    //check the device enqueue information.
+    if (data->mid_event_of_enq == 0) {
+      assert(data->queue);
+      cl_device_enqueue_parse_result(data->queue, data->gpgpu);
+    }
+  } else if (status == CL_COMPLETE) {
+    void *batch_buf = cl_gpgpu_ref_batch_buf(data->gpgpu);
+    cl_gpgpu_sync(batch_buf);
+    cl_gpgpu_unref_batch_buf(batch_buf);
+  }
+  return err;
+static cl_int
+cl_enqueue_marker_or_barrier(enqueue_data *data, cl_int status)
-  /* if need profiling, add the submit timestamp here. */
-  if (event && event->type != CL_COMMAND_USER
-           && event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-    cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT);
+  return CL_COMPLETE;
+LOCAL void
+cl_enqueue_delete(enqueue_data *data)
+  if (data == NULL)
+    return;
+  if (data->type == EnqueueCopyBufferRect ||
+      data->type == EnqueueCopyBuffer ||
+      data->type == EnqueueCopyImage ||
+      data->type == EnqueueCopyBufferToImage ||
+      data->type == EnqueueCopyImageToBuffer ||
+      data->type == EnqueueNDRangeKernel ||
+      data->type == EnqueueFillBuffer ||
+      data->type == EnqueueFillImage) {
+    if (data->gpgpu) {
+      cl_gpgpu_delete(data->gpgpu);
+      data->gpgpu = NULL;
+    }
+    return;
-  switch(data->type) {
-    case EnqueueReadBuffer:
-      return cl_enqueue_read_buffer(data);
-    case EnqueueReadBufferRect:
-      return cl_enqueue_read_buffer_rect(data);
-    case EnqueueWriteBuffer:
-      return cl_enqueue_write_buffer(data);
-    case EnqueueWriteBufferRect:
-      return cl_enqueue_write_buffer_rect(data);
-    case EnqueueReadImage:
-      return cl_enqueue_read_image(data);
-    case EnqueueWriteImage:
-      return cl_enqueue_write_image(data);
-    case EnqueueMapBuffer:
-      return cl_enqueue_map_buffer(data);
-    case EnqueueMapImage:
-      return cl_enqueue_map_image(data);
-    case EnqueueUnmapMemObject:
-      return cl_enqueue_unmap_mem_object(data);
-    case EnqueueCopyBufferRect:
-    case EnqueueCopyBuffer:
-    case EnqueueCopyImage:
-    case EnqueueCopyBufferToImage:
-    case EnqueueCopyImageToBuffer:
-    case EnqueueNDRangeKernel:
-    case EnqueueFillBuffer:
-    case EnqueueFillImage:
-      return cl_event_flush(event);
-    case EnqueueNativeKernel:
-      return cl_enqueue_native_kernel(data);
-    case EnqueueMigrateMemObj:
-    default:
-      return CL_SUCCESS;
+  if (data->type == EnqueueNativeKernel) {
+    if (data->mem_list) {
+      cl_free((void*)data->mem_list);
+      data->mem_list = NULL;
+    }
+    if (data->ptr) {
+      cl_free((void*)data->ptr);
+      data->ptr = NULL;
+    }
+    if (data->const_ptr) {
+      cl_free((void*)data->const_ptr);
+      data->const_ptr = NULL;
+    }
+  }
+LOCAL cl_int
+cl_enqueue_handle(enqueue_data *data, cl_int status)
+  switch (data->type) {
+  case EnqueueReturnSuccesss:
+    return CL_SUCCESS;
+  case EnqueueReadBuffer:
+    return cl_enqueue_read_buffer(data, status);
+  case EnqueueReadBufferRect:
+    return cl_enqueue_read_buffer_rect(data, status);
+  case EnqueueWriteBuffer:
+    return cl_enqueue_write_buffer(data, status);
+  case EnqueueWriteBufferRect:
+    return cl_enqueue_write_buffer_rect(data, status);
+  case EnqueueReadImage:
+    return cl_enqueue_read_image(data, status);
+  case EnqueueWriteImage:
+    return cl_enqueue_write_image(data, status);
+  case EnqueueMapBuffer:
+    return cl_enqueue_map_buffer(data, status);
+  case EnqueueMapImage:
+    return cl_enqueue_map_image(data, status);
+  case EnqueueUnmapMemObject:
+    return cl_enqueue_unmap_mem_object(data, status);
+  case EnqueueSVMFree:
+    return cl_enqueue_svm_free(data, status);
+  case EnqueueSVMMemCopy:
+    return cl_enqueue_svm_mem_copy(data, status);
+  case EnqueueSVMMemFill:
+    return cl_enqueue_svm_mem_fill(data, status);
+  case EnqueueMarker:
+  case EnqueueBarrier:
+    return cl_enqueue_marker_or_barrier(data, status);
+  case EnqueueCopyBufferRect:
+  case EnqueueCopyBuffer:
+  case EnqueueCopyImage:
+  case EnqueueCopyBufferToImage:
+  case EnqueueCopyImageToBuffer:
+  case EnqueueNDRangeKernel:
+  case EnqueueFillBuffer:
+  case EnqueueFillImage:
+    //return cl_event_flush(event);
+    return cl_enqueue_ndrange(data, status);
+  case EnqueueNativeKernel:
+    return cl_enqueue_native_kernel(data, status);
+  case EnqueueMigrateMemObj:
+  default:
+    return CL_SUCCESS;
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index 09305af..50a54fc 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -24,7 +24,8 @@
 #include "CL/cl.h"
 typedef enum {
-  EnqueueReadBuffer = 0,
+  EnqueueReturnSuccesss = 0, /* For some case, we have nothing to do, just return SUCCESS. */
+  EnqueueReadBuffer,
@@ -45,30 +46,45 @@ typedef enum {
+  EnqueueSVMFree,
+  EnqueueSVMMemCopy,
+  EnqueueSVMMemFill,
 } enqueue_type;
 typedef struct _enqueue_data {
-  enqueue_type      type;             /* Command type */
-  cl_mem            mem_obj;          /* Enqueue's cl_mem */
-  cl_command_queue  queue;            /* Command queue */
-  size_t            offset;           /* Mem object's offset */
-  size_t            size;             /* Size */
-  size_t            origin[3];        /* Origin */
-  size_t            host_origin[3];   /* Origin */
-  size_t            region[3];        /* Region */
-  size_t            row_pitch;        /* Row pitch */
-  size_t            slice_pitch;      /* Slice pitch */
-  size_t            host_row_pitch;   /* Host row pitch, used in read/write buffer rect */
-  size_t            host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */
-  const void *      const_ptr;        /* Const ptr for memory read */
-  void *            ptr;              /* Ptr for write and return value */
-  const cl_mem*     mem_list;         /* mem_list of clEnqueueNativeKernel */
-  uint8_t           unsync_map;       /* Indicate the clEnqueueMapBuffer/Image is unsync map */
-  uint8_t           write_map;        /* Indicate if the clEnqueueMapBuffer is write enable */
-  void (*user_func)(void *);          /* pointer to a host-callable user function */
+  enqueue_type type;         /* Command type */
+  cl_mem mem_obj;            /* Enqueue's cl_mem */
+  cl_command_queue queue;    /* Command queue */
+  size_t offset;             /* Mem object's offset */
+  size_t size;               /* Size */
+  size_t origin[3];          /* Origin */
+  size_t host_origin[3];     /* Origin */
+  size_t region[3];          /* Region */
+  size_t row_pitch;          /* Row pitch */
+  size_t slice_pitch;        /* Slice pitch */
+  size_t host_row_pitch;     /* Host row pitch, used in read/write buffer rect */
+  size_t host_slice_pitch;   /* Host slice pitch, used in read/write buffer rect */
+  const void *const_ptr;     /* Const ptr for memory read */
+  void *ptr;                 /* Ptr for write and return value */
+  const cl_mem *mem_list;    /* mem_list of clEnqueueNativeKernel */
+  uint8_t unsync_map;        /* Indicate the clEnqueueMapBuffer/Image is unsync map */
+  uint8_t write_map;         /* Indicate if the clEnqueueMapBuffer is write enable */
+  void ** pointers;          /* The svm_pointers of clEnqueueSVMFree  */
+  size_t  pattern_size;      /* the pattern_size of clEnqueueSVMMemFill */
+  void (*user_func)(void *); /* pointer to a host-callable user function */
+  void (CL_CALLBACK *free_func)( cl_command_queue queue,
+                                 cl_uint num_svm_pointers,
+                                 void *svm_pointers[],
+                                 void *user_data);  /* pointer to pfn_free_func of clEnqueueSVMFree */
+  cl_gpgpu gpgpu;
+  cl_bool mid_event_of_enq;  /* For non-uniform ndrange, one enqueue have a sequence event, the
+                                last event need to parse device enqueue information.
+                                0 : last event; 1: non-last event */
 } enqueue_data;
 /* Do real enqueue commands */
-cl_int cl_enqueue_handle(cl_event event, enqueue_data* data);
+extern cl_int cl_enqueue_handle(enqueue_data *data, cl_int status);
+extern void cl_enqueue_delete(enqueue_data *data);
 #endif /* __CL_ENQUEUE_H__ */
diff --git a/src/cl_event.c b/src/cl_event.c
index a2aacea..3e1dc22 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -14,769 +14,684 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- * Author: Rong Yang <rong.r.yang at intel.com>
 #include "cl_event.h"
 #include "cl_context.h"
-#include "cl_utils.h"
-#include "cl_alloc.h"
-#include "cl_khr_icd.h"
-#include "cl_kernel.h"
 #include "cl_command_queue.h"
-#include <assert.h>
+#include "cl_alloc.h"
+#include <string.h>
 #include <stdio.h>
-void cl_event_update_last_events(cl_command_queue queue, int wait)
+// TODO: Need to move it to some device related file later.
+static void
+cl_event_update_timestamp_gen(cl_event event, cl_int status)
-  cl_event last_event = get_last_event(queue);
-  if(!last_event) return;
-  cl_event next, now;
-  now = last_event;
-  while(now){
-    next = now->last_next;//get next first in case set status maintain it
-    cl_event_update_status(now,wait);//update event status
-    now = next;
+  cl_ulong ts = 0;
+  if ((event->exec_data.type == EnqueueCopyBufferRect) ||
+      (event->exec_data.type == EnqueueCopyBuffer) ||
+      (event->exec_data.type == EnqueueCopyImage) ||
+      (event->exec_data.type == EnqueueCopyBufferToImage) ||
+      (event->exec_data.type == EnqueueCopyImageToBuffer) ||
+      (event->exec_data.type == EnqueueNDRangeKernel) ||
+      (event->exec_data.type == EnqueueFillBuffer) ||
+      (event->exec_data.type == EnqueueFillImage)) {
+    if (status == CL_QUEUED || status == CL_SUBMITTED) {
+      cl_gpgpu_event_get_gpu_cur_timestamp(event->queue->ctx->drv, &ts);
+        ts++;
+      event->timestamp[CL_QUEUED - status] = ts;
+      return;
+    } else if (status == CL_RUNNING) {
+      assert(event->exec_data.gpgpu);
+      return; // Wait for the event complete and get run and complete then.
+    } else {
+      assert(event->exec_data.gpgpu);
+      cl_gpgpu_event_get_exec_timestamp(event->exec_data.gpgpu, 0, &ts);
+        ts++;
+      event->timestamp[2] = ts;
+      cl_gpgpu_event_get_exec_timestamp(event->exec_data.gpgpu, 1, &ts);
+        ts++;
+      event->timestamp[3] = ts;
+      /* Set the submit time the same as running time if it is later. */
+      if (event->timestamp[1] > event->timestamp[2] ||
+          event->timestamp[2] - event->timestamp[1] > 0x0FFFFFFFFFF /*Overflowed */)
+        event->timestamp[1] = event->timestamp[2];
+      return;
+    }
+  } else {
+    cl_gpgpu_event_get_gpu_cur_timestamp(event->queue->ctx->drv, &ts);
+      ts++;
+    event->timestamp[CL_QUEUED - status] = ts;
+    return;
-void cl_event_insert_last_events(cl_command_queue queue,cl_event event)
+LOCAL void
+cl_event_update_timestamp(cl_event event, cl_int state)
-  if(!event) return;
-  cl_event last_event = get_last_event(queue);
-  if(last_event){
-    cl_event now = last_event;
-    while(now->last_next)
-      now = now->last_next;
-    now->last_next = event;
-    event->last_prev = now;
+  int i;
+  cl_bool re_cal = CL_FALSE;
+  cl_ulong ts[4];
+  assert(state >= CL_COMPLETE || state <= CL_QUEUED);
+  if (event->event_type == CL_COMMAND_USER)
+    return;
+  assert(event->queue);
+  if ((event->queue->props & CL_QUEUE_PROFILING_ENABLE) == 0)
+    return;
+  /* Should not record the timestamp twice. */
+  assert(event->timestamp[CL_QUEUED - state] == CL_EVENT_INVALID_TIMESTAMP);
+  cl_event_update_timestamp_gen(event, state);
+  if (state == CL_COMPLETE) {
+    // TODO: Need to set the CL_PROFILING_COMMAND_COMPLETE when enable child enqueue.
+    // Just a duplicate of event complete time now.
+    event->timestamp[4] = event->timestamp[3];
+    /* If timestamp overflow, set queued time to 0 and re-calculate. */
+    for (i = 0; i < 4; i++) {
+      if (event->timestamp[i + 1] < event->timestamp[i]) {
+        re_cal = CL_TRUE;
+        break;
+      }
+    }
+    if (re_cal) {
+      for (i = 3; i >= 0; i--) {
+        if (event->timestamp[i + 1] < event->timestamp[i]) { //overflow
+          ts[i] = event->timestamp[i + 1] + (CL_EVENT_INVALID_TIMESTAMP - event->timestamp[i]);
+        } else {
+          ts[i] = event->timestamp[i + 1] - event->timestamp[i];
+        }
+      }
+      event->timestamp[0] = 0;
+      for (i = 1; i < 5; i++) {
+        event->timestamp[i] = event->timestamp[i - 1] + ts[i - 1];
+      }
+    }
-  else set_last_event(queue,event);
-static inline cl_bool
-cl_event_is_gpu_command_type(cl_command_type type)
+LOCAL void
+cl_event_add_ref(cl_event event)
-  switch(type) {
-    case CL_COMMAND_TASK:
-      return CL_TRUE;
-    default:
-      return CL_FALSE;
-  }
+  assert(event);
+  CL_OBJECT_INC_REF(event);
-int cl_event_flush(cl_event event)
+LOCAL cl_int
+cl_event_get_status(cl_event event)
-  int err = CL_SUCCESS;
-  if(!event) {
-    err = CL_INVALID_VALUE;
-    return err;
-  }
+  cl_int ret;
-  assert(event->gpgpu_event != NULL);
-  if (event->gpgpu) {
-    err = cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
-    cl_gpgpu_delete(event->gpgpu);
-    event->gpgpu = NULL;
-  }
-  cl_gpgpu_event_flush(event->gpgpu_event);
-  cl_event_insert_last_events(event->queue,event);
-  return err;
+  assert(event);
+  CL_OBJECT_LOCK(event);
+  ret = event->status;
+  return ret;
-cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
+static cl_event
+cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type,
+             cl_uint num_events, cl_event *event_list)
-  cl_event event = NULL;
+  int i;
+  cl_event e = cl_calloc(1, sizeof(_cl_event));
+  if (e == NULL)
+    return NULL;
-  /* Allocate and inialize the structure itself */
-  TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
-  SET_ICD(event->dispatch)
-  event->magic = CL_MAGIC_EVENT_HEADER;
-  event->ref_n = 1;
   /* Append the event in the context event list */
-  pthread_mutex_lock(&ctx->event_lock);
-    event->next = ctx->events;
-    if (ctx->events != NULL)
-      ctx->events->prev = event;
-    ctx->events = event;
-  pthread_mutex_unlock(&ctx->event_lock);
-  event->ctx   = ctx;
-  cl_context_add_ref(ctx);
-  /* Initialize all members and create GPGPU event object */
-  event->queue = queue;
-  event->type  = type;
-  event->gpgpu_event = NULL;
-  if(type == CL_COMMAND_USER) {
-    event->status = CL_SUBMITTED;
+  cl_context_add_event(ctx, e);
+  e->queue = queue;
+  list_init(&e->callbacks);
+  list_node_init(&e->enqueue_node);
+  e->event_type = type;
+  if (type == CL_COMMAND_USER) {
+    e->status = CL_SUBMITTED;
+  } else {
+    e->status = CL_EVENT_STATE_UNKNOWN;
-  else {
-    event->status = CL_QUEUED;
-    if(cl_event_is_gpu_command_type(event->type))
-      event->gpgpu_event = cl_gpgpu_event_new(gpgpu);
+  if (type == CL_COMMAND_USER) {
+    assert(queue == NULL);
+  }
+  e->depend_events = event_list;
+  e->depend_event_num = num_events;
+  for (i = 0; i < 4; i++) {
+    e->timestamp[i] = CL_EVENT_INVALID_TIMESTAMP;
-  cl_event_add_ref(event);       //dec when complete
-  event->user_cb = NULL;
-  event->enqueue_cb = NULL;
-  event->waits_head = NULL;
-  event->emplict = emplict;
-  return event;
-  cl_event_delete(event);
-  event = NULL;
-  goto exit;
+  return e;
-void cl_event_delete(cl_event event)
+LOCAL void
+cl_event_delete(cl_event event)
+  int i;
+  cl_event_user_callback cb;
   if (UNLIKELY(event == NULL))
-  cl_event_update_status(event, 0);
-  if (atomic_dec(&event->ref_n) > 1)
+  if (CL_OBJECT_DEC_REF(event) > 1)
-  /* Call all user's callback if haven't execute */
-  cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE status will force all callbacks that are not executed to run
+  cl_enqueue_delete(&event->exec_data);
-  /* delete gpgpu event object */
-  if(event->gpgpu_event)
-    cl_gpgpu_event_delete(event->gpgpu_event);
+  assert(list_node_out_of_list(&event->enqueue_node));
+  if (event->depend_events) {
+    assert(event->depend_event_num);
+    for (i = 0; i < event->depend_event_num; i++) {
+      cl_event_delete(event->depend_events[i]);
+    }
+    cl_free(event->depend_events);
+  }
+  /* Free all the callbacks. Last ref, no need to lock. */
+  while (!list_empty(&event->callbacks)) {
+    cb = list_entry(event->callbacks.head_node.n, _cl_event_user_callback, node);
+    list_node_del(&cb->node);
+    cl_free(cb);
+  }
   /* Remove it from the list */
-  pthread_mutex_lock(&event->ctx->event_lock);
-  if (event->prev)
-    event->prev->next = event->next;
-  if (event->next)
-    event->next->prev = event->prev;
-  /* if this is the head, update head pointer ctx->events */
-  if (event->ctx->events == event)
-    event->ctx->events = event->next;
-  pthread_mutex_unlock(&event->ctx->event_lock);
-  cl_context_delete(event->ctx);
-  if (event->gpgpu) {
-    fprintf(stderr, "Warning: a event is deleted with a pending enqueued task.\n");
-    cl_gpgpu_delete(event->gpgpu);
-    event->gpgpu = NULL;
-  }
+  cl_context_remove_event(event->ctx, event);
-void cl_event_add_ref(cl_event event)
+LOCAL cl_event
+cl_event_create(cl_context ctx, cl_command_queue queue, cl_uint num_events,
+                const cl_event *event_list, cl_command_type type, cl_int *errcode_ret)
-  assert(event);
-  atomic_inc(&event->ref_n);
+  cl_event e = NULL;
+  cl_event *depend_events = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_uint total_events = 0;
+  int i;
-cl_int cl_event_set_callback(cl_event event ,
-                                  cl_int command_exec_callback_type,
-                                  EVENT_NOTIFY pfn_notify,
-                                  void* user_data)
-  assert(event);
-  assert(pfn_notify);
+  assert(ctx);
-  cl_int err = CL_SUCCESS;
-  user_callback *cb;
-  TRY_ALLOC(cb, CALLOC(user_callback));
-  cb->pfn_notify  = pfn_notify;
-  cb->user_data   = user_data;
-  cb->status      = command_exec_callback_type;
-  cb->executed    = CL_FALSE;
-  // It is possible that the event enqueued is already completed.
-  // clEnqueueReadBuffer can be synchronous and when the callback
-  // is registered after, it still needs to get executed.
-  pthread_mutex_lock(&event->ctx->event_lock); // Thread safety required: operations on the event->status can be made from many different threads
-  if(event->status <= command_exec_callback_type) {
-    /* Call user callback */
-    pthread_mutex_unlock(&event->ctx->event_lock); // pfn_notify can call clFunctions that use the event_lock and from here it's not required
-    cb->pfn_notify(event, event->status, cb->user_data);
-    cl_free(cb);
-  } else {
-    // Enqueue to callback list
-    cb->next        = event->user_cb;
-    event->user_cb  = cb;
-    pthread_mutex_unlock(&event->ctx->event_lock);
-  }
+  do {
+    if (event_list)
+      assert(num_events);
-  return err;
-  cl_free(cb);
-  goto exit;
-cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
-                                    const cl_event *event_wait_list,
-                                    cl_event *event,cl_context ctx)
-  cl_int err = CL_SUCCESS;
-  cl_int i;
-  /* check the event_wait_list and num_events_in_wait_list */
-  if((event_wait_list == NULL) &&
-     (num_events_in_wait_list > 0))
-    goto error;
-  if ((event_wait_list != NULL) &&
-      (num_events_in_wait_list == 0)){
-    goto error;
-  }
+    if (queue == NULL) {
+      assert(type == CL_COMMAND_USER);
+      assert(event_list == NULL);
+      assert(num_events == 0);
-  /* check the event and context */
-  for(i=0; i<num_events_in_wait_list; i++) {
-    CHECK_EVENT(event_wait_list[i]);
-    if(event_wait_list[i]->status < CL_COMPLETE) {
-      goto exit;
+      e = cl_event_new(ctx, queue, type, 0, NULL);
+      if (e == NULL) {
+        err = CL_OUT_OF_HOST_MEMORY;
+        break;
+      }
+    } else {
+      CL_OBJECT_LOCK(queue);
+      total_events = queue->barrier_events_num + num_events;
+      if (total_events) {
+        depend_events = cl_calloc(total_events, sizeof(cl_event));
+        if (depend_events == NULL) {
+          CL_OBJECT_UNLOCK(queue);
+          err = CL_OUT_OF_HOST_MEMORY;
+          break;
+        }
+      }
+      /* Add all the barrier events as depend events. */
+      for (i = 0; i < queue->barrier_events_num; i++) {
+        assert(CL_EVENT_IS_BARRIER(queue->barrier_events[i]));
+        cl_event_add_ref(queue->barrier_events[i]);
+        depend_events[num_events + i] = queue->barrier_events[i];
+      }
+      CL_OBJECT_UNLOCK(queue);
+      for (i = 0; i < num_events; i++) {
+        assert(event_list && event_list[i]);
+        assert(event_list[i]->ctx == ctx);
+        assert(CL_OBJECT_IS_EVENT(event_list[i]));
+        cl_event_add_ref(event_list[i]);
+        depend_events[i] = event_list[i];
+      }
+      if (depend_events)
+        assert(total_events);
+      e = cl_event_new(ctx, queue, type, total_events, depend_events);
+      if (e == NULL) {
+        err = CL_OUT_OF_HOST_MEMORY;
+        break;
+      }
+      depend_events = NULL;
-    if(event && event == &event_wait_list[i])
-      goto error;
-    if(event_wait_list[i]->ctx != ctx) {
-      err = CL_INVALID_CONTEXT;
-      goto exit;
+  } while (0);
+  if (err != CL_SUCCESS) {
+    if (depend_events) {
+      for (i = 0; i < total_events; i++) {
+        cl_event_delete(depend_events[i]);
+      }
+      cl_free(depend_events);
+    // if set depend_events, must succeed.
+    assert(e->depend_events == NULL);
+    cl_event_delete(e);
-  return err;
-  err = CL_INVALID_EVENT_WAIT_LIST;  //reset error
-  goto exit;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return e;
-cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
-                            cl_command_queue queue)
+LOCAL cl_int
+cl_event_set_callback(cl_event event, cl_int exec_type, cl_event_notify_cb pfn_notify, void *user_data)
-  cl_int i;
+  cl_int err = CL_SUCCESS;
+  cl_event_user_callback cb;
+  cl_bool exec_imm = CL_FALSE;
+  assert(event);
+  assert(pfn_notify);
-  /* Check whether wait user events */
-  for(i=0; i<num_events_in_wait_list; i++) {
-    if(event_wait_list[i]->status <= CL_COMPLETE)
-      continue;
+  do {
+    cb = cl_calloc(1, sizeof(_cl_event_user_callback));
+    if (cb == NULL) {
+      err = CL_OUT_OF_HOST_MEMORY;
+      break;
+    }
-    /* Need wait on user event, return and do enqueue defer */
-    if((event_wait_list[i]->type == CL_COMMAND_USER) ||
-       (event_wait_list[i]->enqueue_cb &&
-       (event_wait_list[i]->enqueue_cb->wait_user_events != NULL))){
+    list_node_init(&cb->node);
+    cb->pfn_notify = pfn_notify;
+    cb->user_data = user_data;
+    cb->status = exec_type;
+    cb->executed = CL_FALSE;
+    CL_OBJECT_LOCK(event);
+    if (event->status > exec_type) {
+      list_add_tail(&event->callbacks, &cb->node);
+      cb = NULL;
+    } else {
+      /* The state has already OK, call it immediately. */
+      exec_imm = CL_TRUE;
-  }
+    CL_OBJECT_UNLOCK(event);
-  if(queue && queue->barrier_events_num )
+    if (exec_imm) {
+      cb->pfn_notify(event, event->status, cb->user_data);
+    }
-  /* Non user events or all user event finished, wait all enqueue events finish */
-  for(i=0; i<num_events_in_wait_list; i++) {
-    if(event_wait_list[i]->status <= CL_COMPLETE)
-      continue;
+  } while (0);
-    //enqueue callback haven't finish, in another thread, wait
-    if(event_wait_list[i]->enqueue_cb != NULL)
-    if(event_wait_list[i]->gpgpu_event)
-      cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
-    cl_event_set_status(event_wait_list[i], CL_COMPLETE);  //Execute user's callback
-  }
+  if (cb)
+    cl_free(cb);
+  return err;
-void cl_event_new_enqueue_callback(cl_event event,
-                                            enqueue_data *data,
-                                            cl_uint num_events_in_wait_list,
-                                            const cl_event *event_wait_list)
+LOCAL cl_int
+cl_event_set_status(cl_event event, cl_int status)
-  enqueue_callback *cb, *node;
-  user_event *user_events, *u_ev;
-  cl_command_queue queue = event ? event->queue : NULL;
-  cl_int i;
-  cl_int err = CL_SUCCESS;
+  list_head tmp_callbacks;
+  list_node *n;
+  list_node *pos;
+  cl_bool notify_queue = CL_FALSE;
+  cl_event_user_callback cb;
-  /* Allocate and initialize the structure itself */
-  TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
-  cb->num_events = 0;
-  TRY_ALLOC_NO_ERR (cb->wait_list, CALLOC_ARRAY(cl_event, num_events_in_wait_list));
-  for(i=0; i<num_events_in_wait_list; i++) {
-    //user event will insert to cb->wait_user_events, need not in wait list, avoid ref twice
-    if(event_wait_list[i]->type != CL_COMMAND_USER) {
-      cb->wait_list[cb->num_events++] = event_wait_list[i];
-      cl_event_add_ref(event_wait_list[i]);  //add defer enqueue's wait event reference
-    }
+  assert(event);
+  CL_OBJECT_LOCK(event);
+  if (event->status <= CL_COMPLETE) { // Already set to error or completed
+    CL_OBJECT_UNLOCK(event);
-  cb->event = event;
-  cb->next = NULL;
-  cb->wait_user_events = NULL;
-  if(queue && queue->barrier_events_num > 0) {
-    for(i=0; i<queue->barrier_events_num; i++) {
-      /* Insert the enqueue_callback to user event list */
-      node = queue->wait_events[i]->waits_head;
-      if(node == NULL)
-        queue->wait_events[i]->waits_head = cb;
-      else{
-        while((node != cb) && node->next)
-          node = node->next;
-        if(node == cb)   //wait on dup user event
-          continue;
-        node->next = cb;
-      }
-      /* Insert the user event to enqueue_callback's wait_user_events */
-      TRY(cl_event_insert_user_event, &cb->wait_user_events, queue->wait_events[i]);
-      cl_event_add_ref(queue->wait_events[i]);
-    }
+  if (CL_EVENT_IS_USER(event)) {
+    assert(event->status != CL_RUNNING && event->status != CL_QUEUED);
+  } else {
+    assert(event->queue); // Must belong to some queue.
-  /* Find out all user events that in event_wait_list wait */
-  for(i=0; i<num_events_in_wait_list; i++) {
-    if(event_wait_list[i]->status <= CL_COMPLETE)
-      continue;
-    if(event_wait_list[i]->type == CL_COMMAND_USER) {
-      /* Insert the enqueue_callback to user event list */
-      node = event_wait_list[i]->waits_head;
-      if(node == NULL)
-        event_wait_list[i]->waits_head = cb;
-      else {
-        while((node != cb) && node->next)
-          node = node->next;
-        if(node == cb)   //wait on dup user event
+  if (status >= event->status) { // Should never go back.
+    CL_OBJECT_UNLOCK(event);
+  }
+  event->status = status;
+  /* Call all the callbacks. */
+  if (!list_empty(&event->callbacks)) {
+    do {
+      status = event->status;
+      list_init(&tmp_callbacks);
+      list_move(&event->callbacks, &tmp_callbacks);
+      /* Call all the callbacks without lock. */
+      CL_OBJECT_UNLOCK(event);
+      list_for_each_safe(pos, n, &tmp_callbacks)
+      {
+        cb = list_entry(pos, _cl_event_user_callback, node);
+        assert(cb->executed == CL_FALSE);
+        if (cb->status < status)
-        node->next = cb;
-      }
-      /* Insert the user event to enqueue_callback's wait_user_events */
-      TRY(cl_event_insert_user_event, &cb->wait_user_events, event_wait_list[i]);
-      cl_event_add_ref(event_wait_list[i]);
-      if(queue)
-        cl_command_queue_insert_event(queue, event_wait_list[i]);
-      if(queue && data->type == EnqueueBarrier){
-        cl_command_queue_insert_barrier_event(queue, event_wait_list[i]);
-      }
-    } else if(event_wait_list[i]->enqueue_cb != NULL) {
-      user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
-      while(user_events != NULL) {
-        /* Insert the enqueue_callback to user event's  waits_tail */
-        node = user_events->event->waits_head;
-        if(node == NULL)
-          event_wait_list[i]->waits_head = cb;
-        else{
-          while((node != cb) && node->next)
-            node = node->next;
-          if(node == cb) {  //wait on dup user event
-            user_events = user_events->next;
-            continue;
-          }
-          node->next = cb;
-        }
-        /* Insert the user event to enqueue_callback's wait_user_events */
-        TRY(cl_event_insert_user_event, &cb->wait_user_events, user_events->event);
-        cl_event_add_ref(user_events->event);
-        if(queue)
-          cl_command_queue_insert_event(event->queue, user_events->event);
-        if(queue && data->type == EnqueueBarrier){
-          cl_command_queue_insert_barrier_event(event->queue, user_events->event);
-        }
-        user_events = user_events->next;
+        list_node_del(&cb->node);
+        cb->executed = CL_TRUE;
+        cb->pfn_notify(event, status, cb->user_data);
+        cl_free(cb);
-    }
-  }
-  if(event != NULL && event->queue != NULL && event->gpgpu_event != NULL) {
-    event->gpgpu = cl_thread_gpgpu_take(event->queue);
-    data->ptr = (void *)event->gpgpu_event;
-  }
-  cb->data = *data;
-  if(event)
-    event->enqueue_cb = cb;
-  return;
-  if(cb) {
-    while(cb->wait_user_events) {
-      u_ev = cb->wait_user_events;
-      cb->wait_user_events = cb->wait_user_events->next;
-      cl_event_delete(u_ev->event);
-      cl_free(u_ev);
-    }
-    for(i=0; i<cb->num_events; i++) {
-      if(cb->wait_list[i]) {
-        cl_event_delete(cb->wait_list[i]);
-      }
-    }
-    cl_free(cb);
-  }
-  goto exit;
-void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb) {
-  user_callback *user_cb = NULL;
-  user_callback *queue_cb = NULL; // For thread safety, we create a queue that holds user_callback's pfn_notify contents
-  user_callback *temp_cb = NULL;
-  user_cb = event->user_cb;
-  pthread_mutex_lock(&event->ctx->event_lock);
-  while(user_cb) {
-    if(user_cb->status >= status
-        && user_cb->executed == CL_FALSE) { // Added check to not execute a callback when it was already handled
-      user_cb->executed = CL_TRUE;
-      temp_cb = cl_malloc(sizeof(user_callback));
-      if(!temp_cb) {
-        break; // Out of memory
-      }
-      temp_cb->pfn_notify = user_cb->pfn_notify; // Minor struct copy to call ppfn_notify out of the pthread_mutex
-      temp_cb->user_data = user_cb->user_data;
-      if(free_cb) {
-        cl_free(user_cb);
-      }
-      if(!queue_cb) {
-        queue_cb = temp_cb;
-        queue_cb->next = NULL;
-      } else { // Enqueue First
-        temp_cb->next = queue_cb;
-        queue_cb = temp_cb;
-      }
-    }
-    user_cb = user_cb->next;
-  }
-  pthread_mutex_unlock(&event->ctx->event_lock);
-  // Calling the callbacks outside of the event_lock is required because the callback can call cl_api functions and get deadlocked
-  while(queue_cb) { // For each callback queued, actually execute the callback
-    queue_cb->pfn_notify(event, event->status, queue_cb->user_data);
-    temp_cb = queue_cb;
-    queue_cb = queue_cb->next;
-    cl_free(temp_cb);
-  }
+      CL_OBJECT_LOCK(event);
-void cl_event_set_status(cl_event event, cl_int status)
-  cl_int ret, i;
-  cl_event evt;
+      // Set back the uncalled callbacks.
+      list_merge(&event->callbacks, &tmp_callbacks);
-  pthread_mutex_lock(&event->ctx->event_lock);
-  if(status >= event->status) {
-    pthread_mutex_unlock(&event->ctx->event_lock);
-    return;
-  }
-  if(event->status <= CL_COMPLETE) {
-    event->status = status;    //have done enqueue before or doing in another thread
-    pthread_mutex_unlock(&event->ctx->event_lock);
-    return;
+      /* Status may changed because we unlock. need to check again. */
+    } while (status != event->status);
-  if(status <= CL_COMPLETE) {
-    if(event->enqueue_cb) {
-      if(status == CL_COMPLETE) {
-        cl_enqueue_handle(event, &event->enqueue_cb->data);
-        if(event->gpgpu_event)
-          cl_gpgpu_event_update_status(event->gpgpu_event, 1);  //now set complet, need refine
-      } else {
-        if(event->gpgpu_event) {
-          // Error then cancel the enqueued event.
-          cl_gpgpu_delete(event->gpgpu);
-          event->gpgpu = NULL;
-        }
-      }
+  /*  Wakeup all the waiter for status change. */
+  if (event->status <= CL_COMPLETE) {
+    notify_queue = CL_TRUE;
+  }
-      event->status = status;  //Change the event status after enqueue and befor unlock
-      pthread_mutex_unlock(&event->ctx->event_lock);
-      for(i=0; i<event->enqueue_cb->num_events; i++)
-        cl_event_delete(event->enqueue_cb->wait_list[i]);
-      pthread_mutex_lock(&event->ctx->event_lock);
+  /* Need to notify all the command queue within the same context. */
+  if (notify_queue) {
+    cl_command_queue queue = NULL;
-      if(event->enqueue_cb->wait_list)
-        cl_free(event->enqueue_cb->wait_list);
-      cl_free(event->enqueue_cb);
-      event->enqueue_cb = NULL;
+    /*First, we need to remove it from queue's barrier list. */
+    if (CL_EVENT_IS_BARRIER(event)) {
+      assert(event->queue);
+      cl_command_queue_remove_barrier_event(event->queue, event);
-  }
-  if(event->status >= status)  //maybe changed in other threads
-    event->status = status;
-  pthread_mutex_unlock(&event->ctx->event_lock);
-  /* Call user callback */
-  cl_event_call_callback(event, status, CL_FALSE);
-  if(event->type == CL_COMMAND_USER) {
-    /* Check all defer enqueue */
-    enqueue_callback *cb, *enqueue_cb = event->waits_head;
-    while(enqueue_cb) {
-      /* Remove this user event in enqueue_cb, update the header if needed. */
-      cl_event_remove_user_event(&enqueue_cb->wait_user_events, event);
-      cl_event_delete(event);
-      /* Still wait on other user events */
-      if(enqueue_cb->wait_user_events != NULL) {
-        enqueue_cb = enqueue_cb->next;
-        continue;
-      }
-      //remove user event frome enqueue_cb's ctx
-      cl_command_queue_remove_event(enqueue_cb->event->queue, event);
-      cl_command_queue_remove_barrier_event(enqueue_cb->event->queue, event);
-      /* All user events complete, now wait enqueue events */
-      ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
-          enqueue_cb->event->queue);
-      assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
-      ret = ~ret;
-      cb = enqueue_cb;
-      enqueue_cb = enqueue_cb->next;
-      /* Call the pending operation */
-      evt = cb->event;
-      /* TODO: if this event wait on several events, one event's
-         status is error, the others is complete, what's the status
-         of this event? Can't find the description in OpenCL spec.
-         Simply update to latest finish wait event.*/
-      cl_event_set_status(cb->event, status);
-      if(evt->emplict == CL_FALSE) {
-        cl_event_delete(evt);
-      }
+    /* Then, notify all the queues within the same context. */
+    CL_OBJECT_LOCK(event->ctx);
+    /* Disable remove and add queue to the context temporary. We need to
+       make sure all the queues in the context currently are valid. */
+    event->ctx->queue_modify_disable++;
+    CL_OBJECT_UNLOCK(event->ctx);
+    list_for_each(pos, &event->ctx->queues)
+    {
+      queue = (cl_command_queue)(list_entry(pos, _cl_base_object, node));
+      assert(queue != NULL);
+      cl_command_queue_notify(queue);
-    event->waits_head = NULL;
+    CL_OBJECT_LOCK(event->ctx);
+    /* Disable remove and add queue to the context temporary. We need to
+       make sure all the queues in the context currently are valid. */
+    event->ctx->queue_modify_disable--;
+    CL_OBJECT_NOTIFY_COND(event->ctx);
+    CL_OBJECT_UNLOCK(event->ctx);
-  if(event->status <= CL_COMPLETE){
-    /* Maintain the last_list when event completed*/
-    if (event->last_prev)
-      event->last_prev->last_next = event->last_next;
-    if (event->last_next)
-      event->last_next->last_prev = event->last_prev;
-    if(event->queue && get_last_event(event->queue) == event)
-      set_last_event(event->queue, event->last_next);
-    event->last_prev = NULL;
-    event->last_next = NULL;
-    cl_event_delete(event);
-  }
+  return CL_SUCCESS;
-void cl_event_update_status(cl_event event, int wait)
+LOCAL cl_int
+cl_event_wait_for_event_ready(const cl_event event)
-  if(event->status <= CL_COMPLETE)
-    return;
-  if((event->gpgpu_event) &&
-     (cl_gpgpu_event_update_status(event->gpgpu_event, wait) == command_complete))
-    cl_event_set_status(event, CL_COMPLETE);
+  assert(CL_OBJECT_IS_EVENT(event));
+  return cl_event_wait_for_events_list(event->depend_event_num, event->depend_events);
-cl_int cl_event_marker_with_wait_list(cl_command_queue queue,
-                cl_uint num_events_in_wait_list,
-                const cl_event *event_wait_list,
-                cl_event* event)
+LOCAL cl_int
+cl_event_wait_for_events_list(cl_uint num_events, const cl_event *event_list)
-  enqueue_data data = { 0 };
+  int i;
   cl_event e;
+  cl_int ret = CL_SUCCESS;
-  e = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE);
-  if(e == NULL)
-    return CL_OUT_OF_HOST_MEMORY;
+  for (i = 0; i < num_events; i++) {
+    e = event_list[i];
+    assert(e);
+    assert(CL_OBJECT_IS_EVENT(e));
-  if(event != NULL ){
-    *event = e;
-  }
+    while (e->status > CL_COMPLETE) {
+    }
-//enqueues a marker command which waits for either a list of events to complete, or if the list is
-//empty it waits for all commands previously enqueued in command_queue to complete before it  completes.
-  if(num_events_in_wait_list > 0){
-    if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
-      data.type = EnqueueMarker;
-      cl_event_new_enqueue_callback(event?*event:NULL, &data, num_events_in_wait_list, event_wait_list);
-      return CL_SUCCESS;
+    assert(e->status <= CL_COMPLETE);
+    /* Iff some error happened, return the error. */
+    if (e->status < CL_COMPLETE) {
-  } else if(queue->wait_events_num > 0) {
-    data.type = EnqueueMarker;
-    cl_event_new_enqueue_callback(event?*event:NULL, &data, queue->wait_events_num, queue->wait_events);
-    return CL_SUCCESS;
-  cl_event_update_last_events(queue,1);
-  cl_event_set_status(e, CL_COMPLETE);
-  return CL_SUCCESS;
+  return ret;
-cl_int cl_event_barrier_with_wait_list(cl_command_queue queue,
-                cl_uint num_events_in_wait_list,
-                const cl_event *event_wait_list,
-                cl_event* event)
+LOCAL cl_int
+cl_event_check_waitlist(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+                        cl_event *event, cl_context ctx)
-  enqueue_data data = { 0 };
-  cl_event e;
+  cl_int err = CL_SUCCESS;
+  cl_int i;
-  e = cl_event_new(queue->ctx, queue, CL_COMMAND_BARRIER, CL_TRUE);
-  if(e == NULL)
-    return CL_OUT_OF_HOST_MEMORY;
+  do {
+    /* check the event_wait_list and num_events_in_wait_list */
+    if ((event_wait_list == NULL) && (num_events_in_wait_list > 0)) {
+      break;
+    }
-  if(event != NULL ){
-    *event = e;
-  }
-//enqueues a barrier command which waits for either a list of events to complete, or if the list is
-//empty it waits for all commands previously enqueued in command_queue to complete before it  completes.
-  if(num_events_in_wait_list > 0){
-    if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
-      data.type = EnqueueBarrier;
-      cl_event_new_enqueue_callback(e, &data, num_events_in_wait_list, event_wait_list);
-      return CL_SUCCESS;
+    if ((event_wait_list != NULL) && (num_events_in_wait_list == 0)) {
+      break;
-  } else if(queue->wait_events_num > 0) {
-    data.type = EnqueueBarrier;
-    cl_event_new_enqueue_callback(e, &data, queue->wait_events_num, queue->wait_events);
-    return CL_SUCCESS;
-  }
-  cl_event_update_last_events(queue,1);
+    /* check the event and context */
+    for (i = 0; i < num_events_in_wait_list; i++) {
+      if (!CL_OBJECT_IS_EVENT(event_wait_list[i])) {
+        break;
+      }
-  cl_event_set_status(e, CL_COMPLETE);
-  return CL_SUCCESS;
+      if (event == event_wait_list + i) { /* Pointer of element of the wait list */
+        break;
+      }
-cl_ulong cl_event_get_cpu_timestamp(cl_ulong *cpu_time)
-  struct timespec ts;
+      /* check all belong to same context. */
+      if (ctx == NULL) {
+        ctx = event_wait_list[i]->ctx;
+      }
+      if (event_wait_list[i]->ctx != ctx) {
+        err = CL_INVALID_CONTEXT;
+        break;
+      }
+    }
- if(clock_gettime(CLOCK_MONOTONIC_RAW,&ts) != 0){
-  printf("CPU Timmer error\n");
-  return CL_FALSE;
-  }
-  *cpu_time = (1000000000.0) * (cl_ulong) ts.tv_sec + (cl_ulong) ts.tv_nsec;
+    if (err != CL_SUCCESS)
+      break;
-  return CL_SUCCESS;
+  } while (0);
+  return err;
-cl_int cl_event_get_queued_cpu_timestamp(cl_event event)
+/* When we call this function, all the events it depends
+   on should already be ready, unless ignore_depends is set. */
+LOCAL cl_uint
+cl_event_exec(cl_event event, cl_int exec_to_status, cl_bool ignore_depends)
-  cl_int ret_val;
+  /* We are MT safe here, no one should call this
+     at the same time. No need to lock */
+  cl_int ret = CL_SUCCESS;
+  cl_int cur_status = cl_event_get_status(event);
+  cl_int depend_status;
+  cl_int s;
+  assert(exec_to_status >= CL_COMPLETE);
+  assert(exec_to_status <= CL_QUEUED);
+  if (cur_status < CL_COMPLETE) {
+    return cur_status;
+  }
+  depend_status = cl_event_is_ready(event);
+  assert(depend_status <= CL_COMPLETE || ignore_depends || exec_to_status == CL_QUEUED);
+  if (depend_status < CL_COMPLETE) { // An error happened, cancel exec.
+    ret = cl_event_set_status(event, depend_status);
+    return depend_status;
+  }
+  if (cur_status <= exec_to_status) {
+    return ret;
+  }
+  /* Exec to the target status. */
+  for (s = cur_status - 1; s >= exec_to_status; s--) {
+    assert(s >= CL_COMPLETE);
+    ret = cl_enqueue_handle(&event->exec_data, s);
+    if (ret != CL_SUCCESS) {
+      assert(ret < 0);
+      DEBUGP(DL_WARNING, "Exec event %p error, type is %d, error staus is %d",
+             event, event->event_type, ret);
+      ret = cl_event_set_status(event, ret);
+      assert(ret == CL_SUCCESS);
+      return ret; // Failed, so we go no further.
+    } else {
+      assert(!CL_EVENT_IS_USER(event));
+      if ((event->queue->props & CL_QUEUE_PROFILING_ENABLE) != 0) {
+        /* record the timestamp before actually doing something. */
+        cl_event_update_timestamp(event, s);
+      }
-  ret_val = cl_event_get_cpu_timestamp(&event->queued_timestamp);
+      ret = cl_event_set_status(event, s);
+      assert(ret == CL_SUCCESS);
+    }
+  }
-  return ret_val;
+  return ret;
-cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp,cl_ulong end_timestamp)
+/* 0 means ready, >0 means not ready, <0 means error. */
+LOCAL cl_int
+cl_event_is_ready(cl_event event)
-  cl_ulong ret_val;
-  if(end_timestamp > start_timestamp){
-   ret_val = end_timestamp - start_timestamp;
-   }
-  else {
-   /*if start time stamp is greater than end timstamp then set ret value to max*/
-   ret_val = ((cl_ulong) 1 << 32);
-  }
+  int i;
+  int status;
+  int ret_status = CL_COMPLETE;
-  return ret_val;
+  for (i = 0; i < event->depend_event_num; i++) {
+    status = cl_event_get_status(event->depend_events[i]);
-cl_ulong cl_event_get_start_timestamp(cl_event event)
-  cl_ulong ret_val;
+    if (status > CL_COMPLETE) { // Found a dependency that is not ready; report its status now
+      return status;
+    }
-   ret_val = cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[2]);
+    if (status < CL_COMPLETE) { // Record some error.
+      ret_status = status;
+    }
+  }
-  return ret_val;
+  return ret_status;
-cl_ulong cl_event_get_end_timestamp(cl_event event)
+LOCAL cl_event
+cl_event_create_marker_or_barrier(cl_command_queue queue, cl_uint num_events_in_wait_list,
+                                  const cl_event *event_wait_list, cl_bool is_barrier, cl_int *error)
- cl_ulong ret_val;
+  cl_event e = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_command_type type = CL_COMMAND_MARKER;
+  enqueue_type eq_type = EnqueueMarker;
-  ret_val = cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[3]);
+  if (is_barrier) {
+    eq_type = EnqueueBarrier;
+  }
-  return ret_val;
+  if (event_wait_list) {
+    assert(num_events_in_wait_list > 0);
-cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
-  cl_ulong ret_val = 0;
-  GET_QUEUE_THREAD_GPGPU(event->queue);
+    e = cl_event_create(queue->ctx, queue, num_events_in_wait_list,
+                        event_wait_list, type, &err);
+    if (err != CL_SUCCESS) {
+      *error = err;
+      return NULL;
+    }
+  } else { /* The marker depends on all events in the queue now. */
+    cl_command_queue_enqueue_worker worker = &queue->worker;
+    cl_uint i;
+    cl_uint event_num;
+    cl_event *depend_events;
+    CL_OBJECT_LOCK(queue);
+    /* First, wait for the command queue to retire every event that is still executing. */
+    while (1) {
+      if (worker->quit) { // has the queue already been destroyed?
+        CL_OBJECT_UNLOCK(queue);
+        *error = CL_INVALID_COMMAND_QUEUE;
+        return NULL;
+      }
-  if (!event->gpgpu_event) {
-    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
-    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
-    return CL_SUCCESS;
-  }
+      if (worker->in_exec_status != CL_COMPLETE) {
+        CL_OBJECT_WAIT_ON_COND(queue);
+        continue;
+      }
-  if(param_name == CL_PROFILING_COMMAND_SUBMIT ||
-         param_name == CL_PROFILING_COMMAND_QUEUED) {
-    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
-    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
-    return CL_SUCCESS;
-  } else if(param_name == CL_PROFILING_COMMAND_START) {
-    cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 0, &ret_val);
-    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
-    return CL_SUCCESS;
-  } else if (param_name == CL_PROFILING_COMMAND_END) {
-    cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 1, &ret_val);
-    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
-    return CL_SUCCESS;
-  }
+      break;
+    }
-cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event)
-  user_event * u_iter = *p_u_ev;
-  user_event * u_ev;
-  while(u_iter)
-  {
-    if(u_iter->event == event)
-      return CL_SUCCESS;
-    u_iter = u_iter->next;
-  }
+    event_num = 0;
+    depend_events = NULL;
+    if (!list_empty(&worker->enqueued_events)) {
+      depend_events = cl_command_queue_record_in_queue_events(queue, &event_num);
+    }
-  TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
-  u_ev->event = event;
-  u_ev->next = *p_u_ev;
-  *p_u_ev = u_ev;
+    CL_OBJECT_UNLOCK(queue);
+    e = cl_event_create(queue->ctx, queue, event_num, depend_events, type, &err);
-  return CL_SUCCESS;
-  return CL_FALSE;
+    for (i = 0; i < event_num; i++) { //unref the temp
+      cl_event_delete(depend_events[i]);
+    }
+    if (depend_events)
+      cl_free(depend_events);
-cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event)
-  user_event * u_iter = *p_u_ev;
-  user_event * u_prev = *p_u_ev;
-  while(u_iter){
-    if(u_iter->event == event ){
-      if(u_iter == *p_u_ev){
-        *p_u_ev = u_iter->next;
-      }else{
-        u_prev->next = u_iter->next;
-      }
-      cl_free(u_iter);
-      break;
+    if (err != CL_SUCCESS) {
+      *error = err;
+      return NULL;
-    u_prev = u_iter;
-    u_iter = u_iter->next;
-  return CL_SUCCESS;
+  e->exec_data.type = eq_type;
+  *error = CL_SUCCESS;
+  return e;
diff --git a/src/cl_event.h b/src/cl_event.h
index 67fab19..f28844a 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -14,111 +14,75 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
-#ifndef __CL_EVENT_H__
-#define __CL_EVENT_H__
+#ifndef __CL_EVENT_H_
+#define __CL_EVENT_H_
 #include <semaphore.h>
-#include "cl_internals.h"
-#include "cl_driver.h"
+#include "cl_base_object.h"
 #include "cl_enqueue.h"
 #include "CL/cl.h"
+typedef void(CL_CALLBACK *cl_event_notify_cb)(cl_event event, cl_int event_command_exec_status, void *user_data);
-typedef struct _user_event {
-  cl_event            event;   /* The user event */
-  struct _user_event* next;    /* Next user event in list */
-} user_event;
+typedef struct _cl_event_user_callback {
+  cl_int status;                 /* The execution status */
+  cl_bool executed;              /* Indicates whether the callback function has been called */
+  cl_event_notify_cb pfn_notify; /* Callback function */
+  void *user_data;               /* Callback user data */
+  list_node node;                /* Event callback list node */
+} _cl_event_user_callback;
-typedef struct _enqueue_callback {
-  cl_event           event;            /* The event relative this enqueue callback */
-  enqueue_data       data;             /* Hold all enqueue callback's infomation */
-  cl_uint            num_events;       /* num events in wait list */
-  cl_event*          wait_list;        /* All event wait list this callback wait on */
-  user_event*        wait_user_events; /* The head of user event list the callback wait on */
-  struct _enqueue_callback*  next;     /* The  next enqueue callback in wait list */
-} enqueue_callback;
+typedef _cl_event_user_callback *cl_event_user_callback;
-typedef void (CL_CALLBACK *EVENT_NOTIFY)(cl_event event, cl_int event_command_exec_status, void *user_data);
+typedef struct _cl_event {
+  _cl_base_object base;
+  cl_context ctx;             /* The context associated with event */
+  cl_command_queue queue;     /* The command queue associated with event */
+  cl_command_type event_type; /* Event type. */
+  cl_bool is_barrier;         /* Is this event a barrier */
+  cl_int status;              /* The execution status */
+  cl_event *depend_events;    /* The events must complete before this. */
+  cl_uint depend_event_num;   /* The depend events number. */
+  list_head callbacks;        /* The event callback functions */
+  list_node enqueue_node;     /* The node in the enqueue list. */
+  cl_ulong timestamp[5];      /* The time stamps for profiling. */
+  enqueue_data exec_data; /* Context for execute this event. */
+} _cl_event;
-typedef struct _user_callback {
-  cl_int            status;     /* The execution status */
-  cl_bool           executed;   /* Indicat the callback function been called or not */
-  EVENT_NOTIFY      pfn_notify; /* Callback function */
-  void*             user_data;  /* Callback user data */
-  struct _user_callback*    next;       /* Next event callback in list */
-} user_callback;
+#define CL_OBJECT_EVENT_MAGIC 0x8324a9f810ebf90fLL
+#define CL_OBJECT_IS_EVENT(obj) ((obj &&                           \
+         ((cl_base_object)obj)->magic == CL_OBJECT_EVENT_MAGIC &&  \
+         CL_OBJECT_GET_REF(obj) >= 1))
-struct _cl_event {
-  DEFINE_ICD(dispatch)
-  uint64_t           magic;       /* To identify it as a sampler object */
-  volatile int       ref_n;       /* We reference count this object */
-  cl_context         ctx;         /* The context associated with event */
-  cl_event           prev, next;  /* We chain the memory buffers together */
-  cl_command_queue   queue;       /* The command queue associated with event */
-  cl_command_type    type;        /* The command type associated with event */
-  cl_int             status;      /* The execution status */
-  cl_gpgpu           gpgpu;       /* Current gpgpu, owned by this structure. */
-  cl_gpgpu_event     gpgpu_event; /* The event object communicate with hardware */
-  user_callback*     user_cb;     /* The event callback functions */
-  enqueue_callback*  enqueue_cb;  /* This event's enqueue */
-  enqueue_callback*  waits_head;  /* The head of enqueues list wait on this event */
-  cl_bool            emplict;     /* Identify this event whether created by api emplict*/
-  cl_ulong           timestamp[4];/* The time stamps for profiling. */
-  cl_ulong	     queued_timestamp;
-  cl_event   last_next, last_prev;/* We need a list to monitor untouchable api event*/
+#define CL_EVENT_IS_MARKER(E) (E->event_type == CL_COMMAND_MARKER)
+#define CL_EVENT_IS_BARRIER(E) (E->event_type == CL_COMMAND_BARRIER)
+#define CL_EVENT_IS_USER(E) (E->event_type == CL_COMMAND_USER)
 /* Create a new event object */
-cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool);
-/* Unref the object and delete it if no more reference on it */
-void cl_event_delete(cl_event);
-/* Add one more reference to this object */
-void cl_event_add_ref(cl_event);
-/* Register a user callback function for specific commond execution status */
-cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
-/* Execute the event's callback if the event's status supersedes the callback's status. Free the callback if specified */
-void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb);
-/* Check events wait list for enqueue commonds */
-cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
-/* Wait the all events in wait list complete */
-cl_int cl_event_wait_events(cl_uint, const cl_event *, cl_command_queue);
-/* New a enqueue suspend task */
-void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *);
-/* Set the event status and call all callbacks */
-void cl_event_set_status(cl_event, cl_int);
-/* Check and update event status */
-void cl_event_update_status(cl_event, cl_int);
-/* Create the marker event */
-cl_int cl_event_marker_with_wait_list(cl_command_queue, cl_uint, const cl_event *,  cl_event*);
-/* Create the barrier event */
-cl_int cl_event_barrier_with_wait_list(cl_command_queue, cl_uint, const cl_event *,  cl_event*);
-/* Get the cpu time */
-cl_ulong cl_event_get_cpu_timestamp(cl_ulong *cpu_time);
-/*Get the cpu time for queued*/
-cl_int cl_event_get_queued_cpu_timestamp(cl_event event);
-/*get timestamp delate between end and start*/
-cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp,cl_ulong end_timestamp);
-/*Get start time stamp*/
-cl_ulong cl_event_get_start_timestamp(cl_event event);
-/*Get end time stamp*/
-cl_ulong cl_event_get_end_timestamp(cl_event event);
-/* Do the event profiling */
-cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
-/* insert the user event */
-cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event);
-/* remove the user event */
-cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event);
-/* flush the event's pending gpgpu batch buffer and notify driver this gpgpu event has been flushed. */
-cl_int cl_event_flush(cl_event event);
-/* monitor or block wait all events in the last_event list */
-void cl_event_update_last_events(cl_command_queue queuet, int wait);
-/* insert the event into the last_event list in queue */
-void cl_event_insert_last_events(cl_command_queue queue, cl_event event);
+extern cl_event cl_event_create(cl_context ctx, cl_command_queue queue, cl_uint num_events,
+                                const cl_event *event_list, cl_command_type type, cl_int *errcode_ret);
+extern cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+                                      cl_event* event, cl_context ctx);
+extern cl_uint cl_event_exec(cl_event event, cl_int exec_to_status, cl_bool ignore_depends);
+/* 0 means ready, >0 means not ready, <0 means error. */
+extern cl_int cl_event_is_ready(cl_event event);
+extern cl_int cl_event_get_status(cl_event event);
+extern void cl_event_add_ref(cl_event event);
+extern void cl_event_delete(cl_event event);
+extern cl_int cl_event_set_status(cl_event event, cl_int status);
+extern cl_int cl_event_set_callback(cl_event event, cl_int exec_type,
+                                    cl_event_notify_cb pfn_notify, void *user_data);
+extern cl_int cl_event_wait_for_events_list(cl_uint num_events, const cl_event *event_list);
+extern cl_int cl_event_wait_for_event_ready(cl_event event);
+extern cl_event cl_event_create_marker_or_barrier(cl_command_queue queue, cl_uint num_events_in_wait_list,
+                                                  const cl_event *event_wait_list, cl_bool is_barrier,
+                                                  cl_int* error);
+extern void cl_event_update_timestamp(cl_event event, cl_int status);
 #endif /* __CL_EVENT_H__ */
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 40b6ddc..a3c71ca 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -1,5 +1,5 @@
 #include "llvm/Config/llvm-config.h"
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL
 #include "EGL/egl.h"
 #include "EGL/eglext.h"
@@ -55,7 +55,7 @@ void check_opt1_extension(cl_extensions_t *extensions)
 check_gl_extension(cl_extensions_t *extensions) {
-#if defined(HAS_EGL)
+#if defined(HAS_GL_EGL)
   int id;
       /* For now, we only support cl_khr_gl_sharing. */
   for(id = GL_EXT_START_ID; id <= GL_EXT_END_ID; id++)
@@ -152,6 +152,7 @@ cl_intel_platform_extension_init(cl_platform_id intel_platform)
   static int ext_initialized = 0;
   /* The EXT should be only inited once. */
+  (void) ext_initialized;
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 1139775..52a4953 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -9,6 +9,9 @@
   DECL_EXT(khr_local_int32_base_atomics) \
   DECL_EXT(khr_local_int32_extended_atomics) \
   DECL_EXT(khr_byte_addressable_store) \
+  DECL_EXT(khr_3d_image_writes)\
+  DECL_EXT(khr_image2d_from_buffer)\
+  DECL_EXT(khr_depth_images)\
 /* The OPT1 extensions are those optional extensions
@@ -16,19 +19,17 @@
-  DECL_EXT(khr_3d_image_writes)\
-  DECL_EXT(khr_image2d_from_buffer)\
-  DECL_EXT(khr_depth_images)\
   DECL_EXT(khr_spir) \
   DECL_EXT(intel_accelerator) \
   DECL_EXT(intel_motion_estimation) \
-  DECL_EXT(intel_subgroups)
+  DECL_EXT(intel_subgroups) \
+  DECL_EXT(intel_subgroups_short)
@@ -63,7 +64,7 @@ cl_khr_extension_id_max
 #define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
 #define OPT1_EXT_END_ID EXT_ID(khr_icd)
 #define INTEL_EXT_START_ID EXT_ID(intel_accelerator)
-#define INTEL_EXT_END_ID EXT_ID(intel_subgroups)
+#define INTEL_EXT_END_ID EXT_ID(intel_subgroups_short)
 #define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
 #define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index aa13a3d..f190b0d 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -38,10 +38,13 @@ gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource = NULL;
 gbe_program_new_from_binary_cb *interp_program_new_from_binary = NULL;
 gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size = NULL;
 gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data = NULL;
+gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count = NULL;
+gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table = NULL;
 gbe_program_delete_cb *interp_program_delete = NULL;
 gbe_program_get_kernel_num_cb *interp_program_get_kernel_num = NULL;
 gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name = NULL;
 gbe_program_get_kernel_cb *interp_program_get_kernel = NULL;
+gbe_program_get_device_enqueue_kernel_name_cb *interp_program_get_device_enqueue_kernel_name = NULL;
 gbe_kernel_get_name_cb *interp_kernel_get_name = NULL;
 gbe_kernel_get_attributes_cb *interp_kernel_get_attributes = NULL;
 gbe_kernel_get_code_cb *interp_kernel_get_code = NULL;
@@ -64,6 +67,7 @@ gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data = NULL;
 gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size = NULL;
 gbe_kernel_get_image_size_cb *interp_kernel_get_image_size = NULL;
 gbe_kernel_get_image_data_cb *interp_kernel_get_image_data = NULL;
+gbe_kernel_get_ocl_version_cb *interp_kernel_get_ocl_version = NULL;
 gbe_output_profiling_cb* interp_output_profiling = NULL;
 gbe_get_profiling_bti_cb* interp_get_profiling_bti = NULL;
 gbe_dup_profiling_cb* interp_dup_profiling = NULL;
@@ -73,6 +77,7 @@ gbe_dup_printfset_cb* interp_dup_printfset = NULL;
 gbe_release_printf_info_cb* interp_release_printf_info = NULL;
 gbe_output_printf_cb* interp_output_printf = NULL;
 gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info = NULL;
+gbe_kernel_use_device_enqueue_cb *interp_kernel_use_device_enqueue = NULL;
 struct GbeLoaderInitializer
@@ -110,6 +115,14 @@ struct GbeLoaderInitializer
     if (interp_program_get_global_constant_data == NULL)
       return false;
+    interp_program_get_global_reloc_count = *(gbe_program_get_global_reloc_count_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_count");
+    if (interp_program_get_global_reloc_count == NULL)
+      return false;
+    interp_program_get_global_reloc_table = *(gbe_program_get_global_reloc_table_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_table");
+    if (interp_program_get_global_reloc_table == NULL)
+      return false;
     interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp, "gbe_program_delete");
     if (interp_program_delete == NULL)
       return false;
@@ -126,6 +139,10 @@ struct GbeLoaderInitializer
     if (interp_program_get_kernel == NULL)
       return false;
+    interp_program_get_device_enqueue_kernel_name = *(gbe_program_get_device_enqueue_kernel_name_cb**)dlsym(dlhInterp, "gbe_program_get_device_enqueue_kernel_name");
+    if (interp_program_get_device_enqueue_kernel_name == NULL)
+      return false;
     interp_kernel_get_name = *(gbe_kernel_get_name_cb**)dlsym(dlhInterp, "gbe_kernel_get_name");
     if (interp_kernel_get_name == NULL)
       return false;
@@ -214,6 +231,10 @@ struct GbeLoaderInitializer
     if (interp_kernel_get_image_data == NULL)
       return false;
+    interp_kernel_get_ocl_version = *(gbe_kernel_get_ocl_version_cb**)dlsym(dlhInterp, "gbe_kernel_get_ocl_version");
+    if (interp_kernel_get_ocl_version == NULL)
+      return false;
     interp_output_profiling = *(gbe_output_profiling_cb**)dlsym(dlhInterp, "gbe_output_profiling");
     if (interp_output_profiling == NULL)
       return false;
@@ -250,6 +271,10 @@ struct GbeLoaderInitializer
     if (interp_kernel_get_arg_info == NULL)
       return false;
+    interp_kernel_use_device_enqueue = *(gbe_kernel_use_device_enqueue_cb**)dlsym(dlhInterp, "gbe_kernel_use_device_enqueue");
+    if (interp_kernel_use_device_enqueue == NULL)
+      return false;
     return true;
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index df808a5..df885d2 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -38,10 +38,13 @@ extern gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource;
 extern gbe_program_new_from_binary_cb *interp_program_new_from_binary;
 extern gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size;
 extern gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data;
+extern gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count;
+extern gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table;
 extern gbe_program_delete_cb *interp_program_delete;
 extern gbe_program_get_kernel_num_cb *interp_program_get_kernel_num;
 extern gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name;
 extern gbe_program_get_kernel_cb *interp_program_get_kernel;
+extern gbe_program_get_device_enqueue_kernel_name_cb *interp_program_get_device_enqueue_kernel_name;
 extern gbe_kernel_get_name_cb *interp_kernel_get_name;
 extern gbe_kernel_get_attributes_cb *interp_kernel_get_attributes;
 extern gbe_kernel_get_code_cb *interp_kernel_get_code;
@@ -64,6 +67,7 @@ extern gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data;
 extern gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size;
 extern gbe_kernel_get_image_size_cb *interp_kernel_get_image_size;
 extern gbe_kernel_get_image_data_cb *interp_kernel_get_image_data;
+extern gbe_kernel_get_ocl_version_cb *interp_kernel_get_ocl_version;
 extern gbe_output_profiling_cb* interp_output_profiling;
 extern gbe_get_profiling_bti_cb* interp_get_profiling_bti;
 extern gbe_dup_profiling_cb* interp_dup_profiling;
@@ -73,6 +77,7 @@ extern gbe_dup_printfset_cb* interp_dup_printfset;
 extern gbe_release_printf_info_cb* interp_release_printf_info;
 extern gbe_output_printf_cb* interp_output_printf;
 extern gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info;
+extern gbe_kernel_use_device_enqueue_cb * interp_kernel_use_device_enqueue;
 int CompilerSupported();
 #ifdef __cplusplus
diff --git a/src/cl_gl_api.c b/src/cl_gl_api.c
index 519aab6..897edb4 100644
--- a/src/cl_gl_api.c
+++ b/src/cl_gl_api.c
@@ -20,7 +20,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL
 #include <GL/gl.h>
@@ -95,16 +95,7 @@ clCreateFromGLTexture3D(cl_context    context,
                         GLuint texture,
                         cl_int *      errcode_ret)
-  cl_mem mem = NULL;
-  cl_int err = CL_SUCCESS;
-  CHECK_CONTEXT (context);
-  CHECK_GL_CONTEXT (context);
-  mem = cl_mem_new_gl_texture(context, flags, texture_target, miplevel, texture, &err);
-  if (errcode_ret)
-    *errcode_ret = err;
-  return mem;
@@ -120,6 +111,12 @@ clCreateFromGLTexture(cl_context      context,
   CHECK_CONTEXT (context);
   CHECK_GL_CONTEXT (context);
+  //We just support GL_TEXTURE_2D now.
+  if(target != GL_TEXTURE_2D){
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
   mem = cl_mem_new_gl_texture(context, flags, target, miplevel, texture, &err);
   if (errcode_ret)
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 70a0a54..cf5ad7a 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -39,10 +39,19 @@
 .native_vector_width_float = 4,
 .native_vector_width_double = 2,
 .native_vector_width_half = 8,
+#ifdef ENABLE_OPENCL_20
+.address_bits = 64,
 .address_bits = 32,
+.preferred_platform_atomic_alignment = 0,
+.preferred_global_atomic_alignment = 0,
+.preferred_local_atomic_alignment = 0,
 .image_support = CL_TRUE,
 .max_read_image_args = BTI_MAX_READ_IMAGE_ARGS,
 .max_write_image_args = BTI_MAX_WRITE_IMAGE_ARGS,
+.max_read_write_image_args = BTI_MAX_WRITE_IMAGE_ARGS,
 .image_max_array_size = 2048,
 .image2d_max_width = 8192,
 .image2d_max_height = 8192,
@@ -53,10 +62,15 @@
 .max_samplers = 16,
 .mem_base_addr_align = sizeof(cl_long) * 16 * 8,
 .min_data_type_align_size = sizeof(cl_long) * 16,
+.max_pipe_args = 16,
+.pipe_max_active_reservations = 1,
+.pipe_max_packet_siz = 1024,
 .double_fp_config = 0,
 .global_mem_cache_type = CL_READ_WRITE_CACHE,
 .max_constant_buffer_size = 128 * 1024 * 1024,
 .max_constant_args = 8,
+.max_global_variable_size = 64 * 1024,
+.global_variable_preferred_total_size = 64 * 1024,
 .error_correction_support = CL_FALSE,
 .host_unified_memory = CL_TRUE,
@@ -70,6 +84,12 @@
 .linker_available = CL_TRUE,
 .execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL,
 .queue_properties = CL_QUEUE_PROFILING_ENABLE,
+.queue_on_host_properties = CL_QUEUE_PROFILING_ENABLE,
+.queue_on_device_preferred_size = 16 * 1024,
+.queue_on_device_max_size = 256 * 1024,
+.max_on_device_queues = 1,
+.max_on_device_events = 1024,
 .platform = NULL, /* == intel_platform (set when requested) */
 .single_fp_config = CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
@@ -127,7 +147,6 @@ DECL_INFO_STRING(spir_versions, "1.2")
 .partition_property = {0},
 .affinity_domain = 0,
 .partition_type = {0},
-.device_reference_count = 1,
 .image_pitch_alignment = 1,
 .image_base_address_alignment = 4096,
 .cmrt_device = NULL
diff --git a/src/cl_image.c b/src/cl_image.c
index d58bdf3..d059304 100644
--- a/src/cl_image.c
+++ b/src/cl_image.c
@@ -91,6 +91,13 @@ cl_image_byte_per_pixel(const cl_image_format *fmt, uint32_t *bpp)
       *bpp *= 4;
+    case CL_sRGBA:
+    case CL_sBGRA:
+      if (type != CL_UNORM_INT8)
+      *bpp *= 4;
+    break;
@@ -189,13 +196,23 @@ cl_image_get_intel_format(const cl_image_format *fmt)
         case CL_UNORM_INT8:     return I965_SURFACEFORMAT_B8G8R8A8_UNORM;
         default: return INTEL_UNSUPPORTED_FORMAT;
+    case CL_sRGBA:
+      switch (type) {
+        case CL_UNORM_INT8:     return I965_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB;
+        default: return INTEL_UNSUPPORTED_FORMAT;
+      };
+    case CL_sBGRA:
+      switch (type) {
+        case CL_UNORM_INT8:     return I965_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
+        default: return INTEL_UNSUPPORTED_FORMAT;
+      };
     default: return INTEL_UNSUPPORTED_FORMAT;
 static const uint32_t cl_image_order[] = {
 static const uint32_t cl_image_type[] = {
@@ -211,6 +228,7 @@ static const size_t cl_image_type_n = SIZEOF32(cl_image_type);
 cl_image_get_supported_fmt(cl_context ctx,
+                           cl_mem_flags flags,
                            cl_mem_object_type image_type,
                            cl_uint num_entries,
                            cl_image_format *image_formats,
@@ -224,6 +242,10 @@ cl_image_get_supported_fmt(cl_context ctx,
       .image_channel_data_type = cl_image_type[j]
     const uint32_t intel_fmt = cl_image_get_intel_format(&fmt);
+    if (cl_image_order[i] >= CL_sRGBA &&
+        ((flags & CL_MEM_WRITE_ONLY) || (flags & CL_MEM_READ_WRITE) ||
+         (flags & CL_MEM_KERNEL_READ_AND_WRITE)))
+      continue;
     if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
     if (n < num_entries && image_formats) image_formats[n] = fmt;
diff --git a/src/cl_image.h b/src/cl_image.h
index ae74509..4f0d0f1 100644
--- a/src/cl_image.h
+++ b/src/cl_image.h
@@ -35,6 +35,7 @@ extern uint32_t cl_image_get_intel_format(const cl_image_format *fmt);
 /* Return the list of formats supported by the API */
 extern cl_int cl_image_get_supported_fmt(cl_context context,
+                                         cl_mem_flags flags,
                                          cl_mem_object_type image_type,
                                          cl_uint num_entries,
                                          cl_image_format *image_formats,
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index b380abe..f687084 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -45,14 +45,16 @@ cl_kernel_delete(cl_kernel k)
 #ifdef HAS_CMRT
   if (k->cmrt_kernel != NULL) {
-    k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   /* We are not done with the kernel */
-  if (atomic_dec(&k->ref_n) > 1) return;
+  if (CL_OBJECT_DEC_REF(k) > 1)
+    return;
   /* Release one reference on all bos we own */
   if (k->bo)       cl_buffer_unreference(k->bo);
   /* This will be true for kernels created by clCreateKernel */
@@ -68,7 +70,17 @@ cl_kernel_delete(cl_kernel k)
   if (k->image_sz)
-  k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+  if (k->exec_info)
+    cl_free(k->exec_info);
+  if (k->device_enqueue_ptr)
+    cl_mem_svm_delete(k->program->ctx, k->device_enqueue_ptr);
+  if (k->device_enqueue_infos)
+    cl_free(k->device_enqueue_infos);
@@ -77,9 +89,7 @@ cl_kernel_new(cl_program p)
   cl_kernel k = NULL;
   TRY_ALLOC_NO_ERR (k, CALLOC(struct _cl_kernel));
-  SET_ICD(k->dispatch)
-  k->ref_n = 1;
   k->program = p;
   k->cmrt_kernel = NULL;
@@ -108,7 +118,7 @@ cl_kernel_get_attributes(cl_kernel k)
 LOCAL void
 cl_kernel_add_ref(cl_kernel k)
-  atomic_inc(&k->ref_n);
 LOCAL cl_int
@@ -156,16 +166,23 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
       return CL_INVALID_ARG_VALUE;
     cl_sampler s = *(cl_sampler*)value;
-    if(s->magic != CL_MAGIC_SAMPLER_HEADER)
       return CL_INVALID_SAMPLER;
   } else {
     // should be image, GLOBAL_PTR, CONSTANT_PTR
-    if (UNLIKELY(value == NULL && arg_type == GBE_ARG_IMAGE))
+    if (UNLIKELY(value == NULL && (arg_type == GBE_ARG_IMAGE ||
+            arg_type == GBE_ARG_PIPE)))
       return CL_INVALID_ARG_VALUE;
     if(value != NULL)
       mem = *(cl_mem*)value;
+    if(arg_type == GBE_ARG_PIPE) {
+      _cl_mem_pipe* pipe= cl_mem_pipe(mem);
+      size_t type_size = (size_t)interp_kernel_get_arg_info(k->opaque, index,5);
+      if(pipe->packet_size != type_size)
+          return CL_INVALID_ARG_VALUE;
+    }
     if(value != NULL && mem) {
-      if( CL_SUCCESS != is_valid_mem(mem, ctx->buffers))
+      if(CL_SUCCESS != cl_mem_is_valid(mem, ctx))
         return CL_INVALID_MEM_OBJECT;
       if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !IS_IMAGE(mem))
@@ -252,11 +269,62 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   k->args[index].mem = mem;
   k->args[index].is_set = 1;
+  k->args[index].is_svm = mem->is_svm;
+  if(mem->is_svm)
+    k->args[index].ptr = mem->host_ptr;
   k->args[index].local_sz = 0;
   k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
   return CL_SUCCESS;
+LOCAL cl_int
+cl_kernel_set_arg_svm_pointer(cl_kernel k, cl_uint index, const void *value)
+  enum gbe_arg_type arg_type; /* kind of argument */
+  //size_t arg_sz;              /* size of the argument */
+  cl_context ctx = k->program->ctx;
+  cl_mem mem= cl_context_get_svm_from_ptr(ctx, value);
+  if (UNLIKELY(index >= k->arg_n))
+  arg_type = interp_kernel_get_arg_type(k->opaque, index);
+  //arg_sz = interp_kernel_get_arg_size(k->opaque, index);
+  if(arg_type != GBE_ARG_GLOBAL_PTR && arg_type != GBE_ARG_CONSTANT_PTR )
+  if(mem == NULL)
+  cl_mem_add_ref(mem);
+  if (k->args[index].mem)
+    cl_mem_delete(k->args[index].mem);
+  k->args[index].ptr = (void *)value;
+  k->args[index].mem = mem;
+  k->args[index].is_set = 1;
+  k->args[index].is_svm = 1;
+  k->args[index].local_sz = 0;
+  k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
+  return 0;
+LOCAL cl_int
+cl_kernel_set_exec_info(cl_kernel k, size_t n, const void *value)
+  cl_int err = CL_SUCCESS;
+  assert(k != NULL);
+  if (n == 0) return err;
+  TRY_ALLOC(k->exec_info, cl_calloc(n, 1));
+  memcpy(k->exec_info, value, n);
+  k->exec_info_n = n / sizeof(void *);
+  return err;
 LOCAL int
 cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
                        size_t param_value_size, void *param_value, size_t *param_value_size_ret)
@@ -275,13 +343,13 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
     if (!param_value) return CL_SUCCESS;
     if (param_value_size < sizeof(cl_kernel_arg_address_qualifier))
       return CL_INVALID_VALUE;
-    if ((cl_ulong)ret_info == 0) {
+    if ((size_t)ret_info == 0) {
       *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_PRIVATE;
-    } else if ((cl_ulong)ret_info == 1 || (cl_ulong)ret_info == 4) {
+    } else if ((size_t)ret_info == 1 || (size_t)ret_info == 4) {
       *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_GLOBAL;
-    } else if ((cl_ulong)ret_info == 2) {
+    } else if ((size_t)ret_info == 2) {
       *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_CONSTANT;
-    } else if ((cl_ulong)ret_info == 3) {
+    } else if ((size_t)ret_info == 3) {
       *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_LOCAL;
     } else {
       /* If no address qualifier is specified, the default address qualifier
@@ -334,6 +402,8 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
       type_qual = type_qual | CL_KERNEL_ARG_TYPE_VOLATILE;
     if (strstr((char*)ret_info, "restrict"))
       type_qual = type_qual | CL_KERNEL_ARG_TYPE_RESTRICT;
+    if (strstr((char*)ret_info, "pipe"))
+      type_qual = CL_KERNEL_ARG_TYPE_PIPE;
     *(cl_kernel_arg_type_qualifier *)param_value = type_qual;
     return CL_SUCCESS;
@@ -371,7 +441,8 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   k->opaque = opaque;
   const char* kname = cl_kernel_get_name(k);
-  if (strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
+  if (kname != NULL &&
+      strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
     k->vme = 1;
     k->vme = 0;
@@ -389,7 +460,7 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   /* Get image data & size */
   k->image_sz = interp_kernel_get_image_size(k->opaque);
   assert(k->sampler_sz <= GEN_MAX_SURFACES);
-  assert(k->image_sz <= ctx->device->max_read_image_args + ctx->device->max_write_image_args);
+  assert(k->image_sz <= ctx->devices[0]->max_read_image_args + ctx->devices[0]->max_write_image_args);
   if (k->image_sz > 0) {
     TRY_ALLOC_NO_ERR(k->images, cl_calloc(k->image_sz, sizeof(k->images[0])));
     interp_kernel_get_image_data(k->opaque, k->images);
@@ -409,17 +480,16 @@ cl_kernel_dup(cl_kernel from)
   if (UNLIKELY(from == NULL))
     return NULL;
   TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel));
-  SET_ICD(to->dispatch)
   to->bo = from->bo;
   to->opaque = from->opaque;
   to->vme = from->vme;
-  to->ref_n = 1;
-  to->magic = CL_MAGIC_KERNEL_HEADER;
   to->program = from->program;
   to->arg_n = from->arg_n;
   to->curbe_sz = from->curbe_sz;
   to->sampler_sz = from->sampler_sz;
   to->image_sz = from->image_sz;
+  to->exec_info_n = from->exec_info_n;
   memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
   to->stack_size = from->stack_size;
   if (to->sampler_sz)
@@ -429,6 +499,10 @@ cl_kernel_dup(cl_kernel from)
     memcpy(to->images, from->images, to->image_sz * sizeof(to->images[0]));
   } else
     to->images = NULL;
+  if (to->exec_info_n) { /* Must always 0 here */
+    TRY_ALLOC_NO_ERR(to->exec_info, cl_calloc(to->exec_info_n, sizeof(void *)));
+    memcpy(to->exec_info, from->exec_info, to->exec_info_n * sizeof(void *));
+  }
   TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
   if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 05a882e..8acd82a 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -21,6 +21,7 @@
 #define __CL_KERNEL_H__
 #include "cl_internals.h"
+#include "cl_base_object.h"
 #include "cl_driver.h"
 #include "cl_gbe_loader.h"
 #include "CL/cl.h"
@@ -40,15 +41,15 @@ typedef struct cl_argument {
   cl_sampler sampler;   /* For sampler. */
   cl_accelerator_intel accel;
   unsigned char bti;
-  uint32_t local_sz:31; /* For __local size specification */
+  void *ptr;            /* SVM ptr value. */
+  uint32_t local_sz:30; /* For __local size specification */
   uint32_t is_set:1;    /* All args must be set before NDRange */
+  uint32_t is_svm:1;    /* Indicate this argument is SVMPointer */
 } cl_argument;
 /* One OCL function */
 struct _cl_kernel {
-  DEFINE_ICD(dispatch)
-  uint64_t magic;             /* To identify it as a kernel */
-  volatile int ref_n;         /* We reference count this object */
+  _cl_base_object base;
   cl_buffer bo;               /* The code itself */
   cl_program program;         /* Owns this structure (and pointers) */
   gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
@@ -71,8 +72,19 @@ struct _cl_kernel {
   uint32_t vme:1;             /* True only if it is a built-in kernel for VME */
   void* cmrt_kernel;          /* CmKernel* */
+  uint32_t exec_info_n;       /* The kernel's exec info count */
+  void** exec_info;             /* The kernel's exec info */
+  cl_bool useDeviceEnqueue;     /* kernel use device enqueue */
+  void* device_enqueue_ptr;     /* device_enqueue buffer*/
+  uint32_t device_enqueue_info_n; /* count of parent kernel's arguments buffers, as child enqueues' exec info */
+  void** device_enqueue_infos;   /* parent kernel's arguments buffers, as child enqueues' exec info   */
+#define CL_OBJECT_KERNEL_MAGIC 0x1234567890abedefLL
+#define CL_OBJECT_IS_KERNEL(obj) ((obj &&                           \
+         ((cl_base_object)obj)->magic == CL_OBJECT_KERNEL_MAGIC &&  \
+         CL_OBJECT_GET_REF(obj) >= 1))
 /* Allocate an empty kernel */
 extern cl_kernel cl_kernel_new(cl_program);
@@ -104,6 +116,12 @@ extern int cl_kernel_set_arg(cl_kernel,
                              uint32_t    arg_index,
                              size_t      arg_size,
                              const void *arg_value);
+extern int cl_kernel_set_arg_svm_pointer(cl_kernel,
+                                            uint32_t arg_index,
+                                            const void *arg_value);
+extern cl_int cl_kernel_set_exec_info(cl_kernel k,
+                                      size_t n,
+                                      const void *value);
 /* Get the argument information */
 extern int cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index,
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
index 84b4beb..7b3600c 100644
--- a/src/cl_khr_icd.c
+++ b/src/cl_khr_icd.c
@@ -169,23 +169,23 @@ struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
   (void *) NULL,
   (void *) NULL,
   (void *) NULL,
   (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) NULL,
-  (void *) clGetKernelSubGroupInfoKHR,
+#ifdef CL_VERSION_2_0
+  clCreateCommandQueueWithProperties,
+  clCreatePipe,
+  clGetPipeInfo,
+  clSVMAlloc,
+  clSVMFree,
+  clEnqueueSVMFree,
+  clEnqueueSVMMemcpy,
+  clEnqueueSVMMemFill,
+  clEnqueueSVMMap,
+  clEnqueueSVMUnmap,
+  clCreateSamplerWithProperties,
+  clSetKernelArgSVMPointer,
+  clSetKernelExecInfo,
+  clGetKernelSubGroupInfoKHR,
diff --git a/src/cl_khr_icd.h b/src/cl_khr_icd.h
index 3985d80..58cee68 100644
--- a/src/cl_khr_icd.h
+++ b/src/cl_khr_icd.h
@@ -21,13 +21,11 @@
 #define SET_ICD(dispatch) \
   dispatch = &cl_khr_icd_dispatch;
-#define INIT_ICD(member)  .member = &cl_khr_icd_dispatch,
 #define DEFINE_ICD(member) struct _cl_icd_dispatch const *member;
 extern struct _cl_icd_dispatch const cl_khr_icd_dispatch;
 #define SET_ICD(dispatch)
-#define INIT_ICD(member)
 #define DEFINE_ICD(member)
diff --git a/src/cl_mem.c b/src/cl_mem.c
index ad1c8c2..0278b7f 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -28,6 +28,7 @@
 #include "cl_kernel.h"
 #include "cl_command_queue.h"
 #include "cl_cmrt.h"
+#include "cl_enqueue.h"
 #include "CL/cl.h"
 #include "CL/cl_intel.h"
@@ -35,6 +36,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
+#include <math.h>
 #define FIELD_SIZE(CASE,TYPE)               \
   case JOIN(CL_,CASE):                      \
@@ -48,7 +50,7 @@
 #define MAX_TILING_SIZE                             128 * MB
-static cl_mem_object_type
+LOCAL cl_mem_object_type
 cl_get_mem_object_type(cl_mem mem)
   switch (mem->type) {
@@ -67,166 +69,39 @@ cl_get_mem_object_type(cl_mem mem)
 LOCAL cl_int
-cl_get_mem_object_info(cl_mem mem,
-                cl_mem_info param_name,
-                size_t param_value_size,
-                void *param_value,
-                size_t *param_value_size_ret)
+cl_get_pipe_info(cl_mem mem,
+                    cl_mem_info param_name,
+                    size_t param_value_size,
+                    void *param_value,
+                    size_t *param_value_size_ret)
+  _cl_mem_pipe *pipe;
-    FIELD_SIZE(MEM_TYPE, cl_mem_object_type);
-    FIELD_SIZE(MEM_FLAGS, cl_mem_flags);
-    FIELD_SIZE(MEM_SIZE, size_t);
-    FIELD_SIZE(MEM_HOST_PTR, void *);
-    FIELD_SIZE(MEM_MAP_COUNT, cl_uint);
-    FIELD_SIZE(MEM_CONTEXT, cl_context);
-    FIELD_SIZE(MEM_OFFSET, size_t);
     return CL_INVALID_VALUE;
-  switch(param_name)
-  {
-  case CL_MEM_TYPE:
-    *((cl_mem_object_type *)param_value) = cl_get_mem_object_type(mem);
-    break;
-  case CL_MEM_FLAGS:
-    *((cl_mem_flags *)param_value) = mem->flags;
-    break;
-  case CL_MEM_SIZE:
-    *((size_t *)param_value) = mem->size;
-    break;
-  case CL_MEM_HOST_PTR:
-    if(mem->type == CL_MEM_IMAGE_TYPE) {
-      *((size_t *)param_value) = (size_t)mem->host_ptr;
-    } else {
-      struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
-      *((size_t *)param_value) = (size_t)mem->host_ptr + buf->sub_offset;
-    }
-    break;
-    *((cl_uint *)param_value) = mem->map_ref;
-    break;
-    *((cl_uint *)param_value) = mem->ref_n;
-    break;
-    *((cl_context *)param_value) = mem->ctx;
-    break;
-    if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
-      *((cl_mem *)param_value) = NULL;
-    } else {
-      struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
-      *((cl_mem *)param_value) = (cl_mem)(buf->parent);
-    }
-    break;
-  case CL_MEM_OFFSET:
-    if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
-      *((size_t *)param_value) = 0;
-    } else {
-      struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
-      *((size_t *)param_value) = buf->sub_offset;
-    }
-    break;
-  }
-  return CL_SUCCESS;
-#define IS_1D(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D ||        \
-                      image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||  \
-                      image->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
-#define IS_2D(image) (image->image_type == CL_MEM_OBJECT_IMAGE2D ||        \
-                      image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
-#define IS_3D(image) (image->image_type == CL_MEM_OBJECT_IMAGE3D)
-#define IS_ARRAY(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
-                         image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+  if(mem->type != CL_MEM_PIPE_TYPE)
-LOCAL cl_int
-cl_get_image_info(cl_mem mem,
-                  cl_image_info param_name,
-                  size_t param_value_size,
-                  void *param_value,
-                  size_t *param_value_size_ret)
-  int err;
-  CHECK_IMAGE(mem, image);
+  pipe = cl_mem_pipe(mem);
-    FIELD_SIZE(IMAGE_FORMAT, cl_image_format);
-  default:
-    return CL_INVALID_VALUE;
-  }
-  switch(param_name)
-  {
-    *(cl_image_format *)param_value = image->fmt;
-    break;
-    *(size_t *)param_value = image->bpp;
-    break;
-    *(size_t *)param_value = image->row_pitch;
-    break;
-    *(size_t *)param_value = image->slice_pitch;
-    break;
-    if (mem->type == CL_MEM_BUFFER1D_IMAGE_TYPE) {
-      struct _cl_mem_buffer1d_image *buffer1d_image = (struct _cl_mem_buffer1d_image*) image;
-      *(size_t *)param_value = buffer1d_image->size;
-    } else
-      *(size_t *)param_value = image->w;
-    break;
-    if (mem->type == CL_MEM_BUFFER1D_IMAGE_TYPE)
-      *(size_t *)param_value = 0;
-    else
-      *(size_t *)param_value = IS_1D(image) ? 0 : image->h;
-    break;
-    *(size_t *)param_value = IS_3D(image) ? image->depth : 0;
+    *((cl_uint *)param_value) = pipe->packet_size;
-    *(size_t *)param_value = IS_ARRAY(image) ? image->depth : 0;
-    break;
-    *(cl_mem *)param_value = image->buffer_1d;
-    break;
-    *(cl_mem *)param_value = 0;
+    *((cl_uint *)param_value) = pipe->max_packets;
   return CL_SUCCESS;
-    return err;
-#undef FIELD_SIZE
 LOCAL cl_mem
 cl_mem_allocate(enum cl_mem_type type,
                 cl_context ctx,
@@ -257,18 +132,23 @@ cl_mem_allocate(enum cl_mem_type type,
     struct _cl_mem_buffer1d_image *buffer1d_image = NULL;
     TRY_ALLOC(buffer1d_image, CALLOC(struct _cl_mem_buffer1d_image));
     mem = &buffer1d_image->base.base;
+  } else if (type == CL_MEM_PIPE_TYPE) {
+    _cl_mem_pipe *pipe = NULL;
+    TRY_ALLOC(pipe, CALLOC(struct _cl_mem_pipe));
+    mem = &pipe->base;
   } else {
     struct _cl_mem_buffer *buffer = NULL;
     TRY_ALLOC (buffer, CALLOC(struct _cl_mem_buffer));
     mem = &buffer->base;
+  list_init(&mem->dstr_cb_head);
   mem->type = type;
-  SET_ICD(mem->dispatch)
-  mem->ref_n = 1;
-  mem->magic = CL_MAGIC_MEM_HEADER;
   mem->flags = flags;
   mem->is_userptr = 0;
   mem->offset = 0;
+  mem->is_svm = 0;
   mem->cmrt_mem = NULL;
   if (mem->type == CL_MEM_IMAGE_TYPE) {
     cl_mem_image(mem)->is_image_from_buffer = 0;
@@ -285,17 +165,26 @@ cl_mem_allocate(enum cl_mem_type type,
     uint8_t bufCreated = 0;
-    if (ctx->device->host_unified_memory) {
+    if (ctx->devices[0]->host_unified_memory) {
       int page_size = getpagesize();
       int cacheline_size = 0;
-      cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
+      cl_get_device_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
       if (type == CL_MEM_BUFFER_TYPE) {
         if (flags & CL_MEM_USE_HOST_PTR) {
           assert(host_ptr != NULL);
+          cl_mem svm_mem = NULL;
+          if((svm_mem = cl_context_get_svm_from_ptr(ctx, host_ptr)) != NULL)
+            mem->is_svm = 1;
           /* userptr not support tiling */
           if (!is_tiled) {
-            if ((ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr) &&
+            if(svm_mem != NULL) {  //SVM always paged alignment
+              mem->offset = 0;
+              mem->is_userptr = 1;
+              mem->bo = svm_mem->bo;
+              cl_mem_add_ref(svm_mem);
+              bufCreated = 1;
+            } else if ((ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr) &&
                 (ALIGN((unsigned long)sz, cacheline_size) == (unsigned long)sz)) {
               void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & (~(page_size - 1)));
               mem->offset = host_ptr - aligned_host_ptr;
@@ -333,7 +222,7 @@ cl_mem_allocate(enum cl_mem_type type,
       // if create image from USE_HOST_PTR buffer, the buffer's base address need be aligned.
       if(buffer->is_userptr) {
         int base_alignement = 0;
-        cl_get_device_info(ctx->device, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, sizeof(base_alignement), &base_alignement, NULL);
+        cl_get_device_info(ctx->devices[0], CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, sizeof(base_alignement), &base_alignement, NULL);
         if(ALIGN((unsigned long)buffer->host_ptr, base_alignement) != (unsigned long)buffer->host_ptr) {
           goto error;
@@ -363,15 +252,8 @@ cl_mem_allocate(enum cl_mem_type type,
     mem->size = sz;
-  cl_context_add_ref(ctx);
-  mem->ctx = ctx;
-    /* Append the buffer in the context buffer list */
-  pthread_mutex_lock(&ctx->buffer_lock);
-  mem->next = ctx->buffers;
-  if (ctx->buffers != NULL)
-    ctx->buffers->prev = mem;
-  ctx->buffers = mem;
-  pthread_mutex_unlock(&ctx->buffer_lock);
+  /* Append the buffer in the context buffer list */
+  cl_context_add_mem(ctx, mem);
   if (errcode)
@@ -385,17 +267,26 @@ error:
 LOCAL cl_int
-is_valid_mem(cl_mem mem, cl_mem buffers)
+cl_mem_is_valid(cl_mem mem, cl_context ctx)
-  cl_mem tmp = buffers;
-  while(tmp){
-    if(mem == tmp){
-      if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
+  struct list_node *pos;
+  cl_base_object pbase_object;
+  list_for_each (pos, (&ctx->mem_objects)) {
+    pbase_object = list_entry(pos, _cl_base_object, node);
+    if (pbase_object == (cl_base_object)mem) {
+      if (UNLIKELY(!CL_OBJECT_IS_MEM(mem))) {
+        CL_OBJECT_UNLOCK(ctx);
         return CL_INVALID_MEM_OBJECT;
+      }
+      CL_OBJECT_UNLOCK(ctx);
       return CL_SUCCESS;
-    tmp = tmp->next;
@@ -448,7 +339,7 @@ cl_mem_new_buffer(cl_context ctx,
     goto error;
-  if ((err = cl_get_device_info(ctx->device,
+  if ((err = cl_get_device_info(ctx->devices[0],
@@ -550,7 +441,7 @@ cl_mem_new_sub_buffer(cl_mem buffer,
     goto error;
-  if (info->origin & (buffer->ctx->device->mem_base_addr_align / 8 - 1)) {
+  if (info->origin & (buffer->ctx->devices[0]->mem_base_addr_align / 8 - 1)) {
     goto error;
@@ -558,10 +449,10 @@ cl_mem_new_sub_buffer(cl_mem buffer,
   /* Now create the sub buffer and link it to the buffer. */
   TRY_ALLOC (sub_buf, CALLOC(struct _cl_mem_buffer));
   mem = &sub_buf->base;
+  list_init(&mem->dstr_cb_head);
   mem->type = CL_MEM_SUBBUFFER_TYPE;
-  SET_ICD(mem->dispatch)
-  mem->ref_n = 1;
-  mem->magic = CL_MAGIC_MEM_HEADER;
   mem->flags = flags;
   mem->offset = buffer->offset;
   mem->is_userptr = buffer->is_userptr;
@@ -579,19 +470,12 @@ cl_mem_new_sub_buffer(cl_mem buffer,
   mem->bo = buffer->bo;
   mem->size = info->size;
   sub_buf->sub_offset = info->origin;
-  if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & CL_MEM_COPY_HOST_PTR) {
+  if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & CL_MEM_COPY_HOST_PTR || buffer->flags & CL_MEM_ALLOC_HOST_PTR) {
     mem->host_ptr = buffer->host_ptr;
-  cl_context_add_ref(buffer->ctx);
-  mem->ctx = buffer->ctx;
   /* Append the buffer in the context buffer list */
-  pthread_mutex_lock(&buffer->ctx->buffer_lock);
-  mem->next = buffer->ctx->buffers;
-  if (buffer->ctx->buffers != NULL)
-    buffer->ctx->buffers->prev = mem;
-  buffer->ctx->buffers = mem;
-  pthread_mutex_unlock(&buffer->ctx->buffer_lock);
+  cl_context_add_mem(buffer->ctx, mem);
   if (errcode_ret)
@@ -603,6 +487,68 @@ error:
   goto exit;
+cl_mem cl_mem_new_pipe(cl_context ctx,
+                             cl_mem_flags flags,
+                             cl_uint packet_size,
+                             cl_uint max_packets,
+                             cl_int *errcode_ret)
+  _cl_mem_pipe* pipe = NULL;
+  cl_uint *ptr = NULL;
+  cl_mem mem = NULL;
+  cl_int err;
+  cl_uint sz;
+  if(UNLIKELY((pipe = CALLOC(_cl_mem_pipe)) == NULL)) {
+    goto error;
+  }
+  sz = packet_size * max_packets;
+  assert(sz != 0);
+  /* HSW: Byte scattered Read/Write has limitation that
+     the buffer size must be a multiple of 4 bytes. */
+  sz = ALIGN(sz, 4);
+  sz += 128;   //The head of pipe is for data struct, and alignment to 128 byte for max data type double16
+  mem = cl_mem_allocate(CL_MEM_PIPE_TYPE, ctx, flags, sz, CL_FALSE,NULL , NULL, &err);
+  if (mem == NULL || err != CL_SUCCESS)
+    goto error;
+  ptr = cl_mem_map_auto(mem, 1);
+  if(ptr == NULL){
+    goto error;
+  }
+  ptr[0] = max_packets;
+  ptr[1] = packet_size;
+  ptr[2] = 0;              //write ptr
+  ptr[3] = 0;              //read ptr
+  ptr[4] = 0;              //reservation read ptr
+  ptr[5] = 0;              //reservation write ptr
+  ptr[6] = 0;              //packet num
+  cl_mem_unmap(mem);
+  pipe = cl_mem_pipe(mem);
+  pipe->flags = flags;
+  pipe->packet_size = packet_size;
+  pipe->max_packets = max_packets;
+  return mem;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
 void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo)
@@ -620,6 +566,81 @@ void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo)
+void* cl_mem_svm_allocate(cl_context ctx, cl_svm_mem_flags flags,
+                                 size_t size, unsigned int alignment)
+  cl_int err = CL_SUCCESS;
+  size_t max_mem_size;
+  if(UNLIKELY(alignment & (alignment - 1)))
+    return NULL;
+  if ((err = cl_get_device_info(ctx->devices[0],
+                                 CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                 sizeof(max_mem_size),
+                                 &max_mem_size,
+                                 NULL)) != CL_SUCCESS) {
+      return NULL;
+  }
+  if(UNLIKELY(size == 0 || size > max_mem_size)) {
+    return NULL;
+  }
+    return NULL;
+  }
+          || ((flags & CL_MEM_WRITE_ONLY) && (flags & CL_MEM_READ_ONLY))
+          || ((flags & CL_MEM_WRITE_ONLY) && (flags & CL_MEM_READ_WRITE))
+          || ((flags & CL_MEM_READ_ONLY) && (flags & CL_MEM_READ_WRITE)))) {
+    return NULL;
+  }
+  void * ptr = NULL;
+  cl_buffer_mgr bufmgr = NULL;
+  cl_mem mem;
+  _cl_mem_svm* svm;
+  if(UNLIKELY((svm = CALLOC(_cl_mem_svm)) == NULL))
+    return NULL;
+  mem = &svm->base;
+  mem->type = CL_MEM_SVM_TYPE;
+  list_init(&mem->dstr_cb_head);
+  mem->flags = flags | CL_MEM_USE_HOST_PTR;
+  mem->is_userptr = 0;
+  mem->is_svm = 0;
+  mem->offset = 0;
+  bufmgr = cl_context_get_bufmgr(ctx);
+  assert(bufmgr);
+  int page_size = getpagesize();
+  const size_t alignedSZ = ALIGN(size, page_size);
+  if(alignment == 0)
+    alignment = page_size;
+  else
+    alignment = ALIGN(alignment, page_size);
+  ptr = cl_aligned_malloc(alignedSZ, alignment);
+  if(ptr == NULL) return NULL;
+  mem->host_ptr = ptr;
+  mem->is_svm = 1;
+  mem->is_userptr = 1;
+  mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL SVM memory object", ptr, alignedSZ, 0);
+  mem->size = size;
+  cl_buffer_set_softpin_offset(mem->bo, (size_t)ptr);
+  cl_buffer_set_bo_use_full_range(mem->bo, 1);
+  /* Append the svm in the context buffer list */
+  cl_context_add_mem(ctx, mem);
+  return ptr;
 cl_mem_copy_image_region(const size_t *origin, const size_t *region,
                          void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
@@ -790,7 +811,7 @@ _cl_mem_new_image(cl_context ctx,
     h = 1;
     depth = 1;
-    if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
+    if (UNLIKELY(w > ctx->devices[0]->image2d_max_width)) DO_IMAGE_ERROR;
     if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
     if (UNLIKELY(data && (slice_pitch % pitch != 0))) DO_IMAGE_ERROR;
     if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
@@ -800,11 +821,11 @@ _cl_mem_new_image(cl_context ctx,
              image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
     if (image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
-      if (UNLIKELY(w > ctx->device->image_mem_size)) DO_IMAGE_ERROR;
+      if (UNLIKELY(w > ctx->devices[0]->image_mem_size)) DO_IMAGE_ERROR;
       /* This is an image1d buffer which exceeds normal image size restrication
          We have to use a 2D image to simulate this 1D image. */
-      h = (w + ctx->device->image2d_max_width - 1) / ctx->device->image2d_max_width;
-      w = w > ctx->device->image2d_max_width ? ctx->device->image2d_max_width : w;
+      h = (w + ctx->devices[0]->image2d_max_width - 1) / ctx->devices[0]->image2d_max_width;
+      w = w > ctx->devices[0]->image2d_max_width ? ctx->devices[0]->image2d_max_width : w;
       tiling = CL_NO_TILE;
     } else if(image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL) {
       tiling = CL_NO_TILE;
@@ -817,8 +838,8 @@ _cl_mem_new_image(cl_context ctx,
     if (data && pitch == 0)
       pitch = min_pitch;
-    if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
-    if (UNLIKELY(h > ctx->device->image2d_max_height)) DO_IMAGE_ERROR;
+    if (UNLIKELY(w > ctx->devices[0]->image2d_max_width)) DO_IMAGE_ERROR;
+    if (UNLIKELY(h > ctx->devices[0]->image2d_max_height)) DO_IMAGE_ERROR;
     if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
     if (UNLIKELY(!data && pitch != 0 && buffer == NULL)) DO_IMAGE_ERROR;
@@ -838,11 +859,11 @@ _cl_mem_new_image(cl_context ctx,
     size_t min_slice_pitch = pitch * h;
     if (data && slice_pitch == 0)
       slice_pitch = min_slice_pitch;
-    if (UNLIKELY(w > ctx->device->image3d_max_width)) DO_IMAGE_ERROR;
-    if (UNLIKELY(h > ctx->device->image3d_max_height)) DO_IMAGE_ERROR;
+    if (UNLIKELY(w > ctx->devices[0]->image3d_max_width)) DO_IMAGE_ERROR;
+    if (UNLIKELY(h > ctx->devices[0]->image3d_max_height)) DO_IMAGE_ERROR;
     if (image_type == CL_MEM_OBJECT_IMAGE3D &&
-       (UNLIKELY(depth > ctx->device->image3d_max_depth))) DO_IMAGE_ERROR
-    else if (UNLIKELY(depth > ctx->device->image_max_array_size)) DO_IMAGE_ERROR;
+       (UNLIKELY(depth > ctx->devices[0]->image3d_max_depth))) DO_IMAGE_ERROR
+    else if (UNLIKELY(depth > ctx->devices[0]->image_max_array_size)) DO_IMAGE_ERROR;
     if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
     if (UNLIKELY(data && min_slice_pitch > slice_pitch)) DO_IMAGE_ERROR;
     if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
@@ -854,9 +875,9 @@ _cl_mem_new_image(cl_context ctx,
   uint8_t enableUserptr = 0;
-  if (enable_true_hostptr && ctx->device->host_unified_memory && data != NULL && (flags & CL_MEM_USE_HOST_PTR)) {
+  if (enable_true_hostptr && ctx->devices[0]->host_unified_memory && data != NULL && (flags & CL_MEM_USE_HOST_PTR)) {
     int cacheline_size = 0;
-    cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
+    cl_get_device_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
     if (ALIGN((unsigned long)data, cacheline_size) == (unsigned long)data &&
         ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)) == h &&
         ALIGN(h * pitch * depth, cacheline_size) == h * pitch * depth && //h and pitch should same as aligned_h and aligned_pitch if enable userptr
@@ -1033,7 +1054,7 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
     goto error;
-  if ((err = cl_get_device_info(ctx->device,
+  if ((err = cl_get_device_info(ctx->devices[0],
@@ -1103,6 +1124,8 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
     memcpy(dst, src, mem_buffer->base.size);
+    struct _cl_mem_buffer1d_image* image_buffer = (struct _cl_mem_buffer1d_image*)image;
+    image_buffer->descbuffer = buffer;
@@ -1172,14 +1195,28 @@ cl_mem_new_image(cl_context context,
 LOCAL void
+cl_mem_svm_delete(cl_context ctx, void *svm_pointer)
+  cl_mem mem;
+  if(UNLIKELY(svm_pointer == NULL))
+    return;
+  mem = cl_context_get_svm_from_ptr(ctx, svm_pointer);
+  if(mem == NULL)
+    return;
+  cl_mem_delete(mem);
+LOCAL void
 cl_mem_delete(cl_mem mem)
   cl_int i;
+  cl_mem_dstr_cb cb = NULL;
   if (UNLIKELY(mem == NULL))
-  if (atomic_dec(&mem->ref_n) > 1)
+  if (CL_OBJECT_DEC_REF(mem) > 1)
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL
   if (UNLIKELY(IS_GL_IMAGE(mem))) {
@@ -1190,6 +1227,14 @@ cl_mem_delete(cl_mem mem)
+  /* First, call all the callbacks registered by user. */
+  while (!list_empty(&mem->dstr_cb_head)) {
+    cb = list_entry(mem->dstr_cb_head.head_node.n, _cl_mem_dstr_cb, node);
+    list_node_del(&cb->node);
+    cb->pfn_notify(mem, cb->user_data);
+    cl_free(cb);
+  }
   /* iff we are a image, delete the 1d buffer if has. */
   if (IS_IMAGE(mem)) {
     if (cl_mem_image(mem)->buffer_1d) {
@@ -1204,21 +1249,6 @@ cl_mem_delete(cl_mem mem)
-  /* Remove it from the list */
-  if (mem->ctx) {
-    pthread_mutex_lock(&mem->ctx->buffer_lock);
-      if (mem->prev)
-        mem->prev->next = mem->next;
-      if (mem->next)
-        mem->next->prev = mem->prev;
-      if (mem->ctx->buffers == mem)
-        mem->ctx->buffers = mem->next;
-    pthread_mutex_unlock(&mem->ctx->buffer_lock);
-    cl_context_delete(mem->ctx);
-  } else {
-    assert((mem->prev == 0) && (mem->next == 0));
-  }
   /* Someone still mapped, unmap */
   if(mem->map_ref > 0) {
@@ -1234,16 +1264,6 @@ cl_mem_delete(cl_mem mem)
   if (mem->mapped_ptr)
-  if (mem->dstr_cb) {
-    cl_mem_dstr_cb *cb = mem->dstr_cb;
-    while (mem->dstr_cb) {
-      cb = mem->dstr_cb;
-      cb->pfn_notify(mem, cb->user_data);
-      mem->dstr_cb = cb->next;
-      free(cb);
-    }
-  }
   /* Iff we are sub, do nothing for bo release. */
   if (mem->type == CL_MEM_SUBBUFFER_TYPE) {
     struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
@@ -1258,15 +1278,24 @@ cl_mem_delete(cl_mem mem)
       buffer->parent->subs = buffer->sub_next;
     cl_mem_delete((cl_mem )(buffer->parent));
+  } else if (mem->is_svm && mem->type != CL_MEM_SVM_TYPE) {
+    cl_mem svm_mem = cl_context_get_svm_from_ptr(mem->ctx, mem->host_ptr);
+    if (svm_mem != NULL)
+      cl_mem_delete(svm_mem);
   } else if (LIKELY(mem->bo != NULL)) {
-  if (mem->is_userptr &&
+  /* Remove it from the list */
+  cl_context_remove_mem(mem->ctx, mem);
+  if ((mem->is_userptr &&
       (mem->flags & CL_MEM_ALLOC_HOST_PTR) &&
-      (mem->type != CL_MEM_SUBBUFFER_TYPE))
+      (mem->type != CL_MEM_SUBBUFFER_TYPE)) ||
+      (mem->is_svm && mem->type == CL_MEM_SVM_TYPE))
@@ -1274,7 +1303,7 @@ LOCAL void
 cl_mem_add_ref(cl_mem mem)
-  atomic_inc(&mem->ref_n);
 #define LOCAL_SZ_0   16
@@ -1282,7 +1311,7 @@ cl_mem_add_ref(cl_mem mem)
 #define LOCAL_SZ_2   4
 LOCAL cl_int
-cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
             size_t src_offset, size_t dst_offset, size_t cb)
   cl_int ret = CL_SUCCESS;
@@ -1335,7 +1364,8 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
     cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
     cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
-    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+                                    global_off, global_sz, global_sz, local_sz, local_sz);
     return ret;
@@ -1376,7 +1406,8 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
     cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
     cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
-    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+                                    global_off, global_sz, global_sz, local_sz, local_sz);
     return ret;
@@ -1406,7 +1437,8 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
     cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
     cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
-    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+                                    global_off, global_sz, global_sz, local_sz, local_sz);
     return ret;
@@ -1438,7 +1470,8 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
     cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
     cl_kernel_set_arg(ker, 9, sizeof(int), &src_less);
-    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+                                    global_off, global_sz, global_sz, local_sz, local_sz);
     return ret;
@@ -1450,7 +1483,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
 LOCAL cl_int
-cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image* src_image,
+cl_image_fill(cl_command_queue queue, cl_event e, const void * pattern, struct _cl_mem_image* src_image,
            const size_t * origin, const size_t * region)
   cl_int ret = CL_SUCCESS;
@@ -1458,6 +1491,8 @@ cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+  uint32_t savedIntelFmt = src_image->intel_fmt;
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
@@ -1503,7 +1538,24 @@ cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image
     return CL_OUT_OF_RESOURCES;
   cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
-  cl_kernel_set_arg(ker, 1, sizeof(float)*4, pattern);
+  if(src_image->fmt.image_channel_order >= CL_sRGBA) {
+#define RGB2sRGB(linear)  ( linear <= 0.0031308f )? ( 12.92f * linear ):( 1.055f * powf( linear, 1.0f/2.4f ) - 0.055f);
+    cl_image_format fmt;
+    float newpattern[4] = {0.0,0.0,0.0,((float*)pattern)[3]};
+    int i;
+    for(i = 0;i < 3; i++){
+      if(src_image->fmt.image_channel_order == CL_sRGBA) {
+        newpattern[i] = RGB2sRGB(((float*)pattern)[i]);
+      } else
+        newpattern[2-i] = RGB2sRGB(((float*)pattern)[i]);
+    }
+    cl_kernel_set_arg(ker, 1, sizeof(float)*4, newpattern);
+    fmt.image_channel_order = CL_RGBA;
+    fmt.image_channel_data_type = CL_UNORM_INT8;
+    src_image->intel_fmt = cl_image_get_intel_format(&fmt);
+#undef RGB2sRGB
+  } else
+    cl_kernel_set_arg(ker, 1, sizeof(float)*4, pattern);
   cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
   cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
   cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
@@ -1511,13 +1563,15 @@ cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image
   cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]);
   cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
-  ret = cl_command_queue_ND_range(queue, ker, 3, global_off, global_sz, local_sz);
+  ret = cl_command_queue_ND_range(queue, ker, e, 3, global_off,
+                                  global_off, global_sz, global_sz, local_sz, local_sz);
+  src_image->intel_fmt = savedIntelFmt;
   return ret;
 LOCAL cl_int
-cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size,
             cl_mem buffer, size_t offset, size_t size)
   cl_int ret = CL_SUCCESS;
@@ -1614,13 +1668,14 @@ cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
   if (is_128)
     cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
-  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  ret = cl_command_queue_ND_range(queue, ker, e, 1, global_off,
+                                  global_off, global_sz, global_sz, local_sz, local_sz);
   return ret;
 LOCAL cl_int
-cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+cl_mem_copy_buffer_rect(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
                        const size_t *src_origin, const size_t *dst_origin, const size_t *region,
                        size_t src_row_pitch, size_t src_slice_pitch,
                        size_t dst_row_pitch, size_t dst_slice_pitch) {
@@ -1635,7 +1690,7 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
     cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
     cl_int size = region[0]*region[1]*region[2];
-    ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size);
+    ret = cl_mem_copy(queue, NULL, src_buf, dst_buf,src_offset, dst_offset, size);
     return ret;
@@ -1687,14 +1742,16 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
   cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
-  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+                                  global_off, global_sz, global_sz, local_sz, local_sz);
   return ret;
 LOCAL cl_int
-cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image,
-                         const size_t *src_origin, const size_t *dst_origin, const size_t *region) {
+cl_mem_kernel_copy_image(cl_command_queue queue, cl_event event, struct _cl_mem_image* src_image,
+                         struct _cl_mem_image* dst_image, const size_t *src_origin,
+                         const size_t *dst_origin, const size_t *region) {
   cl_int ret;
   cl_kernel ker = NULL;
   size_t global_off[] = {0,0,0};
@@ -1722,7 +1779,9 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
   if (fixupDataType) {
     cl_image_format fmt;
-    if (src_image->fmt.image_channel_order != CL_BGRA)
+    if (src_image->fmt.image_channel_order != CL_BGRA &&
+        src_image->fmt.image_channel_order != CL_sBGRA &&
+        src_image->fmt.image_channel_order != CL_sRGBA)
       fmt.image_channel_order = src_image->fmt.image_channel_order;
       fmt.image_channel_order = CL_RGBA;
@@ -1835,7 +1894,8 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
   cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
   cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
-  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+                                  global_off, global_sz, global_sz, local_sz, local_sz);
@@ -1848,7 +1908,7 @@ fail:
 LOCAL cl_int
-cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_mem buffer,
+cl_mem_copy_image_to_buffer(cl_command_queue queue, cl_event event, struct _cl_mem_image* image, cl_mem buffer,
                          const size_t *src_origin, const size_t dst_offset, const size_t *region) {
   cl_int ret;
   cl_kernel ker = NULL;
@@ -1937,7 +1997,8 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
   cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
   cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset);
-  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+                                  global_off, global_sz, global_sz, local_sz, local_sz);
@@ -1951,7 +2012,7 @@ fail:
 LOCAL cl_int
-cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_mem_image* image,
+cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_event event, cl_mem buffer, struct _cl_mem_image* image,
                          const size_t src_offset, const size_t *dst_origin, const size_t *region) {
   cl_int ret;
   cl_kernel ker = NULL;
@@ -2037,7 +2098,8 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
   cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
-  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
+                                  global_off, global_sz, global_sz, local_sz, local_sz);
   image->intel_fmt = intel_fmt;
@@ -2326,3 +2388,103 @@ error:
   mem = NULL;
   goto exit;
+LOCAL cl_int
+cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
+                      size_t size, const size_t *origin, const size_t *region)
+  // TODO: Need to add MT safe logic.
+  cl_int slot = -1;
+  int err = CL_SUCCESS;
+  size_t sub_offset = 0;
+  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
+    struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+    sub_offset = buffer->sub_offset;
+  }
+  ptr = (char*)ptr + offset + sub_offset;
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+    assert(mem->host_ptr);
+    // Only compute the pointer here; the memcpy is done at enqueue time.
+    *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
+  } else {
+    *mem_ptr = ptr;
+  }
+  /* Record the mapped address. */
+  if (!mem->mapped_ptr_sz) {
+    mem->mapped_ptr_sz = 16;
+    mem->mapped_ptr = (cl_mapped_ptr *)malloc(
+        sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
+    if (!mem->mapped_ptr) {
+      cl_mem_unmap_auto(mem);
+      err = CL_OUT_OF_HOST_MEMORY;
+      goto error;
+    }
+    memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+    slot = 0;
+  } else {
+    int i = 0;
+    for (; i < mem->mapped_ptr_sz; i++) {
+      if (mem->mapped_ptr[i].ptr == NULL) {
+        slot = i;
+        break;
+      }
+    }
+    if (i == mem->mapped_ptr_sz) {
+      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+          sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
+      if (!new_ptr) {
+        cl_mem_unmap_auto(mem);
+        err = CL_OUT_OF_HOST_MEMORY;
+        goto error;
+      }
+      memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+      memcpy(new_ptr, mem->mapped_ptr,
+          mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+      slot = mem->mapped_ptr_sz;
+      mem->mapped_ptr_sz *= 2;
+      free(mem->mapped_ptr);
+      mem->mapped_ptr = new_ptr;
+    }
+  }
+  assert(slot != -1);
+  mem->mapped_ptr[slot].ptr = *mem_ptr;
+  mem->mapped_ptr[slot].v_ptr = ptr;
+  mem->mapped_ptr[slot].size = size;
+  if(origin) {
+    assert(region);
+    mem->mapped_ptr[slot].origin[0] = origin[0];
+    mem->mapped_ptr[slot].origin[1] = origin[1];
+    mem->mapped_ptr[slot].origin[2] = origin[2];
+    mem->mapped_ptr[slot].region[0] = region[0];
+    mem->mapped_ptr[slot].region[1] = region[1];
+    mem->mapped_ptr[slot].region[2] = region[2];
+  }
+  mem->map_ref++;
+  if (err != CL_SUCCESS)
+    *mem_ptr = NULL;
+  return err;
+LOCAL cl_int
+cl_mem_set_destructor_callback(cl_mem memobj,
+                               void(CL_CALLBACK *pfn_notify)(cl_mem, void *), void *user_data)
+  cl_mem_dstr_cb cb = cl_calloc(1, sizeof(_cl_mem_dstr_cb));
+  if (cb == NULL) {
+    return CL_OUT_OF_HOST_MEMORY;
+  }
+  memset(cb, 0, sizeof(_cl_mem_dstr_cb));
+  list_node_init(&cb->node);
+  cb->pfn_notify = pfn_notify;
+  cb->user_data = user_data;
+  CL_OBJECT_LOCK(memobj);
+  list_add(&memobj->dstr_cb_head, &cb->node);
+  CL_OBJECT_UNLOCK(memobj);
+  return CL_SUCCESS;
diff --git a/src/cl_mem.h b/src/cl_mem.h
index c8f256d..4764401 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -23,9 +23,12 @@
 #include "cl_internals.h"
 #include "cl_driver_type.h"
 #include "CL/cl.h"
-#include "cl_khr_icd.h"
+#include "cl_base_object.h"
 #include <assert.h>
 #include <pthread.h>
+#if defined(HAS_GL_EGL)
+#include "EGL/egl.h"
 #ifndef CL_VERSION_1_2
 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
@@ -61,15 +64,18 @@ typedef struct _cl_mapped_ptr {
 typedef struct _cl_mem_dstr_cb {
-  struct _cl_mem_dstr_cb * next;
-  void (CL_CALLBACK *pfn_notify)(cl_mem memobj, void *user_data);
+  list_node node; /* Mem callback list node */
+  void(CL_CALLBACK *pfn_notify)(cl_mem memobj, void *user_data);
   void *user_data;
+} _cl_mem_dstr_cb;
+typedef _cl_mem_dstr_cb* cl_mem_dstr_cb;
 /* Used for buffers and images */
 enum cl_mem_type {
@@ -78,11 +84,8 @@ enum cl_mem_type {
 #define IS_GL_IMAGE(mem) (mem->type == CL_MEM_GL_IMAGE_TYPE)
 typedef  struct _cl_mem {
-  DEFINE_ICD(dispatch)
-  uint64_t magic;           /* To identify it as a memory object */
-  cl_mem prev, next;        /* We chain the memory buffers together */
+  _cl_base_object base;
   enum cl_mem_type type;
-  volatile int ref_n;       /* This object is reference counted */
   cl_buffer bo;             /* Data in GPU memory */
   size_t size;              /* original request size, not alignment size, used in constant buffer */
   cl_context ctx;           /* Context it belongs to */
@@ -92,19 +95,45 @@ typedef  struct _cl_mem {
   int mapped_ptr_sz;        /* The array size of mapped_ptr. */
   int map_ref;              /* The mapped count. */
   uint8_t mapped_gtt;       /* This object has mapped gtt, for unmap. */
-  cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
-  uint8_t is_userptr;       /* CL_MEM_USE_HOST_PTR is enabled*/
+  list_head dstr_cb_head;   /* All destroy callbacks. */
+  uint8_t is_userptr;       /* CL_MEM_USE_HOST_PTR is enabled */
+  cl_bool is_svm;           /* This object  is svm */
   size_t offset;            /* offset of host_ptr to the page beginning, only for CL_MEM_USE_HOST_PTR*/
   uint8_t cmrt_mem_type;    /* CmBuffer, CmSurface2D, ... */
   void* cmrt_mem;
 } _cl_mem;
+#define CL_OBJECT_MEM_MAGIC 0x381a27b9ee6504dfLL
+#define CL_OBJECT_IS_MEM(obj) ((obj &&                           \
+         ((cl_base_object)obj)->magic == CL_OBJECT_MEM_MAGIC &&  \
+         CL_OBJECT_GET_REF(obj) >= 1))
+#define CL_OBJECT_IS_IMAGE(mem) ((mem &&                           \
+         ((cl_base_object)mem)->magic == CL_OBJECT_MEM_MAGIC &&    \
+         CL_OBJECT_GET_REF(mem) >= 1 &&                            \
+         mem->type >= CL_MEM_IMAGE_TYPE))
+#define CL_OBJECT_IS_BUFFER(mem) ((mem &&                          \
+         ((cl_base_object)mem)->magic == CL_OBJECT_MEM_MAGIC &&    \
+         CL_OBJECT_GET_REF(mem) >= 1 &&                            \
+         mem->type < CL_MEM_IMAGE_TYPE))
+typedef struct _cl_mem_pipe {
+  _cl_mem base;
+  cl_svm_mem_flags flags;                 /* Flags specified at the creation time */
+  uint32_t packet_size;
+  uint32_t max_packets;
+} _cl_mem_pipe;
+typedef struct _cl_mem_svm {
+  _cl_mem base;
+  cl_svm_mem_flags flags;                 /* Flags specified at the creation time */
+} _cl_mem_svm;
 struct _cl_mem_image {
   _cl_mem base;
   cl_image_format fmt;            /* only for images */
   uint32_t intel_fmt;             /* format to provide in the surface state */
-  uint32_t bpp;                   /* number of bytes per pixel */
+  size_t bpp;                     /* number of bytes per pixel */
   cl_mem_object_type image_type;  /* only for images 1D/2D...*/
   size_t w, h, depth;             /* only for images (depth is only for 3D images) */
   size_t row_pitch, slice_pitch;
@@ -118,16 +147,30 @@ struct _cl_mem_image {
 struct _cl_mem_gl_image {
   struct _cl_mem_image base;
-  uint32_t target;
-  int      miplevel;
-  uint32_t texture;
+  int fd;
+#if defined(HAS_GL_EGL)
+  EGLImage egl_image;
 struct _cl_mem_buffer1d_image {
   struct _cl_mem_image base;
   uint32_t size;
+  _cl_mem * descbuffer;
+#define IS_1D_IMAGE(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D ||        \
+                                image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||  \
+                                image->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+#define IS_2D_IMAGE(image) (image->image_type == CL_MEM_OBJECT_IMAGE2D ||        \
+                                image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+#define IS_3D_IMAGE(image) (image->image_type == CL_MEM_OBJECT_IMAGE3D)
+#define IS_IMAGE_ARRAY(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
+                                   image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
 inline static void
 cl_mem_image_init(struct _cl_mem_image *image, size_t w, size_t h,
                   cl_mem_object_type image_type,
@@ -176,21 +219,18 @@ cl_mem_gl_image(cl_mem mem)
   return (struct _cl_mem_gl_image*)mem;
-inline static struct _cl_mem_buffer *
-cl_mem_buffer(cl_mem mem)
+inline static struct _cl_mem_pipe *
+cl_mem_pipe(cl_mem mem)
-  assert(!IS_IMAGE(mem));
-  return (struct _cl_mem_buffer *)mem;
+  assert(mem->type == CL_MEM_PIPE_TYPE);
+  return (struct _cl_mem_pipe *)mem;
 /* Query information about a memory object */
-extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t *);
-/* Query information about an image */
-extern cl_int cl_get_image_info(cl_mem, cl_image_info, size_t, void *, size_t *);
+extern cl_mem_object_type cl_get_mem_object_type(cl_mem mem);
 /* Query whether mem is in buffers */
-extern cl_int is_valid_mem(cl_mem mem, cl_mem buffers);
+extern cl_int cl_mem_is_valid(cl_mem mem, cl_context ctx);
 /* Create a new memory object and initialize it with possible user data */
 extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*);
@@ -198,6 +238,13 @@ extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*
 /* Create a new sub memory object */
 extern cl_mem cl_mem_new_sub_buffer(cl_mem, cl_mem_flags, cl_buffer_create_type, const void *, cl_int *);
+extern cl_mem cl_mem_new_pipe(cl_context, cl_mem_flags, cl_uint, cl_uint, cl_int *);
+/* Query information about a pipe object */
+extern cl_int cl_get_pipe_info(cl_mem, cl_mem_info, size_t, void *, size_t *);
+void* cl_mem_svm_allocate(cl_context, cl_svm_mem_flags, size_t, unsigned int);
+void cl_mem_svm_delete(cl_context, void *svm_pointer);
 /* Idem but this is an image */
 extern cl_mem
 cl_mem_new_image(cl_context context,
@@ -217,30 +264,30 @@ extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
 extern void cl_mem_add_ref(cl_mem);
 /* api clEnqueueCopyBuffer help function */
-extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+extern cl_int cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
               size_t src_offset, size_t dst_offset, size_t cb);
-extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+extern cl_int cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size,
               cl_mem buffer, size_t offset, size_t size);
-extern cl_int cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image*,
+extern cl_int cl_image_fill(cl_command_queue queue, cl_event e, const void * pattern, struct _cl_mem_image*,
                                     const size_t *, const size_t *);
 /* api clEnqueueCopyBufferRect help function */
-extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
+extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_event event, cl_mem, cl_mem,
                                      const size_t *, const size_t *, const size_t *,
                                      size_t, size_t, size_t, size_t);
 /* api clEnqueueCopyImage help function */
-extern cl_int cl_mem_kernel_copy_image(cl_command_queue, struct _cl_mem_image*, struct _cl_mem_image*,
-                                       const size_t *, const size_t *, const size_t *);
+extern cl_int cl_mem_kernel_copy_image(cl_command_queue, cl_event event, struct _cl_mem_image*,
+                                       struct _cl_mem_image*, const size_t *, const size_t *, const size_t *);
 /* api clEnqueueCopyImageToBuffer help function */
-extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, struct _cl_mem_image*, cl_mem,
+extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, cl_event, struct _cl_mem_image*, cl_mem,
                                           const size_t *, const size_t, const size_t *);
 /* api clEnqueueCopyBufferToImage help function */
-extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_mem, struct _cl_mem_image*,
+extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_event, cl_mem, struct _cl_mem_image*,
                                           const size_t, const size_t *, const size_t *);
 /* Directly map a memory object */
@@ -314,5 +361,10 @@ extern cl_mem cl_mem_new_image_from_fd(cl_context ctx,
                                        size_t row_pitch,
                                        cl_int *errcode);
+extern cl_int cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
+                      size_t size, const size_t *origin, const size_t *region);
+extern cl_int cl_mem_set_destructor_callback(cl_mem memobj,
+                      void(CL_CALLBACK *pfn_notify)(cl_mem, void *), void *user_data);
 #endif /* __CL_MEM_H__ */
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index b0b2c1b..fdad067 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -74,10 +74,6 @@ cl_mem_new_gl_texture(cl_context ctx,
     goto error;
-  cl_mem_gl_image(mem)->target = texture_target;
-  cl_mem_gl_image(mem)->miplevel = miplevel;
-  cl_mem_gl_image(mem)->texture = texture;
   if (errcode_ret)
     *errcode_ret = err;
@@ -92,6 +88,5 @@ error:
 LOCAL void cl_mem_gl_delete(struct _cl_mem_gl_image *gl_image)
   if (gl_image->base.base.bo != NULL)
-    cl_buffer_release_from_texture(gl_image->base.base.ctx, gl_image->target,
-                                   gl_image->miplevel, gl_image->texture);
+    cl_buffer_release_from_texture(gl_image->base.base.ctx, gl_image);
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
index d7a1f68..1f21f5d 100644
--- a/src/cl_platform_id.c
+++ b/src/cl_platform_id.c
@@ -31,7 +31,6 @@
     .JOIN(FIELD,_sz) = sizeof(STRING),
 static struct _cl_platform_id intel_platform_data = {
-  INIT_ICD(dispatch)
   DECL_INFO_STRING(name, "Intel Gen OCL Driver")
@@ -51,6 +50,7 @@ cl_get_platform_default(void)
     return intel_platform;
   intel_platform = &intel_platform_data;
   return intel_platform;
@@ -69,54 +69,3 @@ cl_get_platform_ids(cl_uint          num_entries,
   return CL_SUCCESS;
-#define DECL_FIELD(CASE,FIELD)                                  \
-  case JOIN(CL_,CASE):                                          \
-    if (param_value_size < cl_get_platform_default()->JOIN(FIELD,_sz))     \
-      return CL_INVALID_VALUE;                                  \
-    if (param_value_size_ret != NULL)                           \
-      *param_value_size_ret = cl_get_platform_default()->JOIN(FIELD,_sz);  \
-    memcpy(param_value,                                         \
-           cl_get_platform_default()->FIELD,                               \
-           cl_get_platform_default()->JOIN(FIELD,_sz));                    \
-      return CL_SUCCESS;
-#define GET_FIELD_SZ(CASE,FIELD)                                \
-  case JOIN(CL_,CASE):                                          \
-    if (param_value_size_ret != NULL)                           \
-      *param_value_size_ret = cl_get_platform_default()->JOIN(FIELD,_sz);  \
-    return CL_SUCCESS;
-LOCAL cl_int
-cl_get_platform_info(cl_platform_id    platform,
-                     cl_platform_info  param_name,
-                     size_t            param_value_size,
-                     void *            param_value,
-                     size_t *          param_value_size_ret)
-  if (param_value == NULL) {
-    switch (param_name) {
-      GET_FIELD_SZ (PLATFORM_PROFILE,    profile);
-      GET_FIELD_SZ (PLATFORM_VERSION,    version);
-      GET_FIELD_SZ (PLATFORM_NAME,       name);
-      GET_FIELD_SZ (PLATFORM_VENDOR,     vendor);
-      GET_FIELD_SZ (PLATFORM_ICD_SUFFIX_KHR, icd_suffix_khr);
-      default: return CL_INVALID_VALUE;
-    }
-  }
-  /* Fetch the platform inform */
-  switch (param_name) {
-    DECL_FIELD (PLATFORM_NAME,       name);
-    DECL_FIELD (PLATFORM_VENDOR,     vendor);
-    DECL_FIELD (PLATFORM_ICD_SUFFIX_KHR, icd_suffix_khr);
-    default: return CL_INVALID_VALUE;
-  }
-#undef DECL_FIELD
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
index 865317a..3fdb920 100644
--- a/src/cl_platform_id.h
+++ b/src/cl_platform_id.h
@@ -23,12 +23,12 @@
 #include "CL/cl.h"
 #include "cl_internals.h"
 #include "cl_extensions.h"
-#include "cl_khr_icd.h"
+#include "cl_base_object.h"
 #include "src/OCLConfig.h"
 #include "src/git_sha1.h"
 struct _cl_platform_id {
-  DEFINE_ICD(dispatch)
+  _cl_base_object base;
   const char *profile;
   const char *version;
   const char *name;
@@ -44,6 +44,11 @@ struct _cl_platform_id {
   struct cl_extensions *internal_extensions;
+#define CL_OBJECT_PLATFORM_MAGIC 0xaacdbb00123ccd85LL
+#define CL_OBJECT_IS_PLATFORM(obj) ((obj &&                           \
+         ((cl_base_object)obj)->magic == CL_OBJECT_PLATFORM_MAGIC &&  \
+         CL_OBJECT_GET_REF(obj) >= 1))
 /* Return the default platform */
 extern cl_platform_id cl_get_platform_default(void);
@@ -52,13 +57,6 @@ extern cl_int cl_get_platform_ids(cl_uint          num_entries,
                                   cl_platform_id * platforms,
                                   cl_uint *        num_platforms);
-/* Return information for the current platform */
-extern cl_int cl_get_platform_info(cl_platform_id    platform,
-                                   cl_platform_info  param_name,
-                                   size_t            param_value_size,
-                                   void *            param_value,
-                                   size_t *          param_value_size_ret);
 #define _STR(x) #x
 #define _JOINT(x, y) _STR(x) "." _STR(y)
 #define _JOINT3(x, y, z) _STR(x) "." _STR(y) "." _STR(z)
diff --git a/src/cl_program.c b/src/cl_program.c
index 17f64ca..0358705 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -66,7 +66,7 @@ cl_program_delete(cl_program p)
   /* We are not done with it yet */
-  if ((ref = atomic_dec(&p->ref_n)) > 1) return;
+  if ((ref = CL_OBJECT_DEC_REF(p)) > 1) return;
   /* Destroy the sources and binary if still allocated */
@@ -83,17 +83,6 @@ cl_program_delete(cl_program p)
     p->build_log = NULL;
-  /* Remove it from the list */
-  assert(p->ctx);
-  pthread_mutex_lock(&p->ctx->program_lock);
-    if (p->prev)
-      p->prev->next = p->next;
-    if (p->next)
-      p->next->prev = p->prev;
-    if (p->ctx->programs == p)
-      p->ctx->programs = p->next;
-  pthread_mutex_unlock(&p->ctx->program_lock);
 #ifdef HAS_CMRT
   if (p->cmrt_program != NULL)
@@ -106,8 +95,12 @@ cl_program_delete(cl_program p)
-  /* Program belongs to their parent context */
-  cl_context_delete(p->ctx);
+  if (p->global_data_ptr)
+    cl_buffer_unreference(p->global_data);
+  cl_free(p->global_data_ptr);
+  /* Remove it from the list */
+  cl_context_remove_program(p->ctx, p);
   /* Free the program as allocated by the compiler */
   if (p->opaque) {
@@ -120,7 +113,7 @@ cl_program_delete(cl_program p)
-  p->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
@@ -132,17 +125,15 @@ cl_program_new(cl_context ctx)
   /* Allocate the structure */
   TRY_ALLOC_NO_ERR (p, CALLOC(struct _cl_program));
-  SET_ICD(p->dispatch)
   p->build_status = CL_BUILD_NONE;
-  p->ref_n = 1;
-  p->ctx = ctx;
   p->cmrt_program = NULL;
   p->build_log = calloc(BUILD_LOG_MAX_SIZE, sizeof(char));
   if (p->build_log)
     p->build_log_max_sz = BUILD_LOG_MAX_SIZE;
   /* The queue also belongs to its context */
-  cl_context_add_ref(ctx);
+  cl_context_add_program(ctx, p);
   return p;
@@ -155,7 +146,7 @@ LOCAL void
 cl_program_add_ref(cl_program p)
-  atomic_inc(&p->ref_n);
 static cl_int
@@ -217,6 +208,51 @@ LOCAL cl_bool headerCompare(const unsigned char *BufPtr, BINARY_HEADER_INDEX ind
 #define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY)
 #define isCMRT(BufPtr)      headerCompare(BufPtr, BHI_CMRT)
+static cl_int get_program_global_data(cl_program prog) {
+//OpenCL 1.2 would never call this function, and OpenCL 2.0 always HAS_BO_SET_SOFTPIN.
+  cl_buffer_mgr bufmgr = NULL;
+  bufmgr = cl_context_get_bufmgr(prog->ctx);
+  assert(bufmgr);
+  size_t const_size = interp_program_get_global_constant_size(prog->opaque);
+  if (const_size == 0) return CL_SUCCESS;
+  int page_size = getpagesize();
+  size_t alignedSz = ALIGN(const_size, page_size);
+  char * p = (char*)cl_aligned_malloc(alignedSz, page_size);
+  prog->global_data_ptr = p;
+  interp_program_get_global_constant_data(prog->opaque, (char*)p);
+  prog->global_data = cl_buffer_alloc_userptr(bufmgr, "program global data", p, alignedSz, 0);
+  cl_buffer_set_softpin_offset(prog->global_data, (size_t)p);
+  cl_buffer_set_bo_use_full_range(prog->global_data, 1);
+  uint32_t reloc_count = interp_program_get_global_reloc_count(prog->opaque);
+  if (reloc_count > 0) {
+    uint32_t x;
+    struct RelocEntry {int refOffset; int defOffset;};
+    char *temp = (char*) malloc(reloc_count *sizeof(int)*2);
+    interp_program_get_global_reloc_table(prog->opaque, temp);
+    for (x = 0; x < reloc_count; x++) {
+      int ref_offset = ((struct RelocEntry *)temp)[x].refOffset;
+      *(uint64_t*)&(p[ref_offset]) = ((struct RelocEntry *)temp)[x].defOffset + (uint64_t)p;
+    }
+    free(temp);
+  }
+#if 0
+  int x = 0;
+  for (x = 0; x < const_size; x++) {
+    printf("offset %d data: %x\n", x, (unsigned)p[x]);
+  }
+  return CL_SUCCESS;
+LOCAL size_t cl_program_get_global_variable_size(cl_program prog) {
+  return interp_program_get_global_constant_size(prog->opaque);
 LOCAL cl_program
 cl_program_create_from_binary(cl_context             ctx,
                               cl_uint                num_devices,
@@ -232,7 +268,7 @@ cl_program_create_from_binary(cl_context             ctx,
   INVALID_DEVICE_IF (num_devices != 1);
   INVALID_DEVICE_IF (devices == NULL);
-  INVALID_DEVICE_IF (devices[0] != ctx->device);
+  INVALID_DEVICE_IF (devices[0] != ctx->devices[0]);
   INVALID_VALUE_IF (binaries == NULL);
   INVALID_VALUE_IF (lengths == NULL);
@@ -269,7 +305,7 @@ cl_program_create_from_binary(cl_context             ctx,
     TRY_ALLOC(typed_binary, cl_calloc(lengths[0]+1, sizeof(char)));
     memcpy(typed_binary+1, binaries[0], lengths[0]);
     *typed_binary = 1;
-    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->device->device_id, typed_binary, program->binary_sz+1);
+    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->devices[0]->device_id, typed_binary, program->binary_sz+1);
     if (UNLIKELY(program->opaque == NULL)) {
       err = CL_INVALID_PROGRAM;
@@ -287,7 +323,7 @@ cl_program_create_from_binary(cl_context             ctx,
       err= CL_INVALID_BINARY;
       goto error;
-    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->device->device_id, program->binary, program->binary_sz);
+    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->devices[0]->device_id, program->binary, program->binary_sz);
     if (UNLIKELY(program->opaque == NULL)) {
       err = CL_INVALID_PROGRAM;
@@ -296,7 +332,7 @@ cl_program_create_from_binary(cl_context             ctx,
     program->source_type = FROM_LLVM;
   else if (isGenBinary((unsigned char*)program->binary)) {
-    program->opaque = interp_program_new_from_binary(program->ctx->device->device_id, program->binary, program->binary_sz);
+    program->opaque = interp_program_new_from_binary(program->ctx->devices[0]->device_id, program->binary, program->binary_sz);
     if (UNLIKELY(program->opaque == NULL)) {
       err = CL_INVALID_PROGRAM;
       goto error;
@@ -338,7 +374,7 @@ cl_program_create_with_built_in_kernles(cl_context     ctx,
   INVALID_DEVICE_IF (num_devices != 1);
   INVALID_DEVICE_IF (devices == NULL);
-  INVALID_DEVICE_IF (devices[0] != ctx->device);
+  INVALID_DEVICE_IF (devices[0] != ctx->devices[0]);
   cl_int binary_status = CL_SUCCESS;
   extern char cl_internal_built_in_kernel_str[];
@@ -346,7 +382,7 @@ cl_program_create_with_built_in_kernles(cl_context     ctx,
   char* p_built_in_kernel_str =cl_internal_built_in_kernel_str;
   ctx->built_in_prgs = cl_program_create_from_binary(ctx, 1,
-                                                          &ctx->device,
+                                                          &ctx->devices[0],
                                                           (const unsigned char **)&p_built_in_kernel_str,
                                                           &binary_status, &err);
@@ -372,12 +408,12 @@ cl_program_create_with_built_in_kernles(cl_context     ctx,
   kernel = strtok_r( local_kernel_names, delims , &saveptr);
   while( kernel != NULL ) {
-    matched_kernel = strstr(ctx->device->built_in_kernels, kernel);
+    matched_kernel = strstr(ctx->devices[0]->built_in_kernels, kernel);
       for (i = 0; i < ctx->built_in_prgs->ker_n; ++i) {
         const char *ker_name = cl_kernel_get_name(ctx->built_in_prgs->ker[i]);
-        if (strcmp(ker_name, kernel) == 0) {
+        if (ker_name != NULL && strcmp(ker_name, kernel) == 0) {
@@ -412,7 +448,7 @@ cl_program_create_from_llvm(cl_context ctx,
   INVALID_DEVICE_IF (num_devices != 1);
   INVALID_DEVICE_IF (devices == NULL);
-  INVALID_DEVICE_IF (devices[0] != ctx->device);
+  INVALID_DEVICE_IF (devices[0] != ctx->devices[0]);
   INVALID_VALUE_IF (file_name == NULL);
   program = cl_program_new(ctx);
@@ -421,7 +457,7 @@ cl_program_create_from_llvm(cl_context ctx,
       goto error;
-  program->opaque = compiler_program_new_from_llvm(ctx->device->device_id, file_name, NULL, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1, NULL);
+  program->opaque = compiler_program_new_from_llvm(ctx->devices[0]->device_id, file_name, NULL, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1, NULL);
   if (UNLIKELY(program->opaque == NULL)) {
     goto error;
@@ -503,7 +539,7 @@ static int check_cl_version_option(cl_program p, const char* options) {
   const char* s = NULL;
   int ver1 = 0;
   int ver2 = 0;
-  char version_str[64];
+  char version_str[64] = {0};
   if (options && (s = strstr(options, "-cl-std="))) {
@@ -518,7 +554,7 @@ static int check_cl_version_option(cl_program p, const char* options) {
     ver1 = (s[10] - '0') * 10 + (s[12] - '0');
-    if (cl_get_device_info(p->ctx->device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version_str),
+    if (cl_get_device_info(p->ctx->devices[0], CL_DEVICE_OPENCL_C_VERSION, sizeof(version_str),
                                   version_str, NULL) != CL_SUCCESS)
       return 0;
@@ -541,7 +577,7 @@ cl_program_build(cl_program p, const char *options)
   int i = 0;
   int copyed = 0;
-  if (p->ref_n > 1) {
+  if (CL_OBJECT_GET_REF(p) > 1) {
     goto error;
@@ -586,7 +622,7 @@ cl_program_build(cl_program p, const char *options)
       goto error;
-    p->opaque = compiler_program_new_from_source(p->ctx->device->device_id, p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
+    p->opaque = compiler_program_new_from_source(p->ctx->devices[0]->device_id, p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
     if (UNLIKELY(p->opaque == NULL)) {
       if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
@@ -614,7 +650,7 @@ cl_program_build(cl_program p, const char *options)
     /* Create all the kernels */
     TRY (cl_program_load_gen_program, p);
   } else if (p->source_type == FROM_BINARY && p->binary_type != CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
-    p->opaque = interp_program_new_from_binary(p->ctx->device->device_id, p->binary, p->binary_sz);
+    p->opaque = interp_program_new_from_binary(p->ctx->devices[0]->device_id, p->binary, p->binary_sz);
     if (UNLIKELY(p->opaque == NULL)) {
       goto error;
@@ -638,6 +674,9 @@ cl_program_build(cl_program p, const char *options)
     memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
     copyed += sz;
+  if ((err = get_program_global_data(p)) != CL_SUCCESS)
+    goto error;
   p->is_built = 1;
   p->build_status = CL_BUILD_SUCCESS;
   return CL_SUCCESS;
@@ -706,7 +745,7 @@ cl_program_link(cl_context            context,
     goto error;
-  p->opaque = compiler_program_new_gen_program(context->device->device_id, NULL, NULL, NULL);
+  p->opaque = compiler_program_new_gen_program(context->devices[0]->device_id, NULL, NULL, NULL);
   for(i = 0; i < num_input_programs; i++) {
     // if program create with llvm binary, need deserilize first to get module.
@@ -743,6 +782,10 @@ cl_program_link(cl_context            context,
     memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
     copyed += sz;
+  if ((err = get_program_global_data(p)) != CL_SUCCESS)
+    goto error;
   if(p) p->is_built = 1;
   if(p) p->build_status = CL_BUILD_SUCCESS;
@@ -768,7 +811,7 @@ cl_program_compile(cl_program            p,
   cl_int err = CL_SUCCESS;
   int i = 0;
-  if (p->ref_n > 1) {
+  if (CL_OBJECT_GET_REF(p) > 1) {
     goto error;
@@ -841,7 +884,7 @@ cl_program_compile(cl_program            p,
-    p->opaque = compiler_program_compile_from_source(p->ctx->device->device_id, p->source, temp_header_path,
+    p->opaque = compiler_program_compile_from_source(p->ctx->devices[0]->device_id, p->source, temp_header_path,
         p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
     char rm_path[255]="rm ";
@@ -902,7 +945,7 @@ cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
   for (i = 0; i < p->ker_n; ++i) {
     const char *ker_name = cl_kernel_get_name(p->ker[i]);
-    if (strcmp(ker_name, name) == 0) {
+    if (ker_name != NULL && strcmp(ker_name, name) == 0) {
       from = p->ker[i];
@@ -961,10 +1004,13 @@ cl_program_get_kernel_names(cl_program p, size_t size, char *names, size_t *size
-  ker_name = cl_kernel_get_name(p->ker[i]);
-  len = strlen(ker_name);
-  if(names) {
-    strncpy(names, cl_kernel_get_name(p->ker[0]), size - 1);
+  ker_name = cl_kernel_get_name(p->ker[0]);
+  if (ker_name != NULL)
+    len = strlen(ker_name);
+  else
+    len = 0;
+  if(names && ker_name) {
+    strncpy(names, ker_name, size - 1);
     names[size - 1] = '\0';
     if(size < len - 1) {
       if(size_ret) *size_ret = size;
@@ -972,12 +1018,15 @@ cl_program_get_kernel_names(cl_program p, size_t size, char *names, size_t *size
     size = size - len - 1;  //sub \0
-  if(size_ret) *size_ret = strlen(ker_name) + 1;  //add NULL
+  if(size_ret) *size_ret = len + 1;  //add NULL
   for (i = 1; i < p->ker_n; ++i) {
     ker_name = cl_kernel_get_name(p->ker[i]);
-    len = strlen(ker_name);
-    if(names) {
+    if (ker_name != NULL)
+      len = strlen(ker_name);
+    else
+      len = 0;
+    if(names && ker_name) {
       strncat(names, ";", size);
       if(size >= 1)
         strncat(names, ker_name, size - 1);
diff --git a/src/cl_program.h b/src/cl_program.h
index b69e00c..6e8e84a 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -22,6 +22,7 @@
 #include "cl_internals.h"
 #include "cl_gbe_loader.h"
+#include "cl_base_object.h"
 #include "CL/cl.h"
 #include <stdint.h>
@@ -49,13 +50,13 @@ typedef enum _BINARY_HEADER_INDEX {
 /* This maps an OCL file containing some kernels */
 struct _cl_program {
-  DEFINE_ICD(dispatch)
-  uint64_t magic;         /* To identify it as a program */
-  volatile int ref_n;     /* We reference count this object */
+  _cl_base_object base;
   gbe_program opaque;     /* (Opaque) program as ouput by the compiler */
   cl_kernel *ker;         /* All kernels included by the OCL file */
   cl_program prev, next;  /* We chain the programs together */
   cl_context ctx;         /* Its parent context */
+  cl_buffer  global_data;
+  char * global_data_ptr;
   char *bin;              /* The program copied verbatim */
   size_t bin_sz;          /* Its size in memory */
   char *source;           /* Program sources */
@@ -75,6 +76,11 @@ struct _cl_program {
   void* cmrt_program;      /* real type: CmProgram* */
+#define CL_OBJECT_PROGRAM_MAGIC 0x34562ab12789cdefLL
+#define CL_OBJECT_IS_PROGRAM(obj) ((obj &&                           \
+         ((cl_base_object)obj)->magic == CL_OBJECT_PROGRAM_MAGIC &&  \
+         CL_OBJECT_GET_REF(obj) >= 1))
 /* Create a empty program */
 extern cl_program cl_program_new(cl_context);
@@ -146,5 +152,7 @@ cl_program_get_kernel_names(cl_program p,
                             size_t size,
                             char *names,
                             size_t *size_ret);
+extern size_t
+cl_program_get_global_variable_size(cl_program p);
 #endif /* __CL_PROGRAM_H__ */
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index 45c1fdf..d1e6dfe 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -71,44 +71,31 @@ int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler)
 LOCAL cl_sampler
-cl_sampler_new(cl_context ctx,
-               cl_bool normalized_coords,
-               cl_addressing_mode address,
-               cl_filter_mode filter,
-               cl_int *errcode_ret)
+cl_create_sampler(cl_context ctx, cl_bool normalized_coords, cl_addressing_mode address,
+                  cl_filter_mode filter, cl_int *errcode_ret)
   cl_sampler sampler = NULL;
-  cl_int err = CL_SUCCESS;
   /* Allocate and inialize the structure itself */
-  TRY_ALLOC (sampler, CALLOC(struct _cl_sampler));
-  SET_ICD(sampler->dispatch)
-  sampler->ref_n = 1;
-  sampler->magic = CL_MAGIC_SAMPLER_HEADER;
+  sampler = cl_calloc(1, sizeof(_cl_sampler));
+  if (sampler == NULL) {
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
+  }
   sampler->normalized_coords = normalized_coords;
   sampler->address = address;
   sampler->filter = filter;
   /* Append the sampler in the context sampler list */
-  pthread_mutex_lock(&ctx->sampler_lock);
-    sampler->next = ctx->samplers;
-    if (ctx->samplers != NULL)
-      ctx->samplers->prev = sampler;
-    ctx->samplers = sampler;
-  pthread_mutex_unlock(&ctx->sampler_lock);
-  sampler->ctx = ctx;
-  cl_context_add_ref(ctx);
+  cl_context_add_sampler(ctx, sampler);
+  // TODO: Consider moving this elsewhere; it is not common sampler logic.
   sampler->clkSamplerValue = cl_to_clk(normalized_coords, address, filter);
-  if (errcode_ret)
-    *errcode_ret = err;
+  *errcode_ret = CL_SUCCESS;
   return sampler;
-  cl_sampler_delete(sampler);
-  sampler = NULL;
-  goto exit;
 LOCAL void
@@ -116,20 +103,11 @@ cl_sampler_delete(cl_sampler sampler)
   if (UNLIKELY(sampler == NULL))
-  if (atomic_dec(&sampler->ref_n) > 1)
+  if (CL_OBJECT_DEC_REF(sampler) > 1)
-  assert(sampler->ctx);
-  pthread_mutex_lock(&sampler->ctx->sampler_lock);
-    if (sampler->prev)
-      sampler->prev->next = sampler->next;
-    if (sampler->next)
-      sampler->next->prev = sampler->prev;
-    if (sampler->ctx->samplers == sampler)
-      sampler->ctx->samplers = sampler->next;
-  pthread_mutex_unlock(&sampler->ctx->sampler_lock);
-  cl_context_delete(sampler->ctx);
+  cl_context_remove_sampler(sampler->ctx, sampler);
@@ -137,6 +115,6 @@ LOCAL void
 cl_sampler_add_ref(cl_sampler sampler)
-  atomic_inc(&sampler->ref_n);
+  CL_OBJECT_INC_REF(sampler);
diff --git a/src/cl_sampler.h b/src/cl_sampler.h
index fc4b7e7..ce06eb4 100644
--- a/src/cl_sampler.h
+++ b/src/cl_sampler.h
@@ -21,37 +21,32 @@
 #define __CL_SAMPLER_H__
 #include "CL/cl.h"
+#include "cl_base_object.h"
 #include "../backend/src/ocl_common_defines.h"
 #include <stdint.h>
 /* How to access images */
-struct _cl_sampler {
-  DEFINE_ICD(dispatch)
-  uint64_t magic;            /* To identify it as a sampler object */
-  volatile int ref_n;        /* This object is reference counted */
-  cl_sampler prev, next;     /* We chain the samplers in the allocator */
-  cl_context ctx;            /* Context it belongs to */
-  cl_bool normalized_coords; /* Are coordinates normalized? */
-  cl_addressing_mode address;/* CLAMP / REPEAT and so on... */
-  cl_filter_mode filter;     /* LINEAR / NEAREST mostly */
+typedef struct _cl_sampler {
+  _cl_base_object base;
+  cl_context ctx;             /* Context it belongs to */
+  cl_bool normalized_coords;  /* Are coordinates normalized? */
+  cl_addressing_mode address; /* CLAMP / REPEAT and so on... */
+  cl_filter_mode filter;      /* LINEAR / NEAREST mostly */
   uint32_t clkSamplerValue;
+} _cl_sampler;
-/* Create a new sampler object */
-extern cl_sampler cl_sampler_new(cl_context,
-                                 cl_bool,
-                                 cl_addressing_mode,
-                                 cl_filter_mode,
-                                 cl_int *err);
+#define CL_OBJECT_SAMPLER_MAGIC 0x686a0ecba79ce32fLL
+#define CL_OBJECT_IS_SAMPLER(obj) ((obj &&                                                     \
+                                    ((cl_base_object)obj)->magic == CL_OBJECT_SAMPLER_MAGIC && \
+                                    CL_OBJECT_GET_REF(obj) >= 1))
+/* Create a new sampler object */
+extern cl_sampler cl_create_sampler(cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int *err);
 /* Unref the object and delete it if no more reference on it */
 extern void cl_sampler_delete(cl_sampler);
 /* Add one more reference to this object */
 extern void cl_sampler_add_ref(cl_sampler);
 /* set a sampler kernel argument */
 int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler);
 #endif /* __CL_SAMPLER_H__ */
diff --git a/src/cl_thread.c b/src/cl_thread.c
deleted file mode 100644
index 0780513..0000000
--- a/src/cl_thread.c
+++ /dev/null
@@ -1,329 +0,0 @@
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- */
-#include <string.h>
-#include <stdio.h>
-#include "cl_thread.h"
-#include "cl_alloc.h"
-#include "cl_utils.h"
-/* Because the cl_command_queue can be used in several threads simultaneously but
-   without add ref to it, we now handle it like this:
-   Keep one threads_slot_array, every time the thread get gpgpu or batch buffer, if it
-   does not have a slot, assign it.
-   The resources are keeped in queue private, and resize it if needed.
-   When the thread exit, the slot will be set invalid.
-   When queue released, all the resources will be released. If user still enqueue, flush
-   or finish the queue after it has been released, the behavior is undefined.
-   TODO: Need to shrink the slot map.
-   */
-static int thread_array_num = 1;
-static int *thread_slot_map = NULL;
-static int thread_magic_num = 1;
-static pthread_mutex_t thread_queue_map_lock = PTHREAD_MUTEX_INITIALIZER;
-typedef struct _thread_spec_data {
-  cl_gpgpu gpgpu ;
-  int valid;
-  void* thread_batch_buf;
-  cl_event last_event;
-  cl_event current_event;
-  int thread_magic;
-} thread_spec_data;
-typedef struct _queue_thread_private {
-  thread_spec_data**  threads_data;
-  int threads_data_num;
-  pthread_mutex_t thread_data_lock;
-} queue_thread_private;
-static pthread_once_t key_once = PTHREAD_ONCE_INIT;
-static pthread_key_t thread_id_key;
-static pthread_key_t thread_magic_key;
-static void create_thread_key()
-  pthread_key_create(&thread_id_key, NULL);
-  pthread_key_create(&thread_magic_key, NULL);
-static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int create)
-  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
-  thread_spec_data* spec = NULL;
-  int i = 0;
-  int *id = NULL, *magic = NULL;
-  pthread_once(&key_once, create_thread_key);
-  id = pthread_getspecific(thread_id_key);
-  if(id == NULL) {
-    id = (int *)malloc(sizeof(int));
-    *id = -1;
-    pthread_setspecific(thread_id_key, id);
-  }
-  magic = pthread_getspecific(thread_magic_key);
-  if(magic == NULL) {
-    magic = (int *)malloc(sizeof(int));
-    *magic = -1;
-    pthread_setspecific(thread_magic_key, magic);
-  }
-  if (*id == -1) {
-    pthread_mutex_lock(&thread_queue_map_lock);
-    for (i = 0; i < thread_array_num; i++) {
-      if (thread_slot_map[i] == 0) {
-        *id = i;
-        break;
-      }
-    }
-    if (i == thread_array_num) {
-      thread_array_num *= 2;
-      thread_slot_map = realloc(thread_slot_map, sizeof(int) * thread_array_num);
-      if(thread_slot_map == NULL) {
-        pthread_mutex_unlock(&thread_queue_map_lock);
-        return NULL;
-      }
-      memset(thread_slot_map + thread_array_num/2, 0, sizeof(int) * (thread_array_num/2));
-      *id = thread_array_num/2;
-    }
-    thread_slot_map[*id] = 1;
-    *magic = thread_magic_num++;
-    pthread_mutex_unlock(&thread_queue_map_lock);
-  }
-  pthread_mutex_lock(&thread_private->thread_data_lock);
-  if (thread_array_num > thread_private->threads_data_num) {// just enlarge
-    int old_num = thread_private->threads_data_num;
-    thread_private->threads_data_num = thread_array_num;
-    thread_private->threads_data = realloc(thread_private->threads_data,
-                thread_private->threads_data_num * sizeof(void *));
-    if(thread_private->threads_data == NULL) {
-      pthread_mutex_unlock(&thread_private->thread_data_lock);
-      return NULL;
-    }
-    memset(thread_private->threads_data + old_num, 0,
-           sizeof(void*) * (thread_private->threads_data_num - old_num));
-  }
-  assert(*id != -1 && *id < thread_array_num);
-  spec = thread_private->threads_data[*id];
-  if (!spec && create) {
-       spec = CALLOC(thread_spec_data);
-       spec->thread_magic = *magic;
-       thread_private->threads_data[*id] = spec;
-  }
-  pthread_mutex_unlock(&thread_private->thread_data_lock);
-  return spec;
-cl_event get_current_event(cl_command_queue queue)
-  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
-  int *magic = pthread_getspecific(thread_magic_key);
-  assert(spec && magic && spec->thread_magic == *magic);
-  return spec->current_event;
-cl_event get_last_event(cl_command_queue queue)
-  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
-  int *magic = pthread_getspecific(thread_magic_key);
-  assert(spec && magic && spec->thread_magic == *magic);
-  return spec->last_event;
-void set_current_event(cl_command_queue queue, cl_event e)
-  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
-  int *magic = pthread_getspecific(thread_magic_key);
-  assert(spec && magic && spec->thread_magic == *magic);
-  spec->current_event = e;
-void set_last_event(cl_command_queue queue, cl_event e)
-  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
-  int *magic = pthread_getspecific(thread_magic_key);
-  assert(spec && magic && spec->thread_magic == *magic);
-  spec->last_event = e;
-void* cl_thread_data_create(void)
-  queue_thread_private* thread_private = CALLOC(queue_thread_private);
-  if (thread_private == NULL)
-    return NULL;
-  if (thread_slot_map == NULL) {
-    pthread_mutex_lock(&thread_queue_map_lock);
-    thread_slot_map = calloc(thread_array_num, sizeof(int));
-    pthread_mutex_unlock(&thread_queue_map_lock);
-  }
-  pthread_mutex_init(&thread_private->thread_data_lock, NULL);
-  pthread_mutex_lock(&thread_private->thread_data_lock);
-  thread_private->threads_data = malloc(thread_array_num * sizeof(void *));
-  memset(thread_private->threads_data, 0, sizeof(void*) * thread_array_num);
-  thread_private->threads_data_num = thread_array_num;
-  pthread_mutex_unlock(&thread_private->thread_data_lock);
-  return thread_private;
-cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
-  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
-  if(!spec)
-    return NULL;
-  int *magic = pthread_getspecific(thread_magic_key);
-  assert(magic);
-  if (!spec->thread_magic && spec->thread_magic != *magic) {
-    //We may get the slot from last thread. So free the resource.
-    spec->valid = 0;
-  }
-  if (!spec->valid) {
-    if (spec->thread_batch_buf) {
-      cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
-      spec->thread_batch_buf = NULL;
-    }
-    if (spec->gpgpu) {
-      cl_gpgpu_delete(spec->gpgpu);
-      spec->gpgpu = NULL;
-    }
-    TRY_ALLOC_NO_ERR(spec->gpgpu, cl_gpgpu_new(queue->ctx->drv));
-    spec->valid = 1;
-  }
- error:
-  return spec->gpgpu;
-void cl_set_thread_batch_buf(cl_command_queue queue, void* buf)
-  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
-  int *magic = pthread_getspecific(thread_magic_key);
-  assert(spec && magic && spec->thread_magic == *magic);
-  if (spec->thread_batch_buf) {
-    cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
-  }
-  spec->thread_batch_buf = buf;
-void* cl_get_thread_batch_buf(cl_command_queue queue) {
-  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
-  int *magic = pthread_getspecific(thread_magic_key);
-  assert(spec && magic && spec->thread_magic == *magic);
-  return spec->thread_batch_buf;
-void cl_invalid_thread_gpgpu(cl_command_queue queue)
-  int *id = pthread_getspecific(thread_id_key);
-  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
-  thread_spec_data* spec = NULL;
-  pthread_mutex_lock(&thread_private->thread_data_lock);
-  assert(id);
-  spec = thread_private->threads_data[*id];
-  assert(spec);
-  pthread_mutex_unlock(&thread_private->thread_data_lock);
-  if (!spec->valid) {
-    return;
-  }
-  assert(spec->gpgpu);
-  cl_gpgpu_delete(spec->gpgpu);
-  spec->gpgpu = NULL;
-  spec->valid = 0;
-cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue)
-  int *id = pthread_getspecific(thread_id_key);
-  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
-  thread_spec_data* spec = NULL;
-  pthread_mutex_lock(&thread_private->thread_data_lock);
-  assert(id);
-  spec = thread_private->threads_data[*id];
-  assert(spec);
-  pthread_mutex_unlock(&thread_private->thread_data_lock);
-  if (!spec->valid)
-    return NULL;
-  assert(spec->gpgpu);
-  cl_gpgpu gpgpu = spec->gpgpu;
-  spec->gpgpu = NULL;
-  spec->valid = 0;
-  return gpgpu;
-/* The destructor for clean the thread specific data. */
-void cl_thread_data_destroy(cl_command_queue queue)
-  int i = 0;
-  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
-  int threads_data_num;
-  thread_spec_data** threads_data;
-  pthread_mutex_lock(&thread_private->thread_data_lock);
-  threads_data_num = thread_private->threads_data_num;
-  threads_data = thread_private->threads_data;
-  thread_private->threads_data_num = 0;
-  thread_private->threads_data = NULL;
-  pthread_mutex_unlock(&thread_private->thread_data_lock);
-  cl_free(thread_private);
-  queue->thread_data = NULL;
-  for (i = 0; i < threads_data_num; i++) {
-    if (threads_data[i] != NULL && threads_data[i]->thread_batch_buf) {
-      cl_gpgpu_unref_batch_buf(threads_data[i]->thread_batch_buf);
-      threads_data[i]->thread_batch_buf = NULL;
-    }
-    if (threads_data[i] != NULL && threads_data[i]->valid) {
-      cl_gpgpu_delete(threads_data[i]->gpgpu);
-      threads_data[i]->gpgpu = NULL;
-      threads_data[i]->valid = 0;
-    }
-    cl_free(threads_data[i]);
-  }
-  cl_free(threads_data);
diff --git a/src/cl_thread.h b/src/cl_thread.h
deleted file mode 100644
index d77526b..0000000
--- a/src/cl_thread.h
+++ /dev/null
@@ -1,52 +0,0 @@
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- */
-#ifndef __CL_THREAD_H__
-#define __CL_THREAD_H__
-#include <pthread.h>
-#include "cl_internals.h"
-#include "cl_command_queue.h"
-/* Create the thread specific data. */
-void* cl_thread_data_create(void);
-/* The destructor for clean the thread specific data. */
-void cl_thread_data_destroy(cl_command_queue queue);
-/* Used to get the gpgpu struct of each thread. */
-cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
-/* Used to release the gpgpu struct of each thread. */
-void cl_invalid_thread_gpgpu(cl_command_queue queue);
-/* Used to set the batch buffer of each thread. */
-void cl_set_thread_batch_buf(cl_command_queue queue, void* buf);
-/* Used to get the batch buffer of each thread. */
-void* cl_get_thread_batch_buf(cl_command_queue queue);
-/* take current gpgpu from the thread gpgpu pool. */
-cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue);
-cl_event get_current_event(cl_command_queue queue);
-cl_event get_last_event(cl_command_queue queue);
-void set_current_event(cl_command_queue queue, cl_event e);
-void set_last_event(cl_command_queue queue, cl_event e);
-#endif /* __CL_THREAD_H__ */
diff --git a/src/cl_utils.c b/src/cl_utils.c
new file mode 100644
index 0000000..38de1ea
--- /dev/null
+++ b/src/cl_utils.c
@@ -0,0 +1,86 @@
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "cl_utils.h"
+#include <string.h>
+#include <assert.h>
+LOCAL void
+list_node_insert_before(struct list_node *node, struct list_node *the_new)
+  list_node *before_node = node->p;
+  the_new->p = before_node;
+  the_new->n = node;
+  node->p = the_new;
+  before_node->n = the_new;
+LOCAL void
+list_node_insert_after(struct list_node *node, struct list_node *the_new)
+  list_node *after_node = node->n;
+  the_new->n = after_node;
+  the_new->p = node;
+  node->n = the_new;
+  after_node->p = the_new;
+LOCAL void
+list_move(struct list_head *the_old, struct list_head *the_new)
+  assert(list_empty(the_new));
+  if (list_empty(the_old)) {
+    return;
+  }
+  memcpy(&the_new->head_node, &the_old->head_node, sizeof(list_node));
+  the_new->head_node.n->p = &the_new->head_node;
+  the_new->head_node.p->n = &the_new->head_node;
+  list_init(the_old);
+LOCAL void
+list_merge(struct list_head *head, struct list_head *to_merge)
+  if (list_empty(to_merge))
+    return;
+  list_node *merge_last_node = to_merge->head_node.p;
+  list_node *merge_first_node = to_merge->head_node.n;
+  merge_last_node->n = &head->head_node;
+  merge_first_node->p = head->head_node.p;
+  head->head_node.p->n = merge_first_node;
+  head->head_node.p = merge_last_node;
+  list_init(to_merge);
+LOCAL cl_int
+cl_get_info_helper(const void *src, size_t src_size, void *dst, size_t dst_size, size_t *ret_size)
+  if (dst && dst_size < src_size)
+    return CL_INVALID_VALUE;
+  if (dst && dst_size) {
+    memcpy(dst, src, src_size);
+  }
+  if (ret_size)
+    *ret_size = src_size;
+  return CL_SUCCESS;
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 2926611..2d24207 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -19,6 +19,7 @@
 #ifndef __CL_UTILS_H__
 #define __CL_UTILS_H__
+#include "CL/cl.h"
 /* INLINE is forceinline */
 #define INLINE __attribute__((always_inline)) inline
@@ -124,7 +125,7 @@ do {                                                        \
     err = CL_INVALID_CONTEXT;                               \
     goto error;                                             \
   }                                                         \
-  if (UNLIKELY(CTX->magic != CL_MAGIC_CONTEXT_HEADER)) {    \
+  if (UNLIKELY(!CL_OBJECT_IS_CONTEXT(CTX))) {              \
     err = CL_INVALID_CONTEXT;                               \
     goto error;                                             \
   }                                                         \
@@ -136,7 +137,7 @@ do {                                                        \
     err = CL_INVALID_COMMAND_QUEUE;                         \
     goto error;                                             \
   }                                                         \
-  if (UNLIKELY(QUEUE->magic != CL_MAGIC_QUEUE_HEADER)) {    \
     err = CL_INVALID_COMMAND_QUEUE;                         \
     goto error;                                             \
   }                                                         \
@@ -148,7 +149,7 @@ do {                                                        \
     err = CL_INVALID_MEM_OBJECT;                            \
     goto error;                                             \
   }                                                         \
-  if (UNLIKELY(MEM->magic != CL_MAGIC_MEM_HEADER)) {        \
+  if (UNLIKELY(!CL_OBJECT_IS_MEM(MEM))) {                  \
     err = CL_INVALID_MEM_OBJECT;                            \
     goto error;                                             \
   }                                                         \
@@ -215,7 +216,7 @@ do {                                                        \
       err = CL_INVALID_EVENT;                                 \
       goto error;                                             \
     }                                                         \
-    if (UNLIKELY(EVENT->magic != CL_MAGIC_EVENT_HEADER)) {    \
+    if (UNLIKELY(!CL_OBJECT_IS_EVENT(EVENT))) {              \
       err = CL_INVALID_EVENT;                                 \
       goto error;                                             \
     }                                                         \
@@ -227,7 +228,7 @@ do {                                                        \
     err = CL_INVALID_SAMPLER;                               \
     goto error;                                             \
   }                                                         \
     err = CL_INVALID_SAMPLER;                               \
     goto error;                                             \
   }                                                         \
@@ -239,7 +240,7 @@ do {
     err = CL_INVALID_ACCELERATOR_INTEL;                                         \
     goto error;                                                                 \
   }                                                                             \
     err = CL_INVALID_ACCELERATOR_INTEL;                                         \
     goto error;                                                                 \
   }                                                                             \
@@ -251,7 +252,7 @@ do {                                                        \
     err = CL_INVALID_KERNEL;                                \
     goto error;                                             \
   }                                                         \
+  if (UNLIKELY(!CL_OBJECT_IS_KERNEL(KERNEL))) {            \
     err = CL_INVALID_KERNEL;                                \
     goto error;                                             \
   }                                                         \
@@ -263,7 +264,7 @@ do {                                                        \
     err = CL_INVALID_PROGRAM;                               \
     goto error;                                             \
   }                                                         \
     err = CL_INVALID_PROGRAM;                               \
     goto error;                                             \
   }                                                         \
@@ -351,9 +352,80 @@ static INLINE int atomic_add(atomic_t *v, const int c) {
       : "m"(*v), "r"(i));
   return i;
+static INLINE int atomic_read(atomic_t *v) {
+  return *v;
 static INLINE int atomic_inc(atomic_t *v) { return atomic_add(v, 1); }
 static INLINE int atomic_dec(atomic_t *v) { return atomic_add(v, -1); }
-#endif /* __CL_UTILS_H__ */
+/* Define one list node. */
+typedef struct list_node {
+  struct list_node *n;
+  struct list_node *p;
+} list_node;
+typedef struct list_head {
+  list_node head_node;
+} list_head;
+static inline void list_node_init(list_node *node)
+  node->n = node;
+  node->p = node;
+static inline int list_node_out_of_list(const struct list_node *node)
+  return node->n == node;
+static inline void list_init(list_head *head)
+  head->head_node.n = &head->head_node;
+  head->head_node.p = &head->head_node;
+extern void list_node_insert_before(list_node *node, list_node *the_new);
+extern void list_node_insert_after(list_node *node, list_node *the_new);
+static inline void list_node_del(struct list_node *node)
+  node->n->p = node->p;
+  node->p->n = node->n;
+  /* Then point both links back at the node itself, for safety. */
+  node->p = node;
+  node->n = node;
+static inline void list_add(list_head *head, list_node *the_new)
+  list_node_insert_after(&head->head_node, the_new);
+static inline void list_add_tail(list_head *head, list_node *the_new)
+  list_node_insert_before(&head->head_node, the_new);
+static inline int list_empty(const struct list_head *head)
+  return head->head_node.n == &head->head_node;
+/* Move the content from one head to another. */
+extern void list_move(struct list_head *the_old, struct list_head *the_new);
+/* Merge the content of the two lists to one head. */
+extern void list_merge(struct list_head *head, struct list_head *to_merge);
+#undef offsetof
+#ifdef __compiler_offsetof
+#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
+#define offsetof(TYPE, MEMBER) ((size_t) & ((TYPE *)0)->MEMBER)
+#define list_entry(ptr, type, member) ({                      \
+      const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
+      (type *)( (char *)__mptr - offsetof(type,member) ); })
+#define list_for_each(pos, head) \
+  for (pos = (head)->head_node.n; pos != &((head)->head_node); pos = pos->n)
+#define list_for_each_safe(pos, ne, head)                                   \
+  for (pos = (head)->head_node.n, ne = pos->n; pos != &((head)->head_node); \
+       pos = ne, ne = pos->n)
+extern cl_int cl_get_info_helper(const void *src, size_t src_size, void *dst,
+                                 size_t dst_size, size_t *ret_size);
+#endif /* __CL_UTILS_H__ */
diff --git a/src/intel/intel_cl_gl_share_image_info.h b/src/intel/intel_cl_gl_share_image_info.h
new file mode 100644
index 0000000..21fbbd1
--- /dev/null
+++ b/src/intel/intel_cl_gl_share_image_info.h
@@ -0,0 +1,18 @@
+struct _intel_cl_gl_share_image_info {
+  int fd;
+  size_t w;
+  size_t h;
+  size_t depth;
+  size_t pitch;
+  int tiling;
+  size_t offset;
+  size_t tile_x;
+  size_t tile_y;
+  unsigned int gl_format;
+  size_t row_pitch, slice_pitch;
diff --git a/src/intel/intel_dri_resource_sharing.c b/src/intel/intel_dri_resource_sharing.c
deleted file mode 100644
index 188c1fa..0000000
--- a/src/intel/intel_dri_resource_sharing.c
+++ /dev/null
@@ -1,208 +0,0 @@
- *
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
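- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *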
- **************************************************************************/
-#define HAVE_PTHREAD 1
-#include <errno.h>
-#include <time.h>
-#include "main/context.h"
-#include "main/renderbuffer.h"
-#include "main/texobj.h"
-#include <stdbool.h>
-#include <string.h>
-#include <drm.h>
-#include <i915_drm.h>
-#include <intel_bufmgr.h>
-#include <GL/internal/dri_interface.h>
-#include "intel_mipmap_tree.h"
-#include "intel_regions.h"
-#include "intel_context.h"
-#include "intel_dri_resource_sharing.h"
-#include "intel_dri_resource_sharing_int.h"
-#include <dlfcn.h>
- * Sets up a DRIImage structure to point to our shared image in a region
- */
-static bool
-intel_setup_cl_region_from_mipmap_tree(void *driver,
-                                       struct intel_context *intel,
-                                       struct intel_mipmap_tree *mt,
-                                       GLuint level, GLuint zoffset,
-                                       struct _intel_dri_share_image_region *region)
-   unsigned int draw_x, draw_y;
-   uint32_t mask_x, mask_y;
-   struct intel_region *null_region = (struct intel_region *)NULL;
-   intel_miptree_check_level_layer(mt, level, zoffset);
-   _intel_region_get_tile_masks(mt->region, &mask_x, &mask_y, false);
-   _intel_miptree_get_image_offset(mt, level, zoffset, &draw_x, &draw_y);
-   region->w = mt->level[level].width;
-   region->h = mt->level[level].height;
-   region->tile_x = draw_x & mask_x;
-   region->tile_y = draw_y & mask_y;
-   region->tiling = mt->region->tiling;
-   /* XXX hard code to 1 right now. */
-   region->depth = 1;
-   region->row_pitch = mt->region->pitch;
-   region->offset = _intel_region_get_aligned_offset(mt->region,
-                                                     draw_x & ~mask_x,
-                                                     draw_y & ~mask_y,
-                                                     false);
-   if (!_intel_region_flink(mt->region, &region->name))
-      return false;
-   _intel_region_reference(&null_region, mt->region);
-   return true;
-typedef void
-_mesa_test_texobj_completeness_t( const struct gl_context *ctx,
-                                struct gl_texture_object *t );
-_mesa_test_texobj_completeness_t *__mesa_test_texobj_completeness;
-typedef struct gl_texture_object *
-_mesa_lookup_texture_t( const struct gl_context *ctx, GLuint id);
-_mesa_lookup_texture_t *__mesa_lookup_texture;
-static struct gl_texture_object *
-intel_get_gl_obj_from_texture(void *driver,
-                              struct intel_context *intel,
-                              GLenum target, GLint level,
-                              GLuint texture, GLuint face)
-   struct gl_texture_object *obj;
-   __mesa_lookup_texture = dlsym(driver, "_mesa_lookup_texture");
-   obj = __mesa_lookup_texture(&intel->ctx, texture);
-   if (!obj || obj->Target != target) {
-      return NULL;
-   }
-   __mesa_test_texobj_completeness = dlsym(driver, "_mesa_test_texobj_completeness");
-   __mesa_test_texobj_completeness(&intel->ctx, obj);
-   if (!obj->_BaseComplete || (level > 0 && !obj->_MipmapComplete)) {
-      return NULL;
-   }
-   if (level < obj->BaseLevel || level > obj->_MaxLevel) {
-      return NULL;
-   }
-   return obj;
-static GLenum
-get_cl_gl_format(mesa_format format)
-   switch (format) {
-      return GL_RGBA;
-      return GL_BGRA;
-   default:
-      return GL_BGRA;
-  }
-static bool
-intelAcquireTexture(void *driver, __DRIcontext *context, GLenum target,
-                    GLint level, GLuint texture, void *user_data)
-   struct _intel_dri_share_image_region *region = intel_dri_share_image_region(user_data);
-   struct intel_context *intel = context->driverPrivate;
-   struct gl_texture_object *obj;
-   struct intel_texture_object *iobj;
-   /* XXX Always be face 0? */
-   GLuint face = 0;
-   obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
-   if (obj == NULL)
-     return false;
-   iobj = intel_texture_object(obj);
-   region->gl_format = get_cl_gl_format(obj->Image[face][level]->TexFormat);
-   return intel_setup_cl_region_from_mipmap_tree(driver, intel, iobj->mt, level, 0, region);
-static bool
-intelReleaseTexture(void *driver, __DRIcontext *context, GLenum target,
-                    GLint level, GLuint texture)
-   struct intel_context *intel = context->driverPrivate;
-   struct gl_texture_object *obj;
-   struct intel_texture_object *iobj;
-   /* XXX Always be face 0? */
-   GLuint face = 0;
-   obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
-   if (obj == NULL)
-     return false;
-   iobj = intel_texture_object(obj);
-   _intel_region_release(&iobj->mt->region);
-   return true;
-static bool
-intelAcquireBufferObj(void *driver, __DRIcontext *driContextPriv,
-                      GLuint bufobj, void *user_data)
-  return false;
-static bool
-intelReleaseBufferObj(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
-  return false;
-static bool
-intelAcquireRenderBuffer(void *driver, __DRIcontext *driContextPriv,
-                         GLuint bufobj, void *user_data)
-  return false;
-static bool
-intelReleaseRenderBuffer(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
-  return false;
-#include "cl_driver.h"
-  cl_gl_acquire_texture = (cl_gl_acquire_texture_cb*)intelAcquireTexture;
-  cl_gl_release_texture = (cl_gl_release_texture_cb*)intelReleaseTexture;
-  cl_gl_acquire_buffer_object = (cl_gl_acquire_buffer_object_cb*)intelAcquireBufferObj;
-  cl_gl_release_buffer_object = (cl_gl_release_buffer_object_cb*)intelReleaseBufferObj;
-  cl_gl_acquire_render_buffer = (cl_gl_acquire_render_buffer_cb*)intelAcquireRenderBuffer;
-  cl_gl_release_render_buffer = (cl_gl_release_render_buffer_cb*)intelReleaseRenderBuffer;
diff --git a/src/intel/intel_dri_resource_sharing.h b/src/intel/intel_dri_resource_sharing.h
deleted file mode 100644
index 6d2ce4d..0000000
--- a/src/intel/intel_dri_resource_sharing.h
+++ /dev/null
@@ -1,39 +0,0 @@
-struct _intel_dri_share_image_region {
-  unsigned int name;
-  size_t w;
-  size_t h;
-  size_t depth;
-  size_t pitch;
-  int tiling;
-  size_t offset;
-  size_t tile_x;
-  size_t tile_y;
-  unsigned int gl_format;
-  size_t row_pitch, slice_pitch;
-struct _intel_dri_share_buffer_object {
-  unsigned int name;
-  size_t sz;
-  size_t offset;
-inline static struct _intel_dri_share_image_region *
-intel_dri_share_image_region(void *user_data)
-   return (struct _intel_dri_share_image_region *)user_data;
-inline static struct _intel_dri_share_buffer_object *
-intel_dri_share_buffer_object(void *user_data)
-   return (struct _intel_dri_share_buffer_object *)user_data;
-extern void intel_set_cl_gl_callbacks(void);
diff --git a/src/intel/intel_dri_resource_sharing_int.h b/src/intel/intel_dri_resource_sharing_int.h
deleted file mode 100644
index c7b283a..0000000
--- a/src/intel/intel_dri_resource_sharing_int.h
+++ /dev/null
@@ -1,143 +0,0 @@
- * The following functions are copied from i965 driver, commit
- * id 292368570a13501dfa95b1b0dd70966caf6ffc6b. Need to keep consistant
- * with the dri driver installed on current system.
- *****************************************************************/
-static bool
-_intel_region_flink(struct intel_region *region, uint32_t *name)
-   if (region->name == 0) {
-      if (drm_intel_bo_flink(region->bo, &region->name))
-         return false;
-   }
-   *name = region->name;
-   return true;
-#define _DBG(...)
-static void
-_intel_region_release(struct intel_region **region_handle)
-   struct intel_region *region = *region_handle;
-   if (region == NULL) {
-      _DBG("%s NULL\n", __FUNCTION__);
-      return;
-   }
-   _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
-   ASSERT(region->refcount > 0);
-   region->refcount--;
-   if (region->refcount == 0) {
-      drm_intel_bo_unreference(region->bo);
-      free(region);
-   }
-   *region_handle = NULL;
-static void
-_intel_region_reference(struct intel_region **dst, struct intel_region *src)
-   _DBG("%s: %p(%d) -> %p(%d)\n", __FUNCTION__,
-        *dst, *dst ? (*dst)->refcount : 0, src, src ? src->refcount : 0);
-   if (src != *dst) {
-      if (*dst)
-         _intel_region_release(dst);
-      if (src)
-         src->refcount++;
-      *dst = src;
-   }
- * This function computes masks that may be used to select the bits of the X
- * and Y coordinates that indicate the offset within a tile.  If the region is
- * untiled, the masks are set to 0.
- */
-static void
-_intel_region_get_tile_masks(struct intel_region *region,
-                             uint32_t *mask_x, uint32_t *mask_y,
-                             bool map_stencil_as_y_tiled)
-   int cpp = region->cpp;
-   uint32_t tiling = region->tiling;
-   if (map_stencil_as_y_tiled)
-      tiling = I915_TILING_Y;
-   switch (tiling) {
-   default:
-      assert(false);
-   case I915_TILING_NONE:
-      *mask_x = *mask_y = 0;
-      break;
-   case I915_TILING_X:
-      *mask_x = 512 / cpp - 1;
-      *mask_y = 7;
-      break;
-   case I915_TILING_Y:
-      *mask_x = 128 / cpp - 1;
-      *mask_y = 31;
-      break;
-   }
- * Compute the offset (in bytes) from the start of the region to the given x
- * and y coordinate.  For tiled regions, caller must ensure that x and y are
- * multiples of the tile size.
- */
-static uint32_t
-_intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
-                                 uint32_t y, bool map_stencil_as_y_tiled)
-   int cpp = region->cpp;
-   uint32_t pitch = region->pitch;
-   uint32_t tiling = region->tiling;
-   if (map_stencil_as_y_tiled) {
-      tiling = I915_TILING_Y;
-      /* When mapping a W-tiled stencil buffer as Y-tiled, each 64-high W-tile
-       * gets transformed into a 32-high Y-tile.  Accordingly, the pitch of
-       * the resulting region is twice the pitch of the original region, since
-       * each row in the Y-tiled view corresponds to two rows in the actual
-       * W-tiled surface.  So we need to correct the pitch before computing
-       * the offsets.
-       */
-      pitch *= 2;
-   }
-   switch (tiling) {
-   default:
-      assert(false);
-   case I915_TILING_NONE:
-      return y * pitch + x * cpp;
-   case I915_TILING_X:
-      assert((x % (512 / cpp)) == 0);
-      assert((y % 8) == 0);
-      return y * pitch + x / (512 / cpp) * 4096;
-   case I915_TILING_Y:
-      assert((x % (128 / cpp)) == 0);
-      assert((y % 32) == 0);
-      return y * pitch + x / (128 / cpp) * 4096;
-   }
-static void
-_intel_miptree_get_image_offset(struct intel_mipmap_tree *mt,
-                                GLuint level, GLuint slice,
-                                GLuint *x, GLuint *y)
-   assert(slice < mt->level[level].depth);
-   *x = mt->level[level].slice[slice].x_offset;
-   *y = mt->level[level].slice[slice].y_offset;
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 7a46c1d..b8a1b52 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -46,10 +46,11 @@
-#if defined(HAS_EGL)
+#if defined(HAS_GL_EGL)
 #include "GL/gl.h"
 #include "EGL/egl.h"
-#include "x11/mesa_egl_extension.h"
+#include <EGL/eglext.h>
 #ifdef HAS_X11
@@ -99,9 +100,9 @@ intel_driver_new(void)
   return driver;
-  intel_driver_delete(driver);
-  driver = NULL;
-  goto exit;
+driver = NULL;
+goto exit;
 /* just used for maximum relocation number in drm_intel */
@@ -111,372 +112,385 @@ error:
 static void
 intel_driver_aub_dump(intel_driver_t *driver)
-  char *val;
-  val = getenv("OCL_DUMP_AUB");
-  if (!val)
-    return;
-  if (atoi(val) != 0) {
-    drm_intel_bufmgr_gem_set_aub_filename(driver->bufmgr,
-					  "beignet.aub");
-    drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
-  }
+char *val;
+val = getenv("OCL_DUMP_AUB");
+if (!val)
+  return;
+if (atoi(val) != 0) {
+  drm_intel_bufmgr_gem_set_aub_filename(driver->bufmgr,
+          "beignet.aub");
+  drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
 static int
 intel_driver_memman_init(intel_driver_t *driver)
-  driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
-  if (!driver->bufmgr) return 0;
-  drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
-  driver->device_id = drm_intel_bufmgr_gem_get_devid(driver->bufmgr);
-  intel_driver_aub_dump(driver);
-  return 1;
+driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
+if (!driver->bufmgr) return 0;
+driver->device_id = drm_intel_bufmgr_gem_get_devid(driver->bufmgr);
+return 1;
-static void
+static int
 intel_driver_context_init(intel_driver_t *driver)
-  driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
-  assert(driver->ctx);
+driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
+if (!driver->ctx)
+  return 0;
+driver->null_bo = NULL;
+drm_intel_bo *bo = dri_bo_alloc(driver->bufmgr, "null_bo", 64*1024, 4096);
+drm_intel_bo_set_softpin_offset(bo, 0);
+// Don't reuse it: that would leave two BOs trying to bind to the same address,
+// which is unreasonable.
+driver->null_bo = bo;
+return 1;
 static void
 intel_driver_context_destroy(intel_driver_t *driver)
-  if(driver->ctx)
-    drm_intel_gem_context_destroy(driver->ctx);
-  driver->ctx = NULL;
+if (driver->null_bo)
+  drm_intel_bo_unreference(driver->null_bo);
+  drm_intel_gem_context_destroy(driver->ctx);
+driver->ctx = NULL;
 static int
 intel_driver_init(intel_driver_t *driver, int dev_fd)
-  driver->fd = dev_fd;
-  driver->locked = 0;
-  pthread_mutex_init(&driver->ctxmutex, NULL);
+driver->fd = dev_fd;
+driver->locked = 0;
+pthread_mutex_init(&driver->ctxmutex, NULL);
-  if (!intel_driver_memman_init(driver)) return 0;
-  intel_driver_context_init(driver);
+if (!intel_driver_memman_init(driver)) return 0;
+if (!intel_driver_context_init(driver)) return 0;
-  driver->gen_ver = EMULATE_GEN;
-  if (EMULATE_GEN == 75)
-    driver->device_id = PCI_CHIP_HASWELL_L;       /* we pick L for HSW */
-  else if (EMULATE_GEN == 7)
-    driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */
-  else if (EMULATE_GEN == 6)
-    driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */
-  else
-    FATAL ("Unsupported Gen for emulation");
+driver->gen_ver = EMULATE_GEN;
+if (EMULATE_GEN == 75)
+  driver->device_id = PCI_CHIP_HASWELL_L;       /* we pick L for HSW */
+else if (EMULATE_GEN == 7)
+  driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */
+else if (EMULATE_GEN == 6)
+  driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */
+  FATAL ("Unsupported Gen for emulation");
-  if (IS_GEN9(driver->device_id))
-    driver->gen_ver = 9;
-  else if (IS_GEN8(driver->device_id))
-    driver->gen_ver = 8;
-  else if (IS_GEN75(driver->device_id))
-    driver->gen_ver = 75;
-  else if (IS_GEN7(driver->device_id))
-    driver->gen_ver = 7;
-  else if (IS_GEN6(driver->device_id))
-    driver->gen_ver = 6;
-  else if(IS_IGDNG(driver->device_id))
-    driver->gen_ver = 5;
-  else
-    driver->gen_ver = 4;
+if (IS_GEN9(driver->device_id))
+  driver->gen_ver = 9;
+else if (IS_GEN8(driver->device_id))
+  driver->gen_ver = 8;
+else if (IS_GEN75(driver->device_id))
+  driver->gen_ver = 75;
+else if (IS_GEN7(driver->device_id))
+  driver->gen_ver = 7;
+else if (IS_GEN6(driver->device_id))
+  driver->gen_ver = 6;
+else if(IS_IGDNG(driver->device_id))
+  driver->gen_ver = 5;
+  driver->gen_ver = 4;
 #endif /* EMULATE_GEN */
-  return 1;
+return 1;
 static cl_int
 intel_driver_open(intel_driver_t *intel, cl_context_prop props)
-  int cardi;
+int cardi;
 #ifdef HAS_X11
-  char *driver_name;
+char *driver_name;
-  if (props != NULL
-      && props->gl_type != CL_GL_NOSHARE
-      && props->gl_type != CL_GL_GLX_DISPLAY
-      && props->gl_type != CL_GL_EGL_DISPLAY) {
-    fprintf(stderr, "Unsupported gl share type %d.\n", props->gl_type);
-  }
+if (props != NULL
+    && props->gl_type != CL_GL_NOSHARE
+    && props->gl_type != CL_GL_GLX_DISPLAY
+    && props->gl_type != CL_GL_EGL_DISPLAY) {
+  fprintf(stderr, "Unsupported gl share type %d.\n", props->gl_type);
 #ifdef HAS_X11
-  intel->x11_display = XOpenDisplay(NULL);
-  if(intel->x11_display) {
-    if((intel->dri_ctx = getDRI2State(intel->x11_display,
-                                     DefaultScreen(intel->x11_display),
-                                     &driver_name))) {
-      intel_driver_init_shared(intel, intel->dri_ctx);
-      Xfree(driver_name);
-    }
-    else
-      fprintf(stderr, "X server found. dri2 connection failed! \n");
+intel->x11_display = XOpenDisplay(NULL);
+if(intel->x11_display) {
+  if((intel->dri_ctx = getDRI2State(intel->x11_display,
+                                   DefaultScreen(intel->x11_display),
+                                   &driver_name))) {
+    intel_driver_init_shared(intel, intel->dri_ctx);
+    Xfree(driver_name);
+  else
+    fprintf(stderr, "X server found. dri2 connection failed! \n");
-  if(!intel_driver_is_active(intel)) {
-    char card_name[20];
-    for(cardi = 0; cardi < 16; cardi++) {
-      sprintf(card_name, "/dev/dri/renderD%d", 128+cardi);
-      if (access(card_name, R_OK) != 0)
-        continue;
-      if(intel_driver_init_render(intel, card_name))
-        break;
-    }
+if(!intel_driver_is_active(intel)) {
+  char card_name[20];
+  for(cardi = 0; cardi < 16; cardi++) {
+    sprintf(card_name, "/dev/dri/renderD%d", 128+cardi);
+    if (access(card_name, R_OK) != 0)
+      continue;
+    if(intel_driver_init_render(intel, card_name))
+      break;
-  if(!intel_driver_is_active(intel)) {
-    char card_name[20];
-    for(cardi = 0; cardi < 16; cardi++) {
-      sprintf(card_name, "/dev/dri/card%d", cardi);
-      if (access(card_name, R_OK) != 0)
-        continue;
-      if(intel_driver_init_master(intel, card_name))
-        break;
-    }
+if(!intel_driver_is_active(intel)) {
+  char card_name[20];
+  for(cardi = 0; cardi < 16; cardi++) {
+    sprintf(card_name, "/dev/dri/card%d", cardi);
+    if (access(card_name, R_OK) != 0)
+      continue;
+    if(intel_driver_init_master(intel, card_name))
+      break;
-  if(!intel_driver_is_active(intel)) {
-    fprintf(stderr, "Device open failed, aborting...\n");
-    return CL_DEVICE_NOT_FOUND;
-  }
+if(!intel_driver_is_active(intel)) {
+  fprintf(stderr, "Device open failed, aborting...\n");
-#ifdef HAS_EGL
-  if (props && props->gl_type == CL_GL_EGL_DISPLAY) {
-    assert(props->egl_display);
-  }
+#ifdef HAS_GL_EGL
+if (props && props->gl_type == CL_GL_EGL_DISPLAY) {
+  assert(props->egl_display);
-  return CL_SUCCESS;
+return CL_SUCCESS;
 static void
 intel_driver_close(intel_driver_t *intel)
-  //Due to the drm change about the test usrptr, we need to destroy the bufmgr
-  //befor the driver was closed, otherwise the test usrptr will not be freed.
-  if (intel->bufmgr)
-    drm_intel_bufmgr_destroy(intel->bufmgr);
+//Due to the drm change about the test usrptr, we need to destroy the bufmgr
+//before the driver is closed, otherwise the test usrptr will not be freed.
+if (intel->bufmgr)
+  drm_intel_bufmgr_destroy(intel->bufmgr);
 #ifdef HAS_X11
-  if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
-  if(intel->x11_display) XCloseDisplay(intel->x11_display);
+if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
+if(intel->x11_display) XCloseDisplay(intel->x11_display);
-  if(intel->need_close) {
-    close(intel->fd);
-    intel->need_close = 0;
-  }
-  intel->dri_ctx = NULL;
-  intel->x11_display = NULL;
-  intel->fd = -1;
+if(intel->need_close) {
+  close(intel->fd);
+  intel->need_close = 0;
+intel->dri_ctx = NULL;
+intel->x11_display = NULL;
+intel->fd = -1;
 LOCAL int
 intel_driver_is_active(intel_driver_t *driver) {
-  return driver->fd >= 0;
+return driver->fd >= 0;
 #ifdef HAS_X11
 LOCAL int 
 intel_driver_init_shared(intel_driver_t *driver, dri_state_t *state)
-  int ret;
-  assert(state);
-  if(state->driConnectedFlag != DRI2)
-    return 0;
-  ret = intel_driver_init(driver, state->fd);
-  driver->need_close = 0;
-  return ret;
+int ret;
+if(state->driConnectedFlag != DRI2)
+  return 0;
+ret = intel_driver_init(driver, state->fd);
+driver->need_close = 0;
+return ret;
 LOCAL int
 intel_driver_init_master(intel_driver_t *driver, const char* dev_name)
-  int dev_fd, ret;
+int dev_fd, ret;
-  drm_client_t client;
+drm_client_t client;
-  // usually dev_name = "/dev/dri/card%d"
-  dev_fd = open(dev_name, O_RDWR);
-  if (dev_fd == -1) {
-    fprintf(stderr, "open(\"%s\", O_RDWR) failed: %s\n", dev_name, strerror(errno));
-    return 0;
-  }
+// usually dev_name = "/dev/dri/card%d"
+dev_fd = open(dev_name, O_RDWR);
+if (dev_fd == -1) {
+  fprintf(stderr, "open(\"%s\", O_RDWR) failed: %s\n", dev_name, strerror(errno));
+  return 0;
-  // Check that we're authenticated
-  memset(&client, 0, sizeof(drm_client_t));
-  ret = ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client);
-  if (ret == -1) {
-    fprintf(stderr, "ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client) failed: %s\n", strerror(errno));
-    close(dev_fd);
-    return 0;
-  }
+// Check that we're authenticated
+memset(&client, 0, sizeof(drm_client_t));
+ret = ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client);
+if (ret == -1) {
+  fprintf(stderr, "ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client) failed: %s\n", strerror(errno));
+  close(dev_fd);
+  return 0;
-  if (!client.auth) {
-    fprintf(stderr, "%s not authenticated\n", dev_name);
-    close(dev_fd);
-    return 0;
-  }
+if (!client.auth) {
+  fprintf(stderr, "%s not authenticated\n", dev_name);
+  close(dev_fd);
+  return 0;
-  ret = intel_driver_init(driver, dev_fd);
-  driver->need_close = 1;
+ret = intel_driver_init(driver, dev_fd);
+driver->need_close = 1;
-  return ret;
+return ret;
 LOCAL int
 intel_driver_init_render(intel_driver_t *driver, const char* dev_name)
-  int dev_fd, ret;
+int dev_fd, ret;
-  dev_fd = open(dev_name, O_RDWR);
-  if (dev_fd == -1)
-    return 0;
+dev_fd = open(dev_name, O_RDWR);
+if (dev_fd == -1)
+  return 0;
-  ret = intel_driver_init(driver, dev_fd);
-  driver->need_close = 1;
+ret = intel_driver_init(driver, dev_fd);
+driver->need_close = 1;
-  return ret;
+return ret;
 LOCAL int 
 intel_driver_terminate(intel_driver_t *driver)
-  pthread_mutex_destroy(&driver->ctxmutex);
-  if(driver->need_close) {
-    close(driver->fd);
-    driver->need_close = 0;
-  }
-  driver->fd = -1;
-  return 1;
+if(driver->need_close) {
+  close(driver->fd);
+  driver->need_close = 0;
+driver->fd = -1;
+return 1;
 LOCAL void
 intel_driver_lock_hardware(intel_driver_t *driver)
-  assert(!driver->locked);
-  driver->locked = 1;
+driver->locked = 1;
 LOCAL void 
 intel_driver_unlock_hardware(intel_driver_t *driver)
-  driver->locked = 0;
+driver->locked = 0;
 LOCAL dri_bo*
 intel_driver_share_buffer_from_name(intel_driver_t *driver, const char *sname, uint32_t name)
-  dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
-                                             sname,
-                                             name);
-  if (bo == NULL) {
-    fprintf(stderr, "intel_bo_gem_create_from_name create \"%s\" bo from name %d failed: %s\n", sname, name, strerror(errno));
-    return NULL;
-  }
-  return bo;
+dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
+                                           sname,
+                                           name);
+if (bo == NULL) {
+  fprintf(stderr, "intel_bo_gem_create_from_name create \"%s\" bo from name %d failed: %s\n", sname, name, strerror(errno));
+  return NULL;
+return bo;
 LOCAL dri_bo*
 intel_driver_share_buffer_from_fd(intel_driver_t *driver, int fd, int size)
-  dri_bo *bo = drm_intel_bo_gem_create_from_prime(driver->bufmgr,
-                                                  fd,
-                                                  size);
-  if (bo == NULL) {
-    fprintf(stderr, "drm_intel_bo_gem_create_from_prime create bo(size %d) from fd %d failed: %s\n", size, fd, strerror(errno));
-    return NULL;
-  }
-  return bo;
+dri_bo *bo = drm_intel_bo_gem_create_from_prime(driver->bufmgr,
+                                                fd,
+                                                size);
+if (bo == NULL) {
+  fprintf(stderr, "drm_intel_bo_gem_create_from_prime create bo(size %d) from fd %d failed: %s\n", size, fd, strerror(errno));
+  return NULL;
+return bo;
 LOCAL uint32_t
 intel_driver_shared_name(intel_driver_t *driver, dri_bo *bo)
-  uint32_t name;
-  assert(bo);
-  dri_bo_flink(bo, &name);
-  return name;
+uint32_t name;
+dri_bo_flink(bo, &name);
+return name;
 /* XXX a null props is ok? */
 static int
-  intel_driver_t *driver = NULL;
-  int intel_device_id;
-  driver = intel_driver_new();
-  assert(driver != NULL);
-  if(UNLIKELY(intel_driver_open(driver, NULL) != CL_SUCCESS)) return INVALID_CHIP_ID;
-  intel_device_id = driver->device_id;
-  intel_driver_context_destroy(driver);
-  intel_driver_close(driver);
-  intel_driver_terminate(driver);
-  intel_driver_delete(driver);
-  return intel_device_id;
+intel_driver_t *driver = NULL;
+int intel_device_id;
+driver = intel_driver_new();
+assert(driver != NULL);
+if(UNLIKELY(intel_driver_open(driver, NULL) != CL_SUCCESS)) return INVALID_CHIP_ID;
+intel_device_id = driver->device_id;
+return intel_device_id;
 extern void intel_gpgpu_delete_all(intel_driver_t *driver);
 static void
 cl_intel_driver_delete(intel_driver_t *driver)
-  if (driver == NULL)
-    return;
-  intel_gpgpu_delete_all(driver);
-  intel_driver_context_destroy(driver);
-  intel_driver_close(driver);
-  intel_driver_terminate(driver);
-  intel_driver_delete(driver);
+if (driver == NULL)
+  return;
 #include "cl_gbe_loader.h"
 static intel_driver_t*
 cl_intel_driver_new(cl_context_prop props)
-  intel_driver_t *driver = NULL;
-  TRY_ALLOC_NO_ERR (driver, intel_driver_new());
-  if(UNLIKELY(intel_driver_open(driver, props) != CL_SUCCESS)) goto error;
+intel_driver_t *driver = NULL;
+TRY_ALLOC_NO_ERR (driver, intel_driver_new());
+if(UNLIKELY(intel_driver_open(driver, props) != CL_SUCCESS)) goto error;
-  return driver;
+return driver;
-  cl_intel_driver_delete(driver);
-  driver = NULL;
-  goto exit;
+driver = NULL;
+goto exit;
 static drm_intel_bufmgr*
 intel_driver_get_bufmgr(intel_driver_t *drv)
-  return drv->bufmgr;
+return drv->bufmgr;
 static uint32_t
 intel_driver_get_ver(struct intel_driver *drv)
-  return drv->gen_ver;
+return drv->gen_ver;
 static void
 intel_driver_enlarge_stack_size(struct intel_driver *drv, int32_t *stack_size)
-    if (drv->gen_ver == 75)
-      *stack_size = *stack_size * 4;
-    else if (drv->device_id == PCI_CHIP_BROXTON_1 || drv->device_id == PCI_CHIP_BROXTON_3 ||
-             IS_CHERRYVIEW(drv->device_id))
-      *stack_size = *stack_size * 2;
+  if (drv->gen_ver == 75)
+    *stack_size = *stack_size * 4;
+  else if (drv->device_id == PCI_CHIP_BROXTON_1 || drv->device_id == PCI_CHIP_BROXTON_3 ||
+           IS_CHERRYVIEW(drv->device_id))
+    *stack_size = *stack_size * 2;
 static void
 intel_driver_set_atomic_flag(intel_driver_t *drv, int atomic_flag)
-  drv->atomic_test_result = atomic_flag;
+drv->atomic_test_result = atomic_flag;
 static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
@@ -484,465 +498,525 @@ static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
 static int get_cl_tiling(uint32_t drm_tiling)
-  switch(drm_tiling) {
-  case I915_TILING_X: return CL_TILE_X;
-  case I915_TILING_Y: return CL_TILE_Y;
-  case I915_TILING_NONE: return CL_NO_TILE;
-  default:
-    assert(0);
-  }
-  return CL_NO_TILE;
+switch(drm_tiling) {
+case I915_TILING_X: return CL_TILE_X;
+case I915_TILING_Y: return CL_TILE_Y;
+case I915_TILING_NONE: return CL_NO_TILE;
+  assert(0);
+return CL_NO_TILE;
 static uint32_t intel_buffer_get_tiling_align(cl_context ctx, uint32_t tiling_mode, uint32_t dim)
-  uint32_t gen_ver = ((intel_driver_t *)ctx->drv)->gen_ver;
-  uint32_t ret = 0;
-  switch (tiling_mode) {
-  case CL_TILE_X:
-    if (dim == 0) { //tileX width in bytes
-      ret = 512;
-    } else if (dim == 1) { //tileX height in number of rows
+uint32_t gen_ver = ((intel_driver_t *)ctx->drv)->gen_ver;
+uint32_t ret = 0;
+switch (tiling_mode) {
+case CL_TILE_X:
+  if (dim == 0) { //tileX width in bytes
+    ret = 512;
+  } else if (dim == 1) { //tileX height in number of rows
+    ret = 8;
+  }  else if (dim == 2) { //height to calculate slice pitch
+    if (gen_ver == 9) //SKL same as tileY height
       ret = 8;
-    }  else if (dim == 2) { //height to calculate slice pitch
-      if (gen_ver == 9) //SKL same as tileY height
-        ret = 8;
-      else if (gen_ver == 8)  //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
-        ret = 4;
-      else
-        ret = 2;
-    } else
-      assert(0);
-    break;
-  case CL_TILE_Y:
-    if (dim == 0) { //tileY width in bytes
-      ret = 128;
-    } else if (dim == 1) { //tileY height in number of rows
+    else if (gen_ver == 8)  //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
+      ret = 4;
+    else
+      ret = 2;
+  } else
+    assert(0);
+  break;
+case CL_TILE_Y:
+  if (dim == 0) { //tileY width in bytes
+    ret = 128;
+  } else if (dim == 1) { //tileY height in number of rows
+    ret = 32;
+  } else if (dim == 2) { //height to calculate slice pitch
+    if (gen_ver == 9) //SKL same as tileY height
       ret = 32;
-    } else if (dim == 2) { //height to calculate slice pitch
-      if (gen_ver == 9) //SKL same as tileY height
-        ret = 32;
-      else if (gen_ver == 8) //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
-        ret = 4;
-      else
-        ret = 2;
-    } else
-      assert(0);
-    break;
+    else if (gen_ver == 8) //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
+      ret = 4;
+    else
+      ret = 2;
+  } else
+    assert(0);
+  break;
-  case CL_NO_TILE:
-    if (dim == 1 || dim == 2) { //vertical alignment
-      if (gen_ver == 8 || gen_ver == 9) //SKL 1D array need 4 alignment qpitch
-        ret = 4;
-      else
-        ret = 2;
-    } else
-      assert(0);
-    break;
-  }
+case CL_NO_TILE:
+  if (dim == 1 || dim == 2) { //vertical alignment
+    if (gen_ver == 8 || gen_ver == 9) //SKL 1D array need 4 alignment qpitch
+      ret = 4;
+    else
+      ret = 2;
+  } else
+    assert(0);
+  break;
-  return ret;
+return ret;
-#if defined(HAS_EGL)
-#include "intel_dri_resource_sharing.h"
+#if defined(HAS_GL_EGL)
+#include "intel_cl_gl_share_image_info.h"
 #include "cl_image.h"
+static int
+if(eglExportDMABUFImageMESA_func == NULL){
+  eglExportDMABUFImageMESA_func =  (PFNEGLEXPORTDMABUFIMAGEMESAPROC) eglGetProcAddress("eglExportDMABUFImageMESA");
+  if(eglExportDMABUFImageMESA_func == NULL){
+    fprintf(stderr, "Failed to get EGL extension function eglExportDMABUFImageMESA\n");
+    return -1;
+  }
+return 0;
 static int cl_get_clformat_from_texture(GLint tex_format, cl_image_format * cl_format)
-  cl_int ret = CL_SUCCESS;
-  switch (tex_format) {
-  case GL_RGBA8:
-  case GL_RGBA:
-  case GL_RGBA16:
-  case GL_RGBA8I:
-  case GL_RGBA16I:
-  case GL_RGBA32I:
-  case GL_RGBA8UI:
-  case GL_RGBA16UI:
-  case GL_RGBA32UI:
-  case GL_RGBA16F:
-  case GL_RGBA32F:
-    cl_format->image_channel_order = CL_RGBA;
-    break;
-  case GL_BGRA:
-    cl_format->image_channel_order = CL_BGRA;
-    break;
-  default:
-    ret = -1;
-    goto error;
-  }
+cl_int ret = CL_SUCCESS;
+switch (tex_format) {
+case GL_RGBA8:
+case GL_RGBA:
+case GL_RGBA16:
+case GL_RGBA8I:
+case GL_RGBA16I:
+case GL_RGBA32I:
+case GL_RGBA8UI:
+case GL_RGBA16UI:
+case GL_RGBA32UI:
+case GL_RGBA16F:
+case GL_RGBA32F:
+  cl_format->image_channel_order = CL_RGBA;
+  break;
+case GL_BGRA:
+  cl_format->image_channel_order = CL_BGRA;
+  break;
+  ret = -1;
+  goto error;
-  switch (tex_format) {
-  case GL_RGBA8:
-  case GL_RGBA:
-  case GL_BGRA:
-    cl_format->image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case GL_RGBA16:
-    cl_format->image_channel_data_type = CL_UNORM_INT16;
-    break;
-  case GL_RGBA8I:
-    cl_format->image_channel_data_type = CL_SIGNED_INT8;
-    break;
-  case GL_RGBA16I:
-    cl_format->image_channel_data_type = CL_SIGNED_INT16;
-    break;
-  case GL_RGBA32I:
-    cl_format->image_channel_data_type = CL_SIGNED_INT32;
-    break;
-  case GL_RGBA8UI:
-    cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
-    break;
-  case GL_RGBA16UI:
-    cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
-    break;
-  case GL_RGBA32UI:
-    cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
-    break;
-  case GL_RGBA16F:
-    cl_format->image_channel_data_type = CL_HALF_FLOAT;
-    break;
-  case GL_RGBA32F:
-    cl_format->image_channel_order = CL_FLOAT;
-    break;
-  default:
-    ret = -1;
-    goto error;
-  }
+switch (tex_format) {
+case GL_RGBA8:
+case GL_RGBA:
+case GL_BGRA:
+  cl_format->image_channel_data_type = CL_UNORM_INT8;
+  break;
+case GL_RGBA16:
+  cl_format->image_channel_data_type = CL_UNORM_INT16;
+  break;
+case GL_RGBA8I:
+  cl_format->image_channel_data_type = CL_SIGNED_INT8;
+  break;
+case GL_RGBA16I:
+  cl_format->image_channel_data_type = CL_SIGNED_INT16;
+  break;
+case GL_RGBA32I:
+  cl_format->image_channel_data_type = CL_SIGNED_INT32;
+  break;
+case GL_RGBA8UI:
+  cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
+  break;
+case GL_RGBA16UI:
+  cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
+  break;
+case GL_RGBA32UI:
+  cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
+  break;
+case GL_RGBA16F:
+  cl_format->image_channel_data_type = CL_HALF_FLOAT;
+  break;
+case GL_RGBA32F:
+  cl_format->image_channel_order = CL_FLOAT;
+  break;
+  ret = -1;
+  goto error;
-  return ret;
+return ret;
 static int
 get_mem_type_from_target(GLenum texture_target, cl_mem_object_type *type)
-  switch(texture_target) {
-  case GL_TEXTURE_1D: *type = CL_MEM_OBJECT_IMAGE1D; break;
-  case GL_TEXTURE_2D: *type = CL_MEM_OBJECT_IMAGE2D; break;
-  case GL_TEXTURE_3D: *type = CL_MEM_OBJECT_IMAGE3D; break;
-  default:
-    return -1;
-  }
-  return CL_SUCCESS;
+switch(texture_target) {
+case GL_TEXTURE_1D: *type = CL_MEM_OBJECT_IMAGE1D; break;
+case GL_TEXTURE_2D: *type = CL_MEM_OBJECT_IMAGE2D; break;
+case GL_TEXTURE_3D: *type = CL_MEM_OBJECT_IMAGE3D; break;
+  return -1;
+return CL_SUCCESS;
 static cl_buffer
 intel_alloc_buffer_from_texture_egl(cl_context ctx, unsigned int target,
-                                    int miplevel, unsigned int texture,
-                                    struct _cl_mem_image *image)
-  cl_buffer bo = (cl_buffer) NULL;
-  struct _intel_dri_share_image_region region;
-  unsigned int bpp, intel_fmt;
-  cl_image_format cl_format;
-  EGLBoolean ret;
-  EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
-                           EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
-                           EGL_GL_TEXTURE_TARGET_MESA, target,
-                           EGL_NONE};
-  ret = eglAcquireResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx),
-                               EGL_GL_TEXTURE_MESA,
-                               &attrib_list[0], &region);
-  if (!ret)
-      goto out;
-  bo = (cl_buffer)intel_driver_share_buffer((intel_driver_t *)ctx->drv, "rendering buffer", region.name);
-  if (bo == NULL) {
-    eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
-    goto out;
-  }
-  region.tiling = get_cl_tiling(region.tiling);
-  if (cl_get_clformat_from_texture(region.gl_format, &cl_format) != 0)
-    goto error;
-  if (cl_image_byte_per_pixel(&cl_format, &bpp) != CL_SUCCESS)
-    goto error;
-  intel_fmt = cl_image_get_intel_format(&cl_format);
-  if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
-    goto error;
-  cl_mem_object_type image_type;
-  if (get_mem_type_from_target(target, &image_type) != 0)
-    goto error;
-  cl_mem_image_init(image, region.w, region.h,
-                    image_type, region.depth, cl_format,
-                    intel_fmt, bpp, region.row_pitch,
-                    region.slice_pitch, region.tiling,
-                    region.tile_x, region.tile_y, region.offset);
-  return bo;
+                                  int miplevel, unsigned int texture,
+                                  struct _cl_mem_image *image)
+drm_intel_bo *intel_bo = NULL;
+struct _intel_cl_gl_share_image_info info;
+unsigned int bpp, intel_fmt;
+cl_image_format cl_format;
+EGLBoolean ret;
+EGLenum e_target;
+//We just support GL_TEXTURE_2D because we can't query info like slice_pitch now.
+if(target == GL_TEXTURE_2D)
+  e_target = EGL_GL_TEXTURE_2D;
+  return NULL;
-  cl_buffer_unreference(bo);
-  eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+if(get_required_egl_extensions() != 0)
+  return NULL;
+EGLAttrib attrib_list[] = {EGL_GL_TEXTURE_LEVEL, miplevel,
+                          EGL_NONE};
+EGLImage e_image = eglCreateImage(EGL_DISP(ctx), EGL_CTX(ctx), e_target,
+                                  (EGLClientBuffer)texture, &attrib_list[0]);
+if(e_image == EGL_NO_IMAGE)
+  return NULL;
+int fd, stride, offset;
+ret = eglExportDMABUFImageMESA_func(EGL_DISP(ctx), e_image, &fd, &stride, &offset);
+if(ret != EGL_TRUE){
+  eglDestroyImage(EGL_DISP(ctx), e_image);
+  return NULL;
+info.fd = fd;
+/* The size argument just takes effect in intel_driver_share_buffer_from_fd when
+ * Linux kernel is older than 3.12, so it doesn't matter we set to 0 here.
+ */
+int size = 0;
+intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, size);
+if (intel_bo == NULL) {
+  eglDestroyImage(EGL_DISP(ctx), e_image);
   return NULL;
+GLint param_value;
+glGetTexLevelParameteriv(target, miplevel, GL_TEXTURE_WIDTH, &param_value);
+info.w = param_value;
+glGetTexLevelParameteriv(target, miplevel, GL_TEXTURE_HEIGHT, &param_value);
+info.h = param_value;
+glGetTexLevelParameteriv(target, miplevel, GL_TEXTURE_DEPTH, &param_value);
+info.depth = 1;
+info.pitch = stride;
+uint32_t tiling_mode, swizzle_mode;
+drm_intel_bo_get_tiling(intel_bo, &tiling_mode, &swizzle_mode);
+info.offset = offset;
+info.tile_x = 0;
+info.tile_y = 0;
+glGetTexLevelParameteriv(target, miplevel, GL_TEXTURE_INTERNAL_FORMAT, &param_value);
+info.gl_format = param_value;
+info.row_pitch = stride;
+info.slice_pitch = 0;
+info.tiling = get_cl_tiling(tiling_mode);
+if (cl_get_clformat_from_texture(info.gl_format, &cl_format) != 0)
+  goto error;
+if (cl_image_byte_per_pixel(&cl_format, &bpp) != CL_SUCCESS)
+  goto error;
+intel_fmt = cl_image_get_intel_format(&cl_format);
+  goto error;
+cl_mem_object_type image_type;
+if (get_mem_type_from_target(target, &image_type) != 0)
+  goto error;
+cl_mem_image_init(image, info.w, info.h,
+                  image_type, info.depth, cl_format,
+                  intel_fmt, bpp, info.row_pitch,
+                  info.slice_pitch, info.tiling,
+                  info.tile_x, info.tile_y, info.offset);
+struct _cl_mem_gl_image *gl_image = (struct _cl_mem_gl_image*)image;
+gl_image->fd = fd;
+gl_image->egl_image = e_image;
+return (cl_buffer) intel_bo;
+eglDestroyImage(EGL_DISP(ctx), e_image);
+return NULL;
 static cl_buffer
 intel_alloc_buffer_from_texture(cl_context ctx, unsigned int target,
-                                int miplevel, unsigned int texture,
-                                struct _cl_mem_image *image)
+                              int miplevel, unsigned int texture,
+                              struct _cl_mem_image *image)
-  if (IS_EGL_CONTEXT(ctx))
-    return intel_alloc_buffer_from_texture_egl(ctx, target, miplevel, texture, image);
+if (IS_EGL_CONTEXT(ctx))
+  return intel_alloc_buffer_from_texture_egl(ctx, target, miplevel, texture, image);
-  return NULL;
+return NULL;
 static int
-intel_release_buffer_from_texture(cl_context ctx, unsigned int target,
-                                  int miplevel, unsigned int texture)
+intel_release_buffer_from_texture(cl_context ctx, struct _cl_mem_gl_image *gl_image)
-  if (IS_EGL_CONTEXT(ctx)) {
-    EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
-                           EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
-                           EGL_GL_TEXTURE_TARGET_MESA, target,
-                           EGL_NONE};
-    eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
-    return CL_SUCCESS;
-  }
-  return -1;
+if (IS_EGL_CONTEXT(ctx)) {
+  close(gl_image->fd);
+  eglDestroyImage(EGL_DISP(ctx), gl_image->egl_image);
+  return CL_SUCCESS;
+return -1;
 cl_buffer intel_share_buffer_from_libva(cl_context ctx,
-                                        unsigned int bo_name,
-                                        size_t *sz)
+                                      unsigned int bo_name,
+                                      size_t *sz)
-  drm_intel_bo *intel_bo;
+drm_intel_bo *intel_bo;
-  intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
-  if (intel_bo == NULL)
-    return NULL;
+if (intel_bo == NULL)
+  return NULL;
-  if (sz)
-    *sz = intel_bo->size;
+if (sz)
+  *sz = intel_bo->size;
-  return (cl_buffer)intel_bo;
+return (cl_buffer)intel_bo;
 cl_buffer intel_share_image_from_libva(cl_context ctx,
-                                       unsigned int bo_name,
-                                       struct _cl_mem_image *image)
+                                     unsigned int bo_name,
+                                     struct _cl_mem_image *image)
-  drm_intel_bo *intel_bo;
-  uint32_t intel_tiling, intel_swizzle_mode;
+drm_intel_bo *intel_bo;
+uint32_t intel_tiling, intel_swizzle_mode;
-  intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
-  if (intel_bo == NULL)
-    return NULL;
+if (intel_bo == NULL)
+  return NULL;
-  drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
-  image->tiling = get_cl_tiling(intel_tiling);
+drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
+image->tiling = get_cl_tiling(intel_tiling);
-  return (cl_buffer)intel_bo;
+return (cl_buffer)intel_bo;
 cl_buffer intel_share_buffer_from_fd(cl_context ctx,
-                                     int fd,
-                                     int buffer_size)
+                                   int fd,
+                                   int buffer_size)
-  drm_intel_bo *intel_bo;
+drm_intel_bo *intel_bo;
-  intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, buffer_size);
+intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, buffer_size);
-  if (intel_bo == NULL)
-    return NULL;
+if (intel_bo == NULL)
+  return NULL;
-  return (cl_buffer)intel_bo;
+return (cl_buffer)intel_bo;
 cl_buffer intel_share_image_from_fd(cl_context ctx,
-                                    int fd,
-                                    int image_size,
-                                    struct _cl_mem_image *image)
+                                  int fd,
+                                  int image_size,
+                                  struct _cl_mem_image *image)
-  drm_intel_bo *intel_bo;
-  uint32_t intel_tiling, intel_swizzle_mode;
+drm_intel_bo *intel_bo;
+uint32_t intel_tiling, intel_swizzle_mode;
-  intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, image_size);
+intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, image_size);
-  if (intel_bo == NULL)
-    return NULL;
+if (intel_bo == NULL)
+  return NULL;
-  drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
-  image->tiling = get_cl_tiling(intel_tiling);
+drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
+image->tiling = get_cl_tiling(intel_tiling);
-  return (cl_buffer)intel_bo;
+return (cl_buffer)intel_bo;
 static cl_buffer intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char* name, void *data,size_t size, unsigned long flags)
-  drm_intel_bo *bo;
-  bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags);
-  /* Fallback to unsynchronized userptr allocation if kernel has no MMU notifier enabled. */
-  if (bo == NULL)
-    bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED);
-  return (cl_buffer)bo;
+drm_intel_bo *bo;
+bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags);
+/* Fallback to unsynchronized userptr allocation if kernel has no MMU notifier enabled. */
+if (bo == NULL)
+  bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED);
+return (cl_buffer)bo;
-  return NULL;
+return NULL;
 static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
-  switch (tiling) {
-    case CL_NO_TILE:
-      *intel_tiling = I915_TILING_NONE;
-      break;
-    case CL_TILE_X:
-      *intel_tiling = I915_TILING_X;
-      break;
-    case CL_TILE_Y:
-      *intel_tiling = I915_TILING_Y;
-      break;
-    default:
-      assert(0);
-      return -1;
-  }
-  return 0;
+switch (tiling) {
+  case CL_NO_TILE:
+    *intel_tiling = I915_TILING_NONE;
+    break;
+  case CL_TILE_X:
+    *intel_tiling = I915_TILING_X;
+    break;
+  case CL_TILE_Y:
+    *intel_tiling = I915_TILING_Y;
+    break;
+  default:
+    assert(0);
+    return -1;
+return 0;
 static int intel_buffer_set_tiling(cl_buffer bo,
-                                   cl_image_tiling_t tiling, size_t stride)
+                                 cl_image_tiling_t tiling, size_t stride)
-  uint32_t intel_tiling;
-  int ret;
-  if (UNLIKELY((get_intel_tiling(tiling, &intel_tiling)) < 0))
-    return -1;
+uint32_t intel_tiling;
+int ret;
+if (UNLIKELY((get_intel_tiling(tiling, &intel_tiling)) < 0))
+  return -1;
 #ifndef NDEBUG
-  uint32_t required_tiling;
-  required_tiling = intel_tiling;
+uint32_t required_tiling;
+required_tiling = intel_tiling;
-  ret = drm_intel_bo_set_tiling((drm_intel_bo*)bo, &intel_tiling, stride);
-  assert(intel_tiling == required_tiling);
-  return ret;
+ret = drm_intel_bo_set_tiling((drm_intel_bo*)bo, &intel_tiling, stride);
+assert(intel_tiling == required_tiling);
+return ret;
-        "Warning: can't get GPU's configurations, will use the minimal one. Please update your drm to 2.4.59+ and linux kernel to 4.0.0+.\n"
+      "Warning: can't get GPU's configurations, will use the minimal one. Please update your drm to 2.4.59+ and linux kernel to 4.0.0+.\n"
 static void
 intel_update_device_info(cl_device_id device)
-  intel_driver_t *driver;
+intel_driver_t *driver;
-  driver = intel_driver_new();
-  assert(driver != NULL);
-  if (intel_driver_open(driver, NULL) != CL_SUCCESS) {
-    intel_driver_delete(driver);
-    return;
-  }
+driver = intel_driver_new();
+assert(driver != NULL);
+if (intel_driver_open(driver, NULL) != CL_SUCCESS) {
+  intel_driver_delete(driver);
+  return;
-  const size_t sz = 4096;
-  void *host_ptr;
-  host_ptr = cl_aligned_malloc(sz, 4096);
-  if (host_ptr != NULL) {
-    cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
-      "CL memory object", host_ptr, sz, 0);
-    if (bo == NULL)
-      device->host_unified_memory = CL_FALSE;
-    else
-      drm_intel_bo_unreference((drm_intel_bo*)bo);
-    cl_free(host_ptr);
-  }
-  else
+const size_t sz = 4096;
+void *host_ptr;
+host_ptr = cl_aligned_malloc(sz, 4096);
+if (host_ptr != NULL) {
+  cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
+    "CL memory object", host_ptr, sz, 0);
+  if (bo == NULL)
     device->host_unified_memory = CL_FALSE;
+  else
+    drm_intel_bo_unreference((drm_intel_bo*)bo);
+  cl_free(host_ptr);
+  device->host_unified_memory = CL_FALSE;
 #ifdef HAS_EU_TOTAL
-  unsigned int eu_total;
+unsigned int eu_total;
-  /* Prefer driver-queried max compute units if supported */
-  if (!drm_intel_get_eu_total(driver->fd, &eu_total))
-    device->max_compute_unit = eu_total;
-  else if (IS_CHERRYVIEW(device->device_id))
-    printf(CHV_CONFIG_WARNING);
+/* Prefer driver-queried max compute units if supported */
+if (!drm_intel_get_eu_total(driver->fd, &eu_total))
+  device->max_compute_unit = eu_total;
+else if (IS_CHERRYVIEW(device->device_id))
-  if (IS_CHERRYVIEW(device->device_id)) {
+if (IS_CHERRYVIEW(device->device_id)) {
 #if defined(__ANDROID__)
-    device->max_compute_unit = 12;
+  device->max_compute_unit = 12;
-    printf(CHV_CONFIG_WARNING);
-  }
-  unsigned int subslice_total;
+unsigned int subslice_total;
-  /* Prefer driver-queried subslice count if supported */
-  if (!drm_intel_get_subslice_total(driver->fd, &subslice_total))
-    device->sub_slice_count = subslice_total;
-  else if (IS_CHERRYVIEW(device->device_id))
-    printf(CHV_CONFIG_WARNING);
+/* Prefer driver-queried subslice count if supported */
+if (!drm_intel_get_subslice_total(driver->fd, &subslice_total))
+  device->sub_slice_count = subslice_total;
+else if (IS_CHERRYVIEW(device->device_id))
-  if (IS_CHERRYVIEW(device->device_id)) {
+if (IS_CHERRYVIEW(device->device_id)) {
 #if defined(__ANDROID__)
-    device->sub_slice_count = 2;
+  device->sub_slice_count = 2;
-    printf(CHV_CONFIG_WARNING);
-  }
-  /* BXT pooled eu, 3*6 to 2*9, like sub slice count is 2 */
-  int has_pooled_eu;
-  if((has_pooled_eu = drm_intel_get_pooled_eu(driver->fd)) > 0)
-    device->sub_slice_count = 2;
+/* BXT pooled eu, 3*6 to 2*9, like sub slice count is 2 */
+int has_pooled_eu;
+if((has_pooled_eu = drm_intel_get_pooled_eu(driver->fd)) > 0)
+  device->sub_slice_count = 2;
-  int min_eu;
-  /* for fused down 2x6 devices, beignet don't support. */
-  if (has_pooled_eu > 0 && (min_eu = drm_intel_get_min_eu_in_pool(driver->fd)) > 0) {
-    assert(min_eu == 9); //don't support fuse down device.
-  }
+int min_eu;
+/* for fused down 2x6 devices, beignet don't support. */
+if (has_pooled_eu > 0 && (min_eu = drm_intel_get_min_eu_in_pool(driver->fd)) > 0) {
+  assert(min_eu == 9); //don't support fuse down device.
 #endif //HAS_MIN_EU_IN_POOL
 #endif //HAS_POOLED_EU
-  //We should get the device memory dynamically, but the
-  //mapablce mem size usage is unknown. Just ignore it.
-  size_t total_mem,map_mem;
-  if(drm_intel_get_aperture_sizes(driver->fd,&map_mem,&total_mem) == 0)
-    device->global_mem_size = (cl_ulong)total_mem;
-  intel_driver_context_destroy(driver);
-  intel_driver_close(driver);
-  intel_driver_terminate(driver);
-  intel_driver_delete(driver);
+//We should get the device memory dynamically, but the
+//mapablce mem size usage is unknown. Just ignore it.
+size_t total_mem,map_mem;
+if(drm_intel_get_aperture_sizes(driver->fd,&map_mem,&total_mem) == 0)
+  device->global_mem_size = (cl_ulong)total_mem;
 LOCAL void
-  cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new;
-  cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete;
-  cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
-  cl_driver_enlarge_stack_size = (cl_driver_enlarge_stack_size_cb *) intel_driver_enlarge_stack_size;
-  cl_driver_set_atomic_flag = (cl_driver_set_atomic_flag_cb *) intel_driver_set_atomic_flag;
-  cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
-  cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
-  cl_driver_update_device_info = (cl_driver_update_device_info_cb *) intel_update_device_info;
-  cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
-  cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
+cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new;
+cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete;
+cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
+cl_driver_enlarge_stack_size = (cl_driver_enlarge_stack_size_cb *) intel_driver_enlarge_stack_size;
+cl_driver_set_atomic_flag = (cl_driver_set_atomic_flag_cb *) intel_driver_set_atomic_flag;
+cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
+cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
+cl_driver_update_device_info = (cl_driver_update_device_info_cb *) intel_update_device_info;
+cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
+cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
+cl_buffer_set_softpin_offset = (cl_buffer_set_softpin_offset_cb *) drm_intel_bo_set_softpin_offset;
+cl_buffer_set_bo_use_full_range = (cl_buffer_set_bo_use_full_range_cb *) drm_intel_bo_use_48b_address_range;
+  cl_buffer_disable_reuse = (cl_buffer_disable_reuse_cb *) drm_intel_bo_disable_reuse;
   cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
-#if defined(HAS_EGL)
+#if defined(HAS_GL_EGL)
   cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
   cl_buffer_release_from_texture = (cl_buffer_release_from_texture_cb *) intel_release_buffer_from_texture;
-  intel_set_cl_gl_callbacks();
   cl_buffer_get_buffer_from_libva = (cl_buffer_get_buffer_from_libva_cb *) intel_share_buffer_from_libva;
   cl_buffer_get_image_from_libva = (cl_buffer_get_image_from_libva_cb *) intel_share_image_from_libva;
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
index 51f0e0d..3be93c2 100644
--- a/src/intel/intel_driver.h
+++ b/src/intel/intel_driver.h
@@ -79,6 +79,7 @@ typedef struct intel_driver
   dri_bufmgr *bufmgr;
   drm_intel_context *ctx;
+  drm_intel_bo *null_bo;
   int fd;
   int device_id;
   int gen_ver;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index a643f5c..283b07a 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -975,6 +975,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   size_aux = ALIGN(size_aux, 4096);
   bo = dri_bo_alloc(gpgpu->drv->bufmgr, "AUX_BUFFER", size_aux, 4096);
   if (!bo || dri_bo_map(bo, 1) != 0) {
     fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
     if (bo)
@@ -1527,10 +1528,12 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
                      uint32_t internal_offset, size_t size, uint8_t bti)
   assert(gpgpu->binded_n < max_buf_n);
-  gpgpu->binded_buf[gpgpu->binded_n] = buf;
-  gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
-  gpgpu->binded_offset[gpgpu->binded_n] = offset;
-  gpgpu->binded_n++;
+  if(offset != -1) {
+    gpgpu->binded_buf[gpgpu->binded_n] = buf;
+    gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
+    gpgpu->binded_offset[gpgpu->binded_n] = offset;
+    gpgpu->binded_n++;
+  }
   intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti, I965_SURFACEFORMAT_RAW);
@@ -1710,7 +1713,38 @@ intel_gpgpu_build_idrt_gen9(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 static int
-intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+intel_gpgpu_upload_curbes_gen7(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+  unsigned char *curbe = NULL;
+  cl_gpgpu_kernel *k = gpgpu->ker;
+  uint32_t i, j;
+  /* Upload the data first */
+  if (dri_bo_map(gpgpu->aux_buf.bo, 1) != 0) {
+    fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+    return -1;
+  }
+  assert(gpgpu->aux_buf.bo->virtual);
+  curbe = (unsigned char *) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.curbe_offset);
+  memcpy(curbe, data, size);
+  /* Now put all the relocations for our flat address space */
+  for (i = 0; i < k->thread_n; ++i)
+    for (j = 0; j < gpgpu->binded_n; ++j) {
+      *(uint32_t *)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset64 + gpgpu->target_buf_offset[j];
+      drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo,
+                              gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz,
+                              gpgpu->binded_buf[j],
+                              gpgpu->target_buf_offset[j],
+                              I915_GEM_DOMAIN_RENDER,
+                              I915_GEM_DOMAIN_RENDER);
+    }
+  dri_bo_unmap(gpgpu->aux_buf.bo);
+  return 0;
+static int
+intel_gpgpu_upload_curbes_gen8(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
   unsigned char *curbe = NULL;
   cl_gpgpu_kernel *k = gpgpu->ker;
@@ -1728,7 +1762,7 @@ intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
   /* Now put all the relocations for our flat address space */
   for (i = 0; i < k->thread_n; ++i)
     for (j = 0; j < gpgpu->binded_n; ++j) {
-      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset + gpgpu->target_buf_offset[j];
+      *(size_t *)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset64 + gpgpu->target_buf_offset[j];
                               gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz,
@@ -2050,6 +2084,9 @@ static void
 intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
   gpgpu->ker = kernel;
+  if (gpgpu->drv->null_bo)
+    intel_gpgpu_setup_bti(gpgpu, gpgpu->drv->null_bo, 0, 64*1024, 0xfe, I965_SURFACEFORMAT_RAW);
   intel_gpgpu_build_idrt(gpgpu, kernel);
@@ -2068,6 +2105,7 @@ intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu,
                    uint32_t simd_sz,
                    uint32_t thread_n,
                    const size_t global_wk_off[3],
+                   const size_t global_dim_off[3],
                    const size_t global_wk_sz[3],
                    const size_t local_wk_sz[3])
@@ -2117,6 +2155,7 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
                    uint32_t simd_sz,
                    uint32_t thread_n,
                    const size_t global_wk_off[3],
+                   const size_t global_dim_off[3],
                    const size_t global_wk_sz[3],
                    const size_t local_wk_sz[3])
@@ -2144,14 +2183,14 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
     OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
     OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8  | thread max */
+  OUT_BATCH(gpgpu->batch, global_dim_off[0]);
   OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[0]+global_dim_off[0]);
+  OUT_BATCH(gpgpu->batch, global_dim_off[1]);
   OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
-  OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
-  OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[1]+global_dim_off[1]);
+  OUT_BATCH(gpgpu->batch, global_dim_off[2]);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[2]+global_dim_off[2]);
   OUT_BATCH(gpgpu->batch, right_mask);
   OUT_BATCH(gpgpu->batch, ~0x0);                     /* we always set height as 1, so set bottom mask as all 1*/
@@ -2269,10 +2308,10 @@ intel_gpgpu_read_ts_reg_baytrail(drm_intel_bufmgr *bufmgr)
 /* We want to get the current time of GPU. */
 static void
-intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
+intel_gpgpu_event_get_gpu_cur_timestamp(intel_driver_t* gen_driver, uint64_t* ret_ts)
   uint64_t result = 0;
-  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+  drm_intel_bufmgr *bufmgr = gen_driver->bufmgr;
   /* Get the ts that match the bspec */
   result = intel_gpgpu_read_ts_reg(bufmgr);
@@ -2284,15 +2323,13 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
 /* Get the GPU execute time. */
 static void
-intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
-				     int index, uint64_t* ret_ts)
+intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, int index, uint64_t* ret_ts)
   uint64_t result = 0;
-  assert(event->ts_buf != NULL);
+  assert(gpgpu->time_stamp_b.bo);
   assert(index == 0 || index == 1);
-  drm_intel_gem_bo_map_gtt(event->ts_buf);
-  uint64_t* ptr = event->ts_buf->virtual;
+  drm_intel_gem_bo_map_gtt(gpgpu->time_stamp_b.bo);
+  uint64_t* ptr = gpgpu->time_stamp_b.bo->virtual;
   result = ptr[index];
   /* According to BSpec, the timestamp counter should be 36 bits,
@@ -2303,7 +2340,7 @@ intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
   result = (result & 0x0FFFFFFFF) * 80; //convert to nanoseconds
   *ret_ts = result;
-  drm_intel_gem_bo_unmap_gtt(event->ts_buf);
+  drm_intel_gem_bo_unmap_gtt(gpgpu->time_stamp_b.bo);
 static int
@@ -2409,6 +2446,18 @@ intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu)
   return gpgpu->printf_info;
+static void
+intel_gpgpu_set_kernel(intel_gpgpu_t *gpgpu, void * kernel)
+  gpgpu->kernel = kernel;
+static void*
+intel_gpgpu_get_kernel(intel_gpgpu_t *gpgpu)
+  return gpgpu->kernel;
 LOCAL void
 intel_set_gpgpu_callbacks(int device_id)
@@ -2419,7 +2468,6 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
   cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
   cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
-  cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
   cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
   cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
   cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
@@ -2449,6 +2497,8 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf;
   cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
   cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
+  cl_gpgpu_set_kernel = (cl_gpgpu_set_kernel_cb *)intel_gpgpu_set_kernel;
+  cl_gpgpu_get_kernel = (cl_gpgpu_get_kernel_cb *)intel_gpgpu_get_kernel;
   if (IS_BROADWELL(device_id) || IS_CHERRYVIEW(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen8;
@@ -2468,7 +2518,8 @@ intel_set_gpgpu_callbacks(int device_id)
     intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8;
     cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
     intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8;
-	intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
+    intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
+    cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen8;
   if (IS_GEN9(device_id)) {
@@ -2488,9 +2539,11 @@ intel_set_gpgpu_callbacks(int device_id)
     cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
     intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8;
     intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen9;
+    cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen8;
+  cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes_gen7;
   intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen7;
   intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen7;
   cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen7;
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h
index 904f9e0..f575f8b 100644
--- a/src/intel/intel_gpgpu.h
+++ b/src/intel/intel_gpgpu.h
@@ -53,6 +53,7 @@ struct intel_gpgpu
   uint32_t target_buf_offset[max_buf_n];/* internal offset for buffers binded for the call */
   uint32_t binded_offset[max_buf_n];    /* their offsets in the curbe buffer */
   uint32_t binded_n;                    /* number of buffers binded */
+  void *kernel;                         /* cl_kernel with this gpgpu */
   unsigned long img_bitmap;              /* image usage bitmap. */
   unsigned int img_index_base;          /* base index for image surface.*/
diff --git a/src/performance.c b/src/performance.c
index 28bd6c6..1e676c3 100644
--- a/src/performance.c
+++ b/src/performance.c
@@ -54,6 +54,8 @@ static context_storage_node * find_context(cl_context context)
   if(NULL == record.context_storage)
     record.context_storage = (context_storage_node *) malloc(sizeof(context_storage_node));
+    if (record.context_storage == NULL)
+      return NULL;
     record.context_storage->context_id = (uintptr_t)context;
     record.context_storage->kernels_storage = NULL;
     record.context_storage->kernel_max_time = 0.0f;
@@ -96,6 +98,8 @@ static kernel_storage_node * find_kernel(context_storage_node *p_context, const
   if(NULL == p_context->kernels_storage)
     p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+    if (p_context->kernels_storage == NULL)
+      return NULL;
     strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH);
     p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
@@ -188,6 +192,8 @@ static void print_time_info()
     kernel_storage_node *p_kernel = p_context->kernels_storage;
     kernel_storage_node *p_tmp_kernel = p_kernel;
     time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
+    if (te == NULL)
+      return;
     memset(te, 0, sizeof(time_element)*p_context->kernel_count);
     int i = -1, j = 0, k = 0;
     while(NULL != p_tmp_kernel)
diff --git a/src/x11/mesa_egl_extension.c b/src/x11/mesa_egl_extension.c
deleted file mode 100644
index 4a3e89c..0000000
--- a/src/x11/mesa_egl_extension.c
+++ /dev/null
@@ -1,306 +0,0 @@
-#include <stdio.h>
-#include "mesa_egl_extension.h"
-#include "mesa_egl_res_share.h"
-#include "src/cl_driver.h"
-struct _egl_display;
-struct _egl_resource;
-struct _egl_thread_info;
-struct _egl_config;
-struct _egl_surface;
-struct _egl_driver;
-typedef struct _egl_display _EGLDisplay;
-typedef struct _egl_resource _EGLResource;
-typedef struct _egl_thread_info _EGLThreadInfo;
-typedef struct _egl_config _EGLConfig;
-typedef struct _egl_surface _EGLSurface;
-typedef struct _egl_driver _EGLDriver;
- * A resource of a display.
- */
-struct _egl_resource
-   /* which display the resource belongs to */
-   _EGLDisplay *Display;
-   EGLBoolean IsLinked;
-   EGLint RefCount;
-   /* used to link resources of the same type */
-   _EGLResource *Next;
- * "Base" class for device driver contexts.
- */
-struct _egl_context
-   /* A context is a display resource */
-   _EGLResource Resource;
-   /* The bound status of the context */
-   _EGLThreadInfo *Binding;
-   _EGLSurface *DrawSurface;
-   _EGLSurface *ReadSurface;
-   _EGLConfig *Config;
-   EGLint ClientMajorVersion;
-   EGLint ClientMinorVersion;
-   EGLint Flags;
-   EGLint Profile;
-   EGLint ResetNotificationStrategy;
-   /* The real render buffer when a window surface is bound */
-   EGLint WindowRenderBuffer;
-typedef struct _egl_context _EGLContext;
-struct dri2_egl_display
-   int                       dri2_major;
-   int                       dri2_minor;
-   __DRIscreen              *dri_screen;
-   int                       own_dri_screen;
-   const __DRIconfig       **driver_configs;
-   void                     *driver;
-enum _egl_platform_type {
-typedef enum _egl_platform_type _EGLPlatformType;
-typedef pthread_mutex_t _EGLMutex;
-struct _egl_display
-   /* used to link displays */
-   _EGLDisplay *Next;
-   _EGLMutex Mutex;
-   _EGLPlatformType Platform; /**< The type of the platform display */
-   void *PlatformDisplay;     /**< A pointer to the platform display */
-   _EGLDriver *Driver;        /**< Matched driver of the display */
-   EGLBoolean Initialized;    /**< True if the display is initialized */
-   /* options that affect how the driver initializes the display */
-   struct {
-      EGLBoolean TestOnly;    /**< Driver should not set fields when true */
-      EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
-   } Options;
-   /* these fields are set by the driver during init */
-   void *DriverData;          /**< Driver private data */
-static struct dri2_egl_display *
-dri2_egl_display(_EGLDisplay *dpy)
-  return (struct dri2_egl_display *)dpy->DriverData;
-static _EGLDisplay *
-_eglLockDisplay(EGLDisplay dpy)
-  return (_EGLDisplay *)dpy;
-static _EGLContext *
-_eglLookupContext(EGLContext ctx, EGLDisplay disp)
-  return (_EGLContext *) ctx;
-struct dri2_egl_context
-   _EGLContext   base;
-   __DRIcontext *dri_context;
-static struct dri2_egl_context *
-dri2_egl_context(_EGLContext *ctx)
-  return (struct dri2_egl_context *)ctx;
-static EGLBoolean
-dri2_acquire_texture(_EGLDisplay *disp,
-                     _EGLContext *ctx,
-                     const EGLint *attr_list,
-                     void *user_data)
-   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   GLuint texture = 0;
-   GLenum gl_target = 0;
-   GLint level = 0;
-   GLboolean ret;
-   if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
-      return EGL_FALSE;
-   ret = cl_gl_acquire_texture(dri2_dpy->driver,
-                               dri2_ctx->dri_context,
-                               gl_target, level, texture,
-                               user_data);
-   return ret;
-static EGLBoolean
-dri2_release_texture(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
-   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   GLuint texture = 0;
-   GLenum gl_target = 0;
-   GLint level = 0;
-   GLboolean ret;
-   if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
-      return EGL_FALSE;
-   ret = cl_gl_release_texture(dri2_dpy->driver, dri2_ctx->dri_context,
-                               gl_target, level, texture);
-   return ret;
-static EGLBoolean
-dri2_acquire_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list,
-                           void *user_data)
-   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   GLuint bufobj = 0;
-   GLboolean ret;
-   if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
-      return EGL_FALSE;
-   ret = cl_gl_acquire_buffer_object(dri2_dpy->driver,
-                                     dri2_ctx->dri_context,
-                                     bufobj, user_data);
-   return ret;
-static EGLBoolean
-dri2_release_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
-   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   GLuint bufobj = 0;
-   GLboolean ret;
-   if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
-      return EGL_FALSE;
-   ret = cl_gl_release_buffer_object(dri2_dpy->driver,
-                                     dri2_ctx->dri_context,
-                                     bufobj);
-   return ret;
-static EGLBoolean
-dri2_acquire_render_buffer(_EGLDisplay *disp,
-                           _EGLContext *ctx,
-                           const EGLint *attr_list,
-                           void *user_data)
-   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   GLuint rb = 0;
-   GLboolean ret;
-   if (_eglParseBufferObjAttribList(&rb, attr_list) != EGL_SUCCESS)
-      return EGL_FALSE;
-   ret = cl_gl_acquire_render_buffer(dri2_dpy->driver,
-                                     dri2_ctx->dri_context,
-                                     rb, user_data);
-   return ret;
-static EGLBoolean
-dri2_release_render_buffer(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
-   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
-   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
-   GLuint rb = 0;
-   GLboolean ret;
-   if (_eglParseBufferObjAttribList(&rb, attr_list) != EGL_SUCCESS)
-      return EGL_FALSE;
-   ret = cl_gl_release_render_buffer(dri2_dpy->driver,
-                                     dri2_ctx->dri_context,
-                                     rb);
-   return ret;
-static EGLBoolean
-dri2_acquire_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
-                           const EGLint *attrib_list, void *user_data)
-   switch (target) {
-     return dri2_acquire_texture(disp, ctx, attrib_list, user_data);
-     return dri2_acquire_buffer_object(disp, ctx, attrib_list, user_data);
-     return dri2_acquire_render_buffer(disp, ctx, attrib_list, user_data);
-   default:
-      fprintf(stderr, "bad resource target value 0x%04x",
-              target);
-   }
-   return EGL_FALSE;
-static EGLBoolean
-dri2_release_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
-                           const EGLint *attrib_list)
-   switch (target) {
-     return dri2_release_texture(disp, ctx, attrib_list);
-     return dri2_release_buffer_object(disp, ctx, attrib_list);
-     return dri2_release_render_buffer(disp, ctx, attrib_list);
-   default:
-      fprintf(stderr, "bad resource target value 0x%04x",
-              target);
-   }
-   return EGL_FALSE;
-eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void *user)
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLContext *context = _eglLookupContext(ctx, disp);
-   return dri2_acquire_resource_mesa(disp, context, target, attrib_list, user);
-eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list)
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLContext *context = _eglLookupContext(ctx, disp);
-   return dri2_release_resource_mesa(disp, context, target, attrib_list);
diff --git a/src/x11/mesa_egl_extension.h b/src/x11/mesa_egl_extension.h
deleted file mode 100644
index 39ea134..0000000
--- a/src/x11/mesa_egl_extension.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <EGL/egl.h>
-#include <GL/gl.h>
-#include <GL/internal/dri_interface.h>
-#define EGL_GL_TEXTURE_MESA             0x3300  /* eglAcuireResource target */
-#define EGL_GL_BUFFER_OBJECT_MESA       0x3301  /* eglAcuireResource target */
-#define EGL_GL_RENDER_BUFFER_MESA       0x3302  /* eglAcuireResource target */
-#define EGL_GL_TEXTURE_ID_MESA          0x3303  /* eglAcuireResource attribute */
-#define EGL_GL_TEXTURE_LEVEL_MESA       0x3304  /* eglAcuireResource attribute */
-#define EGL_GL_TEXTURE_TARGET_MESA      0x3305  /* eglAcuireResource attribute */
-#define EGL_GL_BUFFER_OBJECT_ID_MESA    0x3306  /* eglAcuireResource attribute */
-#define EGL_GL_RENDER_BUFFER_ID_MESA    0x3307  /* eglAcuireResource attribute */
-EGLBoolean eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void * user_data);
-EGLBoolean eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list);
diff --git a/src/x11/mesa_egl_res_share.c b/src/x11/mesa_egl_res_share.c
deleted file mode 100644
index 93e9454..0000000
--- a/src/x11/mesa_egl_res_share.c
+++ /dev/null
@@ -1,135 +0,0 @@
- *
- * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
- * Copyright 2013-2014 Intel, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- *
- **************************************************************************/
-#include <assert.h>
-#include <string.h>
-#include "mesa_egl_extension.h"
-#include "mesa_egl_res_share.h"
- * Parse the list of share texture attributes and return the proper error code.
- */
-_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target, EGLint *level,
-                           const EGLint *attrib_list)
-   EGLint i, err = EGL_SUCCESS;
-   *texture = 0;
-   *gl_target = 0;
-   *level = 0;
-   if (!attrib_list)
-      return EGL_BAD_ATTRIBUTE;
-   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
-      EGLint attr = attrib_list[i++];
-      EGLint val = attrib_list[i];
-      switch (attr) {
-         *level = val;
-         break;
-         *texture = val;
-         break;
-         *gl_target = val;
-         break;
-      default:
-         /* unknown attrs are ignored */
-         break;
-      }
-   }
-   return err;
- * Parse the list of share texture attributes and return the proper error code.
- */
-_eglParseBufferObjAttribList(unsigned int *bufobj, const EGLint *attrib_list)
-   EGLint i, err = EGL_SUCCESS;
-   *bufobj = 0;
-   if (!attrib_list)
-      return EGL_BAD_ATTRIBUTE;
-   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
-      EGLint attr = attrib_list[i++];
-      EGLint val = attrib_list[i];
-      switch (attr) {
-         *bufobj = val;
-         break;
-      default:
-         /* unknown attrs are ignored */
-         break;
-      }
-   }
-   if (*bufobj == 0)
-      err = EGL_BAD_ATTRIBUTE;
-   return err;
- * Parse the list of share texture attributes and return the proper error code.
- */
-_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list)
-   EGLint i, err = EGL_SUCCESS;
-   *rb = 0;
-   if (!attrib_list)
-      return EGL_BAD_ATTRIBUTE;
-   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
-      EGLint attr = attrib_list[i++];
-      EGLint val = attrib_list[i];
-      switch (attr) {
-         *rb = val;
-         break;
-      default:
-         /* unknown attrs are ignored */
-         break;
-      }
-   }
-   if (*rb == 0)
-      err = EGL_BAD_ATTRIBUTE;
-   return err;
diff --git a/src/x11/mesa_egl_res_share.h b/src/x11/mesa_egl_res_share.h
deleted file mode 100644
index 43e746e..0000000
--- a/src/x11/mesa_egl_res_share.h
+++ /dev/null
@@ -1,44 +0,0 @@
- *
- * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
- * Copyright 2013-2014 Intel, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- *
- **************************************************************************/
-#include <EGL/egl.h>
-_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target,
-                           EGLint *level, const EGLint *attrib_list);
-_eglParseBufferObjAttribList(unsigned int *bufobj,
-                             const EGLint *attrib_list);
-_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list);
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 4957b7c..db61844 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -40,7 +40,10 @@ endif (NOT NOT_BUILD_STAND_ALONE_UTEST)
-                    ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../include
+                    ${OPENGL_INCLUDE_DIRS}
+                    ${EGL_INCLUDE_DIRS})
 ##### Math Function Part:
@@ -58,7 +61,7 @@ configure_file (
 #XXX only need GL if required
-link_directories (${LLVM_LIBRARY_DIR} ${OPENGL_LIBDIR} ${DRM_LIBDIR})
 set (utests_basic_sources
@@ -159,6 +162,7 @@ set (utests_sources
+  compiler_ctz.cpp
@@ -281,6 +285,7 @@ set (utests_sources
+  multi_queue_events.cpp
@@ -292,6 +297,19 @@ if (LLVM_VERSION_NODOT VERSION_GREATER 34)
+  SET(utests_sources
+      ${utests_sources}
+      compiler_program_global.cpp
+      compiler_generic_atomic.cpp
+      compiler_atomic_functions_20.cpp
+      compiler_sampler.cpp
+      compiler_generic_pointer.cpp
+      runtime_pipe_query.cpp
+      compiler_pipe_builtin.cpp
+      compiler_device_enqueue.cpp)
+endif (ENABLE_OPENCL_20)
   if (X11_FOUND)
@@ -339,13 +357,12 @@ add_custom_target(utest_generator
+#compiler_fill_gl_image test case also need xlib
   SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
@@ -357,13 +374,15 @@ endif ()
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-tautological-compare")
 endif ()
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
+SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecated-declarations" )
 ADD_LIBRARY(utests SHARED ${utests_sources})
 ADD_EXECUTABLE(utest_run utest_run.cpp)
diff --git a/utests/compiler_atomic_functions_20.cpp b/utests/compiler_atomic_functions_20.cpp
new file mode 100644
index 0000000..ea1ace5
--- /dev/null
+++ b/utests/compiler_atomic_functions_20.cpp
@@ -0,0 +1,106 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+#include <string.h>
+#define GROUP_NUM 16
+#define LOCAL_SIZE 256
+static void cpu_compiler_atomic(int *dst, int *src)
+  dst[4] = 0xffffffff;
+  int tmp[16] = { 0 };
+  tmp[4] = -1;
+  for(int j=0; j<LOCAL_SIZE; j++) {
+    int i = j % 12;
+    switch(i) {
+      case 0: tmp[i] += 1; break;
+      case 1: tmp[i] -= 1; break;
+      case 2: tmp[i] += src[j]; break;
+      case 3: tmp[i] -= src[j]; break;
+      case 4: tmp[i] &= ~(src[j]<<(j>>4)); break;
+      case 5: tmp[i] |= src[j]<<(j>>4); break;
+      case 6: tmp[i] ^= src[j]; break;
+      case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
+      case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
+      case 9: tmp[i] = (unsigned int)tmp[i] < (unsigned int)(-src[j]) ? tmp[i] : -src[j]; break;
+      case 10: tmp[i] = (unsigned int)tmp[i] > (unsigned int)(src[j]) ? tmp[i] : src[j]; break;
+      case 11:  tmp[i] = src[10]; break;
+      default:  break;
+    }
+  }
+  for(int k=0; k<GROUP_NUM; k++) {
+    for(int j=0; j<LOCAL_SIZE; j++) {
+      int i = j % 12;
+      switch(i) {
+        case 0: dst[i] += 1; break;
+        case 1: dst[i] -= 1; break;
+        case 2: dst[i] += src[j]; break;
+        case 3: dst[i] -= src[j]; break;
+        case 4: dst[i] &= ~(src[j]<<(j>>4)); break;
+        case 5: dst[i] |= src[j]<<(j>>4); break;
+        case 6: dst[i] ^= src[j]; break;
+        case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
+        case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
+        case 9: dst[i] = (unsigned int)dst[i] < (unsigned int)(-src[j]) ? dst[i] : -src[j]; break;
+        case 10: dst[i] = (unsigned int)dst[i] > (unsigned int)(src[j]) ? dst[i] : src[j]; break;
+        case 11:  dst[i] = src[10]; break;
+        default:  break;
+      }
+    }
+  }
+  for(int i=0; i<12; i++)
+    dst[i+12] = tmp[i];
+static void compiler_atomic_functions(const char* kernel_name)
+  const size_t n = GROUP_NUM * LOCAL_SIZE;
+  int cpu_dst[24] = {0}, cpu_src[256];
+  globals[0] = n;
+  locals[0] = LOCAL_SIZE;
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_atomic_functions_20", kernel_name);
+  OCL_CREATE_BUFFER(buf[0], 0, 24 * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, 16 * sizeof(int), NULL);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+  memset(buf_data[0], 0, 24 * sizeof(int));
+  ((int *)buf_data[0])[4] = -1;
+  for (uint32_t i = 0; i < locals[0]; ++i)
+      cpu_src[i] = ((int*)buf_data[1])[i] = rand() & 0xff;
+  cpu_compiler_atomic(cpu_dst, cpu_src);
+  // Check results
+  for(int i=0; i<24; i++) {
+    //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
+    OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
+  }
+#define compiler_atomic(kernel, version) \
+static void compiler_atomic_functions_##version()\
+  compiler_atomic_functions(kernel); \
+} \
+compiler_atomic("compiler_atomic_functions_20", 20)
diff --git a/utests/compiler_ctz.cpp b/utests/compiler_ctz.cpp
new file mode 100644
index 0000000..d84fdad
--- /dev/null
+++ b/utests/compiler_ctz.cpp
@@ -0,0 +1,62 @@
+#include "utest_helper.hpp"
+namespace {
+template<typename T>
+T get_max();
+template<typename U>
+void test(const char *kernel_name)
+  const size_t n = 65;
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_ctz", kernel_name);
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(U), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(U), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  for (uint32_t i = 0; i < n; ++i) {
+      ((U*)buf_data[0])[i] = 1ll << i;
+      if(i == sizeof(U)*8)
+        ((U*)buf_data[0])[i] = 0;
+  }
+  globals[0] = n;
+  locals[0] = 1;
+  for (uint32_t i = 0; i < n; ++i) {
+      if(sizeof(U) == 1 && i <= 8 )
+        OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
+      else if(sizeof(U) == 2 && i <= 16 )
+        OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
+      else if(sizeof(U) == 4 && i <= 32 )
+        OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
+      else if(sizeof(U) == 8 && i <= 64 )
+        OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
+    }
+#define compiler_ctz(type, kernel)\
+static void compiler_ctz_ ##type(void)\
+  test<type>(# kernel);\
+MAKE_UTEST_FROM_FUNCTION(compiler_ctz_ ## type);
+compiler_ctz(uint64_t, compiler_ctz_ulong)
+compiler_ctz(uint32_t, compiler_ctz_uint)
+compiler_ctz(uint16_t, compiler_ctz_ushort)
+compiler_ctz(uint8_t, compiler_ctz_uchar)
+compiler_ctz(int64_t, compiler_ctz_long)
+compiler_ctz(int32_t, compiler_ctz_int)
+compiler_ctz(int16_t, compiler_ctz_short)
+compiler_ctz(int8_t, compiler_ctz_char)
diff --git a/utests/compiler_device_enqueue.cpp b/utests/compiler_device_enqueue.cpp
new file mode 100644
index 0000000..a9e3e2d
--- /dev/null
+++ b/utests/compiler_device_enqueue.cpp
@@ -0,0 +1,36 @@
+#include "utest_helper.hpp"
+void compiler_device_enqueue(void)
+  const size_t n = 32;
+  const uint32_t global_sz = 3;
+  uint32_t result = 0;
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_device_enqueue");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(uint32_t), &global_sz);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+  for(uint32_t i = 0; i < 69; ++i)
+    ((short *)buf_data[0])[i] = 0;
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  for(uint32_t i = 0; i < global_sz; ++i) {
+    result += i;
+  }
+  result *= global_sz;
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == result);
diff --git a/utests/compiler_fill_gl_image.cpp b/utests/compiler_fill_gl_image.cpp
index f1eb8e7..b9d74d1 100644
--- a/utests/compiler_fill_gl_image.cpp
+++ b/utests/compiler_fill_gl_image.cpp
@@ -1,36 +1,14 @@
 #include "utest_helper.hpp"
-static void read_back(int tex, int width, int height, uint32_t * resultColor)
-  float vertices[8] = {-1, 1, 1, 1, 1, -1, -1, -1};
-  float tex_coords[8] = {0, 0, 1, 0, 1, 1, 0, 1};
-  glBindTexture(GL_TEXTURE_2D, tex);
-  glEnable(GL_TEXTURE_2D);
-  glDisable(GL_BLEND);
-  glVertexPointer(2, GL_FLOAT, sizeof(float) * 2, vertices);
-  glEnableClientState(GL_VERTEX_ARRAY);
-  glClientActiveTexture(GL_TEXTURE0);
-  glTexCoordPointer(2, GL_FLOAT, sizeof(float) * 2, tex_coords);
-  glEnableClientState(GL_TEXTURE_COORD_ARRAY);
-  glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-  glFlush();
-  glReadPixels(0, 0, width, height, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, resultColor);
 static void compiler_fill_gl_image(void)
   const size_t w = EGL_WINDOW_WIDTH;
   const size_t h = EGL_WINDOW_HEIGHT;
-  uint32_t color = 0x123456FF;
-  uint32_t *resultColor;
+  uint32_t color0 = 0x123456FF;
+  uint32_t color1 = 0x789ABCDE;
+  uint32_t *resultColor0;
+  uint32_t *resultColor1;
   GLuint tex;
   if (eglContext == EGL_NO_CONTEXT) {
@@ -44,13 +22,15 @@ static void compiler_fill_gl_image(void)
   glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, NULL);
+  glGenerateMipmap(GL_TEXTURE_2D);
+  glTexImage2D(GL_TEXTURE_2D, 1, GL_RGBA, w/2, h/2, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, NULL);
+  //Create cl image from miplevel 0
   OCL_CREATE_GL_IMAGE(buf[0], 0, GL_TEXTURE_2D, 0, tex);
   // Run the kernel
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(color), &color);
+  OCL_SET_ARG(1, sizeof(color0), &color0);
   globals[0] = w;
   globals[1] = h;
   locals[0] = 16;
@@ -59,18 +39,37 @@ static void compiler_fill_gl_image(void)
   // Check result
-  resultColor = new uint32_t[w * h * 4];
-  if (resultColor == NULL)
+  resultColor0 = new uint32_t[w * h];
+  if (resultColor0 == NULL)
-  read_back(tex, w, h, resultColor);
+  glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, resultColor0);
   for (uint32_t j = 0; j < h; ++j)
     for (uint32_t i = 0; i < w; i++)
-      OCL_ASSERT(resultColor[j * w + i] == color);
-  delete[] resultColor;
+      OCL_ASSERT(resultColor0[j * w + i] == color0);
+  //Create cl image from miplevel 1
+  OCL_CREATE_GL_IMAGE(buf[1], 0, GL_TEXTURE_2D, 1, tex);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(1, sizeof(color1), &color1);
+  globals[0] = w/2;
+  globals[1] = h/2;
+  // Check result
+  resultColor1 = new uint32_t[(w/2)*(h/2)];
+  glGetTexImage(GL_TEXTURE_2D, 1, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, resultColor1);
+  for (uint32_t j = 0; j < h/2; ++j)
+    for (uint32_t i = 0; i < w/2; i++)
+      OCL_ASSERT(resultColor1[j * (w/2) + i] == color1);
+  delete[] resultColor0;
+  delete[] resultColor1;
diff --git a/utests/compiler_generic_atomic.cpp b/utests/compiler_generic_atomic.cpp
new file mode 100644
index 0000000..9ed5f53
--- /dev/null
+++ b/utests/compiler_generic_atomic.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+template<typename T>
+void test_atomic(const char* kernelName)
+  const int n = 16;
+  T cpu_src[16];
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_generic_atomic.cl", kernelName, SOURCE, "-cl-std=CL2.0");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  for (int i = 0; i <  n; ++i)
+    cpu_src[i] = ((T*)buf_data[0])[i] = (T)i;
+  // Run the kernel on GPU
+  // Compare
+  for (int32_t i = 0; i < n; ++i) {
+//    printf("i=%d dst=%d\n", i, ((T*)buf_data[1])[i]);
+    OCL_ASSERT(((T*)buf_data[1])[i] == 2 * cpu_src[i]);
+  }
+#define GENERIC_ATOMIC_TEST(T)                    \
+void compiler_generic_atomic_##T() {       \
+  test_atomic<T>("compiler_generic_atomic_"#T);   \
+} \
diff --git a/utests/compiler_generic_pointer.cpp b/utests/compiler_generic_pointer.cpp
new file mode 100644
index 0000000..5984694
--- /dev/null
+++ b/utests/compiler_generic_pointer.cpp
@@ -0,0 +1,46 @@
+#include "utest_helper.hpp"
+template<typename T>
+void test(const char* kernelName)
+  const int n = 16;
+  T cpu_src[16];
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_generic_pointer.cl", kernelName, SOURCE, "-cl-std=CL2.0");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  for (int i = 0; i <  n; ++i)
+    cpu_src[i] = ((T*)buf_data[0])[i] = (T)i;
+  // Run the kernel on GPU
+  // Compare
+  for (int32_t i = 0; i < n; ++i) {
+//    printf("i=%d dst=%d\n", i, ((T*)buf_data[1])[i]);
+    OCL_ASSERT(((T*)buf_data[1])[i] == 2 * cpu_src[i]);
+  }
+#define GENERIC_TEST(T)                    \
+void compiler_generic_pointer_##T() {      \
+  test<T>("compiler_generic_pointer_"#T); \
+} \
diff --git a/utests/compiler_pipe_builtin.cpp b/utests/compiler_pipe_builtin.cpp
new file mode 100644
index 0000000..c8ec077
--- /dev/null
+++ b/utests/compiler_pipe_builtin.cpp
@@ -0,0 +1,69 @@
+#include <string.h>
+#include "utest_helper.hpp"
+typedef struct{
+  int a;
+  uint b;
+static void compiler_pipe_##GROUP##_##TYPE(void) \
+{ \
+  const size_t w = 16;  \
+  uint32_t ans_host = 0;  \
+  uint32_t ans_device = 0;  \
+  /* pipe write kernel*/  \
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_pipe_builtin", "compiler_pipe_"#GROUP"_write_"#TYPE);  \
+  OCL_CALL2(clCreatePipe, buf[0], ctx, 0, sizeof(TYPE), w, NULL);\
+  for (uint32_t i = 0; i < w; i++)\
+      ((uint32_t*)buf_data[1])[i] = i;\
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);\
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);\
+  globals[0] = w;\
+  locals[0] = 16;\
+  OCL_CALL(clReleaseKernel, kernel);\
+  /* pipe read kernel */\
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_pipe_builtin", "compiler_pipe_"#GROUP"_read_"#TYPE);\
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);\
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[2]);\
+  /* Check result */\
+  for (uint32_t i = 0; i < w; i++) {\
+      ans_device += ((uint32_t*)buf_data[2])[i];\
+      ans_host += i;\
+  }\
+  OCL_ASSERT(ans_host == ans_device);\
+PIPE_BUILTIN(int, convenience)
+PIPE_BUILTIN(mystruct, convenience)
+PIPE_BUILTIN(int, reserve)
+PIPE_BUILTIN(mystruct, reserve)
+PIPE_BUILTIN(int, workgroup)
+PIPE_BUILTIN(mystruct, workgroup)
+static void compiler_pipe_query(void) {
+  const size_t w = 32;
+  const size_t sz = 16;
+  /* pipe write kernel */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_pipe_builtin", "compiler_pipe_query");
+  OCL_CALL2(clCreatePipe, buf[0], ctx, 0, sizeof(uint32_t), w, NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sz * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = sz;
+  locals[0] = 16;
+  /*Check result */
+  OCL_ASSERT(sz == ((uint32_t *)buf_data[1])[0] && w == ((uint32_t *)buf_data[1])[1]);
diff --git a/utests/compiler_program_global.cpp b/utests/compiler_program_global.cpp
new file mode 100644
index 0000000..ef7c655
--- /dev/null
+++ b/utests/compiler_program_global.cpp
@@ -0,0 +1,80 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+static int init_program(const char* name, cl_context ctx, cl_program *pg )
+  cl_int err;
+  char* ker_path = cl_do_kiss_path(name, device);
+  cl_file_map_t *fm = cl_file_map_new();
+  err = cl_file_map_open(fm, ker_path);
+  if(err != CL_FILE_MAP_SUCCESS)
+    OCL_ASSERT(0);
+  const char *src = cl_file_map_begin(fm);
+  *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
+  free(ker_path);
+  cl_file_map_delete(fm);
+  return 0;
+void compiler_program_global()
+  const int n = 16;
+  int cpu_src[16];
+  cl_int err;
+  // Setup kernel and buffers
+  cl_program program;
+  init_program("compiler_program_global.cl", ctx, &program);
+  OCL_CALL (clBuildProgram, program, 1, &device, "-cl-std=CL2.0", NULL, NULL);
+  cl_kernel k0 = clCreateKernel(program, "compiler_program_global0", &err);
+  assert(err == CL_SUCCESS);
+  cl_kernel k1 = clCreateKernel(program, "compiler_program_global1", &err);
+  assert(err == CL_SUCCESS);
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CALL (clSetKernelArg, k0, 0, sizeof(cl_mem), &buf[0]);
+  OCL_CALL (clSetKernelArg, k1, 0, sizeof(cl_mem), &buf[1]);
+  int dynamic = 1;
+  OCL_CALL (clSetKernelArg, k0, 1, sizeof(cl_int), &dynamic);
+  OCL_CALL (clSetKernelArg, k1, 1, sizeof(cl_int), &dynamic);
+  globals[0] = 16;
+  locals[0] = 16;
+  for (int i = 0; i <  n; ++i)
+    cpu_src[i] = ((int*)buf_data[0])[i] = i;
+  // Run the kernel on GPU
+  OCL_CALL (clEnqueueNDRangeKernel, queue, k0, 1, NULL, globals, locals, 0, NULL, NULL);
+  OCL_CALL (clEnqueueNDRangeKernel, queue, k1, 1, NULL, globals, locals, 0, NULL, NULL);
+  // Compare
+  for (int32_t i = 0; i < n; ++i) {
+//    printf("i=%d dst=%d\n", i, ((int*)buf_data[1])[i]);
+    switch(i) {
+      default: OCL_ASSERT(((int*)buf_data[1])[i] == i); break;
+      case 11: OCL_ASSERT(((int*)buf_data[1])[i] == 7); break;
+      case 12: OCL_ASSERT(((int*)buf_data[1])[i] == 4); break;
+      case 13: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
+      case 14: OCL_ASSERT(((int*)buf_data[1])[i] == 3); break;
+      case 15: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break;
+    }
+  }
+  clReleaseKernel(k0);
+  clReleaseKernel(k1);
+  clReleaseProgram(program);
diff --git a/utests/compiler_sampler.cpp b/utests/compiler_sampler.cpp
index 32bf926..f8bf622 100644
--- a/utests/compiler_sampler.cpp
+++ b/utests/compiler_sampler.cpp
@@ -8,7 +8,7 @@ void compiler_sampler(void)
   OCL_ASSERT(ctx != 0);
   cl_sampler s;
   cl_int err;
-  int a1[] = {CL_TRUE, CL_FALSE},
+  cl_uint a1[] = {CL_TRUE, CL_FALSE},
@@ -33,6 +33,18 @@ void compiler_sampler(void)
         for(l=0; l<5; l++)
           OCL_CALL(clGetSamplerInfo, s, a4[l], 1000, pv, &pv_size);
         OCL_CALL(clReleaseSampler, s);
+        cl_sampler_properties sam[] = {
+        CL_SAMPLER_FILTER_MODE, a3[k],
+        0};
+        s = clCreateSamplerWithProperties(ctx, sam, &err);
+        OCL_ASSERT(err == CL_SUCCESS);
+        OCL_CALL(clRetainSampler, s);
+        OCL_CALL(clReleaseSampler, s);
+        for(l=0; l<5; l++)
+          OCL_CALL(clGetSamplerInfo, s, a4[l], 1000, pv, &pv_size);
+        OCL_CALL(clReleaseSampler, s);
diff --git a/utests/compiler_sub_group_shuffle.cpp b/utests/compiler_sub_group_shuffle.cpp
index f33e9de..2aadfed 100644
--- a/utests/compiler_sub_group_shuffle.cpp
+++ b/utests/compiler_sub_group_shuffle.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
-void compiler_sub_group_shuffle(void)
+void compiler_sub_group_shuffle_int(void)
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle(void)
   const int32_t buf_size = 4 * n + 1;
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_group_shuffle");
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle",
+                              "compiler_sub_group_shuffle_int");
   OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -43,5 +44,50 @@ void compiler_sub_group_shuffle(void)
+void compiler_sub_group_shuffle_short(void)
+  if(!cl_check_subgroups_short())
+    return;
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle.cl",
+                           "compiler_sub_group_shuffle_short",
+                           SOURCE, "-DSHORT");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  int c = 3;
+  OCL_SET_ARG(1, sizeof(int), &c);
+  globals[0] = n;
+  locals[0] = 16;
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((short*)buf_data[0])[i] = -1;
+  // Run the kernel on GPU
+  // Compare
+  short* dst = (short*)buf_data[0];
+  int suggroupsize = dst[0];
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+  dst++;
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / suggroupsize;
+    int index = i % suggroupsize;
+    OCL_ASSERT(index == dst[4*i]);
+    OCL_ASSERT((round * suggroupsize + c) == dst[4*i+1]);
+    OCL_ASSERT((round * suggroupsize + 5) == dst[4*i+2]);
+    OCL_ASSERT((round * suggroupsize + (suggroupsize - index - 1)) == dst[4*i+3]);
+  }
diff --git a/utests/compiler_sub_group_shuffle_down.cpp b/utests/compiler_sub_group_shuffle_down.cpp
index 8b23234..13f6e12 100644
--- a/utests/compiler_sub_group_shuffle_down.cpp
+++ b/utests/compiler_sub_group_shuffle_down.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
-void compiler_sub_group_shuffle_down(void)
+void compiler_sub_group_shuffle_down_int(void)
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_down(void)
   const int32_t buf_size = 4 * n + 1;
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_group_shuffle_down");
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_down",
+                              "compiler_sub_group_shuffle_down_int");
   OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_down(void)
+void compiler_sub_group_shuffle_down_short(void)
+  if(!cl_check_subgroups_short())
+    return;
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_down.cl",
+                           "compiler_sub_group_shuffle_down_short",
+                           SOURCE, "-DSHORT");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  int c = 13;
+  OCL_SET_ARG(1, sizeof(int), &c);
+  globals[0] = n;
+  locals[0] = 16;
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((short*)buf_data[0])[i] = -1;
+  // Run the kernel on GPU
+  // Compare
+  short* dst = (short *)buf_data[0];
+  short suggroupsize = dst[0];
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+  dst++;
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / suggroupsize;
+    int index = i % suggroupsize;
+    //printf("%d %d %d %d\n",dst[4*i], dst[4*i+1], dst[4*i+2], dst[4*i+3]);
+    OCL_ASSERT( (index + c >= suggroupsize ? 456 : 123) == dst[4*i]);
+    OCL_ASSERT( (index + c >= suggroupsize ? (round * suggroupsize + (i + c) % suggroupsize): 123) == dst[4*i+1]);
+    OCL_ASSERT( (index + index + 1 >= suggroupsize ? -(round * suggroupsize + (i + index + 1) % suggroupsize) : (round * suggroupsize + (i + index + 1) % suggroupsize))  == dst[4*i+2]);
+    OCL_ASSERT((round * suggroupsize + (suggroupsize - 1)) == dst[4*i+3]);
+  }
diff --git a/utests/compiler_sub_group_shuffle_up.cpp b/utests/compiler_sub_group_shuffle_up.cpp
index d2e054b..f79f03c 100644
--- a/utests/compiler_sub_group_shuffle_up.cpp
+++ b/utests/compiler_sub_group_shuffle_up.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
-void compiler_sub_group_shuffle_up(void)
+void compiler_sub_group_shuffle_up_int(void)
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_up(void)
   const int32_t buf_size = 4 * n + 1;
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_group_shuffle_up");
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_up",
+                              "compiler_sub_group_shuffle_up_int");
   OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_up(void)
+void compiler_sub_group_shuffle_up_short(void)
+  if(!cl_check_subgroups_short())
+    return;
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_up.cl",
+                           "compiler_sub_group_shuffle_up_short",
+                           SOURCE, "-DSHORT");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  int c = 3;
+  OCL_SET_ARG(1, sizeof(int), &c);
+  globals[0] = n;
+  locals[0] = 16;
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((short*)buf_data[0])[i] = -1;
+  // Run the kernel on GPU
+  // Compare
+  short* dst = (short *)buf_data[0];
+  short suggroupsize = dst[0];
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+  dst++;
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / suggroupsize;
+    int index = i % suggroupsize;
+    //printf("%d %d %d %d\n",dst[4*i], dst[4*i+1], dst[4*i+2], dst[4*i+3]);
+    OCL_ASSERT( ((c - index) > 0 ? 123 : 456) == dst[4*i]);
+    OCL_ASSERT( ((c - index) > 0 ? 123 : (i - c)) == dst[4*i+1]);
+    OCL_ASSERT( ((suggroupsize - index - 1 - index) > 0 ? (i + index + 1) : -(i + index + 1 - suggroupsize)) == dst[4*i+2]);
+    OCL_ASSERT((round * suggroupsize + (suggroupsize - 1)) == dst[4*i+3]);
+  }
diff --git a/utests/compiler_sub_group_shuffle_xor.cpp b/utests/compiler_sub_group_shuffle_xor.cpp
index 967ec3e..b0ad3ee 100644
--- a/utests/compiler_sub_group_shuffle_xor.cpp
+++ b/utests/compiler_sub_group_shuffle_xor.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
-void compiler_sub_group_shuffle_xor(void)
+void compiler_sub_group_shuffle_xor_int(void)
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_xor(void)
   const int32_t buf_size = 4 * n + 1;
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_group_shuffle_xor");
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_xor",
+                              "compiler_sub_group_shuffle_xor_int");
   OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_xor(void)
+void compiler_sub_group_shuffle_xor_short(void)
+  if(!cl_check_subgroups_short())
+    return;
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_xor.cl",
+                           "compiler_sub_group_shuffle_xor_short",
+                           SOURCE, "-DSHORT");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  int c = 3;
+  OCL_SET_ARG(1, sizeof(int), &c);
+  globals[0] = n;
+  locals[0] = 16;
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((short*)buf_data[0])[i] = -1;
+  // Run the kernel on GPU
+  // Compare
+  short* dst = (short *)buf_data[0];
+  short suggroupsize = dst[0];
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+  dst++;
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / suggroupsize;
+    int index = i % suggroupsize;
+    OCL_ASSERT(index == dst[4*i]);
+    //printf("%d %d %d %d\n", i, dst[4*i+1], dst[4*i+2], dst[4*i+3]);
+    OCL_ASSERT((round * suggroupsize + (c ^ index)) == dst[4*i+1]);
+    OCL_ASSERT((round * suggroupsize + (index ^ (suggroupsize - index -1))) == dst[4*i+2]);
+    OCL_ASSERT((round * suggroupsize + (index ^ (index + 1) % suggroupsize)) == dst[4*i+3]);
+  }
diff --git a/utests/compiler_subgroup_broadcast.cpp b/utests/compiler_subgroup_broadcast.cpp
index 2e3fabb..33ec43c 100644
--- a/utests/compiler_subgroup_broadcast.cpp
+++ b/utests/compiler_subgroup_broadcast.cpp
@@ -59,10 +59,15 @@ static void generate_data(T* &input,
       /* initially 0, augment after */
       input[gid + lid] = 0;
-      /* check all data types, test ideal for QWORD types */
-      input[gid + lid] += ((rand() % 2 - 1) * base_val);
-      /* add trailing random bits, tests GENERAL cases */
-      input[gid + lid] += (rand() % 112);
+      if(sizeof(T) == 2) {
+        input[gid + lid] = __float_to_half(as_uint((float)(gid + lid)));
+      }
+      else {
+        /* check all data types, test ideal for QWORD types */
+        input[gid + lid] += ((rand() % 2 - 1) * base_val);
+        /* add trailing random bits, tests GENERAL cases */
+        input[gid + lid] += (rand() % 112);
+      }
       /* output generated input */
@@ -185,3 +190,28 @@ void compiler_subgroup_broadcast_long(void)
   subgroup_generic(input, expected);
+void compiler_subgroup_broadcast_short(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+                              "compiler_subgroup_broadcast_short");
+  subgroup_generic(input, expected);
+void compiler_subgroup_broadcast_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_broadcast.cl",
+                           "compiler_subgroup_broadcast_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(input, expected);
diff --git a/utests/compiler_subgroup_buffer_block_read.cpp b/utests/compiler_subgroup_buffer_block_read.cpp
index 9707f19..74bc899 100644
--- a/utests/compiler_subgroup_buffer_block_read.cpp
+++ b/utests/compiler_subgroup_buffer_block_read.cpp
@@ -64,6 +64,7 @@ static void generate_data(T* &input,
         input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val);
         /* add trailing random bits, tests GENERAL cases */
         input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112);
+        //input[(gid + lid)*VEC_SIZE + vsz] = (gid + lid)*VEC_SIZE + vsz;
         /* output generated input */
@@ -156,47 +157,95 @@ static void subgroup_generic(T* input,
  * subgroup buffer block read
-void compiler_subgroup_buffer_block_read1(void)
+void compiler_subgroup_buffer_block_read_ui1(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_buffer_block_read1");
+                              "compiler_subgroup_buffer_block_read_ui1");
   subgroup_generic(input, expected, 1);
-void compiler_subgroup_buffer_block_read2(void)
+void compiler_subgroup_buffer_block_read_ui2(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_buffer_block_read2");
+                              "compiler_subgroup_buffer_block_read_ui2");
   subgroup_generic(input, expected, 2);
-void compiler_subgroup_buffer_block_read4(void)
+void compiler_subgroup_buffer_block_read_ui4(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_buffer_block_read4");
+                              "compiler_subgroup_buffer_block_read_ui4");
   subgroup_generic(input, expected, 4);
-void compiler_subgroup_buffer_block_read8(void)
+void compiler_subgroup_buffer_block_read_ui8(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_buffer_block_read8");
+                              "compiler_subgroup_buffer_block_read_ui8");
   subgroup_generic(input, expected, 8);
+void compiler_subgroup_buffer_block_read_us1(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_read.cl",
+                           "compiler_subgroup_buffer_block_read_us1",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 1);
+void compiler_subgroup_buffer_block_read_us2(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_read.cl",
+                           "compiler_subgroup_buffer_block_read_us2",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 2);
+void compiler_subgroup_buffer_block_read_us4(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_read.cl",
+                           "compiler_subgroup_buffer_block_read_us4",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 4);
+void compiler_subgroup_buffer_block_read_us8(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_read.cl",
+                           "compiler_subgroup_buffer_block_read_us8",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 8);
diff --git a/utests/compiler_subgroup_buffer_block_write.cpp b/utests/compiler_subgroup_buffer_block_write.cpp
index 6b257c5..3b538da 100644
--- a/utests/compiler_subgroup_buffer_block_write.cpp
+++ b/utests/compiler_subgroup_buffer_block_write.cpp
@@ -64,6 +64,7 @@ static void generate_data(T* &input,
         input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val);
         /* add trailing random bits, tests GENERAL cases */
         input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112);
+        //input[(gid + lid)*VEC_SIZE + vsz] = (gid + lid)*VEC_SIZE + vsz;
         /* output generated input */
@@ -156,47 +157,96 @@ static void subgroup_generic(T* input,
  * subgroup buffer block write
-void compiler_subgroup_buffer_block_write1(void)
+void compiler_subgroup_buffer_block_write_ui1(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_buffer_block_write1");
+                              "compiler_subgroup_buffer_block_write_ui1");
   subgroup_generic(input, expected, 1);
-void compiler_subgroup_buffer_block_write2(void)
+void compiler_subgroup_buffer_block_write_ui2(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_buffer_block_write2");
+                              "compiler_subgroup_buffer_block_write_ui2");
   subgroup_generic(input, expected, 2);
-void compiler_subgroup_buffer_block_write4(void)
+void compiler_subgroup_buffer_block_write_ui4(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_buffer_block_write4");
+                              "compiler_subgroup_buffer_block_write_ui4");
   subgroup_generic(input, expected, 4);
-void compiler_subgroup_buffer_block_write8(void)
+void compiler_subgroup_buffer_block_write_ui8(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_buffer_block_write8");
+                              "compiler_subgroup_buffer_block_write_ui8");
   subgroup_generic(input, expected, 8);
+void compiler_subgroup_buffer_block_write_us1(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_write.cl",
+                           "compiler_subgroup_buffer_block_write_us1",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 1);
+void compiler_subgroup_buffer_block_write_us2(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_write.cl",
+                           "compiler_subgroup_buffer_block_write_us2",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 2);
+void compiler_subgroup_buffer_block_write_us4(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_write.cl",
+                           "compiler_subgroup_buffer_block_write_us4",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 4);
+void compiler_subgroup_buffer_block_write_us8(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_buffer_block_write.cl",
+                           "compiler_subgroup_buffer_block_write_us8",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 8);
diff --git a/utests/compiler_subgroup_image_block_read.cpp b/utests/compiler_subgroup_image_block_read.cpp
index 02c8f07..879d622 100644
--- a/utests/compiler_subgroup_image_block_read.cpp
+++ b/utests/compiler_subgroup_image_block_read.cpp
@@ -21,7 +21,7 @@ static void compute_expected(T* input,
   for(uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
     for(uint32_t j = 0; j < VEC_SIZE; j++)
-      expected[i * VEC_SIZE + j] = input[WG_GLOBAL_SIZE * j + i];
+      expected[i * VEC_SIZE + j] = input[WG_GLOBAL_SIZE * 4 / sizeof(T) * j + i];
@@ -33,7 +33,8 @@ static void generate_data(T* &input,
                           size_t VEC_SIZE)
   /* allocate input and expected arrays */
-  input = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+  int* input_ui = new int[WG_GLOBAL_SIZE * VEC_SIZE];
+  input = (T*)input_ui;
   expected = new T[WG_GLOBAL_SIZE * VEC_SIZE];
   /* base value for all data types */
@@ -45,19 +46,22 @@ static void generate_data(T* &input,
     cout << endl << "IN: " << endl;
+  uint32_t rpitch = sizeof(uint32_t) * WG_GLOBAL_SIZE / sizeof(T);
   /* generate inputs and expected values */
-  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE * VEC_SIZE; gid++)
-  {
-    /* initially 0, augment after */
-    input[gid] = ((rand() % 2 - 1) * base_val) + (rand() % 112);
+  for(uint32_t h = 0; h < VEC_SIZE; ++h) {
+    for(uint32_t w = 0; w < WG_GLOBAL_SIZE; ++w)
+    {
+      /* initially 0, augment after */
+      input[w + h * rpitch] = ((rand() % 2 - 1) * base_val) + (rand() % 112);
+      //input[w + h * rpitch] = w + h * WG_GLOBAL_SIZE;
-    /* output generated input */
-    cout << setw(4) << input[gid] << ", " ;
-    if((gid + 1) % 8 == 0)
-          cout << endl;
+      /* output generated input */
+      cout << setw(4) << input[w + h * rpitch] << ", " ;
+      if((w+ 1) % 8 == 0)
+            cout << endl;
+    }
   /* expected values */
   compute_expected(input, expected, VEC_SIZE);
@@ -151,47 +155,95 @@ static void subgroup_generic(T* input,
  * sub_group image block read functions
-void compiler_subgroup_image_block_read1(void)
+void compiler_subgroup_image_block_read_ui1(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_image_block_read1");
+                              "compiler_subgroup_image_block_read_ui1");
   subgroup_generic(input, expected, 1);
-void compiler_subgroup_image_block_read2(void)
+void compiler_subgroup_image_block_read_ui2(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_image_block_read2");
+                              "compiler_subgroup_image_block_read_ui2");
   subgroup_generic(input, expected, 2);
-void compiler_subgroup_image_block_read4(void)
+void compiler_subgroup_image_block_read_ui4(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_image_block_read4");
+                              "compiler_subgroup_image_block_read_ui4");
   subgroup_generic(input, expected, 4);
-void compiler_subgroup_image_block_read8(void)
+void compiler_subgroup_image_block_read_ui8(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_image_block_read8");
+                              "compiler_subgroup_image_block_read_ui8");
+  subgroup_generic(input, expected, 8);
+void compiler_subgroup_image_block_read_us1(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_read.cl",
+                           "compiler_subgroup_image_block_read_us1",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 1);
+void compiler_subgroup_image_block_read_us2(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_read.cl",
+                           "compiler_subgroup_image_block_read_us2",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 2);
+void compiler_subgroup_image_block_read_us4(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_read.cl",
+                           "compiler_subgroup_image_block_read_us4",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 4);
+void compiler_subgroup_image_block_read_us8(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_read.cl",
+                           "compiler_subgroup_image_block_read_us8",
+                           SOURCE, "-DSHORT");
   subgroup_generic(input, expected, 8);
diff --git a/utests/compiler_subgroup_image_block_write.cpp b/utests/compiler_subgroup_image_block_write.cpp
index 2b85167..98cbb0f 100644
--- a/utests/compiler_subgroup_image_block_write.cpp
+++ b/utests/compiler_subgroup_image_block_write.cpp
@@ -50,6 +50,7 @@ static void generate_data(T* &input,
     /* initially 0, augment after */
     input[gid] = ((rand() % 2 - 1) * base_val) + (rand() % 112);
+    //input[gid] = gid;
     /* output generated input */
@@ -155,47 +156,95 @@ static void subgroup_generic(T* input,
  * sub_group image block write functions
-void compiler_subgroup_image_block_write1(void)
+void compiler_subgroup_image_block_write_ui1(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_image_block_write1");
+                              "compiler_subgroup_image_block_write_ui1");
   subgroup_generic(input, expected, 1);
-void compiler_subgroup_image_block_write2(void)
+void compiler_subgroup_image_block_write_ui2(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_image_block_write2");
+                              "compiler_subgroup_image_block_write_ui2");
   subgroup_generic(input, expected, 2);
-void compiler_subgroup_image_block_write4(void)
+void compiler_subgroup_image_block_write_ui4(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_image_block_write4");
+                              "compiler_subgroup_image_block_write_ui4");
   subgroup_generic(input, expected, 4);
-void compiler_subgroup_image_block_write8(void)
+void compiler_subgroup_image_block_write_ui8(void)
   cl_uint *input = NULL;
   cl_uint *expected = NULL;
-                              "compiler_subgroup_image_block_write8");
+                              "compiler_subgroup_image_block_write_ui8");
   subgroup_generic(input, expected, 8);
+void compiler_subgroup_image_block_write_us1(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_write.cl",
+                           "compiler_subgroup_image_block_write_us1",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 1);
+void compiler_subgroup_image_block_write_us2(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_write.cl",
+                           "compiler_subgroup_image_block_write_us2",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 2);
+void compiler_subgroup_image_block_write_us4(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_write.cl",
+                           "compiler_subgroup_image_block_write_us4",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 4);
+void compiler_subgroup_image_block_write_us8(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_image_block_write.cl",
+                           "compiler_subgroup_image_block_write_us8",
+                           SOURCE, "-DSHORT");
+  subgroup_generic(input, expected, 8);
diff --git a/utests/compiler_subgroup_reduce.cpp b/utests/compiler_subgroup_reduce.cpp
index 3c3df06..157086a 100644
--- a/utests/compiler_subgroup_reduce.cpp
+++ b/utests/compiler_subgroup_reduce.cpp
@@ -33,7 +33,8 @@ template<class T>
 static void compute_expected(WG_FUNCTION wg_func,
                     T* input,
                     T* expected,
-                    size_t SIMD_SIZE)
+                    size_t SIMD_SIZE,
+                    bool IS_HALF)
   if(wg_func == WG_ANY)
@@ -54,24 +55,43 @@ static void compute_expected(WG_FUNCTION wg_func,
   else if(wg_func == WG_REDUCE_ADD)
     T wg_sum = input[0];
-    for(uint32_t i = 1; i < SIMD_SIZE; i++)
-      wg_sum += input[i];
+    if(IS_HALF) {
+      float wg_sum_tmp = 0.0f;
+      for(uint32_t i = 0; i < SIMD_SIZE; i++) {
+        wg_sum_tmp += as_float(__half_to_float(input[i]));
+      }
+      wg_sum = __float_to_half(as_uint(wg_sum_tmp));
+    }
+    else {
+      for(uint32_t i = 1; i < SIMD_SIZE; i++)
+        wg_sum += input[i];
+    }
     for(uint32_t i = 0; i < SIMD_SIZE; i++)
       expected[i] = wg_sum;
   else if(wg_func == WG_REDUCE_MAX)
     T wg_max = input[0];
-    for(uint32_t i = 1; i < SIMD_SIZE; i++)
-      wg_max = max(input[i], wg_max);
+    for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+      if (IS_HALF) {
+        wg_max = (as_float(__half_to_float(input[i])) > as_float(__half_to_float(wg_max))) ? input[i] : wg_max;
+      }
+      else
+        wg_max = max(input[i], wg_max);
+    }
     for(uint32_t i = 0; i < SIMD_SIZE; i++)
       expected[i] = wg_max;
   else if(wg_func == WG_REDUCE_MIN)
     T wg_min = input[0];
-    for(uint32_t i = 1; i < SIMD_SIZE; i++)
-      wg_min = min(input[i], wg_min);
+    for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+      if (IS_HALF) {
+        wg_min= (as_float(__half_to_float(input[i])) < as_float(__half_to_float(wg_min))) ? input[i] : wg_min;
+      }
+      else
+        wg_min = min(input[i], wg_min);
+    }
     for(uint32_t i = 0; i < SIMD_SIZE; i++)
       expected[i] = wg_min;
@@ -85,7 +105,8 @@ template<class T>
 static void generate_data(WG_FUNCTION wg_func,
                    T* &input,
                    T* &expected,
-                   size_t SIMD_SIZE)
+                   size_t SIMD_SIZE,
+                   bool IS_HALF)
   input = new T[WG_GLOBAL_SIZE];
   expected = new T[WG_GLOBAL_SIZE];
@@ -115,6 +136,8 @@ static void generate_data(WG_FUNCTION wg_func,
         /* add trailing random bits, tests GENERAL cases */
         input[gid + lid] += (rand() % 112);
         /* always last bit is 1, ideal test ALL/ANY */
+        if (IS_HALF)
+          input[gid + lid] = __float_to_half(as_uint((float)input[gid + lid]/2));
       } else {
         input[gid + lid] += rand();
         input[gid + lid] += rand() / ((float)RAND_MAX + 1);
@@ -129,7 +152,7 @@ static void generate_data(WG_FUNCTION wg_func,
     /* expected values */
-    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE, IS_HALF);
     /* output expected input */
@@ -152,7 +175,8 @@ static void generate_data(WG_FUNCTION wg_func,
 template<class T>
 static void subgroup_generic(WG_FUNCTION wg_func,
                        T* input,
-                       T* expected)
+                       T* expected,
+                       bool IS_HALF = false)
   /* get simd size */
   globals[0] = WG_GLOBAL_SIZE;
@@ -161,7 +185,7 @@ static void subgroup_generic(WG_FUNCTION wg_func,
   /* input and expected data */
-  generate_data(wg_func, input, expected, SIMD_SIZE);
+  generate_data(wg_func, input, expected, SIMD_SIZE, IS_HALF);
   /* prepare input for data type */
   OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
@@ -185,8 +209,22 @@ static void subgroup_generic(WG_FUNCTION wg_func,
   for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
     if(((T *)buf_data[1])[i] != *(expected + i))
+      if (IS_HALF) {
+        float num_computed = as_float(__half_to_float(((T *)buf_data[1])[i]));
+        float num_expected = as_float(__half_to_float(*(expected + i)));
+        float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+        /* half results only count (and are reported) as wrong beyond 3% relative error */
+        if (num_diff > 0.03f) {
+          mismatches++;
+          /* output mismatch */
+          cout << "Err at " << i << ", " << num_computed
+               << " != " << num_expected << " diff: " <<num_diff <<endl;
+        }
+      }
       /* found mismatch on integer, increment */
-      if (numeric_limits<T>::is_integer) {
+      else if (numeric_limits<T>::is_integer) {
@@ -305,6 +343,42 @@ void compiler_subgroup_reduce_add_float(void)
   subgroup_generic(WG_REDUCE_ADD, input, expected);
+void compiler_subgroup_reduce_add_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_reduce.cl",
+                           "compiler_subgroup_reduce_add_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(WG_REDUCE_ADD, input, expected, true);
+void compiler_subgroup_reduce_add_short(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_short");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+void compiler_subgroup_reduce_add_ushort(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_ushort");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
  * Workgroup reduce max utest functions
@@ -364,6 +438,42 @@ void compiler_subgroup_reduce_max_float(void)
   subgroup_generic(WG_REDUCE_MAX, input, expected);
+void compiler_subgroup_reduce_max_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_reduce.cl",
+                           "compiler_subgroup_reduce_max_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(WG_REDUCE_MAX, input, expected, true);
+void compiler_subgroup_reduce_max_short(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_short");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+void compiler_subgroup_reduce_max_ushort(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_ushort");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
  * Workgroup reduce min utest functions
@@ -423,3 +533,39 @@ void compiler_subgroup_reduce_min_float(void)
   subgroup_generic(WG_REDUCE_MIN, input, expected);
+void compiler_subgroup_reduce_min_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_reduce.cl",
+                           "compiler_subgroup_reduce_min_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(WG_REDUCE_MIN, input, expected, true);
+void compiler_subgroup_reduce_min_short(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_short");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+void compiler_subgroup_reduce_min_ushort(void)
+  if(!cl_check_subgroups_short())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_ushort");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
diff --git a/utests/compiler_subgroup_scan_exclusive.cpp b/utests/compiler_subgroup_scan_exclusive.cpp
index 1a21b59..4f3e5ea 100644
--- a/utests/compiler_subgroup_scan_exclusive.cpp
+++ b/utests/compiler_subgroup_scan_exclusive.cpp
@@ -32,36 +32,56 @@ template<class T>
 static void compute_expected(WG_FUNCTION wg_func,
                     T* input,
                     T* expected,
-                    size_t SIMD_SIZE)
+                    size_t SIMD_SIZE,
+                    bool IS_HALF)
   if(wg_func == WG_SCAN_EXCLUSIVE_ADD)
     expected[0] = 0;
     expected[1] = input[0];
-    for(uint32_t i = 2; i < SIMD_SIZE; i++)
-      expected[i] = input[i - 1] + expected[i - 1];
+    for(uint32_t i = 2; i < SIMD_SIZE; i++) {
+      if (IS_HALF)
+        expected[i] = __float_to_half(as_uint(as_float(__half_to_float(input[i - 1])) +
+                                              as_float(__half_to_float(expected[i - 1]))));
+      else
+        expected[i] = input[i - 1] + expected[i - 1];
+    }
   else if(wg_func == WG_SCAN_EXCLUSIVE_MAX)
-    if(numeric_limits<T>::is_integer)
+    if(IS_HALF)
+      expected[0] = 0xFC00;
+    else if(numeric_limits<T>::is_integer)
       expected[0] = numeric_limits<T>::min();
       expected[0] = - numeric_limits<T>::infinity();
     expected[1] = input[0];
-    for(uint32_t i = 2; i < SIMD_SIZE; i++)
-      expected[i] = max(input[i - 1], expected[i - 1]);
+    for(uint32_t i = 2; i < SIMD_SIZE; i++) {
+      if (IS_HALF)
+        expected[i] = (as_float(__half_to_float(input[i - 1])) > as_float(__half_to_float(expected[i - 1]))) ?
+                      input[i - 1] : expected[i - 1];
+      else
+        expected[i] = max(input[i - 1], expected[i - 1]);
+    }
   else if(wg_func == WG_SCAN_EXCLUSIVE_MIN)
-    if(numeric_limits<T>::is_integer)
+    if(IS_HALF)
+      expected[0] = 0x7C00;
+    else if(numeric_limits<T>::is_integer)
       expected[0] = numeric_limits<T>::max();
       expected[0] = numeric_limits<T>::infinity();
     expected[1] = input[0];
-    for(uint32_t i = 2; i < SIMD_SIZE; i++)
-      expected[i] = min(input[i - 1], expected[i - 1]);
+    for(uint32_t i = 2; i < SIMD_SIZE; i++) {
+      if (IS_HALF)
+        expected[i] = (as_float(__half_to_float(input[i - 1])) < as_float(__half_to_float(expected[i - 1]))) ?
+                      input[i - 1] : expected[i - 1];
+      else
+        expected[i] = min(input[i - 1], expected[i - 1]);
+    }
@@ -73,7 +93,8 @@ template<class T>
 static void generate_data(WG_FUNCTION wg_func,
                    T* &input,
                    T* &expected,
-                   size_t SIMD_SIZE)
+                   size_t SIMD_SIZE,
+                   bool IS_HALF)
   input = new T[WG_GLOBAL_SIZE];
   expected = new T[WG_GLOBAL_SIZE];
@@ -101,6 +122,8 @@ static void generate_data(WG_FUNCTION wg_func,
       input[gid + lid] += ((rand() % 2 - 1) * base_val);
       /* add trailing random bits, tests GENERAL cases */
       input[gid + lid] += (rand() % 112);
+      if (IS_HALF)
+        input[gid + lid] = __float_to_half(as_uint((float)input[gid + lid]/2));
       /* output generated input */
@@ -111,7 +134,7 @@ static void generate_data(WG_FUNCTION wg_func,
     /* expected values */
-    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE, IS_HALF);
     /* output expected input */
@@ -134,7 +157,8 @@ static void generate_data(WG_FUNCTION wg_func,
 template<class T>
 static void subgroup_generic(WG_FUNCTION wg_func,
                        T* input,
-                       T* expected)
+                       T* expected,
+                       bool IS_HALF = false)
   /* get simd size */
   globals[0] = WG_GLOBAL_SIZE;
@@ -143,7 +167,7 @@ static void subgroup_generic(WG_FUNCTION wg_func,
   /* input and expected data */
-  generate_data(wg_func, input, expected, SIMD_SIZE);
+  generate_data(wg_func, input, expected, SIMD_SIZE, IS_HALF);
   /* prepare input for data type */
   OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
@@ -166,8 +190,21 @@ static void subgroup_generic(WG_FUNCTION wg_func,
   for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
     if(((T *)buf_data[1])[i] != *(expected + i))
+      if (IS_HALF) {
+        float num_computed = as_float(__half_to_float(((T *)buf_data[1])[i]));
+        float num_expected = as_float(__half_to_float(*(expected + i)));
+        float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+        if (num_diff > 0.03f) {
+          mismatches++;
+          /* output mismatch */
+          cout << "Err at " << i << ", " << num_computed
+               << " != " << num_expected <<" diff: " <<num_diff <<endl;
+        }
+      }
       /* found mismatch on integer, increment */
-      if(numeric_limits<T>::is_integer){
+      else if (numeric_limits<T>::is_integer) {
@@ -261,6 +298,42 @@ void compiler_subgroup_scan_exclusive_add_float(void)
   subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+void compiler_subgroup_scan_exclusive_add_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_exclusive.cl",
+                           "compiler_subgroup_scan_exclusive_add_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected, true);
+void compiler_subgroup_scan_exclusive_add_short(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_add_short");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+void compiler_subgroup_scan_exclusive_add_ushort(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_add_ushort");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
  * Workgroup scan_exclusive max utest functions
@@ -320,6 +393,42 @@ void compiler_subgroup_scan_exclusive_max_float(void)
   subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+void compiler_subgroup_scan_exclusive_max_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_exclusive.cl",
+                           "compiler_subgroup_scan_exclusive_max_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected, true);
+void compiler_subgroup_scan_exclusive_max_short(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_max_short");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+void compiler_subgroup_scan_exclusive_max_ushort(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_max_ushort");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
  * Workgroup scan_exclusive min utest functions
@@ -379,3 +488,39 @@ void compiler_subgroup_scan_exclusive_min_float(void)
   subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+void compiler_subgroup_scan_exclusive_min_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_exclusive.cl",
+                           "compiler_subgroup_scan_exclusive_min_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected, true);
+void compiler_subgroup_scan_exclusive_min_short(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_min_short");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+void compiler_subgroup_scan_exclusive_min_ushort(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_min_ushort");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
diff --git a/utests/compiler_subgroup_scan_inclusive.cpp b/utests/compiler_subgroup_scan_inclusive.cpp
index fa32855..8f8c264 100644
--- a/utests/compiler_subgroup_scan_inclusive.cpp
+++ b/utests/compiler_subgroup_scan_inclusive.cpp
@@ -32,25 +32,41 @@ template<class T>
 static void compute_expected(WG_FUNCTION wg_func,
                     T* input,
                     T* expected,
-                    size_t SIMD_SIZE)
+                    size_t SIMD_SIZE,
+                    bool IS_HALF)
   if(wg_func == WG_SCAN_INCLUSIVE_ADD)
     expected[0] = input[0];
-    for(uint32_t i = 1; i < SIMD_SIZE; i++)
-      expected[i] = input[i] + expected[i - 1];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+      if (IS_HALF)
+        expected[i] = __float_to_half(as_uint(as_float(__half_to_float(input[i])) +
+                                              as_float(__half_to_float(expected[i - 1]))));
+      else
+        expected[i] = input[i] + expected[i - 1];
+    }
   else if(wg_func == WG_SCAN_INCLUSIVE_MAX)
     expected[0] = input[0];
-    for(uint32_t i = 1; i < SIMD_SIZE; i++)
-      expected[i] = max(input[i], expected[i - 1]);
+    for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+      if (IS_HALF)
+        expected[i] = (as_float(__half_to_float(input[i])) > as_float(__half_to_float(expected[i - 1]))) ?
+                      input[i] : expected[i - 1];
+      else
+        expected[i] = max(input[i], expected[i - 1]);
+    }
   else if(wg_func == WG_SCAN_INCLUSIVE_MIN)
     expected[0] = input[0];
-    for(uint32_t i = 1; i < SIMD_SIZE; i++)
-      expected[i] = min(input[i], expected[i - 1]);
+    for(uint32_t i = 1; i < SIMD_SIZE; i++) {
+      if (IS_HALF)
+        expected[i] = (as_float(__half_to_float(input[i])) < as_float(__half_to_float(expected[i - 1]))) ?
+                      input[i] : expected[i - 1];
+      else
+        expected[i] = min(input[i], expected[i - 1]);
+    }
@@ -62,7 +78,8 @@ template<class T>
 static void generate_data(WG_FUNCTION wg_func,
                    T* &input,
                    T* &expected,
-                   size_t SIMD_SIZE)
+                   size_t SIMD_SIZE,
+                   bool IS_HALF)
   input = new T[WG_GLOBAL_SIZE];
   expected = new T[WG_GLOBAL_SIZE];
@@ -91,6 +108,8 @@ static void generate_data(WG_FUNCTION wg_func,
       input[gid + lid] += ((rand() % 2 - 1) * base_val);
       /* add trailing random bits, tests GENERAL cases */
       input[gid + lid] += (rand() % 112);
+      if (IS_HALF)
+        input[gid + lid] = __float_to_half(as_uint((float)input[gid + lid]/2));
       /* output generated input */
@@ -101,7 +120,7 @@ static void generate_data(WG_FUNCTION wg_func,
     /* expected values */
-    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE, IS_HALF);
     /* output expected input */
@@ -124,7 +143,8 @@ static void generate_data(WG_FUNCTION wg_func,
 template<class T>
 static void subgroup_generic(WG_FUNCTION wg_func,
                        T* input,
-                       T* expected)
+                       T* expected,
+                       bool IS_HALF = false)
   /* get simd size */
   globals[0] = WG_GLOBAL_SIZE;
@@ -133,7 +153,7 @@ static void subgroup_generic(WG_FUNCTION wg_func,
   /* input and expected data */
-  generate_data(wg_func, input, expected, SIMD_SIZE);
+  generate_data(wg_func, input, expected, SIMD_SIZE, IS_HALF);
   /* prepare input for data type */
   OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
@@ -156,8 +176,21 @@ static void subgroup_generic(WG_FUNCTION wg_func,
   for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
     if(((T *)buf_data[1])[i] != *(expected + i))
+      if (IS_HALF) {
+        float num_computed = as_float(__half_to_float(((T *)buf_data[1])[i]));
+        float num_expected = as_float(__half_to_float(*(expected + i)));
+        float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+        if (num_diff > 0.03f) {
+          mismatches++;
+          /* output mismatch */
+          cout << "Err at " << i << ", " << num_computed
+               << " != " << num_expected <<" diff: " <<num_diff <<endl;
+        }
+      }
       /* found mismatch on integer, increment */
-      if(numeric_limits<T>::is_integer){
+      else if (numeric_limits<T>::is_integer) {
@@ -251,6 +284,42 @@ void compiler_subgroup_scan_inclusive_add_float(void)
   subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+void compiler_subgroup_scan_inclusive_add_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_inclusive.cl",
+                           "compiler_subgroup_scan_inclusive_add_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected, true);
+void compiler_subgroup_scan_inclusive_add_short(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_add_short");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+void compiler_subgroup_scan_inclusive_add_ushort(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_add_ushort");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
  * Workgroup scan_inclusive max utest functions
@@ -310,6 +379,42 @@ void compiler_subgroup_scan_inclusive_max_float(void)
   subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+void compiler_subgroup_scan_inclusive_max_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_inclusive.cl",
+                           "compiler_subgroup_scan_inclusive_max_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected, true);
+void compiler_subgroup_scan_inclusive_max_short(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_max_short");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+void compiler_subgroup_scan_inclusive_max_ushort(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_max_ushort");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
  * Workgroup scan_inclusive min utest functions
@@ -369,4 +474,39 @@ void compiler_subgroup_scan_inclusive_min_float(void)
   subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+void compiler_subgroup_scan_inclusive_min_half(void)
+  if(!cl_check_subgroups())
+    return;
+  if(!cl_check_half())
+    return;
+  cl_half *input = NULL;
+  cl_half *expected = NULL;
+  OCL_CALL(cl_kernel_init, "compiler_subgroup_scan_inclusive.cl",
+                           "compiler_subgroup_scan_inclusive_min_half",
+                           SOURCE, "-DHALF");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected, true);
+void compiler_subgroup_scan_inclusive_min_short(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_min_short");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+void compiler_subgroup_scan_inclusive_min_ushort(void)
+  if(!cl_check_subgroups())
+    return;
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_min_ushort");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
diff --git a/utests/load_program_from_spir.cpp b/utests/load_program_from_spir.cpp
index bb53947..767a488 100644
--- a/utests/load_program_from_spir.cpp
+++ b/utests/load_program_from_spir.cpp
@@ -35,7 +35,10 @@ static void test_load_program_from_spir(void)
       fprintf(stderr, "run out of memory\n");
-    ker_path = cl_do_kiss_path("compiler_ceil32.spir", device);
+    if(sizeof(size_t) == 8)
+      ker_path = cl_do_kiss_path("compiler_ceil64.spir", device);
+    else
+      ker_path = cl_do_kiss_path("compiler_ceil32.spir", device);
     OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
     const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);
diff --git a/utests/multi_queue_events.cpp b/utests/multi_queue_events.cpp
new file mode 100644
index 0000000..4545167
--- /dev/null
+++ b/utests/multi_queue_events.cpp
@@ -0,0 +1,129 @@
+#include "utest_helper.hpp"
+#define THREAD_SIZE 8
+pthread_t tid[THREAD_SIZE];
+static cl_command_queue all_queues[THREAD_SIZE];
+static cl_event enqueue_events[THREAD_SIZE];
+static cl_event user_event;
+static cl_kernel the_kernel;
+static char source_str[] =
+  "kernel void assgin_work_dim( __global int *ret, int i) { \n"
+  "if (i == 0) ret[i] = 10; \n"
+  "else ret[i] = ret[i - 1] + 1; \n"
+  "}\n";
+static size_t the_globals[3] = {16, 1, 1};
+static size_t the_locals[3] = {16, 1, 1};
+static size_t the_goffsets[3] = {0, 0, 0};
+static void *thread_function(void *arg)
+  int num = *((int *)arg);
+  cl_int ret;
+  cl_event dep_event[2];
+  ret = clSetKernelArg(the_kernel, 1, sizeof(cl_int), &num);
+  if (num == 0) {
+    dep_event[0] = user_event;
+    ret = clEnqueueNDRangeKernel(all_queues[num], the_kernel, 1, the_goffsets, the_globals, the_locals,
+                                 1, dep_event, &enqueue_events[num]);
+  } else {
+    dep_event[0] = user_event;
+    dep_event[1] = enqueue_events[num - 1];
+    ret = clEnqueueNDRangeKernel(all_queues[num], the_kernel, 1, the_goffsets, the_globals, the_locals,
+                                 2, dep_event, &enqueue_events[num]);
+  }
+  return NULL;
+void multi_queue_events(void)
+  cl_int ret;
+  size_t source_size = sizeof(source_str);
+  const char *source = source_str;
+  cl_program program = NULL;
+  int i;
+  /* Create Kernel Program from the source */
+  program = clCreateProgramWithSource(ctx, 1, &source, &source_size, &ret);
+  /* Build Kernel Program */
+  ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
+  the_kernel = clCreateKernel(program, "assgin_work_dim", NULL);
+  OCL_ASSERT(the_kernel != NULL);
+  int buffer_content[16] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  };
+  cl_mem buf = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, 16 * 4, buffer_content, &ret);
+  OCL_ASSERT(buf != NULL);
+  ret = clSetKernelArg(the_kernel, 0, sizeof(cl_mem), &buf);
+  for (i = 0; i < THREAD_SIZE; i++) {
+    all_queues[i] = clCreateCommandQueue(ctx, device, 0, &ret);
+  }
+  user_event = clCreateUserEvent(ctx, &ret);
+  for (i = 0; i < THREAD_SIZE; i++) {
+    pthread_create(&tid[i], NULL, thread_function, &i);
+    pthread_join(tid[i], NULL);
+  }
+  cl_event map_event;
+  void *map_ptr = clEnqueueMapBuffer(all_queues[0], buf, 0, CL_MAP_READ, 0, 32,
+                                     THREAD_SIZE, enqueue_events, &map_event, NULL);
+  OCL_ASSERT(map_ptr != NULL);
+  cl_event all_event[10];
+  for (i = 0; i < THREAD_SIZE; i++) {
+    all_event[i] = enqueue_events[i];
+  }
+  all_event[8] = user_event;
+  all_event[9] = map_event;
+  //printf("before Waitfor events ##\n");
+  clSetUserEventStatus(user_event, CL_COMPLETE);
+  ret = clWaitForEvents(10, all_event);
+  //printf("After Waitfor events ##\n");
+  //printf("#############     Finish Setting   ################\n");
+  printf("\n");
+  for (i = 0; i < 8; i++) {
+    //printf(" %d", ((int *)map_ptr)[i]);
+    OCL_ASSERT(((int *)map_ptr)[i] == 10 + i);
+  }
+  //printf("\n");
+  ret = clEnqueueUnmapMemObject(all_queues[0], buf, map_ptr, 1, &map_event, NULL);
+  //printf("------------------------- End -------------------------------\n");
+  clReleaseKernel(the_kernel);
+  clReleaseProgram(program);
+  clReleaseMemObject(buf);
+  for (i = 0; i < THREAD_SIZE; i++) {
+    clReleaseCommandQueue(all_queues[i]);
+    clReleaseEvent(enqueue_events[i]);
+  }
+  clReleaseEvent(user_event);
+  clReleaseEvent(map_event);
diff --git a/utests/runtime_barrier_list.cpp b/utests/runtime_barrier_list.cpp
index 3b8d3c3..7e12b66 100644
--- a/utests/runtime_barrier_list.cpp
+++ b/utests/runtime_barrier_list.cpp
@@ -23,7 +23,7 @@ void runtime_barrier_list(void)
-  clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+  clEnqueueWriteBuffer(queue, buf[0], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(int), &value);
@@ -40,23 +40,22 @@ void runtime_barrier_list(void)
-  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_FALSE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
   clEnqueueBarrierWithWaitList(queue, 0, NULL, &ev[3]);
-  clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+  clEnqueueWriteBuffer(queue, buf[1], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
   clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
   clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
   for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
     clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
     OCL_ASSERT(status <= CL_COMPLETE);
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
index 00e02f1..5804f95 100644
--- a/utests/runtime_event.cpp
+++ b/utests/runtime_event.cpp
@@ -18,7 +18,7 @@ void runtime_event(void)
-  clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+  clEnqueueWriteBuffer(queue, buf[0], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(int), &value);
@@ -33,7 +33,7 @@ void runtime_event(void)
     OCL_ASSERT(status >= CL_SUBMITTED);
-  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_FALSE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
diff --git a/utests/runtime_marker_list.cpp b/utests/runtime_marker_list.cpp
index 751f4a0..b4e6edb 100644
--- a/utests/runtime_marker_list.cpp
+++ b/utests/runtime_marker_list.cpp
@@ -23,7 +23,7 @@ void runtime_marker_list(void)
-  clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+  clEnqueueWriteBuffer(queue, buf[0], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(int), &value);
@@ -40,23 +40,22 @@ void runtime_marker_list(void)
-  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_FALSE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
   clEnqueueMarkerWithWaitList(queue, 0, NULL, &ev[3]);
-  clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+  clEnqueueWriteBuffer(queue, buf[1], CL_FALSE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 1, &ev[3], &ev[4]);
   clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
   clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
   for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
     clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
     OCL_ASSERT(status <= CL_COMPLETE);
diff --git a/utests/runtime_pipe_query.cpp b/utests/runtime_pipe_query.cpp
new file mode 100644
index 0000000..3ce8258
--- /dev/null
+++ b/utests/runtime_pipe_query.cpp
@@ -0,0 +1,15 @@
+#include <string.h>
+#include "utest_helper.hpp"
+static void runtime_pipe_query(void) {
+  const size_t w = 16;
+  const size_t sz = 8;
+  cl_uint retnum, retsz;
+  /* Create a pipe, then query its max packets and packet size */
+  OCL_CALL2(clCreatePipe, buf[0], ctx, 0, sz, w, NULL);
+  OCL_CALL(clGetPipeInfo, buf[0], CL_PIPE_MAX_PACKETS, sizeof(retnum), &retnum, NULL);
+  OCL_CALL(clGetPipeInfo, buf[0], CL_PIPE_PACKET_SIZE, sizeof(retsz), &retsz, NULL);
+  /*Check result */
+  OCL_ASSERT(sz == retsz && w == retnum);
diff --git a/utests/setenv.sh.in b/utests/setenv.sh.in
index 67e3bf1..e1282df 100644
--- a/utests/setenv.sh.in
+++ b/utests/setenv.sh.in
@@ -2,7 +2,9 @@
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 4f50f3f..b57d2ad 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -57,7 +57,7 @@ __thread size_t locals[3] = {};
 float ULPSIZE_FAST_MATH = 10000.;
 __attribute__ ((visibility ("internal"))) clGetKernelSubGroupInfoKHR_cb* utestclGetKernelSubGroupInfoKHR = NULL;
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
 Display    *xDisplay;
 EGLDisplay  eglDisplay;
 EGLContext  eglContext = NULL;
@@ -171,6 +171,8 @@ cl_test_channel_order_string(cl_channel_order order)
 #undef DECL_ORDER
     default: return "Unsupported image channel order";
@@ -435,7 +437,7 @@ cl_ocl_init(void)
   cl_int status = CL_SUCCESS;
   cl_uint platform_n;
   size_t i;
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
   bool hasGLExt = false;
   cl_context_properties *props = NULL;
@@ -464,7 +466,7 @@ cl_ocl_init(void)
       GET_DEVICE_STR_INFO(opencl_c_version, OPENCL_C_VERSION);
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
       if (std::strstr(extensionsStr.c_str(), "cl_khr_gl_sharing")) {
         hasGLExt = true;
@@ -476,7 +478,7 @@ cl_ocl_init(void)
      goto error;
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
   if (hasGLExt) {
     int i = 0;
     props = new cl_context_properties[7];
@@ -556,7 +558,7 @@ cl_ocl_destroy(void)
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
   if (eglContext != NULL) {
     eglContext = NULL;
@@ -917,6 +919,26 @@ int cl_check_subgroups(void)
   return 1;
+int cl_check_subgroups_short(void)
+  if (!cl_check_subgroups())
+    return 0;
+  std::string extStr;
+  size_t param_value_size;
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, &param_value_size);
+  std::vector<char> param_value(param_value_size);
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+           param_value.empty() ? NULL : &param_value.front(), &param_value_size);
+  if (!param_value.empty())
+    extStr = std::string(&param_value.front(), param_value_size-1);
+  if (std::strstr(extStr.c_str(), "cl_intel_subgroups_short") == NULL) {
+    printf("No cl_intel_subgroups_short, Skip!");
+    return 0;
+  }
+  return 1;
 int cl_check_ocl20(void)
   size_t param_value_size;
@@ -970,110 +992,119 @@ int cl_check_half(void)
   return 1;
-uint32_t __half_to_float(uint16_t h, bool* isInf, bool* infSign)
+uint32_t __half_to_float(uint16_t h, bool *isInf, bool *infSign)
-  struct __FP32 {
-    uint32_t mantissa:23;
-    uint32_t exponent:8;
-    uint32_t sign:1;
-  };
-  struct __FP16 {
-    uint32_t mantissa:10;
-    uint32_t exponent:5;
-    uint32_t sign:1;
-  };
-  uint32_t f;
-  __FP32 o;
-  memset(&o, 0, sizeof(o));
-  __FP16 i;
-  memcpy(&i, &h, sizeof(uint16_t));
+  uint32_t out_val = 0;
+  uint16_t sign = (h & 0x8000) >> 15;
+  uint16_t exp = (h & 0x7c00) >> 10;
+  uint16_t fraction = h & 0x03ff;
   if (isInf)
     *isInf = false;
   if (infSign)
     *infSign = false;
-  if (i.exponent == 0 && i.mantissa == 0) // (Signed) zero
-    o.sign = i.sign;
-  else {
-    if (i.exponent == 0) { // Denormal (converts to normalized)
-      // Adjust mantissa so it's normalized (and keep
-      // track of exponent adjustment)
-      int e = -1;
-      uint m = i.mantissa;
-      do {
-        e++;
-        m <<= 1;
-      } while ((m & 0x400) == 0);
-      o.mantissa = (m & 0x3ff) << 13;
-      o.exponent = 127 - 15 - e;
-      o.sign = i.sign;
-    } else if (i.exponent == 0x1f) { // Inf/NaN
-      // NOTE: Both can be handled with same code path
-      // since we just pass through mantissa bits.
-      o.mantissa = i.mantissa << 13;
-      o.exponent = 255;
-      o.sign = i.sign;
-      if (isInf) {
-        *isInf = (i.mantissa == 0);
-        if (infSign)
-          *infSign = !i.sign;
-      }
-    } else { // Normalized number
-      o.mantissa = i.mantissa << 13;
-      o.exponent = 127 - 15 + i.exponent;
-      o.sign = i.sign;
+  if (exp == 0 && fraction == 0) { // (Signed) zero
+    return (sign << 31);
+  }
+  if (exp == 0) { // subnormal mode
+    assert(fraction > 0);
+    exp = -1;
+    do {
+      fraction = fraction << 1;
+      exp++;
+    } while ((fraction & 0x400) == 0);
+    exp = 127 - exp - 15;
+    out_val = (sign << 31) | ((exp & 0xff) << 23) | ((fraction & 0x3ff) << 13);
+    return out_val;
+  }
+  if (exp == 0x1f) {     // inf or NAN
+    if (fraction == 0) { // inf
+      out_val = (sign << 31) | (255 << 23);
+      if (isInf)
+        *isInf = true;
+      if (infSign)
+        *infSign = (sign == 0) ? 1 : 0;
+      return out_val;
+    } else { // NAN mode
+      out_val = (sign << 31) | (255 << 23) | 0x7fffff;
+      return out_val;
-  memcpy(&f, &o, sizeof(uint32_t));
-  return f;
+  // Easy case, just convert.
+  exp = 127 - 15 + exp;
+  out_val = (sign << 31) | ((exp & 0xff) << 23) | ((fraction & 0x3ff) << 13);
+  return out_val;
 uint16_t __float_to_half(uint32_t x)
-  uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
-  uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
-  unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
-  /* If zero, or denormal, or exponent underflows too much for a denormal
-   * half, return signed zero. */
-  if (e < 103)
-    return bits;
-  /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
-  if (e > 142) {
-    bits |= 0x7c00u;
-    /* If exponent was 0xff and one mantissa bit was set, it means NaN,
-     * not Inf, so make sure we set one mantissa bit too. */
-    bits |= e == 255 && (x & 0x007fffffu);
-    return bits;
+  uint16_t sign = (x & 0x80000000) >> 31;
+  uint16_t exp = (x & 0x7F800000) >> 23;
+  uint32_t fraction = (x & 0x7fffff);
+  uint16_t out_val = 0;
+  /* Handle the float NAN format. */
+  if (exp == 0xFF && fraction != 0) {
+    /* return a NAN half. */
+    out_val = (sign << 15) | (0x7C00) | (fraction & 0x3ff);
+    return out_val;
-  /* If exponent underflows but not too much, return a denormal */
-  if (e < 113) {
-    m |= 0x0800u;
-    /* Extra rounding may overflow and set mantissa to 0 and exponent
-     * to 1, which is OK. */
-    bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
-    return bits;
+  /* Float exp is from -126~127, half exp is from -14~15 */
+  if (exp - 127 > 15) { // Should overflow.
+    /* return +- inf. */
+    out_val = (sign << 15) | (0x7C00);
+    return out_val;
-  bits |= ((e - 112) << 10) | (m >> 1);
-  /* Extra rounding. An overflow will set mantissa to 0 and increment
-   * the exponent, which is OK. */
-  bits += m & 1;
-  return bits;
+  /* A half has a 10-bit fraction, so the value may still be converted to
+     the subnormal (-1)^sign X 2^(-14) X 0.fraction form. But if
+     exp - 127 < -14 - 10, the value must underflow. */
+  if (exp < -14 + 127 - 10) { // Should underflow.
+    /* Return zero without subnormal numbers. */
+    out_val = (sign << 15);
+    return out_val;
+  }
+  if (exp < -14 + 127) { //May underflow, but may use subnormal numbers
+    int shift = -(exp - 127 + 14);
+    assert(shift > 0);
+    assert(shift <= 10);
+    fraction = fraction | 0x0800000; // in 1.significantbits2, add the 1
+    fraction = fraction >> shift;
+    // To half fraction
+    fraction = (fraction & 0x7ff000) >> 12;
+    out_val = (sign << 15) | ((fraction >> 1) & 0x3ff);
+    if (fraction & 0x01)
+      out_val++;
+    return out_val;
+  }
+  /* Easy case, just convert. */
+  fraction = (fraction & 0x7ff000) >> 12;
+  exp = exp - 127 + 15;
+  assert(exp > 0);
+  assert(exp < 0x01f);
+  out_val = (sign << 15) | (exp << 10) | ((fraction >> 1) & 0x3ff);
+  if (fraction & 0x01)
+    out_val++;
+  return out_val;
-uint32_t as_uint(float f) {
+uint32_t as_uint(float f)
   union uint32_cast _tmp;
   _tmp._float = f;
   return _tmp._uint;
-float as_float(uint32_t i) {
+float as_float(uint32_t i)
   union uint32_cast _tmp;
   _tmp._uint = i;
   return _tmp._float;
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index a6e8180..a761325 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -39,13 +39,15 @@
 #define __thread
-#ifdef HAS_EGL
+#ifdef HAS_GL_EGL_X11
 #define EGL_WINDOW_WIDTH 256
 #define EGL_WINDOW_HEIGHT 256
 #include  <GL/gl.h>
+#include  <GL/glext.h>
 #include  <EGL/egl.h>
 #include  <EGL/eglext.h>
-#include <CL/cl_gl.h>
+#include  <CL/cl_gl.h>
 extern EGLDisplay  eglDisplay;
 extern EGLContext  eglContext;
@@ -126,6 +128,9 @@ union uint32_cast {
     OCL_CALL(clEnqueueAcquireGLObjects, queue, 1, &buf[ID], 0, 0, 0)
+    OCL_CALL(clEnqueueReleaseGLObjects, queue, 1, &buf[ID], 0, 0, 0)
   eglSwapBuffers(eglDisplay, eglSurface);
@@ -316,4 +321,6 @@ extern uint32_t __half_to_float(uint16_t h, bool* isInf = NULL, bool* infSign =
 extern uint16_t __float_to_half(uint32_t x);
 extern float as_float(uint32_t i);
 extern uint32_t as_uint(float f);
+/* Check whether the cl_intel_subgroups_short extension is enabled. */
+extern int cl_check_subgroups_short(void);
 #endif /* __UTEST_HELPER_HPP__ */

Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git

More information about the Pkg-opencl-commits mailing list