[beignet] 01/07: Imported Upstream version 1.2.0
Rebecca Palmer
rnpalmer-guest at moszumanska.debian.org
Tue Sep 6 21:32:54 UTC 2016
This is an automated email from the git hooks/post-receive script.
rnpalmer-guest pushed a commit to branch master
in repository beignet.
commit 8435b1d124b0f95d8d17f12138399cc8109c9aa4
Author: Rebecca N. Palmer <rebecca_palmer at zoho.com>
Date: Sun Sep 4 17:51:19 2016 +0100
Imported Upstream version 1.2.0
---
Android.common.mk | 30 +
Android.mk | 14 +
CMake/FindLLVM.cmake | 6 +-
CMakeLists.txt | 67 +-
GetGenID.sh | 18 +-
backend/src/Android.mk | 288 +++
backend/src/CMakeLists.txt | 11 +-
backend/src/backend/context.cpp | 120 +-
backend/src/backend/context.hpp | 25 +-
backend/src/backend/gen/gen_mesa_disasm.c | 219 +-
backend/src/backend/gen75_context.cpp | 42 +-
backend/src/backend/gen75_encoder.cpp | 66 -
backend/src/backend/gen75_encoder.hpp | 6 -
backend/src/backend/gen7_encoder.cpp | 62 +-
backend/src/backend/gen7_encoder.hpp | 8 +-
backend/src/backend/gen7_instruction.hpp | 31 +
backend/src/backend/gen8_context.cpp | 864 ++++++-
backend/src/backend/gen8_context.hpp | 8 +
backend/src/backend/gen8_encoder.cpp | 277 ++-
backend/src/backend/gen8_encoder.hpp | 16 +-
backend/src/backend/gen8_instruction.hpp | 102 +
backend/src/backend/gen9_context.cpp | 5 +
backend/src/backend/gen9_context.hpp | 15 +-
backend/src/backend/gen_context.cpp | 1939 +++++++++++++--
backend/src/backend/gen_context.hpp | 29 +-
backend/src/backend/gen_defs.hpp | 70 +
backend/src/backend/gen_encoder.cpp | 289 ++-
backend/src/backend/gen_encoder.hpp | 37 +-
backend/src/backend/gen_insn_compact.cpp | 427 +++-
.../src/backend/gen_insn_gen7_schedule_info.hxx | 11 +
backend/src/backend/gen_insn_scheduling.cpp | 156 +-
backend/src/backend/gen_insn_selection.cpp | 2570 +++++++++++++++-----
backend/src/backend/gen_insn_selection.hpp | 63 +-
backend/src/backend/gen_insn_selection.hxx | 14 +-
.../src/backend/gen_insn_selection_optimize.cpp | 288 +++
backend/src/backend/gen_insn_selection_output.cpp | 172 ++
backend/src/backend/gen_insn_selection_output.hpp | 13 +
backend/src/backend/gen_program.cpp | 234 +-
backend/src/backend/gen_program.hpp | 10 +-
backend/src/backend/gen_reg_allocation.cpp | 609 +++--
backend/src/backend/gen_reg_allocation.hpp | 2 +
backend/src/backend/gen_register.hpp | 122 +-
backend/src/backend/program.cpp | 242 +-
backend/src/backend/program.h | 49 +-
backend/src/backend/program.hpp | 51 +-
backend/src/gbe_bin_generater.cpp | 10 +-
backend/src/gbe_bin_interpreter.cpp | 6 +-
backend/src/ir/constant.cpp | 51 +-
backend/src/ir/constant.hpp | 4 +-
backend/src/ir/context.cpp | 8 +-
backend/src/ir/context.hpp | 30 +-
backend/src/ir/function.cpp | 41 +-
backend/src/ir/function.hpp | 98 +-
backend/src/ir/image.cpp | 23 +-
backend/src/ir/image.hpp | 4 +-
backend/src/ir/immediate.cpp | 1 +
backend/src/ir/immediate.hpp | 2 +-
backend/src/ir/instruction.cpp | 1090 +++++++--
backend/src/ir/instruction.hpp | 225 +-
backend/src/ir/instruction.hxx | 10 +
backend/src/ir/liveness.cpp | 149 +-
backend/src/ir/liveness.hpp | 9 +-
backend/src/ir/lowering.cpp | 22 +-
backend/src/ir/printf.cpp | 290 +--
backend/src/ir/printf.hpp | 104 +-
backend/src/ir/profile.cpp | 72 +-
backend/src/ir/profile.hpp | 17 +-
backend/src/ir/profiling.cpp | 74 +
backend/src/ir/profiling.hpp | 132 +
backend/src/ir/register.cpp | 8 +
backend/src/ir/register.hpp | 79 +-
backend/src/ir/sampler.cpp | 16 +-
backend/src/ir/sampler.hpp | 4 +-
backend/src/ir/structurizer.cpp | 9 +-
backend/src/ir/type.cpp | 6 +-
backend/src/ir/unit.cpp | 7 +-
backend/src/ir/unit.hpp | 15 +
backend/src/ir/value.cpp | 197 ++
backend/src/ir/value.hpp | 16 +
backend/src/libocl/Android.mk | 89 +
backend/src/libocl/CMakeLists.txt | 2 +-
backend/src/libocl/include/ocl.h | 2 +
backend/src/libocl/include/ocl_float.h | 1 +
backend/src/libocl/include/ocl_geometric.h | 4 +
backend/src/libocl/include/ocl_misc.h | 15 +
backend/src/libocl/include/ocl_sync.h | 7 +-
backend/src/libocl/include/ocl_vload.h | 14 +-
backend/src/libocl/include/ocl_work_group.h | 118 +
backend/src/libocl/include/ocl_workitem.h | 3 +
backend/src/libocl/script/gen_vector.py | 4 +-
backend/src/libocl/script/ocl_math.def | 28 +-
backend/src/libocl/script/ocl_simd.def | 9 +
backend/src/libocl/src/ocl_barrier.ll | 6 +
backend/src/libocl/src/ocl_geometric.cl | 12 +
backend/src/libocl/src/ocl_memset.cl | 2 +-
backend/src/libocl/src/ocl_misc.cl | 24 +-
backend/src/libocl/src/ocl_sync.cl | 7 +-
backend/src/libocl/src/ocl_vload.cl | 22 +-
backend/src/libocl/src/ocl_work_group.cl | 126 +
backend/src/libocl/src/ocl_workitem.cl | 30 +
backend/src/libocl/tmpl/ocl_common.tmpl.cl | 3 +-
backend/src/libocl/tmpl/ocl_defines.tmpl.h | 1 +
backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 4 +-
backend/src/libocl/tmpl/ocl_math.tmpl.cl | 594 ++---
backend/src/libocl/tmpl/ocl_math.tmpl.h | 29 +-
backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 243 ++
backend/src/libocl/tmpl/ocl_simd.tmpl.h | 143 ++
backend/src/llvm/ExpandConstantExpr.cpp | 4 +-
backend/src/llvm/ExpandLargeIntegers.cpp | 6 +-
backend/src/llvm/ExpandUtils.cpp | 2 +-
backend/src/llvm/PromoteIntegers.cpp | 9 +-
backend/src/llvm/StripAttributes.cpp | 2 +-
backend/src/llvm/llvm_bitcode_link.cpp | 61 +-
backend/src/llvm/llvm_gen_backend.cpp | 1374 ++++++++---
backend/src/llvm/llvm_gen_backend.hpp | 8 +-
backend/src/llvm/llvm_gen_ocl_function.hxx | 72 +-
backend/src/llvm/llvm_includes.hpp | 5 +
backend/src/llvm/llvm_intrinsic_lowering.cpp | 6 +-
backend/src/llvm/llvm_loadstore_optimization.cpp | 115 +-
backend/src/llvm/llvm_passes.cpp | 7 +-
backend/src/llvm/llvm_printf_parser.cpp | 451 +---
backend/src/llvm/llvm_profiling.cpp | 214 ++
backend/src/llvm/llvm_sampler_fix.cpp | 2 +
backend/src/llvm/llvm_scalarize.cpp | 121 +-
backend/src/llvm/llvm_to_gen.cpp | 100 +-
backend/src/llvm/llvm_to_gen.hpp | 3 +-
backend/src/llvm/llvm_unroll.cpp | 33 +-
backend/src/ocl_common_defines.h | 33 +-
backend/src/sys/alloc.hpp | 4 +-
backend/src/sys/platform.hpp | 8 +-
benchmark/CMakeLists.txt | 7 +-
benchmark/benchmark_copy_buf.cpp | 4 +-
benchmark/benchmark_copy_buffer.cpp | 57 +
benchmark/benchmark_copy_buffer_to_image.cpp | 2 +-
benchmark/benchmark_copy_image.cpp | 72 +
benchmark/benchmark_copy_image_to_buffer.cpp | 2 +-
benchmark/benchmark_math.cpp | 126 +
benchmark/benchmark_read_buffer.cpp | 3 +-
benchmark/benchmark_read_image.cpp | 3 +-
benchmark/benchmark_run.cpp | 1 +
benchmark/benchmark_use_host_ptr_buffer.cpp | 3 +-
benchmark/benchmark_workgroup.cpp | 370 +++
docs/Beignet.mdwn | 103 +-
docs/NEWS.mdwn | 3 +
docs/howto/stand-alone-utest-howto.mdwn | 45 +
docs/howto/video-motion-estimation-howto.mdwn | 71 +
include/CL/cl_ext.h | 103 +
include/CL/cl_intel.h | 64 +
kernels/bench_copy_buffer.cl | 90 +
kernels/bench_copy_image.cl | 52 +
kernels/bench_math.cl | 272 +++
kernels/bench_workgroup.cl | 239 ++
kernels/builtin_global_linear_id.cl | 4 +
kernels/builtin_local_linear_id.cl | 6 +
kernels/builtin_max_sub_group_size.cl | 7 +
kernels/builtin_num_sub_groups.cl | 7 +
kernels/builtin_sub_group_id.cl | 7 +
kernels/builtin_sub_group_size.cl | 7 +
kernels/cmrt_utest_genx.isa | Bin 0 -> 847 bytes
kernels/compiler_bsort.cl | 47 +
kernels/compiler_bswap.cl | 14 +-
kernels/compiler_double_2.cl | 9 -
kernels/compiler_double_4.cl | 5 -
kernels/compiler_double_convert.cl | 102 +
kernels/compiler_double_div.cl | 13 +
kernels/compiler_get_max_sub_group_size.cl | 5 +
kernels/compiler_get_sub_group_id.cl | 8 -
kernels/compiler_get_sub_group_local_id.cl | 8 +
kernels/compiler_get_sub_group_size.cl | 5 -
kernels/compiler_half_convert.cl | 11 +-
kernels/compiler_math_3op.cl | 20 +-
kernels/compiler_mix.cl | 4 +
kernels/compiler_sub_group_all.cl | 12 -
kernels/compiler_sub_group_any.cl | 15 -
kernels/compiler_sub_group_shuffle.cl | 6 +-
kernels/compiler_sub_group_shuffle_down.cl | 19 +
kernels/compiler_sub_group_shuffle_up.cl | 19 +
kernels/compiler_sub_group_shuffle_xor.cl | 19 +
kernels/compiler_subgroup_broadcast.cl | 34 +
kernels/compiler_subgroup_buffer_block_read.cl | 31 +
kernels/compiler_subgroup_buffer_block_write.cl | 27 +
kernels/compiler_subgroup_image_block_read.cl | 31 +
kernels/compiler_subgroup_image_block_write.cl | 27 +
kernels/compiler_subgroup_reduce.cl | 136 ++
kernels/compiler_subgroup_scan_exclusive.cl | 98 +
kernels/compiler_subgroup_scan_inclusive.cl | 98 +
kernels/compiler_vector_load_store.cl | 8 +-
kernels/compiler_workgroup_broadcast.cl | 122 +
kernels/compiler_workgroup_reduce.cl | 137 ++
kernels/compiler_workgroup_scan_exclusive.cl | 98 +
kernels/compiler_workgroup_scan_inclusive.cl | 98 +
kernels/image_1D_buffer.cl | 2 +-
kernels/image_from_buffer.cl | 12 +
kernels/runtime_use_host_ptr_image.cl | 10 +
kernels/test_fill_image_2d_array.cl | 2 +-
kernels/test_get_arg_info.cl | 2 +-
kernels/test_printf.cl | 46 +
src/Android.mk | 124 +
src/CMakeLists.txt | 62 +-
src/OCLConfig.h.in | 1 -
src/cl_accelerator_intel.c | 86 +
src/cl_accelerator_intel.h | 29 +
src/cl_api.c | 241 +-
src/cl_cmrt.cpp | 311 +++
src/cl_cmrt.h | 45 +
src/cl_command_queue.c | 63 +-
src/cl_command_queue.h | 2 +
src/cl_command_queue_gen7.c | 154 +-
src/cl_context.c | 1 +
src/cl_context.h | 3 +
src/cl_device_data.h | 71 +-
src/cl_device_id.c | 405 ++-
src/cl_device_id.h | 19 +-
src/cl_driver.h | 55 +-
src/cl_driver_defs.c | 9 +
src/cl_enqueue.c | 15 +-
src/cl_event.c | 30 +-
src/cl_extensions.c | 52 +-
src/cl_extensions.h | 17 +-
src/cl_gbe_loader.cpp | 34 +-
src/cl_gbe_loader.h | 5 +-
src/cl_gen75_device.h | 5 +-
src/cl_gen7_device.h | 7 +-
src/{cl_gen75_device.h => cl_gen8_device.h} | 8 +-
src/{cl_gen75_device.h => cl_gen9_device.h} | 7 +-
src/cl_gt_device.h | 12 +-
src/cl_internals.h | 1 +
src/cl_kernel.c | 80 +-
src/cl_kernel.h | 8 +-
src/cl_mem.c | 332 ++-
src/cl_mem.h | 18 +
src/cl_mem_gl.c | 2 +-
src/cl_program.c | 181 +-
src/cl_program.h | 17 +-
src/cl_thread.c | 91 +-
src/cl_utils.h | 43 +
src/intel/intel_driver.c | 91 +-
src/intel/intel_gpgpu.c | 408 +++-
src/intel/intel_gpgpu.h | 7 +-
src/intel/intel_structs.h | 120 +
.../cl_internal_block_motion_estimate_intel.cl | 369 +++
src/performance.c | 3 +
src/x11/dricommon.c | 6 +-
utests/Android.mk | 248 ++
utests/CMakeLists.txt | 121 +-
utests/buildin_work_dim.cpp | 13 +-
utests/builtin_acos_asin.cpp | 20 +-
utests/builtin_exp.cpp | 22 +-
utests/builtin_global_id.cpp | 16 +-
..._global_id.cpp => builtin_global_linear_id.cpp} | 36 +-
utests/builtin_global_size.cpp | 9 +-
.../builtin_kernel_block_motion_estimate_intel.cpp | 135 +
utests/builtin_kernel_max_global_size.cpp | 8 +-
utests/builtin_lgamma.cpp | 2 +-
utests/builtin_lgamma_r.cpp | 2 +-
utests/builtin_local_id.cpp | 16 +-
...in_local_id.cpp => builtin_local_linear_id.cpp} | 32 +-
utests/builtin_local_size.cpp | 10 +-
utests/builtin_max_sub_group_size.cpp | 62 +
utests/builtin_num_groups.cpp | 10 +-
utests/builtin_num_sub_groups.cpp | 62 +
utests/builtin_pow.cpp | 19 +-
utests/builtin_sub_group_id.cpp | 63 +
utests/builtin_sub_group_size.cpp | 63 +
utests/builtin_tgamma.cpp | 6 +-
utests/compare_image_2d_and_1d_array.cpp | 2 +
utests/compiler_abs.cpp | 18 +-
utests/compiler_abs_diff.cpp | 17 +-
utests/compiler_array1.cpp | 2 +-
utests/compiler_assignment_operation_in_if.cpp | 2 +-
utests/compiler_box_blur_float.cpp | 3 +
utests/compiler_bsort.cpp | 45 +
utests/compiler_bswap.cpp | 63 +-
utests/compiler_cl_finish.cpp | 8 +-
utests/compiler_clz.cpp | 16 +-
utests/compiler_copy_image.cpp | 8 +-
utests/compiler_copy_image1.cpp | 24 +-
utests/compiler_copy_image_1d.cpp | 8 +-
utests/compiler_double.cpp | 5 +-
utests/compiler_double_2.cpp | 47 -
utests/compiler_double_4.cpp | 40 -
utests/compiler_double_convert.cpp | 621 +++++
utests/compiler_double_div.cpp | 83 +
utests/compiler_double_precision.cpp | 3 +
utests/compiler_fill_image.cpp | 4 +-
utests/compiler_fill_image_1d_array.cpp | 1 +
utests/compiler_fill_image_2d_array.cpp | 18 +-
utests/compiler_fill_image_3d.cpp | 4 +-
utests/compiler_function_qualifiers.cpp | 1 +
...ize.cpp => compiler_get_max_sub_group_size.cpp} | 10 +-
..._id.cpp => compiler_get_sub_group_local_id.cpp} | 8 +-
utests/compiler_half.cpp | 298 ++-
utests/compiler_mad24.cpp | 2 +-
utests/compiler_math.cpp | 8 +-
utests/compiler_math_2op.cpp | 8 +-
utests/compiler_math_3op.cpp | 76 +-
utests/compiler_mix.cpp | 50 +
utests/compiler_movforphi_undef.cpp | 8 +-
utests/compiler_mul24.cpp | 2 +-
utests/compiler_popcount.cpp | 2 +-
utests/compiler_sub_group_all.cpp | 43 -
utests/compiler_sub_group_any.cpp | 43 -
utests/compiler_sub_group_shuffle.cpp | 2 +
...fle.cpp => compiler_sub_group_shuffle_down.cpp} | 19 +-
...uffle.cpp => compiler_sub_group_shuffle_up.cpp} | 19 +-
...ffle.cpp => compiler_sub_group_shuffle_xor.cpp} | 15 +-
utests/compiler_subgroup_broadcast.cpp | 187 ++
utests/compiler_subgroup_buffer_block_read.cpp | 202 ++
utests/compiler_subgroup_buffer_block_write.cpp | 202 ++
utests/compiler_subgroup_image_block_read.cpp | 197 ++
utests/compiler_subgroup_image_block_write.cpp | 201 ++
utests/compiler_subgroup_reduce.cpp | 425 ++++
utests/compiler_subgroup_scan_exclusive.cpp | 381 +++
utests/compiler_subgroup_scan_inclusive.cpp | 372 +++
utests/compiler_time_stamp.cpp | 5 +
utests/compiler_unstructured_branch3.cpp | 4 +
utests/compiler_vector_load_store.cpp | 32 +-
utests/compiler_workgroup_broadcast.cpp | 320 +++
utests/compiler_workgroup_reduce.cpp | 417 ++++
utests/compiler_workgroup_scan_exclusive.cpp | 373 +++
utests/compiler_workgroup_scan_inclusive.cpp | 364 +++
utests/enqueue_copy_buf_unaligned.cpp | 2 +-
utests/get_cl_info.cpp | 247 +-
utests/image_1D_buffer.cpp | 3 +-
utests/image_from_buffer.cpp | 109 +
utests/load_program_from_bin_file.cpp | 4 +
utests/load_program_from_gen_bin.cpp | 8 +
utests/load_program_from_spir.cpp | 6 +
utests/profiling_exec.cpp | 8 +-
utests/runtime_alloc_host_ptr_buffer.cpp | 6 +-
utests/runtime_barrier_list.cpp | 1 -
utests/runtime_climage_from_boname.cpp | 14 +-
utests/runtime_cmrt.cpp | 274 +++
utests/runtime_compile_link.cpp | 25 +-
utests/runtime_event.cpp | 1 -
utests/runtime_flat_address_space.cpp | 8 +-
utests/runtime_marker_list.cpp | 1 -
utests/runtime_use_host_ptr_image.cpp | 76 +
utests/sub_buffer.cpp | 2 +-
utests/test_printf.cpp | 54 +
utests/utest.cpp | 92 +-
utests/utest.hpp | 22 +-
utests/utest_generator.py | 57 +-
utests/utest_helper.cpp | 388 ++-
utests/utest_helper.hpp | 97 +-
utests/utest_math_gen.py | 30 +-
utests/utest_run.cpp | 36 +-
utests/vload_bench.cpp | 2 +-
348 files changed, 26538 insertions(+), 4796 deletions(-)
diff --git a/Android.common.mk b/Android.common.mk
new file mode 100644
index 0000000..dcb3c7c
--- /dev/null
+++ b/Android.common.mk
@@ -0,0 +1,30 @@
+#LOCAL_PATH:= $(call my-dir)
+
+#include $(CLEAR_VARS)
+TOP_C_INCLUDE := bionic $(BEIGNET_ROOT_PATH)/include
+TOP_CPPFLAGS := -Wall -Wno-invalid-offsetof -mfpmath=sse -fno-rtti -Wcast-align -std=c++0x -msse2 -msse3 -mssse3 -msse4.1 -D__ANDROID__
+TOP_CFLAGS := -Wall -mfpmath=sse -msse2 -Wcast-align -msse2 -msse3 -mssse3 -msse4.1 -D__ANDROID__
+
+LLVM_INCLUDE_DIRS := external/llvm/device/include\
+ external/llvm/include \
+ external/clang/include \
+
+LLVM_CFLAGS := -DNDEBUG -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS
+LLVM_LFLAGS := -ldl -lm
+
+LLVM_FOUND := true
+
+DRM_INCLUDE_PATH := external/drm/intel external/drm/include/drm external/drm
+DRM_LIBRARY := libdrm
+DRM_FOUND := true
+
+THREAD_LIBS_INIT := libpthread
+
+DRM_INTEL_LIBRARY := libdrm_intel
+DRM_INTEL_FOUND := true
+
+GBE_LIBRARY := libgbe
+GBE_FOUND := false
+
+OCLIcd_FOUND := false
+
diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..095f313
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,14 @@
+LOCAL_PATH := $(call my-dir)
+include $(CLEAR_VARS)
+
+BEIGNET_ROOT_PATH := $(LOCAL_PATH)
+
+#subdirs := backend/src/libocl
+
+
+subdirs := backend/src/libocl \
+ backend/src \
+ src \
+ utests \
+
+include $(addprefix $(LOCAL_PATH)/,$(addsuffix /Android.mk, $(subdirs)))
diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index a148321..6129909 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -8,12 +8,12 @@
# LLVM_FOUND - True if llvm found.
if (LLVM_INSTALL_DIR)
find_program(LLVM_CONFIG_EXECUTABLE
- NAMES llvm-config-35 llvm-config-3.5 llvm-config-36 llvm-config-3.6 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
+ NAMES llvm-config-37 llvm-config-3.7 llvm-config-36 llvm-config-3.6 llvm-config-38 llvm-config-3.8 llvm-config llvm-config-35 llvm-config-3.5 llvm-config-34 llvm-config-3.4
DOC "llvm-config executable"
PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
else (LLVM_INSTALL_DIR)
find_program(LLVM_CONFIG_EXECUTABLE
- NAMES llvm-config-35 llvm-config-3.5 llvm-config-36 llvm-config-3.6 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
+ NAMES llvm-config-37 llvm-config-3.7 llvm-config-36 llvm-config-3.6 llvm-config-38 llvm-config-3.8 llvm-config llvm-config-35 llvm-config-3.5 llvm-config-34 llvm-config-3.4
DOC "llvm-config executable")
endif (LLVM_INSTALL_DIR)
@@ -99,7 +99,9 @@ execute_process(
OUTPUT_VARIABLE LLVM_SYSTEM_LIBS_ORIG
OUTPUT_STRIP_TRAILING_WHITESPACE
)
+if (LLVM_SYSTEM_LIBS_ORIG)
string(REGEX REPLACE " *\n" "" LLVM_SYSTEM_LIBS ${LLVM_SYSTEM_LIBS_ORIG})
+endif (LLVM_SYSTEM_LIBS_ORIG)
endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
macro(add_one_lib name)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 88985d7..2bc2100 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,8 +17,8 @@ endif ()
CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
PROJECT(OCL)
set (LIBCL_DRIVER_VERSION_MAJOR 1)
-set (LIBCL_DRIVER_VERSION_MINOR 1)
-set (LIBCL_DRIVER_VERSION_PATCH 2)
+set (LIBCL_DRIVER_VERSION_MINOR 2)
+set (LIBCL_DRIVER_VERSION_PATCH 0)
set (LIBCL_C_VERSION_MAJOR 1)
set (LIBCL_C_VERSION_MINOR 2)
if( ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
@@ -33,7 +33,11 @@ configure_file (
"src/OCLConfig.h"
)
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+set (NOT_BUILD_STAND_ALONE_UTEST 1)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
INCLUDE (FindPkgConfig)
@@ -85,8 +89,8 @@ elseif (COMPILER STREQUAL "CLANG")
elseif (COMPILER STREQUAL "ICC")
set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -wd2928 -Wall -fPIC -fstrict-aliasing -fp-model fast -msse4.1 -Wl,-E")
endif ()
-set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -std=c++0x -Wno-invalid-offsetof")
-set (CMAKE_C_FLAGS "${CMAKE_C_CXX_FLAGS}")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_C_CXX_FLAGS} -std=c++0x -Wno-invalid-offsetof")
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_CXX_FLAGS}")
set (CMAKE_CXX_FLAGS_DEBUG "-O0 -g -DGBE_DEBUG=1")
set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
set (CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
@@ -126,31 +130,52 @@ ELSE(DRM_FOUND)
MESSAGE(STATUS "Looking for DRM - not found")
ENDIF(DRM_FOUND)
+include(CheckLibraryExists)
# DRM Intel
pkg_check_modules(DRM_INTEL libdrm_intel>=2.4.52)
IF(DRM_INTEL_FOUND)
INCLUDE_DIRECTORIES(${DRM_INTEL_INCLUDE_DIRS})
MESSAGE(STATUS "Looking for DRM Intel - found at ${DRM_INTEL_PREFIX} ${DRM_INTEL_VERSION}")
- #userptr support starts from 2.4.57, but 2.4.58 is the actual stable release
- IF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+ CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_bo_alloc_userptr" ${DRM_INTEL_LIBDIR} HAVE_DRM_INTEL_USERPTR)
+ IF(HAVE_DRM_INTEL_USERPTR)
MESSAGE(STATUS "Enable userptr support")
- SET(DRM_INTEL_USERPTR "enable")
- ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+ ELSE(HAVE_DRM_INTEL_USERPTR)
MESSAGE(STATUS "Disable userptr support")
- ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
- IF(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
+ ENDIF(HAVE_DRM_INTEL_USERPTR)
+ CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_eu_total" ${DRM_INTEL_LIBDIR} HAVE_DRM_INTEL_EU_TOTAL)
+ IF(HAVE_DRM_INTEL_EU_TOTAL)
MESSAGE(STATUS "Enable EU total query support")
- SET(DRM_INTEL_EU_TOTAL "enable")
- MESSAGE(STATUS "Enable subslice total query support")
- SET(DRM_INTEL_SUBSLICE_TOTAL "enable")
- ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
+ ELSE(HAVE_DRM_INTEL_EU_TOTAL)
MESSAGE(STATUS "Disable EU total query support")
+ ENDIF(HAVE_DRM_INTEL_EU_TOTAL)
+ CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_subslice_total" ${DRM_INTEL_LIBDIR} HAVE_DRM_INTEL_SUBSLICE_TOTAL)
+ IF(HAVE_DRM_INTEL_SUBSLICE_TOTAL)
+ MESSAGE(STATUS "Enable subslice total query support")
+ ELSE(HAVE_DRM_INTEL_SUBSLICE_TOTAL)
MESSAGE(STATUS "Disable subslice total query support")
- ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
+ ENDIF(HAVE_DRM_INTEL_SUBSLICE_TOTAL)
+ CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_pooled_eu" "" HAVE_DRM_INTEL_POOLED_EU)
+ IF(HAVE_DRM_INTEL_POOLED_EU)
+ MESSAGE(STATUS "Enable pooled eu query support")
+ ELSE(HAVE_DRM_INTEL_POOLED_EU)
+ MESSAGE(STATUS "Disable pooled eu query support")
+ ENDIF(HAVE_DRM_INTEL_POOLED_EU)
+ CHECK_LIBRARY_EXISTS(drm_intel "drm_intel_get_min_eu_in_pool" "" HAVE_DRM_INTEL_MIN_EU_IN_POOL)
+ IF(HAVE_DRM_INTEL_MIN_EU_IN_POOL)
+ MESSAGE(STATUS "Enable min eu in pool query support")
+ ELSE(HAVE_DRM_INTEL_MIN_EU_IN_POOL)
+ MESSAGE(STATUS "Disable min eu in pool query support")
+ ENDIF(HAVE_DRM_INTEL_MIN_EU_IN_POOL)
ELSE(DRM_INTEL_FOUND)
MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
ENDIF(DRM_INTEL_FOUND)
+# CMRT
+pkg_check_modules(CMRT libcmrt)
+IF(CMRT_FOUND)
+INCLUDE_DIRECTORIES(${CMRT_INCLUDE_DIRS})
+ENDIF(CMRT_FOUND)
+
# Threads
Find_Package(Threads)
@@ -212,11 +237,17 @@ ENDIF(OCLIcd_FOUND)
Find_Package(PythonInterp)
+OPTION(EXPERIMENTAL_DOUBLE "Enable experimental double support" OFF)
+IF(EXPERIMENTAL_DOUBLE)
+ ADD_DEFINITIONS(-DENABLE_FP64)
+ENDIF(EXPERIMENTAL_DOUBLE)
+
OPTION(BUILD_EXAMPLES "Build examples" OFF)
IF(BUILD_EXAMPLES)
IF(NOT X11_FOUND)
MESSAGE(FATAL_ERROR "XLib is necessary for examples - not found")
ENDIF(NOT X11_FOUND)
+
# libva & libva-x11
#pkg_check_modules(LIBVA REQUIRED libva>=0.36.0)
pkg_check_modules(LIBVA REQUIRED libva)
@@ -246,11 +277,11 @@ ENDIF(BUILD_EXAMPLES)
ADD_SUBDIRECTORY(include)
ADD_SUBDIRECTORY(backend)
ADD_SUBDIRECTORY(src)
-ADD_SUBDIRECTORY(utests)
+ADD_SUBDIRECTORY(utests EXCLUDE_FROM_ALL)
# compile benchmark only if standalone compiler is not provided
IF (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
-ADD_SUBDIRECTORY(benchmark)
+ ADD_SUBDIRECTORY(benchmark EXCLUDE_FROM_ALL)
ENDIF (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
IF(BUILD_EXAMPLES)
diff --git a/GetGenID.sh b/GetGenID.sh
index 30296da..6181105 100755
--- a/GetGenID.sh
+++ b/GetGenID.sh
@@ -1,5 +1,21 @@
#!/bin/bash
-genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 5a84)
+#IVB
+genpciid=(0152 0162 0156 0166 015a 016a)
+#BYT
+genpciid+=(0f31)
+#HSW
+genpciid+=(0402 0412 0422 040a 041a 042a 0406 0416 0426 040b 041b 042b 040e 041e 042e)
+genpciid+=(0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0c0b 0c1b 0c2b 0c0e 0c1e 0c2e)
+genpciid+=(0a02 0a12 0a22 0a0a 0a1a 0a2a 0a06 0a16 0a26 0a0b 0a1b 0a2b 0a0e 0a1e 0a2e)
+genpciid+=(0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 0d0b 0d1b 0d2b 0d0e 0d1e 0d2e)
+#BRW
+genpciid+=(1602 1606 160a 160d 160e 1612 1616 161a 161d 161e 1622 1626 162a 162d 162e)
+#BSW
+genpciid+=(22b0 22b1 22b2 22b3)
+#SKL
+genpciid+=(1906 1916 1926 190e 191e 1902 1912 1932 190b 191b 192b 193b 190a 191a 192a 193a)
+#BXT
+genpciid+=(5a84)
pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F ] '{print $1}'))
n=${#pciid[*]}
i=0
diff --git a/backend/src/Android.mk b/backend/src/Android.mk
new file mode 100644
index 0000000..da4d787
--- /dev/null
+++ b/backend/src/Android.mk
@@ -0,0 +1,288 @@
+LOCAL_PATH:= $(call my-dir)
+include $(LOCAL_PATH)/../../Android.common.mk
+
+include $(CLEAR_VARS)
+include $(CLEAR_TBLGEN_VARS)
+
+LLVM_ROOT_PATH := external/llvm
+CLANG_ROOT_PATH := external/clang
+
+include $(CLANG_ROOT_PATH)/clang.mk
+
+BACKEND_SRC_FILES:= \
+ ${ocl_blob_file} \
+ sys/vector.hpp \
+ sys/map.hpp \
+ sys/set.hpp \
+ sys/intrusive_list.hpp \
+ sys/intrusive_list.cpp \
+ sys/exception.hpp \
+ sys/assert.cpp \
+ sys/assert.hpp \
+ sys/alloc.cpp \
+ sys/alloc.hpp \
+ sys/mutex.cpp \
+ sys/mutex.hpp \
+ sys/platform.cpp \
+ sys/platform.hpp \
+ sys/cvar.cpp \
+ sys/cvar.hpp \
+ ir/context.cpp \
+ ir/context.hpp \
+ ir/profile.cpp \
+ ir/profile.hpp \
+ ir/type.cpp \
+ ir/type.hpp \
+ ir/unit.cpp \
+ ir/unit.hpp \
+ ir/constant.cpp \
+ ir/constant.hpp \
+ ir/sampler.cpp \
+ ir/sampler.hpp \
+ ir/image.cpp \
+ ir/image.hpp \
+ ir/half.cpp \
+ ir/half.hpp \
+ ir/instruction.cpp \
+ ir/instruction.hpp \
+ ir/liveness.cpp \
+ ir/register.cpp \
+ ir/register.hpp \
+ ir/function.cpp \
+ ir/function.hpp \
+ ir/profiling.cpp \
+ ir/profiling.hpp \
+ ir/value.cpp \
+ ir/value.hpp \
+ ir/lowering.cpp \
+ ir/lowering.hpp \
+ ir/printf.cpp \
+ ir/printf.hpp \
+ ir/immediate.hpp \
+ ir/immediate.cpp \
+ ir/structurizer.hpp \
+ ir/structurizer.cpp \
+ backend/context.cpp \
+ backend/context.hpp \
+ backend/program.cpp \
+ backend/program.hpp \
+ backend/program.h \
+ llvm/llvm_sampler_fix.cpp \
+ llvm/llvm_bitcode_link.cpp \
+ llvm/llvm_gen_backend.cpp \
+ llvm/llvm_passes.cpp \
+ llvm/llvm_scalarize.cpp \
+ llvm/llvm_intrinsic_lowering.cpp \
+ llvm/llvm_barrier_nodup.cpp \
+ llvm/llvm_printf_parser.cpp \
+ llvm/ExpandConstantExpr.cpp \
+ llvm/ExpandUtils.cpp \
+ llvm/PromoteIntegers.cpp \
+ llvm/ExpandLargeIntegers.cpp \
+ llvm/StripAttributes.cpp \
+ llvm/llvm_to_gen.cpp \
+ llvm/llvm_loadstore_optimization.cpp \
+ llvm/llvm_gen_backend.hpp \
+ llvm/llvm_gen_ocl_function.hxx \
+ llvm/llvm_unroll.cpp \
+ llvm/llvm_to_gen.hpp \
+ llvm/llvm_profiling.cpp \
+ backend/gen/gen_mesa_disasm.c \
+ backend/gen_insn_selection.cpp \
+ backend/gen_insn_selection.hpp \
+ backend/gen_insn_selection_optimize.cpp \
+ backend/gen_insn_scheduling.cpp \
+ backend/gen_insn_scheduling.hpp \
+ backend/gen_insn_selection_output.cpp \
+ backend/gen_insn_selection_output.hpp \
+ backend/gen_reg_allocation.cpp \
+ backend/gen_reg_allocation.hpp \
+ backend/gen_context.cpp \
+ backend/gen_context.hpp \
+ backend/gen75_context.hpp \
+ backend/gen75_context.cpp \
+ backend/gen8_context.hpp \
+ backend/gen8_context.cpp \
+ backend/gen9_context.hpp \
+ backend/gen9_context.cpp \
+ backend/gen_program.cpp \
+ backend/gen_program.hpp \
+ backend/gen_program.h \
+ backend/gen7_instruction.hpp \
+ backend/gen8_instruction.hpp \
+ backend/gen_defs.hpp \
+ backend/gen_insn_compact.cpp \
+ backend/gen_encoder.hpp \
+ backend/gen_encoder.cpp \
+ backend/gen7_encoder.hpp \
+ backend/gen7_encoder.cpp \
+ backend/gen75_encoder.hpp \
+ backend/gen75_encoder.cpp \
+ backend/gen8_encoder.hpp \
+ backend/gen8_encoder.cpp \
+ backend/gen9_encoder.hpp \
+ backend/gen9_encoder.cpp
+
+#Generate GBEConfig for android
+LOCAL_MODULE := libgbe
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE_CLASS := SHARED_LIBRARIES
+
+generated_path := $(call local-generated-sources-dir)
+gbe_config_file = $(LOCAL_PATH)/GBEConfig.h
+$(shell echo "// the configured options and settings for LIBGBE" > $(gbe_config_file))
+$(shell echo "#define LIBGBE_VERSION_MAJOR 0" >> $(gbe_config_file))
+$(shell echo "#define LIBGBE_VERSION_MINOR 2" >> $(gbe_config_file))
+$(shell echo "#if defined(__ANDROID__)" >> $(gbe_config_file))
+$(shell echo "#if __x86_64__" >> $(gbe_config_file))
+$(shell echo " #define GBE_OBJECT_DIR \"/system/lib64/libgbe.so\"" >> $(gbe_config_file))
+$(shell echo " #define INTERP_OBJECT_DIR \"/system/lib64/libgbeinterp.so\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_BITCODE_BIN \"/system/lib/ocl/beignet.bc\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_HEADER_DIR \"/system/lib/ocl/include\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_PCH_OBJECT \"/system/lib/ocl/beignet.pch\"" >> $(gbe_config_file))
+$(shell echo "#else /*__x86_64__*/" >> $(gbe_config_file))
+$(shell echo " #define GBE_OBJECT_DIR \"/system/lib/libgbe.so\"" >> $(gbe_config_file))
+$(shell echo " #define INTERP_OBJECT_DIR \"/system/lib/libgbeinterp.so\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_BITCODE_BIN \"/system/lib/ocl/beignet.bc\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_HEADER_DIR \"/system/lib/ocl/include\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_PCH_OBJECT \"/system/lib/ocl/beignet.pch\"" >> $(gbe_config_file))
+$(shell echo "#endif" >> $(gbe_config_file))
+$(shell echo "#else /*__ANDROID__*/" >> $(gbe_config_file))
+$(shell echo " #define GBE_OBJECT_DIR \"\"" >> $(gbe_config_file))
+$(shell echo " #define INTERP_OBJECT_DIR \"\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_BITCODE_BIN \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_HEADER_DIR \"`pwd $(TOP)`/$(generated_path)/libocl/include\"" >> $(gbe_config_file))
+$(shell echo " #define OCL_PCH_OBJECT \"`pwd $(TOP)`/$(generated_path)\"" >> $(gbe_config_file))
+$(shell echo "#endif" >> $(gbe_config_file))
+
+#Build HOST libgbe.so
+LOCAL_C_INCLUDES := $(TOP_C_INCLUDE) \
+ $(BEIGNET_ROOT_PATH) \
+ $(LOCAL_PATH)/../ \
+ $(LLVM_INCLUDE_DIRS)
+LOCAL_CPPFLAGS += $(LLVM_CFLAGS) -std=c++11 -fexceptions -DGBE_DEBUG=0 -DGBE_COMPILER_AVAILABLE=1 -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+LOCAL_CFLAGS += $(LLVM_CFLAGS) -fexceptions -DGBE_DEBUG=0 -DGBE_COMPILER_AVAILABLE=1 -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+LOCAL_LDLIBS += -lpthread -lm -ldl -lLLVM -lclang
+#LOCAL_STATIC_LIBRARIES := $(CLANG_MODULE_LIBS)
+LOCAL_SHARED_LIBRARIES := libclang
+
+TBLGEN_TABLES := \
+ AttrList.inc \
+ Attrs.inc \
+ CommentCommandList.inc \
+ CommentNodes.inc \
+ DeclNodes.inc \
+ DiagnosticCommonKinds.inc \
+ DiagnosticDriverKinds.inc \
+ DiagnosticFrontendKinds.inc \
+ DiagnosticSemaKinds.inc
+
+LOCAL_SRC_FILES = $(BACKEND_SRC_FILES)
+include $(CLANG_HOST_BUILD_MK)
+include $(CLANG_TBLGEN_RULES_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
+include $(BUILD_HOST_SHARED_LIBRARY)
+
+
+#Build gbe_bin_generater
+include $(CLEAR_VARS)
+LOCAL_SRC_FILES := gbe_bin_generater.cpp
+
+LOCAL_C_INCLUDES := $(TOP_C_INCLUDE) \
+ $(BEIGNET_ROOT_PATH) \
+ $(LOCAL_PATH)/ \
+ $(LLVM_INCLUDE_DIRS)
+
+LOCAL_CLANG := true
+LOCAL_MODULE := gbe_bin_generater
+LOCAL_MODULE_TAGS := optional
+LOCAL_CFLAGS = $(LLVM_CFLAGS) -std=gnu++11 -fexceptions
+LOCAL_SHARED_LIBRARIES := libgbe
+LOCAL_LDLIBS += -lpthread -lm -ldl
+
+include $(BUILD_HOST_EXECUTABLE)
+
+
+#Build libgbeinterp.so
+include $(CLEAR_VARS)
+
+LLVM_ROOT_PATH := external/llvm
+include $(LLVM_ROOT_PATH)/llvm.mk
+
+LOCAL_C_INCLUDES := $(TOP_C_INCLUDE) \
+ $(BEIGNET_ROOT_PATH) \
+ $(LOCAL_PATH)/../ \
+ $(LLVM_INCLUDE_DIRS)
+
+LOCAL_LDFLAGS := -Wl,--no-undefined
+
+LOCAL_CFLAGS += $(SUBDIR_C_CXX_FLAGS)
+LOCAL_CPPFLAGS += -Wl,-E -std=c++11 -DGBE_COMPILER_AVAILABLE=1
+
+LOCAL_MODULE := libgbeinterp
+LOCAL_MODULE_TAGS := optional
+LOCAL_SRC_FILES := gbe_bin_interpreter.cpp
+LOCAL_SHARED_LIBRARIES := \
+libcutils \
+$(DRM_INTEL_LIBRARY) \
+$(DRM_LIBRARY)
+
+include $(LLVM_DEVICE_BUILD_MK)
+include $(BUILD_SHARED_LIBRARY)
+
+#Build targe libgbe.so
+include $(CLEAR_VARS)
+include $(CLEAR_TBLGEN_VARS)
+
+LOCAL_C_INCLUDES := $(TOP_C_INCLUDE) \
+ $(BEIGNET_ROOT_PATH) \
+ $(LOCAL_PATH)/../ \
+ $(LLVM_INCLUDE_DIRS)
+
+SUBDIR_C_CXX_FLAGS := -fvisibility=hidden
+SUBDIR_C_CXX_FLAGS += -funroll-loops -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall
+SUBDIR_C_CXX_FLAGS += $(LLVM_CFLAGS)
+
+LOCAL_CPPFLAGS := $(SUBDIR_C_CXX_FLAGS)
+LOCAL_CPPFLAGS += -fno-rtti -std=c++11 -DGBE_DEBUG=1 -DGBE_COMPILER_AVAILABLE=1 -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+LOCAL_CPPFLAGS += -Wl,-E
+
+#LOCAL_SDK_VERSION := 19
+#LOCAL_NDK_STL_VARIANT := gnustl_static
+
+LOCAL_CFLAGS := $(SUBDIR_C_CXX_FLAGS)
+LOCAL_CFLAGS += -Wl,-E
+LOCAL_LDFLAGS := -Wl,--no-undefined
+LOCAL_LDLIBS := $(LLVM_LFLAGS)
+
+LOCAL_MODULE := libgbe
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE_CLASS := SHARED_LIBRARIES
+LOCAL_SHARED_LIBRARIES := \
+libcutils \
+$(DRM_INTEL_LIBRARY) \
+$(DRM_LIBRARY) \
+libclang libLLVM
+#$(THREAD_LIBS_INIT)
+#$(DL_LIBS)
+
+#LOCAL_STATIC_LIBRARIES := $(CLANG_MODULE_LIBS)
+
+TBLGEN_TABLES := \
+ AttrList.inc \
+ Attrs.inc \
+ CommentCommandList.inc \
+ CommentNodes.inc \
+ DeclNodes.inc \
+ DiagnosticCommonKinds.inc \
+ DiagnosticDriverKinds.inc \
+ DiagnosticFrontendKinds.inc \
+ DiagnosticSemaKinds.inc
+
+LOCAL_SRC_FILES := $(BACKEND_SRC_FILES)
+
+include $(CLANG_DEVICE_BUILD_MK)
+include $(CLANG_TBLGEN_RULES_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
+include $(BUILD_SHARED_LIBRARY)
+
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index c0d0c23..41eb5ec 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -1,6 +1,6 @@
-set (OCL_BITCODE_BIN "${BEIGNET_INSTALL_DIR}beignet.bc")
+set (OCL_BITCODE_BIN "${BEIGNET_INSTALL_DIR}/beignet.bc")
set (OCL_HEADER_DIR "${BEIGNET_INSTALL_DIR}/include")
-set (OCL_PCH_OBJECT "${BEIGNET_INSTALL_DIR}beignet.pch")
+set (OCL_PCH_OBJECT "${BEIGNET_INSTALL_DIR}/beignet.pch")
set (GBE_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbe.so")
set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
@@ -12,7 +12,6 @@ configure_file (
#do not involve libocl if the standalone compiler is given,
if (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
add_subdirectory(libocl)
-add_dependencies(beignet_bitcode libocl)
endif ()
set (LOCAL_GBE_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbe.so" PARENT_SCOPE)
@@ -66,6 +65,8 @@ set (GBE_SRC
ir/value.hpp
ir/lowering.cpp
ir/lowering.hpp
+ ir/profiling.cpp
+ ir/profiling.hpp
ir/printf.cpp
ir/printf.hpp
ir/immediate.hpp
@@ -85,6 +86,7 @@ set (GBE_SRC
llvm/llvm_intrinsic_lowering.cpp
llvm/llvm_barrier_nodup.cpp
llvm/llvm_printf_parser.cpp
+ llvm/llvm_profiling.cpp
llvm/ExpandConstantExpr.cpp
llvm/ExpandUtils.cpp
llvm/PromoteIntegers.cpp
@@ -99,8 +101,11 @@ set (GBE_SRC
backend/gen/gen_mesa_disasm.c
backend/gen_insn_selection.cpp
backend/gen_insn_selection.hpp
+ backend/gen_insn_selection_optimize.cpp
backend/gen_insn_scheduling.cpp
backend/gen_insn_scheduling.hpp
+ backend/gen_insn_selection_output.cpp
+ backend/gen_insn_selection_output.hpp
backend/gen_reg_allocation.cpp
backend/gen_reg_allocation.hpp
backend/gen_context.cpp
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index b230aa8..675dc78 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -38,44 +38,47 @@ namespace gbe
class SimpleAllocator
{
public:
- SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
+ SimpleAllocator(int32_t startOffset, int32_t size);
~SimpleAllocator(void);
/*! Allocate some memory from the pool.
*/
- int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
+ int32_t allocate(int32_t size, int32_t alignment, bool bFwd=false);
/*! Free the given register file piece */
- void deallocate(int16_t offset);
+ void deallocate(int32_t offset);
+ /*! check whether a super register is in free list,
+ * a super register means a 32byte register, Gen
+ * often has 128 super registers*/
+ bool isSuperRegisterFree(int32_t offset);
/*! Spilt a block into 2 blocks */
- void splitBlock(int16_t offset, int16_t subOffset);
+ void splitBlock(int32_t offset, int32_t subOffset);
protected:
/*! Double chained list of free spaces */
struct Block {
- Block(int16_t offset, int16_t size) :
+ Block(int32_t offset, int32_t size) :
prev(NULL), next(NULL), offset(offset), size(size) {}
Block *prev, *next; //!< Previous and next free blocks
- int16_t offset; //!< Where the free block starts
- int16_t size; //!< Size of the free block
+ int32_t offset; //!< Where the free block starts
+ int32_t size; //!< Size of the free block
};
/*! Try to coalesce two blocks (left and right). They must be in that order.
* If the colascing was done, the left block is deleted
*/
void coalesce(Block *left, Block *right);
+ void dumpFreeList();
/*! the maximum offset */
- int16_t maxOffset;
- /*! whether trigger an assertion on allocation failure */
- bool assertFail;
+ int32_t maxOffset;
/*! Head and tail of the free list */
Block *head;
Block *tail;
/*! Handle free list element allocation */
DECL_POOL(Block, blockPool);
/*! Track allocated memory blocks <offset, size> */
- map<int16_t, int16_t> allocatedBlocks;
+ map<int32_t, int32_t> allocatedBlocks;
/*! Use custom allocators */
GBE_CLASS(SimpleAllocator);
};
@@ -90,7 +93,7 @@ namespace gbe
class RegisterAllocator: public SimpleAllocator {
public:
- RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {}
+ RegisterAllocator(int32_t offset, int32_t size): SimpleAllocator(offset, size) {}
GBE_CLASS(RegisterAllocator);
};
@@ -102,17 +105,15 @@ namespace gbe
class ScratchAllocator: public SimpleAllocator {
public:
- ScratchAllocator(int16_t size): SimpleAllocator(0, size, true) {}
- int16_t getMaxScatchMemUsed() { return maxOffset; }
+ ScratchAllocator(int32_t size): SimpleAllocator(0, size) {}
+ int32_t getMaxScatchMemUsed() { return maxOffset; }
GBE_CLASS(ScratchAllocator);
};
- SimpleAllocator::SimpleAllocator(int16_t startOffset,
- int16_t size,
- bool _assertFail)
- : maxOffset(0),
- assertFail(_assertFail){
+ SimpleAllocator::SimpleAllocator(int32_t startOffset,
+ int32_t size)
+ : maxOffset(0) {
tail = head = this->newBlock(startOffset, size);
}
@@ -124,14 +125,38 @@ namespace gbe
}
}
- int16_t SimpleAllocator::allocate(int16_t size, int16_t alignment, bool bFwd)
+ void SimpleAllocator::dumpFreeList() {
+ Block *s = head;
+ printf("register free list:\n");
+ while (s) {
+ printf("blk: %d(r%d.%d) (%d)\n", s->offset, s->offset/GEN_REG_SIZE, s->offset % GEN_REG_SIZE, s->size);
+ s = s->next;
+ }
+ printf("free list end\n");
+ }
+
+ bool SimpleAllocator::isSuperRegisterFree(int32_t offset) {
+ assert((offset % GEN_REG_SIZE) == 0);
+ Block *s = head;
+ while (s) {
+ if (s->offset <= offset && (s->offset+s->size) >= offset+GEN_REG_SIZE) {
+ return true;
+ }
+ if (s->offset > offset)
+ return false;
+ s = s->next;
+ }
+ return false;
+ }
+
+ int32_t SimpleAllocator::allocate(int32_t size, int32_t alignment, bool bFwd)
{
// Make it simple and just use the first block we find
Block *list = bFwd ? head : tail;
while (list) {
- int16_t aligned;
- int16_t spaceOnLeft;
- int16_t spaceOnRight;
+ int32_t aligned;
+ int32_t spaceOnLeft;
+ int32_t spaceOnRight;
if(bFwd) {
aligned = ALIGN(list->offset, alignment);
spaceOnLeft = aligned - list->offset;
@@ -143,7 +168,7 @@ namespace gbe
continue;
}
} else {
- int16_t unaligned = list->offset + list->size - size - (alignment-1);
+ int32_t unaligned = list->offset + list->size - size - (alignment-1);
if(unaligned < 0) {
list = list->prev;
continue;
@@ -229,16 +254,15 @@ namespace gbe
// We have a valid offset now
return aligned;
}
- GBE_ASSERT( !assertFail );
- return 0;
+ return -1;
}
- void SimpleAllocator::deallocate(int16_t offset)
+ void SimpleAllocator::deallocate(int32_t offset)
{
// Retrieve the size in the allocation map
auto it = allocatedBlocks.find(offset);
GBE_ASSERT(it != allocatedBlocks.end());
- const int16_t size = it->second;
+ const int32_t size = it->second;
// Find the two blocks where to insert the new block
Block *list = tail, *next = NULL;
@@ -292,7 +316,7 @@ namespace gbe
}
}
- void SimpleAllocator::splitBlock(int16_t offset, int16_t subOffset) {
+ void SimpleAllocator::splitBlock(int32_t offset, int32_t subOffset) {
// Retrieve the size in the allocation map
auto it = allocatedBlocks.find(offset);
GBE_ASSERT(it != allocatedBlocks.end());
@@ -306,7 +330,7 @@ namespace gbe
if(subOffset == 0)
return;
- int16_t size = it->second;
+ int32_t size = it->second;
allocatedBlocks.erase(it);
// Track the allocation to retrieve the size later
allocatedBlocks.insert(std::make_pair(offset, subOffset));
@@ -322,7 +346,7 @@ namespace gbe
unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL), useDWLabel(false)
{
GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
- this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn));
+ this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn), true);
this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
// r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
this->registerAllocator = NULL; //GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
@@ -373,13 +397,17 @@ namespace gbe
return this->kernel;
}
- int16_t Context::allocate(int16_t size, int16_t alignment) {
- return registerAllocator->allocate(size, alignment);
+ int32_t Context::allocate(int32_t size, int32_t alignment, bool bFwd) {
+ return registerAllocator->allocate(size, alignment, bFwd);
}
- void Context::deallocate(int16_t offset) { registerAllocator->deallocate(offset); }
+ bool Context::isSuperRegisterFree(int offset) {
+ return registerAllocator->isSuperRegisterFree(offset);
+ }
+
+ void Context::deallocate(int32_t offset) { registerAllocator->deallocate(offset); }
- void Context::splitBlock(int16_t offset, int16_t subOffset) {
+ void Context::splitBlock(int32_t offset, int32_t subOffset) {
registerAllocator->splitBlock(offset, subOffset);
}
@@ -396,14 +424,14 @@ namespace gbe
void Context::buildStack(void) {
const auto &stackUse = dag->getUse(ir::ocl::stackptr);
- if (stackUse.size() == 0) // no stack is used if stackptr is unused
+ if (stackUse.size() == 0) { // no stack is used if stackptr is unused
+ this->kernel->stackSize = 0;
return;
- // Be sure that the stack pointer is set
- // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
+ }
uint32_t stackSize = 128;
while (stackSize < fn.getStackSize()) {
stackSize *= 3;
- GBE_ASSERT(stackSize <= 64*KB);
+ //GBE_ASSERT(stackSize <= 64*KB);
}
this->kernel->stackSize = stackSize;
}
@@ -421,20 +449,6 @@ namespace gbe
return offset;
}
- uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
- {
- int32_t offset = fn.getImageSet()->getInfoOffset(key);
- if (offset >= 0)
- return offset + GEN_REG_SIZE;
- newCurbeEntry(GBE_CURBE_IMAGE_INFO, key.data, size, 4);
- std::sort(kernel->patches.begin(), kernel->patches.end());
-
- offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, key.data);
- GBE_ASSERT(offset >= 0); // XXX do we need to spill it out to bo?
- fn.getImageSet()->appendInfo(key, offset);
- return offset + GEN_REG_SIZE;
- }
-
void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
curbeRegs.insert(std::make_pair(reg, offset));
}
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index faa7c8a..1567bd6 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -85,14 +85,12 @@ namespace gbe
return JIPs.find(insn) != JIPs.end();
}
/*! Allocate some memory in the register file */
- int16_t allocate(int16_t size, int16_t alignment);
+ int32_t allocate(int32_t size, int32_t alignment, bool bFwd = true);
+ bool isSuperRegisterFree(int offset);
/*! Deallocate previously allocated memory */
- void deallocate(int16_t offset);
+ void deallocate(int32_t offset);
/*! Spilt a block into 2 blocks, for some registers allocate together but deallocate seperate */
- void splitBlock(int16_t offset, int16_t subOffset);
- /* allocate a new entry for a specific image's information */
- /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
- uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
+ void splitBlock(int32_t offset, int32_t subOffset);
/*! allocate size scratch memory and return start address */
int32_t allocateScratchMem(uint32_t size);
/*! deallocate scratch memory at offset */
@@ -107,6 +105,21 @@ namespace gbe
uint32_t getMaxLabel(void) const {
return this->isDWLabel() ? 0xffffffff : 0xffff;
}
+ /*! get register's payload type. */
+ INLINE void getRegPayloadType(ir::Register reg, gbe_curbe_type &curbeType, int &subType) const {
+ if (reg.value() >= fn.getRegisterFile().regNum()) {
+ curbeType = GBE_GEN_REG;
+ subType = 0;
+ return;
+ }
+ fn.getRegPayloadType(reg, curbeType, subType);
+ }
+ /*! check whether a register is a payload register */
+ INLINE bool isPayloadReg(ir::Register reg) const{
+ if (reg.value() >= fn.getRegisterFile().regNum())
+ return false;
+ return fn.isPayloadReg(reg);
+ }
protected:
/*! Build the instruction stream. Return false if failed */
virtual bool emitCode(void) = 0;
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 5220233..5653275 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -77,6 +77,7 @@ static const struct {
[GEN_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_MAD] = { .name = "mad", .nsrc = 3, .ndst = 1 },
+ [GEN_OPCODE_LRP] = { .name = "lrp", .nsrc = 3, .ndst = 1 },
[GEN_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
@@ -84,6 +85,7 @@ static const struct {
[GEN_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_MADM] = { .name = "madm", .nsrc = 3, .ndst = 1 },
[GEN_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
@@ -271,6 +273,14 @@ static const char *reg_encoding[11] = {
[10] = ":HF"
};
+static const char *reg_encoding_3src[5] = {
+ [0] = ":F",
+ [1] = ":D",
+ [2] = ":UD",
+ [3] = ":DF",
+ [4] = ":HF",
+};
+
int reg_type_size[11] = {
[0] = 4,
[1] = 4,
@@ -311,6 +321,18 @@ static const char *writemask[16] = {
[0xf] = "",
};
+static const char *special_acc[9] = {
+ [0x0] = ".acc2",
+ [0x1] = ".acc3",
+ [0x2] = ".acc4",
+ [0x3] = ".acc5",
+ [0x4] = ".acc6",
+ [0x5] = ".acc7",
+ [0x6] = ".acc8",
+ [0x7] = ".acc9",
+ [0x8] = ".noacc",
+};
+
static const char *end_of_thread[2] = {
[0] = "",
[1] = "EOT"
@@ -383,10 +405,24 @@ static const char *math_function_gen8[16] = {
[GEN_MATH_FUNCTION_FDIV] = "fdiv",
[GEN_MATH_FUNCTION_POW] = "pow",
[GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+ [GEN_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
+ [GEN_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
[GEN8_MATH_FUNCTION_INVM] = "invm",
[GEN8_MATH_FUNCTION_RSQRTM] = "rsqrtm",
};
+static const char *data_port_data_cache_data_size[] = {
+ "1 byte",
+ "2 bytes",
+ "4 bytes",
+ "Reserved"
+};
+
+static const char *data_port_data_cache_byte_scattered_simd_mode[] = {
+ "SIMD8",
+ "SIMD16",
+};
+
static const char *data_port_data_cache_simd_mode[] = {
"SIMD4x2",
"SIMD16",
@@ -398,6 +434,14 @@ static const char *data_port_data_cache_category[] = {
"scratch",
};
+static const char *data_port_data_cache_block_size[] = {
+ "1 OWORD LOW",
+ "1 OWORD HIGH",
+ "2 OWORD",
+ "4 OWORD",
+ "8 OWORD",
+};
+
static const char *data_port_scratch_block_size[] = {
"1 register",
"2 registers",
@@ -455,6 +499,13 @@ static int column;
static int gen_version;
+#define GEN7_BITS_FIELD(inst, gen7) \
+ ({ \
+ int bits; \
+ bits = ((const union Gen7NativeInstruction *)inst)->gen7; \
+ bits; \
+ })
+
#define GEN_BITS_FIELD(inst, gen) \
({ \
int bits; \
@@ -509,6 +560,8 @@ static int gen_version;
#define EXECUTION_SIZE(inst) GEN_BITS_FIELD(inst, header.execution_size)
#define BRANCH_JIP(inst) GEN_BITS_FIELD2(inst, bits3.gen7_branch.jip, bits3.gen8_branch.jip/8)
#define BRANCH_UIP(inst) GEN_BITS_FIELD2(inst, bits3.gen7_branch.uip, bits2.gen8_branch.uip/8)
+#define VME_BTI(inst) GEN7_BITS_FIELD(inst, bits3.vme_gen7.bti)
+#define VME_MSG_TYPE(inst) GEN7_BITS_FIELD(inst, bits3.vme_gen7.msg_type)
#define SAMPLE_BTI(inst) GEN_BITS_FIELD(inst, bits3.sampler_gen7.bti)
#define SAMPLER(inst) GEN_BITS_FIELD(inst, bits3.sampler_gen7.sampler)
#define SAMPLER_MSG_TYPE(inst) GEN_BITS_FIELD(inst, bits3.sampler_gen7.msg_type)
@@ -518,6 +571,8 @@ static int gen_version;
#define UNTYPED_RW_SIMD_MODE(inst) GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.simd_mode)
#define UNTYPED_RW_CATEGORY(inst) GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.category)
#define UNTYPED_RW_MSG_TYPE(inst) GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.msg_type)
+#define BYTE_RW_SIMD_MODE(inst) GEN_BITS_FIELD(inst, bits3.gen7_byte_rw.simd_mode)
+#define BYTE_RW_DATA_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_byte_rw.data_size)
#define SCRATCH_RW_OFFSET(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.offset)
#define SCRATCH_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.block_size)
#define SCRATCH_RW_INVALIDATE_AFTER_READ(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.invalidate_after_read)
@@ -531,6 +586,25 @@ static int gen_version;
#define MSG_GW_ACKREQ(inst) GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.ackreq)
#define GENERIC_MSG_LENGTH(inst) GEN_BITS_FIELD(inst, bits3.generic_gen5.msg_length)
#define GENERIC_RESPONSE_LENGTH(inst) GEN_BITS_FIELD(inst, bits3.generic_gen5.response_length)
+#define OWORD_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_oblock_rw.block_size)
+
+static int is_special_acc(const void* inst)
+{
+ if (gen_version < 80)
+ return 0;
+
+ if (OPCODE(inst) != GEN_OPCODE_MADM && OPCODE(inst) != GEN_OPCODE_MATH)
+ return 0;
+
+ if (OPCODE(inst) == GEN_OPCODE_MATH &&
+ (MATH_FUNCTION(inst) != GEN8_MATH_FUNCTION_INVM && MATH_FUNCTION(inst) != GEN8_MATH_FUNCTION_RSQRTM))
+ return 0;
+
+ if (ACCESS_MODE(inst) != GEN_ALIGN_16)
+ return 0;
+
+ return 1;
+}
static int string(FILE *file, const char *string)
{
@@ -688,7 +762,12 @@ static int dest(FILE *file, const void* inst)
format(file, ".%d", GEN_BITS_FIELD(inst, bits1.da16.dest_subreg_nr) /
reg_type_size[GEN_BITS_FIELD(inst, bits1.da16.dest_reg_type)]);
string(file, "<1>");
- err |= control(file, "writemask", writemask, GEN_BITS_FIELD(inst, bits1.da16.dest_writemask), NULL);
+
+ if (is_special_acc(inst)) {
+ err |= control(file, "specialacc", special_acc, ((const union Gen8NativeInstruction *)inst)->bits1.da16acc.dst_special_acc, NULL);
+ } else {
+ err |= control(file, "writemask", writemask, GEN_BITS_FIELD(inst, bits1.da16.dest_writemask), NULL);
+ }
err |= control(file, "dest reg encoding", reg_encoding, GEN_BITS_FIELD(inst, bits1.da16.dest_reg_type), NULL);
} else {
err = 1;
@@ -710,8 +789,17 @@ static int dest_3src(FILE *file, const void *inst)
if (GEN_BITS_FIELD(inst, bits1.da3src.dest_subreg_nr))
format(file, ".%d", GEN_BITS_FIELD(inst, bits1.da3src.dest_subreg_nr));
string(file, "<1>");
- err |= control(file, "writemask", writemask, GEN_BITS_FIELD(inst, bits1.da3src.dest_writemask), NULL);
- err |= control(file, "dest reg encoding", reg_encoding, GEN_TYPE_F, NULL);
+ if (is_special_acc(inst)) {
+ err |= control(file, "specialacc", special_acc, ((const union Gen8NativeInstruction *)inst)->bits1.da3srcacc.dst_special_acc, NULL);
+ } else {
+ err |= control(file, "writemask", writemask, GEN_BITS_FIELD(inst, bits1.da3src.dest_writemask), NULL);
+ }
+
+ if (gen_version < 80) {
+ err |= control(file, "dest reg encoding", reg_encoding, GEN_TYPE_F, NULL);
+ } else {
+ err |= control(file, "dest reg encoding", reg_encoding_3src, ((const union Gen8NativeInstruction *)inst)->bits1.da3src.dest_type, NULL);
+ }
return 0;
}
@@ -775,7 +863,7 @@ static int src_ia1(FILE *file,
return err;
}
-static int src_da16(FILE *file,
+static int src_da16(FILE *file, const void* inst, int src_num,
uint32_t _reg_type,
uint32_t _reg_file,
uint32_t _vert_stride,
@@ -803,6 +891,17 @@ static int src_da16(FILE *file,
err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
string(file, ",4,1>");
+
+ if (is_special_acc(inst)) {
+ if (src_num == 0) {
+ err |= control(file, "specialacc", special_acc, ((const union Gen8NativeInstruction *)inst)->bits2.da16acc.src0_special_acc_lo, NULL);
+ } else {
+ assert(src_num == 1);
+ err |= control(file, "specialacc", special_acc, ((const union Gen8NativeInstruction *)inst)->bits3.da16acc.src1_special_acc_lo, NULL);
+ }
+ return err;
+ }
+
/*
* Three kinds of swizzle display:
* identity - nothing printed
@@ -846,10 +945,18 @@ static int src0_3src(FILE *file, const void* inst)
format(file, ".%d", GEN_BITS_FIELD(inst, bits2.da3src.src0_subreg_nr));
if (GEN_BITS_FIELD(inst, bits2.da3src.src0_rep_ctrl))
string(file, "<0,1,0>");
- else
- string(file, "<8,8,1>");
- err |= control(file, "src da16 reg type", reg_encoding,
- GEN_TYPE_F, NULL);
+
+ if (gen_version < 80) {
+ err |= control(file, "src da16 reg type", reg_encoding, GEN_TYPE_F, NULL);
+ } else {
+ err |= control(file, "src da16 reg type", reg_encoding_3src, ((const union Gen8NativeInstruction *)inst)->bits1.da3src.src_type, NULL);
+ }
+
+ if (is_special_acc(inst)) {
+ err |= control(file, "specialacc", special_acc, ((const union Gen8NativeInstruction *)inst)->bits2.da3srcacc.src0_special_acc, NULL);
+ return err;
+ }
+
/*
* Three kinds of swizzle display:
* identity - nothing printed
@@ -894,10 +1001,18 @@ static int src1_3src(FILE *file, const void* inst)
format(file, ".%d", src1_subreg_nr);
if (GEN_BITS_FIELD(inst, bits2.da3src.src1_rep_ctrl))
string(file, "<0,1,0>");
- else
- string(file, "<8,8,1>");
- err |= control(file, "src da16 reg type", reg_encoding,
- GEN_TYPE_F, NULL);
+
+ if (gen_version < 80) {
+ err |= control(file, "src da16 reg type", reg_encoding, GEN_TYPE_F, NULL);
+ } else {
+ err |= control(file, "src da16 reg type", reg_encoding_3src, ((const union Gen8NativeInstruction *)inst)->bits1.da3src.src_type, NULL);
+ }
+
+ if (is_special_acc(inst)) {
+ err |= control(file, "specialacc", special_acc, ((const union Gen8NativeInstruction *)inst)->bits2.da3srcacc.src1_special_acc, NULL);
+ return err;
+ }
+
/*
* Three kinds of swizzle display:
* identity - nothing printed
@@ -939,10 +1054,18 @@ static int src2_3src(FILE *file, const void* inst)
format(file, ".%d", GEN_BITS_FIELD(inst, bits3.da3src.src2_subreg_nr));
if (GEN_BITS_FIELD(inst, bits3.da3src.src2_rep_ctrl))
string(file, "<0,1,0>");
- else
- string(file, "<8,8,1>");
- err |= control(file, "src da16 reg type", reg_encoding,
- GEN_TYPE_F, NULL);
+
+ if (gen_version < 80) {
+ err |= control(file, "src da16 reg type", reg_encoding, GEN_TYPE_F, NULL);
+ } else {
+ err |= control(file, "src da16 reg type", reg_encoding_3src, ((const union Gen8NativeInstruction *)inst)->bits1.da3src.src_type, NULL);
+ }
+
+ if (is_special_acc(inst)) {
+ err |= control(file, "specialacc", special_acc, ((const union Gen8NativeInstruction *)inst)->bits3.da3srcacc.src2_special_acc, NULL);
+ return err;
+ }
+
/*
* Three kinds of swizzle display:
* identity - nothing printed
@@ -1066,6 +1189,16 @@ static int imm(FILE *file, uint32_t type, const void* inst)
format(file, "%-gHF", f);
break;
}
+ case GEN_TYPE_DF_IMM:
+ {
+ assert(!(gen_version < 80));
+ double val;
+ uint32_t hi = (((const union Gen8NativeInstruction *)inst)->bits3).ud;
+ uint32_t lo = (((const union Gen8NativeInstruction *)inst)->bits2).ud;
+ memcpy((void *)(&val), &lo, sizeof(uint32_t));
+ memcpy(((void *)(&val) + sizeof(uint32_t)), &hi, sizeof(uint32_t));
+ format(file, "%f", val);
+ }
}
return 0;
}
@@ -1106,7 +1239,7 @@ static int src0(FILE *file, const void* inst)
}
} else {
if (GEN_BITS_FIELD(inst, bits2.da16.src0_address_mode) == GEN_ADDRESS_DIRECT) {
- return src_da16(file,
+ return src_da16(file, inst, 0,
GEN_BITS_FIELD(inst, bits1.da16.src0_reg_type),
GEN_BITS_FIELD(inst, bits1.da16.src0_reg_file),
GEN_BITS_FIELD(inst, bits2.da16.src0_vert_stride),
@@ -1157,7 +1290,7 @@ static int src1(FILE *file, const void* inst)
}
} else {
if (GEN_BITS_FIELD(inst, bits3.da16.src1_address_mode) == GEN_ADDRESS_DIRECT) {
- return src_da16(file,
+ return src_da16(file, inst, 1,
GEN_BITS_FIELD2(inst, bits1.da16.src1_reg_type, bits2.da16.src1_reg_type),
GEN_BITS_FIELD2(inst, bits1.da16.src1_reg_file, bits2.da16.src1_reg_file),
GEN_BITS_FIELD(inst, bits3.da16.src1_vert_stride),
@@ -1333,6 +1466,11 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
switch (target) {
+ case GEN_SFID_VIDEO_MOTION_EST:
+ format(file, " (bti: %d, msg_type: %d)",
+ VME_BTI(inst),
+ VME_MSG_TYPE(inst));
+ break;
case GEN_SFID_SAMPLER:
format(file, " (%d, %d, %d, %d)",
SAMPLE_BTI(inst),
@@ -1340,14 +1478,39 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
SAMPLER_MSG_TYPE(inst),
SAMPLER_SIMD_MODE(inst));
break;
+ case GEN_SFID_DATAPORT_RENDER:
+ if(UNTYPED_RW_MSG_TYPE(inst) == 4 || UNTYPED_RW_MSG_TYPE(inst) == 10)
+ format(file, " (bti: %d, %s, %s)",
+ UNTYPED_RW_BTI(inst),
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ else
+ format(file, " not implemented");
+ break;
case GEN_SFID_DATAPORT_DATA:
if(UNTYPED_RW_CATEGORY(inst) == 0) {
- format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
+ if(UNTYPED_RW_MSG_TYPE(inst) == 5 || UNTYPED_RW_MSG_TYPE(inst) == 13)
+ format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
UNTYPED_RW_BTI(inst),
UNTYPED_RW_RGBA(inst),
data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ else if(UNTYPED_RW_MSG_TYPE(inst) == 4 || UNTYPED_RW_MSG_TYPE(inst) == 12)
+ format(file, " (bti: %d, data size: %s, %s, %s, %s)",
+ UNTYPED_RW_BTI(inst),
+ data_port_data_cache_data_size[BYTE_RW_DATA_SIZE(inst)],
+ data_port_data_cache_byte_scattered_simd_mode[BYTE_RW_SIMD_MODE(inst)],
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ else if(UNTYPED_RW_MSG_TYPE(inst) == 0 || UNTYPED_RW_MSG_TYPE(inst) == 8)
+ format(file, " (bti: %d, data size: %s, %s, %s)",
+ UNTYPED_RW_BTI(inst),
+ data_port_data_cache_block_size[OWORD_RW_BLOCK_SIZE(inst)],
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ else
+ format(file, " not implemented");
} else {
format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
SCRATCH_RW_OFFSET(inst),
@@ -1358,12 +1521,18 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
}
break;
case GEN_SFID_DATAPORT1_DATA:
- format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
- UNTYPED_RW_BTI(inst),
- UNTYPED_RW_RGBA(inst),
- data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
- data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
- data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ if(UNTYPED_RW_MSG_TYPE(inst) == 4 || UNTYPED_RW_MSG_TYPE(inst) == 10)
+ format(file, " (bti: %d, %s, %s)",
+ UNTYPED_RW_BTI(inst),
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ else
+ format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
+ UNTYPED_RW_BTI(inst),
+ UNTYPED_RW_RGBA(inst),
+ data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
break;
case GEN_SFID_DATAPORT_CONSTANT:
format(file, " (bti: %d, %s)",
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index b9dfb18..4376734 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -44,13 +44,7 @@ namespace gbe
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
- GenRegister sr0 = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
- GEN_ARF_STATE,
- 1,
- GEN_TYPE_UD,
- GEN_VERTICAL_STRIDE_8,
- GEN_WIDTH_8,
- GEN_HORIZONTAL_STRIDE_1);
+ GenRegister sr0 = GenRegister::sr(0, 1);
p->SHR(sr0, slm_index, GenRegister::immud(16));
p->pop();
}
@@ -67,40 +61,44 @@ namespace gbe
using namespace ir;
// Only emit stack pointer computation if we use a stack
- if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+ if (kernel->getStackSize() == 0)
return;
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
- const uint32_t perThreadSize = perLaneSize * this->simdWidth;
GBE_ASSERT(perLaneSize > 0);
const GenRegister selStatckPtr = this->simdWidth == 8 ?
GenRegister::ud8grf(ir::ocl::stackptr) :
GenRegister::ud16grf(ir::ocl::stackptr);
const GenRegister stackptr = ra->genReg(selStatckPtr);
+ // borrow block ip as temporary register as we will
+ // initialize block ip latter.
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+ const GenRegister tmpReg_ud = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UD);
// We compute the per-lane stack pointer here
- // private address start from zero
+ // threadId * perThreadSize + laneId*perLaneSize or
+ // (threadId * simdWidth + laneId)*perLaneSize
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
//p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
- p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
- p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
- p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
+ p->AND(stackptr, GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
+ p->SHR(stackptr, stackptr, GenRegister::immud(7));
+ p->SHL(tmpReg, tmpReg, GenRegister::immud(2));
+ p->ADD(tmpReg, tmpReg, stackptr); //threadId
+
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
p->curr.execWidth = this->simdWidth;
- p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K
+ loadLaneID(stackptr);
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
p->curr.execWidth = 1;
- p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
- p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
- if(perThreadSize > 0xffff) {
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K
- } else
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
+ p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
p->curr.execWidth = this->simdWidth;
- p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+ p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize
+
p->pop();
}
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
index 135be02..fc37991 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -221,72 +221,6 @@ namespace gbe
}
}
-
- void Gen75Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
- union { double d; unsigned u[2]; } u;
- u.d = value;
- GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
- push();
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- curr.execWidth = 1;
- MOV(r, GenRegister::immud(u.u[0]));
- MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[1]));
- pop();
- r.type = GEN_TYPE_DF;
- r.vstride = GEN_VERTICAL_STRIDE_0;
- r.width = GEN_WIDTH_1;
- r.hstride = GEN_HORIZONTAL_STRIDE_0;
- push();
- uint32_t width = curr.execWidth;
- curr.execWidth = 8;
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- curr.quarterControl = GEN_COMPRESSION_Q1;
- MOV(dest, r);
- if (width == 16) {
- curr.quarterControl = GEN_COMPRESSION_Q2;
- MOV(GenRegister::offset(dest, 2), r);
- }
- pop();
- }
-
- void Gen75Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp) {
- GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
- GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F);
- int w = curr.execWidth;
- GenRegister r0;
- r0 = GenRegister::h2(r);
- push();
- curr.execWidth = 4;
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- MOV(r0, src0);
- MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
- curr.noMask = 0;
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(dest, r0);
- curr.nibControl = 1;
- MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r0, 4));
- pop();
- if (w == 16) {
- push();
- curr.execWidth = 4;
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- MOV(r0, GenRegister::suboffset(src0, 8));
- MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
- curr.noMask = 0;
- curr.quarterControl = 1;
- curr.nibControl = 0;
- MOV(GenRegister::suboffset(dest, 8), r0);
- curr.nibControl = 1;
- MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r0, 4));
- pop();
- }
- }
-
void Gen75Encoder::JMPI(GenRegister src, bool longjmp) {
alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
}
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
index 5d80bbd..d06f393 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -33,8 +33,6 @@ namespace gbe
class Gen75Encoder : public Gen7Encoder
{
public:
- /*! exec width of the double data type */
- #define GEN75_DOUBLE_EXEC_WIDTH 4
virtual ~Gen75Encoder(void) { }
Gen75Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
@@ -44,10 +42,6 @@ namespace gbe
virtual void JMPI(GenRegister src, bool longjmp = false);
/*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
- /*! Get double/long exec width */
- virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
- virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
- virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index fc358be..4f35491 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -175,6 +175,16 @@ namespace gbe
{
GenNativeInstruction *insn = this->next(opcode);
Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
+ int execution_size = 0;
+ if (this->curr.execWidth == 1) {
+ execution_size = GEN_WIDTH_1;
+ } else if (this->curr.execWidth == 8) {
+ execution_size = GEN_WIDTH_8;
+ } else if (this->curr.execWidth == 16) {
+ // Gen7 does not support SIMD16 alu3, still need to use SIMD8
+ execution_size = GEN_WIDTH_8;
+ } else
+ NOT_IMPLEMENTED;
assert(dest.file == GEN_GENERAL_REGISTER_FILE);
assert(dest.nr < 128);
@@ -182,11 +192,11 @@ namespace gbe
assert(dest.type = GEN_TYPE_F);
gen7_insn->bits1.da3src.dest_reg_file = 0;
gen7_insn->bits1.da3src.dest_reg_nr = dest.nr;
- gen7_insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
+ gen7_insn->bits1.da3src.dest_subreg_nr = dest.subnr / 4;
gen7_insn->bits1.da3src.dest_writemask = 0xf;
this->setHeader(insn);
gen7_insn->header.access_mode = GEN_ALIGN_16;
- gen7_insn->header.execution_size = GEN_WIDTH_8;
+ gen7_insn->header.execution_size = execution_size;
assert(src0.file == GEN_GENERAL_REGISTER_FILE);
assert(src0.address_mode == GEN_ADDRESS_DIRECT);
@@ -239,5 +249,53 @@ namespace gbe
}
}
+ static void setMBlockRWGEN7(GenEncoder *p,
+ GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT_RENDER;
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ insn->bits3.gen7_mblock_rw.msg_type = msg_type;
+ insn->bits3.gen7_mblock_rw.bti = bti;
+ insn->bits3.gen7_mblock_rw.header_present = 1;
+ }
+
+
+ void Gen7Encoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1;
+ const uint32_t response_length = size; // Size of registers
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setMBlockRWGEN7(this,
+ insn,
+ bti,
+ GEN75_P1_MEDIA_BREAD,
+ msg_length,
+ response_length);
+ }
+
+ void Gen7Encoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1 + size;
+ const uint32_t response_length = 0; // Size of registers
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setMBlockRWGEN7(this,
+ insn,
+ bti,
+ GEN75_P1_MEDIA_TYPED_BWRITE,
+ msg_length,
+ response_length);
+ }
+
+
#undef NO_SWIZZLE
}
diff --git a/backend/src/backend/gen7_encoder.hpp b/backend/src/backend/gen7_encoder.hpp
index f009263..edb711d 100644
--- a/backend/src/backend/gen7_encoder.hpp
+++ b/backend/src/backend/gen7_encoder.hpp
@@ -31,21 +31,21 @@ namespace gbe
class Gen7Encoder : public GenEncoder
{
public:
- /*! gen7 exec width of the double data type */
- #define GEN7_DOUBLE_EXEC_WIDTH 8
virtual ~Gen7Encoder(void) { }
Gen7Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
: GenEncoder(simdWidth, gen, deviceID) { }
- /*! Get double/long exec width */
- virtual int getDoubleExecWidth(void) { return GEN7_DOUBLE_EXEC_WIDTH; }
virtual void setHeader(GenNativeInstruction *insn);
virtual void setDst(GenNativeInstruction *insn, GenRegister dest);
virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg);
virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
virtual void alu3(uint32_t opcode, GenRegister dst,
GenRegister src0, GenRegister src1, GenRegister src2);
+ /*! MBlock read */
+ virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ /*! MBlock write */
+ virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
};
}
#endif /* __GBE_GEN7_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen7_instruction.hpp b/backend/src/backend/gen7_instruction.hpp
index 51f342b..7d7eada 100644
--- a/backend/src/backend/gen7_instruction.hpp
+++ b/backend/src/backend/gen7_instruction.hpp
@@ -350,6 +350,21 @@ union Gen7NativeInstruction
uint32_t end_of_thread:1;
} sampler_gen7;
+ struct {
+ uint32_t bti:8;
+ uint32_t vme_search_path_lut:3;
+ uint32_t lut_sub:2;
+ uint32_t msg_type:2;
+ uint32_t stream_in:1;
+ uint32_t stream_out:1;
+ uint32_t reserved_mbz:2;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad1:2;
+ uint32_t end_of_thread:1;
+ } vme_gen7;
+
/**
* Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
*
@@ -516,6 +531,22 @@ union Gen7NativeInstruction
uint32_t uip:16;
} gen7_branch;
+ /*! Data port Media block read / write */
+ struct {
+ uint32_t bti:8;
+ uint32_t ver_line_stride_offset:1;
+ uint32_t ver_line_stride:1;
+ uint32_t ver_line_stride_override:1;
+ uint32_t ignored:3;
+ uint32_t msg_type:4;
+ uint32_t category:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_mblock_rw;
+
int d;
uint32_t ud;
float f;
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index a92bdde..5809835 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -178,11 +178,13 @@ namespace gbe
p->curr.noMask = 1;
GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
- ind_src.addr_imm += 16;
+ if(!uniform_src)
+ ind_src.addr_imm += 16;
p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 16), ind_src);
if (simd == 16) {
for (int i = 0; i < 2; i++) {
- ind_src.addr_imm += 16;
+ if(!uniform_src)
+ ind_src.addr_imm += 16;
p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 16*i), ind_src);
}
}
@@ -237,12 +239,77 @@ namespace gbe
GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
if (simd == 16) {
- ind_src.addr_imm += 16;
+ if(!uniform_src)
+ ind_src.addr_imm += 16;
p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 16), ind_src);
}
p->pop();
p->MOV(dst, tmp);
+ }else if (src.type == GEN_TYPE_UL || src.type == GEN_TYPE_L) {
+ bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+ GBE_ASSERT(uniform_src || src.subnr == 0);
+ GBE_ASSERT(dst.subnr == 0);
+ GBE_ASSERT(tmp.subnr == 0);
+ GBE_ASSERT(start_addr >= 0);
+ new_a0[0] = start_addr + 7;
+ new_a0[1] = start_addr + 6;
+ new_a0[2] = start_addr + 5;
+ new_a0[3] = start_addr + 4;
+ new_a0[4] = start_addr + 3;
+ new_a0[5] = start_addr + 2;
+ new_a0[6] = start_addr + 1;
+ new_a0[7] = start_addr;
+ if(!uniform_src) {
+ new_a0[8] = start_addr + 15;
+ new_a0[9] = start_addr + 14;
+ new_a0[10] = start_addr + 13;
+ new_a0[11] = start_addr + 12;
+ new_a0[12] = start_addr + 11;
+ new_a0[13] = start_addr + 10;
+ new_a0[14] = start_addr + 9;
+ new_a0[15] = start_addr + 8;
+ } else {
+ new_a0[8] = start_addr + 7;
+ new_a0[9] = start_addr + 6;
+ new_a0[10] = start_addr + 5;
+ new_a0[11] = start_addr + 4;
+ new_a0[12] = start_addr + 3;
+ new_a0[13] = start_addr + 2;
+ new_a0[14] = start_addr + 1;
+ new_a0[15] = start_addr;
+ }
+ this->setA0Content(new_a0, 56);
+
+ p->push();
+ p->curr.execWidth = 16;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
+ p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+ if(!uniform_src)
+ ind_src.addr_imm += 16;
+ p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 16), ind_src);
+ for (int i = 0; i < 2; i++) {
+ if(!uniform_src)
+ ind_src.addr_imm += 16;
+ p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 16*i), ind_src);
+ }
+ if (simd == 16) {
+ for (int i = 0; i < 2; i++) {
+ if(!uniform_src)
+ ind_src.addr_imm += 16;
+ p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 2, 16*i), ind_src);
+ }
+ for (int i = 0; i < 2; i++) {
+ if(!uniform_src)
+ ind_src.addr_imm += 16;
+ p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 3, 16*i), ind_src);
+ }
+ }
+ p->pop();
+
+ p->MOV(dst, tmp);
} else {
GBE_ASSERT(0);
}
@@ -259,20 +326,14 @@ namespace gbe
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
assert(insn.opcode == SEL_OP_SIMD_SHUFFLE);
-
- uint32_t simd = p->curr.execWidth;
- if (src1.file == GEN_IMMEDIATE_VALUE) {
- uint32_t offset = src1.value.ud % simd;
- GenRegister reg = GenRegister::suboffset(src0, offset);
- p->MOV(dst, GenRegister::retype(GenRegister::ud1grf(reg.nr, reg.subnr / typeSize(reg.type)), reg.type));
- } else {
- uint32_t base = src0.nr * 32 + src0.subnr * 4;
- GenRegister baseReg = GenRegister::immuw(base);
- const GenRegister a0 = GenRegister::addr8(0);
- p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
- GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
- p->MOV(dst, indirect);
- }
+ assert (src1.file != GEN_IMMEDIATE_VALUE);
+
+ uint32_t base = src0.nr * 32 + src0.subnr * 4;
+ GenRegister baseReg = GenRegister::immuw(base);
+ const GenRegister a0 = GenRegister::addr8(0);
+ p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+ GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+ p->MOV(dst, indirect);
}
void Gen8Context::emitBinaryInstruction(const SelectionInstruction &insn) {
@@ -854,9 +915,10 @@ namespace gbe
p->UNTYPED_READ(dst, src, bti, 2*elemNum);
} else {
const GenRegister tmp = ra->genReg(insn.dst(2*elemNum));
+ const GenRegister btiTmp = ra->genReg(insn.dst(2*elemNum + 1));
unsigned desc = p->generateUntypedReadMessageDesc(0, 2*elemNum);
- unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+ unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
//predicated load
p->push();
@@ -864,7 +926,7 @@ namespace gbe
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), 2*elemNum);
p->pop();
- afterMessage(insn, bti, tmp, jip0);
+ afterMessage(insn, bti, tmp, btiTmp, jip0);
}
for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
@@ -893,9 +955,10 @@ namespace gbe
p->UNTYPED_WRITE(addr, bti, elemNum*2);
} else {
const GenRegister tmp = ra->genReg(insn.dst(elemNum));
+ const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
- unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+ unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
//predicated load
p->push();
@@ -903,7 +966,7 @@ namespace gbe
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
p->pop();
- afterMessage(insn, bti, tmp, jip0);
+ afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
void Gen8Context::emitPackLongInstruction(const SelectionInstruction &insn) {
@@ -924,6 +987,151 @@ namespace gbe
this->unpackLongVec(src, dst, p->curr.execWidth);
}
+ void Gen8Context::emitF64DIVInstruction(const SelectionInstruction &insn) {
+ /* Macro for Double Precision IEEE Compliant fdiv
+
+ Set Rounding Mode in CR to RNE
+ GRF are initialized: r0 = 0, r6 = a, r7 = b, r1 = 1
+ The default data type for the macro is :df
+
+ math.eo.f0.0 (4) r8.acc2 r6.noacc r7.noacc 0xE
+ (-f0.0) if
+ madm (4) r9.acc3 r0.noacc r6.noacc r8.acc2 // Step(1), q0=a*y0
+ madm (4) r10.acc4 r1.noacc -r7.noacc r8.acc2 // Step(2), e0=(1-b*y0)
+ madm (4) r11.acc5 r6.noacc -r7.noacc r9.acc3 // Step(3), r0=a-b*q0
+ madm (4) r12.acc6 r8.acc2 r10.acc4 r8.acc2 // Step(4), y1=y0+e0*y0
+ madm (4) r13.acc7 r1.noacc -r7.noacc r12.acc6 // Step(5), e1=(1-b*y1)
+ madm (4) r8.acc8 r8.acc2 r10.acc4 r12.acc6 // Step(6), y2=y0+e0*y1
+ madm (4) r9.acc9 r9.acc3 r11.acc5 r12.acc6 // Step(7), q1=q0+r0*y1
+ madm (4) r12.acc2 r12.acc6 r8.acc8 r13.acc7 // Step(8), y3=y1+e1*y2
+ madm (4) r11.acc3 r6.noacc -r7.noacc r9.acc9 // Step(9), r1=a-b*q1
+
+ Change Rounding Mode in CR if required
+ Implicit Accumulator for destination is NULL
+
+ madm (4) r8.noacc r9.acc9 r11.acc3 r12.acc2 // Step(10), q=q1+r1*y3
+ endif */
+ GenRegister src0 = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_DF);
+ GenRegister src1 = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_DF);
+ GenRegister dst = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_DF);
+ GenRegister r6 , r7, r8;
+ int src0Stride = 1;
+ int src1Stride = 1;
+ int tmpNum = 7;
+ int loopNum = 0;
+
+ if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {// dst is uniform
+ loopNum = 1;
+ } else if (p->curr.execWidth == 4) {
+ loopNum = 1;
+ } else if (p->curr.execWidth == 8) {
+ loopNum = 2;
+ } else if (p->curr.execWidth == 16) {
+ loopNum = 4;
+ } else
+ GBE_ASSERT(0);
+
+ r8 = GenRegister::retype(ra->genReg(insn.dst(tmpNum + 1)), GEN_TYPE_DF);
+ tmpNum++;
+
+ if (src0.vstride == GEN_HORIZONTAL_STRIDE_0) {
+ r6 = GenRegister::retype(ra->genReg(insn.dst(tmpNum + 1)), GEN_TYPE_DF);
+ tmpNum++;
+ src0Stride = 0;
+ p->push(); {
+ p->curr.execWidth = 4;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask= 1;
+ p->MOV(r6, src0);
+ } p->pop();
+ } else {
+ r6 = src0;
+ }
+
+ if (src1.vstride == GEN_HORIZONTAL_STRIDE_0) {
+ r7 = GenRegister::retype(ra->genReg(insn.dst(tmpNum + 1)), GEN_TYPE_DF);
+ tmpNum++;
+ src1Stride = 0;
+ p->push(); {
+ p->curr.execWidth = 4;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(r7, src1);
+ } p->pop();
+ } else {
+ r7 = src1;
+ }
+
+ const GenRegister r0 = GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_DF);
+ const GenRegister r1 = GenRegister::retype(ra->genReg(insn.dst(2)), GEN_TYPE_DF);
+ const GenRegister r9 = GenRegister::retype(ra->genReg(insn.dst(3)), GEN_TYPE_DF);
+ const GenRegister r10 = GenRegister::retype(ra->genReg(insn.dst(4)), GEN_TYPE_DF);
+ const GenRegister r11 = GenRegister::retype(ra->genReg(insn.dst(5)), GEN_TYPE_DF);
+ const GenRegister r12 = GenRegister::retype(ra->genReg(insn.dst(6)), GEN_TYPE_DF);
+ const GenRegister r13 = GenRegister::retype(ra->genReg(insn.dst(7)), GEN_TYPE_DF);
+ Gen8Encoder *p8 = reinterpret_cast<Gen8Encoder *>(p);
+ p->push(); {
+ p->curr.execWidth = 4;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask= 1;
+ p->MOV(r1, GenRegister::immdf(1.0));
+ p->MOV(r0, GenRegister::immdf(0.0));
+ } p->pop();
+
+ for (int i = 0; i < loopNum; i++) {
+ p->push(); {
+ p->curr.noMask= 1;
+ p->curr.execWidth = 4;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p8->MATH_WITH_ACC(r8, GEN8_MATH_FUNCTION_INVM, r6, r7, GEN8_INSN_ACC2, GEN8_INSN_NOACC, GEN8_INSN_NOACC);
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.inversePredicate = 1;
+ p8->MADM(r9, r0, r6, r8, GEN8_INSN_ACC3, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC2);
+ p8->MADM(r10, r1, GenRegister::negate(r7), r8, GEN8_INSN_ACC4, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC2);
+ p8->MADM(r11, r6, GenRegister::negate(r7), r9, GEN8_INSN_ACC5, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC3);
+ p8->MADM(r12, r8, r10, r8, GEN8_INSN_ACC6, GEN8_INSN_ACC2, GEN8_INSN_ACC4, GEN8_INSN_ACC2);
+ p8->MADM(r13, r1, GenRegister::negate(r7), r12, GEN8_INSN_ACC7, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC6);
+ p8->MADM(r8, r8, r10, r12, GEN8_INSN_ACC8, GEN8_INSN_ACC2, GEN8_INSN_ACC4, GEN8_INSN_ACC6);
+ p8->MADM(r9, r9, r11, r12, GEN8_INSN_ACC9, GEN8_INSN_ACC3, GEN8_INSN_ACC5, GEN8_INSN_ACC6);
+ p8->MADM(r12, r12, r8, r13, GEN8_INSN_ACC2, GEN8_INSN_ACC6, GEN8_INSN_ACC8, GEN8_INSN_ACC7);
+ p8->MADM(r11, r6, GenRegister::negate(r7), r9, GEN8_INSN_ACC3, GEN8_INSN_NOACC, GEN8_INSN_NOACC, GEN8_INSN_ACC9);
+
+ p8->MADM(r8, r9, r11, r12, GEN8_INSN_NOACC, GEN8_INSN_ACC9, GEN8_INSN_ACC3, GEN8_INSN_ACC2);
+ } p->pop();
+
+ r6 = GenRegister::offset(r6, src0Stride);
+ r7 = GenRegister::offset(r7, src1Stride);
+
+ /* Move back the result. */
+ if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {// dst is uniform
+ p->push(); {
+ p->curr.execWidth = 1;
+ r8.hstride = GEN_HORIZONTAL_STRIDE_0;
+ r8.vstride = GEN_VERTICAL_STRIDE_0;
+ r8.width = GEN_WIDTH_1;
+ p->MOV(dst, r8);
+ } p->pop();
+ break;
+ } else {
+ p->push(); {
+ p->curr.execWidth = 4;
+ if (i % 2 == 0)
+ p->curr.nibControl = 0;
+ else
+ p->curr.nibControl = 1;
+
+ if (i < 2)
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ else
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+
+ p->MOV(GenRegister::offset(dst, i), r8);
+ } p->pop();
+ }
+ }
+ }
+
void Gen8Context::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
if (sz == 0)
sz = 16;
@@ -943,6 +1151,27 @@ namespace gbe
p->pop();
}
+ void Gen8Context::subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp)
+ {
+ p->push(); {
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->ADD(GenRegister::retype(t0, GEN_TYPE_UL), GenRegister::retype(t0, GEN_TYPE_UL),
+ GenRegister::negate(GenRegister::retype(t1, GEN_TYPE_UL)));
+ } p->pop();
+ }
+
+ void Gen8Context::addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp) {
+ p->push(); {
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->ADD(GenRegister::retype(t0, GEN_TYPE_UL), GenRegister::retype(t0, GEN_TYPE_UL),
+ GenRegister::retype(t1, GEN_TYPE_UL));
+ } p->pop();
+ }
+
void ChvContext::newSelection(void) {
this->sel = GBE_NEW(SelectionChv, *this);
}
@@ -1035,6 +1264,42 @@ namespace gbe
p->ADD(dst, dst, res);
}
+ void Gen8Context::emitPrintfLongInstruction(GenRegister& addr, GenRegister& data,
+ GenRegister& src, uint32_t bti) {
+ GenRegister tempSrc, tempDst;
+ GenRegister nextSrc, nextDst;
+ p->push();
+ tempSrc = GenRegister::h2(GenRegister::retype(src, GEN_TYPE_UD));
+ tempDst = GenRegister::retype(data, GEN_TYPE_UD);
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ p->MOV(tempDst, tempSrc);
+
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ nextSrc = GenRegister::Qn(tempSrc, 1);
+ nextDst = GenRegister::Qn(tempDst, 1);
+ p->MOV(nextDst, nextSrc);
+ p->pop();
+ p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+
+ p->push();
+ tempSrc = GenRegister::h2(
+ GenRegister::retype(GenRegister::offset(src, 0, 4), GEN_TYPE_UD));
+ tempDst = GenRegister::retype(data, GEN_TYPE_UD);
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ p->MOV(tempDst, tempSrc);
+
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ nextSrc = GenRegister::Qn(tempSrc, 1);
+ nextDst = GenRegister::Qn(tempDst, 1);
+ p->MOV(nextDst, nextSrc);
+ p->pop();
+ p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+ }
+
void ChvContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
if (sz == 0)
sz = 16;
@@ -1052,4 +1317,561 @@ namespace gbe
p->pop();
}
+ /* Init value according to WORKGROUP OP
+ * Emit assert is invalid combination operation - datatype */
+ static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+ {
+
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ {
+ if (dataReg.type == GEN_TYPE_D
+ || dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF));
+ else if(dataReg.type == GEN_TYPE_L ||
+ dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_ANY
+ || wg_op == ir::WORKGROUP_OP_REDUCE_ADD
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x0));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_HF)
+ p->MOV(dataReg, GenRegister::immh(0x0));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(dataReg, GenRegister::immf(0x0));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x0));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x7FFFFFFF));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0xFFFFFFFF));
+ else if (dataReg.type == GEN_TYPE_HF)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UW), GenRegister::immuw(0x7C00));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX
+ || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x80000000));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_HF)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UW), GenRegister::immuw(0xFC00));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ }
+
+ /* unsupported operation */
+ else
+ GBE_ASSERT(0);
+ }
+
+ /* Perform WORKGROUP OP on 2 input elements (registers) */
+ static void wgOpPerform(GenRegister dst,
+ GenRegister src1,
+ GenRegister src2,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ /* perform OP REDUCE on 2 elements */
+ if (wg_op == ir::WORKGROUP_OP_ANY)
+ p->OR(dst, src1, src2);
+ else if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->AND(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN INCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ /* perform OP SCAN EXCLUSIVE on 2 elements */
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+ else
+ GBE_ASSERT(0);
+ }
+
+ static void wgOpPerformThread(GenRegister threadDst,
+ GenRegister inputVal,
+ GenRegister threadExchangeData,
+ GenRegister resultVal,
+ uint32_t simd,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+
+ /* setting the type */
+ resultVal = GenRegister::retype(resultVal, inputVal.type);
+ threadDst = GenRegister::retype(threadDst, inputVal.type);
+ threadExchangeData = GenRegister::retype(threadExchangeData, inputVal.type);
+
+ vector<GenRegister> input;
+ vector<GenRegister> result;
+
+ /* for workgroup all and any we can use simd_all/any for each thread */
+ if (wg_op == ir::WORKGROUP_OP_ALL || wg_op == ir::WORKGROUP_OP_ANY) {
+ GenRegister constZero = GenRegister::immuw(0);
+ GenRegister flag01 = GenRegister::flag(0, 1);
+
+ p->push();
+ {
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = simd;
+ p->MOV(resultVal, GenRegister::immud(1));
+ p->curr.execWidth = 1;
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->MOV(flag01, GenRegister::immw(-1));
+ else
+ p->MOV(flag01, constZero);
+
+ p->curr.execWidth = simd;
+ p->curr.noMask = 0;
+
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_NEQ, inputVal, constZero);
+
+ if (p->curr.execWidth == 16)
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+ else
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else if (p->curr.execWidth == 8)
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+ else
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else
+ NOT_IMPLEMENTED;
+ p->SEL(threadDst, resultVal, constZero);
+ p->SEL(threadExchangeData, resultVal, constZero);
+ }
+ p->pop();
+ } else {
+ if (inputVal.hstride == GEN_HORIZONTAL_STRIDE_0) {
+ p->MOV(threadExchangeData, inputVal);
+ p->pop();
+ return;
+ }
+
+ /* init thread data to min/max/null values */
+ p->push(); {
+ p->curr.execWidth = simd;
+ wgOpInitValue(p, threadExchangeData, wg_op);
+ p->MOV(resultVal, inputVal);
+ } p->pop();
+
+ GenRegister resultValSingle = resultVal;
+ resultValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ resultValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ resultValSingle.width = GEN_WIDTH_1;
+
+ GenRegister inputValSingle = inputVal;
+ inputValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ inputValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ inputValSingle.width = GEN_WIDTH_1;
+
+
+ /* make an array of registers for easy accesing */
+ for(uint32_t i = 0; i < simd; i++){
+ /* add all resultVal offset reg positions from list */
+ result.push_back(resultValSingle);
+ input.push_back(inputValSingle);
+
+ /* move to next position */
+ resultValSingle.subnr += typeSize(resultValSingle.type);
+ if (resultValSingle.subnr == 32) {
+ resultValSingle.subnr = 0;
+ resultValSingle.nr++;
+ }
+ /* move to next position */
+ inputValSingle.subnr += typeSize(inputValSingle.type);
+ if (inputValSingle.subnr == 32) {
+ inputValSingle.subnr = 0;
+ inputValSingle.nr++;
+ }
+ }
+
+ uint32_t start_i = 0;
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) {
+ p->MOV(result[0], input[0]);
+ start_i = 1;
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
+ p->MOV(result[1], input[0]);
+ start_i = 2;
+ }
+
+ /* algorithm workgroup */
+ for (uint32_t i = start_i; i < simd; i++)
+ {
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ wgOpPerform(result[0], result[0], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ wgOpPerform(result[i], result[i - 1], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p);
+
+ else
+ GBE_ASSERT(0);
+ }
+ }
+
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[0]);
+ /* partial result thread */
+ p->MOV(threadDst, result[0]);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[simd - 1]);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 1;
+ /* set result[0] to min/max/null */
+ wgOpInitValue(p, result[0], wg_op);
+
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+
+ p->pop();
+ }
+
+/**
+ * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ *
+ * Implementation:
+ * 1. All the threads first perform the workgroup op value for the
+ * allocated work-items. SIMD16=> 16 work-items allocated for each thread
+ * 2. Each thread writes the partial result in shared local memory using threadId
+ * 3. After a barrier, each thread will read in chunks of 1-4 elements,
+ * the shared local memory region, using a loop based on the thread num value (threadN)
+ * 4. Each thread computes the final value individually
+ *
+ * Optimizations:
+ * Performance is given by chunk read. If threads read in chunks of 4 elements
+ * the performance is increase 2-3x times compared to chunks of 1 element.
+ */
+ void Gen8Context::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type);
+ const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(2)), dst.type);
+ GenRegister threadData = ra->genReg(insn.src(3));
+ GenRegister partialData = GenRegister::toUniform(threadData, dst.type);
+ GenRegister threadId = ra->genReg(insn.src(0));
+ GenRegister threadLoop = ra->genReg(insn.src(1));
+ GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+ GenRegister localBarrier = ra->genReg(insn.src(5));
+
+ uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t simd = p->curr.execWidth;
+ int32_t jip0, jip1;
+
+ /* masked elements should be properly set to init value */
+ p->push(); {
+ p->curr.noMask = 1;
+ wgOpInitValue(p, tmp, wg_op);
+ p->curr.noMask = 0;
+ p->MOV(tmp, theVal);
+ p->curr.noMask = 1;
+ p->MOV(theVal, tmp);
+ } p->pop();
+
+ threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
+
+ /* use of continuous GRF allocation from insn selection */
+ GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
+ GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
+ GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
+ GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
+
+ /* do some calculation within each thread */
+ wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+
+ p->curr.execWidth = 16;
+ p->MOV(theVal, dst);
+ threadData = GenRegister::toUniform(threadData, dst.type);
+
+ /* store thread count for future use on read/write to SLM */
+ if (wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+ }
+
+ /* all threads write the partial results to SLM memory */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+ {
+ GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
+ GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
+ p->MOV(msgData.offset(msgData, 0), threadDataL);
+ p->MOV(msgData.offset(msgData, 1), threadDataH);
+
+ p->curr.execWidth = 8;
+ p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+ }
+ else
+ {
+ p->curr.execWidth = 8;
+ p->MOV(msgData, threadData);
+ p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+ }
+
+ /* init partialData register, it will hold the final result */
+ wgOpInitValue(p, partialData, wg_op);
+
+ /* add call to barrier */
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.physicalFlag = 0;
+ p->curr.noMask = 1;
+ p->AND(localBarrier, barrierId, GenRegister::immud(0x0f000000));
+ p->BARRIER(localBarrier);
+ p->curr.execWidth = 1;
+ p->WAIT();
+ p->pop();
+
+ /* perform a loop, based on thread count (which is now a multiple of 4) */
+ p->push();{
+ jip0 = p->n_instruction();
+
+ /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+ {
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+ p->MUL(msgAddr, threadLoop, GenRegister::immd(0x8));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 2);
+
+ GenRegister msgDataL = msgData.retype(msgData.offset(msgData, 0, 4), GEN_TYPE_D);
+ GenRegister msgDataH = msgData.retype(msgData.offset(msgData, 1, 4), GEN_TYPE_D);
+ msgDataL.hstride = 2;
+ msgDataH.hstride = 2;
+ p->MOV(msgDataL, msgDataH);
+
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ }
+ else
+ {
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+ p->MUL(msgAddr, threadLoop, GenRegister::immd(0x4));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
+
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ }
+
+ /* while threadN is not 0, cycle read SLM / update value */
+ p->curr.noMask = 1;
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_G, threadLoop, GenRegister::immd(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ jip1 = p->n_instruction();
+ p->JMPI(GenRegister::immud(0));
+ p->patchJMPI(jip1, jip0 - jip1, 0);
+ } p->pop();
+
+ if(wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+ p->MOV(dst, partialData);
+ }
+ else
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, dst, partialData);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ {
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+ /* workaround QW datatype on CMP */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 1, 0),
+ dst.offset(dst, 1, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 2, 0),
+ dst.offset(dst, 2, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 3, 0),
+ dst.offset(dst, 3, 0), partialData);
+ }
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+ /* workaround QW datatype on CMP */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 1, 0),
+ dst.offset(dst, 1, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 2, 0),
+ dst.offset(dst, 2, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 3, 0),
+ dst.offset(dst, 3, 0), partialData);
+ }
+ }
+ }
+
+ /* corner case for thread 0 */
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->push();{
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+
+ p->curr.execWidth = 16;
+ p->MOV(dst, theVal);
+ } p->pop();
+ }
+ }
+
+ void Gen8Context::emitSubGroupOpInstruction(const SelectionInstruction &insn){
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type);
+ const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(0)), dst.type);
+ GenRegister threadData = ra->genReg(insn.src(1));
+
+ uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t simd = p->curr.execWidth;
+
+ /* masked elements should be properly set to init value */
+ p->push(); {
+ p->curr.noMask = 1;
+ wgOpInitValue(p, tmp, wg_op);
+ p->curr.noMask = 0;
+ p->MOV(tmp, theVal);
+ p->curr.noMask = 1;
+ p->MOV(theVal, tmp);
+ } p->pop();
+
+ /* do some calculation within each thread */
+ wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+ }
+
}
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index b33aeeb..ec1358c 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -74,10 +74,18 @@ namespace gbe
virtual void emitPackLongInstruction(const SelectionInstruction &insn);
virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
+ virtual void emitF64DIVInstruction(const SelectionInstruction &insn);
+
+ virtual void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
+ virtual void emitSubGroupOpInstruction(const SelectionInstruction &insn);
+
static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0);
protected:
virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+ virtual void subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp);
+ virtual void addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp);
+ virtual void emitPrintfLongInstruction(GenRegister& addr, GenRegister& data, GenRegister& src, uint32_t bti);
virtual GenEncoder* generateEncoder(void) {
return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
}
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 69eabb2..277260f 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -37,6 +37,7 @@ static const uint32_t untypedRWMask[] = {
namespace gbe
{
+ extern bool compactAlu3(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, GenRegister src2);
void Gen8Encoder::setHeader(GenNativeInstruction *insn) {
Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
if (this->curr.execWidth == 8)
@@ -227,75 +228,11 @@ namespace gbe
this->setSrc1(insn, bti);
}
}
- void Gen8Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
- union { double d; unsigned u[2]; } u;
- u.d = value;
- GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
- push();
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- curr.execWidth = 1;
- MOV(r, GenRegister::immud(u.u[0]));
- MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[1]));
- pop();
- r.type = GEN_TYPE_DF;
- r.vstride = GEN_VERTICAL_STRIDE_0;
- r.width = GEN_WIDTH_1;
- r.hstride = GEN_HORIZONTAL_STRIDE_0;
- push();
- uint32_t width = curr.execWidth;
- curr.execWidth = 8;
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- curr.quarterControl = GEN_COMPRESSION_Q1;
- MOV(dest, r);
- if (width == 16) {
- curr.quarterControl = GEN_COMPRESSION_Q2;
- MOV(GenRegister::offset(dest, 2), r);
- }
- pop();
- }
void Gen8Encoder::LOAD_INT64_IMM(GenRegister dest, GenRegister value) {
MOV(dest, value);
}
- void Gen8Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp) {
- GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
- GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F);
- int w = curr.execWidth;
- GenRegister r0;
- r0 = GenRegister::h2(r);
- push();
- curr.execWidth = 4;
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- MOV(r0, src0);
- MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
- curr.noMask = 0;
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(dest, r0);
- curr.nibControl = 1;
- MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r0, 4));
- pop();
- if (w == 16) {
- push();
- curr.execWidth = 4;
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- MOV(r0, GenRegister::suboffset(src0, 8));
- MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
- curr.noMask = 0;
- curr.quarterControl = 1;
- curr.nibControl = 0;
- MOV(GenRegister::suboffset(dest, 8), r0);
- curr.nibControl = 1;
- MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r0, 4));
- pop();
- }
- }
-
void Gen8Encoder::JMPI(GenRegister src, bool longjmp) {
alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
}
@@ -360,6 +297,46 @@ namespace gbe
gen8_insn->bits1.da1.dest_horiz_stride = dest.hstride;
}
+ void Gen8Encoder::setSrc0WithAcc(GenNativeInstruction *insn, GenRegister reg, uint32_t accN) {
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ assert(reg.file == GEN_GENERAL_REGISTER_FILE);
+ assert(reg.nr < 128);
+ assert(gen8_insn->header.access_mode == GEN_ALIGN_16);
+ assert(reg.subnr == 0);
+ assert(gen8_insn->header.execution_size >= GEN_WIDTH_4);
+
+ gen8_insn->bits1.da16acc.src0_reg_file = reg.file;
+ gen8_insn->bits1.da16acc.src0_reg_type = reg.type;
+ gen8_insn->bits2.da16acc.src0_abs = reg.absolute;
+ gen8_insn->bits2.da16acc.src0_negate = reg.negation;
+ gen8_insn->bits2.da16acc.src0_address_mode = reg.address_mode;
+ gen8_insn->bits2.da16acc.src0_subreg_nr = reg.subnr / 16;
+ gen8_insn->bits2.da16acc.src0_reg_nr = reg.nr;
+ gen8_insn->bits2.da16acc.src0_special_acc_lo = accN;
+ gen8_insn->bits2.da16acc.src0_special_acc_hi = 0;
+ gen8_insn->bits2.da16acc.src0_vert_stride = reg.vstride;
+ }
+
+ void Gen8Encoder::setSrc1WithAcc(GenNativeInstruction *insn, GenRegister reg, uint32_t accN) {
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ assert(reg.file == GEN_GENERAL_REGISTER_FILE);
+ assert(reg.nr < 128);
+ assert(gen8_insn->header.access_mode == GEN_ALIGN_16);
+ assert(reg.subnr == 0);
+ assert(gen8_insn->header.execution_size >= GEN_WIDTH_4);
+
+ gen8_insn->bits2.da16acc.src1_reg_file = reg.file;
+ gen8_insn->bits2.da16acc.src1_reg_type = reg.type;
+ gen8_insn->bits3.da16acc.src1_abs = reg.absolute;
+ gen8_insn->bits3.da16acc.src1_negate = reg.negation;
+ gen8_insn->bits3.da16acc.src1_address_mode = reg.address_mode;
+ gen8_insn->bits3.da16acc.src1_subreg_nr = reg.subnr / 16;
+ gen8_insn->bits3.da16acc.src1_reg_nr = reg.nr;
+ gen8_insn->bits3.da16acc.src1_special_acc_lo = accN;
+ gen8_insn->bits3.da16acc.src1_special_acc_hi = 0;
+ gen8_insn->bits3.da16acc.src1_vert_stride = reg.vstride;
+ }
+
void Gen8Encoder::setSrc0(GenNativeInstruction *insn, GenRegister reg) {
Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
if (reg.file != GEN_ARCHITECTURE_REGISTER_FILE)
@@ -372,7 +349,7 @@ namespace gbe
gen8_insn->bits2.da1.src0_negate = reg.negation;
gen8_insn->bits2.da1.src0_address_mode = reg.address_mode;
if (reg.file == GEN_IMMEDIATE_VALUE) {
- if (reg.type == GEN_TYPE_L || reg.type == GEN_TYPE_UL) {
+ if (reg.type == GEN_TYPE_L || reg.type == GEN_TYPE_UL || reg.type == GEN_TYPE_DF_IMM) {
gen8_insn->bits3.ud = (uint32_t)(reg.value.i64 >> 32);
gen8_insn->bits2.ud = (uint32_t)(reg.value.i64);
} else {
@@ -459,6 +436,53 @@ namespace gbe
return false;
}
+ void Gen8Encoder::handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1)
+ {
+ uint32_t w = p->curr.execWidth;
+ GenNativeInstruction *insn = NULL;
+
+ if (w <= 8) {
+ insn = p->next(opcode);
+ p->setHeader(insn);
+ p->setDst(insn, dst);
+ p->setSrc0(insn, src0);
+ if (!GenRegister::isNull(src1))
+ p->setSrc1(insn, src1);
+ return;
+ } else {
+ GBE_ASSERT(w == 16);
+ GBE_ASSERT(dst.hstride != GEN_HORIZONTAL_STRIDE_0); //Should not be a uniform.
+ p->push(); {
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ insn = p->next(opcode);
+ p->setHeader(insn);
+ p->setDst(insn, dst);
+ p->setSrc0(insn, src0);
+ if (!GenRegister::isNull(src1))
+ p->setSrc1(insn, src1);
+
+ // second half
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ insn = p->next(opcode);
+ p->setHeader(insn);
+ p->setDst(insn, GenRegister::offset(dst, 2));
+
+ if (src0.hstride != GEN_HORIZONTAL_STRIDE_0)
+ p->setSrc0(insn, GenRegister::offset(src0, 2));
+ else
+ p->setSrc0(insn, src0);
+
+ if (!GenRegister::isNull(src1)) {
+ if (src1.hstride != GEN_HORIZONTAL_STRIDE_0)
+ p->setSrc1(insn, GenRegister::offset(src1, 2));
+ else
+ p->setSrc1(insn, src1);
+ }
+ } p->pop();
+ }
+ }
+
#define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
void Gen8Encoder::alu3(uint32_t opcode,
@@ -467,25 +491,44 @@ namespace gbe
GenRegister src1,
GenRegister src2)
{
+ if(compactAlu3(this, opcode, dest, src0, src1, src2))
+ return;
GenNativeInstruction *insn = this->next(opcode);
Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ int execution_size = 0;
+ if (this->curr.execWidth == 1) {
+ execution_size = GEN_WIDTH_1;
+ }else if(this->curr.execWidth == 8) {
+ execution_size = GEN_WIDTH_8;
+ } else if(this->curr.execWidth == 16) {
+ execution_size = GEN_WIDTH_16;
+ }else
+ NOT_IMPLEMENTED;
+
assert(dest.file == GEN_GENERAL_REGISTER_FILE);
assert(dest.nr < 128);
assert(dest.address_mode == GEN_ADDRESS_DIRECT);
- assert(dest.type = GEN_TYPE_F);
+ assert(src0.type == GEN_TYPE_HF || src0.type == GEN_TYPE_F || src0.type == GEN_TYPE_DF);
+ assert(src0.type == dest.type);
+ assert(src0.type == src1.type);
+ assert(src0.type == src2.type);
+ int32_t dataType = src0.type == GEN_TYPE_DF ? 3 : (src0.type == GEN_TYPE_HF ? 4 : 0);
//gen8_insn->bits1.da3src.dest_reg_file = 0;
gen8_insn->bits1.da3src.dest_reg_nr = dest.nr;
- gen8_insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
+ gen8_insn->bits1.da3src.dest_subreg_nr = dest.subnr / 4;
gen8_insn->bits1.da3src.dest_writemask = 0xf;
+ gen8_insn->bits1.da3src.dest_type = dataType;
+ gen8_insn->bits1.da3src.src_type = dataType;
+ gen8_insn->bits1.da3src.src1_type = src1.type == GEN_TYPE_HF;
+ gen8_insn->bits1.da3src.src2_type = src2.type == GEN_TYPE_HF;
this->setHeader(insn);
gen8_insn->header.access_mode = GEN_ALIGN_16;
- gen8_insn->header.execution_size = GEN_WIDTH_8;
+ gen8_insn->header.execution_size = execution_size;
assert(src0.file == GEN_GENERAL_REGISTER_FILE);
assert(src0.address_mode == GEN_ADDRESS_DIRECT);
assert(src0.nr < 128);
- assert(src0.type == GEN_TYPE_F);
gen8_insn->bits2.da3src.src0_swizzle = NO_SWIZZLE;
gen8_insn->bits2.da3src.src0_subreg_nr = src0.subnr / 4 ;
gen8_insn->bits2.da3src.src0_reg_nr = src0.nr;
@@ -496,7 +539,6 @@ namespace gbe
assert(src1.file == GEN_GENERAL_REGISTER_FILE);
assert(src1.address_mode == GEN_ADDRESS_DIRECT);
assert(src1.nr < 128);
- assert(src1.type == GEN_TYPE_F);
gen8_insn->bits2.da3src.src1_swizzle = NO_SWIZZLE;
gen8_insn->bits2.da3src.src1_subreg_nr_low = (src1.subnr / 4) & 0x3;
gen8_insn->bits3.da3src.src1_subreg_nr_high = (src1.subnr / 4) >> 2;
@@ -508,28 +550,91 @@ namespace gbe
assert(src2.file == GEN_GENERAL_REGISTER_FILE);
assert(src2.address_mode == GEN_ADDRESS_DIRECT);
assert(src2.nr < 128);
- assert(src2.type == GEN_TYPE_F);
gen8_insn->bits3.da3src.src2_swizzle = NO_SWIZZLE;
gen8_insn->bits3.da3src.src2_subreg_nr = src2.subnr / 4;
gen8_insn->bits3.da3src.src2_rep_ctrl = src2.vstride == GEN_VERTICAL_STRIDE_0;
gen8_insn->bits3.da3src.src2_reg_nr = src2.nr;
gen8_insn->bits1.da3src.src2_abs = src2.absolute;
gen8_insn->bits1.da3src.src2_negate = src2.negation;
+ }
+
+ void Gen8Encoder::MATH_WITH_ACC(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1,
+ uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc)
+ {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_MATH);
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ assert(dst.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+ assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0);
- // Emit second half of the instruction
- if (this->curr.execWidth == 16) {
- GenNativeInstruction q1Insn = *insn;
- insn = this->next(opcode);
- *insn = q1Insn;
- gen8_insn = &insn->gen8_insn;
- gen8_insn->header.quarter_control = GEN_COMPRESSION_Q2;
- gen8_insn->bits1.da3src.dest_reg_nr++;
- if (gen8_insn->bits2.da3src.src0_rep_ctrl == 0)
- gen8_insn->bits2.da3src.src0_reg_nr++;
- if (gen8_insn->bits2.da3src.src1_rep_ctrl == 0)
- gen8_insn->bits3.da3src.src1_reg_nr++;
- if (gen8_insn->bits3.da3src.src2_rep_ctrl == 0)
- gen8_insn->bits3.da3src.src2_reg_nr++;
- }
+ gen8_insn->header.access_mode = GEN_ALIGN_16;
+ insn->header.destreg_or_condmod = function;
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ gen8_insn->bits1.da16acc.dst_special_acc = dstAcc;
+ this->setSrc0WithAcc(insn, src0, src0Acc);
+ this->setSrc1WithAcc(insn, src1, src1Acc);
+ }
+
+ void Gen8Encoder::MADM(GenRegister dst, GenRegister src0, GenRegister src1, GenRegister src2,
+ uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc, uint32_t src2Acc)
+ {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_MADM);
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ assert(dst.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src2.file == GEN_GENERAL_REGISTER_FILE);
+ assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+ assert(src0.type == GEN_TYPE_DF || src0.type == GEN_TYPE_F);
+ assert(src0.type == dst.type);
+ assert(src0.type == src1.type);
+ assert(src0.type == src2.type);
+ // If in double, width should be less than 4
+ assert((src0.type == GEN_TYPE_DF && this->curr.execWidth <= 4)
+ // If in float, width should be less than 8
+ || (src0.type == GEN_TYPE_F && this->curr.execWidth <= 8));
+
+ int32_t dataType = src0.type == GEN_TYPE_DF ? 3 : 0;
+
+ this->setHeader(insn);
+ gen8_insn->bits1.da3srcacc.dest_reg_nr = dst.nr;
+ gen8_insn->bits1.da3srcacc.dest_subreg_nr = dst.subnr / 16;
+ gen8_insn->bits1.da3srcacc.dst_special_acc = dstAcc;
+ gen8_insn->bits1.da3srcacc.src_type = dataType;
+ gen8_insn->bits1.da3srcacc.dest_type = dataType;
+ gen8_insn->header.access_mode = GEN_ALIGN_16;
+
+ assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src0.address_mode == GEN_ADDRESS_DIRECT);
+ assert(src0.nr < 128);
+ gen8_insn->bits2.da3srcacc.src0_special_acc = src0Acc;
+ gen8_insn->bits2.da3srcacc.src0_subreg_nr = src0.subnr / 4 ;
+ gen8_insn->bits2.da3srcacc.src0_reg_nr = src0.nr;
+ gen8_insn->bits1.da3srcacc.src0_abs = src0.absolute;
+ gen8_insn->bits1.da3srcacc.src0_negate = src0.negation;
+ gen8_insn->bits2.da3srcacc.src0_rep_ctrl = src0.vstride == GEN_VERTICAL_STRIDE_0;
+
+ assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src1.address_mode == GEN_ADDRESS_DIRECT);
+ assert(src1.nr < 128);
+ gen8_insn->bits2.da3srcacc.src1_special_acc = src1Acc;
+ gen8_insn->bits2.da3srcacc.src1_subreg_nr_low = (src1.subnr / 4) & 0x3;
+ gen8_insn->bits3.da3srcacc.src1_subreg_nr_high = (src1.subnr / 4) >> 2;
+ gen8_insn->bits2.da3srcacc.src1_rep_ctrl = src1.vstride == GEN_VERTICAL_STRIDE_0;
+ gen8_insn->bits3.da3srcacc.src1_reg_nr = src1.nr;
+ gen8_insn->bits1.da3srcacc.src1_abs = src1.absolute;
+ gen8_insn->bits1.da3srcacc.src1_negate = src1.negation;
+
+ assert(src2.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src2.address_mode == GEN_ADDRESS_DIRECT);
+ assert(src2.nr < 128);
+ gen8_insn->bits3.da3srcacc.src2_special_acc = src2Acc;
+ gen8_insn->bits3.da3srcacc.src2_subreg_nr = src2.subnr / 4;
+ gen8_insn->bits3.da3srcacc.src2_rep_ctrl = src2.vstride == GEN_VERTICAL_STRIDE_0;
+ gen8_insn->bits3.da3srcacc.src2_reg_nr = src2.nr;
+ gen8_insn->bits1.da3srcacc.src2_abs = src2.absolute;
+ gen8_insn->bits1.da3srcacc.src2_negate = src2.negation;
}
} /* End of the name space. */
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index 504e13d..12b3765 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -31,8 +31,6 @@ namespace gbe
class Gen8Encoder : public GenEncoder
{
public:
- /*! exec width of the double data type */
- #define GEN8_DOUBLE_EXEC_WIDTH 4
virtual ~Gen8Encoder(void) { }
Gen8Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
@@ -42,12 +40,8 @@ namespace gbe
virtual void JMPI(GenRegister src, bool longjmp = false);
/*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
- /*! Get double/long exec width */
- virtual int getDoubleExecWidth(void) { return GEN8_DOUBLE_EXEC_WIDTH; }
virtual void F16TO32(GenRegister dest, GenRegister src0);
virtual void F32TO16(GenRegister dest, GenRegister src0);
- virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
- virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
@@ -61,14 +55,22 @@ namespace gbe
virtual void setDst(GenNativeInstruction *insn, GenRegister dest);
virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg);
virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
- virtual bool disableCompact() { return true; }
+ virtual uint32_t getCompactVersion() { return 8; }
virtual void alu3(uint32_t opcode, GenRegister dst,
GenRegister src0, GenRegister src1, GenRegister src2);
virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
GenRegister src1 = GenRegister::null());
+ virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+ void setSrc0WithAcc(GenNativeInstruction *insn, GenRegister reg, uint32_t accN);
+ void setSrc1WithAcc(GenNativeInstruction *insn, GenRegister reg, uint32_t accN);
+
+ void MATH_WITH_ACC(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1,
+ uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc);
+ void MADM(GenRegister dst, GenRegister src0, GenRegister src1, GenRegister src2,
+ uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc, uint32_t src2Acc);
};
}
#endif /* __GBE_GEN8_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index 5cf1032..549948a 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -135,6 +135,22 @@ union Gen8NativeInstruction
uint32_t dest_address_mode:1;
} ia16;
+ struct { // The sub reg field is reinterpreted as accumulator selector.
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t mask_control:1;
+ uint32_t dest_reg_file:2;
+ uint32_t dest_reg_type:4;
+ uint32_t src0_reg_file:2;
+ uint32_t src0_reg_type:4;
+ uint32_t pad:1;
+ uint32_t dst_special_acc:4;
+ uint32_t dest_subreg_nr:1;
+ uint32_t dest_reg_nr:8;
+ uint32_t reserved:2;
+ uint32_t dest_address_mode:1;
+ } da16acc;
+
struct {
uint32_t flag_sub_reg_nr:1;
uint32_t flag_reg_nr:1;
@@ -153,6 +169,25 @@ union Gen8NativeInstruction
uint32_t dest_subreg_nr:3;
uint32_t dest_reg_nr:8;
} da3src;
+
+ struct {
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t mask_control:1;
+ uint32_t src1_type:1;
+ uint32_t src2_type:1;
+ uint32_t src0_abs:1;
+ uint32_t src0_negate:1;
+ uint32_t src1_abs:1;
+ uint32_t src1_negate:1;
+ uint32_t src2_abs:1;
+ uint32_t src2_negate:1;
+ uint32_t src_type:3;
+ uint32_t dest_type:3;
+ uint32_t dst_special_acc:4;
+ uint32_t dest_subreg_nr:3;
+ uint32_t dest_reg_nr:8;
+ } da3srcacc;
}bits1;
union {
@@ -219,6 +254,21 @@ union Gen8NativeInstruction
} ia16;
struct {
+ uint32_t src0_special_acc_lo:4;
+ uint32_t src0_subreg_nr:1;
+ uint32_t src0_reg_nr:8;
+ uint32_t src0_abs:1;
+ uint32_t src0_negate:1;
+ uint32_t src0_address_mode:1;
+ uint32_t src0_special_acc_hi:4;
+ uint32_t pad0:1;
+ uint32_t src0_vert_stride:4;
+ uint32_t src1_reg_file:2;
+ uint32_t src1_reg_type:4;
+ uint32_t pad:1;
+ } da16acc;
+
+ struct {
uint32_t src0_rep_ctrl:1;
uint32_t src0_swizzle:8;
uint32_t src0_subreg_nr:3;
@@ -230,6 +280,17 @@ union Gen8NativeInstruction
} da3src;
struct {
+ uint32_t src0_rep_ctrl:1;
+ uint32_t src0_special_acc:8;
+ uint32_t src0_subreg_nr:3;
+ uint32_t src0_reg_nr:8;
+ uint32_t src0_subreg_nr_w:1;
+ uint32_t src1_rep_ctrl:1;
+ uint32_t src1_special_acc:8;
+ uint32_t src1_subreg_nr_low:2;
+ } da3srcacc;
+
+ struct {
uint32_t uip:32;
} gen8_branch;
@@ -294,6 +355,19 @@ union Gen8NativeInstruction
} ia16;
struct {
+ uint32_t src1_special_acc_lo:4;
+ uint32_t src1_subreg_nr:1;
+ uint32_t src1_reg_nr:8;
+ uint32_t src1_abs:1;
+ uint32_t src1_negate:1;
+ uint32_t src1_address_mode:1;
+ uint32_t src1_special_acc_hi:4;
+ uint32_t pad1:1;
+ uint32_t src1_vert_stride:4;
+ uint32_t pad2:7;
+ } da16acc;
+
+ struct {
uint32_t function_control:19;
uint32_t header_present:1;
uint32_t response_length:5;
@@ -504,6 +578,18 @@ union Gen8NativeInstruction
uint32_t pad:1;
} da3src;
+ struct {
+ uint32_t src1_subreg_nr_high:1;
+ uint32_t src1_reg_nr:8;
+ uint32_t src1_subreg_nr_w:1;
+ uint32_t src2_rep_ctrl:1;
+ uint32_t src2_special_acc:8;
+ uint32_t src2_subreg_nr:3;
+ uint32_t src2_reg_nr:8;
+ uint32_t src2_subreg_nr_w:1;
+ uint32_t pad:1;
+ } da3srcacc;
+
/*! Message gateway */
struct {
uint32_t subfunc:3;
@@ -522,6 +608,22 @@ union Gen8NativeInstruction
uint32_t jip:32;
} gen8_branch;
+ /*! Data port Media block read / write */
+ struct {
+ uint32_t bti:8;
+ uint32_t ver_line_stride_offset:1;
+ uint32_t ver_line_stride:1;
+ uint32_t ver_line_stride_override:1;
+ uint32_t ignored:3;
+ uint32_t msg_type:4;
+ uint32_t category:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_mblock_rw;
+
int d;
uint32_t ud;
float f;
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 6b01657..dc05756 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -51,6 +51,7 @@ namespace gbe
p->BARRIER(src);
p->curr.execWidth = 1;
// Now we wait for the other threads
+ p->curr.predicate = GEN_PREDICATE_NONE;
p->WAIT();
p->pop();
}
@@ -164,4 +165,8 @@ namespace gbe
p->pop();
}
+ void KblContext::newSelection(void) {
+ this->sel = GBE_NEW(SelectionKbl, *this);
+ }
+
}
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index a2931cc..2f24b56 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -27,7 +27,7 @@
namespace gbe
{
- /* This class is used to implement the HSW
+ /* This class is used to implement the skylake
specific logic for context. */
class Gen9Context : public Gen8Context
{
@@ -68,5 +68,18 @@ namespace gbe
virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
};
+ /* This class is used to implement the kabylake
+ specific logic for context. */
+ class KblContext : public Gen9Context
+ {
+ public:
+ virtual ~KblContext(void) { };
+ KblContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+ : Gen9Context(unit, name, deviceID, relaxMath) {
+ };
+
+ private:
+ virtual void newSelection(void);
+ };
}
#endif /* __GBE_GEN9_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 2983d52..b429ec3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -28,10 +28,12 @@
#include "backend/gen_encoder.hpp"
#include "backend/gen_insn_selection.hpp"
#include "backend/gen_insn_scheduling.hpp"
+#include "backend/gen_insn_selection_output.hpp"
#include "backend/gen_reg_allocation.hpp"
#include "backend/gen/gen_mesa_disasm.h"
#include "ir/function.hpp"
#include "ir/value.hpp"
+#include "ir/profiling.hpp"
#include "sys/cvar.hpp"
#include <cstring>
#include <iostream>
@@ -52,6 +54,7 @@ namespace gbe
this->asmFileName = NULL;
this->ifEndifFix = false;
this->regSpillTick = 0;
+ this->inProfilingMode = false;
}
GenContext::~GenContext(void) {
@@ -91,6 +94,10 @@ namespace gbe
return i;
}
+ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
+#define SET_GENINSN_DBGINFO(I) \
+ if(OCL_DEBUGINFO) p->DBGInfo = I.DBGInfo;
+
void GenContext::emitInstructionStream(void) {
// Emit Gen ISA
for (auto &block : *sel->blockList)
@@ -100,6 +107,7 @@ namespace gbe
// no more virtual register here in that part of the code generation
GBE_ASSERT(insn.state.physicalFlag);
p->curr = insn.state;
+ SET_GENINSN_DBGINFO(insn);
switch (opcode) {
#define DECL_SELECTION_IR(OPCODE, FAMILY) \
case SEL_OP_##OPCODE: this->emit##FAMILY(insn); break;
@@ -113,6 +121,7 @@ namespace gbe
for(int i = 0; i < 8; i++)
p->NOP();
}
+#undef SET_GENINSN_DBGINFO
bool GenContext::patchBranches(void) {
using namespace ir;
@@ -139,40 +148,65 @@ namespace gbe
}
/* Get proper block ip register according to current label width. */
- static GenRegister getBlockIP(GenContext &ctx) {
+ GenRegister GenContext::getBlockIP(void) {
GenRegister blockip;
- if (!ctx.isDWLabel())
- blockip = ctx.ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+ if (!isDWLabel())
+ blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
else
- blockip = ctx.ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip));
+ blockip = ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip));
return blockip;
}
/* Set current block ip register to a specified constant label value. */
- static void setBlockIP(GenContext &ctx, GenRegister blockip, uint32_t label) {
- if (!ctx.isDWLabel())
- ctx.p->MOV(blockip, GenRegister::immuw(label));
+ void GenContext::setBlockIP(GenRegister blockip, uint32_t label) {
+ if (!isDWLabel())
+ p->MOV(blockip, GenRegister::immuw(label));
else
- ctx.p->MOV(blockip, GenRegister::immud(label));
+ p->MOV(blockip, GenRegister::immud(label));
}
void GenContext::clearFlagRegister(void) {
// when group size not aligned to simdWidth, flag register need clear to
// make prediction(any8/16h) work correctly
- const GenRegister blockip = getBlockIP(*this);
- const GenRegister zero = ra->genReg(GenRegister::uw1grf(ir::ocl::zero));
- const GenRegister one = ra->genReg(GenRegister::uw1grf(ir::ocl::one));
+ const GenRegister blockip = getBlockIP();
p->push();
p->curr.noMask = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
- setBlockIP(*this, blockip, getMaxLabel());
+ setBlockIP(blockip, getMaxLabel());
p->curr.noMask = 0;
- setBlockIP(*this, blockip, 0);
+ setBlockIP(blockip, 0);
p->curr.execWidth = 1;
- // FIXME, need to get the final use set of zero/one, if there is no user,
- // no need to generate the following two instructions.
- p->MOV(zero, GenRegister::immuw(0));
- p->MOV(one, GenRegister::immw(-1));
+ if (ra->isAllocated(ir::ocl::zero))
+ p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::zero)), GenRegister::immuw(0));
+ if (ra->isAllocated(ir::ocl::one))
+ p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::one)), GenRegister::immw(-1));
+ p->pop();
+ }
+
+ void GenContext::loadLaneID(GenRegister dst) {
+ const GenRegister laneID = GenRegister::immv(0x76543210);
+ GenRegister dst_;
+ if (dst.type == GEN_TYPE_UW)
+ dst_ = dst;
+ else if (dst.type == GEN_TYPE_UD)
+ dst_ = GenRegister::retype(dst, GEN_TYPE_UW);
+ p->push();
+ uint32_t execWidth = p->curr.execWidth;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ if (execWidth == 8)
+ p->MOV(dst_, laneID);
+ else {
+ p->curr.execWidth = 8;
+ p->MOV(dst_, laneID);
+ //Packed Unsigned Half-Byte Integer Vector does not work
+      //have to mock by adding 8 to the signed vector
+ const GenRegister eight = GenRegister::immuw(8);
+ p->ADD(GenRegister::offset(dst_, 0, 16), dst_, eight);
+ p->curr.execWidth = 16;
+ }
+ if (dst.type != GEN_TYPE_UW)
+ p->MOV(dst, dst_);
p->pop();
}
@@ -180,36 +214,40 @@ namespace gbe
using namespace ir;
// Only emit stack pointer computation if we use a stack
- if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+ if (kernel->getStackSize() == 0)
return;
// Check that everything is consistent in the kernel code
const uint32_t perLaneSize = kernel->getStackSize();
- const uint32_t perThreadSize = perLaneSize * this->simdWidth;
GBE_ASSERT(perLaneSize > 0);
const GenRegister selStatckPtr = this->simdWidth == 8 ?
GenRegister::ud8grf(ir::ocl::stackptr) :
GenRegister::ud16grf(ir::ocl::stackptr);
const GenRegister stackptr = ra->genReg(selStatckPtr);
+ // borrow block ip as temporary register as we will
+    // initialize block ip later.
+ const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+ const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+
+ loadLaneID(stackptr);
// We compute the per-lane stack pointer here
- // threadId * perThreadSize + laneId*perLaneSize
+ // threadId * perThreadSize + laneId*perLaneSize or
+ // (threadId * simdWidth + laneId)*perLaneSize
// let private address start from zero
+ //p->MOV(stackptr, GenRegister::immud(0));
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
- p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+ p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+ p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
p->curr.execWidth = this->simdWidth;
- p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize)); //perLaneSize < 64K
+ p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
p->curr.execWidth = 1;
- if(perThreadSize > 0xffff) {
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth)); //Only support W * D, perLaneSize < 64K
- } else
- p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
+ p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
p->curr.execWidth = this->simdWidth;
- p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+ p->MUL(stackptr, tmpReg_ud, stackptr); // (threadId * simdWidth + laneId)*perLaneSize
p->pop();
}
@@ -297,12 +335,6 @@ namespace gbe
GenRegister src = ra->genReg(insn.src(0));
GenRegister tmp = ra->genReg(insn.dst(1));
switch (insn.opcode) {
- case SEL_OP_LOAD_DF_IMM:
- p->LOAD_DF_IMM(dst, tmp, src.value.df);
- break;
- case SEL_OP_MOV_DF:
- p->MOV_DF(dst, src, tmp);
- break;
case SEL_OP_CONVI_TO_I64: {
GenRegister middle = src;
if(src.type == GEN_TYPE_B || src.type == GEN_TYPE_W) {
@@ -389,12 +421,14 @@ namespace gbe
GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
for (int i = 1; i < 4; i++) {
- ind_src.addr_imm += 8;
+ if (!uniform_src)
+ ind_src.addr_imm += 8;
p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
}
if (simd == 16) {
for (int i = 0; i < 4; i++) {
- ind_src.addr_imm += 8;
+ if (!uniform_src)
+ ind_src.addr_imm += 8;
p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src);
}
}
@@ -433,12 +467,123 @@ namespace gbe
GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
for (int i = 1; i < (simd == 8 ? 2 : 4); i++) {
- ind_src.addr_imm += 8;
+ if (!uniform_src)
+ ind_src.addr_imm += 8;
p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
}
p->pop();
p->MOV(dst, tmp);
+ }else if (src.type == GEN_TYPE_UL || src.type == GEN_TYPE_L) {
+ bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+ GBE_ASSERT(uniform_src || src.subnr == 0);
+ GBE_ASSERT(dst.subnr == 0);
+ GBE_ASSERT(tmp.subnr == 0);
+ GBE_ASSERT(start_addr >= 0);
+ if (!uniform_src) {
+ new_a0[0] = start_addr + 3;
+ new_a0[1] = start_addr + 2;
+ new_a0[2] = start_addr + 1;
+ new_a0[3] = start_addr;
+ new_a0[4] = start_addr + 7;
+ new_a0[5] = start_addr + 6;
+ new_a0[6] = start_addr + 5;
+ new_a0[7] = start_addr + 4;
+ } else {
+ new_a0[0] = start_addr + 7;
+ new_a0[1] = start_addr + 6;
+ new_a0[2] = start_addr + 5;
+ new_a0[3] = start_addr + 4;
+ new_a0[4] = start_addr + 3;
+ new_a0[5] = start_addr + 2;
+ new_a0[6] = start_addr + 1;
+ new_a0[7] = start_addr;
+ }
+ this->setA0Content(new_a0, 56);
+
+ if (!uniform_src) {
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
+ p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+ for (int i = 1; i < 4; i++) {
+ if (!uniform_src)
+ ind_src.addr_imm += 8;
+ p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
+ }
+ for (int i = 0; i < 4; i++) {
+ if (!uniform_src)
+ ind_src.addr_imm += 8;
+ p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src);
+ }
+ if (simd == 16) {
+ for (int i = 0; i < 4; i++) {
+ if (!uniform_src)
+ ind_src.addr_imm += 8;
+ p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 2, 8*i), ind_src);
+ }
+ for (int i = 0; i < 4; i++) {
+ if (!uniform_src)
+ ind_src.addr_imm += 8;
+ p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 3, 8*i), ind_src);
+ }
+ }
+ p->pop();
+
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ if (simd == 8) {
+ p->MOV(GenRegister::offset(GenRegister::retype(dst, GEN_TYPE_D), 1, 0),
+ GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_D), 0, 0));
+ p->MOV(GenRegister::offset(GenRegister::retype(dst, GEN_TYPE_D), 0, 0),
+ GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_D), 1, 0));
+ }else if(simd == 16) {
+ p->MOV(GenRegister::offset(GenRegister::retype(dst, GEN_TYPE_D), 2, 0),
+ GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_D), 0, 0));
+ p->MOV(GenRegister::offset(GenRegister::retype(dst, GEN_TYPE_D), 3, 0),
+ GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_D), 1, 0));
+ p->MOV(GenRegister::offset(GenRegister::retype(dst, GEN_TYPE_D), 0, 0),
+ GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_D), 2, 0));
+ p->MOV(GenRegister::offset(GenRegister::retype(dst, GEN_TYPE_D), 1, 0),
+ GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_D), 3, 0));
+ }
+ p->pop();
+ } else {
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
+ p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+ p->pop();
+
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ GenRegister x = GenRegister::ud1grf(tmp.nr, 0);
+ GenRegister y = GenRegister::ud1grf(tmp.nr, 1);
+ GenRegister dst_ = dst;
+ dst_.type = GEN_TYPE_UD;
+ dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
+ dst_.width = GEN_WIDTH_8;
+ dst_.vstride = GEN_VERTICAL_STRIDE_8;
+
+ if (simd == 8) {
+ p->MOV(GenRegister::offset(GenRegister::retype(dst_, GEN_TYPE_D), 0, 0), x);
+ p->MOV(GenRegister::offset(GenRegister::retype(dst_, GEN_TYPE_D), 1, 0), y);
+ }else if(simd == 16) {
+ p->MOV(GenRegister::offset(GenRegister::retype(dst_, GEN_TYPE_D), 0, 0), x);
+ p->MOV(GenRegister::offset(GenRegister::retype(dst_, GEN_TYPE_D), 1, 0), x);
+ p->MOV(GenRegister::offset(GenRegister::retype(dst_, GEN_TYPE_D), 2, 0), y);
+ p->MOV(GenRegister::offset(GenRegister::retype(dst_, GEN_TYPE_D), 3, 0), y);
+ }
+ p->pop();
+ }
} else {
GBE_ASSERT(0);
}
@@ -555,35 +700,29 @@ namespace gbe
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
assert(insn.opcode == SEL_OP_SIMD_SHUFFLE);
+ assert (src1.file != GEN_IMMEDIATE_VALUE);
+ uint32_t base = src0.nr * 32 + src0.subnr * 4;
+ GenRegister baseReg = GenRegister::immuw(base);
+ const GenRegister a0 = GenRegister::addr8(0);
uint32_t simd = p->curr.execWidth;
- if (src1.file == GEN_IMMEDIATE_VALUE) {
- uint32_t offset = src1.value.ud % simd;
- GenRegister reg = GenRegister::suboffset(src0, offset);
- p->MOV(dst, GenRegister::retype(GenRegister::ud1grf(reg.nr, reg.subnr / typeSize(reg.type)), reg.type));
- } else {
- uint32_t base = src0.nr * 32 + src0.subnr * 4;
- GenRegister baseReg = GenRegister::immuw(base);
- const GenRegister a0 = GenRegister::addr8(0);
+ p->push();
+ if (simd == 8) {
+ p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+ GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+ p->MOV(dst, indirect);
+ } else if (simd == 16) {
+ p->curr.execWidth = 8;
+ p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+ GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+ p->MOV(dst, indirect);
- p->push();
- if (simd == 8) {
- p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
- GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
- p->MOV(dst, indirect);
- } else if (simd == 16) {
- p->curr.execWidth = 8;
- p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
- GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
- p->MOV(dst, indirect);
-
- p->curr.quarterControl = 1;
- p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
- p->MOV(GenRegister::offset(dst, 1, 0), indirect);
- } else
- NOT_IMPLEMENTED;
- p->pop();
- }
+ p->curr.quarterControl = 1;
+ p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+ p->MOV(GenRegister::offset(dst, 1, 0), indirect);
+ } else
+ NOT_IMPLEMENTED;
+ p->pop();
}
void GenContext::emitBinaryInstruction(const SelectionInstruction &insn) {
@@ -1649,6 +1788,10 @@ namespace gbe
}
}
+ void GenContext::emitF64DIVInstruction(const SelectionInstruction &insn) {
+ GBE_ASSERT(0); // No support for double on Gen7
+ }
+
void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src0 = ra->genReg(insn.src(0));
@@ -1656,6 +1799,7 @@ namespace gbe
const GenRegister src2 = ra->genReg(insn.src(2));
switch (insn.opcode) {
case SEL_OP_MAD: p->MAD(dst, src0, src1, src2); break;
+ case SEL_OP_LRP: p->LRP(dst, src0, src1, src2); break;
default: NOT_IMPLEMENTED;
}
}
@@ -1665,7 +1809,7 @@ namespace gbe
}
void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
- p->WAIT();
+ p->WAIT(insn.extra.waitType);
}
void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) {
@@ -1690,6 +1834,7 @@ namespace gbe
p->BARRIER(src);
p->curr.execWidth = 1;
// Now we wait for the other threads
+ p->curr.predicate = GEN_PREDICATE_NONE;
p->WAIT();
p->pop();
}
@@ -1736,16 +1881,17 @@ namespace gbe
p->ATOMIC(dst, function, src, bti, srcNum);
} else {
GenRegister flagTemp = ra->genReg(insn.dst(1));
+ GenRegister btiTmp = ra->genReg(insn.dst(2));
unsigned desc = p->generateAtomicMessageDesc(function, 0, srcNum);
- unsigned jip0 = beforeMessage(insn, bti, flagTemp, desc);
+ unsigned jip0 = beforeMessage(insn, bti, flagTemp, btiTmp, desc);
p->push();
p->curr.predicate = GEN_PREDICATE_NORMAL;
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->ATOMIC(dst, function, src, GenRegister::addr1(0), srcNum);
p->pop();
- afterMessage(insn, bti, flagTemp, jip0);
+ afterMessage(insn, bti, flagTemp, btiTmp, jip0);
}
}
@@ -1763,7 +1909,7 @@ namespace gbe
if(sel->isScalarReg(offset.reg()))
offset = GenRegister::retype(offset, GEN_TYPE_UW);
else
- offset = GenRegister::unpacked_uw(offset.nr, offset.subnr / typeSize(GEN_TYPE_UW));
+ offset = GenRegister::unpacked_uw(offset);
uint32_t baseRegOffset = GenRegister::grfOffset(baseReg);
//There is a restrict that: lower 5 bits indirect reg SubRegNum and
//the lower 5 bits of indirect imm SubRegNum cannot exceed 5 bits.
@@ -1887,9 +2033,10 @@ namespace gbe
p->UNTYPED_READ(dst, src, bti, elemNum);
} else {
const GenRegister tmp = ra->genReg(insn.dst(elemNum));
+ const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
unsigned desc = p->generateUntypedReadMessageDesc(0, elemNum);
- unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+ unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
//predicated load
p->push();
@@ -1897,17 +2044,17 @@ namespace gbe
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum);
p->pop();
- afterMessage(insn, bti, tmp, jip0);
+ afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
- unsigned GenContext::beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned desc) {
+ unsigned GenContext::beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, GenRegister btiTmp, unsigned desc) {
const GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
setFlag(flagReg, GenRegister::immuw(0));
p->CMP(GEN_CONDITIONAL_NZ, flagReg, GenRegister::immuw(1));
- GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil));
- GenRegister btiUW = ra->genReg(GenRegister::uw1grf(ir::ocl::btiUtil));
- GenRegister btiUB = ra->genReg(GenRegister::ub1grf(ir::ocl::btiUtil));
+ GenRegister btiUD = GenRegister::retype(btiTmp, GEN_TYPE_UD);
+ GenRegister btiUW = GenRegister::retype(btiTmp, GEN_TYPE_UW);
+ GenRegister btiUB = GenRegister::retype(btiTmp, GEN_TYPE_UB);
unsigned jip0 = p->n_instruction();
p->push();
p->curr.execWidth = 1;
@@ -1930,8 +2077,8 @@ namespace gbe
p->pop();
return jip0;
}
- void GenContext::afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned jip0) {
- const GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil));
+ void GenContext::afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, GenRegister btiTmp, unsigned jip0) {
+ const GenRegister btiUD = GenRegister::retype(btiTmp, GEN_TYPE_UD);
//restore flag
setFlag(GenRegister::flag(insn.state.flag, insn.state.subFlag), tmp);
// get active channel
@@ -1955,9 +2102,10 @@ namespace gbe
p->UNTYPED_READ(dst, src, bti, elemNum);
} else {
const GenRegister tmp = ra->genReg(insn.dst(elemNum));
+ const GenRegister btiTmp = ra->genReg(insn.dst(elemNum + 1));
unsigned desc = p->generateUntypedReadMessageDesc(0, elemNum);
- unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+ unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
//predicated load
p->push();
@@ -1965,7 +2113,7 @@ namespace gbe
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum);
p->pop();
- afterMessage(insn, bti, tmp, jip0);
+ afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
@@ -1978,9 +2126,10 @@ namespace gbe
p->UNTYPED_WRITE(src, bti, elemNum*2);
} else {
const GenRegister tmp = ra->genReg(insn.dst(0));
+ const GenRegister btiTmp = ra->genReg(insn.dst(1));
unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
- unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+ unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
//predicated load
p->push();
@@ -1988,7 +2137,7 @@ namespace gbe
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum*2);
p->pop();
- afterMessage(insn, bti, tmp, jip0);
+ afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
@@ -2000,9 +2149,10 @@ namespace gbe
p->UNTYPED_WRITE(src, bti, elemNum);
} else {
const GenRegister tmp = ra->genReg(insn.dst(0));
+ const GenRegister btiTmp = ra->genReg(insn.dst(1));
unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum);
- unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+ unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
//predicated load
p->push();
@@ -2010,7 +2160,7 @@ namespace gbe
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum);
p->pop();
- afterMessage(insn, bti, tmp, jip0);
+ afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
@@ -2024,9 +2174,10 @@ namespace gbe
p->BYTE_GATHER(dst, src, bti, elemSize);
} else {
const GenRegister tmp = ra->genReg(insn.dst(1));
+ const GenRegister btiTmp = ra->genReg(insn.dst(2));
unsigned desc = p->generateByteGatherMessageDesc(0, elemSize);
- unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+ unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
//predicated load
p->push();
@@ -2034,7 +2185,7 @@ namespace gbe
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->BYTE_GATHER(dst, src, GenRegister::addr1(0), elemSize);
p->pop();
- afterMessage(insn, bti, tmp, jip0);
+ afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
@@ -2047,9 +2198,10 @@ namespace gbe
p->BYTE_SCATTER(src, bti, elemSize);
} else {
const GenRegister tmp = ra->genReg(insn.dst(0));
+ const GenRegister btiTmp = ra->genReg(insn.dst(1));
unsigned desc = p->generateByteScatterMessageDesc(0, elemSize);
- unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+ unsigned jip0 = beforeMessage(insn, bti, tmp, btiTmp, desc);
//predicated load
p->push();
@@ -2057,7 +2209,7 @@ namespace gbe
p->curr.useFlag(insn.state.flag, insn.state.subFlag);
p->BYTE_SCATTER(src, GenRegister::addr1(0), elemSize);
p->pop();
- afterMessage(insn, bti, tmp, jip0);
+ afterMessage(insn, bti, tmp, btiTmp, jip0);
}
}
@@ -2116,6 +2268,104 @@ namespace gbe
p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0, insn.extra.isLD, insn.extra.isUniform);
}
+ void GenContext::emitVmeInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const unsigned int msg_type = insn.extra.msg_type;
+
+ GBE_ASSERT(msg_type == 1);
+ int rsp_len;
+ if(msg_type == 1)
+ rsp_len = 6;
+ uint32_t execWidth_org = p->curr.execWidth;
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ /* Use MOV to Setup bits of payload: mov payload value stored in insn.src(x) to
+ * 5 consecutive payload grf.
+ * In simd8 mode, one virtual grf register map to one physical grf register. But
+ * in simd16 mode, one virtual grf register map to two physical grf registers.
+ * So we should treat them differently.
+ * */
+ if(execWidth_org == 8){
+ for(int i=0; i < 5; i++){
+ GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+ payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_grf.width = GEN_WIDTH_1;
+ payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+ payload_grf.subphysical = 1;
+ for(int j=0; j < 8; j++){
+ payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+ GenRegister payload_val = ra->genReg(insn.src(i*8+j));
+ payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_val.width = GEN_WIDTH_1;
+ payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+ p->MOV(payload_grf, payload_val);
+ }
+ }
+ }
+ else if(execWidth_org == 16){
+ for(int i=0; i < 2; i++){
+ for(int k = 0; k < 2; k++){
+ GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+ payload_grf.nr += k;
+ payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_grf.width = GEN_WIDTH_1;
+ payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+ payload_grf.subphysical = 1;
+ for(int j=0; j < 8; j++){
+ payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+ GenRegister payload_val = ra->genReg(insn.src(i*16+k*8+j));
+ payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_val.width = GEN_WIDTH_1;
+ payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+ p->MOV(payload_grf, payload_val);
+ }
+ }
+ }
+ {
+ int i = 2;
+ GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+ payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_grf.width = GEN_WIDTH_1;
+ payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+ payload_grf.subphysical = 1;
+ for(int j=0; j < 8; j++){
+ payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+ GenRegister payload_val = ra->genReg(insn.src(i*16+j));
+ payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_val.width = GEN_WIDTH_1;
+ payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+ p->MOV(payload_grf, payload_val);
+ }
+ }
+ }
+ p->pop();
+
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(rsp_len)), GEN_TYPE_UB);
+ payload_did.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_did.width = GEN_WIDTH_1;
+ payload_did.hstride = GEN_HORIZONTAL_STRIDE_0;
+ payload_did.subphysical = 1;
+ payload_did.subnr = 20 * typeSize(GEN_TYPE_UB);
+ GenRegister grf0 = GenRegister::ub1grf(0, 20);
+ p->MOV(payload_did, grf0);
+ p->pop();
+
+ const GenRegister msgPayload = ra->genReg(insn.dst(rsp_len));
+ const unsigned char bti = insn.getbti();
+ const unsigned int vme_search_path_lut = insn.extra.vme_search_path_lut;
+ const unsigned int lut_sub = insn.extra.lut_sub;
+ p->VME(bti, dst, msgPayload, msg_type, vme_search_path_lut, lut_sub);
+ }
+
void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
p->push();
uint32_t simdWidth = p->curr.execWidth;
@@ -2153,6 +2403,1073 @@ namespace gbe
p->TYPED_WRITE(header, true, bti);
}
+ /* Emit code that computes the range of global work-item IDs covered by the
+    lanes of this hardware thread for one dimension (dim: 0=X, 1=Y, 2=Z).
+    The two 32-bit results are written into 'reg' at byte offsets 8+dim*8
+    (start) and 12+dim*8 (end), matching the gX/gY/gZ start/end dword slots of
+    profilingReg4 (see the layout comment further down this file).
+    'tmp' is a uniform scratch dword and 'flag'/'subFlag' select a scratch
+    flag register; the caller sets execWidth 1 / noMask / no predicate. */
+ static void calcGID(GenRegister& reg, GenRegister& tmp, int flag, int subFlag, int dim, GenContext *gc)
+ {
+ GenRegister flagReg = GenRegister::flag(flag, subFlag);
+ GenRegister gstart = GenRegister::offset(reg, 0, 8 + dim*8);
+ GenRegister gend = GenRegister::offset(gstart, 0, 4);
+ GenRegister lid, localsz, gid, goffset;
+ // Pick the payload registers for this dimension: local id (per-lane),
+ // local size, group id and global offset (all read as uniforms).
+ if (dim == 0) {
+ lid = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud16grf(ir::ocl::lid0)), GEN_TYPE_UD);
+ localsz = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud1grf(ir::ocl::lsize0)), GEN_TYPE_UD);
+ gid = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud1grf(ir::ocl::groupid0)), GEN_TYPE_UD);
+ goffset = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud1grf(ir::ocl::goffset0)), GEN_TYPE_UD);
+ } else if (dim == 1) {
+ lid = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud16grf(ir::ocl::lid1)), GEN_TYPE_UD);
+ localsz = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud1grf(ir::ocl::lsize1)), GEN_TYPE_UD);
+ gid = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud1grf(ir::ocl::groupid1)), GEN_TYPE_UD);
+ goffset = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud1grf(ir::ocl::goffset1)), GEN_TYPE_UD);
+ } else {
+ lid = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud16grf(ir::ocl::lid2)), GEN_TYPE_UD);
+ localsz = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud1grf(ir::ocl::lsize2)), GEN_TYPE_UD);
+ gid = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud1grf(ir::ocl::groupid2)), GEN_TYPE_UD);
+ goffset = GenRegister::toUniform(gc->ra->genReg(GenRegister::ud1grf(ir::ocl::goffset2)), GEN_TYPE_UD);
+ }
+
+ // gstart = group_id * local_size + local_id(lane 0) + global_offset.
+ gc->p->MUL(gstart, localsz, gid);
+ gc->p->ADD(gstart, gstart, lid);
+ gc->p->ADD(gstart, gstart, goffset);
+
+ GenRegister ip;
+ // Build a lane mask of work-items whose block IP equals the all-ones
+ // end-of-kernel marker -- presumably lanes that are past the last block;
+ // TODO(review) confirm the exact meaning of the 0xffff/0xffffffff marker.
+ gc->p->MOV(flagReg, GenRegister::immuw(0x0));
+ gc->p->curr.useFlag(flag, subFlag);
+ gc->p->curr.predicate = GEN_PREDICATE_NONE;
+ if (gc->getSimdWidth() == 16)
+ gc->p->curr.execWidth = 16;
+ else
+ gc->p->curr.execWidth = 8;
+
+ if (!gc->isDWLabel()) {
+ ip = gc->ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
+ gc->p->CMP(GEN_CONDITIONAL_EQ, ip, GenRegister::immuw(0xffff));
+ } else {
+ ip = gc->ra->genReg(GenRegister::ud16grf(ir::ocl::dwblockip));
+ gc->p->CMP(GEN_CONDITIONAL_EQ, ip, GenRegister::immud(0xffffffff));
+ }
+ gc->p->curr.execWidth = 1;
+ // Copy the flag bits into tmp, force all bits above the SIMD width to 1 so
+ // FBL (find first bit set, LSB first) has a sentinel, then step back one
+ // lane: tmp ends up holding the index of the last lane before the first
+ // marked one.
+ gc->p->MOV(GenRegister::retype(tmp, GEN_TYPE_UW), flagReg);
+
+ if (gc->getSimdWidth() == 16)
+ gc->p->OR(tmp, tmp, GenRegister::immud(0xffff0000));
+ else
+ gc->p->OR(tmp, tmp, GenRegister::immud(0xffffff00));
+
+ gc->p->FBL(tmp, tmp);
+ gc->p->ADD(tmp, tmp, GenRegister::negate(GenRegister::immud(0x1)));
+ // Turn the lane index into a byte address inside the lid GRF (4 bytes per
+ // dword element, lid.nr*32 bytes to the start of the register), load it
+ // into address register a0.1 and fetch that lane's local id indirectly.
+ gc->p->MUL(tmp, tmp, GenRegister::immud(4));
+ gc->p->ADD(tmp, tmp, GenRegister::immud(lid.nr*32));
+ gc->p->MOV(GenRegister::addr1(0), GenRegister::retype(tmp, GEN_TYPE_UW));
+ GenRegister dimEnd = GenRegister::to_indirect1xN(lid, 0, 0);
+ gc->p->MOV(tmp, dimEnd);
+ // gend = group_id * local_size + local_id(last lane) + global_offset.
+ gc->p->MUL(gend, localsz, gid);
+ gc->p->ADD(gend, gend, tmp);
+ gc->p->ADD(gend, gend, goffset);
+ }
+
+ /* Compute the global work-item ID range for all three dimensions (X, Y, Z)
+    into 'reg', using 'tmp' as a uniform scratch dword and flag/subFlag as a
+    scratch flag register.  Runs with scalar, unmasked, unpredicated state. */
+ void GenContext::calcGlobalXYZRange(GenRegister& reg, GenRegister& tmp, int flag, int subFlag)
+ {
+ p->push(); {
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ // One pass per dimension; calcGID writes the start/end slots for 'dim'.
+ for (int dim = 0; dim < 3; ++dim)
+ calcGID(reg, tmp, flag, subFlag, dim, this);
+ } p->pop();
+ }
+
+ /* Kernel profiling prolog: snapshot the start timestamp (into both the
+    'prolog' and 'lasttimestamp' slots) and the global X/Y/Z ID ranges at the
+    very beginning of the kernel.  See the profilingReg layout comment below
+    for where each slot lives. */
+ void GenContext::profilingProlog(void) {
+ // record the prolog, globalXYZ and lasttimestamp at the very beginning.
+ GenRegister profilingReg2, profilingReg3, profilingReg4;
+ GenRegister tmArf = GenRegister::tm0();
+ // In SIMD16 each allocated payload register spans two GRFs, so reg3 is the
+ // second half of the profilingts1 allocation.
+ if (this->simdWidth == 16) {
+ profilingReg2 = ra->genReg(GenRegister::ud16grf(ir::ocl::profilingts1));
+ profilingReg3 = GenRegister::offset(profilingReg2, 1);
+ profilingReg4 = ra->genReg(GenRegister::ud16grf(ir::ocl::profilingts2));
+ } else {
+ GBE_ASSERT(this->simdWidth == 8);
+ profilingReg2 = ra->genReg(GenRegister::ud8grf(ir::ocl::profilingts2));
+ profilingReg3 = ra->genReg(GenRegister::ud8grf(ir::ocl::profilingts3));
+ profilingReg4 = ra->genReg(GenRegister::ud8grf(ir::ocl::profilingts4));
+ }
+
+ /* MOV(4) prolog<1>:UW arf_tm<4,4,1>:UW */
+ /* MOV(4) lastTsReg<1>:UW prolog<4,4,1>:UW */
+ // 'prolog' is a 4-word (64-bit) view at dword offset 4 of profilingReg2,
+ // i.e. the prolog slot in the layout table below.
+ GenRegister prolog = profilingReg2;
+ prolog.type = GEN_TYPE_UW;
+ prolog.hstride = GEN_HORIZONTAL_STRIDE_1;
+ prolog.vstride = GEN_VERTICAL_STRIDE_4;
+ prolog.width = GEN_WIDTH_4;
+ prolog = GenRegister::offset(prolog, 0, 4*sizeof(uint32_t));
+
+ // 'lasttimestamp' slot: qword 2 of profilingReg3, viewed as 4 words so the
+ // 64-bit ARF timestamp can be copied with a single MOV(4).
+ GenRegister lastTsReg = GenRegister::toUniform(profilingReg3, GEN_TYPE_UL);
+ lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t));
+ lastTsReg.type = GEN_TYPE_UW;
+ lastTsReg.hstride = GEN_HORIZONTAL_STRIDE_1;
+ lastTsReg.vstride = GEN_VERTICAL_STRIDE_4;
+ lastTsReg.width = GEN_WIDTH_4;
+
+ // NOTE(review): gids and tmp deliberately alias the same GRF; calcGID only
+ // writes the gX/gY/gZ slots at dword offsets 2..7 while tmp is the unused
+ // dword 0 -- confirm against the profilingReg4 layout if this is changed.
+ GenRegister gids = GenRegister::toUniform(profilingReg4, GEN_TYPE_UD);
+ GenRegister tmp = GenRegister::toUniform(profilingReg4, GEN_TYPE_UD);
+
+ // X Y and Z
+ this->calcGlobalXYZRange(gids, tmp, 0, 1);
+
+ // Read the 64-bit timestamp ARF (tm0) into both the prolog slot and the
+ // lasttimestamp slot, 4 words at a time.
+ p->push(); {
+ p->curr.execWidth = 4;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(prolog, tmArf);
+ p->MOV(lastTsReg, tmArf);
+ } p->pop();
+
+ // Padding NOPs; presumably to separate the timestamp reads from following
+ // instructions -- TODO(review) confirm why two NOPs are required here.
+ p->NOP();
+ p->NOP();
+ return;
+ }
+
+ /* 64-bit t0 -= t1 composed from 32-bit operations: SUBB subtracts the low
+    dwords leaving the borrow in the accumulator, then the high dword of t0 is
+    decremented by that borrow and by t1's high dword.  't0'/'t1' are uniform
+    64-bit timestamps; 'tmp' is a scratch GRF used to spill the accumulator. */
+ void GenContext::subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp)
+ {
+ p->push(); {
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ // t0.lo = t0.lo - t1.lo; the borrow ends up in acc.
+ p->SUBB(GenRegister::retype(t0, GEN_TYPE_UD),
+ GenRegister::retype(t0, GEN_TYPE_UD), GenRegister::retype(t1, GEN_TYPE_UD));
+ /* FIXME We can not get the acc register's value correctly by set simd = 1. */
+ p->curr.execWidth = 8;
+ p->MOV(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD));
+ p->curr.execWidth = 1;
+ // t0.hi -= borrow (element 0 of the accumulator copy).
+ p->ADD(GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::negate(GenRegister::toUniform(tmp, GEN_TYPE_UD)));
+ // t0.hi -= t1.hi.
+ p->ADD(GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::negate(GenRegister::retype(GenRegister::offset(t1, 0, sizeof(uint32_t)), GEN_TYPE_UD)));
+ } p->pop();
+ }
+
+ /* 64-bit t0 += t1 composed from 32-bit operations: ADDC adds the low dwords
+    leaving the carry in the accumulator, then the high dword of t0 gets the
+    carry and t1's high dword.  Mirrors subTimestamps above. */
+ void GenContext::addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp)
+ {
+ p->push(); {
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ // t0.lo = t0.lo + t1.lo; the carry ends up in acc.
+ p->ADDC(GenRegister::retype(t0, GEN_TYPE_UD),
+ GenRegister::retype(t0, GEN_TYPE_UD), GenRegister::retype(t1, GEN_TYPE_UD));
+ p->curr.execWidth = 8;
+ p->MOV(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD));
+ p->curr.execWidth = 1;
+ // t0.hi += carry.
+ // NOTE(review): the carry is read from element 6 of the accumulator copy
+ // here, while subTimestamps reads element 0 -- confirm this offset is
+ // intentional and not a leftover from a different register layout.
+ p->ADD(GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::offset(GenRegister::toUniform(tmp, GEN_TYPE_UD), 0, 6*sizeof(uint32_t)));
+ // t0.hi += t1.hi.
+ p->ADD(GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::retype(GenRegister::offset(t0, 0, sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::retype(GenRegister::offset(t1, 0, sizeof(uint32_t)), GEN_TYPE_UD));
+ } p->pop();
+ }
+
+ /* We will record at most 20 timestamps, each one is 32bits (one dword slot
+    in the table below). We also will record the
+ prolog and epilog timestamps in 64 bits. So the format of the curbe timestamp reg is:
+ ---------------------------------------------------------
+ | ts0 | ts1 | ts2 | ts3 | ts4 | ts5 | ts6 | ts7 | profilingReg0
+ | ts8 | ts9 | ts10 | ts11 | ts12 | ts13 | ts14 | ts15 | profilingReg1
+ | ts16 | ts17 | ts18 | ts19 | prolog | epilog | profilingReg2
+ ---------------------------------------------------------
+ | tmp0 | tmp1 |lasttimestamp| real clock | profilingReg3
+ ---------------------------------------------------------
+ | | gX s | gX e | gY s | gY e | gZ s | gZ e | profilingReg4
+ ---------------------------------------------------------
+ */
+ /* Emit code that records profiling point 'pointNum': read the timestamp ARF,
+    compute the time elapsed since the last recorded point, accumulate it into
+    the 64-bit real clock, and store the low 32 bits into the point's slot the
+    first time the point is reached. */
+ void GenContext::emitCalcTimestampInstruction(const SelectionInstruction &insn)
+ {
+ uint32_t pointNum = insn.extra.pointNum;
+ uint32_t tsType = insn.extra.timestampType;
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+
+ GBE_ASSERT(tsType == 1);
+ GenRegister tmArf = GenRegister::tm0();
+ GenRegister profilingReg[5];
+ GenRegister tmp;
+ // Map the instruction's sources onto the five logical profiling GRFs; in
+ // SIMD16 each source register spans two GRFs.  'tmp' is a dedicated scratch
+ // dst when selection provided one, otherwise a slot of profilingReg[4] is
+ // reused.
+ if (p->curr.execWidth == 16) {
+ profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ profilingReg[1] = GenRegister::offset(profilingReg[0], 1);
+ profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD);
+ profilingReg[3] = GenRegister::offset(profilingReg[2], 1);
+ profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(2)), GEN_TYPE_UD);
+ if (insn.dstNum == 4) {
+ tmp = GenRegister::retype(ra->genReg(insn.dst(3)), GEN_TYPE_UD);
+ } else {
+ GBE_ASSERT(insn.dstNum == 3);
+ tmp = GenRegister::toUniform(profilingReg[4], GEN_TYPE_UL);
+ }
+ } else {
+ GBE_ASSERT(p->curr.execWidth == 8);
+ profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ profilingReg[1] = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD);
+ profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(2)), GEN_TYPE_UD);
+ profilingReg[3] = GenRegister::retype(ra->genReg(insn.src(3)), GEN_TYPE_UD);
+ profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
+ if (insn.dstNum == 6) {
+ tmp = GenRegister::retype(ra->genReg(insn.dst(5)), GEN_TYPE_UD);
+ } else {
+ GBE_ASSERT(insn.dstNum == 5);
+ tmp = GenRegister::toUniform(profilingReg[4], GEN_TYPE_UL);
+ }
+ }
+ // tmp0, lasttimestamp and realclock all live in profilingReg3 (see table).
+ GenRegister tmp0 = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL);
+ GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL);
+ lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t));
+ GenRegister realClock = GenRegister::offset(lastTsReg, 0, sizeof(uint64_t));
+
+ /* MOV(4) tmp0<1>:UW arf_tm<4,4,1>:UW */
+ // Snapshot the 64-bit timestamp ARF into tmp0, 4 words at a time.
+ p->push(); {
+ p->curr.execWidth = 4;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ GenRegister _tmp0 = tmp0;
+ _tmp0.type = GEN_TYPE_UW;
+ _tmp0.hstride = GEN_HORIZONTAL_STRIDE_1;
+ _tmp0.vstride = GEN_VERTICAL_STRIDE_4;
+ _tmp0.width = GEN_WIDTH_4;
+ p->MOV(_tmp0, tmArf);
+ } p->pop();
+
+ /* Calc the time elapsed. */
+ // SUB(1) tmp0<1>:UL tmp0<1>:UL lastTS<0,1,0>
+ subTimestamps(tmp0, lastTsReg, tmp);
+
+ /* Update the real clock
+ ADD(1) realclock<1>:UL realclock<1>:UL tmp0<1>:UL */
+ addTimestamps(realClock, tmp0, tmp);
+
+ /* We just record timestamp of the first time this point is reached. If this point is
+ in a loop, it can be reached many times. We will not record the later timestamps. The 32bits
+ timestamp can represent about 3.2s, and each kernel's execution time should never exceed
+ 3s. So we just record the low 32 bits.
+ CMP.EQ(1)flag0.1 NULL tsReg_n<1>:UD 0x0
+ (+flag0.1) MOV(1) tsReg_n<1>:UD realclock<1>:UD Just record the low 32bits
+ */
+ // Point n lives in dword (n%8) of profilingReg[n/8].
+ GenRegister tsReg = GenRegister::toUniform(profilingReg[pointNum/8], GEN_TYPE_UD);
+ tsReg = GenRegister::offset(tsReg, 0, (pointNum%8)*sizeof(uint32_t));
+
+ // Only write the slot if it is still zero, i.e. first visit.
+ p->push(); {
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_EQ, tsReg, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.inversePredicate = 0;
+ p->MOV(tsReg, GenRegister::retype(GenRegister::retype(realClock, GEN_TYPE_UD), GEN_TYPE_UD));
+ } p->pop();
+
+ /* Store the timestamp for next point use.
+ MOV(4) lastTS<1>:UW arf_tm<4,4,1>:UW */
+ p->push(); {
+ p->curr.execWidth = 4;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ GenRegister _lastTsReg = lastTsReg;
+ _lastTsReg.type = GEN_TYPE_UW;
+ _lastTsReg.hstride = GEN_HORIZONTAL_STRIDE_1;
+ _lastTsReg.vstride = GEN_VERTICAL_STRIDE_4;
+ _lastTsReg.width = GEN_WIDTH_4;
+ p->MOV(_lastTsReg, tmArf);
+ } p->pop();
+ }
+
+ /* Emit the profiling epilog: take a final timestamp, close the real clock,
+    then atomically claim a report slot in the profiling buffer (bti from
+    insn.extra.profilingBTI) and write out the header plus the five profiling
+    GRFs as one ProfilingReportItem. */
+ void GenContext::emitStoreProfilingInstruction(const SelectionInstruction &insn) {
+ uint32_t simdType;
+ if (this->simdWidth == 16) {
+ simdType = ir::ProfilingInfo::ProfilingSimdType16;
+ } else if (this->simdWidth == 8) {
+ simdType = ir::ProfilingInfo::ProfilingSimdType8;
+ } else {
+ simdType = ir::ProfilingInfo::ProfilingSimdType1;
+ GBE_ASSERT(0);
+ }
+
+ // Padding NOPs, mirroring profilingProlog -- TODO(review) confirm purpose.
+ p->NOP();
+ p->NOP();
+
+ GenRegister tmArf = GenRegister::tm0();
+ GenRegister profilingReg[5];
+ // Same source-to-logical-GRF mapping as emitCalcTimestampInstruction.
+ if (p->curr.execWidth == 16) {
+ profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ profilingReg[1] = GenRegister::offset(profilingReg[0], 1);
+ profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD);
+ profilingReg[3] = GenRegister::offset(profilingReg[2], 1);
+ profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(2)), GEN_TYPE_UD);
+ } else {
+ GBE_ASSERT(p->curr.execWidth == 8);
+ profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ profilingReg[1] = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD);
+ profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(2)), GEN_TYPE_UD);
+ profilingReg[3] = GenRegister::retype(ra->genReg(insn.src(3)), GEN_TYPE_UD);
+ profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
+ }
+ GenRegister tmp = ra->genReg(insn.dst(0));
+ uint32_t profilingType = insn.extra.profilingType;
+ uint32_t bti = insn.extra.profilingBTI;
+ GBE_ASSERT(profilingType == 1);
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ // lasttimestamp / realclock / tmp0 slots in profilingReg3 (see layout).
+ GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL);
+ lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t));
+ GenRegister realClock = GenRegister::offset(lastTsReg, 0, sizeof(uint64_t));
+ GenRegister tmp0 = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL);
+
+ /* MOV(4) tmp0<1>:UW arf_tm<4,4,1>:UW */
+ // Snapshot the 64-bit timestamp ARF into tmp0.
+ p->push(); {
+ p->curr.execWidth = 4;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ GenRegister _tmp0 = tmp0;
+ _tmp0.type = GEN_TYPE_UW;
+ _tmp0.hstride = GEN_HORIZONTAL_STRIDE_1;
+ _tmp0.vstride = GEN_VERTICAL_STRIDE_4;
+ _tmp0.width = GEN_WIDTH_4;
+ p->MOV(_tmp0, tmArf);
+ } p->pop();
+
+ /* Calc the time elapsed. */
+ subTimestamps(tmp0, lastTsReg, tmp);
+ /* Update the real clock */
+ addTimestamps(realClock, tmp0, tmp);
+
+ //the epilog, record the last timestamp and return.
+ /* MOV(1) epilog<1>:UL realclock<0,1,0>:UL */
+ /* ADD(1) epilog<1>:UL prolog<0,1,0>:UL */
+ // epilog slot = realclock + prolog (copied dword-wise, then 64-bit add).
+ GenRegister prolog = GenRegister::toUniform(profilingReg[2], GEN_TYPE_UD);
+ prolog = GenRegister::offset(prolog, 0, 4*sizeof(uint32_t));
+ GenRegister epilog = GenRegister::offset(prolog, 0, 2*sizeof(uint32_t));
+ p->push(); {
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(epilog, GenRegister::retype(realClock, GEN_TYPE_UD));
+ p->MOV(GenRegister::offset(epilog, 0, sizeof(uint32_t)),
+ GenRegister::offset(GenRegister::retype(realClock, GEN_TYPE_UD), 0, sizeof(uint32_t)));
+ addTimestamps(epilog, prolog, tmp);
+ } p->pop();
+
+ /* Now, begin to write the results out. */
+ // Inc the log items number.
+ p->push(); {
+ //ptr[0] is the total count of the log items.
+ // Atomic-increment the item counter at offset 0 of the profiling buffer;
+ // flag is set to 0x01 so only one lane performs the atomic.
+ GenRegister sndMsg = GenRegister::retype(tmp, GEN_TYPE_UD);
+ sndMsg.width = GEN_WIDTH_8;
+ sndMsg.hstride = GEN_HORIZONTAL_STRIDE_1;
+ sndMsg.vstride = GEN_VERTICAL_STRIDE_8;
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(sndMsg, GenRegister::immud(0x0));
+
+ GenRegister incRes = GenRegister::offset(sndMsg, 1);
+ p->push();
+ {
+ p->curr.execWidth = 1;
+ p->MOV(flagReg, GenRegister::immuw(0x01));
+ }
+ p->pop();
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ // incRes receives the pre-increment counter value, i.e. our slot index.
+ p->ATOMIC(incRes, GEN_ATOMIC_OP_INC, sndMsg, GenRegister::immud(bti), 1);
+ } p->pop();
+
+ // Calculate the final addr
+ // addr = slot_index * sizeof(ProfilingReportItem) + 4 (skip the counter),
+ // then fan out to 8 consecutive dword addresses for the untyped writes.
+ GenRegister addr = GenRegister::retype(tmp, GEN_TYPE_UD);
+ addr.width = GEN_WIDTH_8;
+ addr.hstride = GEN_HORIZONTAL_STRIDE_1;
+ addr.vstride = GEN_VERTICAL_STRIDE_8;
+ p->push(); {
+ GenRegister offset = GenRegister::offset(addr, 1);
+
+ p->curr.execWidth = 8;
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MUL(addr, GenRegister::toUniform(offset, GEN_TYPE_UD),
+ GenRegister::immud(sizeof(ir::ProfilingInfo::ProfilingReportItem)));
+ p->ADD(addr, addr, GenRegister::immud(4)); // for the counter
+ p->curr.execWidth = 1;
+ for (int i = 1; i < 8; i++) {
+ p->ADD(GenRegister::toUniform(GenRegister::offset(addr, 0, i*sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::toUniform(GenRegister::offset(addr, 0, i*sizeof(uint32_t)), GEN_TYPE_UD),
+ GenRegister::immud(i*sizeof(uint32_t)));
+ }
+ } p->pop();
+
+ // Stage the global XYZ ranges (profilingReg[4]) as the first data payload.
+ GenRegister data = GenRegister::offset(addr, 1);
+ p->push(); {
+ p->curr.execWidth = 8;
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(data, profilingReg[4]);
+ } p->pop();
+
+ // Write the result out
+ p->push(); {
+ // Build the report header from sr0: FFID and SIMD type in the first
+ // dword, EU/thread info plus the dispatch mask in the second.
+ GenRegister ffid = GenRegister::toUniform(data, GEN_TYPE_UD);
+ GenRegister tmp = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UD);
+ GenRegister stateReg = GenRegister::sr(0, 0);
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->MOV(ffid, stateReg);
+ p->SHR(ffid, ffid, GenRegister::immud(24));
+ p->AND(ffid, ffid, GenRegister::immud(0x0ff));
+ p->OR(ffid, ffid, GenRegister::immud(simdType << 4));
+
+ GenRegister genInfo = GenRegister::offset(ffid, 0, 4);
+ p->MOV(genInfo, stateReg);
+ p->AND(genInfo, genInfo, GenRegister::immud(0x0ff07));
+ //The dispatch mask
+ stateReg = GenRegister::sr(0, 2);
+ p->MOV(tmp, stateReg);
+ p->AND(tmp, tmp, GenRegister::immud(0x0000ffff));
+ p->SHL(tmp, tmp, GenRegister::immud(16));
+ p->OR(genInfo, genInfo, tmp);
+
+ // Write it out.
+ // Header + XYZ ranges first, then the three timestamp GRFs, 32 bytes
+ // per send with the address vector advanced each time.
+ p->curr.execWidth = 8;
+ p->curr.noMask = 1;
+ p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->ADD(addr, addr, GenRegister::immud(32));
+
+ // time stamps
+ for (int i = 0; i < 3; i++) {
+ p->curr.execWidth = 8;
+ p->MOV(data, GenRegister::retype(profilingReg[i], GEN_TYPE_UD));
+ p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->ADD(addr, addr, GenRegister::immud(32));
+ }
+ } p->pop();
+ }
+
+ /* Load 'dataReg' with the identity element of the given workgroup operation
+    (the value that leaves any input unchanged when combined with it):
+    all-ones for ALL (AND), zero for ANY/ADD, +max/+inf for MIN, -max/-inf for
+    MAX.  Asserts on an unsupported operation/data-type combination. */
+ static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+ {
+ switch (wg_op) {
+ case ir::WORKGROUP_OP_ALL:
+ if (dataReg.type == GEN_TYPE_D
+ || dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF));
+ else if (dataReg.type == GEN_TYPE_L ||
+ dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ break;
+
+ case ir::WORKGROUP_OP_ANY:
+ case ir::WORKGROUP_OP_REDUCE_ADD:
+ case ir::WORKGROUP_OP_INCLUSIVE_ADD:
+ case ir::WORKGROUP_OP_EXCLUSIVE_ADD:
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x0));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(dataReg, GenRegister::immf(0x0));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x0));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ break;
+
+ case ir::WORKGROUP_OP_REDUCE_MIN:
+ case ir::WORKGROUP_OP_INCLUSIVE_MIN:
+ case ir::WORKGROUP_OP_EXCLUSIVE_MIN:
+ /* identity for MIN is the largest representable value (+inf for float) */
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x7FFFFFFF));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0xFFFFFFFF));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ break;
+
+ case ir::WORKGROUP_OP_REDUCE_MAX:
+ case ir::WORKGROUP_OP_INCLUSIVE_MAX:
+ case ir::WORKGROUP_OP_EXCLUSIVE_MAX:
+ /* identity for MAX is the smallest representable value (-inf for float) */
+ if (dataReg.type == GEN_TYPE_D)
+ p->MOV(dataReg, GenRegister::immd(0x80000000));
+ else if (dataReg.type == GEN_TYPE_UD)
+ p->MOV(dataReg, GenRegister::immud(0x0));
+ else if (dataReg.type == GEN_TYPE_F)
+ p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000));
+ else if (dataReg.type == GEN_TYPE_L)
+ p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
+ else if (dataReg.type == GEN_TYPE_UL)
+ p->MOV(dataReg, GenRegister::immuint64(0x0));
+ else
+ GBE_ASSERT(0); /* unsupported data-type */
+ break;
+
+ default:
+ /* unsupported operation */
+ GBE_ASSERT(0);
+ }
+ }
+
+ /* Combine two input elements (registers) into 'dst' according to the
+    workgroup operation: OR for ANY, AND for ALL, ADD for the add variants,
+    SEL_CMP(LE)/SEL_CMP(GE) for the min/max variants.  Reduce, inclusive-scan
+    and exclusive-scan variants of the same arithmetic op emit the same
+    instruction.  Asserts on an unknown op. */
+ static void wgOpPerform(GenRegister dst,
+ GenRegister src1,
+ GenRegister src2,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ switch (wg_op) {
+ case ir::WORKGROUP_OP_ANY:
+ p->OR(dst, src1, src2);
+ break;
+ case ir::WORKGROUP_OP_ALL:
+ p->AND(dst, src1, src2);
+ break;
+ case ir::WORKGROUP_OP_REDUCE_ADD:
+ case ir::WORKGROUP_OP_INCLUSIVE_ADD:
+ case ir::WORKGROUP_OP_EXCLUSIVE_ADD:
+ p->ADD(dst, src1, src2);
+ break;
+ case ir::WORKGROUP_OP_REDUCE_MIN:
+ case ir::WORKGROUP_OP_INCLUSIVE_MIN:
+ case ir::WORKGROUP_OP_EXCLUSIVE_MIN:
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+ break;
+ case ir::WORKGROUP_OP_REDUCE_MAX:
+ case ir::WORKGROUP_OP_INCLUSIVE_MAX:
+ case ir::WORKGROUP_OP_EXCLUSIVE_MAX:
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+ break;
+ default:
+ GBE_ASSERT(0);
+ }
+ }
+
+ /* Compute this thread's partial workgroup-op result over its 'simd' lanes.
+    On return:
+      - threadExchangeData holds the single value this thread contributes to
+        the cross-thread combine step,
+      - threadDst holds the per-lane partial result (for scans),
+      - resultVal is used as scratch.
+    ALL/ANY are handled with flag-based any/all predication; the arithmetic
+    ops walk the lanes serially through scalar register views. */
+ static void wgOpPerformThread(GenRegister threadDst,
+ GenRegister inputVal,
+ GenRegister threadExchangeData,
+ GenRegister resultVal,
+ uint32_t simd,
+ uint32_t wg_op,
+ GenEncoder *p)
+ {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+
+ /* setting the type */
+ resultVal = GenRegister::retype(resultVal, inputVal.type);
+ threadDst = GenRegister::retype(threadDst, inputVal.type);
+ threadExchangeData = GenRegister::retype(threadExchangeData, inputVal.type);
+
+ vector<GenRegister> input;
+ vector<GenRegister> result;
+
+ /* for workgroup all and any we can use simd_all/any for each thread */
+ if (wg_op == ir::WORKGROUP_OP_ALL || wg_op == ir::WORKGROUP_OP_ANY) {
+ GenRegister constZero = GenRegister::immuw(0);
+ GenRegister flag01 = GenRegister::flag(0, 1);
+
+ p->push();
+ {
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = simd;
+ p->MOV(resultVal, GenRegister::immud(1));
+ p->curr.execWidth = 1;
+ // Preset the flag so inactive lanes don't break the ALLxH test:
+ // all-ones for ALL, zero for ANY.
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->MOV(flag01, GenRegister::immw(-1));
+ else
+ p->MOV(flag01, constZero);
+
+ p->curr.execWidth = simd;
+ p->curr.noMask = 0;
+
+ // Active lanes set their flag bit when input != 0.
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_NEQ, inputVal, constZero);
+
+ // Predicate on any/all of the flag bits across the SIMD width, then
+ // broadcast 1 or 0 into both outputs.
+ if (p->curr.execWidth == 16)
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+ else
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else if (p->curr.execWidth == 8)
+ if (wg_op == ir::WORKGROUP_OP_ALL)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+ else
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else
+ NOT_IMPLEMENTED;
+ p->SEL(threadDst, resultVal, constZero);
+ p->SEL(threadExchangeData, resultVal, constZero);
+ }
+ p->pop();
+ } else {
+ // Uniform input (stride 0): every lane already holds the same value, so
+ // it is the thread's partial result as-is.
+ if (inputVal.hstride == GEN_HORIZONTAL_STRIDE_0) {
+ p->MOV(threadExchangeData, inputVal);
+ p->pop();
+ return;
+ }
+
+ /* init thread data to min/max/null values */
+ p->push(); {
+ p->curr.execWidth = simd;
+ wgOpInitValue(p, threadExchangeData, wg_op);
+ p->MOV(resultVal, inputVal);
+ } p->pop();
+
+ GenRegister resultValSingle = resultVal;
+ resultValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ resultValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ resultValSingle.width = GEN_WIDTH_1;
+
+ GenRegister inputValSingle = inputVal;
+ inputValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+ inputValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+ inputValSingle.width = GEN_WIDTH_1;
+
+
+ /* make an array of registers for easy accessing */
+ for(uint32_t i = 0; i < simd; i++){
+ /* add all resultVal offset reg positions from list */
+ result.push_back(resultValSingle);
+ input.push_back(inputValSingle);
+
+ /* move to next position */
+ // Step one element; wrap into the next GRF at the 32-byte boundary.
+ resultValSingle.subnr += typeSize(resultValSingle.type);
+ if (resultValSingle.subnr == 32) {
+ resultValSingle.subnr = 0;
+ resultValSingle.nr++;
+ }
+ /* move to next position */
+ inputValSingle.subnr += typeSize(inputValSingle.type);
+ if (inputValSingle.subnr == 32) {
+ inputValSingle.subnr = 0;
+ inputValSingle.nr++;
+ }
+ }
+
+ // Seed the serial scan: reduce/inclusive start from lane 0's value;
+ // exclusive shifts by one (result[1] = input[0], result[0] set later).
+ uint32_t start_i = 0;
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) {
+ p->MOV(result[0], input[0]);
+ start_i = 1;
+ }
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
+ p->MOV(result[1], input[0]);
+ start_i = 2;
+ }
+
+ /* algorithm workgroup */
+ // Serial lane-by-lane combine: reduce accumulates into result[0];
+ // inclusive scan combines with the previous result and the current input;
+ // exclusive scan combines with the previous result and the previous input.
+ for (uint32_t i = start_i; i < simd; i++)
+ {
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ wgOpPerform(result[0], result[0], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ wgOpPerform(result[i], result[i - 1], input[i], wg_op, p);
+
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p);
+
+ else
+ GBE_ASSERT(0);
+ }
+ }
+
+ // NOTE(review): execWidth is forced to 16 in the broadcasts below even when
+ // 'simd' is 8 -- confirm this is intentional (it writes 16 elements).
+ if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[0]);
+ /* partial result thread */
+ p->MOV(threadDst, result[0]);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ p->MOV(threadExchangeData, result[simd - 1]);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+ else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->curr.execWidth = 1;
+ /* set result[0] to min/max/null */
+ wgOpInitValue(p, result[0], wg_op);
+
+ p->curr.execWidth = 16;
+ /* value exchanged with other threads */
+ // Exclusive scan's exchange value is the full reduction, i.e. the last
+ // scan element combined with the last input.
+ wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+ /* partial result thread */
+ p->MOV(threadDst, resultVal);
+ }
+
+ p->pop();
+ }
+
+/**
+ * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ *
+ * Implementation:
+ * 1. All the threads first perform the workgroup op value for the
+ * allocated work-items. SIMD16=> 16 work-items allocated for each thread
+ * 2. Each thread writes the partial result in shared local memory using threadId
+ * 3. After a barrier, each thread will read in chunks of 1-4 elements,
+ * the shared local memory region, using a loop based on the thread num value (threadN)
+ * 4. Each thread computes the final value individually
+ *
+ * Optimizations:
+ * Performance is driven by the chunk size of the SLM reads. If threads read in
+ * chunks of 4 elements, performance increases 2-3x compared to chunks of 1 element.
+ */
+ void GenContext::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type);
+ const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(2)), dst.type);
+ GenRegister threadData = ra->genReg(insn.src(3));
+ GenRegister partialData = GenRegister::toUniform(threadData, dst.type);
+ GenRegister threadId = ra->genReg(insn.src(0));
+ GenRegister threadLoop = ra->genReg(insn.src(1));
+ GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+ GenRegister localBarrier = ra->genReg(insn.src(5));
+
+ uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t simd = p->curr.execWidth;
+ int32_t jip0, jip1;
+
+ /* masked elements should be properly set to init value */
+ p->push(); {
+ p->curr.noMask = 1;
+ wgOpInitValue(p, tmp, wg_op);
+ p->curr.noMask = 0;
+ p->MOV(tmp, theVal);
+ p->curr.noMask = 1;
+ p->MOV(theVal, tmp);
+ } p->pop();
+
+ threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
+
+ /* use of continuous GRF allocation from insn selection */
+ GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
+ GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
+ GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
+ GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
+
+ /* do some calculation within each thread */
+ wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+
+ p->curr.execWidth = 16;
+ p->MOV(theVal, dst);
+ threadData = GenRegister::toUniform(threadData, dst.type);
+
+ /* store thread count for future use on read/write to SLM */
+ if (wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+ p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+ }
+
+ /* all threads write the partial results to SLM memory */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+ {
+ GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
+ GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
+ p->MOV(msgData.offset(msgData, 0), threadDataL);
+ p->MOV(msgData.offset(msgData, 1), threadDataH);
+
+ p->curr.execWidth = 8;
+ p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+ }
+ else
+ {
+ p->curr.execWidth = 8;
+ p->MOV(msgData, threadData);
+ p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+ }
+
+ /* init partialData register, it will hold the final result */
+ wgOpInitValue(p, partialData, wg_op);
+
+ /* add call to barrier */
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.physicalFlag = 0;
+ p->curr.noMask = 1;
+ p->AND(localBarrier, barrierId, GenRegister::immud(0x0f000000));
+ p->BARRIER(localBarrier);
+ p->curr.execWidth = 1;
+ p->WAIT();
+ p->pop();
+
+ /* perform a loop, based on thread count (which is now multiple of 4) */
+ p->push();{
+ jip0 = p->n_instruction();
+
+ /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+ {
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+ p->MUL(msgAddr, threadLoop, GenRegister::immd(0x8));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 2);
+
+ GenRegister msgDataL = msgData.retype(msgData.offset(msgData, 0, 4), GEN_TYPE_D);
+ GenRegister msgDataH = msgData.retype(msgData.offset(msgData, 1, 4), GEN_TYPE_D);
+ msgDataL.hstride = 2;
+ msgDataH.hstride = 2;
+ p->MOV(msgDataL, msgDataH);
+
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ }
+ else
+ {
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+ p->MUL(msgAddr, threadLoop, GenRegister::immd(0x4));
+ p->ADD(msgAddr, msgAddr, msgSlmOff);
+ p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
+
+ /* perform operation, partialData will hold result */
+ wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+ }
+
+ /* while threadN is not 0, cycle read SLM / update value */
+ p->curr.noMask = 1;
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_G, threadLoop, GenRegister::immd(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ jip1 = p->n_instruction();
+ p->JMPI(GenRegister::immud(0));
+ p->patchJMPI(jip1, jip0 - jip1, 0);
+ } p->pop();
+
+ if(wg_op == ir::WORKGROUP_OP_ANY ||
+ wg_op == ir::WORKGROUP_OP_ALL ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+ wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+ p->MOV(dst, partialData);
+ }
+ else
+ {
+ /* save result to final register location dst */
+ p->curr.execWidth = 16;
+
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+ p->ADD(dst, dst, partialData);
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+ {
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+ /* workaround QW datatype on CMP */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 1, 0),
+ dst.offset(dst, 1, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 2, 0),
+ dst.offset(dst, 2, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 3, 0),
+ dst.offset(dst, 3, 0), partialData);
+ }
+ }
+ else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+ || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+ /* workaround QW datatype on CMP */
+ if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 1, 0),
+ dst.offset(dst, 1, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 2, 0),
+ dst.offset(dst, 2, 0), partialData);
+ p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 3, 0),
+ dst.offset(dst, 3, 0), partialData);
+ }
+ }
+ }
+
+ /* corner cases for threads 0 */
+ if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+ wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+ {
+ p->push();{
+ p->curr.flag = 0;
+ p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+
+ p->curr.execWidth = 16;
+ p->MOV(dst, theVal);
+ } p->pop();
+ }
+ }
+
+ void GenContext::emitSubGroupOpInstruction(const SelectionInstruction &insn){
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type);
+ const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(0)), dst.type);
+ GenRegister threadData = ra->genReg(insn.src(1));
+
+ uint32_t wg_op = insn.extra.workgroupOp;
+ uint32_t simd = p->curr.execWidth;
+
+ /* masked elements should be properly set to init value */
+ p->push(); {
+ p->curr.noMask = 1;
+ wgOpInitValue(p, tmp, wg_op);
+ p->curr.noMask = 0;
+ p->MOV(tmp, theVal);
+ p->curr.noMask = 1;
+ p->MOV(theVal, tmp);
+ } p->pop();
+
+ /* do some calculation within each thread */
+ wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+ }
+
+ void GenContext::emitPrintfLongInstruction(GenRegister& addr, GenRegister& data,
+ GenRegister& src, uint32_t bti) {
+ p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src.bottom_half());
+ p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+
+ p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src.top_half(this->simdWidth));
+ p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+ p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+ }
+
+ void GenContext::emitPrintfInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister tmp0 = ra->genReg(insn.dst(1));
+ GenRegister src;
+ uint32_t srcNum = insn.srcNum;
+ if (insn.extra.continueFlag)
+ srcNum--;
+
+ GenRegister addr = GenRegister::retype(tmp0, GEN_TYPE_UD);
+ GenRegister data = GenRegister::offset(addr, 2);
+
+ if (!insn.extra.continueFlag) {
+ p->push(); {
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ //ptr[0] is the total count of the log size.
+ p->MOV(addr, GenRegister::immud(0));
+ p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
+ } p->pop();
+
+ p->ATOMIC(addr, GEN_ATOMIC_OP_ADD, addr, GenRegister::immud(insn.extra.printfBTI), 2);
+ /* Write out the header. */
+ p->MOV(data, GenRegister::immud(0xAABBCCDD));
+ p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+
+ p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+ p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
+ p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+
+ p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+ p->MOV(data, GenRegister::immud(insn.extra.printfNum));
+ p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+
+ p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+ }
+
+ // Now, store out every parameter.
+ for(uint32_t i = 0; i < srcNum; i++) {
+ src = ra->genReg(insn.src(i));
+ if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D || src.type == GEN_TYPE_F) {
+ p->MOV(GenRegister::retype(data, src.type), src);
+ p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+ p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+ } else if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_UB ) {
+ p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src);
+ p->UNTYPED_WRITE(addr, GenRegister::immud(insn.extra.printfBTI), 1);
+ p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+ } else if (src.type == GEN_TYPE_L || src.type == GEN_TYPE_UL ) {
+ emitPrintfLongInstruction(addr, data, src, insn.extra.printfBTI);
+ }
+ }
+
+ if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
+ p->push();
+ p->curr.execWidth = 1;
+ }
+ p->MOV(dst, GenRegister::immd(0));
+ if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
+ p->pop();
+ }
+ }
+
void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
if (sz == 0)
sz = 8;
@@ -2170,125 +3487,378 @@ namespace gbe
p->pop();
}
- BVAR(OCL_OUTPUT_REG_ALLOC, false);
- BVAR(OCL_OUTPUT_ASM, false);
+ void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_UD);
+ const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+ const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
+ const uint32_t vec_size = insn.extra.elem;
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), GEN_TYPE_UD);
+ const uint32_t simdWidth = p->curr.execWidth;
- void GenContext::allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue) {
- uint32_t regSize;
- regSize = this->ra->getRegSize(reg);
- insertCurbeReg(reg, newCurbeEntry(value, subValue, regSize));
+ // Make header
+ p->push();
+ {
+ // Copy r0 into the header first
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the current address
+ p->curr.execWidth = 1;
+ p->SHR(headeraddr, addr, GenRegister::immud(4));
+
+ // Put zero in the general state base address
+ p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
+ }
+ p->pop();
+ // Now read the data, oword block read can only work with simd16 and no mask
+ if (vec_size == 1) {
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBREAD(dst, header, insn.getbti(), simdWidth / 4);
+ }
+ p->pop();
+ } else if (vec_size == 2) {
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2);
+ }
+ p->pop();
+ p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
+ p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth / 8));
+ } else if (vec_size == 4 || vec_size == 8) {
+ if (simdWidth == 8) {
+ for (uint32_t i = 0; i < vec_size / 4; i++) {
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ p->pop();
+ }
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBREAD(tmp, header, insn.getbti(), 8);
+ }
+ p->pop();
+ for (uint32_t j = 0; j < 4; j++)
+ p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, j));
+ }
+ } else {
+ for (uint32_t i = 0; i < vec_size / 2; i++) {
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ p->pop();
+ }
+ p->OBREAD(tmp, header, insn.getbti(), 8);
+ for (uint32_t j = 0; j < 2; j++)
+ p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, j*2));
+ }
+ }
+ } else NOT_SUPPORTED;
}
- void GenContext::buildPatchList(void) {
- const uint32_t ptrSize = this->getPointerSize();
- kernel->curbeSize = 0u;
- auto &stackUse = dag->getUse(ir::ocl::stackptr);
+ void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
+ const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+ const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
+ const uint32_t vec_size = insn.extra.elem;
+ const GenRegister tmp = GenRegister::offset(header, 1);
+ const uint32_t simdWidth = p->curr.execWidth;
+ uint32_t tmp_size = simdWidth * vec_size / 8;
+ tmp_size = tmp_size > 4 ? 4 : tmp_size;
- // We insert the block IP mask first
- using namespace ir::ocl;
- if (!isDWLabel())
- allocCurbeReg(blockip, GBE_CURBE_BLOCK_IP);
- else
- allocCurbeReg(dwblockip, GBE_CURBE_DW_BLOCK_IP);
- allocCurbeReg(lid0, GBE_CURBE_LOCAL_ID_X);
- allocCurbeReg(lid1, GBE_CURBE_LOCAL_ID_Y);
- allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
- allocCurbeReg(zero, GBE_CURBE_ZERO);
- allocCurbeReg(one, GBE_CURBE_ONE);
- allocCurbeReg(btiUtil, GBE_CURBE_BTI_UTIL);
- if (stackUse.size() != 0)
- allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
- // Go over the arguments and find the related patch locations
- const uint32_t argNum = fn.argNum();
- for (uint32_t argID = 0u; argID < argNum; ++argID) {
- const ir::FunctionArgument &arg = fn.getArg(argID);
- // For pointers and values, we have nothing to do. We just push the values
- if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
- arg.type == ir::FunctionArgument::LOCAL_POINTER ||
- arg.type == ir::FunctionArgument::CONSTANT_POINTER)
- this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, ptrSize, ptrSize));
- if (arg.type == ir::FunctionArgument::VALUE ||
- arg.type == ir::FunctionArgument::STRUCTURE ||
- arg.type == ir::FunctionArgument::IMAGE ||
- arg.type == ir::FunctionArgument::SAMPLER)
- this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, arg.size));
- }
-
- // Go over all the instructions and find the special register we need
- // to push
- #define INSERT_REG(SPECIAL_REG, PATCH) \
- if (reg == ir::ocl::SPECIAL_REG) { \
- if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
- allocCurbeReg(reg, GBE_CURBE_##PATCH); \
- } else
+ p->push();
+ // Copy r0 into the header first
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(header, GenRegister::ud8grf(0,0));
- bool needLaneID = false;
- fn.foreachInstruction([&](ir::Instruction &insn) {
- const uint32_t srcNum = insn.getSrcNum();
- if (insn.getOpcode() == ir::OP_SIMD_ID) {
- GBE_ASSERT(srcNum == 0);
- needLaneID = true;
+ // Update the header with the current address
+ p->curr.execWidth = 1;
+ p->SHR(headeraddr, addr, GenRegister::immud(4));
+
+ // Put zero in the general state base address
+ p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
+
+ p->pop();
+ // Now write the data, oword block write can only work with simd16 and no mask
+ if (vec_size == 1) {
+ p->MOV(tmp, ra->genReg(insn.src(1)));
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBWRITE(header, insn.getbti(), simdWidth / 4);
}
- for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
- const ir::Register reg = insn.getSrc(srcID);
- if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
- if (srcID != 0) continue;
- const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
- const unsigned char type = ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
- ir::ImageInfoKey key(bti, type);
- const ir::Register imageInfo = insn.getSrc(0);
- if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
- uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
- insertCurbeReg(imageInfo, offset);
+ p->pop();
+ } else if (vec_size == 2) {
+ p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ;
+ p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra->genReg(insn.src(2))) ;
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBWRITE(header, insn.getbti(), simdWidth / 2);
+ }
+ p->pop();
+ } else if (vec_size == 4 || vec_size == 8) {
+ if (simdWidth == 8) {
+ for (uint32_t i = 0; i < vec_size / 4; i++) {
+ for (uint32_t j = 0; j < 4; j++)
+ p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j + i*4))) ;
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ p->pop();
+ }
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBWRITE(header, insn.getbti(), 8);
+ }
+ p->pop();
+ }
+ } else {
+ for (uint32_t i = 0; i < vec_size / 2; i++) {
+ for (uint32_t j = 0; j < 2; j++)
+ p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + j + i*2))) ;
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ p->pop();
}
- continue;
+ p->OBWRITE(header, insn.getbti(), 8);
}
- if (fn.isSpecialReg(reg) == false) continue;
- if (curbeRegs.find(reg) != curbeRegs.end()) continue;
- if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
- INSERT_REG(lsize0, LOCAL_SIZE_X)
- INSERT_REG(lsize1, LOCAL_SIZE_Y)
- INSERT_REG(lsize2, LOCAL_SIZE_Z)
- INSERT_REG(gsize0, GLOBAL_SIZE_X)
- INSERT_REG(gsize1, GLOBAL_SIZE_Y)
- INSERT_REG(gsize2, GLOBAL_SIZE_Z)
- INSERT_REG(goffset0, GLOBAL_OFFSET_X)
- INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
- INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
- INSERT_REG(workdim, WORK_DIM)
- INSERT_REG(numgroup0, GROUP_NUM_X)
- INSERT_REG(numgroup1, GROUP_NUM_Y)
- INSERT_REG(numgroup2, GROUP_NUM_Z)
- INSERT_REG(stackptr, STACK_POINTER)
- INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
- INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
- do {} while(0);
}
- });
-#undef INSERT_REG
+ } else NOT_SUPPORTED;
+
+ }
+
+ void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
+ const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
+ GenRegister header, offsetx, offsety, blocksizereg;
+ if (simdWidth == 8)
+ header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+ else
+ header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(2)),1), GEN_TYPE_UD);
+
+ offsetx = GenRegister::offset(header, 0, 0*4);
+ offsety = GenRegister::offset(header, 0, 1*4);
+ blocksizereg = GenRegister::offset(header, 0, 2*4);
+ size_t vec_size = insn.extra.elem;
+ uint32_t blocksize = 0x1F | (vec_size-1) << 16;
+
+ if (simdWidth == 8)
+ {
+ p->push();
+ // Copy r0 into the header first
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(header, GenRegister::ud8grf(0,0));
+
+ // Update the header with the coord
+ p->curr.execWidth = 1;
+ p->MOV(offsetx, coordx);
+ p->MOV(offsety, coordy);
+ // Update block width and height
+ p->MOV(blocksizereg, GenRegister::immud(blocksize));
+ // Now read the data
+ p->curr.execWidth = 8;
+ p->MBREAD(dst, header, insn.getbti(), vec_size);
+ p->pop();
+
+ }
+ else if (simdWidth == 16)
+ {
+ const GenRegister tmp = ra->genReg(insn.dst(vec_size));
+ p->push();
+ // Copy r0 into the header first
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(header, GenRegister::ud8grf(0,0));
- if (needLaneID)
- allocCurbeReg(laneid, GBE_CURBE_LANE_ID);
+ // First half
+ // Update the header with the coord
+ p->curr.execWidth = 1;
+ p->MOV(offsetx, coordx);
+ p->MOV(offsety, coordy);
+ // Update block width and height
+ p->MOV(blocksizereg, GenRegister::immud(blocksize));
+ // Now read the data
+ p->curr.execWidth = 8;
+ p->MBREAD(tmp, header, insn.getbti(), vec_size);
+
+ // Second half
+ // Update the header with the coord
+ p->curr.execWidth = 1;
+ p->ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ const GenRegister tmp2 = GenRegister::offset(tmp, vec_size);
+ // Now read the data
+ p->curr.execWidth = 8;
+ p->MBREAD(tmp2, header, insn.getbti(), vec_size);
+
+ // Move the reg to fit vector rule.
+ for (uint32_t i = 0; i < vec_size; i++) {
+ p->MOV(GenRegister::offset(dst, i * 2), GenRegister::offset(tmp, i));
+ p->MOV(GenRegister::offset(dst, i * 2 + 1),
+ GenRegister::offset(tmp2, i));
+ }
+ p->pop();
+ } else NOT_IMPLEMENTED;
+ }
+
+ void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) {
+ const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
+ const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
+ const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+ GenRegister offsetx, offsety, blocksizereg;
+ size_t vec_size = insn.extra.elem;
+ uint32_t blocksize = 0x1F | (vec_size-1) << 16;
+
+ offsetx = GenRegister::offset(header, 0, 0*4);
+ offsety = GenRegister::offset(header, 0, 1*4);
+ blocksizereg = GenRegister::offset(header, 0, 2*4);
+
+ if (simdWidth == 8)
+ {
+ p->push();
+ // Copy r0 into the header first
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(header, GenRegister::ud8grf(0,0));
+
+ // Update the header with the coord
+ p->curr.execWidth = 1;
+ p->MOV(offsetx, coordx);
+ p->MOV(offsety, coordy);
+ // Update block width and height
+ p->MOV(blocksizereg, GenRegister::immud(blocksize));
+ p->curr.execWidth = 8;
+ // Mov what we need into msgs
+ for(uint32_t i = 0; i < vec_size; i++)
+ p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i)));
+ // Now read the data
+ p->MBWRITE(header, insn.getbti(), vec_size);
+ p->pop();
+
+ }
+ else
+ {
+ p->push();
+ // Copy r0 into the header first
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(header, GenRegister::ud8grf(0,0));
+
+ // First half
+ // Update the header with the coord
+ p->curr.execWidth = 1;
+ p->MOV(offsetx, coordx);
+ p->MOV(offsety, coordy);
+ // Update block width and height
+ p->MOV(blocksizereg, GenRegister::immud(blocksize));
+ // Now read the data
+ p->curr.execWidth = 8;
+ // Mov what we need into msgs
+ for(uint32_t i = 0; i < vec_size; i++)
+ p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i)));
+ p->MBWRITE(header, insn.getbti(), vec_size);
+
+ // Second half
+ // Update the header with the coord
+ p->curr.execWidth = 1;
+ p->ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ p->curr.execWidth = 8;
+ // Mov what we need into msgs
+ for(uint32_t i = 0; i < vec_size; i++)
+ p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
+ // Now write the data
+ p->MBWRITE(header, insn.getbti(), vec_size);
+
+ p->pop();
+ }
+ }
+
+ BVAR(OCL_OUTPUT_REG_ALLOC, false);
+ BVAR(OCL_OUTPUT_ASM, false);
+
+ void GenContext::allocCurbeReg(ir::Register reg) {
+ uint32_t regSize;
+ gbe_curbe_type curbeType;
+ int subType;
+ this->getRegPayloadType(reg, curbeType, subType);
+ regSize = this->ra->getRegSize(reg);
+ insertCurbeReg(reg, newCurbeEntry(curbeType, subType, regSize));
+ /* Need to patch the image information registers. */
+ if (curbeType == GBE_CURBE_IMAGE_INFO) {
+ std::sort(kernel->patches.begin(), kernel->patches.end());
+ uint32_t offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, subType);
+ fn.getImageSet()->appendInfo(static_cast<ir::ImageInfoKey>(subType), offset);
+ }
+ }
+
+ void GenContext::buildPatchList() {
// After this point the vector is immutable. Sorting it will make
// research faster
std::sort(kernel->patches.begin(), kernel->patches.end());
-
kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
}
+ BVAR(OCL_OUTPUT_SEL_IR, false);
+ BVAR(OCL_OPTIMIZE_SEL_IR, true);
bool GenContext::emitCode(void) {
GenKernel *genKernel = static_cast<GenKernel*>(this->kernel);
- buildPatchList();
sel->select();
+ if (OCL_OPTIMIZE_SEL_IR)
+ sel->optimize();
+ if (OCL_OUTPUT_SEL_IR)
+ outputSelectionIR(*this, this->sel);
schedulePreRegAllocation(*this, *this->sel);
if (UNLIKELY(ra->allocate(*this->sel) == false))
return false;
schedulePostRegAllocation(*this, *this->sel);
if (OCL_OUTPUT_REG_ALLOC)
ra->outputAllocation();
- this->clearFlagRegister();
+ if (inProfilingMode) { // add the profiling prolog before do anything.
+ this->profilingProlog();
+ }
this->emitStackPointer();
+ this->clearFlagRegister();
this->emitSLMOffset();
this->emitInstructionStream();
if (this->patchBranches() == false)
@@ -2299,6 +3869,9 @@ namespace gbe
if (OCL_OUTPUT_ASM)
outputAssembly(stdout, genKernel);
+ if (OCL_DEBUGINFO)
+ outputAssembly(stdout, genKernel);
+
if (this->asmFileName) {
FILE *asmDumpStream = fopen(this->asmFileName, "a");
if (asmDumpStream) {
@@ -2315,6 +3888,12 @@ namespace gbe
}
void GenContext::outputAssembly(FILE *file, GenKernel* genKernel) {
+ /* get gen version for the instruction compact */
+ uint32_t insn_version = 0;
+ if (IS_GEN7(deviceID) || IS_GEN75(deviceID))
+ insn_version = 7;
+ else if (IS_GEN8(deviceID) || IS_GEN9(deviceID))
+ insn_version = 8;
fprintf(file, "%s's disassemble begin:\n", genKernel->getName());
ir::LabelIndex curLabel = (ir::LabelIndex)0;
GenCompactInstruction * pCom = NULL;
@@ -2330,10 +3909,14 @@ namespace gbe
curLabel = (ir::LabelIndex)(curLabel + 1);
}
}
+
+ if (OCL_DEBUGINFO)
+ fprintf(file, "[%3i,%3i]", p->storedbg[insnID].line, p->storedbg[insnID].col);
+
fprintf(file, " (%8i) ", insnID);
pCom = (GenCompactInstruction*)&p->store[insnID];
if(pCom->bits1.cmpt_control == 1) {
- decompactInstruction(pCom, &insn);
+ decompactInstruction(pCom, &insn, insn_version);
gen_disasm(file, &insn, deviceID, 1);
insnID++;
} else {
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 8ef725f..fb3d4fe 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -49,6 +49,7 @@ namespace gbe
REGISTER_ALLOCATION_FAIL,
REGISTER_SPILL_EXCEED_THRESHOLD,
REGISTER_SPILL_FAIL,
+ REGISTER_SPILL_NO_SPACE,
OUT_OF_RANGE_IF_ENDIF,
} CompileErrorCode;
@@ -86,6 +87,7 @@ namespace gbe
/*! Simd width chosen for the current function */
INLINE uint32_t getSimdWidth(void) const { return simdWidth; }
void clearFlagRegister(void);
+ void profilingProlog(void);
/*! check the flag reg, if is grf, use f0.1 instead */
GenRegister checkFlagRegister(GenRegister flagReg);
/*! Emit the per-lane stack pointer computation */
@@ -107,6 +109,9 @@ namespace gbe
return this->liveness->getLiveIn(bb);
}
+ void loadLaneID(GenRegister dst);
+ GenRegister getBlockIP(void);
+ void setBlockIP(GenRegister blockip, uint32_t label);
void collectShifter(GenRegister dest, GenRegister src);
void loadTopHalf(GenRegister dest, GenRegister src);
void storeTopHalf(GenRegister dest, GenRegister src);
@@ -165,16 +170,27 @@ namespace gbe
virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
void emitDWordGatherInstruction(const SelectionInstruction &insn);
void emitSampleInstruction(const SelectionInstruction &insn);
+ void emitVmeInstruction(const SelectionInstruction &insn);
void emitTypedWriteInstruction(const SelectionInstruction &insn);
void emitSpillRegInstruction(const SelectionInstruction &insn);
void emitUnSpillRegInstruction(const SelectionInstruction &insn);
void emitGetImageInfoInstruction(const SelectionInstruction &insn);
virtual void emitI64MULInstruction(const SelectionInstruction &insn);
virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
+ virtual void emitF64DIVInstruction(const SelectionInstruction &insn);
+ void emitCalcTimestampInstruction(const SelectionInstruction &insn);
+ void emitStoreProfilingInstruction(const SelectionInstruction &insn);
+ virtual void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
+ virtual void emitSubGroupOpInstruction(const SelectionInstruction &insn);
+ void emitPrintfInstruction(const SelectionInstruction &insn);
void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
- unsigned beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned desc);
- void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned jip0);
+ unsigned beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned desc);
+ void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned jip0);
+ void emitOBReadInstruction(const SelectionInstruction &insn);
+ void emitOBWriteInstruction(const SelectionInstruction &insn);
+ void emitMBReadInstruction(const SelectionInstruction &insn);
+ void emitMBWriteInstruction(const SelectionInstruction &insn);
/*! Implements base class */
virtual Kernel *allocateKernel(void);
@@ -205,6 +221,8 @@ namespace gbe
bool relaxMath;
bool getIFENDIFFix(void) const { return ifEndifFix; }
void setIFENDIFFix(bool fix) { ifEndifFix = fix; }
+ bool getProfilingMode(void) const { return inProfilingMode; }
+ void setProfilingMode(bool b) { inProfilingMode = b; }
CompileErrorCode getErrCode() { return errCode; }
protected:
@@ -212,13 +230,18 @@ namespace gbe
return GBE_NEW(Gen7Encoder, this->simdWidth, 7, deviceID);
}
/*! allocate a new curbe register and insert to curbe pool. */
- void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
+ void allocCurbeReg(ir::Register reg);
virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+ void calcGlobalXYZRange(GenRegister& reg, GenRegister& tmp, int flag, int subFlag);
+ virtual void subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp);
+ virtual void addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& tmp);
+ virtual void emitPrintfLongInstruction(GenRegister& addr, GenRegister& data, GenRegister& src, uint32_t bti);
private:
CompileErrorCode errCode;
bool ifEndifFix;
+ bool inProfilingMode;
uint32_t regSpillTick;
const char* asmFileName;
/*! Build the curbe patch list for the given kernel */
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 1ca148c..bcbb23f 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -174,6 +174,8 @@ enum opcode {
GEN_OPCODE_LINE = 89,
GEN_OPCODE_PLN = 90,
GEN_OPCODE_MAD = 91,
+ GEN_OPCODE_LRP = 92,
+ GEN_OPCODE_MADM = 93,
GEN_OPCODE_NOP = 126,
};
@@ -463,6 +465,17 @@ enum GenMessageTarget {
#define GEN_UPDATE_GATEWAT_STATE 0b101
#define GEN_MMIO_READ_WRITE 0b110
+/* Accumulator acc2~acc9 in instruction */
+#define GEN8_INSN_ACC2 0
+#define GEN8_INSN_ACC3 1
+#define GEN8_INSN_ACC4 2
+#define GEN8_INSN_ACC5 3
+#define GEN8_INSN_ACC6 4
+#define GEN8_INSN_ACC7 5
+#define GEN8_INSN_ACC8 6
+#define GEN8_INSN_ACC9 7
+#define GEN8_INSN_NOACC 8
+
/////////////////////////////////////////////////////////////////////////////
// Gen EU structures
/////////////////////////////////////////////////////////////////////////////
@@ -479,6 +492,32 @@ struct GenInstruction {
union GenCompactInstruction {
struct GenInstruction low;
+ /* Gen8+ src3 compact inst */
+ struct {
+ struct {
+ uint32_t opcode:7;
+ uint32_t pad:1;
+ uint32_t control_index:2;
+ uint32_t src_index:2;
+ uint32_t dst_reg_nr:7;
+ uint32_t pad1:9;
+ uint32_t src0_rep_ctrl:1;
+ uint32_t compact_control:1;
+ uint32_t debug_control:1;
+ uint32_t saturate:1;
+ } bits1;
+ struct {
+ uint32_t src1_rep_ctrl:1;
+ uint32_t src2_rep_ctrl:1;
+ uint32_t src0_subnr:3;
+ uint32_t src1_subnr:3;
+ uint32_t src2_subnr:3;
+ uint32_t src0_reg_nr:7;
+ uint32_t src1_reg_nr:7;
+ uint32_t src2_reg_nr:7;
+ } bits2;
+ } src3Insn;
+ /* Normal src2 compact inst */
struct {
struct {
uint32_t opcode:7;
@@ -602,6 +641,21 @@ union GenNativeInstruction
uint32_t end_of_thread:1;
} sampler_gen7;
+ struct {
+ uint32_t bti:8;
+ uint32_t vme_search_path_lut:3;
+ uint32_t lut_sub:2;
+ uint32_t msg_type:2;
+ uint32_t stream_in:1;
+ uint32_t stream_out:1;
+ uint32_t reserved_mbz:2;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad1:2;
+ uint32_t end_of_thread:1;
+ } vme_gen7;
+
/**
* Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
*
@@ -756,6 +810,22 @@ union GenNativeInstruction
uint32_t jip:32;
} gen8_branch;
+ /*! Data port Media block read / write */
+ struct {
+ uint32_t bti:8;
+ uint32_t ver_line_stride_offset:1;
+ uint32_t ver_line_stride:1;
+ uint32_t ver_line_stride_override:1;
+ uint32_t ignored:3;
+ uint32_t msg_type:4;
+ uint32_t category:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_mblock_rw;
+
int d;
uint32_t ud;
float f;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index cac29e8..f8c99b2 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -258,7 +258,7 @@ namespace gbe
else
NOT_SUPPORTED;
}
-#if 0
+
static void setOBlockRW(GenEncoder *p,
GenNativeInstruction *insn,
uint32_t bti,
@@ -269,13 +269,27 @@ namespace gbe
{
const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
p->setMessageDescriptor(insn, sfid, msg_length, response_length);
- assert(size == 2 || size == 4);
+ assert(size == 2 || size == 4 || size == 8);
insn->bits3.gen7_oblock_rw.msg_type = msg_type;
insn->bits3.gen7_oblock_rw.bti = bti;
- insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
+ insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 3 : 4);
insn->bits3.gen7_oblock_rw.header_present = 1;
}
-#endif
+
+ static void setMBlockRW(GenEncoder *p, // build the SEND descriptor for a media block read/write message
+ GenNativeInstruction *insn,
+ uint32_t bti, // binding table index of the target surface
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA; // media block messages go through data port 1
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ insn->bits3.gen7_mblock_rw.msg_type = msg_type;
+ insn->bits3.gen7_mblock_rw.bti = bti;
+ insn->bits3.gen7_mblock_rw.header_present = 1; // these messages always carry a header GRF
+ }
+
static void setDWordScatterMessgae(GenEncoder *p,
GenNativeInstruction *insn,
@@ -451,7 +465,7 @@ namespace gbe
this->setHeader(insn);
insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
if (bti.file == GEN_IMMEDIATE_VALUE) {
@@ -591,11 +605,23 @@ namespace gbe
this->setSrc1(insn, bti);
}
}
+
+ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
+ void GenEncoder::setDBGInfo(DebugInfo in, bool hasHigh) // record debug info for the instruction word(s) just emitted
+ {
+ if(OCL_DEBUGINFO) // only pay the cost when debug info was requested
+ {
+ storedbg.push_back(in);
+ if(hasHigh) storedbg.push_back(in); // native (two-word) instructions store the entry twice, one per stored word
+ }
+ }
+
GenCompactInstruction *GenEncoder::nextCompact(uint32_t opcode) {
GenCompactInstruction insn;
std::memset(&insn, 0, sizeof(GenCompactInstruction));
insn.bits1.opcode = opcode;
this->store.push_back(insn.low);
+ setDBGInfo(DBGInfo, false);
return (GenCompactInstruction *)&this->store.back();
}
@@ -605,66 +631,27 @@ namespace gbe
insn.header.opcode = opcode;
this->store.push_back(insn.low);
this->store.push_back(insn.high);
+ setDBGInfo(DBGInfo, true);
return (GenNativeInstruction *)(&this->store.back()-1);
}
bool GenEncoder::canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1)
{
- /* By now, just alu1 insn will come to here. So just MOV */
+ /* For now only alu1 instructions reach this path, so a MOV pair suffices. */
this->MOV(dst.bottom_half(), src0.bottom_half());
this->MOV(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth));
return true;
}
- INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
- GenRegister src0, GenRegister src1 = GenRegister::null()) {
- int w = p->curr.execWidth;
- p->push();
- p->curr.execWidth = p->getDoubleExecWidth();
- p->curr.nibControl = 0;
- GenNativeInstruction *insn = p->next(opcode);
- p->setHeader(insn);
- p->setDst(insn, dst);
- p->setSrc0(insn, src0);
- if (!GenRegister::isNull(src1))
- p->setSrc1(insn, src1);
- if (w == 8)
- p->curr.nibControl = 1; // second 1/8 mask
- insn = p->next(opcode);
- p->setHeader(insn);
- p->setDst(insn, GenRegister::suboffset(dst, w / 2));
- p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
- if (!GenRegister::isNull(src1))
- p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
- p->pop();
- }
-
- // Double register accessing is a little special,
- // Per Gen spec, then only supported mode is SIMD8 and, it only
- // handles four doubles each time.
- // We need to lower down SIMD16 to two SIMD8 and lower down SIMD8
- // to two SIMD1x4.
- INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
- GenRegister src0, GenRegister src1 = GenRegister::null()) {
- if (p->curr.execWidth == 8)
- _handleDouble(p, opcode, dst, src0, src1);
- else if (p->curr.execWidth == 16) {
- p->push();
- p->curr.execWidth = 8;
- p->curr.quarterControl = GEN_COMPRESSION_Q1;
- _handleDouble(p, opcode, dst, src0, src1);
- p->curr.quarterControl = GEN_COMPRESSION_Q2;
- if (!GenRegister::isNull(src1))
- src1 = GenRegister::offset(src1, 2);
- _handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1);
- p->pop();
- }
+ void GenEncoder::handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1) {
+ /* Double is not supported before Gen8, so this base implementation must never be reached; Gen8+ encoders are expected to override this virtual. */
+ GBE_ASSERT(0);
+ }
void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
GenRegister src, uint32_t condition) {
if (dst.isdf() && src.isdf()) {
- handleDouble(p, opcode, dst, src);
+ p->handleDouble(p, opcode, dst, src);
} else if (dst.isint64() && src.isint64()
&& p->canHandleLong(opcode, dst, src)) { // handle int64
return;
@@ -709,7 +696,7 @@ namespace gbe
uint32_t condition)
{
if (dst.isdf() && src0.isdf() && src1.isdf()) {
- handleDouble(p, opcode, dst, src0, src1);
+ p->handleDouble(p, opcode, dst, src0, src1);
} else if (needToSplitAlu2(p, dst, src0, src1) == false) {
if(compactAlu2(p, opcode, dst, src0, src1, condition, false))
return;
@@ -768,84 +755,12 @@ namespace gbe
this->alu3(GEN_OPCODE_##OP, dest, src0, src1, src2); \
}
- void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
- union { double d; unsigned u[2]; } u;
- u.d = value;
- GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
- push();
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- curr.execWidth = 1;
- MOV(r, GenRegister::immud(u.u[1]));
- MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
- pop();
- r.type = GEN_TYPE_DF;
- r.vstride = GEN_VERTICAL_STRIDE_0;
- r.width = GEN_WIDTH_1;
- r.hstride = GEN_HORIZONTAL_STRIDE_0;
- push();
- uint32_t width = curr.execWidth;
- curr.execWidth = 8;
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- curr.quarterControl = GEN_COMPRESSION_Q1;
- MOV(dest, r);
- if (width == 16) {
- curr.quarterControl = GEN_COMPRESSION_Q2;
- MOV(GenRegister::offset(dest, 2), r);
- }
- pop();
- }
-
void GenEncoder::LOAD_INT64_IMM(GenRegister dest, GenRegister value) {
GenRegister u0 = GenRegister::immd((int)value.value.i64), u1 = GenRegister::immd(value.value.i64 >> 32);
MOV(dest.bottom_half(), u0);
MOV(dest.top_half(this->simdWidth), u1);
}
- void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp) {
- GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
- GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F);
- int w = curr.execWidth;
- GenRegister r0;
- int factor = 1;
- if (dest.type == GEN_TYPE_F) {
- r0 = r;
- r = GenRegister::h2(r);
- factor = 2;
- } else {
- r0 = GenRegister::h2(r);
- }
- push();
- curr.execWidth = 8;
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- MOV(r0, src0);
- MOV(GenRegister::suboffset(r0, 4 * factor), GenRegister::suboffset(src0, 4));
- curr.noMask = 0;
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(dest, r);
- curr.nibControl = 1;
- MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8 / factor));
- pop();
- if (w == 16) {
- push();
- curr.execWidth = 8;
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = 1;
- MOV(r0, GenRegister::suboffset(src0, 8));
- MOV(GenRegister::suboffset(r0, 4 * factor), GenRegister::suboffset(src0, 12));
- curr.noMask = 0;
- curr.quarterControl = 1;
- curr.nibControl = 0;
- MOV(GenRegister::suboffset(dest, 8), r);
- curr.nibControl = 1;
- MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8 / factor));
- pop();
- }
- }
-
void GenEncoder::F16TO32(GenRegister dest, GenRegister src0) {
alu1(this, GEN_OPCODE_F16TO32, dest, src0);
}
@@ -879,6 +794,7 @@ namespace gbe
ALU2(PLN)
ALU2(MACH)
ALU3(MAD)
+ ALU3(LRP)
// ALU2(BRC)
// ALU1(ENDIF)
// ALU1(IF)
@@ -961,6 +877,18 @@ namespace gbe
insn->bits3.msg_gateway.sub_function_id = GEN_BARRIER_MSG;
insn->bits3.msg_gateway.notify = 0x1;
}
+
+ void GenEncoder::FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN) { // forward a message through the gateway: no destination, no response
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::null());
+ this->setSrc0(insn, src);
+ setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0); // one message GRF, zero response GRFs
+ insn->bits3.msg_gateway.sub_function_id = GEN_FORWARD_MSG;
+ GBE_ASSERT(notifyN <= 2); // only notification events 0..2 are encodable here — TODO confirm against gateway spec
+ insn->bits3.msg_gateway.notify = notifyN;
+ }
+
void GenEncoder::FENCE(GenRegister dst) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setHeader(insn);
@@ -1106,9 +1034,10 @@ namespace gbe
this->setSrc1(insn, src1);
}
- void GenEncoder::WAIT(void) {
+ void GenEncoder::WAIT(uint32_t n) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_WAIT);
- GenRegister src = GenRegister::notification1();
+ GBE_ASSERT(curr.predicate == GEN_PREDICATE_NONE);
+ GenRegister src = GenRegister::notification0(n);
this->setDst(insn, GenRegister::null());
this->setSrc0(insn, src);
this->setSrc1(insn, GenRegister::null());
@@ -1231,6 +1160,50 @@ namespace gbe
simd_mode, return_format);
}
+ void GenEncoder::setVmeMessage(GenNativeInstruction *insn, // fill the SEND descriptor for a VME message
+ unsigned char bti,
+ uint32_t response_length,
+ uint32_t msg_length,
+ uint32_t msg_type,
+ unsigned char vme_search_path_lut,
+ unsigned char lut_sub)
+ {
+ const GenMessageTarget sfid = GEN_SFID_VIDEO_MOTION_EST; // video motion estimation shared function
+ setMessageDescriptor(insn, sfid, msg_length, response_length, true); // trailing 'true' presumably sets header_present — confirm setMessageDescriptor signature
+ insn->bits3.vme_gen7.bti = bti;
+ insn->bits3.vme_gen7.vme_search_path_lut = vme_search_path_lut;
+ insn->bits3.vme_gen7.lut_sub = lut_sub;
+ insn->bits3.vme_gen7.msg_type = msg_type;
+ insn->bits3.vme_gen7.stream_in = 0; // streaming disabled
+ insn->bits3.vme_gen7.stream_out = 0;
+ insn->bits3.vme_gen7.reserved_mbz = 0;
+
+ }
+
+ void GenEncoder::VME(unsigned char bti, // emit a VME SEND: motion-estimation message against surface bti
+ GenRegister dest,
+ GenRegister msg,
+ uint32_t msg_type,
+ uint32_t vme_search_path_lut,
+ uint32_t lut_sub)
+ {
+ /* Currently we just support inter search only; other
+ * modes will be supported in the future.
+ */
+ GBE_ASSERT(msg_type == 1);
+ uint32_t msg_length, response_length; // NOTE(review): left uninitialized when msg_type != 1 and GBE_ASSERT compiles out — consider default-initializing
+ if(msg_type == 1){
+ msg_length = 5;
+ response_length = 6;
+ }
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, dest);
+ this->setSrc0(insn, msg);
+ setVmeMessage(insn, bti, response_length, msg_length,
+ msg_type, vme_search_path_lut, lut_sub);
+ }
+
void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
{
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
@@ -1285,6 +1258,72 @@ namespace gbe
setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
}
+ void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { // oword block read: header GRF in, size/2 GRFs of data back
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1; // just the header GRF
+ const uint32_t response_length = size / 2; // size is in owords (2, 4 or 8 per setOBlockRW); two owords per response register
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setOBlockRW(this,
+ insn,
+ bti,
+ size,
+ GEN7_OBLOCK_READ,
+ msg_length,
+ response_length);
+ }
+
+ void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) { // oword block write: header GRF plus size/2 GRFs of payload
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1 + size / 2; // header GRF plus data payload; size is in owords, two per GRF
+ const uint32_t response_length = 0; // writes return no data
+ this->setHeader(insn);
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ setOBlockRW(this,
+ insn,
+ bti,
+ size,
+ GEN7_OBLOCK_WRITE,
+ msg_length,
+ response_length);
+ }
+
+ void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { // media block read: header GRF in, 'size' GRFs of data back
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1; // just the header GRF
+ const uint32_t response_length = size; // size is in registers
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setMBlockRW(this,
+ insn,
+ bti,
+ GEN75_P1_MEDIA_BREAD,
+ msg_length,
+ response_length);
+ }
+
+ void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) { // media block write: header GRF plus 'size' GRFs of payload
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1 + size; // header GRF plus size registers of data
+ const uint32_t response_length = 0; // writes return no data
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setMBlockRW(this,
+ insn,
+ bti,
+ GEN75_P1_MEDIA_TYPED_BWRITE,
+ msg_length,
+ response_length);
+ }
+
void GenEncoder::EOT(uint32_t msg) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 79e7b6e..0f835ca 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -88,6 +88,9 @@ namespace gbe
uint32_t deviceID;
/*! simd width for this codegen */
uint32_t simdWidth;
+ DebugInfo DBGInfo;
+ vector<DebugInfo> storedbg;
+ void setDBGInfo(DebugInfo in, bool hasHigh);
////////////////////////////////////////////////////////////////////////
// Encoding functions
////////////////////////////////////////////////////////////////////////
@@ -125,7 +128,7 @@ namespace gbe
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)
- //ALU2(MOV_DF);
+ ALU3(LRP)
ALU2(BRC)
ALU1(BRD)
#undef ALU1
@@ -135,13 +138,11 @@ namespace gbe
virtual void F16TO32(GenRegister dest, GenRegister src0);
virtual void F32TO16(GenRegister dest, GenRegister src0);
- /*! Get double/long exec width */
- virtual int getDoubleExecWidth(void) = 0;
- virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
- virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
/*! Barrier message (to synchronize threads of a workgroup) */
void BARRIER(GenRegister src);
+ /*! Forward the gateway message. */
+ void FWD_GATEWAY_MSG(GenRegister src, uint32_t notifyN = 0);
/*! Memory fence message (to order loads and stores between threads) */
void FENCE(GenRegister dst);
/*! Jump indexed instruction */
@@ -167,7 +168,7 @@ namespace gbe
/*! No-op */
void NOP(void);
/*! Wait instruction (used for the barrier) */
- void WAIT(void);
+ void WAIT(uint32_t n = 0);
/*! Atomic instructions */
virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
/*! Untyped read (upto 4 channels) */
@@ -205,6 +206,19 @@ namespace gbe
bool header_present,
uint32_t simd_mode,
uint32_t return_format);
+ virtual void VME(unsigned char bti,
+ GenRegister dest,
+ GenRegister msg,
+ uint32_t msg_type,
+ uint32_t vme_search_path_lut,
+ uint32_t lut_sub);
+ void setVmeMessage(GenNativeInstruction *insn,
+ unsigned char bti,
+ uint32_t response_length,
+ uint32_t msg_length,
+ uint32_t msg_type,
+ unsigned char vme_search_path_lut,
+ unsigned char lut_sub);
/*! TypedWrite instruction for texture */
virtual void TYPED_WRITE(GenRegister header,
@@ -247,11 +261,20 @@ namespace gbe
virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg) = 0;
virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg) = 0;
GenCompactInstruction *nextCompact(uint32_t opcode);
- virtual bool disableCompact() { return false; }
+ virtual uint32_t getCompactVersion() { return 7; }
GenNativeInstruction *next(uint32_t opcode);
uint32_t n_instruction(void) const { return store.size(); }
virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
GenRegister src1 = GenRegister::null());
+ virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
+ /*! OBlock read */
+ void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ /*! OBlock write */
+ void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+ /*! MBlock read */
+ virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ /*! MBlock write */
+ virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
GBE_CLASS(GenEncoder); //!< Use custom allocators
virtual void alu3(uint32_t opcode, GenRegister dst,
diff --git a/backend/src/backend/gen_insn_compact.cpp b/backend/src/backend/gen_insn_compact.cpp
index d692fff..5de451c 100644
--- a/backend/src/backend/gen_insn_compact.cpp
+++ b/backend/src/backend/gen_insn_compact.cpp
@@ -62,6 +62,13 @@ namespace gbe {
{0b0101000000100000000, 31},
};
+ static compact_table_entry src3_control_table[] = {
+ {0b100000000110000000000001, 0},
+ {0b000000000110000000000001, 1},
+ {0b000000001000000000000001, 2},
+ {0b000000001000000000100001, 3},
+ };
+
static compact_table_entry data_type_table[] = {
{0b000000001000001100, 20},
{0b001000000000000001, 0},
@@ -97,6 +104,41 @@ namespace gbe {
{0b001111111110111101, 28},
};
+ static compact_table_entry gen8_data_type_table[] = {
+ {0b001000000000000000001, 0},
+ {0b001000000000001000000, 1},
+ {0b001000000000001000001, 2},
+ {0b001000000000011000001, 3},
+ {0b001000000000101011101, 4},
+ {0b001000000010111011101, 5},
+ {0b001000000011101000001, 6},
+ {0b001000000011101000101, 7},
+ {0b001000000011101011101, 8},
+ {0b001000001000001000001, 9},
+ {0b001000011000001000000, 10},
+ {0b001000011000001000001, 11},
+ {0b001000101000101000101, 12},
+ {0b001000111000101000100, 13},
+ {0b001000111000101000101, 14},
+ {0b001011100011101011101, 15},
+ {0b001011101011100011101, 16},
+ {0b001011101011101011100, 17},
+ {0b001011101011101011101, 18},
+ {0b001011111011101011100, 19},
+ {0b000000000010000001100, 20},
+ {0b001000000000001011101, 21},
+ {0b001000000000101000101, 22},
+ {0b001000001000001000000, 23},
+ {0b001000101000101000100, 24},
+ {0b001000111000100000100, 25},
+ {0b001001001001000001001, 26},
+ {0b001010111011101011101, 27},
+ {0b001011111011101011101, 28},
+ {0b001001111001101001100, 29},
+ {0b001001001001001001000, 30},
+ {0b001001011001001001000, 31},
+ };
+
static compact_table_entry data_type_decompact[] = {
{0b001000000000000001, 0},
{0b001000000000100000, 1},
@@ -224,6 +266,25 @@ namespace gbe {
};
uint32_t data;
};
+ union Src3ControlBits{
+ struct {
+ uint32_t access_mode:1;
+ uint32_t dependency_control:2;
+ uint32_t nibble_control:1;
+ uint32_t quarter_control:2;
+ uint32_t thread_control:2;
+ uint32_t predicate_control:4;
+ uint32_t predicate_inverse:1;
+ uint32_t execution_size:3;
+ uint32_t conditional_modifier:4;
+ uint32_t acc_wr_control:1;
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t mask_control:1;
+ };
+ uint32_t data;
+ };
+
union DataTypeBits{
struct {
uint32_t dest_reg_file:2;
@@ -238,6 +299,21 @@ namespace gbe {
};
uint32_t data;
};
+ union Gen8DataTypeBits{
+ struct {
+ uint32_t dest_reg_file:2;
+ uint32_t dest_reg_type:4;
+ uint32_t src0_reg_file:2;
+ uint32_t src0_reg_type:4;
+ uint32_t src1_reg_file:2;
+ uint32_t src1_reg_type:4;
+ uint32_t dest_horiz_stride:2;
+ uint32_t dest_address_mode:1;
+ uint32_t pad:11;
+ };
+ uint32_t data;
+ };
+
union SubRegBits {
struct {
uint32_t dest_subreg_nr:5;
@@ -260,48 +336,157 @@ namespace gbe {
uint32_t data;
};
- void decompactInstruction(GenCompactInstruction * p, void *insn) {
- Gen7NativeInstruction *pOut = (union Gen7NativeInstruction *) insn;
+ void decompactInstruction(GenCompactInstruction * p, void *insn, uint32_t insn_version) {
GenNativeInstruction *pNative = (union GenNativeInstruction *) insn;
-
- memset(pOut, 0, sizeof(Gen7NativeInstruction));
- union ControlBits control_bits;
- control_bits.data = control_table[(uint32_t)p->bits1.control_index].bit_pattern;
- pNative->low.low = (uint32_t)p->bits1.opcode | ((control_bits.data & 0xffff) << 8);
- pOut->header.destreg_or_condmod = p->bits1.destreg_or_condmod;
- pOut->header.saturate = control_bits.saturate;
- pOut->header.acc_wr_control = p->bits1.acc_wr_control;
- pOut->header.cmpt_control = p->bits1.cmpt_control;
- pOut->header.debug_control = p->bits1.debug_control;
-
- union DataTypeBits data_type_bits;
- union SubRegBits subreg_bits;
- union SrcRegBits src0_bits;
- data_type_bits.data = data_type_decompact[(uint32_t)p->bits1.data_type_index].bit_pattern;
- subreg_bits.data = subreg_table[(uint32_t)p->bits1.sub_reg_index].bit_pattern;
- src0_bits.data = srcreg_table[p->bits1.src0_index_lo | p->bits2.src0_index_hi << 2].bit_pattern;
-
- pNative->low.high |= data_type_bits.data & 0x7fff;
- pOut->bits1.da1.dest_horiz_stride = data_type_bits.dest_horiz_stride;
- pOut->bits1.da1.dest_address_mode = data_type_bits.dest_address_mode;
- pOut->bits1.da1.dest_reg_nr = p->bits2.dest_reg_nr;
- pOut->bits1.da1.dest_subreg_nr = subreg_bits.dest_subreg_nr;
-
- pOut->bits2.da1.src0_subreg_nr = subreg_bits.src0_subreg_nr;
- pOut->bits2.da1.src0_reg_nr = p->bits2.src0_reg_nr;
- pNative->high.low |= (src0_bits.data << 13);
- pOut->bits2.da1.flag_sub_reg_nr = control_bits.flag_sub_reg_nr;
- pOut->bits2.da1.flag_reg_nr = control_bits.flag_reg_nr;
-
- if(data_type_bits.src1_reg_file == GEN_IMMEDIATE_VALUE) {
- uint32_t imm = (uint32_t)p->bits2.src1_reg_nr | (p->bits2.src1_index<<8);
- pOut->bits3.ud = imm & 0x1000 ? (imm | 0xfffff000) : imm;
+ Gen7NativeInstruction *pOut = (union Gen7NativeInstruction *) insn;
+ /* src3 compact insn */
+ if(p->bits1.opcode == GEN_OPCODE_MAD || p->bits1.opcode == GEN_OPCODE_LRP) {
+#define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
+ assert(insn_version == 8);
+ Gen8NativeInstruction *pOut = (union Gen8NativeInstruction *) insn;
+ memset(pOut, 0, sizeof(Gen8NativeInstruction));
+ union Src3ControlBits control_bits;
+ control_bits.data = src3_control_table[(uint32_t)p->src3Insn.bits1.control_index].bit_pattern;
+ pOut->header.opcode = p->bits1.opcode;
+
+ pOut->bits1.da1.flag_sub_reg_nr = control_bits.flag_sub_reg_nr;
+ pOut->bits1.da1.flag_reg_nr = control_bits.flag_reg_nr;
+ pOut->header.nib_ctrl = control_bits.nibble_control;
+ pOut->header.execution_size = control_bits.execution_size;
+ pOut->header.predicate_control = control_bits.predicate_control;
+ pOut->header.predicate_inverse = control_bits.predicate_inverse;
+ pOut->header.thread_control = control_bits.thread_control;
+ pOut->header.quarter_control = control_bits.quarter_control;
+ pOut->header.dependency_control = control_bits.dependency_control;
+ pOut->header.access_mode = control_bits.access_mode;
+ pOut->header.acc_wr_control = control_bits.acc_wr_control;
+ pOut->header.destreg_or_condmod = control_bits.conditional_modifier;
+ pOut->bits1.da1.mask_control= control_bits.mask_control;
+ pOut->header.cmpt_control = p->bits1.cmpt_control;
+ pOut->header.debug_control = p->bits1.debug_control;
+ pOut->header.saturate = p->src3Insn.bits1.saturate;
+
+ /* dst */
+ pOut->bits1.da3src.dest_reg_nr = p->src3Insn.bits1.dst_reg_nr;
+ pOut->bits1.da3src.dest_writemask = 0xf;
+
+ pOut->bits2.da3src.src0_swizzle = NO_SWIZZLE;
+ pOut->bits2.da3src.src0_subreg_nr = p->src3Insn.bits2.src0_subnr;
+ pOut->bits2.da3src.src0_reg_nr = p->src3Insn.bits2.src0_reg_nr;
+ pOut->bits1.da3src.src0_negate = p->src3Insn.bits1.src_index == 1;
+ pOut->bits2.da3src.src0_rep_ctrl = p->src3Insn.bits1.src0_rep_ctrl;
+
+ pOut->bits2.da3src.src1_swizzle = NO_SWIZZLE;
+ pOut->bits2.da3src.src1_subreg_nr_low = (p->src3Insn.bits2.src1_subnr) & 0x3;
+ pOut->bits3.da3src.src1_subreg_nr_high = (p->src3Insn.bits2.src1_subnr) >> 2;
+ pOut->bits2.da3src.src1_rep_ctrl = p->src3Insn.bits2.src1_rep_ctrl;
+ pOut->bits3.da3src.src1_reg_nr = p->src3Insn.bits2.src1_reg_nr;
+ pOut->bits1.da3src.src1_negate = p->src3Insn.bits1.src_index == 2;
+
+ pOut->bits3.da3src.src2_swizzle = NO_SWIZZLE;
+ pOut->bits3.da3src.src2_subreg_nr = p->src3Insn.bits2.src2_subnr;
+ pOut->bits3.da3src.src2_rep_ctrl = p->src3Insn.bits2.src2_rep_ctrl;
+ pOut->bits3.da3src.src2_reg_nr = p->src3Insn.bits2.src2_reg_nr;
+ pOut->bits1.da3src.src2_negate = p->src3Insn.bits1.src_index == 3;
+#undef NO_SWIZZLE
} else {
- union SrcRegBits src1_bits;
- src1_bits.data = srcreg_table[p->bits2.src1_index].bit_pattern;
- pOut->bits3.da1.src1_subreg_nr = subreg_bits.src1_subreg_nr;
- pOut->bits3.da1.src1_reg_nr = p->bits2.src1_reg_nr;
- pNative->high.high |= (src1_bits.data << 13);
+ if (insn_version == 7) {
+ memset(pOut, 0, sizeof(Gen7NativeInstruction));
+ union ControlBits control_bits;
+ control_bits.data = control_table[(uint32_t)p->bits1.control_index].bit_pattern;
+ pNative->low.low = (uint32_t)p->bits1.opcode | ((control_bits.data & 0xffff) << 8);
+ pOut->header.destreg_or_condmod = p->bits1.destreg_or_condmod;
+ pOut->header.saturate = control_bits.saturate;
+ pOut->header.acc_wr_control = p->bits1.acc_wr_control;
+ pOut->header.cmpt_control = p->bits1.cmpt_control;
+ pOut->header.debug_control = p->bits1.debug_control;
+
+ union DataTypeBits data_type_bits;
+ union SubRegBits subreg_bits;
+ union SrcRegBits src0_bits;
+ data_type_bits.data = data_type_decompact[(uint32_t)p->bits1.data_type_index].bit_pattern;
+ subreg_bits.data = subreg_table[(uint32_t)p->bits1.sub_reg_index].bit_pattern;
+ src0_bits.data = srcreg_table[p->bits1.src0_index_lo | p->bits2.src0_index_hi << 2].bit_pattern;
+
+ pNative->low.high |= data_type_bits.data & 0x7fff;
+ pOut->bits1.da1.dest_horiz_stride = data_type_bits.dest_horiz_stride;
+ pOut->bits1.da1.dest_address_mode = data_type_bits.dest_address_mode;
+ pOut->bits1.da1.dest_reg_nr = p->bits2.dest_reg_nr;
+ pOut->bits1.da1.dest_subreg_nr = subreg_bits.dest_subreg_nr;
+
+ pOut->bits2.da1.src0_subreg_nr = subreg_bits.src0_subreg_nr;
+ pOut->bits2.da1.src0_reg_nr = p->bits2.src0_reg_nr;
+ pNative->high.low |= (src0_bits.data << 13);
+ pOut->bits2.da1.flag_sub_reg_nr = control_bits.flag_sub_reg_nr;
+ pOut->bits2.da1.flag_reg_nr = control_bits.flag_reg_nr;
+
+ if(data_type_bits.src1_reg_file == GEN_IMMEDIATE_VALUE) {
+ uint32_t imm = (uint32_t)p->bits2.src1_reg_nr | (p->bits2.src1_index<<8);
+ pOut->bits3.ud = imm & 0x1000 ? (imm | 0xfffff000) : imm;
+ } else {
+ union SrcRegBits src1_bits;
+ src1_bits.data = srcreg_table[p->bits2.src1_index].bit_pattern;
+ pOut->bits3.da1.src1_subreg_nr = subreg_bits.src1_subreg_nr;
+ pOut->bits3.da1.src1_reg_nr = p->bits2.src1_reg_nr;
+ pNative->high.high |= (src1_bits.data << 13);
+ }
+ } else if (insn_version == 8) {
+ Gen8NativeInstruction *pOut = (union Gen8NativeInstruction *) insn;
+ memset(pOut, 0, sizeof(Gen8NativeInstruction));
+ union ControlBits control_bits;
+ control_bits.data = control_table[(uint32_t)p->bits1.control_index].bit_pattern;
+ pOut->header.opcode = p->bits1.opcode;
+
+ pOut->bits1.da1.flag_sub_reg_nr = control_bits.flag_sub_reg_nr;
+ pOut->bits1.da1.flag_reg_nr = control_bits.flag_reg_nr;
+ pOut->header.saturate = control_bits.saturate;
+ pOut->header.execution_size= control_bits.execution_size;
+ pOut->header.predicate_control= control_bits.predicate_control;
+ pOut->header.predicate_inverse= control_bits.predicate_inverse;
+ pOut->header.thread_control= control_bits.thread_control;
+ pOut->header.quarter_control= control_bits.quarter_control;
+ pOut->header.dependency_control = control_bits.dependency_control;
+ pOut->header.access_mode= control_bits.access_mode;
+ pOut->bits1.da1.mask_control= control_bits.mask_control;
+
+ pOut->header.destreg_or_condmod = p->bits1.destreg_or_condmod;
+ pOut->header.acc_wr_control = p->bits1.acc_wr_control;
+ pOut->header.cmpt_control = p->bits1.cmpt_control;
+ pOut->header.debug_control = p->bits1.debug_control;
+
+ union Gen8DataTypeBits data_type_bits;
+ union SubRegBits subreg_bits;
+ union SrcRegBits src0_bits;
+ data_type_bits.data = gen8_data_type_table[(uint32_t)p->bits1.data_type_index].bit_pattern;
+ subreg_bits.data = subreg_table[(uint32_t)p->bits1.sub_reg_index].bit_pattern;
+ src0_bits.data = srcreg_table[p->bits1.src0_index_lo | p->bits2.src0_index_hi << 2].bit_pattern;
+
+ pOut->bits1.da1.dest_reg_file = data_type_bits.dest_reg_file;
+ pOut->bits1.da1.dest_reg_type = data_type_bits.dest_reg_type;
+ pOut->bits1.da1.dest_horiz_stride = data_type_bits.dest_horiz_stride;
+ pOut->bits1.da1.dest_address_mode = data_type_bits.dest_address_mode;
+ pOut->bits1.da1.dest_reg_nr = p->bits2.dest_reg_nr;
+ pOut->bits1.da1.dest_subreg_nr = subreg_bits.dest_subreg_nr;
+
+ pOut->bits1.da1.src0_reg_file = data_type_bits.src0_reg_file;
+ pOut->bits1.da1.src0_reg_type = data_type_bits.src0_reg_type;
+ pOut->bits2.da1.src0_subreg_nr = subreg_bits.src0_subreg_nr;
+ pOut->bits2.da1.src0_reg_nr = p->bits2.src0_reg_nr;
+ pNative->high.low |= (src0_bits.data << 13);
+
+ pOut->bits2.da1.src1_reg_file = data_type_bits.src1_reg_file;
+ pOut->bits2.da1.src1_reg_type = data_type_bits.src1_reg_type;
+ if(data_type_bits.src1_reg_file == GEN_IMMEDIATE_VALUE) {
+ uint32_t imm = (uint32_t)p->bits2.src1_reg_nr | (p->bits2.src1_index<<8);
+ pOut->bits3.ud = imm & 0x1000 ? (imm | 0xfffff000) : imm;
+ } else {
+ union SrcRegBits src1_bits;
+ src1_bits.data = srcreg_table[p->bits2.src1_index].bit_pattern;
+ pOut->bits3.da1.src1_subreg_nr = subreg_bits.src1_subreg_nr;
+ pOut->bits3.da1.src1_reg_nr = p->bits2.src1_reg_nr;
+ pNative->high.high |= (src1_bits.data << 13);
+ }
+ }
}
}
@@ -349,6 +534,50 @@ namespace gbe {
return r->index;
}
+ int compactControlBitsSrc3(GenEncoder *p, uint32_t quarter, uint32_t execWidth) { // src3 control-table index for the current state, or -1 if not compactable
+
+ const GenInstructionState *s = &p->curr;
+ // quick rejects: states that no entry in src3_control_table can encode
+ if(s->nibControl != 0)
+ return -1;
+ if(s->predicate != GEN_PREDICATE_NONE)
+ return -1;
+ if(s->inversePredicate != 0)
+ return -1;
+ if(s->flag == 1) // only flag register 0 is encodable
+ return -1;
+ if(s->subFlag != 0)
+ return -1;
+
+ Src3ControlBits b;
+ b.data = 0;
+
+ if (execWidth == 8)
+ b.execution_size = GEN_WIDTH_8;
+ else if (execWidth == 16)
+ b.execution_size = GEN_WIDTH_16;
+ else if (execWidth == 4)
+ return -1;
+ else if (execWidth == 1)
+ return -1;
+ else
+ NOT_IMPLEMENTED;
+
+ b.mask_control = s->noMask;
+ b.quarter_control = quarter;
+ b.access_mode = 1; // presumably Align16, required for 3-src instructions — confirm against Gen spec
+
+ compact_table_entry key;
+ key.bit_pattern = b.data;
+
+ compact_table_entry *r = (compact_table_entry *)bsearch(&key, src3_control_table,
+ sizeof(src3_control_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+ if (r == NULL)
+ return -1;
+ return r->index;
+ }
+
+
int compactDataTypeBits(GenEncoder *p, GenRegister *dst, GenRegister *src0, GenRegister *src1) {
// compact does not support any indirect acess
@@ -358,35 +587,65 @@ namespace gbe {
if(src0->file == GEN_IMMEDIATE_VALUE)
return -1;
- DataTypeBits b;
- b.data = 0;
+ compact_table_entry *r;
+ if(p->getCompactVersion() == 7) {
+ DataTypeBits b;
+ b.data = 0;
- b.dest_horiz_stride = dst->hstride == GEN_HORIZONTAL_STRIDE_0 ? GEN_HORIZONTAL_STRIDE_1 : dst->hstride;
- b.dest_address_mode = dst->address_mode;
- b.dest_reg_file = dst->file;
- b.dest_reg_type = dst->type;
+ b.dest_horiz_stride = dst->hstride == GEN_HORIZONTAL_STRIDE_0 ? GEN_HORIZONTAL_STRIDE_1 : dst->hstride;
+ b.dest_address_mode = dst->address_mode;
+ b.dest_reg_file = dst->file;
+ b.dest_reg_type = dst->type;
- b.src0_reg_file = src0->file;
- b.src0_reg_type = src0->type;
+ b.src0_reg_file = src0->file;
+ b.src0_reg_type = src0->type;
- if(src1) {
- b.src1_reg_type = src1->type;
- b.src1_reg_file = src1->file;
- } else {
- // default to zero
- b.src1_reg_type = 0;
- b.src1_reg_file = 0;
- }
+ if(src1) {
+ b.src1_reg_type = src1->type;
+ b.src1_reg_file = src1->file;
+ } else {
+ // default to zero
+ b.src1_reg_type = 0;
+ b.src1_reg_file = 0;
+ }
- compact_table_entry key;
- key.bit_pattern = b.data;
+ compact_table_entry key;
+ key.bit_pattern = b.data;
+
+ r = (compact_table_entry *)bsearch(&key, data_type_table, sizeof(data_type_table)/sizeof(compact_table_entry),
+ sizeof(compact_table_entry), cmp_key);
+ } else if(p->getCompactVersion() == 8) {
+ Gen8DataTypeBits b;
+ b.data = 0;
+
+ b.dest_horiz_stride = dst->hstride == GEN_HORIZONTAL_STRIDE_0 ? GEN_HORIZONTAL_STRIDE_1 : dst->hstride;
+ b.dest_address_mode = dst->address_mode;
+ b.dest_reg_file = dst->file;
+ b.dest_reg_type = dst->type;
- compact_table_entry *r = (compact_table_entry *)bsearch(&key, data_type_table,
- sizeof(data_type_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+ b.src0_reg_file = src0->file;
+ b.src0_reg_type = src0->type;
+
+ if(src1) {
+ b.src1_reg_type = src1->type;
+ b.src1_reg_file = src1->file;
+ } else {
+ // default to zero
+ b.src1_reg_type = 0;
+ b.src1_reg_file = 0;
+ }
+
+ compact_table_entry key;
+ key.bit_pattern = b.data;
+
+ r = (compact_table_entry *)bsearch(&key, gen8_data_type_table, sizeof(gen8_data_type_table)/sizeof(compact_table_entry),
+ sizeof(compact_table_entry), cmp_key);
+ }
if (r == NULL)
return -1;
return r->index;
}
+
int compactSubRegBits(GenEncoder *p, GenRegister *dst, GenRegister *src0, GenRegister *src1) {
SubRegBits b;
b.data = 0;
@@ -440,9 +699,6 @@ namespace gbe {
}
bool compactAlu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src, uint32_t condition, bool split) {
- if(p->disableCompact())
- return false;
-
if(split) {
// TODO support it
return false;
@@ -478,9 +734,6 @@ namespace gbe {
}
bool compactAlu2(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, uint32_t condition, bool split) {
- if(p->disableCompact())
- return false;
-
if(split) {
// TODO support it
return false;
@@ -528,4 +781,46 @@ namespace gbe {
return true;
}
}
+
+ bool compactAlu3(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, GenRegister src2)
+ {
+ if(p->getCompactVersion() < 8)
+ return false;
+ if(opcode != GEN_OPCODE_MAD && opcode != GEN_OPCODE_LRP)
+ return false;
+ if(src0.type != GEN_TYPE_F)
+ return false;
+ assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src0.address_mode == GEN_ADDRESS_DIRECT);
+ assert(src0.nr < 128);
+ assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src1.address_mode == GEN_ADDRESS_DIRECT);
+ assert(src1.nr < 128);
+ assert(src2.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src2.address_mode == GEN_ADDRESS_DIRECT);
+ assert(src2.nr < 128);
+
+ int control_index = compactControlBitsSrc3(p, p->curr.quarterControl, p->curr.execWidth);
+ if( control_index == -1) return false;
+ if( src0.negation + src1.negation + src2.negation > 1)
+ return false;
+
+ GenCompactInstruction *insn = p->nextCompact(opcode);
+ insn->src3Insn.bits1.control_index = control_index;
+ insn->src3Insn.bits1.compact_control = 1;
+ insn->src3Insn.bits1.src_index = src0.negation ? 1 : (src1.negation ? 2: (src2.negation ? 3 : 0));
+ insn->src3Insn.bits1.dst_reg_nr = dst.nr ;
+ insn->src3Insn.bits1.src0_rep_ctrl = src0.vstride == GEN_VERTICAL_STRIDE_0;
+ insn->src3Insn.bits1.saturate = p->curr.saturate;
+ /* bits2 */
+ insn->src3Insn.bits2.src1_rep_ctrl = src1.vstride == GEN_VERTICAL_STRIDE_0;
+ insn->src3Insn.bits2.src2_rep_ctrl = src2.vstride == GEN_VERTICAL_STRIDE_0;
+ insn->src3Insn.bits2.src0_subnr = src0.subnr/4;
+ insn->src3Insn.bits2.src1_subnr = src1.subnr/4;
+ insn->src3Insn.bits2.src2_subnr = src2.subnr/4;
+ insn->src3Insn.bits2.src0_reg_nr = src0.nr;
+ insn->src3Insn.bits2.src1_reg_nr = src1.nr;
+ insn->src3Insn.bits2.src2_reg_nr = src2.nr;
+ return true;
+ }
};
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index d073770..c396626 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -36,6 +36,7 @@ DECL_GEN7_SCHEDULE(UnpackByte, 40, 1, 1)
DECL_GEN7_SCHEDULE(PackLong, 40, 1, 1)
DECL_GEN7_SCHEDULE(UnpackLong, 40, 1, 1)
DECL_GEN7_SCHEDULE(Sample, 160, 1, 1)
+DECL_GEN7_SCHEDULE(Vme, 320, 1, 1)
DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(SpillReg, 20, 1, 1)
DECL_GEN7_SCHEDULE(UnSpillReg, 160, 1, 1)
@@ -43,3 +44,13 @@ DECL_GEN7_SCHEDULE(Atomic, 80, 1, 1)
DECL_GEN7_SCHEDULE(I64MUL, 20, 40, 20)
DECL_GEN7_SCHEDULE(I64SATADD, 20, 40, 20)
DECL_GEN7_SCHEDULE(I64SATSUB, 20, 40, 20)
+DECL_GEN7_SCHEDULE(F64DIV, 20, 40, 20)
+DECL_GEN7_SCHEDULE(CalcTimestamp, 80, 1, 1)
+DECL_GEN7_SCHEDULE(StoreProfiling, 80, 1, 1)
+DECL_GEN7_SCHEDULE(WorkGroupOp, 80, 1, 1)
+DECL_GEN7_SCHEDULE(SubGroupOp, 80, 1, 1)
+DECL_GEN7_SCHEDULE(Printf, 80, 1, 1)
+DECL_GEN7_SCHEDULE(OBRead, 80, 1, 1)
+DECL_GEN7_SCHEDULE(OBWrite, 80, 1, 1)
+DECL_GEN7_SCHEDULE(MBRead, 80, 1, 1)
+DECL_GEN7_SCHEDULE(MBWrite, 80, 1, 1)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index 358a2ce..245d17a 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -41,26 +41,29 @@
* ==============================
*
* We try to limit the register pressure.
- * Well, this is a hard problem and we have a decent strategy now that we called
- * "zero cycled LIFO scheduling".
- * We use a local forward list scheduling and we schedule the instructions in a
- * LIFO order i.e. as a stack. Basically, we take the most recent instruction
- * and schedule it right away. Obviously we ignore completely the real latencies
- * and throuputs and just simulate instructions that are issued and completed in
- * zero cycle. For the complex kernels we already have (like menger sponge),
- * this provides a pretty good strategy enabling SIMD16 code generation where
- * when scheduling is deactivated, even SIMD8 fails
*
- * One may argue that this strategy is bad, latency wise. This is not true since
- * the register allocator will anyway try to burn as many registers as possible.
- * So, there is still opportunities to schedule after register allocation.
+ * Finding an instruction scheduling policy that achieves the theoretical minimum
+ * number of registers required in a basic block is an NP problem. We have to use
+ * heuristic factors to simplify the algorithm. Much research indicates that
+ * bottom-up list scheduling is much better than the top-down method in terms of
+ * register pressure. I chose one such research paper as our target. The paper
+ * is as follows:
*
- * Our idea seems to work decently. There is however a strong research article
- * that is able to near-optimally reschudle the instructions to minimize
- * register use. This is:
+ * "Register-Sensitive Selection, Duplication, and Sequencing of Instructions"
+ * It uses bottom-up list scheduling with a Sethi-Ullman label as a
+ * heuristic number. As we will do cycle-aware scheduling after register
+ * allocation, we don't need to bother with a cycle-related heuristic number here.
+ * I just skipped the EST computation and usage part of the algorithm.
*
- * "Minimum Register Instruction Sequence Problem: Revisiting Optimal Code
- * Generation for DAGs"
+ * It turns out this algorithm works well. It could reduce the register spilling
+ * in clBlas's sgemmBlock kernel from 83+ to only 20.
+ *
+ * Although this scheduling method seems to lower the ILP (instruction level parallelism),
+ * it's not a big issue, because we will allocate as many different registers as possible
+ * in the following register allocation stage, and we will do an after-allocation
+ * instruction scheduling pass which will try to get as much ILP as possible.
+ *
+ * FIXME: we only need to do this scheduling when a BB is really under high register pressure.
*
* After the register allocation
* ==============================
@@ -114,7 +117,7 @@ namespace gbe
struct ScheduleDAGNode
{
INLINE ScheduleDAGNode(SelectionInstruction &insn) :
- insn(insn), refNum(0), retiredCycle(0), preRetired(false), readDistance(0x7fffffff) {}
+ insn(insn), refNum(0), depNum(0), retiredCycle(0), preRetired(false), readDistance(0x7fffffff) {}
bool dependsOn(ScheduleDAGNode *node) const {
GBE_ASSERT(node != NULL);
for (auto child : node->children)
@@ -128,6 +131,10 @@ namespace gbe
SelectionInstruction &insn;
/*! Number of nodes that point to us (i.e. nodes we depend on) */
uint32_t refNum;
+ /*! Number of nodes that we depends on. */
+ uint32_t depNum;
+ /*! Register pressure. */
+ uint32_t regNum;
/*! Cycle when the instruction is retired */
uint32_t retiredCycle;
bool preRetired;
@@ -192,8 +199,10 @@ namespace gbe
static const uint32_t MAX_ACC_REGISTER = 1u;
/*! Maximum number of *physical* tm registers */
static const uint32_t MAX_TM_REGISTER = 1u;
+ /*! Maximum number of state registers */
+ static const uint32_t MAX_ST_REGISTER = 2u;
/*! Maximum number of *physical* arf registers */
- static const uint32_t MAX_ARF_REGISTER = MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_TM_REGISTER;
+ static const uint32_t MAX_ARF_REGISTER = MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_TM_REGISTER + MAX_ST_REGISTER;
/*! Stores the last node that wrote to a register / memory ... */
vector<ScheduleDAGNode*> nodes;
/*! store nodes each node depends on */
@@ -218,6 +227,8 @@ namespace gbe
/*! Schedule the DAG, pre register allocation and post register allocation. */
void preScheduleDAG(SelectionBlock &bb, int32_t insnNum);
void postScheduleDAG(SelectionBlock &bb, int32_t insnNum);
+
+ void computeRegPressure(ScheduleDAGNode *node, map<ScheduleDAGNode *, int32_t> ®PressureMap);
/*! To limit register pressure or limit insn latency problems */
SchedulePolicy policy;
/*! Make ScheduleListNode allocation faster */
@@ -277,6 +288,7 @@ namespace gbe
ScheduleListNode *dep = scheduler.newScheduleListNode(node0, depMode);
node0->refNum++;
node1->children.push_back(dep);
+ node1->depNum++;
auto it = deps.find(node0);
if (it != deps.end()) {
it->second.push_back(node1);
@@ -333,6 +345,9 @@ namespace gbe
return grfNum + MAX_FLAG_REGISTER + nr;
} else if (file == GEN_ARF_TM) {
return grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER;
+ } else if (file == GEN_ARF_STATE) {
+ GBE_ASSERT(nr < MAX_ST_REGISTER);
+ return grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_TM_REGISTER + nr;
} else {
NOT_SUPPORTED;
return 0;
@@ -500,7 +515,8 @@ namespace gbe
// Consider barriers and wait are reading memory (local and global)
if (insn.opcode == SEL_OP_BARRIER ||
insn.opcode == SEL_OP_FENCE ||
- insn.opcode == SEL_OP_WAIT) {
+ insn.opcode == SEL_OP_WAIT ||
+ insn.opcode == SEL_OP_WORKGROUP_OP) {
const uint32_t memIndex = tracker.getMemoryIndex();
tracker.addDependency(node, memIndex, READ_AFTER_WRITE);
}
@@ -562,7 +578,8 @@ namespace gbe
// Consider barriers and wait are reading memory (local and global)
if (insn.opcode == SEL_OP_BARRIER ||
insn.opcode == SEL_OP_FENCE ||
- insn.opcode == SEL_OP_WAIT) {
+ insn.opcode == SEL_OP_WAIT ||
+ insn.opcode == SEL_OP_WORKGROUP_OP) {
const uint32_t memIndex = tracker.getMemoryIndex();
tracker.addDependency(memIndex, node, WRITE_AFTER_READ);
}
@@ -589,7 +606,11 @@ namespace gbe
|| node->insn.opcode == SEL_OP_ENDIF
|| node->insn.opcode == SEL_OP_WHILE
|| node->insn.opcode == SEL_OP_READ_ARF
- || node->insn.opcode == SEL_OP_BARRIER)
+ || node->insn.opcode == SEL_OP_BARRIER
+ || node->insn.opcode == SEL_OP_CALC_TIMESTAMP
+ || node->insn.opcode == SEL_OP_STORE_PROFILING
+ || node->insn.opcode == SEL_OP_WAIT
+ || node->insn.opcode == SEL_OP_WORKGROUP_OP)
tracker.makeBarrier(insnID, insnNum);
}
@@ -605,8 +626,95 @@ namespace gbe
return insnNum;
}
+ /*! Will sort children by register pressure in increasing order */
+ inline bool cmp(const ScheduleDAGNode *v0, const ScheduleDAGNode *v1) {
+ return v0->regNum < v1->regNum;
+ }
+
+ /* Recursively compute heuristic Sethi-Ullman number for each node. */
+ void SelectionScheduler::computeRegPressure(ScheduleDAGNode *node,
+ map<ScheduleDAGNode *, int32_t> ®PressureMap) {
+ if (regPressureMap.find(node) != regPressureMap.end()) {
+ GBE_ASSERT(node->regNum == (uint32_t)regPressureMap.find(node)->second);
+ return;
+ }
+ if (node->refNum == 0) {
+ node->regNum = 0;
+ regPressureMap.insert(std::make_pair(node, 0));
+ return;
+ }
+ auto &children = tracker.deps.find(node)->second;
+ for (auto child : children) {
+ computeRegPressure(child, regPressureMap);
+ }
+ std::sort(children.begin(), children.end(), cmp);
+ uint32_t maxRegNum = 0;
+ int32_t i = 0;
+ for (auto &child : children) {
+ if (child->regNum + children.size() - i > maxRegNum)
+ maxRegNum = child->regNum + node->children.size() - i;
+ ++i;
+ }
+ node->regNum = maxRegNum;
+ regPressureMap.insert(std::make_pair(node, maxRegNum));
+ return;
+ }
+
void SelectionScheduler::preScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
- printf("Not implemented yet. \n");
+ set<ScheduleDAGNode *> rootNodes;
+ for (int32_t i = 0; i < insnNum; i++) {
+ ScheduleDAGNode *node = tracker.insnNodes[i];
+ if (node->depNum == 0)
+ rootNodes.insert(node);
+ }
+ map<ScheduleDAGNode *, int32_t> regPressureMap;
+ map<ScheduleDAGNode *, int32_t> parentIndexMap;
+ for (auto node : rootNodes) {
+ computeRegPressure(node, regPressureMap);
+ parentIndexMap.insert(std::make_pair(node, INT_MAX));
+ }
+ set<ScheduleDAGNode *> readySet(rootNodes.begin(), rootNodes.end());
+ set<ScheduleDAGNode *> scheduledSet;
+ int32_t j = insnNum;
+
+ // Now, start the scheduling.
+ // Each time, find the smallest pair (parentIndex[node], regPressure[node])
+ // as the best node to schedule.
+ while(readySet.size()) {
+ ScheduleDAGNode * bestNode = NULL;
+ int32_t minRegNum = INT_MAX;
+ int32_t minParentIndex = INT_MAX;
+ for(auto node : readySet) {
+ GBE_ASSERT(scheduledSet.contains(node) == false);
+ if (parentIndexMap.find(node)->second < minParentIndex) {
+ bestNode = node;
+ minParentIndex = parentIndexMap.find(node)->second;
+ minRegNum = regPressureMap.find(node)->second;
+ }
+ else if (parentIndexMap.find(node)->second == minParentIndex) {
+ if (regPressureMap.find(node)->second < minRegNum) {
+ bestNode = node;
+ minRegNum = regPressureMap.find(node)->second;
+ }
+ }
+ }
+ for( auto node : tracker.deps.find(bestNode)->second ) {
+ if (node == NULL)
+ continue;
+ node->depNum--;
+ if (parentIndexMap.find(node) != parentIndexMap.end())
+ parentIndexMap.find(node)->second = j;
+ else
+ parentIndexMap.insert(std::make_pair(node, j));
+ if (node->depNum == 0 && scheduledSet.contains(node) == false)
+ readySet.insert(node);
+ }
+ bb.prepend(&bestNode->insn);
+ readySet.erase(bestNode);
+ scheduledSet.insert(bestNode);
+ --j;
+ }
+ GBE_ASSERT(insnNum == (int32_t)bb.insnList.size());
}
void SelectionScheduler::postScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
@@ -714,8 +822,6 @@ namespace gbe
void schedulePreRegAllocation(GenContext &ctx, Selection &selection) {
if (OCL_PRE_ALLOC_INSN_SCHEDULE) {
SelectionScheduler scheduler(ctx, selection, PRE_ALLOC);
- // FIXME, need to implement proper pre reg allocation scheduling algorithm.
- return;
for (auto &bb : *selection.blockList) {
const int32_t insnNum = scheduler.buildDAG(bb);
bb.insnList.clear();
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7eec2b3..6cfa87f 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -187,7 +187,10 @@ namespace gbe
this->opcode == SEL_OP_ATOMIC ||
this->opcode == SEL_OP_BYTE_GATHER ||
this->opcode == SEL_OP_SAMPLE ||
- this->opcode == SEL_OP_DWORD_GATHER;
+ this->opcode == SEL_OP_VME ||
+ this->opcode == SEL_OP_DWORD_GATHER ||
+ this->opcode == SEL_OP_OBREAD ||
+ this->opcode == SEL_OP_MBREAD;
}
bool SelectionInstruction::modAcc(void) const {
@@ -209,7 +212,9 @@ namespace gbe
this->opcode == SEL_OP_WRITE64 ||
this->opcode == SEL_OP_ATOMIC ||
this->opcode == SEL_OP_BYTE_SCATTER ||
- this->opcode == SEL_OP_TYPED_WRITE;
+ this->opcode == SEL_OP_TYPED_WRITE ||
+ this->opcode == SEL_OP_OBWRITE ||
+ this->opcode == SEL_OP_MBWRITE;
}
bool SelectionInstruction::isBranch(void) const {
@@ -278,7 +283,7 @@ namespace gbe
/*! Instruction that needs to be matched */
const ir::Instruction &insn;
/*! When sources have been overwritten, a child insn cannot be merged */
- uint32_t mergeable:ir::Instruction::MAX_SRC_NUM;
+ uint64_t mergeable:ir::Instruction::MAX_SRC_NUM;
/*! Number of children we have in the pattern */
uint32_t childNum:7;
/*! A root must be generated, no matter what */
@@ -361,8 +366,10 @@ namespace gbe
bool has32X32Mul() const { return bHas32X32Mul; }
void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
bool hasLongType() const { return bHasLongType; }
+ bool hasDoubleType() const { return bHasDoubleType; }
bool hasHalfType() const { return bHasHalfType; }
void setHasLongType(bool b) { bHasLongType = b; }
+ void setHasDoubleType(bool b) { bHasDoubleType = b; }
void setHasHalfType(bool b) { bHasHalfType = b; }
bool hasLongRegRestrict() { return bLongRegRestrict; }
void setLongRegRestrict(bool b) { bLongRegRestrict = b; }
@@ -401,6 +408,7 @@ namespace gbe
return GenRegister::offset(reg, nr, subnr);
}
+ GenRegister getLaneIDReg();
/*! Implement public class */
INLINE uint32_t getRegNum(void) const { return file.regNum(); }
/*! Implements public interface */
@@ -454,6 +462,8 @@ namespace gbe
bool hasQWord(const ir::Instruction &insn);
/*! A root instruction needs to be generated */
bool isRoot(const ir::Instruction &insn) const;
+ /*! Set debug information to Selection */
+ void setDBGInfo_SEL(DebugInfo in) { DBGInfo = in; }
/*! To handle selection block allocation */
DECL_POOL(SelectionBlock, blockPool);
@@ -491,8 +501,11 @@ namespace gbe
uint32_t vectorNum;
/*! If true, generate code backward */
bool bwdCodeGeneration;
+ DebugInfo DBGInfo;
/*! To make function prototypes more readable */
typedef const GenRegister &Reg;
+ /*! If true, the thread map has already been stored */
+ bool storeThreadMap;
/*! Check for destination register. Major purpose is to find
out partially updated dst registers. These registers will
@@ -513,8 +526,6 @@ namespace gbe
INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
ALU1(MOV)
ALU1(READ_ARF)
- ALU1WithTemp(MOV_DF)
- ALU1WithTemp(LOAD_DF_IMM)
ALU1(LOAD_INT64_IMM)
ALU1(RNDZ)
ALU1(RNDE)
@@ -545,6 +556,7 @@ namespace gbe
ALU2(MACH)
ALU1(LZD)
ALU3(MAD)
+ ALU3(LRP)
ALU2WithTemp(MUL_HI)
ALU1(FBH)
ALU1(FBL)
@@ -617,21 +629,21 @@ namespace gbe
/*! No-op */
void NOP(void);
/*! Wait instruction (used for the barrier) */
- void WAIT(void);
+ void WAIT(uint32_t n = 0);
/*! Atomic instruction */
- void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp);
+ void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, GenRegister bti, vector<GenRegister> temps);
/*! Read 64 bits float/int array */
- void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, const GenRegister bti, bool native_long, GenRegister *flagTemp);
+ void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, const GenRegister bti, bool native_long, vector<GenRegister> temps);
/*! Write 64 bits float/int array */
- void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, GenRegister bti, bool native_long, GenRegister *flagTemp);
+ void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, GenRegister bti, bool native_long, vector<GenRegister> temps);
/*! Untyped read (up to 4 elements) */
- void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
+ void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, GenRegister bti, vector<GenRegister> temps);
/*! Untyped write (up to 4 elements) */
- void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
+ void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, GenRegister bti, vector<GenRegister> temps);
/*! Byte gather (for unaligned bytes, shorts and ints) */
- void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
+ void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, vector<GenRegister> temps);
/*! Byte scatter (for unaligned bytes, shorts and ints) */
- void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
+ void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, vector <GenRegister> temps);
/*! DWord scatter (for constant cache read) */
void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
/*! Unpack the uint to charN */
@@ -658,16 +670,46 @@ namespace gbe
void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
/*! Encode sample instructions */
void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform);
+ /*! Encode vme instructions */
+ void VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type, uint32_t vme_search_path_lut, uint32_t lut_sub);
/*! Encode typed write instructions */
void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
/*! Get image information */
void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
+ /*! Calculate the timestamp */
+ void CALC_TIMESTAMP(GenRegister ts[5], int tsN, GenRegister tmp, uint32_t pointNum, uint32_t tsType);
+ /*! Store the profiling info */
+ void STORE_PROFILING(uint32_t profilingType, uint32_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister ts[5], int tsNum);
+ /*! Printf */
+ void PRINTF(GenRegister dst, uint8_t bti, GenRegister tmp0, GenRegister tmp1, GenRegister src[8],
+ int srcNum, uint16_t num, bool isContinue, uint32_t totalSize);
/*! Multiply 64-bit integers */
void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool native_long);
/*! 64-bit integer division */
void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_int);
/*! 64-bit integer remainder of division */
void I64REM(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_int);
+ /*! double division */
+ void F64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmpNum);
+ /*! Work Group Operations */
+ void WORKGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
+ GenRegister tmpData1,
+ GenRegister localThreadID, GenRegister localThreadNUM,
+ GenRegister tmpData2, GenRegister slmOff,
+ vector<GenRegister> msg, uint32_t msgSizeReq,
+ GenRegister localBarrier);
+ /*! Sub Group Operations */
+ void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
+ GenRegister tmpData1, GenRegister tmpData2);
+ /*! Oblock read */
+ void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+ /*! Oblock write */
+ void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+ /*! Media block read */
+ void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+ /*! Media block write */
+ void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+
/* common functions for both binary instruction and sel_cmp and compare instruction.
It will handle the IMM or normal register assignment, and will try to avoid LOADI
as much as possible. */
@@ -735,6 +777,15 @@ namespace gbe
GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
}
+ INLINE vector<GenRegister> getBTITemps(const ir::BTI &bti) {
+ vector<GenRegister> temps;
+ if (!bti.isConst) {
+ temps.push_back(selReg(reg(ir::FAMILY_WORD, true), ir::TYPE_U16));
+ temps.push_back(selReg(reg(ir::FAMILY_DWORD, true), ir::TYPE_U32));
+ }
+ return temps;
+ }
+
/*! Use custom allocators */
GBE_CLASS(Opaque);
friend class SelectionBlock;
@@ -744,6 +795,7 @@ namespace gbe
uint32_t currAuxLabel;
bool bHas32X32Mul;
bool bHasLongType;
+ bool bHasDoubleType;
bool bHasHalfType;
bool bLongRegRestrict;
uint32_t ldMsgOrder;
@@ -786,8 +838,9 @@ namespace gbe
ctx(ctx), block(NULL),
curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
- stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
- bHas32X32Mul(false), bHasLongType(false), bHasHalfType(false), bLongRegRestrict(false),
+ stateNum(0), vectorNum(0), bwdCodeGeneration(false), storeThreadMap(false),
+ currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false), bHasLongType(false),
+ bHasDoubleType(false), bHasHalfType(false), bLongRegRestrict(false),
ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false)
{
const ir::Function &fn = ctx.getFunction();
@@ -847,6 +900,7 @@ namespace gbe
GBE_ASSERT(dstNum <= SelectionInstruction::MAX_DST_NUM && srcNum <= SelectionInstruction::MAX_SRC_NUM);
GBE_ASSERT(this->block != NULL);
SelectionInstruction *insn = this->create(opcode, dstNum, srcNum);
+ insn->setDBGInfo(DBGInfo);
if (this->bwdCodeGeneration)
this->bwdList.push_back(insn);
else
@@ -1110,7 +1164,7 @@ namespace gbe
case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
case FAMILY_QWORD:
if (!this->hasLongType()) {
- SEL_REG(df16grf, df8grf, df1grf);
+ SEL_REG(ud16grf, ud8grf, ud1grf);
} else {
SEL_REG(ul16grf, ul8grf, ul1grf);
}
@@ -1153,7 +1207,19 @@ namespace gbe
SelectionInstruction *insn = this->appendInsn(SEL_OP_JMPI, 0, 1);
insn->src(0) = src;
insn->index = index.value();
- insn->extra.longjmp = abs(index - origin) > 800;
+ ir::LabelIndex start, end;
+ if (origin.value() < index.value()) {
+ // Forward Jump, need to exclude the target BB. Because we
+ // need to jump to the beginning of it.
+ start = origin;
+ end = ir::LabelIndex(index.value() - 1);
+ } else {
+ start = index;
+ end = origin;
+ }
+ // FIXME, this longjmp check is too hacky. We need to support instruction
+ // insertion at code emission stage in the future.
+ insn->extra.longjmp = ctx.getFunction().getDistance(start, end) > 3000;
return insn->extra.longjmp ? 2 : 1;
}
@@ -1227,10 +1293,11 @@ namespace gbe
}
void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
- uint32_t srcNum, Reg src0,
- Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp) {
- unsigned dstNum = flagTemp == NULL ? 1 : 2;
- SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum, srcNum + 1);
+ uint32_t msgPayload, Reg src0,
+ Reg src1, Reg src2, GenRegister bti,
+ vector<GenRegister> temps) {
+ unsigned dstNum = 1 + temps.size();
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum, msgPayload + 1);
if (bti.file != GEN_IMMEDIATE_VALUE) {
insn->state.flag = 0;
@@ -1238,17 +1305,21 @@ namespace gbe
}
insn->dst(0) = dst;
- if(flagTemp) insn->dst(1) = *flagTemp;
+ if(temps.size()) {
+ insn->dst(1) = temps[0];
+ insn->dst(2) = temps[1];
+ }
insn->src(0) = src0;
- if(srcNum > 1) insn->src(1) = src1;
- if(srcNum > 2) insn->src(2) = src2;
- insn->src(srcNum) = bti;
+ if(msgPayload > 1) insn->src(1) = src1;
+ if(msgPayload > 2) insn->src(2) = src2;
+ insn->src(msgPayload) = bti;
+
insn->extra.function = function;
- insn->extra.elem = srcNum;
+ insn->extra.elem = msgPayload;
SelectionVector *vector = this->appendVector();
- vector->regNum = srcNum;
+ vector->regNum = msgPayload; //bti not included in SelectionVector
vector->offsetID = 0;
vector->reg = &insn->src(0);
vector->isSrc = 1;
@@ -1256,7 +1327,11 @@ namespace gbe
void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
- void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
+ void Selection::Opaque::WAIT(uint32_t n)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_WAIT, 0, 0);
+ insn->extra.waitType = n;
+ }
void Selection::Opaque::READ64(Reg addr,
const GenRegister *dst,
@@ -1264,14 +1339,14 @@ namespace gbe
uint32_t elemNum,
const GenRegister bti,
bool native_long,
- GenRegister *flagTemp)
+ vector<GenRegister> temps)
{
SelectionInstruction *insn = NULL;
SelectionVector *srcVector = NULL;
SelectionVector *dstVector = NULL;
if (!native_long) {
- unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
+ unsigned dstNum = elemNum + temps.size();
insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
srcVector = this->appendVector();
dstVector = this->appendVector();
@@ -1280,10 +1355,12 @@ namespace gbe
insn->dst(elemID) = dst[elemID];
// flagTemp don't need to be put in SelectionVector
- if (flagTemp)
- insn->dst(elemNum) = *flagTemp;
+ if (temps.size()) {
+ insn->dst(elemNum) = temps[0];
+ insn->dst(elemNum + 1) = temps[1];
+ }
} else {
- unsigned dstNum = flagTemp == NULL ? elemNum*2 : elemNum*2+1;
+ unsigned dstNum = elemNum*2 + temps.size();
insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
srcVector = this->appendVector();
dstVector = this->appendVector();
@@ -1295,8 +1372,10 @@ namespace gbe
insn->dst(elemID + elemNum) = dst[elemID];
// flagTemp don't need to be put in SelectionVector
- if (flagTemp)
- insn->dst(2*elemNum) = *flagTemp;
+ if (temps.size()) {
+ insn->dst(2*elemNum) = temps[0];
+ insn->dst(2*elemNum + 1) = temps[1];
+ }
}
if (bti.file != GEN_IMMEDIATE_VALUE) {
@@ -1324,9 +1403,9 @@ namespace gbe
const GenRegister *dst,
uint32_t elemNum,
GenRegister bti,
- GenRegister *flagTemp)
+ vector<GenRegister> temps)
{
- unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
+ unsigned dstNum = elemNum + temps.size();
SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, dstNum, 2);
SelectionVector *srcVector = this->appendVector();
SelectionVector *dstVector = this->appendVector();
@@ -1335,8 +1414,10 @@ namespace gbe
// Regular instruction to encode
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->dst(elemID) = dst[elemID];
- if (flagTemp)
- insn->dst(elemNum) = *flagTemp;
+ if (temps.size()) {
+ insn->dst(elemNum) = temps[0];
+ insn->dst(elemNum + 1) = temps[1];
+ }
insn->src(0) = addr;
insn->src(1) = bti;
@@ -1365,13 +1446,13 @@ namespace gbe
uint32_t srcNum,
GenRegister bti,
bool native_long,
- GenRegister *flagTemp)
+ vector<GenRegister> temps)
{
SelectionVector *vector = NULL;
SelectionInstruction *insn = NULL;
if (!native_long) {
- unsigned dstNum = flagTemp == NULL ? 0 : 1;
+ unsigned dstNum = temps.size();
insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 2);
vector = this->appendVector();
// Register layout:
@@ -1382,8 +1463,10 @@ namespace gbe
insn->src(elemID + 1) = src[elemID];
insn->src(srcNum+1) = bti;
- if (flagTemp)
- insn->dst(0) = *flagTemp;
+ if (temps.size()) {
+ insn->dst(0) = temps[0];
+ insn->dst(1) = temps[1];
+ }
insn->extra.elem = srcNum;
vector->regNum = srcNum + 1;
@@ -1391,7 +1474,7 @@ namespace gbe
vector->reg = &insn->src(0);
vector->isSrc = 1;
} else { // handle the native long case
- unsigned dstNum = flagTemp == NULL ? srcNum : srcNum+1;
+ unsigned dstNum = srcNum + temps.size();
// Register layout:
// dst: srcNum, (flagTemp)
// src: srcNum, addr, srcNum, bti.
@@ -1411,8 +1494,10 @@ namespace gbe
for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
insn->dst(elemID) = tmp[0];
- if (flagTemp)
- insn->dst(srcNum) = *flagTemp;
+ if (temps.size()) {
+ insn->dst(srcNum) = temps[0];
+ insn->dst(srcNum + 1) = temps[1];
+ }
insn->extra.elem = srcNum;
vector->regNum = srcNum + 1;
@@ -1431,10 +1516,11 @@ namespace gbe
const GenRegister *src,
uint32_t elemNum,
GenRegister bti,
- GenRegister *flagTemp)
+ vector<GenRegister> temps)
{
- unsigned dstNum = flagTemp == NULL ? 0 : 1;
- SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, dstNum, elemNum+2);
+ unsigned dstNum = temps.size();
+ unsigned srcNum = elemNum + 2 + temps.size();
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, dstNum, srcNum);
SelectionVector *vector = this->appendVector();
if (bti.file != GEN_IMMEDIATE_VALUE) {
@@ -1442,14 +1528,17 @@ namespace gbe
insn->state.subFlag = 1;
}
- if (flagTemp) insn->dst(0) = *flagTemp;
// Regular instruction to encode
insn->src(0) = addr;
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->src(elemID+1) = src[elemID];
insn->src(elemNum+1) = bti;
- if (flagTemp)
- insn->src(elemNum+2) = *flagTemp;
+ if (temps.size()) {
+ insn->dst(0) = temps[0];
+ insn->dst(1) = temps[1];
+ insn->src(elemNum + 2) = temps[0];
+ insn->src(elemNum + 3) = temps[1];
+ }
insn->extra.elem = elemNum;
// Sends require contiguous allocation for the sources
@@ -1459,8 +1548,11 @@ namespace gbe
vector->isSrc = 1;
}
- void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
- unsigned dstNum = flagTemp == NULL ? 1 : 2;
+ void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr,
+ uint32_t elemSize,
+ GenRegister bti,
+ vector<GenRegister> temps) {
+ unsigned dstNum = 1 + temps.size();
SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, dstNum, 2);
SelectionVector *srcVector = this->appendVector();
SelectionVector *dstVector = this->appendVector();
@@ -1476,8 +1568,10 @@ namespace gbe
insn->src(0) = addr;
insn->src(1) = bti;
insn->dst(0) = dst;
- if (flagTemp)
- insn->dst(1) = *flagTemp;
+ if (temps.size()) {
+ insn->dst(1) = temps[0];
+ insn->dst(2) = temps[1];
+ }
insn->extra.elem = elemSize;
@@ -1493,8 +1587,9 @@ namespace gbe
srcVector->reg = &insn->src(0);
}
- void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
- unsigned dstNum = flagTemp == NULL ? 0 : 1;
+ void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize,
+ GenRegister bti, vector<GenRegister> temps) {
+ unsigned dstNum = temps.size();
SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, dstNum, 3);
SelectionVector *vector = this->appendVector();
@@ -1503,8 +1598,10 @@ namespace gbe
insn->state.subFlag = 1;
}
- if (flagTemp)
- insn->dst(0) = *flagTemp;
+ if (temps.size()) {
+ insn->dst(0) = temps[0];
+ insn->dst(1) = temps[1];
+ }
// Instruction to encode
insn->src(0) = addr;
insn->src(1) = src;
@@ -1617,6 +1714,15 @@ namespace gbe
insn->dst(i + 1) = tmp[i];
}
+ void Selection::Opaque::F64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmpNum) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_F64DIV, tmpNum + 1, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < tmpNum; i++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
insn->dst(0) = dst;
@@ -1661,6 +1767,29 @@ namespace gbe
insn->src(1) = src1;
}
+ GenRegister Selection::Opaque::getLaneIDReg()
+ {
+ const GenRegister laneID = GenRegister::immv(0x76543210);
+ ir::Register r = reg(ir::RegisterFamily::FAMILY_WORD);
+ const GenRegister dst = selReg(r, ir::TYPE_U16);
+
+ uint32_t execWidth = curr.execWidth;
+ if (execWidth == 8)
+ MOV(dst, laneID);
+ else {
+ push();
+ curr.execWidth = 8;
+ curr.noMask = 1;
+ MOV(dst, laneID);
+            //Packed Unsigned Half-Byte Integer Vector does not work
+            //have to emulate it by adding 8 to the signed vector
+ const GenRegister eight = GenRegister::immuw(8);
+ ADD(GenRegister::offset(dst, 0, 16), dst, eight);
+ pop();
+ }
+ return dst;
+ }
+
void Selection::Opaque::I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_I64CMP, 3, 2);
insn->src(0) = src0;
@@ -1750,6 +1879,268 @@ namespace gbe
insn->dst(i + 1) = tmp[i];
}
+ void Selection::Opaque::CALC_TIMESTAMP(GenRegister ts[5], int tsN, GenRegister tmp, uint32_t pointNum, uint32_t tsType) {
+ SelectionInstruction *insn = NULL;
+ if (!this->hasLongType()) {
+ insn = this->appendInsn(SEL_OP_CALC_TIMESTAMP, tsN + 1, tsN);
+ } else {// No need for tmp
+ insn = this->appendInsn(SEL_OP_CALC_TIMESTAMP, tsN, tsN);
+ }
+
+ for (int i = 0; i < tsN; i++) {
+ insn->src(i) = ts[i];
+ insn->dst(i) = ts[i];
+ }
+
+ if (!this->hasLongType())
+ insn->dst(tsN) = tmp;
+
+ insn->extra.pointNum = static_cast<uint16_t>(pointNum);
+ insn->extra.timestampType = static_cast<uint16_t>(tsType);
+ }
+
+ void Selection::Opaque::STORE_PROFILING(uint32_t profilingType, uint32_t bti,
+ GenRegister tmp0, GenRegister tmp1, GenRegister ts[5], int tsNum) {
+ if (tsNum == 3) { // SIMD16 mode
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_STORE_PROFILING, 1, 3);
+ for (int i = 0; i < 3; i++)
+ insn->src(i) = ts[i];
+ insn->dst(0) = tmp0;
+
+ insn->extra.profilingType = static_cast<uint16_t>(profilingType);
+ insn->extra.profilingBTI = static_cast<uint16_t>(bti);
+ } else { // SIMD8 mode
+ GBE_ASSERT(tsNum == 5);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_STORE_PROFILING, 2, 5);
+ SelectionVector *dstVector = this->appendVector();
+ for (int i = 0; i < 5; i++)
+ insn->src(i) = ts[i];
+ insn->dst(0) = tmp0;
+ insn->dst(1) = tmp1;
+
+ dstVector->regNum = 2;
+ dstVector->isSrc = 0;
+ dstVector->offsetID = 0;
+ dstVector->reg = &insn->dst(0);
+
+ insn->extra.profilingType = static_cast<uint16_t>(profilingType);
+ insn->extra.profilingBTI = static_cast<uint16_t>(bti);
+ }
+ }
+
+ void Selection::Opaque::PRINTF(GenRegister dst, uint8_t bti, GenRegister tmp0, GenRegister tmp1,
+ GenRegister src[8], int srcNum, uint16_t num, bool isContinue, uint32_t totalSize) {
+ if (isContinue) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum + 1);
+ SelectionVector *vector = this->appendVector();
+
+ for (int i = 0; i < srcNum; i++)
+ insn->src(i) = src[i];
+
+ insn->src(srcNum) = tmp0;
+
+ insn->dst(0) = dst;
+ insn->dst(1) = tmp0;
+ insn->dst(2) = tmp1;
+
+ vector->regNum = 2;
+ vector->reg = &insn->dst(1);
+ vector->offsetID = 0;
+ vector->isSrc = 0;
+
+ insn->extra.printfSize = static_cast<uint16_t>(totalSize);
+ insn->extra.continueFlag = isContinue;
+ insn->extra.printfBTI = bti;
+ insn->extra.printfNum = num;
+ } else {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum);
+ SelectionVector *vector = this->appendVector();
+
+ for (int i = 0; i < srcNum; i++)
+ insn->src(i) = src[i];
+
+ insn->dst(0) = dst;
+ insn->dst(1) = tmp0;
+ insn->dst(2) = tmp1;
+
+ vector->regNum = 2;
+ vector->reg = &insn->dst(1);
+ vector->offsetID = 0;
+ vector->isSrc = 0;
+
+ insn->extra.printfSize = static_cast<uint16_t>(totalSize);
+ insn->extra.continueFlag = isContinue;
+ insn->extra.printfBTI = bti;
+ insn->extra.printfNum = num;
+ }
+ }
+
+ void Selection::Opaque::WORKGROUP_OP(uint32_t wg_op,
+ Reg dst,
+ GenRegister src,
+ GenRegister tmpData1,
+ GenRegister localThreadID,
+ GenRegister localThreadNUM,
+ GenRegister tmpData2,
+ GenRegister slmOff,
+ vector<GenRegister> msg,
+ uint32_t msgSizeReq,
+ GenRegister localBarrier)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_WORKGROUP_OP, 2 + msg.size(), 6);
+ SelectionVector *vector = this->appendVector();
+
+ /* allocate continuous GRF registers for READ/WRITE to SLM */
+ GBE_ASSERT(msg.size() >= msgSizeReq);
+ vector->regNum = msg.size();
+ vector->offsetID = 0;
+ vector->reg = &insn->dst(2);
+ vector->isSrc = 0;
+ insn->extra.workgroupOp = wg_op;
+
+ insn->dst(0) = dst;
+ insn->dst(1) = tmpData1;
+ for(uint32_t i = 0; i < msg.size(); i++)
+ insn->dst(2 + i) = msg[i];
+
+ insn->src(0) = localThreadID;
+ insn->src(1) = localThreadNUM;
+ insn->src(2) = src;
+ insn->src(3) = tmpData2;
+ insn->src(4) = slmOff;
+ insn->src(5) = localBarrier;
+ }
+
+ void Selection::Opaque::SUBGROUP_OP(uint32_t wg_op,
+ Reg dst,
+ GenRegister src,
+ GenRegister tmpData1,
+ GenRegister tmpData2)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_SUBGROUP_OP, 2, 2);
+
+ insn->extra.workgroupOp = wg_op;
+
+ insn->dst(0) = dst;
+ insn->dst(1) = tmpData1;
+
+ insn->src(0) = src;
+ insn->src(1) = tmpData2;
+ }
+ void Selection::Opaque::OBREAD(GenRegister* dsts,
+ uint32_t vec_size,
+ GenRegister addr,
+ GenRegister header,
+ uint32_t bti,
+ GenRegister* tmp,
+ uint32_t tmp_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + vec_size + tmp_size, 1);
+ SelectionVector *vector = this->appendVector();
+ insn->dst(0) = header;
+ for (uint32_t i = 0; i < vec_size; ++i)
+ insn->dst(1 + i) = dsts[i];
+ for (uint32_t i = 0; i < tmp_size; ++i)
+ insn->dst(1 + i + vec_size) = tmp[i];
+ insn->src(0) = addr;
+ insn->setbti(bti);
+ insn->extra.elem = vec_size; // number of vector size
+
+ // tmp regs for OWORD read dst
+ vector->regNum = tmp_size;
+ vector->reg = &insn->dst(1 + vec_size);
+ vector->offsetID = 1 + vec_size;
+ vector->isSrc = 0;
+ }
+
+ void Selection::Opaque::OBWRITE(GenRegister addr,
+ GenRegister* values,
+ uint32_t vec_size,
+ GenRegister header,
+ uint32_t bti,
+ GenRegister* tmp,
+ uint32_t tmp_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1);
+ SelectionVector *vector = this->appendVector();
+ insn->src(0) = addr;
+ for (uint32_t i = 0; i < vec_size; ++i)
+ insn->src(i + 1) = values[i];
+ insn->dst(0) = header;
+ for (uint32_t i = 0; i < tmp_size; ++i)
+ insn->dst(i + 1) = tmp[i];
+ insn->setbti(bti);
+ insn->extra.elem = vec_size; // number of vector_size
+
+    // tmp regs for OWORD write (header + tmp dst slots)
+ vector->regNum = tmp_size + 1;
+ vector->reg = &insn->dst(0);
+ vector->offsetID = 0;
+ vector->isSrc = 0;
+ }
+
+ void Selection::Opaque::MBREAD(GenRegister* dsts,
+ GenRegister coordx,
+ GenRegister coordy,
+ GenRegister header,
+ GenRegister* tmp,
+ uint32_t bti,
+ uint32_t vec_size) {
+
+ uint32_t simdWidth = curr.execWidth;
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8, 3);
+ SelectionVector *vector = this->appendVector();
+ for (uint32_t i = 0; i < vec_size; ++i) {
+ insn->dst(i) = dsts[i];
+ if(simdWidth == 16)
+ insn->dst(i + vec_size) = tmp[i];
+ }
+ insn->src(0) = coordx;
+ insn->src(1) = coordy;
+ insn->src(2) = header;
+ insn->setbti(bti);
+ insn->extra.elem = vec_size; // vector size
+
+ vector->regNum = vec_size;
+ vector->reg = &insn->dst(0);
+ vector->offsetID = 0;
+ vector->isSrc = 0;
+
+ if(simdWidth == 16)
+ {
+ SelectionVector *vectortmp = this->appendVector();
+ vectortmp->regNum = vec_size;
+ vectortmp->reg = &insn->dst(vec_size);
+ vectortmp->offsetID = vec_size;
+ vectortmp->isSrc = 0;
+ }
+ }
+
+ void Selection::Opaque::MBWRITE(GenRegister coordx,
+ GenRegister coordy,
+ GenRegister* values,
+ GenRegister header,
+ GenRegister* tmp,
+ uint32_t bti,
+ uint32_t vec_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size);
+ SelectionVector *vector = this->appendVector();
+ insn->src(0) = coordx;
+ insn->src(1) = coordy;
+ for (uint32_t i = 0; i < vec_size; ++i)
+ insn->src(2 + i) = values[i];
+ insn->dst(0) = header;
+ for (uint32_t i = 0; i < vec_size; ++i)
+ insn->dst(1 + i) = tmp[i];
+ insn->state = this->curr;
+ insn->setbti(bti);
+ insn->extra.elem = vec_size; // vector size
+
+ // We need to put the header and the data together
+ vector->regNum = 1 + vec_size;
+ vector->reg = &insn->dst(0);
+ vector->offsetID = 0;
+ vector->isSrc = 0;
+ }
+
// Boiler plate to initialize the selection library at c++ pre-main
static SelectionLibrary *selLib = NULL;
static void destroySelectionLibrary(void) { GBE_DELETE(selLib); }
@@ -1774,9 +2165,11 @@ namespace gbe
const ir::BasicBlock *insnBlock = insn.getParent();
const ir::Liveness &liveness = this->ctx.getLiveness();
const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(insnBlock);
- const ir::Register reg = insn.getDst(0);
- if (liveOut.contains(reg))
- return true;
+ for(uint32_t i = 0; i < insn.getDstNum(); i++) {
+ const ir::Register reg = insn.getDst(i);
+ if (liveOut.contains(reg))
+ return true;
+ }
// The instruction is only used in the current basic block
return false;
@@ -1918,6 +2311,11 @@ namespace gbe
return insnNum;
}
+extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
+#define SET_SEL_DBGINFO(I) \
+ if(OCL_DEBUGINFO) \
+ this->setDBGInfo_SEL(I.DBGInfo)
+
void Selection::Opaque::matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum)
{
// Bottom up code generation
@@ -1937,6 +2335,7 @@ namespace gbe
for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
// Process all possible patterns for this instruction
SelectionDAG &dag = *insnDAG[insnID];
+ SET_SEL_DBGINFO(dag.insn);
if (dag.isRoot) {
const ir::Instruction &insn = dag.insn;
const ir::Opcode opcode = insn.getOpcode();
@@ -1988,6 +2387,7 @@ namespace gbe
}
}
}
+#undef SET_SEL_DBGINFO
void Selection::Opaque::select(void)
{
@@ -2035,50 +2435,102 @@ namespace gbe
insn->extra.isUniform = isUniform;
}
+ void Selection::Opaque::VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal,
+ uint32_t dstNum, uint32_t srcNum, uint32_t msg_type,
+ uint32_t vme_search_path_lut, uint32_t lut_sub) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_VME, dstNum, srcNum);
+ SelectionVector *dstVector = this->appendVector();
+ SelectionVector *msgVector = this->appendVector();
+
+ for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
+ insn->dst(elemID) = dst[elemID];
+ for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+ insn->src(elemID) = payloadVal[elemID];
+
+ dstVector->regNum = dstNum;
+ dstVector->isSrc = 0;
+ dstVector->offsetID = 0;
+ dstVector->reg = &insn->dst(0);
+
+ msgVector->regNum = srcNum;
+ msgVector->isSrc = 1;
+ msgVector->offsetID = 0;
+ msgVector->reg = &insn->src(0);
+
+ insn->setbti(bti);
+ insn->extra.msg_type = msg_type;
+ insn->extra.vme_search_path_lut = vme_search_path_lut;
+ insn->extra.lut_sub = lut_sub;
+ }
+
///////////////////////////////////////////////////////////////////////////
// Code selection public implementation
///////////////////////////////////////////////////////////////////////////
+ const GenContext& Selection::getCtx()
+ {
+ return this->opaque->ctx;
+ }
Selection::Selection(GenContext &ctx) {
this->blockList = NULL;
this->opaque = GBE_NEW(Selection::Opaque, ctx);
this->opaque->setSlowByteGather(true);
+ opt_features = 0;
}
Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
this->opaque->setSlowByteGather(false);
+ opt_features = 0;
}
Selection8::Selection8(GenContext &ctx) : Selection(ctx) {
this->opaque->setHas32X32Mul(true);
this->opaque->setHasLongType(true);
- this->opaque->setSlowByteGather(true);
+ this->opaque->setHasDoubleType(true);
+ this->opaque->setSlowByteGather(false);
this->opaque->setHasHalfType(true);
+ opt_features = SIOF_LOGICAL_SRCMOD;
}
SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) {
this->opaque->setHas32X32Mul(true);
this->opaque->setHasLongType(true);
+ this->opaque->setHasDoubleType(true);
this->opaque->setLongRegRestrict(true);
- this->opaque->setSlowByteGather(true);
+ this->opaque->setSlowByteGather(false);
this->opaque->setHasHalfType(true);
+ opt_features = SIOF_LOGICAL_SRCMOD | SIOF_OP_MOV_LONG_REG_RESTRICT;
}
Selection9::Selection9(GenContext &ctx) : Selection(ctx) {
this->opaque->setHas32X32Mul(true);
this->opaque->setHasLongType(true);
+ this->opaque->setHasDoubleType(true);
this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
- this->opaque->setSlowByteGather(true);
+ this->opaque->setSlowByteGather(false);
this->opaque->setHasHalfType(true);
+ opt_features = SIOF_LOGICAL_SRCMOD;
}
SelectionBxt::SelectionBxt(GenContext &ctx) : Selection(ctx) {
this->opaque->setHas32X32Mul(true);
this->opaque->setHasLongType(true);
this->opaque->setLongRegRestrict(true);
+ this->opaque->setHasDoubleType(true);
this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
- this->opaque->setSlowByteGather(true);
+ this->opaque->setSlowByteGather(false);
this->opaque->setHasHalfType(true);
+ opt_features = SIOF_LOGICAL_SRCMOD | SIOF_OP_MOV_LONG_REG_RESTRICT;
+ }
+
+ SelectionKbl::SelectionKbl(GenContext &ctx) : Selection(ctx) {
+ this->opaque->setHas32X32Mul(true);
+ this->opaque->setHasLongType(true);
+ this->opaque->setHasDoubleType(true);
+ this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
+ this->opaque->setSlowByteGather(false);
+ this->opaque->setHasHalfType(true);
+ opt_features = SIOF_LOGICAL_SRCMOD;
}
void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
@@ -2314,8 +2766,8 @@ namespace gbe
break;
case ir::OP_SIMD_ID:
{
- const GenRegister selLaneID = sel.selReg(ir::ocl::laneid, ir::TYPE_U32);
- sel.MOV(dst, selLaneID);
+ GenRegister laneID = sel.getLaneIDReg();
+ sel.MOV(dst, laneID);
}
break;
default: NOT_SUPPORTED;
@@ -2364,10 +2816,7 @@ namespace gbe
}
break;
case ir::OP_MOV:
- if (dst.isdf()) {
- ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
- sel.MOV_DF(dst, src, sel.selReg(r, ir::TYPE_U64));
- } else {
+ {
sel.push();
auto dag = sel.regDAG[insn.getDst(0)];
if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL &&
@@ -2406,12 +2855,13 @@ namespace gbe
case ir::OP_SIMD_ANY:
{
const GenRegister constZero = GenRegister::immuw(0);;
- const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one);
+ const GenRegister constOne = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_DWORD)), GEN_TYPE_UD);
const GenRegister flag01 = GenRegister::flag(0, 1);
sel.push();
int simdWidth = sel.curr.execWidth;
sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.MOV(constOne, GenRegister::immud(1));
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.MOV(flag01, constZero);
@@ -2428,19 +2878,21 @@ namespace gbe
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
else
NOT_IMPLEMENTED;
- sel.SEL(dst, regOne, constZero);
+ sel.SEL(dst, constOne, constZero);
sel.pop();
}
break;
case ir::OP_SIMD_ALL:
{
const GenRegister constZero = GenRegister::immuw(0);
+ const GenRegister constOne = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_DWORD)), GEN_TYPE_UD);
const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one);
const GenRegister flag01 = GenRegister::flag(0, 1);
sel.push();
int simdWidth = sel.curr.execWidth;
sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.MOV(constOne, GenRegister::immud(1));
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.MOV(flag01, regOne);
@@ -2458,7 +2910,7 @@ namespace gbe
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
else
NOT_IMPLEMENTED;
- sel.SEL(dst, regOne, constZero);
+ sel.SEL(dst, constOne, constZero);
sel.pop();
}
break;
@@ -2565,6 +3017,39 @@ namespace gbe
else
sel.I64REM(dst, src0, src1, tmp, tmp_num);
sel.pop();
+ } else if (type == TYPE_DOUBLE) {
+ if (!sel.hasDoubleType())
+ GBE_ASSERT(0);
+
+ GenRegister tmp[10];
+ int tmpNum = 7;
+ ir::RegisterFamily fm;
+ if (sel.ctx.getSimdWidth() == 16) {
+ fm = FAMILY_WORD;
+ } else {
+ fm = FAMILY_DWORD;
+ }
+
+        /* madm and invm need special accumulator support, which requires us to be in align16
+ mode. If any src is uniform, we need another tmp register and MOV the
+ uniform one to it. Because the madm and invm will work in align16 mode,
+ the channel mask is different from the align1 mode. So we can not directly
+ write the result to the dst and need a tmp register to hold the result and
+ MOV it to dst later. */
+ tmpNum++; //For the dst.
+ if (src0.hstride == GEN_HORIZONTAL_STRIDE_0) tmpNum++;
+ if (src1.hstride == GEN_HORIZONTAL_STRIDE_0) tmpNum++;
+
+ for (int i = 0; i < tmpNum; i++)
+ tmp[i] = GenRegister::df8grf(sel.reg(fm));
+
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.F64DIV(dst, src0, src1, tmp, tmpNum);
+ sel.pop();
+ } else {
+ GBE_ASSERT(0);
}
markAllChildren(dag);
return true;
@@ -2863,7 +3348,7 @@ namespace gbe
// XXX TODO: we need a clean support of FP_CONTRACT to remove below line 'return false'
// if 'pragma FP_CONTRACT OFF' is used in cl kernel, we should not do mad optimization.
- if (!sel.ctx.relaxMath || sel.ctx.getSimdWidth() == 16)
+ if (!sel.ctx.relaxMath)
return false;
// MAD tend to increase liveness of the sources (since there are three of
// them). TODO refine this strategy. Well, we should be able at least to
@@ -2881,17 +3366,15 @@ namespace gbe
const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_FLOAT);
if (child0 && child0->insn.getOpcode() == OP_MUL) {
GBE_ASSERT(cast<ir::BinaryInstruction>(child0->insn).getType() == TYPE_FLOAT);
- SelectionDAG *child00 = child0->child[0];
- SelectionDAG *child01 = child0->child[1];
- if ((child00 && child00->insn.getOpcode() == OP_LOADI) ||
- (child01 && child01->insn.getOpcode() == OP_LOADI) ||
- (child1 && child1->insn.getOpcode() == OP_LOADI))
- return false;
const GenRegister src0 = sel.selReg(child0->insn.getSrc(0), TYPE_FLOAT);
const GenRegister src1 = sel.selReg(child0->insn.getSrc(1), TYPE_FLOAT);
GenRegister src2 = sel.selReg(insn.getSrc(1), TYPE_FLOAT);
if(insn.getOpcode() == ir::OP_SUB) src2 = GenRegister::negate(src2);
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)))
+ sel.curr.execWidth = 1;
sel.MAD(dst, src2, src0, src1); // order different on HW!
+ sel.pop();
if (child0->child[0]) child0->child[0]->isRoot = 1;
if (child0->child[1]) child0->child[1]->isRoot = 1;
if (child1) child1->isRoot = 1;
@@ -2899,17 +3382,15 @@ namespace gbe
}
if (child1 && child1->insn.getOpcode() == OP_MUL) {
GBE_ASSERT(cast<ir::BinaryInstruction>(child1->insn).getType() == TYPE_FLOAT);
- SelectionDAG *child10 = child1->child[0];
- SelectionDAG *child11 = child1->child[1];
- if ((child10 && child10->insn.getOpcode() == OP_LOADI) ||
- (child11 && child11->insn.getOpcode() == OP_LOADI) ||
- (child0 && child0->insn.getOpcode() == OP_LOADI))
- return false;
GenRegister src0 = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT);
const GenRegister src1 = sel.selReg(child1->insn.getSrc(1), TYPE_FLOAT);
const GenRegister src2 = sel.selReg(insn.getSrc(0), TYPE_FLOAT);
if(insn.getOpcode() == ir::OP_SUB) src0 = GenRegister::negate(src0);
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)))
+ sel.curr.execWidth = 1;
sel.MAD(dst, src2, src0, src1); // order different on HW!
+ sel.pop();
if (child1->child[0]) child1->child[0]->isRoot = 1;
if (child1->child[1]) child1->child[1]->isRoot = 1;
if (child0) child0->isRoot = 1;
@@ -3230,12 +3711,12 @@ namespace gbe
ir::half hf = imm.getHalfValue();
sel.MOV(GenRegister::retype(dst, GEN_TYPE_HF), GenRegister::immh(hf.getVal()));
break;
- }
+ }
case TYPE_U16: sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
case TYPE_U8: sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
- case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.getDoubleValue()), sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U64)); break;
+ case TYPE_DOUBLE: sel.MOV(dst, GenRegister::immdf(imm.getDoubleValue())); break;
case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immuint64(imm.getIntegerValue())); break;
default: NOT_SUPPORTED;
@@ -3264,6 +3745,25 @@ namespace gbe
DECL_CTOR(SyncInstruction, 1,1);
};
+ /*! Wait instruction */
+ DECL_PATTERN(WaitInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::WaitInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ // Debugwait will use reg 1, which is different from barrier
+ sel.push();
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.WAIT(1);
+ sel.pop();
+ return true;
+ }
+
+ DECL_CTOR(WaitInstruction, 1,1);
+ };
+
INLINE uint32_t getByteScatterGatherSize(Selection::Opaque &sel, ir::Type type) {
using namespace ir;
switch (type) {
@@ -3303,11 +3803,8 @@ namespace gbe
uint32_t valueNum,
ir::BTI bti) const
{
- //GenRegister temp = getRelativeAddress(sel, addr, sel.selReg(bti.base, ir::TYPE_U32));
-
GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
- GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
- sel.UNTYPED_READ(addr, dst.data(), valueNum, b, bti.isConst ? NULL : &tmp);
+ sel.UNTYPED_READ(addr, dst.data(), valueNum, b, sel.getBTITemps(bti));
}
void emitUntypedRead(Selection::Opaque &sel,
@@ -3368,7 +3865,6 @@ namespace gbe
GBE_ASSERT(bti.isConst == 1);
vector<GenRegister> dst(valueNum);
GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
- GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
@@ -3378,9 +3874,9 @@ namespace gbe
tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
}
- sel.READ64(addr, dst.data(), tmp.data(), valueNum, b, true, bti.isConst ? NULL : &tmpFlag);
+ sel.READ64(addr, dst.data(), tmp.data(), valueNum, b, true, sel.getBTITemps(bti));
} else {
- sel.READ64(addr, dst.data(), NULL, valueNum, b, false, bti.isConst ? NULL : &tmpFlag);
+ sel.READ64(addr, dst.data(), NULL, valueNum, b, false, sel.getBTITemps(bti));
}
}
@@ -3397,7 +3893,6 @@ namespace gbe
GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32);
GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
- GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
// Get dword aligned addr
sel.push();
@@ -3410,7 +3905,7 @@ namespace gbe
sel.push();
if (isUniform)
sel.curr.noMask = 1;
- sel.UNTYPED_READ(tmpAddr, &tmpData, 1, b, bti.isConst ? NULL : &tmpFlag);
+ sel.UNTYPED_READ(tmpAddr, &tmpData, 1, b, sel.getBTITemps(bti));
if (isUniform)
sel.curr.execWidth = 1;
@@ -3522,53 +4017,96 @@ namespace gbe
if(valueNum > 1) {
GBE_ASSERT(!isUniform && "vector load should not be uniform. Something went wrong.");
- vector<GenRegister> dst(valueNum);
- const uint32_t typeSize = getFamilySize(family);
+ //need to investigate the case of GEN_BYTE_SCATTER_WORD later
+ //for GEN_BYTE_SCATTER_BYTE, if the pointer is not aligned to 4, using byte gather,
+ // on BDW, vec8 and vec16 are worse. on SKL/BXT, vec16 is worse.
+ if(sel.getSlowByteGather() || elemSize == GEN_BYTE_SCATTER_WORD
+ || (elemSize == GEN_BYTE_SCATTER_BYTE && (valueNum == 16 || valueNum == 8))) {
+ vector<GenRegister> dst(valueNum);
+ const uint32_t typeSize = getFamilySize(family);
- for(uint32_t i = 0; i < valueNum; i++)
- dst[i] = sel.selReg(insn.getValue(i), getType(family));
+ for(uint32_t i = 0; i < valueNum; i++)
+ dst[i] = sel.selReg(insn.getValue(i), getType(family));
- uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
- vector<GenRegister> tmp(effectDataNum + 1);
- vector<GenRegister> tmp2(effectDataNum + 1);
- vector<GenRegister> effectData(effectDataNum);
- for(uint32_t i = 0; i < effectDataNum + 1; i++)
- tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+ uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
+ vector<GenRegister> tmp(effectDataNum + 1);
+ vector<GenRegister> tmp2(effectDataNum + 1);
+ vector<GenRegister> effectData(effectDataNum);
+ for(uint32_t i = 0; i < effectDataNum + 1; i++)
+ tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
- GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
- sel.push();
- if (isUniform)
- sel.curr.noMask = 1;
- sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
- sel.pop();
+ GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+ sel.push();
+ if (isUniform)
+ sel.curr.noMask = 1;
+ sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
+ sel.pop();
- uint32_t remainedReg = effectDataNum + 1;
- uint32_t pos = 0;
- do {
- uint32_t width = remainedReg > 4 ? 4 : remainedReg;
- vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
- vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
- if (pos != 0) {
- sel.push();
- if (isUniform)
- sel.curr.noMask = 1;
- sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
- sel.pop();
+ uint32_t remainedReg = effectDataNum + 1;
+ uint32_t pos = 0;
+ do {
+ uint32_t width = remainedReg > 4 ? 4 : remainedReg;
+ vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
+ vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
+ if (pos != 0) {
+ sel.push();
+ if (isUniform)
+ sel.curr.noMask = 1;
+ sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
+ sel.pop();
+ }
+ readDWord(sel, t1, alignedAddr, width, bti);
+ remainedReg -= width;
+ pos += width;
+ } while(remainedReg);
+
+ for(uint32_t i = 0; i < effectDataNum; i++)
+ effectData[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+
+ getEffectByteData(sel, effectData, tmp, effectDataNum, address, isUniform);
+
+ for(uint32_t i = 0; i < effectDataNum; i++) {
+ unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
+ 4/typeSize : (valueNum - i * (4 / typeSize));
+ sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize, elemNum);
+ }
+ } else {
+ GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_BYTE);
+ GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+ vector<GenRegister> dst(valueNum);
+ for(uint32_t i = 0; i < valueNum; i++)
+ dst[i] = sel.selReg(insn.getValue(i), getType(family));
+
+ GenRegister readDst = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ uint32_t valueIndex = 0;
+ uint32_t loopCount = (valueNum + 3) / 4;
+ GenRegister addressForLoop = address;
+
+ sel.push();
+ if (loopCount > 1) {
+ addressForLoop = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ sel.MOV(addressForLoop, address);
}
- readDWord(sel, t1, alignedAddr, width, bti);
- remainedReg -= width;
- pos += width;
- } while(remainedReg);
- for(uint32_t i = 0; i < effectDataNum; i++)
- effectData[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+ for (uint32_t i = 0; i < loopCount; ++i) {
+ uint32_t valueLeft = valueNum - valueIndex;
+ GBE_ASSERT(valueLeft > 1);
+ uint32_t dataSize = 0;
+ if (valueLeft == 2)
+ dataSize = GEN_BYTE_SCATTER_WORD;
+ else
+ dataSize = GEN_BYTE_SCATTER_DWORD;
+ sel.BYTE_GATHER(readDst, addressForLoop, dataSize, b, sel.getBTITemps(bti));
- getEffectByteData(sel, effectData, tmp, effectDataNum, address, isUniform);
+ // only 4 bytes is gathered even if valueLeft >= 4
+ sel.UNPACK_BYTE(dst.data(), readDst, getFamilySize(FAMILY_BYTE), (valueLeft < 4 ? valueLeft : 4));
+ valueIndex += 4;
- for(uint32_t i = 0; i < effectDataNum; i++) {
- unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
- 4/typeSize : (valueNum - i * (4 / typeSize));
- sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize, elemNum);
+ //calculate the new address to read
+ if (valueIndex < valueNum)
+ sel.ADD(addressForLoop, addressForLoop, GenRegister::immud(4));
+ }
+ sel.pop();
}
} else {
GBE_ASSERT(insn.getValueNum() == 1);
@@ -3578,14 +4116,13 @@ namespace gbe
readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
else {
GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
- GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
// We need a temporary register if we read bytes or words
Register dst = sel.reg(FAMILY_DWORD, isUniform);
sel.push();
if (isUniform)
sel.curr.noMask = 1;
- sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, elemSize, b, bti.isConst ? NULL : & tmpFlag);
+ sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, elemSize, b, sel.getBTITemps(bti));
sel.pop();
sel.push();
@@ -3602,6 +4139,27 @@ namespace gbe
}
}
+ void emitOWordRead(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister address,
+ ir::BTI bti) const
+ {
+ using namespace ir;
+ const uint32_t vec_size = insn.getValueNum();
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ vector<GenRegister> valuesVec;
+ for(uint32_t i = 0; i < vec_size; i++)
+ valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
+      // check tmp_size for OWORD read need, max 8 OWORD thus 4 regs
+ uint32_t tmp_size = simdWidth * vec_size / 8;
+ tmp_size = tmp_size > 4 ? 4 : tmp_size;
+ vector<GenRegister> tmpVec;
+ for(uint32_t i = 0; i < tmp_size; i++)
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm, &tmpVec[0], tmp_size);
+ }
+
// check whether all binded table index point to constant memory
INLINE bool isAllConstant(const ir::BTI &bti) const {
if (bti.isConst && bti.imm == BTI_CONSTANT)
@@ -3609,28 +4167,12 @@ namespace gbe
return false;
}
- INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::LoadInstruction &insn) const {
- using namespace ir;
- SelectionDAG *child0 = dag.child[0];
- ir::BTI b;
- if (insn.isFixedBTI()) {
- const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
- const auto imm = immInsn.getImmediate();
- b.isConst = 1;
- b.imm = imm.getIntegerValue();
- } else {
- b.isConst = 0;
- b.reg = insn.getBTI();
- }
- return b;
- }
-
/*! Implements base class */
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
- GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+ GenRegister address = sel.selReg(insn.getAddressRegister(), ir::TYPE_U32);
GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
insn.getAddressSpace() == MEM_CONSTANT ||
insn.getAddressSpace() == MEM_PRIVATE ||
@@ -3638,13 +4180,24 @@ namespace gbe
insn.getAddressSpace() == MEM_MIXED);
//GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
- BTI bti = getBTI(dag, insn);
-
+ BTI bti;
+ AddressMode am = insn.getAddressMode();
+ if (am == AM_StaticBti) {
+ bti.isConst = 1;
+ bti.imm = insn.getSurfaceIndex();
+ } else if (am == AM_DynamicBti) {
+ bti.isConst = 0;
+ bti.reg = insn.getBtiReg();
+ } else {
+ assert(0 && "stateless not supported yet");
+ }
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(sel, type);
bool allConstant = isAllConstant(bti);
- if (allConstant) {
+ if (insn.isBlock())
+ this->emitOWordRead(sel, insn, address, bti);
+ else if (allConstant) {
// XXX TODO read 64bit constant through constant cache
// Per HW Spec, constant cache messages can read at least DWORD data.
// So, byte/short data type, we have to read through data cache.
@@ -3667,15 +4220,11 @@ namespace gbe
this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
}
-
- // for fixed bti, don't generate the useless loadi
- if (insn.isFixedBTI())
- dag.child[0] = NULL;
markAllChildren(dag);
-
return true;
}
};
+
class StoreInstructionPattern : public SelectionPattern
{
public:
@@ -3690,13 +4239,12 @@ namespace gbe
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
- vector<GenRegister> value(valueNum);
+ vector<GenRegister> value(valueNum), tmps;
GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);
- GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
- sel.UNTYPED_WRITE(address, value.data(), valueNum, b, bti.isConst? NULL : &tmp);
+ sel.UNTYPED_WRITE(address, value.data(), valueNum, b, sel.getBTITemps(bti));
}
void emitWrite64(Selection::Opaque &sel,
@@ -3714,16 +4262,14 @@ namespace gbe
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
- GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
-
if (sel.hasLongType()) {
vector<GenRegister> tmp(valueNum);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
}
- sel.WRITE64(address, src.data(), tmp.data(), valueNum, b, true, bti.isConst? NULL : &tmpFlag);
+ sel.WRITE64(address, src.data(), tmp.data(), valueNum, b, true, sel.getBTITemps(bti));
} else {
- sel.WRITE64(address, src.data(), NULL, valueNum, b, false, bti.isConst? NULL : &tmpFlag);
+ sel.WRITE64(address, src.data(), NULL, valueNum, b, false, sel.getBTITemps(bti));
}
}
@@ -3738,7 +4284,6 @@ namespace gbe
uint32_t valueNum = insn.getValueNum();
GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
- GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
if(valueNum > 1) {
const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
vector<GenRegister> value(valueNum);
@@ -3758,7 +4303,7 @@ namespace gbe
sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
}
- sel.UNTYPED_WRITE(address, tmp.data(), tmpRegNum, b, bti.isConst ? NULL : &tmpFlag);
+ sel.UNTYPED_WRITE(address, tmp.data(), tmpRegNum, b, sel.getBTITemps(bti));
} else {
const GenRegister value = sel.selReg(insn.getValue(0));
GBE_ASSERT(insn.getValueNum() == 1);
@@ -3775,38 +4320,55 @@ namespace gbe
else if (elemSize == GEN_BYTE_SCATTER_BYTE)
sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
sel.pop();
- sel.BYTE_SCATTER(address, tmp, elemSize, b, bti.isConst ? NULL : &tmpFlag);
+ sel.BYTE_SCATTER(address, tmp, elemSize, b, sel.getBTITemps(bti));
}
}
-
- INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::StoreInstruction &insn) const {
+ void emitOWordWrite(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ GenRegister address,
+ ir::BTI bti) const
+ {
using namespace ir;
- SelectionDAG *child0 = dag.child[0];
- ir::BTI b;
- if (insn.isFixedBTI()) {
- const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
- const auto imm = immInsn.getImmediate();
- b.isConst = 1;
- b.imm = imm.getIntegerValue();
- } else {
- b.isConst = 0;
- b.reg = insn.getBTI();
- }
- return b;
+ const uint32_t vec_size = insn.getValueNum();
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ vector<GenRegister> valuesVec;
+ for(uint32_t i = 0; i < vec_size; i++)
+ valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
+ // check tmp_size for OWORD write need, max 8 OWORD thus 4 regs
+ uint32_t tmp_size = simdWidth * vec_size / 8;
+ tmp_size = tmp_size > 4 ? 4 : tmp_size;
+ vector<GenRegister> tmpVec;
+ for(uint32_t i = 0; i < tmp_size; i++)
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm, &tmpVec[0], tmp_size);
}
+
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
- GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+ GenRegister address = sel.selReg(insn.getAddressRegister(), ir::TYPE_U32);
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(sel, type);
- const bool isUniform = sel.isScalarReg(insn.getAddress()) && sel.isScalarReg(insn.getValue(0));
- BTI bti = getBTI(dag, insn);
+ const bool isUniform = sel.isScalarReg(insn.getAddressRegister()) && sel.isScalarReg(insn.getValue(0));
+ BTI bti;
+ AddressMode am = insn.getAddressMode();
+ if (am == AM_StaticBti) {
+ bti.isConst = 1;
+ bti.imm = insn.getSurfaceIndex();
+ } else if (am == AM_DynamicBti) {
+ bti.isConst = 0;
+ bti.reg = insn.getBtiReg();
+ } else {
+ assert(0 && "stateless not supported yet");
+ }
- if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ if (insn.isBlock())
+ this->emitOWordWrite(sel, insn, address, bti);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
this->emitWrite64(sel, insn, address, bti);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedWrite(sel, insn, address, bti);
@@ -3814,11 +4376,7 @@ namespace gbe
this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
}
- // for fixed bti, don't generate the useless loadi
- if (insn.isFixedBTI())
- dag.child[0] = NULL;
markAllChildren(dag);
-
return true;
}
};
@@ -4109,148 +4667,132 @@ namespace gbe
return false;
}
- INLINE bool emitOne(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ INLINE void convertBetweenHalfFloat(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
{
using namespace ir;
const Type dstType = insn.getDstType();
const Type srcType = insn.getSrcType();
- const RegisterFamily dstFamily = getFamily(dstType);
- const RegisterFamily srcFamily = getFamily(srcType);
const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
const Opcode opcode = insn.getOpcode();
- sel.push();
- if (sel.isScalarReg(insn.getDst(0)) == true) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
- if(opcode == ir::OP_SAT_CVT)
- sel.curr.saturate = 1;
- // We need two instructions to make the conversion
if (opcode == OP_F16TO32) {
sel.F16TO32(dst, src);
} else if (opcode == OP_F32TO16) {
+ // We need two instructions to make the conversion
GenRegister unpacked;
unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
sel.push();
- if (sel.isScalarReg(insn.getSrc(0))) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
- sel.F32TO16(unpacked, src);
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.F32TO16(unpacked, src);
sel.pop();
sel.MOV(dst, unpacked);
- } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {//convert i32 to small int and half
- GenRegister unpacked;
- if (dstFamily == FAMILY_WORD) {
- uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
-
- /* The special case, when dst is half, float->word->half will lose accuracy. */
- if (dstType == TYPE_HALF) {
- GBE_ASSERT(sel.hasHalfType());
- type = GEN_TYPE_HF;
- }
+ } else {
+ GBE_ASSERT("Not conversion between float and half\n");
+ }
+ }
- if (!sel.isScalarReg(dst.reg())) {
- unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, type);
- } else
- unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
- } else {
- const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
- if (!sel.isScalarReg(dst.reg())) {
- unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, type);
- } else
- unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
- }
+ INLINE void convert32bitsToSmall(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ GenRegister unpacked;
+ const RegisterFamily dstFamily = getFamily(dstType);
- sel.push();
- if (sel.isScalarReg(insn.getSrc(0))) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
- sel.MOV(unpacked, src);
- sel.pop();
+ if (dstFamily == FAMILY_WORD) {
+ uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
- if (unpacked.reg() != dst.reg())
- sel.MOV(dst, unpacked);
- } else if (dstFamily == FAMILY_WORD && srcFamily == FAMILY_QWORD) { //convert i64 to i16 and half.
+ /* The special case, when dst is half, float->word->half will lose accuracy. */
if (dstType == TYPE_HALF) {
- /* There is no MOV for Long <---> Half. So Long-->Float-->half. */
- GBE_ASSERT(sel.hasLongType());
GBE_ASSERT(sel.hasHalfType());
- sel.push();
- if (sel.isScalarReg(insn.getSrc(0))) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
+ type = GEN_TYPE_HF;
+ }
- GenRegister funpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
- funpacked = GenRegister::retype(funpacked, GEN_TYPE_F);
- sel.MOV(funpacked, src);
- GenRegister ftmp = sel.selReg(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
- ftmp = GenRegister::retype(ftmp, GEN_TYPE_F);
- sel.MOV(ftmp, funpacked);
- GenRegister unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, GEN_TYPE_HF);
- sel.MOV(unpacked, ftmp);
- sel.pop();
- sel.MOV(dst, unpacked);
- } else {
- uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+ if (!sel.isScalarReg(dst.reg())) {
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, type);
+ } else
+ unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
+ } else {
+ const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
+ if (!sel.isScalarReg(dst.reg())) {
+ unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, type);
+ } else
+ unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
+ }
- GenRegister unpacked;
- if (!sel.isScalarReg(dst.reg())) {
- if (sel.hasLongType()) {
- unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
- } else {
- unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
- }
- unpacked = GenRegister::retype(unpacked, type);
- } else {
- unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
- }
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.MOV(unpacked, src);
+ sel.pop();
- if(!sel.hasLongType()) {
- GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
- tmp.type = GEN_TYPE_D;
- sel.CONVI64_TO_I(tmp, src);
- sel.MOV(unpacked, tmp);
- } else {
- sel.push();
- if (sel.isScalarReg(insn.getSrc(0))) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
- sel.MOV(unpacked, src);
- sel.pop();
- }
+ if (unpacked.reg() != dst.reg())
+ sel.MOV(dst, unpacked);
+ }
- if (unpacked.reg() != dst.reg()) {
- sel.MOV(dst, unpacked);
- }
+ INLINE void convertI64To16bits(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+
+ if (dstType == TYPE_HALF) {
+ /* There is no MOV for Long <---> Half. So Long-->Float-->half. */
+ GBE_ASSERT(sel.hasLongType());
+ GBE_ASSERT(sel.hasHalfType());
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
}
- } else if (dstFamily == FAMILY_BYTE && srcFamily == FAMILY_QWORD) { //convert i64 to i8
- GenRegister unpacked;
- const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
- if (sel.hasLongType()) { // handle the native long logic.
- if (!sel.isScalarReg(dst.reg())) {
- /* When convert i64 to i8, the hstride should be 8, but the hstride do not
- support more than 4, so we need to split it to 2 steps. */
+ GenRegister funpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ funpacked = GenRegister::retype(funpacked, GEN_TYPE_F);
+ sel.MOV(funpacked, src);
+ GenRegister ftmp = sel.selReg(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+ ftmp = GenRegister::retype(ftmp, GEN_TYPE_F);
+ sel.MOV(ftmp, funpacked);
+ GenRegister unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, GEN_TYPE_HF);
+ sel.MOV(unpacked, ftmp);
+ sel.pop();
+ sel.MOV(dst, unpacked);
+ } else {
+ uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+
+ GenRegister unpacked;
+ if (!sel.isScalarReg(dst.reg())) {
+ if (sel.hasLongType()) {
unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, dstType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
} else {
- unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
}
+ unpacked = GenRegister::retype(unpacked, type);
+ } else {
+ unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
+ }
+ if(!sel.hasLongType()) {
+ GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp.type = GEN_TYPE_D;
+ sel.CONVI64_TO_I(tmp, src);
+ sel.MOV(unpacked, tmp);
+ } else {
sel.push();
if (sel.isScalarReg(insn.getSrc(0))) {
sel.curr.execWidth = 1;
@@ -4259,243 +4801,636 @@ namespace gbe
}
sel.MOV(unpacked, src);
sel.pop();
+ }
- if (unpacked.reg() != dst.reg()) {
- sel.MOV(dst, unpacked);
- }
- } else { // Do not have native long
- if (!sel.isScalarReg(dst.reg())) {
- unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, type);
- } else {
- unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
- }
+ if (unpacked.reg() != dst.reg()) {
+ sel.MOV(dst, unpacked);
+ }
+ }
+ }
- GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
- tmp.type = GEN_TYPE_D;
- sel.CONVI64_TO_I(tmp, src);
- sel.MOV(unpacked, tmp);
+ INLINE void convertI64ToI8(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ GenRegister unpacked;
+ const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
+
+ if (sel.hasLongType()) { // handle the native long logic.
+ if (!sel.isScalarReg(dst.reg())) {
+ /* When convert i64 to i8, the hstride should be 8, but the hstride does not
+ support more than 4, so we need to split it into 2 steps. */
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, dstType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
+ } else {
+ unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
+ }
- if (unpacked.reg() != dst.reg()) {
- sel.MOV(dst, unpacked);
- }
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
}
- } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) &&
- (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {// Convert i64 to i32
- if (sel.hasLongType()) {
- GenRegister unpacked;
- const uint32_t type = dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D;
- if (!sel.isScalarReg(dst.reg())) {
- unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D);
- } else {
- unpacked = GenRegister::retype(sel.unpacked_ud(dst.reg()), type);
- }
+ sel.MOV(unpacked, src);
+ sel.pop();
- sel.push();
- if (sel.isScalarReg(insn.getSrc(0))) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
- sel.MOV(unpacked, src);
- sel.pop();
+ if (unpacked.reg() != dst.reg()) {
+ sel.MOV(dst, unpacked);
+ }
+ } else { // Do not have native long
+ if (!sel.isScalarReg(dst.reg())) {
+ unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, type);
+ } else {
+ unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
+ }
- if (unpacked.reg() != dst.reg()) {
- sel.MOV(dst, unpacked);
- }
+ GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp.type = GEN_TYPE_D;
+ sel.CONVI64_TO_I(tmp, src);
+ sel.MOV(unpacked, tmp);
+
+ if (unpacked.reg() != dst.reg()) {
+ sel.MOV(dst, unpacked);
+ }
+ }
+ }
+
+ INLINE void convertI64ToI32(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ if (sel.hasLongType()) {
+ GenRegister unpacked;
+ const uint32_t type = dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D;
+ if (!sel.isScalarReg(dst.reg())) {
+ unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D);
} else {
- sel.CONVI64_TO_I(dst, src);
+ unpacked = GenRegister::retype(sel.unpacked_ud(dst.reg()), type);
}
- } else if (dstType == ir::TYPE_FLOAT && (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) { //i64 to float
- auto dag = sel.regDAG[src.reg()];
- // FIXME, in the future, we need to do a common I64 lower to I32 analysis
- // at llvm IR layer which could cover more cases then just this one.
- SelectionDAG *dag0, *dag1;
- if (dag && dag->child[0] && dag->child[1]) {
- if (dag->child[0]->insn.getOpcode() == OP_LOADI) {
- dag0 = dag->child[1];
- dag1 = dag->child[0];
- } else {
- dag0 = dag->child[0];
- dag1 = dag->child[1];
- }
- GBE_ASSERT(!(dag->child[0]->insn.getOpcode() == OP_LOADI &&
- dag->child[1]->insn.getOpcode() == OP_LOADI));
- if (dag->insn.getOpcode() == OP_AND ||
- dag->insn.getOpcode() == OP_OR ||
- dag->insn.getOpcode() == OP_XOR) {
- GenRegister src0;
- GenRegister src1;
- if (lowerI64Reg(sel, dag0, src0, GEN_TYPE_UD) &&
- lowerI64Reg(sel, dag1, src1, GEN_TYPE_UD)) {
- switch (dag->insn.getOpcode()) {
- default:
- case OP_AND: sel.AND(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
- case OP_OR: sel.OR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
- case OP_XOR: sel.XOR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
- }
- sel.MOV(dst, GenRegister::retype(dst, GEN_TYPE_UD));
- markChildren = false;
- return true;
+
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.MOV(unpacked, src);
+ sel.pop();
+
+ if (unpacked.reg() != dst.reg()) {
+ sel.MOV(dst, unpacked);
+ }
+ } else {
+ sel.CONVI64_TO_I(dst, src);
+ }
+ }
+
+ INLINE void convertI64ToFloat(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ auto dag = sel.regDAG[src.reg()];
+
+ // FIXME, in the future, we need to do a common I64 lower to I32 analysis
+ // at llvm IR layer which could cover more cases than just this one.
+ SelectionDAG *dag0, *dag1;
+ if (dag && dag->child[0] && dag->child[1]) {
+ if (dag->child[0]->insn.getOpcode() == OP_LOADI) {
+ dag0 = dag->child[1];
+ dag1 = dag->child[0];
+ } else {
+ dag0 = dag->child[0];
+ dag1 = dag->child[1];
+ }
+ GBE_ASSERT(!(dag->child[0]->insn.getOpcode() == OP_LOADI &&
+ dag->child[1]->insn.getOpcode() == OP_LOADI));
+ if (dag->insn.getOpcode() == OP_AND ||
+ dag->insn.getOpcode() == OP_OR ||
+ dag->insn.getOpcode() == OP_XOR) {
+ GenRegister src0;
+ GenRegister src1;
+ if (lowerI64Reg(sel, dag0, src0, GEN_TYPE_UD) &&
+ lowerI64Reg(sel, dag1, src1, GEN_TYPE_UD)) {
+ switch (dag->insn.getOpcode()) {
+ default:
+ case OP_AND: sel.AND(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+ case OP_OR: sel.OR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+ case OP_XOR: sel.XOR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
}
+ sel.MOV(dst, GenRegister::retype(dst, GEN_TYPE_UD));
+ markChildren = false;
+ return;
}
}
+ }
- if (!sel.hasLongType()) {
- GenRegister tmp[6];
- for(int i=0; i<6; i++) {
- tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- }
- sel.push();
- sel.curr.flag = 0;
- sel.curr.subFlag = 1;
- sel.CONVI64_TO_F(dst, src, tmp);
- sel.pop();
- } else {
- GenRegister unpacked;
- const uint32_t type = GEN_TYPE_F;
- unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, type);
+ if (!sel.hasLongType()) {
+ GenRegister tmp[6];
+ for(int i=0; i<6; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ }
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.CONVI64_TO_F(dst, src, tmp);
+ sel.pop();
+ } else {
+ GenRegister unpacked;
+ const uint32_t type = GEN_TYPE_F;
+ unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, type);
- sel.push();
- if (sel.isScalarReg(insn.getSrc(0))) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
- sel.MOV(unpacked, src);
- sel.pop();
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.MOV(unpacked, src);
+ sel.pop();
- if (unpacked.reg() != dst.reg()) {
- sel.MOV(dst, unpacked);
- }
+ if (unpacked.reg() != dst.reg()) {
+ sel.MOV(dst, unpacked);
}
- } else if (sel.hasLongType() && sel.hasLongRegRestrict() && dstFamily == FAMILY_QWORD && srcFamily != FAMILY_QWORD) {
- // Convert i32/i16/i8/float to i64/double if hasLongRegRestrict(src and dst hstride must be aligned to the same qword).
+ }
+ }
+
+ INLINE void convertSmallIntsToI64(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ const RegisterFamily srcFamily = getFamily(srcType);
+
+ if (sel.hasLongType() && sel.hasLongRegRestrict()) {
+ // Convert i32/i16/i8 to i64 if hasLongRegRestrict(src and dst hstride must be aligned to the same qword).
GenRegister unpacked;
GenRegister unpacked_src = src;
sel.push();
- if (sel.isScalarReg(insn.getSrc(0))) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
- if (srcType == ir::TYPE_FLOAT) {
- unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, GEN_TYPE_F);
- } else if(srcFamily == FAMILY_DWORD) {
- unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, srcType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D);
- } else if(srcFamily == FAMILY_WORD) {
- unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, srcType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W);
- } else if(srcFamily == FAMILY_BYTE) {
- GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, sel.isScalarReg(insn.getSrc(0))));
- tmp = GenRegister::retype(tmp, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
- unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
- unpacked = GenRegister::retype(unpacked, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
- sel.MOV(tmp, src);
- unpacked_src = tmp;
- } else
- GBE_ASSERT(0);
+ if(srcFamily == FAMILY_DWORD) {
+ unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, srcType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D);
+ } else if(srcFamily == FAMILY_WORD) {
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, srcType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W);
+ } else if(srcFamily == FAMILY_BYTE) {
+ GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, sel.isScalarReg(insn.getSrc(0))));
+ tmp = GenRegister::retype(tmp, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
+ sel.MOV(tmp, src);
+ unpacked_src = tmp;
+ } else
+ GBE_ASSERT(0);
- sel.MOV(unpacked, unpacked_src);
+ sel.MOV(unpacked, unpacked_src);
sel.pop();
sel.MOV(dst, unpacked);
- }else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
- (src.isdf() && dstType == ir::TYPE_FLOAT)) { // float and double conversion
- ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
- sel.MOV_DF(dst, src, sel.selReg(r, TYPE_U64));
- } else if (dst.isint64()) { // promote to i64
- switch(src.type) {
- case GEN_TYPE_F:
- {
- if (!sel.hasLongType()) {
- GenRegister tmp[2];
- tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_FLOAT);
- sel.push();
- sel.curr.flag = 0;
- sel.curr.subFlag = 1;
- sel.CONVF_TO_I64(dst, src, tmp);
- sel.pop();
- } else {
- sel.MOV(dst, src);
- }
- break;
- }
- case GEN_TYPE_HF:
- {
- GBE_ASSERT(sel.hasLongType());
- GBE_ASSERT(sel.hasHalfType());
- uint32_t type = dstType == TYPE_U64 ? GEN_TYPE_UD : GEN_TYPE_D;
- GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))), TYPE_U32), type);
- sel.push();
- if (sel.isScalarReg(insn.getSrc(0))) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
- sel.MOV(tmp, src);
- sel.pop();
- sel.MOV(dst, tmp);
- break;
- }
- case GEN_TYPE_DF:
- NOT_IMPLEMENTED;
- default:
- if (sel.hasLongType()) {
- sel.MOV(dst, src);
- } else {
- sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
- }
+ } else if (sel.hasLongType()) {
+ sel.MOV(dst, src);
+ } else {
+ sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
+ }
+ }
+
+ INLINE void convertFToI64(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+
+ if (sel.hasLongType() && sel.hasLongRegRestrict() && srcType == ir::TYPE_FLOAT) { // typical bsw float->long case
+ // Convert float to i64 if hasLongRegRestrict(src and dst hstride must be aligned to the same qword).
+ GenRegister unpacked;
+ GenRegister unpacked_src = src;
+
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
}
- } else if (srcType == ir::TYPE_HALF && (dstFamily == FAMILY_BYTE || dstFamily == FAMILY_WORD)) {
- // Special case, half -> char/short.
- /* [DevBDW+]: Format conversion to or from HF (Half Float) must be DWord-aligned and
- strided by a DWord on the destination. */
+
+ unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, GEN_TYPE_F);
+ sel.MOV(unpacked, unpacked_src);
+ sel.pop();
+ sel.MOV(dst, unpacked);
+ } else if (srcType == ir::TYPE_FLOAT) {
+ if (sel.hasLongType()) { // typical bdw float->long case
+ sel.MOV(dst, src);
+ } else { // typical old platform float->long case
+ GenRegister tmp[2];
+ tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_FLOAT);
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.CONVF_TO_I64(dst, src, tmp);
+ sel.pop();
+ }
+ } else if (srcType == ir::TYPE_HALF) {
+ /* No need to consider old platform. If we support half, we must have native long. */
+ GBE_ASSERT(sel.hasLongType());
GBE_ASSERT(sel.hasHalfType());
- GenRegister tmp;
+ uint32_t type = dstType == TYPE_U64 ? GEN_TYPE_UD : GEN_TYPE_D;
+ GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))), TYPE_U32), type);
sel.push();
if (sel.isScalarReg(insn.getSrc(0))) {
sel.curr.execWidth = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.noMask = 1;
}
- if (dstFamily == FAMILY_BYTE) {
- const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
- tmp = GenRegister::retype(sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), type);
- sel.MOV(tmp, src);
+
+ sel.MOV(tmp, src);
+
+ if (sel.hasLongRegRestrict()) { // special for BSW case.
+ GenRegister unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, type);
+ sel.MOV(unpacked, tmp);
+ sel.pop();
+ sel.MOV(dst, unpacked);
} else {
- const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
- tmp = GenRegister::retype(sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), type);
- sel.MOV(tmp, src);
+ sel.pop();
+ sel.MOV(dst, tmp);
}
+ } else if (src.type == GEN_TYPE_DF) {
+ GBE_ASSERT(sel.hasDoubleType());
+ GBE_ASSERT(sel.hasLongType()); //So far, if we support double, we support native long.
+
+ // Just Mov
+ sel.MOV(dst, src);
+ } else {
+ /* Invalid case. */
+ GBE_ASSERT(0);
+ }
+ }
+
+ INLINE void convertBetweenFloatDouble(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+
+ GBE_ASSERT(sel.hasDoubleType());
+
+ if (sel.isScalarReg(insn.getDst(0))) {
+ // dst is scalar, just MOV and nothing more.
+ GBE_ASSERT(sel.isScalarReg(insn.getSrc(0)));
+ sel.MOV(dst, src);
+ } else if (srcType == ir::TYPE_DOUBLE) {
+ // double to float
+ GBE_ASSERT(dstType == ir::TYPE_FLOAT);
+ GenRegister unpacked;
+ unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, GEN_TYPE_F);
+
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.MOV(unpacked, src);
sel.pop();
- sel.MOV(dst, tmp);
- } else if (dstType == ir::TYPE_HALF && (srcFamily == FAMILY_BYTE || srcFamily == FAMILY_WORD)) {
- // Special case, char/uchar -> half
- /* [DevBDW+]: Format conversion to or from HF (Half Float) must be DWord-aligned and
- strided by a DWord on the destination. */
- GBE_ASSERT(sel.hasHalfType());
- GenRegister tmp = GenRegister::retype(sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), GEN_TYPE_HF);
+
+ sel.MOV(dst, unpacked);
+ } else {
+ // float to double, just mov
+ sel.MOV(dst, src);
+ }
+
+ return;
+ }
+
+ INLINE void convertBetweenHalfDouble(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+
+ GBE_ASSERT(sel.hasDoubleType());
+ GBE_ASSERT(sel.hasHalfType()); //So far, if we support double, we support half.
+
+ if (sel.isScalarReg(insn.getDst(0))) { // uniform case.
+ GBE_ASSERT(sel.isScalarReg(insn.getSrc(0)));
+ GBE_ASSERT(sel.curr.execWidth == 1);
+ GenRegister tmpFloat = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_F);
+ sel.MOV(tmpFloat, src);
+ sel.MOV(dst, tmpFloat);
+ return;
+ }
+
+ if (dstType == ir::TYPE_DOUBLE) {
+ // half to double. There is no direct double to half MOV, need tmp float.
+ GBE_ASSERT(srcType == ir::TYPE_HALF);
+ GenRegister tmpFloat = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_F);
+
sel.push();
if (sel.isScalarReg(insn.getSrc(0))) {
sel.curr.execWidth = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.noMask = 1;
}
+ sel.MOV(tmpFloat, src);
+ sel.pop();
+
+ sel.MOV(dst, tmpFloat);
+ } else {
+ // double to half. No direct MOV from double to half, so double->float->half
+ GBE_ASSERT(srcType == ir::TYPE_DOUBLE);
+ GBE_ASSERT(dstType == ir::TYPE_HALF);
+
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
+ // double to float
+ GenRegister unpackedFloat = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpackedFloat = GenRegister::retype(unpackedFloat, GEN_TYPE_F);
+ sel.MOV(unpackedFloat, src);
+
+ // float to half
+ GenRegister unpackedHalf = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpackedHalf = GenRegister::retype(unpackedHalf, GEN_TYPE_HF);
+ sel.MOV(unpackedHalf, unpackedFloat);
+ sel.pop();
+
+ sel.MOV(dst, unpackedHalf);
+ }
+ }
+
+ INLINE void convertHalfToSmallInts(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ const RegisterFamily dstFamily = getFamily(dstType);
+
+ // Special case, half -> char/short.
+ /* [DevBDW+]: Format conversion to or from HF (Half Float) must be DWord-aligned and
+ strided by a DWord on the destination. */
+ GBE_ASSERT(sel.hasHalfType());
+ GenRegister tmp;
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ if (dstFamily == FAMILY_BYTE) {
+ const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
+ tmp = GenRegister::retype(sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), type);
+ sel.MOV(tmp, src);
+ } else {
+ const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+ tmp = GenRegister::retype(sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), type);
sel.MOV(tmp, src);
+ }
+ sel.pop();
+ sel.MOV(dst, tmp);
+ }
+
+ INLINE void convertSmallIntsToHalf(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+
+ // Special case, char/uchar -> half
+ /* [DevBDW+]: Format conversion to or from HF (Half Float) must be DWord-aligned and
+ strided by a DWord on the destination. */
+ GBE_ASSERT(sel.hasHalfType());
+ GenRegister tmp = GenRegister::retype(sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), GEN_TYPE_HF);
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.MOV(tmp, src);
+ sel.pop();
+ sel.MOV(dst, tmp);
+ }
+
+ INLINE void convertDoubleToSmallInts(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ const RegisterFamily dstFamily = getFamily(dstType);
+
+ GBE_ASSERT(sel.hasDoubleType());
+ GBE_ASSERT(sel.hasHalfType()); //So far, if we support double, we support half.
+ if (sel.isScalarReg(insn.getDst(0))) {
+ // dst is scalar, just MOV and nothing more.
+ GBE_ASSERT(sel.isScalarReg(insn.getSrc(0)));
+ sel.MOV(dst, src);
+ } else {
+ GenRegister unpacked;
+ if (dstFamily == FAMILY_DWORD) {
+ // double to int
+ unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D);
+ } else if (dstFamily == FAMILY_WORD) {
+ // double to short
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W);
+ } else {
+ GBE_ASSERT(dstFamily == FAMILY_BYTE);
+ // double to char
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, dstType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
+ }
+
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.MOV(unpacked, src);
sel.pop();
- sel.MOV(dst, tmp);
- } else
+
+ sel.MOV(dst, unpacked);
+ }
+ }
+
+ INLINE void convertI64ToDouble(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+
+ GBE_ASSERT(sel.hasDoubleType());
+ GBE_ASSERT(sel.hasLongType()); //So far, if we support double, we support native long.
+ // Just Mov
+ sel.MOV(dst, src);
+ }
+
+ INLINE void convertSmallIntsToDouble(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ const RegisterFamily srcFamily = getFamily(srcType);
+
+ GBE_ASSERT(sel.hasDoubleType());
+ GBE_ASSERT(sel.hasLongType()); //So far, if we support double, we support native long.
+
+ if (sel.hasLongType() && sel.hasLongRegRestrict()) {
+ // Convert i32/i16/i8 to i64 if hasLongRegRestrict(src and dst hstride must be aligned to the same qword).
+ GenRegister unpacked;
+ GenRegister unpacked_src = src;
+
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
+ if(srcFamily == FAMILY_DWORD) {
+ unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, srcType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D);
+ } else if(srcFamily == FAMILY_WORD) {
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, srcType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W);
+ } else if(srcFamily == FAMILY_BYTE) {
+ GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, sel.isScalarReg(insn.getSrc(0))));
+ tmp = GenRegister::retype(tmp, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
+ sel.MOV(tmp, src);
+ unpacked_src = tmp;
+ } else
+ GBE_ASSERT(0);
+
+ sel.MOV(unpacked, unpacked_src);
+ sel.pop();
+ sel.MOV(dst, unpacked);
+ } else if (sel.hasLongType()) {
+ sel.MOV(dst, src);
+ }
+ }
+
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const RegisterFamily dstFamily = getFamily(dstType);
+ const RegisterFamily srcFamily = getFamily(srcType);
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ const Opcode opcode = insn.getOpcode();
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ if(opcode == ir::OP_SAT_CVT)
+ sel.curr.saturate = 1;
+
+ if (opcode == OP_F16TO32 || opcode == OP_F32TO16) {
+ /* Conversion between float and half. */
+ convertBetweenHalfFloat(sel, insn, markChildren);
+ } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {
+ //convert i32/float to small int/half
+ convert32bitsToSmall(sel, insn, markChildren);
+ } else if (dstFamily == FAMILY_WORD && srcFamily == FAMILY_QWORD && srcType != ir::TYPE_DOUBLE) {
+ //convert i64 to i16 and half.
+ convertI64To16bits(sel, insn, markChildren);
+ } else if (dstFamily == FAMILY_BYTE && srcFamily == FAMILY_QWORD && srcType != ir::TYPE_DOUBLE) {
+ //convert i64 to i8
+ convertI64ToI8(sel, insn, markChildren);
+ } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) &&
+ (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
+ // Convert i64 to i32
+ convertI64ToI32(sel, insn, markChildren);
+ } else if (dstType == ir::TYPE_FLOAT && (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
+ // long -> float
+ convertI64ToFloat(sel, insn, markChildren);
+ } else if (dstType == ir::TYPE_DOUBLE && (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
+ // long -> double
+ convertI64ToDouble(sel, insn, markChildren);
+ } else if ((dstType == ir::TYPE_U64 || dstType == ir::TYPE_S64)
+ && (srcFamily != FAMILY_QWORD && srcType != ir::TYPE_FLOAT && srcType != ir::TYPE_HALF)) {
+ // int/short/char to long
+ convertSmallIntsToI64(sel, insn, markChildren);
+ } else if ((dstType == ir::TYPE_DOUBLE)
+ && (srcFamily != FAMILY_QWORD && srcType != ir::TYPE_FLOAT && srcType != ir::TYPE_HALF)) {
+ // int/short/char to double
+ convertSmallIntsToDouble(sel, insn, markChildren);
+ } else if ((dstType == ir::TYPE_U64 || dstType == ir::TYPE_S64)
+ && (srcType == ir::TYPE_FLOAT || srcType == ir::TYPE_HALF || srcType == ir::TYPE_DOUBLE)) {
+ // All float type to long
+ convertFToI64(sel, insn, markChildren);
+ } else if ((srcType == ir::TYPE_FLOAT && dstType == ir::TYPE_DOUBLE) ||
+ (dstType == ir::TYPE_FLOAT && srcType == ir::TYPE_DOUBLE)) {
+ // float and double conversion
+ convertBetweenFloatDouble(sel, insn, markChildren);
+ } else if ((srcType == ir::TYPE_HALF && dstType == ir::TYPE_DOUBLE) ||
+ (dstType == ir::TYPE_HALF && srcType == ir::TYPE_DOUBLE)) {
+ // float and half conversion
+ convertBetweenHalfDouble(sel, insn, markChildren);
+ } else if (srcType == ir::TYPE_DOUBLE && dstType != ir::TYPE_FLOAT
+ && dstType != ir::TYPE_HALF && dstFamily != FAMILY_QWORD) {
+ // double to int/short/char
+ convertDoubleToSmallInts(sel, insn, markChildren);
+ } else if (srcType == ir::TYPE_HALF && (dstFamily == FAMILY_BYTE || dstFamily == FAMILY_WORD)) {
+ // Convert half to small int
+ convertHalfToSmallInts(sel, insn, markChildren);
+ } else if (dstType == ir::TYPE_HALF && (srcFamily == FAMILY_BYTE || srcFamily == FAMILY_WORD)) {
+ // Convert small int to half
+ convertSmallIntsToHalf(sel, insn, markChildren);
+ } else {
+ /* All special cases has been handled, just MOV. */
sel.MOV(dst, src);
+ }
sel.pop();
-
return true;
}
DECL_CTOR(ConvertInstruction, 1, 1);
@@ -4511,49 +5446,36 @@ namespace gbe
this->opcodes.push_back(ir::Opcode(op));
}
- INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::AtomicInstruction &insn) const {
- using namespace ir;
- SelectionDAG *child0 = dag.child[0];
- ir::BTI b;
- if (insn.isFixedBTI()) {
- const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
- const auto imm = immInsn.getImmediate();
- b.isConst = 1;
- b.imm = imm.getIntegerValue();
- } else {
- b.isConst = 0;
- b.reg = insn.getBTI();
- }
- return b;
- }
-
INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
using namespace ir;
const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
- ir::BTI b = getBTI(dag, insn);
+ ir::BTI b;
const AtomicOps atomicOp = insn.getAtomicOpcode();
unsigned srcNum = insn.getSrcNum();
- unsigned opNum = srcNum - 1;
+ unsigned msgPayload;
+
+ AddressMode AM = insn.getAddressMode();
+ if (AM == AM_DynamicBti) {
+ b.reg = insn.getBtiReg();
+ msgPayload = srcNum - 1;
+ } else {
+ b.imm = insn.getSurfaceIndex();
+ b.isConst = 1;
+ msgPayload = srcNum;
+ }
GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
GenRegister bti = b.isConst ? GenRegister::immud(b.imm) : sel.selReg(b.reg, ir::TYPE_U32);
- GenRegister src0 = sel.selReg(insn.getSrc(1), TYPE_U32); //address
+ GenRegister src0 = sel.selReg(insn.getAddressRegister(), TYPE_U32);
GenRegister src1 = src0, src2 = src0;
- if(srcNum > 2) src1 = sel.selReg(insn.getSrc(2), TYPE_U32);
- if(srcNum > 3) src2 = sel.selReg(insn.getSrc(3), TYPE_U32);
-
- GenRegister flagTemp = sel.selReg(sel.reg(FAMILY_WORD, true), TYPE_U16);
+ if(msgPayload > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
+ if(msgPayload > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
+ sel.ATOMIC(dst, genAtomicOp, msgPayload, src0, src1, src2, bti, sel.getBTITemps(b));
- sel.ATOMIC(dst, genAtomicOp, opNum, src0, src1, src2, bti, b.isConst ? NULL : &flagTemp);
-
- // for fixed bti, don't generate the useless loadi
- if (insn.isFixedBTI())
- dag.child[0] = NULL;
markAllChildren(dag);
-
return true;
}
};
@@ -4654,7 +5576,16 @@ namespace gbe
}
case OP_MAD:
{
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)))
+ sel.curr.execWidth = 1;
sel.MAD(dst, src2, src0, src1);
+ sel.pop();
+ break;
+ }
+ case OP_LRP:
+ {
+ sel.LRP(dst, src0, src1, src2);
break;
}
default:
@@ -4844,6 +5775,47 @@ namespace gbe
DECL_CTOR(SampleInstruction, 1, 1);
};
+ DECL_PATTERN(VmeInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::VmeInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ uint32_t msg_type, vme_search_path_lut, lut_sub;
+ msg_type = insn.getMsgType();
+ vme_search_path_lut = 0;
+ lut_sub = 0;
+ GBE_ASSERT(msg_type == 1);
+ uint32_t payloadLen = 0;
+ //We allocate 5 virtual payload grfs to selection dst register.
+ if(msg_type == 1){
+ payloadLen = 5;
+ }
+ uint32_t selDstNum = insn.getDstNum() + payloadLen;
+ uint32_t srcNum = insn.getSrcNum();
+ vector<GenRegister> dst(selDstNum);
+ vector<GenRegister> payloadVal(srcNum);
+ uint32_t valueID = 0;
+ for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
+ dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
+ for (valueID = insn.getDstNum(); valueID < selDstNum; ++valueID)
+ dst[valueID] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+
+ for (valueID = 0; valueID < srcNum; ++valueID)
+ payloadVal[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+
+ uint32_t bti = insn.getImageIndex();
+ if (bti > BTI_MAX_ID) {
+ std::cerr << "Too large bti " << bti;
+ return false;
+ }
+
+ sel.VME(bti, dst.data(), payloadVal.data(), selDstNum, srcNum, msg_type, vme_search_path_lut, lut_sub);
+
+ return true;
+ }
+ DECL_CTOR(VmeInstruction, 1, 1);
+ };
+
/*! Typed write instruction pattern. */
DECL_PATTERN(TypedWriteInstruction)
{
@@ -5013,17 +5985,28 @@ namespace gbe
}
sel.push();
- if (sel.isScalarReg(insn.getDst(0))) {
- sel.curr.execWidth = 1;
- sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.curr.noMask = 1;
- }
- if (src1.file == GEN_IMMEDIATE_VALUE)
- sel.SIMD_SHUFFLE(dst, src0, src1);
- else {
- GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- sel.SHL(shiftL, src1, GenRegister::immud(0x2));
- sel.SIMD_SHUFFLE(dst, src0, shiftL);
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ if (sel.isScalarReg(insn.getDst(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.MOV(dst, src0); //no matter what src1 is
+ } else {
+ if (src1.file == GEN_IMMEDIATE_VALUE) {
+ uint32_t offset = src1.value.ud % sel.curr.execWidth;
+ GenRegister reg = GenRegister::subphysicaloffset(src0, offset);
+ reg.vstride = GEN_VERTICAL_STRIDE_0;
+ reg.hstride = GEN_HORIZONTAL_STRIDE_0;
+ reg.width = GEN_WIDTH_1;
+ sel.MOV(dst, reg);
+ }
+ else {
+ GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ uint32_t SHLimm = typeSize(getGenType(type)) == 2 ? 1 : (typeSize(getGenType(type)) == 4 ? 2 : 3);
+ sel.SHL(shiftL, src1, GenRegister::immud(SHLimm));
+ sel.SIMD_SHUFFLE(dst, src0, shiftL);
+ }
}
sel.pop();
return true;
@@ -5079,6 +6062,156 @@ namespace gbe
}
};
+ class CalcTimestampInstructionPattern : public SelectionPattern
+ {
+ public:
+ CalcTimestampInstructionPattern(void) : SelectionPattern(1,1) {
+ this->opcodes.push_back(ir::OP_CALC_TIMESTAMP);
+ }
+ INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+ using namespace ir;
+ const ir::CalcTimestampInstruction &insn = cast<ir::CalcTimestampInstruction>(dag.insn);
+ uint32_t pointNum = insn.getPointNum();
+ uint32_t tsType = insn.getTimestamptType();
+ GBE_ASSERT(sel.ctx.getSimdWidth() == 16 || sel.ctx.getSimdWidth() == 8);
+ GenRegister tmp;
+ GenRegister ts[5];
+ int tsNum;
+ if (sel.ctx.getSimdWidth() == 16) {
+ if (!sel.hasLongType())
+ tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD)), GEN_TYPE_UD);
+ ts[0] = GenRegister::retype(sel.selReg(ir::ocl::profilingts0, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[1] = GenRegister::retype(sel.selReg(ir::ocl::profilingts1, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[2] = GenRegister::retype(sel.selReg(ir::ocl::profilingts2, ir::TYPE_U32), GEN_TYPE_UW);
+ tsNum = 3;
+ } else {
+ if (!sel.hasLongType())
+ tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ ts[0] = GenRegister::retype(sel.selReg(ir::ocl::profilingts0, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[1] = GenRegister::retype(sel.selReg(ir::ocl::profilingts1, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[2] = GenRegister::retype(sel.selReg(ir::ocl::profilingts2, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[3] = GenRegister::retype(sel.selReg(ir::ocl::profilingts3, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[4] = GenRegister::retype(sel.selReg(ir::ocl::profilingts4, ir::TYPE_U32), GEN_TYPE_UD);
+ tsNum = 5;
+ }
+
+ sel.push(); {
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.CALC_TIMESTAMP(ts, tsNum, tmp, pointNum, tsType);
+ } sel.pop();
+ markAllChildren(dag);
+ return true;
+ }
+ };
+
+ class StoreProfilingInstructionPattern : public SelectionPattern
+ {
+ public:
+ StoreProfilingInstructionPattern(void) : SelectionPattern(1,1) {
+ this->opcodes.push_back(ir::OP_STORE_PROFILING);
+ }
+ INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+ using namespace ir;
+ const ir::StoreProfilingInstruction &insn = cast<ir::StoreProfilingInstruction>(dag.insn);
+ uint32_t profilingType = insn.getProfilingType();
+ uint32_t BTI = insn.getBTI();
+ GBE_ASSERT(sel.ctx.getSimdWidth() == 16 || sel.ctx.getSimdWidth() == 8);
+ GenRegister tmp0;
+ GenRegister tmp1;
+ GenRegister ts[5];
+ int tsNum;
+ if (sel.ctx.getSimdWidth() == 16) {
+ tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ ts[0] = GenRegister::retype(sel.selReg(ir::ocl::profilingts0, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[1] = GenRegister::retype(sel.selReg(ir::ocl::profilingts1, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[2] = GenRegister::retype(sel.selReg(ir::ocl::profilingts2, ir::TYPE_U32), GEN_TYPE_UW);
+ tsNum = 3;
+ } else {
+ tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ tmp1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ ts[0] = GenRegister::retype(sel.selReg(ir::ocl::profilingts0, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[1] = GenRegister::retype(sel.selReg(ir::ocl::profilingts1, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[2] = GenRegister::retype(sel.selReg(ir::ocl::profilingts2, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[3] = GenRegister::retype(sel.selReg(ir::ocl::profilingts3, ir::TYPE_U32), GEN_TYPE_UD);
+ ts[4] = GenRegister::retype(sel.selReg(ir::ocl::profilingts4, ir::TYPE_U32), GEN_TYPE_UD);
+ tsNum = 5;
+ }
+ sel.push(); {
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.STORE_PROFILING(profilingType, BTI, tmp0, tmp1, ts, tsNum);
+ } sel.pop();
+ markAllChildren(dag);
+ return true;
+ }
+ };
+
+ class PrintfInstructionPattern : public SelectionPattern
+ {
+ public:
+ PrintfInstructionPattern(void) : SelectionPattern(1,1) {
+ this->opcodes.push_back(ir::OP_PRINTF);
+ }
+ INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+ using namespace ir;
+ const ir::PrintfInstruction &insn = cast<ir::PrintfInstruction>(dag.insn);
+ uint16_t num = insn.getNum();
+ uint8_t BTI = insn.getBti();
+ GenRegister tmp0, tmp1;
+ uint32_t srcNum = insn.getSrcNum();
+ GenRegister dst = sel.selReg(insn.getDst(0), TYPE_S32);
+ //GBE_ASSERT(srcNum);
+ uint32_t i = 0;
+ uint32_t totalSize = 0;
+ bool isContinue = false;
+ GBE_ASSERT(sel.ctx.getSimdWidth() == 16 || sel.ctx.getSimdWidth() == 8);
+ if (sel.ctx.getSimdWidth() == 16) {
+ tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ tmp1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ } else {
+ tmp0 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), GEN_TYPE_UD);
+ tmp1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), GEN_TYPE_UD);
+ }
+
+ /* Get the total size for one printf statement. */
+ for (i = 0; i < srcNum; i++) {
+ Type type = insn.getType(i);
+ if (type == TYPE_DOUBLE || type == TYPE_S64 || type == TYPE_U64) {
+ totalSize += 8;
+ } else {
+ totalSize += 4; // Make sure always align to 4.
+ }
+ }
+
+ i = 0;
+ GenRegister regs[8];
+ if (srcNum == 0) {
+ sel.PRINTF(dst, BTI, tmp0, tmp1, regs, srcNum, num, isContinue, totalSize);
+ } else {
+ do {
+ uint32_t s = srcNum < 8 ? srcNum : 8;
+ for (uint32_t j = 0; j < s; j++) {
+ regs[j] = sel.selReg(insn.getSrc(i + j), insn.getType(i + j));
+ }
+ sel.PRINTF(dst, BTI, tmp0, tmp1, regs, s, num, isContinue, totalSize);
+
+ if (srcNum > 8) {
+ srcNum -= 8;
+ i += 8;
+ } else {
+ srcNum = 0;
+ }
+
+ isContinue = true;
+ } while(srcNum);
+ }
+
+ markAllChildren(dag);
+ return true;
+ }
+ };
+
/*! Branch instruction pattern */
class BranchInstructionPattern : public SelectionPattern
{
@@ -5135,7 +6268,8 @@ namespace gbe
sel.curr.execWidth = 1;
sel.curr.noMask = 1;
sel.curr.predicate = GEN_PREDICATE_NONE;
- sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, curr->getLabelIndex());
+ // Actually, the origin of this JMPI should be the beginning of next BB.
+ sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, ir::LabelIndex(curr->getLabelIndex().value() + 1));
sel.pop();
}
}
@@ -5274,6 +6408,349 @@ namespace gbe
};
+ /*! WorkGroup instruction pattern */
+ DECL_PATTERN(WorkGroupInstruction)
+ {
+ /* WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ * Shared local memory based communication between threads,
+ * prepare for the workgroup op in gen context
+ * Algorithm logic is in gen context, */
+ INLINE bool emitWGReduce(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
+ {
+ using namespace ir;
+
+ GBE_ASSERT(insn.getSrcNum() == 3);
+ GBE_ASSERT(insn.getSrc(0) == ocl::threadn);
+ GBE_ASSERT(insn.getSrc(1) == ocl::threadid);
+
+ const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
+ const Type type = insn.getType();
+ GenRegister dst = sel.selReg(insn.getDst(0), type);
+ GenRegister src = sel.selReg(insn.getSrc(2), type);
+ GenRegister tmpData1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
+ GenRegister tmpData2 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
+ GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U32);
+ GenRegister localThreadID = sel.selReg(ocl::threadid, TYPE_U32);
+ GenRegister localThreadNUM = sel.selReg(ocl::threadn, TYPE_U32);
+ GenRegister localBarrier = GenRegister::ud8grf(sel.reg(FAMILY_DWORD));
+
+ /* Allocate registers for message sending
+ * (read/write to shared local memory) */
+ vector<GenRegister> msg;
+ for(uint32_t i = 0; i < 6; i++)
+ msg.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
+
+ /* Insert a barrier to make sure all the var we are interested in
+ have been assigned the final value. */
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+
+ /* Pass the shared local memory offset */
+ sel.MOV(slmOff, GenRegister::immud(insn.getSlmAddr()));
+
+ /* Perform workgroup op */
+ sel.WORKGROUP_OP(workGroupOp, dst, src, tmpData1,
+ localThreadID, localThreadNUM, tmpData2, slmOff, msg, 6,
+ localBarrier);
+
+ return true;
+ }
+
+ /* WORKGROUP OP: BROADCAST
+ * 1. BARRIER Ensure all the threads have set the correct value for the var which will be broadcasted.
+ 2. CMP IDs Compare the local IDs with the specified ones in the function call.
+ 3. STORE Use flag to control the store of the var. Only the specified item will execute the store.
+ 4. BARRIER Ensure the specified value has been stored.
+ 5. LOAD Load the stored value to all the dst value, the dst of all the items will have same value,
+ so broadcasted. */
+ INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const
+ {
+ using namespace ir;
+
+ const uint32_t srcNum = insn.getSrcNum();
+ GBE_ASSERT(srcNum >= 2);
+
+ const Type type = insn.getType();
+ const GenRegister src = sel.selReg(insn.getSrc(0), type);
+ const GenRegister dst = sel.selReg(insn.getDst(0), type);
+ const uint32_t slmAddr = insn.getSlmAddr();
+ GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ vector<GenRegister> fakeTemps;
+ fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
+ fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
+
+ GenRegister coords[3];
+ for (uint32_t i = 1; i < srcNum; i++)
+ coords[i - 1] = GenRegister::toUniform(sel.selReg(insn.getSrc(i), TYPE_U32), GEN_TYPE_UD);
+
+ sel.push(); {
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(addr, GenRegister::immud(slmAddr));
+ } sel.pop();
+
+ /* insert a barrier to make sure all the var we are interested in
+ have been assigned the final value. */
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+
+ sel.push(); {
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ GenRegister lid0, lid1, lid2;
+ uint32_t dim = srcNum - 1;
+ lid0 = GenRegister::retype(sel.selReg(ocl::lid0, TYPE_U32), GEN_TYPE_UD);
+ lid1 = GenRegister::retype(sel.selReg(ocl::lid1, TYPE_U32), GEN_TYPE_UD);
+ lid2 = GenRegister::retype(sel.selReg(ocl::lid2, TYPE_U32), GEN_TYPE_UD);
+
+ sel.CMP(GEN_CONDITIONAL_EQ, coords[0], lid0,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ if (dim >= 2)
+ sel.CMP(GEN_CONDITIONAL_EQ, coords[1], lid1,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ if (dim >= 3)
+ sel.CMP(GEN_CONDITIONAL_EQ, coords[2], lid2,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+
+ /* write to shared local memory for BYTE/WORD/DWORD types */
+ if (typeSize(src.type) <= 4) {
+ GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ GenRegister _src = GenRegister::retype(src, GEN_TYPE_UD);
+ sel.UNTYPED_WRITE(_addr, &_src, 1, GenRegister::immw(0xfe), fakeTemps);
+ }
+ /* write to shared local memory for QWORD types */
+ else if (typeSize(src.type) == 8) {
+ sel.push(); {
+ /* arrange data in QWORD */
+ GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ GenRegister srcQW = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+ GenRegister srcQW_p1 = src.retype(srcQW, GEN_TYPE_UD);
+ GenRegister srcQW_p2 = src.retype(src.offset(srcQW, 2, 0), GEN_TYPE_UD);
+ vector<GenRegister> srcVec;
+ srcVec.push_back(srcQW_p1);
+ srcVec.push_back(srcQW_p2);
+
+ /* unpack into 2 DWORD */
+ sel.UNPACK_LONG(srcQW, src);
+
+ /* emit write through SEND */
+ sel.UNTYPED_WRITE(_addr, srcVec.data(), 2,
+ GenRegister::immw(0xfe), fakeTemps);
+ }sel.pop();
+ }
+ else
+ GBE_ASSERT(0);
+
+ } sel.pop();
+ /* make sure the slm var have the valid value now */
+ sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
+ sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+
+ /* read from shared local memory for BYTE/WORD/DWORD types */
+ if (typeSize(src.type) <= 4) {
+ GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ GenRegister _dst = GenRegister::retype(dst, GEN_TYPE_UD);
+ sel.UNTYPED_READ(_addr, &_dst, 1, GenRegister::immw(0xfe), fakeTemps);
+ }
+ /* read from shared local memory for QWORD types */
+ else if (typeSize(src.type) == 8) {
+ GenRegister _addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ vector<GenRegister> _dst;
+ _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32));
+ _dst.push_back(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U32));
+ GenRegister _dstQ = dst.toUniform(_dst[0], GEN_TYPE_UL);
+
+ sel.push(); {
+ /* emit read through SEND */
+ sel.curr.execWidth = 8;
+ sel.UNTYPED_READ(_addr, _dst.data(), 2, GenRegister::immw(0xfe), fakeTemps);
+
+ /* reconstruct QWORD type */
+ _dst[0] = dst.toUniform(dst.offset(_dst[0], 0, 4), GEN_TYPE_UD);
+ _dst[1] = dst.toUniform(_dst[1], GEN_TYPE_UD);
+ sel.curr.execWidth = 1;
+ sel.MOV(_dst[0], _dst[1]);
+ } sel.pop();
+
+ /* set all elements assigned to thread */
+ sel.MOV(dst, _dstQ);
+ }
+ else
+ GBE_ASSERT(0);
+
+ return true;
+ }
+
+
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
+
+ if (workGroupOp == WORKGROUP_OP_BROADCAST){
+ return emitWGBroadcast(sel, insn);
+ }
+ else if (workGroupOp >= WORKGROUP_OP_ANY && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX){
+ return emitWGReduce(sel, insn);
+ }
+ else
+ GBE_ASSERT(0);
+
+ return true;
+ }
+ DECL_CTOR(WorkGroupInstruction, 1, 1);
+ };
+
+ /*! SubGroup instruction pattern */
+ class SubGroupInstructionPattern : public SelectionPattern
+ {
+ public:
+ SubGroupInstructionPattern(void) : SelectionPattern(1,1) {
+ for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+ if (ir::isOpcodeFrom<ir::SubGroupInstruction>(ir::Opcode(op)) == true)
+ this->opcodes.push_back(ir::Opcode(op));
+ }
+
+ /* SUBGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ * Shared algorithm with workgroup in-thread */
+ INLINE bool emitSGReduce(Selection::Opaque &sel, const ir::SubGroupInstruction &insn) const
+ {
+ using namespace ir;
+
+ GBE_ASSERT(insn.getSrcNum() == 1);
+
+ const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
+ const Type type = insn.getType();
+ GenRegister dst = sel.selReg(insn.getDst(0), type);
+ GenRegister src = sel.selReg(insn.getSrc(0), type);
+ GenRegister tmpData1 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
+ GenRegister tmpData2 = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
+
+ /* Perform workgroup op */
+ sel.SUBGROUP_OP(workGroupOp, dst, src, tmpData1, tmpData2);
+
+ return true;
+ }
+
+ /* SUBGROUP OP: BROADCAST
+ * Shared algorithm with simd shuffle */
+ INLINE bool emitSGBroadcast(Selection::Opaque &sel, const ir::SubGroupInstruction &insn, SelectionDAG &dag) const
+ {
+ using namespace ir;
+
+ GBE_ASSERT(insn.getSrcNum() == 2);
+
+ const Type type = insn.getType();
+ const GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+ const GenRegister dst = sel.selReg(insn.getDst(0), type);
+ GenRegister src1;
+
+ SelectionDAG *dag0 = dag.child[0];
+ SelectionDAG *dag1 = dag.child[1];
+ if (dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
+ const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
+ src1 = getRegisterFromImmediate(childInsn.getImmediate(), TYPE_U32);
+ if (dag0) dag0->isRoot = 1;
+ } else {
+ markAllChildren(dag);
+ src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
+ }
+
+ sel.push(); {
+ if (src1.file == GEN_IMMEDIATE_VALUE) {
+ uint32_t offset = src1.value.ud % sel.curr.execWidth;
+ GenRegister reg = GenRegister::subphysicaloffset(src0, offset);
+ reg.vstride = GEN_VERTICAL_STRIDE_0;
+ reg.hstride = GEN_HORIZONTAL_STRIDE_0;
+ reg.width = GEN_WIDTH_1;
+ sel.MOV(dst, reg);
+ } else {
+ GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ uint32_t SHLimm = typeSize(getGenType(type)) == 2 ? 1 : (typeSize(getGenType(type)) == 4 ? 2 : 3);
+ sel.SHL(shiftL, src1, GenRegister::immud(SHLimm));
+ sel.SIMD_SHUFFLE(dst, src0, shiftL);
+ }
+ } sel.pop();
+
+ return true;
+ }
+
+ INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+ {
+ using namespace ir;
+ const ir::SubGroupInstruction &insn = cast<SubGroupInstruction>(dag.insn);
+ const WorkGroupOps workGroupOp = insn.getWorkGroupOpcode();
+
+ if (workGroupOp == WORKGROUP_OP_BROADCAST){
+ return emitSGBroadcast(sel, insn, dag);
+ }
+ else if (workGroupOp >= WORKGROUP_OP_ANY && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX){
+ if(emitSGReduce(sel, insn))
+ markAllChildren(dag);
+ else
+ return false;
+ }
+ else
+ GBE_ASSERT(0);
+
+ return true;
+ }
+ };
+
+ /*! Media Block Read pattern */
+ DECL_PATTERN(MediaBlockReadInstruction)
+ {
+ bool emitOne(Selection::Opaque &sel, const ir::MediaBlockReadInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ uint32_t vec_size = insn.getVectorSize();
+ uint32_t simdWidth = sel.curr.execWidth;
+ vector<GenRegister> valuesVec;
+ vector<GenRegister> tmpVec;
+ for (uint32_t i = 0; i < vec_size; ++i) {
+ valuesVec.push_back(sel.selReg(insn.getDst(i), TYPE_U32));
+ if(simdWidth == 16)
+ tmpVec.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
+ }
+ const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
+ const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+ const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ GenRegister *tmp = NULL;
+ if(simdWidth == 16)
+ tmp = &tmpVec[0];
+ sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize());
+ return true;
+ }
+ DECL_CTOR(MediaBlockReadInstruction, 1, 1);
+ };
+
+ /*! Media Block Write pattern */
+ DECL_PATTERN(MediaBlockWriteInstruction)
+ {
+ bool emitOne(Selection::Opaque &sel, const ir::MediaBlockWriteInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ uint32_t vec_size = insn.getVectorSize();
+ const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
+ const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+ vector<GenRegister> valuesVec;
+ vector<GenRegister> tmpVec;
+ for(uint32_t i = 0; i < vec_size; i++)
+ {
+ valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), TYPE_U32));
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ }
+ const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
+ return true;
+ }
+ DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
+ };
+
+
/*! Sort patterns */
INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) {
if (p0->insnNum != p1->insnNum)
@@ -5302,12 +6779,21 @@ namespace gbe
this->insert<MulAddInstructionPattern>();
this->insert<SelectModifierInstructionPattern>();
this->insert<SampleInstructionPattern>();
+ this->insert<VmeInstructionPattern>();
this->insert<GetImageInfoInstructionPattern>();
this->insert<ReadARFInstructionPattern>();
this->insert<RegionInstructionPattern>();
this->insert<SimdShuffleInstructionPattern>();
this->insert<IndirectMovInstructionPattern>();
+ this->insert<CalcTimestampInstructionPattern>();
+ this->insert<StoreProfilingInstructionPattern>();
+ this->insert<WorkGroupInstructionPattern>();
+ this->insert<SubGroupInstructionPattern>();
this->insert<NullaryInstructionPattern>();
+ this->insert<WaitInstructionPattern>();
+ this->insert<PrintfInstructionPattern>();
+ this->insert<MediaBlockReadInstructionPattern>();
+ this->insert<MediaBlockWriteInstructionPattern>();
// Sort all the patterns with the number of instructions they output
for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 3bb00dd..5e28ec9 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -90,8 +90,10 @@ namespace gbe
const GenRegister &dst(uint32_t dstID) const { return regs[dstID]; }
/*! Damn C++ */
const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
- /*! No more than 9 sources (used by typed writes on simd8 mode.) */
- enum { MAX_SRC_NUM = 9 };
+ /*! Set debug information to selection */
+ void setDBGInfo(DebugInfo in) { DBGInfo = in; }
+ /*! No more than 40 sources (40 sources are used by vme for payload passing and setting) */
+ enum { MAX_SRC_NUM = 40 };
/*! No more than 16 destinations (15 used by I64DIV/I64REM) */
enum { MAX_DST_NUM = 16 };
/*! State of the instruction (extra fields neeed for the encoding) */
@@ -129,29 +131,57 @@ namespace gbe
bool isLD; // is this a ld message?
bool isUniform;
};
+ struct {
+ uint16_t vme_bti:8;
+ uint16_t msg_type:2;
+ uint16_t vme_search_path_lut:3;
+ uint16_t lut_sub:2;
+ };
uint32_t barrierType;
+ uint32_t waitType;
bool longjmp;
uint32_t indirect_offset;
+ struct {
+ uint32_t pointNum:16;
+ uint32_t timestampType:16;
+ };
+ struct {
+ uint32_t profilingType:16;
+ uint32_t profilingBTI:16;
+ };
+ struct {
+ uint32_t printfNum:16;
+ uint32_t printfBTI:8;
+ uint32_t continueFlag:8;
+ uint16_t printfSize;
+ };
+ uint32_t workgroupOp;
} extra;
/*! Gen opcode */
uint8_t opcode;
/*! Number of destinations */
uint8_t dstNum:5;
/*! Number of sources */
- uint8_t srcNum:4;
+ uint8_t srcNum:6;
/*! To store various indices */
uint32_t index;
/*! For BRC/IF to store the UIP */
uint32_t index1;
/*! instruction ID used for vector allocation. */
uint32_t ID;
+ DebugInfo DBGInfo;
/*! Variable sized. Destinations and sources go here */
GenRegister regs[0];
INLINE uint32_t getbti() const {
GBE_ASSERT(isRead() || isWrite());
switch (opcode) {
+ case SEL_OP_OBREAD:
+ case SEL_OP_OBWRITE:
+ case SEL_OP_MBREAD:
+ case SEL_OP_MBWRITE:
case SEL_OP_DWORD_GATHER: return extra.function;
case SEL_OP_SAMPLE: return extra.rdbti;
+ case SEL_OP_VME: return extra.vme_bti;
case SEL_OP_TYPED_WRITE: return extra.bti;
default:
GBE_ASSERT(0);
@@ -162,8 +192,13 @@ namespace gbe
INLINE void setbti(uint32_t bti) {
GBE_ASSERT(isRead() || isWrite());
switch (opcode) {
+ case SEL_OP_OBREAD:
+ case SEL_OP_OBWRITE:
+ case SEL_OP_MBREAD:
+ case SEL_OP_MBWRITE:
case SEL_OP_DWORD_GATHER: extra.function = bti; return;
case SEL_OP_SAMPLE: extra.rdbti = bti; return;
+ case SEL_OP_VME: extra.vme_bti = bti; return;
case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
default:
GBE_ASSERT(0);
@@ -226,6 +261,14 @@ namespace gbe
bool removeSimpleIfEndif;
};
+ enum SEL_IR_OPT_FEATURE {
+ //for OP_AND/not/or/xor , on BDW+, SrcMod value indicates a logical source modifier
+ // on PRE-BDW, SrcMod value indicates a numeric source modifier
+ SIOF_LOGICAL_SRCMOD = 1 << 0,
+ //for OP_MOV, on BSW, for long data type, src and dst hstride must be aligned to the same qword
+ SIOF_OP_MOV_LONG_REG_RESTRICT = 1 << 1,
+ };
+
/*! Owns the selection engine */
class GenContext;
/*! Selection engine produces the pre-ISA instruction blocks */
@@ -266,6 +309,13 @@ namespace gbe
class Opaque;
/*! Created and destroyed in cpp */
Opaque *opaque;
+
+ /* optimize at selection IR level */
+ void optimize(void);
+ uint32_t opt_features;
+
+ const GenContext &getCtx();
+
/*! Use custom allocators */
GBE_CLASS(Selection);
};
@@ -305,6 +355,13 @@ namespace gbe
SelectionBxt(GenContext &ctx);
};
+ class SelectionKbl : public Selection
+ {
+ public:
+ /*! Initialize internal structures used for the selection */
+ SelectionKbl(GenContext &ctx);
+ };
+
} /* namespace gbe */
#endif /* __GEN_INSN_SELECTION_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index adbb137..ccaf526 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,8 +1,6 @@
DECL_SELECTION_IR(LABEL, LabelInstruction)
DECL_SELECTION_IR(MOV, UnaryInstruction)
DECL_SELECTION_IR(BSWAP, UnaryWithTempInstruction)
-DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
-DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
DECL_SELECTION_IR(NOT, UnaryInstruction)
DECL_SELECTION_IR(LZD, UnaryInstruction)
@@ -45,6 +43,7 @@ DECL_SELECTION_IR(CMP, CompareInstruction)
DECL_SELECTION_IR(I64CMP, I64CompareInstruction)
DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
DECL_SELECTION_IR(MAD, TernaryInstruction)
+DECL_SELECTION_IR(LRP, TernaryInstruction)
DECL_SELECTION_IR(JMPI, JumpInstruction)
DECL_SELECTION_IR(EOT, EotInstruction)
DECL_SELECTION_IR(INDIRECT_MOVE, IndirectMoveInstruction)
@@ -65,6 +64,7 @@ DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction)
DECL_SELECTION_IR(PACK_LONG, PackLongInstruction)
DECL_SELECTION_IR(UNPACK_LONG, UnpackLongInstruction)
DECL_SELECTION_IR(SAMPLE, SampleInstruction)
+DECL_SELECTION_IR(VME, VmeInstruction)
DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
@@ -90,3 +90,13 @@ DECL_SELECTION_IR(ENDIF, UnaryInstruction)
DECL_SELECTION_IR(ELSE, UnaryInstruction)
DECL_SELECTION_IR(READ_ARF, UnaryInstruction)
DECL_SELECTION_IR(WHILE, UnaryInstruction)
+DECL_SELECTION_IR(F64DIV, F64DIVInstruction)
+DECL_SELECTION_IR(CALC_TIMESTAMP, CalcTimestampInstruction)
+DECL_SELECTION_IR(STORE_PROFILING, StoreProfilingInstruction)
+DECL_SELECTION_IR(WORKGROUP_OP, WorkGroupOpInstruction)
+DECL_SELECTION_IR(SUBGROUP_OP, SubGroupOpInstruction)
+DECL_SELECTION_IR(PRINTF, PrintfInstruction)
+DECL_SELECTION_IR(OBREAD, OBReadInstruction)
+DECL_SELECTION_IR(OBWRITE, OBWriteInstruction)
+DECL_SELECTION_IR(MBREAD, MBReadInstruction)
+DECL_SELECTION_IR(MBWRITE, MBWriteInstruction)
diff --git a/backend/src/backend/gen_insn_selection_optimize.cpp b/backend/src/backend/gen_insn_selection_optimize.cpp
new file mode 100644
index 0000000..b8aa776
--- /dev/null
+++ b/backend/src/backend/gen_insn_selection_optimize.cpp
@@ -0,0 +1,288 @@
+
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_context.hpp"
+#include "ir/function.hpp"
+#include "ir/liveness.hpp"
+#include "ir/profile.hpp"
+#include "sys/cvar.hpp"
+#include "sys/vector.hpp"
+#include <algorithm>
+#include <climits>
+#include <map>
+
+namespace gbe
+{
+ //helper functions
+ static uint32_t CalculateElements(const GenRegister& reg, uint32_t execWidth)
+ {
+ uint32_t elements = 0;
+ uint32_t elementSize = typeSize(reg.type);
+ uint32_t width = GenRegister::width_size(reg);
+ assert(execWidth >= width);
+ uint32_t height = execWidth / width;
+ uint32_t vstride = GenRegister::vstride_size(reg);
+ uint32_t hstride = GenRegister::hstride_size(reg);
+ uint32_t base = reg.subnr;
+ for (uint32_t i = 0; i < height; ++i) {
+ uint32_t offsetInByte = base;
+ for (uint32_t j = 0; j < width; ++j) {
+ uint32_t offsetInType = offsetInByte / elementSize;
+ //it is possible that offsetInType > 32; it doesn't matter even though elements is 32 bit.
+ //the reason is that if one instruction spans several registers,
+ //the other registers' visit pattern is the same as the first register's if the vstride is normal (width * hstride)
+ assert(vstride == width * hstride);
+ elements |= (1 << offsetInType);
+ offsetInByte += hstride * elementSize;
+ }
+ base += vstride * elementSize;
+ }
+ return elements;
+ }
+
+ class SelOptimizer
+ {
+ public:
+ SelOptimizer(const GenContext& ctx, uint32_t features) : ctx(ctx), features(features) {}
+ virtual void run() = 0;
+ virtual ~SelOptimizer() {}
+ protected:
+ const GenContext &ctx; //in case that we need it
+ uint32_t features;
+ };
+
+ class SelBasicBlockOptimizer : public SelOptimizer
+ {
+ public:
+ SelBasicBlockOptimizer(const GenContext& ctx,
+ const ir::Liveness::LiveOut& liveout,
+ uint32_t features,
+ SelectionBlock &bb) :
+ SelOptimizer(ctx, features), bb(bb), liveout(liveout), optimized(false)
+ {
+ }
+ ~SelBasicBlockOptimizer() {}
+ virtual void run();
+
+ private:
+ // local copy propagation
+ class ReplaceInfo
+ {
+ public:
+ ReplaceInfo(SelectionInstruction& insn,
+ const GenRegister& intermedia,
+ const GenRegister& replacement) :
+ insn(insn), intermedia(intermedia), replacement(replacement)
+ {
+ assert(insn.opcode == SEL_OP_MOV);
+ assert(&(insn.src(0)) == &replacement);
+ assert(&(insn.dst(0)) == &intermedia);
+ this->elements = CalculateElements(intermedia, insn.state.execWidth);
+ replacementOverwritten = false;
+ }
+ ~ReplaceInfo()
+ {
+ this->toBeReplaceds.clear();
+ }
+
+ SelectionInstruction& insn;
+ const GenRegister& intermedia;
+ uint32_t elements;
+ const GenRegister& replacement;
+ set<GenRegister*> toBeReplaceds;
+ bool replacementOverwritten;
+ GBE_CLASS(ReplaceInfo);
+ };
+ typedef map<ir::Register, ReplaceInfo*> ReplaceInfoMap;
+ ReplaceInfoMap replaceInfoMap;
+ void doLocalCopyPropagation();
+ void addToReplaceInfoMap(SelectionInstruction& insn);
+ void changeInsideReplaceInfoMap(const SelectionInstruction& insn, GenRegister& var);
+ void removeFromReplaceInfoMap(const SelectionInstruction& insn, const GenRegister& var);
+ void doReplacement(ReplaceInfo* info);
+ bool CanBeReplaced(const ReplaceInfo* info, const SelectionInstruction& insn, const GenRegister& var);
+ void cleanReplaceInfoMap();
+
+ SelectionBlock &bb;
+ const ir::Liveness::LiveOut& liveout;
+ bool optimized;
+ static const size_t MaxTries = 1; //the max times of optimization try
+ };
+
+ void SelBasicBlockOptimizer::doReplacement(ReplaceInfo* info)
+ {
+ for (GenRegister* reg : info->toBeReplaceds) {
+ GenRegister::propagateRegister(*reg, info->replacement);
+ }
+ bb.insnList.erase(&(info->insn));
+ optimized = true;
+ }
+
+ void SelBasicBlockOptimizer::cleanReplaceInfoMap()
+ {
+ for (auto& pair : replaceInfoMap) {
+ ReplaceInfo* info = pair.second;
+ doReplacement(info);
+ delete info;
+ }
+ replaceInfoMap.clear();
+ }
+
+ void SelBasicBlockOptimizer::removeFromReplaceInfoMap(const SelectionInstruction& insn, const GenRegister& var)
+ {
+ for (ReplaceInfoMap::iterator pos = replaceInfoMap.begin(); pos != replaceInfoMap.end(); ++pos) {
+ ReplaceInfo* info = pos->second;
+ if (info->intermedia.reg() == var.reg()) { //intermedia is overwritten
+ if (info->intermedia.quarter == var.quarter && info->intermedia.subnr == var.subnr) {
+ // We need to check if intermedia is fully overwritten; it may be in some predication state.
+ if (CanBeReplaced(info, insn, var))
+ doReplacement(info);
+ }
+ replaceInfoMap.erase(pos);
+ delete info;
+ return;
+ }
+ if (info->replacement.reg() == var.reg()) { //replacement is overwritten
+ //there could be more than one replacement (with different physical subnr) overwritten,
+ //so do not break here, need to scan the whole map.
+ //here is an example:
+ // mov %10, %9.0
+ // mov %11, %9.1
+ // ...
+ // mov %9, %8
+ //both %9.0 and %9.1 are collected into replacement in the ReplaceInfoMap after the first two insts are scanned.
+ //when scan the last inst that %9 is overwritten, we should flag both %9.0 and %9.1 in the map.
+ info->replacementOverwritten = true;
+ }
+ }
+ }
+
+ void SelBasicBlockOptimizer::addToReplaceInfoMap(SelectionInstruction& insn)
+ {
+ assert(insn.opcode == SEL_OP_MOV);
+ const GenRegister& src = insn.src(0);
+ const GenRegister& dst = insn.dst(0);
+ if (src.type != dst.type || src.file != dst.file)
+ return;
+
+ if (liveout.find(dst.reg()) != liveout.end())
+ return;
+
+ ReplaceInfo* info = new ReplaceInfo(insn, dst, src);
+ replaceInfoMap[dst.reg()] = info;
+ }
+
+ bool SelBasicBlockOptimizer::CanBeReplaced(const ReplaceInfo* info, const SelectionInstruction& insn, const GenRegister& var)
+ {
+ //some conditions here are very strict, while some conditions are very light
+ //the reason is that I'm unable to find a perfect condition now in this first version
+ //need to refine the conditions when debugging/optimizing real kernels
+
+ if (insn.opcode == SEL_OP_BSWAP) //should remove once bswap issue is fixed
+ return false;
+
+ if (insn.isWrite() || insn.isRead()) //register in selection vector
+ return false;
+
+ if (features & SIOF_LOGICAL_SRCMOD)
+ if ((insn.opcode == SEL_OP_AND || insn.opcode == SEL_OP_NOT || insn.opcode == SEL_OP_OR || insn.opcode == SEL_OP_XOR) &&
+ (info->replacement.absolute || info->replacement.negation))
+ return false;
+
+ if (features & SIOF_OP_MOV_LONG_REG_RESTRICT && insn.opcode == SEL_OP_MOV) {
+ const GenRegister& dst = insn.dst(0);
+ if (dst.isint64() && !info->replacement.isint64() && info->elements != CalculateElements(info->replacement, insn.state.execWidth))
+ return false;
+ }
+
+ if (info->replacementOverwritten)
+ return false;
+
+ if (info->insn.state.noMask == 0 && insn.state.noMask == 1)
+ return false;
+
+ // If insn is in no predication state, it will overwrite the info insn.
+ if (info->insn.state.predicate != insn.state.predicate && insn.state.predicate != GEN_PREDICATE_NONE)
+ return false;
+
+ if (info->insn.state.inversePredicate != insn.state.inversePredicate)
+ return false;
+
+ if (info->intermedia.type == var.type && info->intermedia.quarter == var.quarter && info->intermedia.subnr == var.subnr) {
+ uint32_t elements = CalculateElements(var, insn.state.execWidth); //considering width, hstrid, vstrid and execWidth
+ if (info->elements == elements)
+ return true;
+ }
+
+ return false;
+ }
+
+ void SelBasicBlockOptimizer::changeInsideReplaceInfoMap(const SelectionInstruction& insn, GenRegister& var)
+ {
+ ReplaceInfoMap::iterator it = replaceInfoMap.find(var.reg());
+ if (it != replaceInfoMap.end()) { //same ir register
+ ReplaceInfo* info = it->second;
+ if (CanBeReplaced(info, insn, var)) {
+ info->toBeReplaceds.insert(&var);
+ } else {
+ //if it is the same ir register, but could not be replaced for some reason,
+ //that means we could not remove MOV instruction, and so no replacement,
+ //so we'll remove the info for this case.
+ replaceInfoMap.erase(it);
+ delete info;
+ }
+ }
+ }
+
+ void SelBasicBlockOptimizer::doLocalCopyPropagation()
+ {
+ for (SelectionInstruction &insn : bb.insnList) {
+ for (uint8_t i = 0; i < insn.srcNum; ++i)
+ changeInsideReplaceInfoMap(insn, insn.src(i));
+
+ for (uint8_t i = 0; i < insn.dstNum; ++i)
+ removeFromReplaceInfoMap(insn, insn.dst(i));
+
+ if (insn.opcode == SEL_OP_MOV)
+ addToReplaceInfoMap(insn);
+ }
+ cleanReplaceInfoMap();
+ }
+
+ void SelBasicBlockOptimizer::run()
+ {
+ for (size_t i = 0; i < MaxTries; ++i) {
+ optimized = false;
+
+ doLocalCopyPropagation();
+ //doOtherLocalOptimization();
+
+ if (!optimized)
+ break; //break since no optimization found at this round
+ }
+ }
+
+ class SelGlobalOptimizer : public SelOptimizer
+ {
+ public:
+ SelGlobalOptimizer(const GenContext& ctx, uint32_t features) : SelOptimizer(ctx, features) {}
+ ~SelGlobalOptimizer() {}
+ virtual void run();
+ };
+
+ void SelGlobalOptimizer::run()
+ {
+
+ }
+
+ void Selection::optimize()
+ {
+ //do basic block level optimization
+ for (SelectionBlock &block : *blockList) {
+ SelBasicBlockOptimizer bbopt(getCtx(), getCtx().getLiveOut(block.bb), opt_features, block);
+ bbopt.run();
+ }
+
+ //do global optimization
+
+ }
+} /* namespace gbe */
diff --git a/backend/src/backend/gen_insn_selection_output.cpp b/backend/src/backend/gen_insn_selection_output.cpp
new file mode 100644
index 0000000..ed26650
--- /dev/null
+++ b/backend/src/backend/gen_insn_selection_output.cpp
@@ -0,0 +1,172 @@
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_insn_selection_output.hpp"
+#include "sys/cvar.hpp"
+#include "sys/intrusive_list.hpp"
+#include <string.h>
+#include <iostream>
+#include <iomanip>
+using namespace std;
+
+namespace gbe
+{
+ static void outputGenReg(GenRegister& reg, bool dst)
+ {
+ if (reg.file == GEN_IMMEDIATE_VALUE || reg.file == GEN_GENERAL_REGISTER_FILE) {
+ if (reg.file == GEN_IMMEDIATE_VALUE) {
+ switch (reg.type) {
+ case GEN_TYPE_UD:
+ case GEN_TYPE_UW:
+ case GEN_TYPE_UB:
+ case GEN_TYPE_HF_IMM:
+ cout << hex << "0x" << reg.value.ud << dec;
+ break;
+ case GEN_TYPE_D:
+ case GEN_TYPE_W:
+ case GEN_TYPE_B:
+ cout << reg.value.d;
+ break;
+ case GEN_TYPE_V:
+ cout << hex << "0x" << reg.value.ud << dec;
+ break;
+ case GEN_TYPE_UL:
+ cout << reg.value.u64;
+ break;
+ case GEN_TYPE_L:
+ cout << reg.value.i64;
+ break;
+ case GEN_TYPE_F:
+ cout << reg.value.f;
+ break;
+ }
+ }else {
+ if (reg.negation)
+ cout << "-";
+ if (reg.absolute)
+ cout << "(abs)";
+ cout << "%" << reg.value.reg;
+ if (reg.subphysical)
+ cout << "." << reg.subnr;
+
+ if (dst)
+ cout << "<" << GenRegister::hstride_size(reg) << ">";
+ else
+ cout << "<" << GenRegister::vstride_size(reg) << "," << GenRegister::width_size(reg) << "," << GenRegister::hstride_size(reg) << ">";
+ }
+
+ cout << ":";
+ switch (reg.type) {
+ case GEN_TYPE_UD:
+ cout << "UD";
+ break;
+ case GEN_TYPE_UW:
+ cout << "UW";
+ break;
+ case GEN_TYPE_UB:
+ cout << "UB";
+ break;
+ case GEN_TYPE_HF_IMM:
+ cout << "HF";
+ break;
+ case GEN_TYPE_D:
+ cout << "D";
+ break;
+ case GEN_TYPE_W:
+ cout << "W";
+ break;
+ case GEN_TYPE_B:
+ cout << "B";
+ break;
+ case GEN_TYPE_V:
+ cout << "V";
+ break;
+ case GEN_TYPE_UL:
+ cout << "UL";
+ break;
+ case GEN_TYPE_L:
+ cout << "L";
+ break;
+ case GEN_TYPE_F:
+ cout << "F";
+ break;
+ }
+ } else if (reg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
+ cout << setw(8) << "arf";
+ } else
+ assert(!"should not reach here");
+ }
+
+#define OP_NAME_LENGTH 512
+ void outputSelectionIR(GenContext &ctx, Selection* sel)
+ {
+ cout << "SELECTION IR begin:" << endl;
+ cout << "WARNING: not completed yet, welcome for the FIX!" << endl;
+ for (SelectionBlock &block : *sel->blockList) {
+ for (SelectionInstruction &insn : block.insnList) {
+ char opname[OP_NAME_LENGTH];
+ if (insn.isLabel()) {
+ cout << " L" << insn.index << ":" << endl;
+ continue;
+ } else {
+ switch (insn.opcode) {
+ #define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: sprintf(opname, "%s", #OP); break;
+ #include "backend/gen_insn_selection.hxx"
+ #undef DECL_SELECTION_IR
+ }
+ }
+
+ if (insn.opcode == SEL_OP_CMP) {
+ switch (insn.extra.function) {
+ case GEN_CONDITIONAL_LE:
+ strcat(opname, ".le");
+ break;
+ case GEN_CONDITIONAL_L:
+ strcat(opname, ".l");
+ break;
+ case GEN_CONDITIONAL_GE:
+ strcat(opname, ".ge");
+ break;
+ case GEN_CONDITIONAL_G:
+ strcat(opname, ".g");
+ break;
+ case GEN_CONDITIONAL_EQ:
+ strcat(opname, ".eq");
+ break;
+ case GEN_CONDITIONAL_NEQ:
+ strcat(opname, ".neq");
+ break;
+ }
+ }
+
+ int n = strlen(opname);
+ if(n >= OP_NAME_LENGTH - 20) {
+ cout << "opname too long: " << opname << endl;
+ return;
+ }
+
+ sprintf(&opname[n], "(%d)", insn.state.execWidth);
+ cout << " " << left << setw(20) << opname;
+
+ for (int i = 0; i < insn.dstNum; ++i)
+ {
+ GenRegister dst = insn.dst(i);
+ outputGenReg(dst, true);
+ cout << "\t";
+ }
+
+ cout << ":\t";
+
+ for (int i = 0; i < insn.srcNum; ++i)
+ {
+ GenRegister src = insn.src(i);
+ outputGenReg(src, false);
+ cout << "\t";
+ }
+
+ cout << endl;
+ }
+ cout << endl;
+ }
+ cout << "SELECTION IR end." << endl << endl;
+ }
+
+}
diff --git a/backend/src/backend/gen_insn_selection_output.hpp b/backend/src/backend/gen_insn_selection_output.hpp
new file mode 100644
index 0000000..dd372dc
--- /dev/null
+++ b/backend/src/backend/gen_insn_selection_output.hpp
@@ -0,0 +1,13 @@
+#ifndef __GBE_GEN_INSN_SELECTION_OUTPUT_HPP__
+#define __GBE_GEN_INSN_SELECTION_OUTPUT_HPP__
+
+namespace gbe
+{
+ class Selection; // Pre ISA code
+ class GenContext; // Handle compilation for Gen
+
+ void outputSelectionIR(GenContext &ctx, Selection* sel);
+
+} /* namespace gbe */
+
+#endif
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 4577990..32f7794 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -80,22 +80,33 @@ namespace gbe {
insns = (GenInstruction *)ins;
insnNum = size / sizeof(GenInstruction);
}
- size_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
+ uint32_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
void GenKernel::printStatus(int indent, std::ostream& outs) {
#ifdef GBE_COMPILER_AVAILABLE
Kernel::printStatus(indent, outs);
FILE *f = fopen("/dev/null", "w");
+ if(!f) {
+ outs << "could not open /dev/null !";
+ return;
+ }
+
char *buf = new char[4096];
setbuffer(f, buf, 4096);
GenCompactInstruction * pCom = NULL;
GenInstruction insn[2];
+ uint32_t insn_version = 0;
+ if (IS_GEN7(deviceID) || IS_GEN75(deviceID))
+ insn_version = 7;
+ else if (IS_GEN8(deviceID) || IS_GEN9(deviceID))
+ insn_version = 8;
+
for (uint32_t i = 0; i < insnNum;) {
pCom = (GenCompactInstruction*)(insns+i);
if(pCom->bits1.cmpt_control == 1) {
- decompactInstruction(pCom, &insn);
+ decompactInstruction(pCom, &insn, insn_version);
gen_disasm(f, &insn, deviceID, 1);
i++;
} else {
@@ -140,12 +151,15 @@ namespace gbe {
{8, 16, false},
};
- Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath) {
+ Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name,
+ bool relaxMath, int profiling) {
#ifdef GBE_COMPILER_AVAILABLE
// Be careful when the simdWidth is forced by the programmer. We can see it
// when the function already provides the simd width we need to use (i.e.
// non zero)
const ir::Function *fn = unit.getFunction(name);
+ if(fn == NULL)
+ GBE_ASSERT(0);
uint32_t codeGenNum = sizeof(codeGenStrategy) / sizeof(codeGenStrategy[0]);
uint32_t codeGen = 0;
GenContext *ctx = NULL;
@@ -172,8 +186,16 @@ namespace gbe {
ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath);
} else if (IS_BROXTON(deviceID)) {
ctx = GBE_NEW(BxtContext, unit, name, deviceID, relaxMath);
+ } else if (IS_KABYLAKE(deviceID)) {
+ ctx = GBE_NEW(KblContext, unit, name, deviceID, relaxMath);
}
GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
+
+ if (profiling) {
+ ctx->setProfilingMode(true);
+ unit.getProfilingInfo()->setDeviceID(deviceID);
+ }
+
ctx->setASMFileName(this->asm_file_name);
for (; codeGen < codeGenNum; ++codeGen) {
@@ -182,14 +204,17 @@ namespace gbe {
const uint32_t reservedSpillRegs = codeGenStrategy[codeGen].reservedSpillRegs;
// Force the SIMD width now and try to compile
- unit.getFunction(name)->setSimdWidth(simdWidth);
+ ir::Function *simdFn = unit.getFunction(name);
+ if(simdFn == NULL)
+ GBE_ASSERT(0);
+ simdFn->setSimdWidth(simdWidth);
ctx->startNewCG(simdWidth, reservedSpillRegs, limitRegisterPressure);
kernel = ctx->compileKernel();
if (kernel != NULL) {
GBE_ASSERT(ctx->getErrCode() == NO_ERROR);
break;
}
- fn->getImageSet()->clearInfo();
+ simdFn->getImageSet()->clearInfo();
// If we get a out of range if/endif error.
// We need to set the context to if endif fix mode and restart the previous compile.
if ( ctx->getErrCode() == OUT_OF_RANGE_IF_ENDIF && !ctx->getIFENDIFFix() ) {
@@ -199,45 +224,97 @@ namespace gbe {
GBE_ASSERT(!(ctx->getErrCode() == OUT_OF_RANGE_IF_ENDIF && ctx->getIFENDIFFix()));
}
- GBE_ASSERTM(kernel != NULL, "Fail to compile kernel, may need to increase reserved registers for spilling.");
+ //GBE_ASSERTM(kernel != NULL, "Fail to compile kernel, may need to increase reserved registers for spilling.");
return kernel;
#else
return NULL;
#endif
}
-#define BINARY_HEADER_LENGTH 8
-#define IS_GEN_BINARY(binary) (*binary == '\0' && *(binary+1) == 'G'&& *(binary+2) == 'E' &&*(binary+3) == 'N' &&*(binary+4) == 'C')
-#define FILL_GEN_BINARY(binary) do{*binary = '\0'; *(binary+1) = 'G'; *(binary+2) = 'E'; *(binary+3) = 'N'; *(binary+4) = 'C';}while(0)
-#define FILL_DEVICE_ID(binary, src_hw_info) do {*(binary+5) = src_hw_info[0]; *(binary+6) = src_hw_info[1]; *(binary+7) = src_hw_info[2];}while(0)
-#define DEVICE_MATCH(typeA, src_hw_info) ((IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "IVB")) || \
- (IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "BYT")) || \
- (IS_BAYTRAIL_T(typeA) && !strcmp(src_hw_info, "BYT")) || \
- (IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) || \
- (IS_BROADWELL(typeA) && !strcmp(src_hw_info, "BDW")) || \
- (IS_CHERRYVIEW(typeA) && !strcmp(src_hw_info, "CHV")) || \
- (IS_SKYLAKE(typeA) && !strcmp(src_hw_info, "SKL")) || \
- (IS_BROXTON(typeA) && !strcmp(src_hw_info, "BXT")) )
+#define GEN_BINARY_HEADER_LENGTH 8
+
+ enum GEN_BINARY_HEADER_INDEX {
+ GBHI_BYT = 0,
+ GBHI_IVB = 1,
+ GBHI_HSW = 2,
+ GBHI_CHV = 3,
+ GBHI_BDW = 4,
+ GBHI_SKL = 5,
+ GBHI_BXT = 6,
+ GBHI_KBL = 7,
+ GBHI_MAX,
+ };
+#define GEN_BINARY_VERSION 1
+ static const unsigned char gen_binary_header[GBHI_MAX][GEN_BINARY_HEADER_LENGTH]= \
+ {{GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'B', 'Y', 'T'},
+ {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'I', 'V', 'B'},
+ {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'H', 'S', 'W'},
+ {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'C', 'H', 'V'},
+ {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'B', 'D', 'W'},
+ {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'S', 'K', 'L'},
+ {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'B', 'X', 'T'},
+ {GEN_BINARY_VERSION, 'G','E', 'N', 'C', 'K', 'B', 'T'}
+ };
+
+#define FILL_GEN_HEADER(binary, index) do {int i = 0; do {*(binary+i) = gen_binary_header[index][i]; i++; }while(i < GEN_BINARY_HEADER_LENGTH);}while(0)
+#define FILL_BYT_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_BYT)
+#define FILL_IVB_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_IVB)
+#define FILL_HSW_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_HSW)
+#define FILL_CHV_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_CHV)
+#define FILL_BDW_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_BDW)
+#define FILL_SKL_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_SKL)
+#define FILL_BXT_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_BXT)
+#define FILL_KBL_HEADER(binary) FILL_GEN_HEADER(binary, GBHI_KBL)
+
+ static bool genHeaderCompare(const unsigned char *BufPtr, GEN_BINARY_HEADER_INDEX index)
+ {
+ bool matched = true;
+ for (int i = 1; i < GEN_BINARY_HEADER_LENGTH; ++i)
+ {
+ matched = matched && (BufPtr[i] == gen_binary_header[index][i]);
+ }
+ if(matched) {
+ if(BufPtr[0] != gen_binary_header[index][0]) {
+ std::cout << "Beignet binary format have been changed, please generate binary again.\n";
+ matched = false;
+ }
+ }
+ return matched;
+ }
+
+#define MATCH_BYT_HEADER(binary) genHeaderCompare(binary, GBHI_BYT)
+#define MATCH_IVB_HEADER(binary) genHeaderCompare(binary, GBHI_IVB)
+#define MATCH_HSW_HEADER(binary) genHeaderCompare(binary, GBHI_HSW)
+#define MATCH_CHV_HEADER(binary) genHeaderCompare(binary, GBHI_CHV)
+#define MATCH_BDW_HEADER(binary) genHeaderCompare(binary, GBHI_BDW)
+#define MATCH_SKL_HEADER(binary) genHeaderCompare(binary, GBHI_SKL)
+#define MATCH_BXT_HEADER(binary) genHeaderCompare(binary, GBHI_BXT)
+#define MATCH_KBL_HEADER(binary) genHeaderCompare(binary, GBHI_KBL)
+
+#define MATCH_DEVICE(deviceID, binary) ((IS_IVYBRIDGE(deviceID) && MATCH_IVB_HEADER(binary)) || \
+ (IS_IVYBRIDGE(deviceID) && MATCH_IVB_HEADER(binary)) || \
+ (IS_BAYTRAIL_T(deviceID) && MATCH_BYT_HEADER(binary)) || \
+ (IS_HASWELL(deviceID) && MATCH_HSW_HEADER(binary)) || \
+ (IS_BROADWELL(deviceID) && MATCH_BDW_HEADER(binary)) || \
+ (IS_CHERRYVIEW(deviceID) && MATCH_CHV_HEADER(binary)) || \
+ (IS_SKYLAKE(deviceID) && MATCH_SKL_HEADER(binary)) || \
+ (IS_BROXTON(deviceID) && MATCH_BXT_HEADER(binary)) || \
+ (IS_KABYLAKE(deviceID) && MATCH_KBL_HEADER(binary)) \
+ )
static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
using namespace gbe;
std::string binary_content;
+
+ if(size < GEN_BINARY_HEADER_LENGTH)
+ return NULL;
+
//the header length is 8 bytes: 1 byte is binary type, 4 bytes are bitcode header, 3 bytes are hw info.
- char src_hw_info[4]="";
- src_hw_info[0] = *(binary+5);
- src_hw_info[1] = *(binary+6);
- src_hw_info[2] = *(binary+7);
-
- // check whether is gen binary ('/0GENC')
- if(!IS_GEN_BINARY(binary)){
- return NULL;
- }
- // check the whether the current device ID match the binary file's.
- if(!DEVICE_MATCH(deviceID, src_hw_info)){
+ if(!MATCH_DEVICE(deviceID, (unsigned char*)binary)){
return NULL;
}
- binary_content.assign(binary+BINARY_HEADER_LENGTH, size-BINARY_HEADER_LENGTH);
+ binary_content.assign(binary+GEN_BINARY_HEADER_LENGTH, size-GEN_BINARY_HEADER_LENGTH);
GenProgram *program = GBE_NEW(GenProgram, deviceID);
std::istringstream ifs(binary_content, std::ostringstream::binary);
@@ -302,47 +379,35 @@ namespace gbe {
//add header to differentiate from llvm bitcode binary.
//the header length is 8 bytes: 1 byte is binary type, 4 bytes are bitcode header, 3 bytes are hw info.
- *binary = (char *)malloc(sizeof(char) * (sz+BINARY_HEADER_LENGTH) );
- memset(*binary, 0, sizeof(char) * (sz+BINARY_HEADER_LENGTH) );
- FILL_GEN_BINARY(*binary);
- char src_hw_info[4]="";
+ *binary = (char *)malloc(sizeof(char) * (sz+GEN_BINARY_HEADER_LENGTH) );
+ if(*binary == NULL)
+ return 0;
+
+ memset(*binary, 0, sizeof(char) * (sz+GEN_BINARY_HEADER_LENGTH) );
if(IS_IVYBRIDGE(prog->deviceID)){
- src_hw_info[0]='I';
- src_hw_info[1]='V';
- src_hw_info[2]='B';
+ FILL_IVB_HEADER(*binary);
if(IS_BAYTRAIL_T(prog->deviceID)){
- src_hw_info[0]='B';
- src_hw_info[1]='Y';
- src_hw_info[2]='T';
+ FILL_BYT_HEADER(*binary);
}
}else if(IS_HASWELL(prog->deviceID)){
- src_hw_info[0]='H';
- src_hw_info[1]='S';
- src_hw_info[2]='W';
+ FILL_HSW_HEADER(*binary);
}else if(IS_BROADWELL(prog->deviceID)){
- src_hw_info[0]='B';
- src_hw_info[1]='D';
- src_hw_info[2]='W';
+ FILL_BDW_HEADER(*binary);
}else if(IS_CHERRYVIEW(prog->deviceID)){
- src_hw_info[0]='C';
- src_hw_info[1]='H';
- src_hw_info[2]='V';
+ FILL_CHV_HEADER(*binary);
}else if(IS_SKYLAKE(prog->deviceID)){
- src_hw_info[0]='S';
- src_hw_info[1]='K';
- src_hw_info[2]='L';
+ FILL_SKL_HEADER(*binary);
}else if(IS_BROXTON(prog->deviceID)){
- src_hw_info[0]='B';
- src_hw_info[1]='X';
- src_hw_info[2]='T';
+ FILL_BXT_HEADER(*binary);
+ }else if(IS_KABYLAKE(prog->deviceID)){
+ FILL_KBL_HEADER(*binary);
}else {
free(*binary);
*binary = NULL;
return 0;
}
- FILL_DEVICE_ID(*binary, src_hw_info);
- memcpy(*binary+BINARY_HEADER_LENGTH, oss.str().c_str(), sz*sizeof(char));
- return sz+BINARY_HEADER_LENGTH;
+ memcpy(*binary+GEN_BINARY_HEADER_LENGTH, oss.str().c_str(), sz*sizeof(char));
+ return sz+GEN_BINARY_HEADER_LENGTH;
}else{
#ifdef GBE_COMPILER_AVAILABLE
std::string str;
@@ -351,6 +416,9 @@ namespace gbe {
std::string& bin_str = OS.str();
int llsz = bin_str.size();
*binary = (char *)malloc(sizeof(char) * (llsz+1) );
+ if(*binary == NULL)
+ return 0;
+
*(*binary) = binary_type;
memcpy(*binary+1, bin_str.c_str(), llsz);
return llsz+1;
@@ -368,10 +436,16 @@ namespace gbe {
size_t stringSize,
char *err,
size_t *errSize,
- int optLevel)
+ int optLevel,
+ const char* options)
{
using namespace gbe;
- GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx, asm_file_name);
+ uint32_t fast_relaxed_math = 0;
+ if (options != NULL)
+ if (strstr(options, "-cl-fast-relaxed-math") != NULL)
+ fast_relaxed_math = 1;
+
+ GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx, asm_file_name, fast_relaxed_math);
#ifdef GBE_COMPILER_AVAILABLE
std::string error;
// Try to compile the program
@@ -390,9 +464,9 @@ namespace gbe {
}
static gbe_program genProgramNewGenProgram(uint32_t deviceID, const void* module,
- const void* llvm_ctx) {
+ const void* llvm_ctx,const char* asm_file_name) {
using namespace gbe;
- GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx);
+ GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx, asm_file_name);
// Everything run fine
return (gbe_program) program;
}
@@ -407,7 +481,11 @@ namespace gbe {
using namespace gbe;
char* errMsg;
if(((GenProgram*)dst_program)->module == NULL){
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ ((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module).release();
+#else
((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module);
+#endif
errSize = 0;
}else{
llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
@@ -440,17 +518,47 @@ namespace gbe {
#ifdef GBE_COMPILER_AVAILABLE
using namespace gbe;
std::string error;
-
int optLevel = 1;
+ std::string dumpASMFileName;
+ size_t start = 0, end = 0;
+ uint32_t fast_relaxed_math = 0;
if(options) {
char *p;
p = strstr(const_cast<char *>(options), "-cl-opt-disable");
if (p)
optLevel = 0;
+
+ if (options != NULL)
+ if (strstr(options, "-cl-fast-relaxed-math") != NULL)
+ fast_relaxed_math = 1;
+
+ char *options_str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
+ memcpy(options_str, options, strlen(options) + 1);
+ std::string optionStr(options_str);
+ while (end != std::string::npos) {
+ end = optionStr.find(' ', start);
+ std::string str = optionStr.substr(start, end - start);
+ start = end + 1;
+ if(str.size() == 0)
+ continue;
+
+ if(str.find("-dump-opt-asm=") != std::string::npos) {
+ dumpASMFileName = str.substr(str.find("=") + 1);
+ continue; // Don't push this str back; ignore it.
+ }
+ }
+ free(options_str);
}
GenProgram* p = (GenProgram*) program;
+ p->fast_relaxed_math = fast_relaxed_math;
+ if (!dumpASMFileName.empty()) {
+ p->asm_file_name = dumpASMFileName.c_str();
+ FILE *asmDumpStream = fopen(dumpASMFileName.c_str(), "w");
+ if (asmDumpStream)
+ fclose(asmDumpStream);
+ }
// Try to compile the program
acquireLLVMContextLock();
llvm::Module* module = (llvm::Module*)p->module;
diff --git a/backend/src/backend/gen_program.hpp b/backend/src/backend/gen_program.hpp
index 75d77ba..ff756e0 100644
--- a/backend/src/backend/gen_program.hpp
+++ b/backend/src/backend/gen_program.hpp
@@ -46,7 +46,7 @@ namespace gbe
/*! Set the instruction stream (to be implemented) */
virtual void setCode(const char *, size_t size);
/*! Implements get the code size */
- virtual size_t getCodeSize(void) const;
+ virtual uint32_t getCodeSize(void) const;
/*! Implements printStatus*/
virtual void printStatus(int indent, std::ostream& outs);
uint32_t deviceID; //!< Current device ID
@@ -60,8 +60,8 @@ namespace gbe
{
public:
/*! Create an empty program */
- GenProgram(uint32_t deviceID, const void* mod = NULL, const void* ctx = NULL, const char* asm_fname = NULL) :
- deviceID(deviceID),module((void*)mod), llvm_ctx((void*)ctx), asm_file_name(asm_fname) {}
+ GenProgram(uint32_t deviceID, const void* mod = NULL, const void* ctx = NULL, const char* asm_fname = NULL, uint32_t fast_relaxed_math = 0) :
+ Program(fast_relaxed_math), deviceID(deviceID),module((void*)mod), llvm_ctx((void*)ctx), asm_file_name(asm_fname) {}
/*! Current device ID*/
uint32_t deviceID;
/*! Destroy the program */
@@ -69,7 +69,7 @@ namespace gbe
/*! Clean LLVM resource */
virtual void CleanLlvmResource(void);
/*! Implements base class */
- virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath);
+ virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath, int profiling);
/*! Allocate an empty kernel. */
virtual Kernel *allocateKernel(const std::string &name) {
return GBE_NEW(GenKernel, name, deviceID);
@@ -81,7 +81,7 @@ namespace gbe
GBE_CLASS(GenProgram);
};
/*! decompact GEN ASM if it is in compacted format */
- extern void decompactInstruction(union GenCompactInstruction *p, void *insn);
+ extern void decompactInstruction(union GenCompactInstruction *p, void *insn, uint32_t insn_version);
} /* namespace gbe */
#endif /* __GBE_GEN_PROGRAM_HPP__ */
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 4cb88e9..4451efb 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -35,6 +35,7 @@
#include <iomanip>
+#define HALF_REGISTER_FILE_OFFSET (32*64)
namespace gbe
{
/////////////////////////////////////////////////////////////////////////////
@@ -48,47 +49,22 @@ namespace gbe
*/
struct GenRegInterval {
INLINE GenRegInterval(ir::Register reg) :
- reg(reg), minID(INT_MAX), maxID(-INT_MAX) {}
+ reg(reg), minID(INT_MAX), maxID(-INT_MAX), accessCount(0),
+ conflictReg(0), b3OpAlign(0) {}
ir::Register reg; //!< (virtual) register of the interval
int32_t minID, maxID; //!< Starting and ending points
+ int32_t accessCount;
+ ir::Register conflictReg; // < has bank conflict with this register
+ bool b3OpAlign;
};
- typedef struct GenRegIntervalKey {
- GenRegIntervalKey(uint32_t reg, int32_t maxID) {
- key = ((uint64_t)maxID << 32) | reg;
- }
- const ir::Register getReg() const {
- return (ir::Register)(key & 0xFFFFFFFF);
- }
- int32_t getMaxID() const {
- return key >> 32;
- }
- uint64_t key;
- } GenRegIntervalKey;
-
- struct spillCmp {
- bool operator () (const GenRegIntervalKey &lhs, const GenRegIntervalKey &rhs) const
- { return lhs.key > rhs.key; }
- };
-
- typedef set <GenRegIntervalKey, spillCmp> SpillSet;
-
- class SpillCandidateSet : public SpillSet
- {
- public:
- std::set<GenRegIntervalKey, spillCmp>::iterator find(GenRegInterval interval) {
- GenRegIntervalKey key(interval.reg, interval.maxID);
- return SpillSet::find(key);
- }
- void insert(GenRegInterval interval) {
- GenRegIntervalKey key(interval.reg, interval.maxID);
- SpillSet::insert(key);
- }
- void erase(GenRegInterval interval) {
- GenRegIntervalKey key(interval.reg, interval.maxID);
- SpillSet::erase(key);
- }
+ struct SpillInterval {
+ SpillInterval(const ir::Register r, float c):
+ reg(r), cost(c) {}
+ ir::Register reg;
+ float cost;
};
+ typedef std::vector<SpillInterval>::iterator SpillIntervalIter;
/*! Implements the register allocation */
class GenRegAllocator::Opaque
@@ -102,6 +78,9 @@ namespace gbe
bool allocate(Selection &selection);
/*! Return the Gen register from the selection register */
GenRegister genReg(const GenRegister ®);
+ INLINE bool isAllocated(const ir::Register ®) {
+ return RA.contains(reg);
+ }
/*! Output the register allocation */
void outputAllocation(void);
INLINE void getRegAttrib(ir::Register reg, uint32_t ®Size, ir::RegisterFamily *regFamily = NULL) const {
@@ -125,6 +104,9 @@ namespace gbe
bool expireFlag(const GenRegInterval &limit);
/*! Allocate the virtual boolean (== flags) registers */
void allocateFlags(Selection &selection);
+ /*! calculate the spill cost, what we store here is 'use count',
+ * we use [use count]/[live range] as spill cost */
+ void calculateSpillCost(Selection &selection);
/*! validated flags which contains valid value in the physical flag register */
set<uint32_t> validatedFlags;
/*! validated temp flag register which indicate the flag 0,1 contains which virtual flag register. */
@@ -133,8 +115,8 @@ namespace gbe
void validateFlag(Selection &selection, SelectionInstruction &insn);
/*! Allocate the GRF registers */
bool allocateGRFs(Selection &selection);
- /*! Create gen registers for all preallocated curbe registers. */
- void allocatePayloadRegs(void);
+ /*! Create gen registers for all preallocated special registers. */
+ void allocateSpecialRegs(void);
/*! Create a Gen register from a register set in the payload */
void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0);
/*! Create the intervals for each register */
@@ -175,7 +157,9 @@ namespace gbe
/*! registers that are spilled */
SpilledRegs spilledRegs;
/*! register which could be spilled.*/
- SpillCandidateSet spillCandidate;
+ std::set<GenRegInterval*> spillCandidate;
+ /*! BBs last instruction ID map */
+ map<const ir::BasicBlock *, int32_t> bbLastInsnIDMap;
/* reserved registers for register spill/reload */
uint32_t reservedReg;
/*! Current vector to expire */
@@ -183,11 +167,14 @@ namespace gbe
INLINE void insertNewReg(const Selection &selection, ir::Register reg, uint32_t grfOffset, bool isVector = false);
INLINE bool expireReg(ir::Register reg);
INLINE bool spillAtInterval(GenRegInterval interval, int size, uint32_t alignment);
+ INLINE bool findNextSpillCandidate(std::vector<SpillInterval> &candidate,
+ int &remainSize, int &offset, SpillIntervalIter &nextCand);
INLINE uint32_t allocateReg(GenRegInterval interval, uint32_t size, uint32_t alignment);
INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
INLINE bool vectorCanSpill(SelectionVector *vector);
- INLINE void allocateScratchForSpilled();
+ INLINE bool allocateScratchForSpilled();
+ void allocateCurbePayload(void);
/*! replace specified source/dst register with temporary register and update interval */
INLINE ir::Register replaceReg(Selection &sel, SelectionInstruction *insn,
@@ -196,11 +183,13 @@ namespace gbe
ir::Register reg;
if (isSrc) {
reg = sel.replaceSrc(insn, regID, type, needMov);
+ assert(reg == intervals.size());
intervals.push_back(reg);
intervals[reg].minID = insn->ID - 1;
intervals[reg].maxID = insn->ID;
} else {
reg = sel.replaceDst(insn, regID, type, needMov);
+ assert(reg == intervals.size());
intervals.push_back(reg);
intervals[reg].minID = insn->ID;
intervals[reg].maxID = insn->ID + 1;
@@ -208,6 +197,7 @@ namespace gbe
return reg;
}
/*! Use custom allocator */
+ friend GenRegAllocator;
GBE_CLASS(Opaque);
};
@@ -223,12 +213,12 @@ namespace gbe
assert(offset >= GEN_REG_SIZE);
offset += subOffset;
RA.insert(std::make_pair(reg, offset));
- GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0));
- this->intervals[reg].minID = 0;
- this->intervals[reg].maxID = 0;
+ //GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0));
+ //this->intervals[reg].minID = 0;
+ //this->intervals[reg].maxID = 0;
}
- INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
+ INLINE void GenRegAllocator::Opaque::allocateSpecialRegs(void) {
using namespace ir;
for(auto &it : this->ctx.curbeRegs)
allocatePayloadReg(it.first, it.second);
@@ -240,14 +230,52 @@ namespace gbe
for (auto rit = pushMap.rbegin(); rit != pushMap.rend(); ++rit) {
const uint32_t argID = rit->second.argID;
const FunctionArgument arg = fn.getArg(argID);
-
const uint32_t subOffset = rit->second.offset;
const Register reg = rit->second.getRegister();
+
+ if (intervals[reg].maxID == - INT_MAX)
+ continue;
auto it = this->ctx.curbeRegs.find(arg.reg);
assert(it != ctx.curbeRegs.end());
allocatePayloadReg(reg, it->second, subOffset);
ctx.splitBlock(it->second, subOffset);
}
+
+ // Group and barrier IDs are always allocated by the hardware in r0
+ RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1
+ RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6
+ RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7
+ RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
+ }
+
+ template <bool sortStartingPoint>
+ inline bool cmp(const GenRegInterval *i0, const GenRegInterval *i1) {
+ if (sortStartingPoint) {
+ if (i0->minID == i1->minID)
+ return (i0->maxID < i1->maxID);
+ return i0->minID < i1->minID;
+ } else {
+ if (i0->maxID == i1->maxID)
+ return (i0->minID < i1->minID);
+ return i0->maxID < i1->maxID;
+ }
+ }
+
+ void GenRegAllocator::Opaque::allocateCurbePayload(void) {
+ vector <GenRegInterval *> payloadInterval;
+ for (auto interval : starting) {
+ if (!ctx.isPayloadReg(interval->reg))
+ continue;
+ if (interval->minID > 0)
+ break;
+ payloadInterval.push_back(interval);
+ }
+ std::sort(payloadInterval.begin(), payloadInterval.end(), cmp<false>);
+ for(auto interval : payloadInterval) {
+ if (interval->maxID < 0)
+ continue;
+ ctx.allocCurbeReg(interval->reg);
+ }
}
bool GenRegAllocator::Opaque::createGenReg(const Selection &selection, const GenRegInterval &interval) {
@@ -300,12 +328,13 @@ namespace gbe
// case 1: the register is not already in a vector, so it can stay in this
// vector. Note that local IDs are *non-scalar* special registers but will
// require a MOV anyway since pre-allocated in the CURBE
- // If an element has very long interval, we don't want to put it into a
- // vector as it will add more pressure to the register allocation.
+ // for dst SelectionVector, we can always try to allocate them even under
+ // spilling, reason is that its components can be expired separately, so,
+ // it does not introduce too much register pressure.
if (it == vectorMap.end() &&
ctx.sel->isScalarReg(reg) == false &&
ctx.isSpecialReg(reg) == false &&
- (intervals[reg].maxID - intervals[reg].minID) < 2048)
+ (ctx.reservedSpillRegs == 0 || !vector->isSrc) )
{
const VectorLocation location = std::make_pair(vector, regID);
this->vectorMap.insert(std::make_pair(reg, location));
@@ -327,7 +356,7 @@ namespace gbe
}
/*! Will sort vector in decreasing order */
- inline bool cmp(const SelectionVector *v0, const SelectionVector *v1) {
+ inline bool cmpVec(const SelectionVector *v0, const SelectionVector *v1) {
return v0->regNum > v1->regNum;
}
@@ -344,7 +373,7 @@ namespace gbe
// Heuristic (really simple...): sort them by the number of registers they
// contain
- std::sort(this->vectors.begin(), this->vectors.end(), cmp);
+ std::sort(this->vectors.begin(), this->vectors.end(), cmpVec);
// Insert MOVs when this is required
for (vectorID = 0; vectorID < vectorNum; ++vectorID) {
@@ -355,19 +384,6 @@ namespace gbe
}
}
- template <bool sortStartingPoint>
- inline bool cmp(const GenRegInterval *i0, const GenRegInterval *i1) {
- if (sortStartingPoint) {
- if (i0->minID == i1->minID)
- return (i0->maxID < i1->maxID);
- return i0->minID < i1->minID;
- } else {
- if (i0->maxID == i1->maxID)
- return (i0->minID < i1->minID);
- return i0->maxID < i1->maxID;
- }
- }
-
bool GenRegAllocator::Opaque::expireGRF(const GenRegInterval &limit) {
bool ret = false;
while (this->expiringID != ending.size()) {
@@ -475,6 +491,7 @@ namespace gbe
// policy is to spill the allocate flag which live to the last time end point.
// we have three flags we use for booleans f0.0 , f1.0 and f1.1
+ set<const ir::BasicBlock *> liveInSet01;
for (auto &block : *selection.blockList) {
// Store the registers allocated in the map
map<ir::Register, uint32_t> allocatedFlags;
@@ -644,6 +661,7 @@ namespace gbe
sel0->src(0) = GenRegister::uw1grf(ir::ocl::one);
sel0->src(1) = GenRegister::uw1grf(ir::ocl::zero);
sel0->dst(0) = GET_FLAG_REG(insn);
+ liveInSet01.insert(insn.parent->bb);
insn.append(*sel0);
// We use the zero one after the liveness analysis, we have to update
// the liveness data manually here.
@@ -662,6 +680,30 @@ namespace gbe
}
}
}
+
+ // As we introduce two global variables zero and one, we have to
+ // recompute its liveness information here!
+ if (liveInSet01.size()) {
+ set<const ir::BasicBlock *> liveOutSet01;
+ set<const ir::BasicBlock *> workSet(liveInSet01.begin(), liveInSet01.end());
+ while(workSet.size()) {
+ for (auto bb = workSet.begin(); bb != workSet.end(); ) {
+ for(auto predBB : (*bb)->getPredecessorSet()) {
+ liveOutSet01.insert(predBB);
+ if (liveInSet01.find(predBB) != liveInSet01.end())
+ continue;
+ liveInSet01.insert(predBB);
+ workSet.insert(predBB);
+ }
+ bb = workSet.erase(bb);
+ }
+ }
+ int32_t maxID = 0;
+ for(auto bb : liveOutSet01)
+ maxID = std::max(maxID, bbLastInsnIDMap.find(bb)->second);
+ intervals[ir::ocl::zero].maxID = std::max(intervals[ir::ocl::zero].maxID, maxID);
+ intervals[ir::ocl::one].maxID = std::max(intervals[ir::ocl::one].maxID, maxID);
+ }
}
IVAR(OCL_SIMD16_SPILL_THRESHOLD, 0, 16, 256);
@@ -672,11 +714,11 @@ namespace gbe
for (uint32_t startID = 0; startID < regNum; ++startID) {
const GenRegInterval &interval = *this->starting[startID];
const ir::Register reg = interval.reg;
+
if (interval.maxID == -INT_MAX)
continue; // Unused register
if (RA.contains(reg))
continue; // already allocated
-
if (flagBooleans.contains(reg))
continue;
@@ -732,7 +774,10 @@ namespace gbe
return false;
}
}
- allocateScratchForSpilled();
+ if (!allocateScratchForSpilled()) {
+ ctx.errCode = REGISTER_SPILL_NO_SPACE;
+ return false;
+ }
bool success = selection.spillRegs(spilledRegs, reservedReg);
if (!success) {
ctx.errCode = REGISTER_SPILL_FAIL;
@@ -743,7 +788,7 @@ namespace gbe
return true;
}
- INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
+ INLINE bool GenRegAllocator::Opaque::allocateScratchForSpilled()
{
const uint32_t regNum = spilledRegs.size();
this->starting.resize(regNum);
@@ -777,7 +822,10 @@ namespace gbe
ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
* ctx.getSimdWidth());
- }
+ if (it->second.addr == -1)
+ return false;
+ }
+ return true;
}
INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
@@ -792,8 +840,8 @@ namespace gbe
ctx.deallocate(it->second);
if (reservedReg != 0
- && (spillCandidate.find(intervals[reg]) != spillCandidate.end())) {
- spillCandidate.erase(intervals[reg]);
+ && (spillCandidate.find(&intervals[reg]) != spillCandidate.end())) {
+ spillCandidate.erase(&intervals[reg]);
/* offset --> reg map should keep updated. */
offsetReg.erase(it->second);
}
@@ -826,7 +874,7 @@ namespace gbe
&& !selection.isPartialWrite(reg)) {
GBE_ASSERT(offsetReg.find(grfOffset) == offsetReg.end());
offsetReg.insert(std::make_pair(grfOffset, reg));
- spillCandidate.insert(intervals[reg]);
+ spillCandidate.insert(&intervals[reg]);
}
}
}
@@ -871,99 +919,199 @@ namespace gbe
// FIXME we may need to fix those unspillable vectors in the future.
INLINE bool GenRegAllocator::Opaque::vectorCanSpill(SelectionVector *vector) {
for(uint32_t id = 0; id < vector->regNum; id++)
- if (spillCandidate.find(intervals[(ir::Register)(vector->reg[id].value.reg)])
+ if (spillCandidate.find(&intervals[(ir::Register)(vector->reg[id].value.reg)])
== spillCandidate.end())
return false;
return true;
}
+ INLINE float getSpillCost(const GenRegInterval &v) {
+ // check minID maxId value
+ assert(v.maxID >= v.minID);
+ if (v.maxID == v.minID)
+ return 1.0f;
+ // FIXME some registers may get an access count of 0; this needs to be fixed.
+ float count = v.accessCount == 0 ? (float)2 : (float)v.accessCount;
+ return count / (float)(v.maxID - v.minID);
+ }
+
+ bool spillinterval_cmp(const SpillInterval &v1, const SpillInterval &v2) {
+ return v1.cost < v2.cost;
+ }
+
+ INLINE SpillIntervalIter findRegisterInSpillQueue(
+ std::vector<SpillInterval> &cand, ir::Register reg) {
+ for (SpillIntervalIter it = cand.begin(); it != cand.end(); ++it) {
+ if (it->reg == reg)
+ return it;
+ }
+ return cand.end();
+ }
+ // The function tries to search in 'free physical register' and 'candidate'.
+ // so, the result may be one of the three possible situations:
+ // 1. search completed, find the next valid iterator to a candidate.
+ // 2. search ended, because we met unspillable register, we have to drop the iteration
+ // 3. search completed, there are enough free physical register.
+ //
+ // return value: should we break? because of:
+ // 1. search end, found enough free register
+ // 2. search end, because met unspillable register
+ INLINE bool GenRegAllocator::Opaque::findNextSpillCandidate(
+ std::vector<SpillInterval> &candidate, int &remainSize,
+ int &offset, SpillIntervalIter &nextCand) {
+ bool isFree = false;
+ bool shouldBreak = false;
+ do {
+ // check is free?
+ isFree = ctx.isSuperRegisterFree(offset);
+
+ if (isFree) {
+ remainSize -= GEN_REG_SIZE;
+ offset += GEN_REG_SIZE;
+ }
+ } while(isFree && remainSize > 0);
+
+ // done
+ if (remainSize <= 0) return true;
+
+ auto registerIter = offsetReg.find(offset);
+ shouldBreak = registerIter == offsetReg.end();
+
+ if (!shouldBreak) {
+ ir::Register reg = registerIter->second;
+ nextCand = findRegisterInSpillQueue(candidate, reg);
+ }
+ // if shouldBreak is false, means we need go on
+ return shouldBreak;
+ }
+
INLINE bool GenRegAllocator::Opaque::spillAtInterval(GenRegInterval interval,
int size,
uint32_t alignment) {
if (reservedReg == 0)
return false;
- auto it = spillCandidate.begin();
- // If there is no spill candidate or current register is spillable and current register's
- // endpoint is after all the spillCandidate register's endpoint we return false. The
- // caller will spill current register.
- // At simd16 mode, we will always try to spill here rather than return to the caller.
- // The reason is that the caller may have a vector to allocate, and some element may be
- // temporary registers which could not be spilled.
- if (it == spillCandidate.end()
- || (ctx.getSimdWidth() == 8 && (it->getMaxID() <= interval.maxID
- && alignment == ctx.getSimdWidth()/8 * GEN_REG_SIZE)))
+
+ if (spillCandidate.empty())
return false;
- ir::Register reg = it->getReg();
- set<ir::Register> spillSet;
- int32_t savedSize = size;
- while(size > 0) {
- auto vectorIt = vectorMap.find(reg);
- bool isVector = vectorIt != vectorMap.end();
- bool needRestart = false;
- ir::RegisterFamily family = ctx.sel->getRegisterFamily(reg);
- if (isVector
- && (vectorCanSpill(vectorIt->second.first))) {
- const SelectionVector *vector = vectorIt->second.first;
- for (uint32_t id = 0; id < vector->regNum; id++) {
- GBE_ASSERT(spilledRegs.find(vector->reg[id].reg())
- == spilledRegs.end());
- spillSet.insert(vector->reg[id].reg());
- reg = vector->reg[id].reg();
- family = ctx.sel->getRegisterFamily(reg);
- size -= family == ir::FAMILY_QWORD ? 2 * GEN_REG_SIZE * ctx.getSimdWidth()/8
- : GEN_REG_SIZE * ctx.getSimdWidth()/8;
+ // push spill candidate into a vector in ascending order of spill-cost.
+ std::vector<SpillInterval> candQ;
+ for (auto &p : spillCandidate) {
+ float cost = getSpillCost(*p);
+ candQ.push_back(SpillInterval(p->reg, cost));
+ }
+ std::sort(candQ.begin(), candQ.end(), spillinterval_cmp);
+
+ bool scalarAllocationFail = (vectorMap.find(interval.reg) == vectorMap.end());
+
+ int remainSize = size;
+ float spillCostTotal = 0.0f;
+ std::set<ir::Register> spillSet;
+ // if we searched the whole register file, it would take a lot of time.
+ // so, I just add this max value to keep the compile time from
+ // growing too much, although this method may not find the truly lowest
+ // spill cost candidates.
+ const int spillGroupMax = 8;
+ int spillGroupID = 0;
+
+ std::vector<std::set<ir::Register>> spillGroups;
+ std::vector<float> spillGroupCost;
+
+ auto searchBegin = candQ.begin();
+ while (searchBegin != candQ.end() && spillGroupID < spillGroupMax) {
+ auto contiguousIter = searchBegin;
+
+ while (contiguousIter != candQ.end()) {
+ ir::Register reg = contiguousIter->reg;
+
+ auto vectorIt = vectorMap.find(reg);
+ bool spillVector = (vectorIt != vectorMap.end());
+ int32_t nextOffset = -1;
+
+ // is register allocation failed at scalar register?
+ // if so, don't try to spill a vector register,
+ // which is obviously of no benefit.
+ if (scalarAllocationFail && spillVector) break;
+
+ if (spillVector) {
+ if (vectorCanSpill(vectorIt->second.first)) {
+ const SelectionVector *vector = vectorIt->second.first;
+ for (uint32_t id = 0; id < vector->regNum; id++) {
+ GBE_ASSERT(spilledRegs.find(vector->reg[id].reg())
+ == spilledRegs.end());
+ spillSet.insert(vector->reg[id].reg());
+ reg = vector->reg[id].reg();
+ uint32_t s;
+ getRegAttrib(reg, s);
+ remainSize-= s;
+ spillCostTotal += contiguousIter->cost;
+ }
+ } else {
+ break;
+ }
+ } else {
+ spillSet.insert(reg);
+ uint32_t s;
+ getRegAttrib(reg, s);
+ spillCostTotal += contiguousIter->cost;
+ remainSize -= s;
}
- } else if (!isVector) {
- spillSet.insert(reg);
- size -= family == ir::FAMILY_QWORD ? 2 * GEN_REG_SIZE * ctx.getSimdWidth()/8
- : GEN_REG_SIZE * ctx.getSimdWidth()/8;
- } else
- needRestart = true; // is a vector which could not be spilled.
-
- if (size <= 0)
- break;
- if (!needRestart) {
+ if (remainSize <= 0)
+ break;
+
uint32_t offset = RA.find(reg)->second;
- uint32_t nextOffset = (family == ir::FAMILY_QWORD) ? (offset + 2 * GEN_REG_SIZE * ctx.getSimdWidth() / 8)
- : (offset + GEN_REG_SIZE * ctx.getSimdWidth() / 8);
- auto nextRegIt = offsetReg.find(nextOffset);
- if (nextRegIt != offsetReg.end())
- reg = nextRegIt->second;
- else
- needRestart = true;
+ uint32_t s; getRegAttrib(reg, s);
+ nextOffset = offset + s;
+
+ SpillIntervalIter nextValid = candQ.end();
+
+ bool shouldBreak = findNextSpillCandidate(candQ, remainSize, nextOffset,
+ nextValid);
+ contiguousIter = nextValid;
+ if (shouldBreak)
+ break;
}
- if (needRestart) {
-#if 0
- // FIXME, we should enable this code block in the future.
- // If the spill set is not zero and we need a restart, we can
- // simply return to try to allocate the registers at first.
- // As some vectors which have expired elements may be marked as
- // unspillable vector.
- if (spillSet.size() > 0)
+ if (remainSize <= 0) {
+ if (scalarAllocationFail) {
+ // Done
break;
-#endif
- it++;
- // next register is not in spill candidate.
- // let's move to next candidate and start over.
- if (it == spillCandidate.end())
- return false;
- reg = it->getReg();
- size = savedSize;
- spillSet.clear();
+ } else {
+ // Add as one spillGroup
+ spillGroups.push_back(spillSet);
+ spillGroupCost.push_back(spillCostTotal);
+ ++spillGroupID;
+ }
}
+
+ ++searchBegin;
+ // restore states
+ remainSize = size;
+ spillCostTotal = 0.0f;
+ spillSet.clear();
+ }
+ // failed to spill
+ if (scalarAllocationFail && remainSize > 0) return false;
+ if (!scalarAllocationFail && spillGroups.size() == 0) return false;
+
+ if (!scalarAllocationFail) {
+ // push min spillcost group into spillSet
+ int minIndex = std::distance(spillGroupCost.begin(),
+ std::min_element(spillGroupCost.begin(),
+ spillGroupCost.end()));
+ spillSet.swap(spillGroups[minIndex]);
}
- for(auto spillreg : spillSet)
+ for(auto spillreg : spillSet) {
spillReg(spillreg, true);
+ }
return true;
}
INLINE uint32_t GenRegAllocator::Opaque::allocateReg(GenRegInterval interval,
uint32_t size,
uint32_t alignment) {
- uint32_t grfOffset;
+ int32_t grfOffset;
// Doing expireGRF too freqently will cause the post register allocation
// scheduling very hard. As it will cause a very high register conflict rate.
// The tradeoff here is to reduce the freqency here. And if we are under spilling
@@ -976,7 +1124,20 @@ namespace gbe
// and the source is a scalar Dword. If that is the case, the byte register
// must get 4byte alignment register offset.
alignment = (alignment + 3) & ~3;
- while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
+
+ bool direction = true;
+ if (interval.conflictReg != 0) {
+ // try to allocate conflict registers in top/bottom half.
+ if (RA.contains(interval.conflictReg)) {
+ if (RA.find(interval.conflictReg)->second < HALF_REGISTER_FILE_OFFSET) {
+ direction = false;
+ }
+ }
+ }
+ if (interval.b3OpAlign != 0) {
+ alignment = (alignment + 15) & ~15;
+ }
+ while ((grfOffset = ctx.allocate(size, alignment, direction)) == -1) {
const bool success = this->expireGRF(interval);
if (success == false) {
if (spillAtInterval(interval, size, alignment) == false)
@@ -986,30 +1147,65 @@ namespace gbe
return grfOffset;
}
+ int UseCountApproximate(int loopDepth) {
+ int ret = 1;
+ for (int i = 0; i < loopDepth; i++) {
+ ret = ret * 10;
+ }
+ return ret;
+ }
+
+ void GenRegAllocator::Opaque::calculateSpillCost(Selection &selection) {
+ int BlockIndex = 0;
+ for (auto &block : *selection.blockList) {
+ int LoopDepth = ctx.fn.getLoopDepth(ir::LabelIndex(BlockIndex));
+ for (auto &insn : block.insnList) {
+ const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const GenRegister &selReg = insn.src(srcID);
+ const ir::Register reg = selReg.reg();
+ if (selReg.file == GEN_GENERAL_REGISTER_FILE)
+ this->intervals[reg].accessCount += UseCountApproximate(LoopDepth);
+ }
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const GenRegister &selReg = insn.dst(dstID);
+ const ir::Register reg = selReg.reg();
+ if (selReg.file == GEN_GENERAL_REGISTER_FILE)
+ this->intervals[reg].accessCount += UseCountApproximate(LoopDepth);
+ }
+ }
+ BlockIndex++;
+ }
+ }
+
INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
using namespace ir;
+ const Function::PushMap &pushMap = ctx.fn.getPushMap();
+
if (ctx.reservedSpillRegs != 0) {
- reservedReg = ctx.allocate(ctx.reservedSpillRegs * GEN_REG_SIZE, GEN_REG_SIZE);
+ reservedReg = ctx.allocate(ctx.reservedSpillRegs * GEN_REG_SIZE, GEN_REG_SIZE, false);
reservedReg /= GEN_REG_SIZE;
} else {
reservedReg = 0;
}
- // schedulePreRegAllocation(ctx, selection);
// Now start the linear scan allocation
- for (uint32_t regID = 0; regID < ctx.sel->getRegNum(); ++regID)
+ for (uint32_t regID = 0; regID < ctx.sel->getRegNum(); ++regID) {
this->intervals.push_back(ir::Register(regID));
-
- // Allocate the special registers (only those which are actually used)
- this->allocatePayloadRegs();
-
- // Group and barrier IDs are always allocated by the hardware in r0
- RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1
- RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6
- RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7
- RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
-
- // block IP used to handle the mask in SW is always allocated
+ // Set all payload register's liveness minID to 0.
+ gbe_curbe_type curbeType;
+ int subType;
+ ctx.getRegPayloadType(ir::Register(regID), curbeType, subType);
+ if (curbeType != GBE_GEN_REG) {
+ intervals[regID].minID = 0;
+
+ // FIXME stack buffer is not used, we may need to remove it in the future.
+ if (curbeType == GBE_CURBE_EXTRA_ARGUMENT && subType == GBE_STACK_BUFFER)
+ intervals[regID].maxID = 1;
+ }
+ if (regID == ir::ocl::zero.value() || regID == ir::ocl::one.value())
+ intervals[regID].minID = 0;
+ }
// Compute the intervals
int32_t insnID = 0;
@@ -1024,6 +1220,7 @@ namespace gbe
for (auto &insn : block.insnList) {
const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
insn.ID = insnID;
+ bool is3SrcOp = insn.opcode == SEL_OP_MAD;
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const GenRegister &selReg = insn.src(srcID);
const ir::Register reg = selReg.reg();
@@ -1033,6 +1230,20 @@ namespace gbe
reg == ir::ocl::groupid1 ||
reg == ir::ocl::groupid2)
continue;
+ ir::Register conflictReg = ir::Register(0);
+ if (is3SrcOp) {
+ if (srcID == 1)
+ conflictReg = insn.src(2).reg();
+ else if (srcID == 2)
+ conflictReg = insn.src(1).reg();
+ }
+ // we only let it conflict with one register, the one with the smaller reg number,
+ // as a smaller virtual register usually comes first,
+ // and the linear scan allocator allocates from smaller to larger registers,
+ // so a conflict with a larger register number would not have any effect.
+ if (this->intervals[reg].conflictReg == 0 ||
+ this->intervals[reg].conflictReg > conflictReg)
+ this->intervals[reg].conflictReg = conflictReg;
this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
}
@@ -1045,6 +1256,9 @@ namespace gbe
reg == ir::ocl::groupid1 ||
reg == ir::ocl::groupid2)
continue;
+ if (is3SrcOp) {
+ this->intervals[reg].b3OpAlign = 1;
+ }
this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
}
@@ -1094,6 +1308,7 @@ namespace gbe
// All registers alive at the begining of the block must update their intervals.
const ir::BasicBlock *bb = block.bb;
+ bbLastInsnIDMap.insert(std::make_pair(bb, lastID));
for (auto reg : ctx.getLiveIn(bb))
this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID);
@@ -1108,6 +1323,62 @@ namespace gbe
delete boolsMap;
}
+ for (auto &it : this->intervals) {
+ if (it.maxID == -INT_MAX) continue;
+ if(pushMap.find(it.reg) != pushMap.end()) {
+ uint32_t argID = ctx.fn.getPushLocation(it.reg)->argID;
+ ir::Register argReg = ctx.fn.getArg(argID).reg;
+ intervals[argReg].maxID = std::max(intervals[argReg].maxID, 1);
+ }
+ }
+
+ if (ctx.inProfilingMode) {
+ /* If we are in profiling mode, we always need xyz dim info and timestamp curbes.
+ xyz dim info related curbe registers just live for the first INSN, but timestamp
+ curbes will live the whole execution life. */
+#define ADD_CURB_REG_FOR_PROFILING(REG_NAME, LIFE_START, LIFE_END) \
+do { \
+ bool hasIt = false; \
+ for (auto& itv : this->intervals) { \
+ if (itv.reg == REG_NAME) { \
+ hasIt = true; \
+ if (itv.minID > LIFE_START) itv.minID = LIFE_START; \
+ if (itv.maxID < LIFE_END) itv.maxID = LIFE_END; \
+ break; \
+ } \
+ } \
+ if (!hasIt) { \
+ GenRegInterval regInv(REG_NAME); \
+ regInv.minID = LIFE_START; \
+ regInv.maxID = LIFE_END; \
+ this->intervals.push_back(regInv); \
+ } \
+} while(0)
+
+ ADD_CURB_REG_FOR_PROFILING(ocl::lsize0, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::lsize1, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::lsize2, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::goffset0, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::goffset1, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::goffset2, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::groupid0, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::groupid1, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::groupid2, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::lid0, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::lid1, 0, 1);
+ ADD_CURB_REG_FOR_PROFILING(ocl::lid2, 0, 1);
+
+ ADD_CURB_REG_FOR_PROFILING(ocl::profilingbptr, 0, INT_MAX);
+ ADD_CURB_REG_FOR_PROFILING(ocl::profilingts0, 0, INT_MAX);
+ ADD_CURB_REG_FOR_PROFILING(ocl::profilingts1, 0, INT_MAX);
+ ADD_CURB_REG_FOR_PROFILING(ocl::profilingts2, 0, INT_MAX);
+ if (ctx.simdWidth == 8) {
+ ADD_CURB_REG_FOR_PROFILING(ocl::profilingts3, 0, INT_MAX);
+ ADD_CURB_REG_FOR_PROFILING(ocl::profilingts4, 0, INT_MAX);
+ }
+ }
+#undef ADD_CURB_REG_FOR_PROFILING
+
this->intervals[ocl::retVal].minID = INT_MAX;
this->intervals[ocl::retVal].maxID = -INT_MAX;
@@ -1116,6 +1387,7 @@ namespace gbe
// First we try to put all booleans registers into flags
this->allocateFlags(selection);
+ this->calculateSpillCost(selection);
// Sort both intervals in starting point and ending point increasing orders
const uint32_t regNum = ctx.sel->getRegNum();
@@ -1136,6 +1408,12 @@ namespace gbe
break;
}
+ this->allocateCurbePayload();
+ ctx.buildPatchList();
+
+ // Allocate the special registers (only those which are actually used)
+ this->allocateSpecialRegs();
+
// Allocate all the GRFs now (regular register and boolean that are not in
// flag registers)
return this->allocateGRFs(selection);
@@ -1158,7 +1436,7 @@ namespace gbe
<< " " << setw(-3) << regSize << "B\t"
<< "[ " << setw(8) << this->intervals[(uint)vReg].minID
<< " -> " << setw(8) << this->intervals[(uint)vReg].maxID
- << "]" << endl;
+ << "]" << setw(8) << "use count: " << this->intervals[(uint)vReg].accessCount << endl;
}
if (!spilledRegs.empty())
cout << "## spilled registers: " << spilledRegs.size() << endl;
@@ -1173,7 +1451,7 @@ namespace gbe
<< " " << setw(-3) << regSize << "B\t"
<< "[ " << setw(8) << this->intervals[(uint)vReg].minID
<< " -> " << setw(8) << this->intervals[(uint)vReg].maxID
- << "]" << endl;
+ << "]" << setw(8) << "use count: " << this->intervals[(uint)vReg].accessCount << endl;
}
cout << endl;
}
@@ -1225,14 +1503,33 @@ namespace gbe
return this->opaque->genReg(reg);
}
+ bool GenRegAllocator::isAllocated(const ir::Register ®) {
+ return this->opaque->isAllocated(reg);
+ }
+
void GenRegAllocator::outputAllocation(void) {
this->opaque->outputAllocation();
}
uint32_t GenRegAllocator::getRegSize(ir::Register reg) {
- uint32_t regSize;
- this->opaque->getRegAttrib(reg, regSize);
- return regSize;
+ uint32_t regSize;
+ gbe_curbe_type curbeType = GBE_GEN_REG;
+ int subType = 0;
+ this->opaque->ctx.getRegPayloadType(reg, curbeType, subType);
+ if (curbeType == GBE_CURBE_IMAGE_INFO)
+ regSize = 4;
+ else if (curbeType == GBE_CURBE_KERNEL_ARGUMENT) {
+ const ir::FunctionArgument &arg = this->opaque->ctx.getFunction().getArg(subType);
+ if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
+ arg.type == ir::FunctionArgument::LOCAL_POINTER ||
+ arg.type == ir::FunctionArgument::CONSTANT_POINTER)
+ regSize = this->opaque->ctx.getPointerSize();
+ else
+ regSize = arg.size;
+ GBE_ASSERT(arg.reg == reg);
+ } else
+ this->opaque->getRegAttrib(reg, regSize);
+ return regSize;
}
} /* namespace gbe */
diff --git a/backend/src/backend/gen_reg_allocation.hpp b/backend/src/backend/gen_reg_allocation.hpp
index 89dba64..8d5e797 100644
--- a/backend/src/backend/gen_reg_allocation.hpp
+++ b/backend/src/backend/gen_reg_allocation.hpp
@@ -54,6 +54,8 @@ namespace gbe
bool allocate(Selection &selection);
/*! Virtual to physical translation */
GenRegister genReg(const GenRegister ®);
+ /*! Check whether a register is allocated. */
+ bool isAllocated(const ir::Register ®);
/*! Output the register allocation */
void outputAllocation(void);
/*! Get register actual size in byte. */
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index a15fd60..bbea761 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -274,6 +274,15 @@ namespace gbe
return r;
}
+ static INLINE GenRegister toUniform(GenRegister reg, uint32_t type) {
+ GenRegister r = reg;
+ r.type = type;
+ r.hstride = GEN_HORIZONTAL_STRIDE_0;
+ r.vstride = GEN_VERTICAL_STRIDE_0;
+ r.width = GEN_WIDTH_1;
+ return r;
+ }
+
static INLINE uint32_t grfOffset(GenRegister reg) {
return reg.nr * GEN_REG_SIZE + reg.subnr;
}
@@ -458,11 +467,11 @@ namespace gbe
}
static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
- return retype(vec16(file, reg), GEN_TYPE_DF);
+ return retype(vec4(file, reg), GEN_TYPE_DF);
}
static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
- return retype(vec8(file, reg), GEN_TYPE_DF);
+ return retype(vec4(file, reg), GEN_TYPE_DF);
}
static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
@@ -608,7 +617,7 @@ namespace gbe
}
static INLINE GenRegister immdf(double df) {
- GenRegister immediate = imm(GEN_TYPE_DF);
+ GenRegister immediate = imm(GEN_TYPE_DF_IMM);
immediate.value.df = df;
return immediate;
}
@@ -789,6 +798,16 @@ namespace gbe
return reg;
}
+ static INLINE GenRegister tm0(void) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ 0xc0,
+ 0,
+ GEN_TYPE_UW,
+ GEN_VERTICAL_STRIDE_4,
+ GEN_WIDTH_4,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
static INLINE GenRegister acc(void) {
return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
GEN_ARF_ACCUMULATOR,
@@ -809,10 +828,20 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_0);
}
- static INLINE GenRegister notification1(void) {
+ static INLINE GenRegister sr(uint32_t nr, uint32_t subnr = 0) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_STATE | nr,
+ subnr,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister notification0(uint32_t subnr) {
return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
GEN_ARF_NOTIFICATION_COUNT,
- 0,
+ subnr,
GEN_TYPE_UD,
GEN_VERTICAL_STRIDE_0,
GEN_WIDTH_1,
@@ -927,6 +956,16 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_0);
}
+ static INLINE uint32_t hstrideFromSize(int size) {
+ switch (size) {
+ case 0: return GEN_HORIZONTAL_STRIDE_0;
+ case 1: return GEN_HORIZONTAL_STRIDE_1;
+ case 2: return GEN_HORIZONTAL_STRIDE_2;
+ case 4: return GEN_HORIZONTAL_STRIDE_4;
+ default: NOT_IMPLEMENTED; return GEN_HORIZONTAL_STRIDE_0;
+ }
+ }
+
static INLINE int hstride_size(GenRegister reg) {
switch (reg.hstride) {
case GEN_HORIZONTAL_STRIDE_0: return 0;
@@ -937,6 +976,34 @@ namespace gbe
}
}
+ static INLINE int vstride_size(GenRegister reg) {
+ switch (reg.vstride) {
+ case GEN_VERTICAL_STRIDE_0: return 0;
+ case GEN_VERTICAL_STRIDE_1: return 1;
+ case GEN_VERTICAL_STRIDE_2: return 2;
+ case GEN_VERTICAL_STRIDE_4: return 4;
+ case GEN_VERTICAL_STRIDE_8: return 8;
+ case GEN_VERTICAL_STRIDE_16: return 16;
+ case GEN_VERTICAL_STRIDE_32: return 32;
+ case GEN_VERTICAL_STRIDE_64: return 64;
+ case GEN_VERTICAL_STRIDE_128: return 128;
+ case GEN_VERTICAL_STRIDE_256: return 256;
+ default: NOT_IMPLEMENTED; return 0;
+ }
+ }
+
+ static INLINE int width_size(GenRegister reg) {
+ switch (reg.width) {
+ case GEN_WIDTH_1: return 1;
+ case GEN_WIDTH_2: return 2;
+ case GEN_WIDTH_4: return 4;
+ case GEN_WIDTH_8: return 8;
+ case GEN_WIDTH_16: return 16;
+ case GEN_WIDTH_32: return 32;
+ default: NOT_IMPLEMENTED; return 0;
+ }
+ }
+
static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
reg.subnr += delta * typeSize(reg.type) * hstride_size(reg);
@@ -946,6 +1013,14 @@ namespace gbe
return reg;
}
+ static INLINE GenRegister subphysicaloffset(GenRegister reg, uint32_t delta) {
+ if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+ reg.subnr += delta * typeSize(reg.type) * hstride_size(reg);
+ reg.subphysical = 1;
+ }
+ return reg;
+ }
+
static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
}
@@ -1128,6 +1203,22 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_2);
}
+ static INLINE GenRegister unpacked_uw(const GenRegister& reg) {
+ uint32_t nr = reg.nr;
+ uint32_t subnr = reg.subnr / typeSize(GEN_TYPE_UW);
+ uint32_t width = reg.width;
+ int hstrideSize = GenRegister::hstride_size(reg) * typeSize(reg.type) / typeSize(GEN_TYPE_UW);
+ uint32_t hstride = GenRegister::hstrideFromSize(hstrideSize);
+
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ nr,
+ subnr,
+ GEN_TYPE_UW,
+ GEN_VERTICAL_STRIDE_16,
+ width,
+ hstride);
+ }
+
static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) {
return GenRegister(GEN_GENERAL_REGISTER_FILE,
nr,
@@ -1188,6 +1279,27 @@ namespace gbe
return reg;
}
+ static INLINE void propagateRegister(GenRegister& dst, const GenRegister& src)
+ {
+ dst.type = src.type;
+ dst.file = src.file;
+ dst.physical = src.physical;
+ dst.subphysical = src.subphysical;
+ dst.value.reg = src.value.reg;
+ dst.vstride = src.vstride;
+ dst.width = src.width;
+ dst.hstride = src.hstride;
+ dst.quarter = src.quarter;
+ dst.nr = src.nr;
+ dst.subnr = src.subnr;
+ dst.address_mode = src.address_mode;
+ dst.a0_subnr = src.a0_subnr;
+ dst.addr_imm = src.addr_imm;
+
+ dst.negation = dst.negation ^ src.negation;
+ dst.absolute = dst.absolute | src.absolute;
+ }
+
/*! Generate register encoding with run-time simdWidth */
#define DECL_REG_ENCODER(NAME, SIMD16, SIMD8, SIMD1) \
template <typename... Args> \
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 145eb0f..b7dc00e 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -88,12 +88,14 @@ namespace gbe {
Kernel::Kernel(const std::string &name) :
name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false),
- slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), printfSet(NULL) {}
+ slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), printfSet(NULL),
+ profilingInfo(NULL) {}
Kernel::~Kernel(void) {
if(ctx) GBE_DELETE(ctx);
if(samplerSet) GBE_DELETE(samplerSet);
if(imageSet) GBE_DELETE(imageSet);
if(printfSet) GBE_DELETE(printfSet);
+ if(profilingInfo) GBE_DELETE(profilingInfo);
GBE_SAFE_DELETE_ARRAY(args);
}
int32_t Kernel::getCurbeOffset(gbe_curbe_type type, uint32_t subType) const {
@@ -104,7 +106,7 @@ namespace gbe {
return it->offset; // we found it!
}
- Program::Program(void) : constantSet(NULL) {}
+ Program::Program(uint32_t fast_relaxed_math) : fast_relaxed_math(fast_relaxed_math), constantSet(NULL) {}
Program::~Program(void) {
for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it)
GBE_DELETE(it->second);
@@ -114,14 +116,24 @@ namespace gbe {
#ifdef GBE_COMPILER_AVAILABLE
BVAR(OCL_OUTPUT_GEN_IR, false);
BVAR(OCL_STRICT_CONFORMANCE, true);
+ IVAR(OCL_PROFILING_LOG, 0, 0, 1); // Int for different profiling types.
+ BVAR(OCL_OUTPUT_BUILD_LOG, false);
bool Program::buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel) {
ir::Unit *unit = new ir::Unit();
llvm::Module * cloned_module = NULL;
+ bool ret = false;
if(module){
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ cloned_module = llvm::CloneModule((llvm::Module*)module).release();
+#else
cloned_module = llvm::CloneModule((llvm::Module*)module);
+#endif
}
- if (llvmToGen(*unit, fileName, module, optLevel, OCL_STRICT_CONFORMANCE) == false) {
+ bool strictMath = true;
+ if (fast_relaxed_math || !OCL_STRICT_CONFORMANCE)
+ strictMath = false;
+ if (llvmToGen(*unit, fileName, module, optLevel, strictMath, OCL_PROFILING_LOG, error) == false) {
if (fileName)
error = std::string(fileName) + " not found";
delete unit;
@@ -134,19 +146,24 @@ namespace gbe {
unit = new ir::Unit();
if(cloned_module){
//suppose file exists and llvmToGen will not return false.
- llvmToGen(*unit, fileName, cloned_module, 0, OCL_STRICT_CONFORMANCE);
+ llvmToGen(*unit, fileName, cloned_module, 0, strictMath, OCL_PROFILING_LOG, error);
}else{
//suppose file exists and llvmToGen will not return false.
- llvmToGen(*unit, fileName, module, 0, OCL_STRICT_CONFORMANCE);
+ llvmToGen(*unit, fileName, module, 0, strictMath, OCL_PROFILING_LOG, error);
}
}
- assert(unit->getValid());
- this->buildFromUnit(*unit, error);
+ if(unit->getValid()){
+ std::string error2;
+ if (this->buildFromUnit(*unit, error2)){
+ ret = true;
+ }
+ error = error + error2;
+ }
delete unit;
if(cloned_module){
delete (llvm::Module*) cloned_module;
}
- return true;
+ return ret;
}
bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
@@ -155,10 +172,23 @@ namespace gbe {
const uint32_t kernelNum = set.size();
if (OCL_OUTPUT_GEN_IR) std::cout << unit;
if (kernelNum == 0) return true;
+
+ bool strictMath = true;
+ if (fast_relaxed_math || !OCL_STRICT_CONFORMANCE)
+ strictMath = false;
+
for (const auto &pair : set) {
const std::string &name = pair.first;
- Kernel *kernel = this->compileKernel(unit, name, !OCL_STRICT_CONFORMANCE);
+ Kernel *kernel = this->compileKernel(unit, name, !strictMath, OCL_PROFILING_LOG);
+ if (!kernel) {
+ error += name;
+ error += ":(GBE): error: failed in Gen backend.\n";
+ if (OCL_OUTPUT_BUILD_LOG)
+ llvm::errs() << error;
+ return false;
+ }
kernel->setSamplerSet(pair.second->getSamplerSet());
+ kernel->setProfilingInfo(new ir::ProfilingInfo(*unit.getProfilingInfo()));
kernel->setImageSet(pair.second->getImageSet());
kernel->setPrintfSet(pair.second->getPrintfSet());
kernel->setCompileWorkGroupSize(pair.second->getCompileWorkGroupSize());
@@ -172,17 +202,17 @@ namespace gbe {
#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
- size_t Program::serializeToBin(std::ostream& outs) {
- size_t ret_size = 0;
- size_t ker_num = kernels.size();
- int has_constset = 0;
+ uint32_t Program::serializeToBin(std::ostream& outs) {
+ uint32_t ret_size = 0;
+ uint32_t ker_num = kernels.size();
+ uint32_t has_constset = 0;
OUT_UPDATE_SZ(magic_begin);
if (constantSet) {
has_constset = 1;
OUT_UPDATE_SZ(has_constset);
- size_t sz = constantSet->serializeToBin(outs);
+ uint32_t sz = constantSet->serializeToBin(outs);
if (!sz)
return 0;
@@ -193,7 +223,7 @@ namespace gbe {
OUT_UPDATE_SZ(ker_num);
for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it) {
- size_t sz = it->second->serializeToBin(outs);
+ uint32_t sz = it->second->serializeToBin(outs);
if (!sz)
return 0;
@@ -206,10 +236,10 @@ namespace gbe {
return ret_size;
}
- size_t Program::deserializeFromBin(std::istream& ins) {
- size_t total_size = 0;
+ uint32_t Program::deserializeFromBin(std::istream& ins) {
+ uint32_t total_size = 0;
int has_constset = 0;
- size_t ker_num;
+ uint32_t ker_num;
uint32_t magic;
IN_UPDATE_SZ(magic);
@@ -219,19 +249,18 @@ namespace gbe {
IN_UPDATE_SZ(has_constset);
if(has_constset) {
constantSet = new ir::ConstantSet;
- size_t sz = constantSet->deserializeFromBin(ins);
+ uint32_t sz = constantSet->deserializeFromBin(ins);
- if (sz == 0) {
+ if (sz == 0)
return 0;
- }
total_size += sz;
}
IN_UPDATE_SZ(ker_num);
- for (size_t i = 0; i < ker_num; i++) {
- size_t ker_serial_sz;
+ for (uint32_t i = 0; i < ker_num; i++) {
+ uint32_t ker_serial_sz;
std::string ker_name; // Just a empty name here.
Kernel* ker = allocateKernel(ker_name);
@@ -246,7 +275,7 @@ namespace gbe {
if (magic != magic_end)
return 0;
- size_t total_bytes;
+ uint32_t total_bytes;
IN_UPDATE_SZ(total_bytes);
if (total_bytes + sizeof(total_size) != total_size)
return 0;
@@ -254,15 +283,17 @@ namespace gbe {
return total_size;
}
- size_t Kernel::serializeToBin(std::ostream& outs) {
+ uint32_t Kernel::serializeToBin(std::ostream& outs) {
unsigned int i;
- size_t ret_size = 0;
+ uint32_t ret_size = 0;
int has_samplerset = 0;
int has_imageset = 0;
+ uint32_t sz = 0;
OUT_UPDATE_SZ(magic_begin);
- OUT_UPDATE_SZ(name.size());
+ sz = name.size();
+ OUT_UPDATE_SZ(sz);
outs.write(name.c_str(), name.size());
ret_size += sizeof(char)*name.size();
@@ -276,25 +307,30 @@ namespace gbe {
OUT_UPDATE_SZ(arg.info.addrSpace);
- OUT_UPDATE_SZ(arg.info.typeName.size());
+ sz = arg.info.typeName.size();
+ OUT_UPDATE_SZ(sz);
outs.write(arg.info.typeName.c_str(), arg.info.typeName.size());
ret_size += sizeof(char)*arg.info.typeName.size();
- OUT_UPDATE_SZ(arg.info.accessQual.size());
+ sz = arg.info.accessQual.size();
+ OUT_UPDATE_SZ(sz);
outs.write(arg.info.accessQual.c_str(), arg.info.accessQual.size());
ret_size += sizeof(char)*arg.info.accessQual.size();
- OUT_UPDATE_SZ(arg.info.typeQual.size());
+ sz = arg.info.typeQual.size();
+ OUT_UPDATE_SZ(sz);
outs.write(arg.info.typeQual.c_str(), arg.info.typeQual.size());
ret_size += sizeof(char)*arg.info.typeQual.size();
- OUT_UPDATE_SZ(arg.info.argName.size());
+ sz = arg.info.argName.size();
+ OUT_UPDATE_SZ(sz);
outs.write(arg.info.argName.c_str(), arg.info.argName.size());
ret_size += sizeof(char)*arg.info.argName.size();
}
- OUT_UPDATE_SZ(patches.size());
- for (size_t i = 0; i < patches.size(); ++i) {
+ sz = patches.size();
+ OUT_UPDATE_SZ(sz);
+ for (uint32_t i = 0; i < patches.size(); ++i) {
const PatchInfo& patch = patches[i];
unsigned int tmp;
tmp = patch.type;
@@ -318,7 +354,7 @@ namespace gbe {
if (!samplerSet->empty()) { //samplerSet is always valid, allocated in Function::Function
has_samplerset = 1;
OUT_UPDATE_SZ(has_samplerset);
- size_t sz = samplerSet->serializeToBin(outs);
+ uint32_t sz = samplerSet->serializeToBin(outs);
if (!sz)
return 0;
@@ -331,7 +367,7 @@ namespace gbe {
if (!imageSet->empty()) { //imageSet is always valid, allocated in Function::Function
has_imageset = 1;
OUT_UPDATE_SZ(has_imageset);
- size_t sz = imageSet->serializeToBin(outs);
+ uint32_t sz = imageSet->serializeToBin(outs);
if (!sz)
return 0;
@@ -352,19 +388,19 @@ namespace gbe {
return ret_size;
}
- size_t Kernel::deserializeFromBin(std::istream& ins) {
- size_t total_size = 0;
+ uint32_t Kernel::deserializeFromBin(std::istream& ins) {
+ uint32_t total_size = 0;
int has_samplerset = 0;
int has_imageset = 0;
- size_t code_size = 0;
+ uint32_t code_size = 0;
uint32_t magic = 0;
- size_t patch_num = 0;
+ uint32_t patch_num = 0;
IN_UPDATE_SZ(magic);
if (magic != magic_begin)
return 0;
- size_t name_len;
+ uint32_t name_len;
IN_UPDATE_SZ(name_len);
char* c_name = new char[name_len+1];
ins.read(c_name, name_len*sizeof(char));
@@ -384,7 +420,7 @@ namespace gbe {
IN_UPDATE_SZ(arg.info.addrSpace);
- size_t len;
+ uint32_t len;
char* a_name = NULL;
IN_UPDATE_SZ(len);
@@ -447,7 +483,7 @@ namespace gbe {
IN_UPDATE_SZ(has_samplerset);
if (has_samplerset) {
samplerSet = GBE_NEW(ir::SamplerSet);
- size_t sz = samplerSet->deserializeFromBin(ins);
+ uint32_t sz = samplerSet->deserializeFromBin(ins);
if (sz == 0) {
return 0;
}
@@ -460,7 +496,7 @@ namespace gbe {
IN_UPDATE_SZ(has_imageset);
if (has_imageset) {
imageSet = GBE_NEW(ir::ImageSet);
- size_t sz = imageSet->deserializeFromBin(ins);
+ uint32_t sz = imageSet->deserializeFromBin(ins);
if (sz == 0) {
return 0;
}
@@ -482,7 +518,7 @@ namespace gbe {
if (magic != magic_end)
return 0;
- size_t total_bytes;
+ uint32_t total_bytes;
IN_UPDATE_SZ(total_bytes);
if (total_bytes + sizeof(total_size) != total_size)
return 0;
@@ -570,11 +606,10 @@ namespace gbe {
program->CleanLlvmResource();
}
+ BVAR(OCL_DEBUGINFO, false);
#ifdef GBE_COMPILER_AVAILABLE
- BVAR(OCL_OUTPUT_BUILD_LOG, false);
-
static bool buildModuleFromSource(const char *source, llvm::Module** out_module, llvm::LLVMContext* llvm_ctx,
- std::string dumpLLVMFileName, std::vector<std::string>& options, size_t stringSize, char *err,
+ std::string dumpLLVMFileName, std::string dumpSPIRBinaryName, std::vector<std::string>& options, size_t stringSize, char *err,
size_t *errSize) {
// Arguments to pass to the clang frontend
vector<const char *> args;
@@ -605,7 +640,8 @@ namespace gbe {
args.push_back("spir");
#endif /* LLVM_VERSION_MINOR <= 2 */
args.push_back("stringInput.cl");
- args.push_back("-ffp-contract=off");
+ args.push_back("-ffp-contract=on");
+ if(OCL_DEBUGINFO) args.push_back("-g");
// The compiler invocation needs a DiagnosticsEngine so it can report problems
std::string ErrorString;
@@ -695,6 +731,20 @@ namespace gbe {
(*out_module)->print(ostream, 0);
} //Otherwise, you'll have to make do without the dump.
}
+
+ if (!dumpSPIRBinaryName.empty()) {
+ std::string err;
+ llvm::raw_fd_ostream ostream (dumpSPIRBinaryName.c_str(),
+ err,
+ #if LLVM_VERSION_MINOR == 3
+ 0
+ #else
+ llvm::sys::fs::F_None
+ #endif
+ );
+ if (err.empty())
+ llvm::WriteBitcodeToFile(*out_module, ostream);
+ }
#else
if (!dumpLLVMFileName.empty()) {
std::error_code err;
@@ -704,6 +754,14 @@ namespace gbe {
(*out_module)->print(ostream, 0);
} //Otherwise, you'll have to make do without the dump.
}
+
+ if (!dumpSPIRBinaryName.empty()) {
+ std::error_code err;
+ llvm::raw_fd_ostream ostream (dumpSPIRBinaryName.c_str(),
+ err, llvm::sys::fs::F_None);
+ if (!err)
+ llvm::WriteBitcodeToFile(*out_module, ostream);
+ }
#endif
return true;
}
@@ -719,6 +777,7 @@ namespace gbe {
std::vector<std::string>& clOpt,
std::string& dumpLLVMFileName,
std::string& dumpASMFileName,
+ std::string& dumpSPIRBinaryName,
int& optLevel,
size_t stringSize,
char *err,
@@ -728,10 +787,18 @@ namespace gbe {
std::istringstream idirs(dirs);
std::string pchFileName;
bool findPCH = false;
+#if defined(__ANDROID__)
+ bool invalidPCH = true;
+#else
bool invalidPCH = false;
+#endif
size_t start = 0, end = 0;
std::string hdirs = OCL_HEADER_FILE_DIR;
+ if(hdirs == "")
+ hdirs = OCL_HEADER_DIR;
+ if(dirs == "")
+ dirs = OCL_PCH_OBJECT;
std::istringstream hidirs(hdirs);
std::string headerFilePath;
bool findOcl = false;
@@ -750,7 +817,7 @@ namespace gbe {
std::cout << options << std::endl;
}
std::cout << "CL kernel source:" << std::endl;
- std::cout << source;
+ std::cout << source << std::endl;
}
std::string includePath = "-I" + headerFilePath;
clOpt.push_back(includePath);
@@ -763,7 +830,7 @@ namespace gbe {
const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
"-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt");
- const std::string uncompatiblePCHOptions = ("-cl-single-precision-constant, -cl-fast-relaxed-math, -cl-std=CL1.1, -cl-finite-math-only");
+ const std::string uncompatiblePCHOptions = ("-cl-single-precision-constant, -cl-fast-relaxed-math, -cl-std=CL1.1, -cl-finite-math-only, -cl-unsafe-math-optimizations");
const std::string fastMathOption = ("-cl-fast-relaxed-math");
while (end != std::string::npos) {
end = optionStr.find(' ', start);
@@ -868,6 +935,11 @@ EXTEND_QUOTE:
continue; // Don't push this str back; ignore it.
}
+ if(str.find("-dump-spir-binary=") != std::string::npos) {
+ dumpSPIRBinaryName = str.substr(str.find("=") + 1);
+ continue; // Don't push this str back; ignore it.
+ }
+
clOpt.push_back(str);
}
free(c_str);
@@ -912,8 +984,9 @@ EXTEND_QUOTE:
int optLevel = 1;
std::vector<std::string> clOpt;
std::string dumpLLVMFileName, dumpASMFileName;
+ std::string dumpSPIRBinaryName;
if (!processSourceAndOption(source, options, NULL, clOpt,
- dumpLLVMFileName, dumpASMFileName,
+ dumpLLVMFileName, dumpASMFileName, dumpSPIRBinaryName,
optLevel,
stringSize, err, errSize))
return NULL;
@@ -926,14 +999,14 @@ EXTEND_QUOTE:
if (!llvm::llvm_is_multithreaded())
llvm_mutex.lock();
- if (buildModuleFromSource(source, &out_module, llvm_ctx, dumpLLVMFileName, clOpt,
+ if (buildModuleFromSource(source, &out_module, llvm_ctx, dumpLLVMFileName, dumpSPIRBinaryName, clOpt,
stringSize, err, errSize)) {
// Now build the program from llvm
size_t clangErrSize = 0;
- if (err != NULL) {
+ if (err != NULL && *errSize != 0) {
GBE_ASSERT(errSize != NULL);
- stringSize -= *errSize;
- err += *errSize;
+ stringSize = stringSize - *errSize;
+ err = err + *errSize;
clangErrSize = *errSize;
}
@@ -942,9 +1015,10 @@ EXTEND_QUOTE:
if (asmDumpStream)
fclose(asmDumpStream);
}
+
p = gbe_program_new_from_llvm(deviceID, NULL, out_module, llvm_ctx,
dumpASMFileName.empty() ? NULL : dumpASMFileName.c_str(),
- stringSize, err, errSize, optLevel);
+ stringSize, err, errSize, optLevel, options);
if (err != NULL)
*errSize += clangErrSize;
if (OCL_OUTPUT_BUILD_LOG && options)
@@ -972,8 +1046,9 @@ EXTEND_QUOTE:
int optLevel = 1;
std::vector<std::string> clOpt;
std::string dumpLLVMFileName, dumpASMFileName;
+ std::string dumpSPIRBinaryName;
if (!processSourceAndOption(source, options, temp_header_path, clOpt,
- dumpLLVMFileName, dumpASMFileName,
+ dumpLLVMFileName, dumpASMFileName, dumpSPIRBinaryName,
optLevel, stringSize, err, errSize))
return NULL;
@@ -984,7 +1059,7 @@ EXTEND_QUOTE:
llvm::Module * out_module;
llvm::LLVMContext* llvm_ctx = &llvm::getGlobalContext();
- if (buildModuleFromSource(source, &out_module, llvm_ctx, dumpLLVMFileName, clOpt,
+ if (buildModuleFromSource(source, &out_module, llvm_ctx, dumpLLVMFileName, dumpSPIRBinaryName, clOpt,
stringSize, err, errSize)) {
// Now build the program from llvm
if (err != NULL) {
@@ -993,7 +1068,7 @@ EXTEND_QUOTE:
err += *errSize;
}
- p = gbe_program_new_gen_program(deviceID, out_module, NULL);
+ p = gbe_program_new_gen_program(deviceID, out_module, NULL, NULL);
if (OCL_OUTPUT_BUILD_LOG && options)
llvm::errs() << options;
@@ -1039,6 +1114,10 @@ EXTEND_QUOTE:
if(pos != std::string::npos) {
s.erase(pos, strlen("-enable-link-options"));
}
+ pos = s.find("-dump-opt-asm");
+ if(pos != std::string::npos) {
+ s.erase(pos, strlen("-dump-opt-asm"));
+ }
args.push_back(s.c_str());
// The compiler invocation needs a DiagnosticsEngine so it can report problems
@@ -1223,6 +1302,21 @@ EXTEND_QUOTE:
kernel->getSamplerData(samplers);
}
+ static void* kernelDupProfiling(gbe_kernel gbeKernel) {
+ if (gbeKernel == NULL) return NULL;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ return kernel->dupProfilingInfo();
+ }
+ static uint32_t kernelGetProfilingBTI(gbe_kernel gbeKernel) {
+ if (gbeKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ return kernel->getProfilingBTI();
+ }
+ static void kernelOutputProfiling(void *profiling_info, void* buf) {
+ if (profiling_info == NULL) return;
+ ir::ProfilingInfo *pi = (ir::ProfilingInfo *)profiling_info;
+ return pi->outputProfilingInfo(buf);
+ }
static uint32_t kernelGetPrintfNum(void * printf_info) {
if (printf_info == NULL) return 0;
const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
@@ -1241,33 +1335,17 @@ EXTEND_QUOTE:
return ps->getBufBTI();
}
- static uint8_t kernelGetPrintfIndexBufBTI(void * printf_info) {
- if (printf_info == NULL) return 0;
- const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
- return ps->getIndexBufBTI();
- }
-
static void kernelReleasePrintfSet(void * printf_info) {
if (printf_info == NULL) return;
ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
delete ps;
}
- static uint32_t kernelGetPrintfSizeOfSize(void * printf_info) {
- if (printf_info == NULL) return 0;
- const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
- return ps->getPrintfSizeOfSize();
- }
-
- static void kernelOutputPrintf(void * printf_info, void* index_addr,
- void* buf_addr, size_t global_wk_sz0,
- size_t global_wk_sz1, size_t global_wk_sz2,
- size_t output_sz)
+ static void kernelOutputPrintf(void * printf_info, void* buf_addr)
{
if (printf_info == NULL) return;
ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
- ps->outputPrintf(index_addr, buf_addr, global_wk_sz0,
- global_wk_sz1, global_wk_sz2, output_sz);
+ ps->outputPrintf(buf_addr);
}
static void kernelGetCompileWorkGroupSize(gbe_kernel gbeKernel, size_t wg_size[3]) {
@@ -1345,12 +1423,13 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data =
GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
+GBE_EXPORT_SYMBOL gbe_output_profiling_cb *gbe_output_profiling = NULL;
+GBE_EXPORT_SYMBOL gbe_dup_profiling_cb *gbe_dup_profiling = NULL;
+GBE_EXPORT_SYMBOL gbe_get_profiling_bti_cb *gbe_get_profiling_bti = NULL;
GBE_EXPORT_SYMBOL gbe_get_printf_num_cb *gbe_get_printf_num = NULL;
GBE_EXPORT_SYMBOL gbe_dup_printfset_cb *gbe_dup_printfset = NULL;
GBE_EXPORT_SYMBOL gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti = NULL;
-GBE_EXPORT_SYMBOL gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti = NULL;
GBE_EXPORT_SYMBOL gbe_release_printf_info_cb *gbe_release_printf_info = NULL;
-GBE_EXPORT_SYMBOL gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size = NULL;
GBE_EXPORT_SYMBOL gbe_output_printf_cb *gbe_output_printf = NULL;
#ifdef GBE_COMPILER_AVAILABLE
@@ -1394,11 +1473,12 @@ namespace gbe
gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
gbe_kernel_get_image_size = gbe::kernelGetImageSize;
gbe_kernel_get_image_data = gbe::kernelGetImageData;
+ gbe_get_profiling_bti = gbe::kernelGetProfilingBTI;
gbe_get_printf_num = gbe::kernelGetPrintfNum;
+ gbe_dup_profiling = gbe::kernelDupProfiling;
+ gbe_output_profiling = gbe::kernelOutputProfiling;
gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
- gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
gbe_dup_printfset = gbe::kernelDupPrintfSet;
- gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
gbe_release_printf_info = gbe::kernelReleasePrintfSet;
gbe_output_printf = gbe::kernelOutputPrintf;
genSetupCallBacks();
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 84ce333..db770a6 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -36,6 +36,11 @@
extern "C" {
#endif /* __cplusplus */
+typedef struct _DebugInfo {
+ uint32_t line;
+ uint32_t col;
+} DebugInfo;
+
/*! Opaque structure that interfaces a GBE program */
typedef struct _gbe_program *gbe_program;
@@ -92,25 +97,24 @@ enum gbe_curbe_type {
GBE_CURBE_GROUP_NUM_Z,
GBE_CURBE_WORK_DIM,
GBE_CURBE_IMAGE_INFO,
- GBE_CURBE_STACK_POINTER,
- GBE_CURBE_PRINTF_BUF_POINTER,
- GBE_CURBE_PRINTF_INDEX_POINTER,
GBE_CURBE_KERNEL_ARGUMENT,
GBE_CURBE_EXTRA_ARGUMENT,
GBE_CURBE_BLOCK_IP,
GBE_CURBE_DW_BLOCK_IP,
GBE_CURBE_THREAD_NUM,
- GBE_CURBE_ZERO,
- GBE_CURBE_ONE,
- GBE_CURBE_LANE_ID,
- GBE_CURBE_SLM_OFFSET,
- GBE_CURBE_BTI_UTIL,
+ GBE_CURBE_PROFILING_BUF_POINTER,
+ GBE_CURBE_PROFILING_TIMESTAMP0,
+ GBE_CURBE_PROFILING_TIMESTAMP1,
+ GBE_CURBE_PROFILING_TIMESTAMP2,
+ GBE_CURBE_PROFILING_TIMESTAMP3,
+ GBE_CURBE_PROFILING_TIMESTAMP4,
+ GBE_CURBE_THREAD_ID,
+ GBE_GEN_REG,
};
/*! Extra arguments use the negative range of sub-values */
enum gbe_extra_argument {
GBE_STACK_BUFFER = 0, /* Give stack location in curbe */
- GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
};
typedef struct ImageInfo {
@@ -138,6 +142,17 @@ extern gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size;
typedef void (gbe_kernel_get_image_data_cb)(gbe_kernel gbeKernel, ImageInfo *images);
extern gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data;
+/*! Get whether we are in the code profiling mode */
+typedef void (gbe_output_profiling_cb)(void* profiling_info, void* buf);
+extern gbe_output_profiling_cb *gbe_output_profiling;
+
+/*! Get the profiling bti */
+typedef uint32_t (gbe_get_profiling_bti_cb)(gbe_kernel gbeKernel);
+extern gbe_get_profiling_bti_cb *gbe_get_profiling_bti;
+
+typedef void* (gbe_dup_profiling_cb)(gbe_kernel gbeKernel);
+extern gbe_dup_profiling_cb *gbe_dup_profiling;
+
/*! Get the printf number */
typedef uint32_t (gbe_get_printf_num_cb)(void* printf_info);
extern gbe_get_printf_num_cb *gbe_get_printf_num;
@@ -146,9 +161,6 @@ extern gbe_get_printf_num_cb *gbe_get_printf_num;
typedef uint8_t (gbe_get_printf_buf_bti_cb)(void* printf_info);
extern gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti;
-typedef uint8_t (gbe_get_printf_indexbuf_bti_cb)(void* printf_info);
-extern gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti;
-
/*! Release the printfset */
typedef void (gbe_release_printf_info_cb)(void* printf_info);
extern gbe_release_printf_info_cb *gbe_release_printf_info;
@@ -157,12 +169,7 @@ extern gbe_release_printf_info_cb *gbe_release_printf_info;
typedef void* (gbe_dup_printfset_cb)(gbe_kernel gbeKernel);
extern gbe_dup_printfset_cb *gbe_dup_printfset;
-/*! Get the printf buffer const offset */
-typedef uint32_t (gbe_get_printf_sizeof_size_cb)(void* printf_info);
-extern gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size;
-
-typedef void (gbe_output_printf_cb) (void* printf_info, void* index_addr, void* buf_addr,
- size_t global_wk_sz0, size_t global_wk_sz1, size_t global_wk_sz2, size_t outbuf_sz);
+typedef void (gbe_output_printf_cb) (void* printf_info, void* buf_addr);
extern gbe_output_printf_cb* gbe_output_printf;
/*! Create a new program from the given source code (zero terminated string) */
@@ -198,7 +205,8 @@ extern gbe_program_check_opt_cb *gbe_program_check_opt;
/*! create s new genprogram for link. */
typedef gbe_program (gbe_program_new_gen_program_cb)(uint32_t deviceID,
const void *module,
- const void *act);
+ const void *act,
+ const char *asm_file_name);
extern gbe_program_new_gen_program_cb *gbe_program_new_gen_program;
/*! Create a new program from the given blob */
@@ -222,7 +230,8 @@ typedef gbe_program (gbe_program_new_from_llvm_cb)(uint32_t deviceID,
size_t string_size,
char *err,
size_t *err_size,
- int optLevel);
+ int optLevel,
+ const char* options);
extern gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm;
/*! link the programs from llvm level. */
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 4836c51..1f0ec55 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -90,7 +90,7 @@ namespace gbe {
/*! Set the instruction stream.*/
virtual void setCode(const char *, size_t size) = 0;
/*! Return the instruction stream size (to be implemented) */
- virtual size_t getCodeSize(void) const = 0;
+ virtual uint32_t getCodeSize(void) const = 0;
/*! Get the kernel name */
INLINE const char *getName(void) const { return name.c_str(); }
/*! Return the number of arguments for the kernel call */
@@ -137,14 +137,22 @@ namespace gbe {
void setImageSet(ir::ImageSet * from) {
imageSet = from;
}
+ /*! Set profiling info. */
+ void setProfilingInfo(ir::ProfilingInfo * from) {
+ profilingInfo = from;
+ }
+ void * dupProfilingInfo() const {
+ void* ptr = profilingInfo ? (void *)(new ir::ProfilingInfo(*profilingInfo)) : NULL;
+ return ptr;
+ }
+ uint32_t getProfilingBTI(void) const {
+ return profilingInfo ? profilingInfo->getBTI() : 0;
+ }
/*! Set printf set. */
void setPrintfSet(ir::PrintfSet * from) {
printfSet = from;
}
- /* ! Return the offset in the sizeof(xxx). */
- uint32_t getPrintfSizeOfSize(void) const {
- return printfSet ? printfSet->getPrintfSizeOfSize() : 0;
- }
+
uint32_t getPrintfNum() const {
return printfSet ? printfSet->getPrintfNum() : 0;
}
@@ -158,16 +166,14 @@ namespace gbe {
return printfSet->getBufBTI();
}
- uint8_t getPrintfIndexBufBTI() const {
- GBE_ASSERT(printfSet);
- return printfSet->getIndexBufBTI();
+ uint32_t getProfilingBufBTI() const {
+ GBE_ASSERT(profilingInfo);
+ return profilingInfo->getBTI();
}
- void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
- size_t global_wk_sz1, size_t global_wk_sz2, size_t output_sz) {
- if(printfSet)
- printfSet->outputPrintf(index_addr, buf_addr, global_wk_sz0,
- global_wk_sz1, global_wk_sz2, output_sz);
+ void outputProfilingInfo(void* buf) {
+ if(profilingInfo)
+ profilingInfo->outputProfilingInfo(buf);
}
KernelArgument::ArgInfo* getArgInfo(uint32_t id) const { return &args[id].info; }
@@ -219,8 +225,8 @@ namespace gbe {
*/
/*! Implements the serialization. */
- virtual size_t serializeToBin(std::ostream& outs);
- virtual size_t deserializeFromBin(std::istream& ins);
+ virtual uint32_t serializeToBin(std::ostream& outs);
+ virtual uint32_t deserializeFromBin(std::istream& ins);
virtual void printStatus(int indent, std::ostream& outs);
protected:
@@ -232,7 +238,7 @@ namespace gbe {
uint32_t argNum; //!< Number of function arguments
uint32_t curbeSize; //!< Size of the data to push
uint32_t simdWidth; //!< SIMD size for the kernel (lane number)
- uint32_t stackSize; //!< Stack size (may be 0 if unused)
+ uint32_t stackSize; //!< Stack size (0 if unused)
uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused)
bool useSLM; //!< SLM requires a special HW config
uint32_t slmSize; //!< slm size for kernel variable
@@ -240,7 +246,8 @@ namespace gbe {
ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
ir::ImageSet *imageSet; //!< Copy from the corresponding function.
ir::PrintfSet *printfSet; //!< Copy from the corresponding function.
- size_t compileWgSize[3]; //!< required work group size by kernel attribute.
+ ir::ProfilingInfo *profilingInfo; //!< Copy from the corresponding function.
+ uint32_t compileWgSize[3]; //!< required work group size by kernel attribute.
std::string functionAttributes; //!< function attribute qualifiers combined.
GBE_CLASS(Kernel); //!< Use custom allocators
};
@@ -250,7 +257,7 @@ namespace gbe {
{
public:
/*! Create an empty program */
- Program(void);
+ Program(uint32_t fast_relaxed_math);
/*! Destroy the program */
virtual ~Program(void);
/*! Clean LLVM resource of the program */
@@ -305,13 +312,15 @@ namespace gbe {
*/
/*! Implements the serialization. */
- virtual size_t serializeToBin(std::ostream& outs);
- virtual size_t deserializeFromBin(std::istream& ins);
+ virtual uint32_t serializeToBin(std::ostream& outs);
+ virtual uint32_t deserializeFromBin(std::istream& ins);
virtual void printStatus(int indent, std::ostream& outs);
+ uint32_t fast_relaxed_math : 1;
protected:
/*! Compile a kernel */
- virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath) = 0;
+ virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name,
+ bool relaxMath, int profiling) = 0;
/*! Allocate an empty kernel. */
virtual Kernel *allocateKernel(const std::string &name) = 0;
/*! Kernels sorted by their name */
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index 8225d4a..8e42891 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -196,8 +196,8 @@ void program_build_instance::serialize_program(void) throw(int)
if(gen_pci_id){
//add header to differeciate from llvm bitcode binary.
- // (5 bytes: 1 byte for binary type, 4 byte for bc code, 'GENC' is for gen binary.)
- char gen_header[6] = "\0GENC";
+ // (5 bytes: 1 byte for binary version, 4 byte for bc code, 'GENC' is for gen binary.)
+ char gen_header[6] = "\1GENC";
OUTS_UPDATE_SZ(gen_header[0]);
OUTS_UPDATE_SZ(gen_header[1]);
OUTS_UPDATE_SZ(gen_header[2]);
@@ -226,6 +226,7 @@ void program_build_instance::serialize_program(void) throw(int)
size_t bin_length = gbe_program_serialize_to_binary((gbe_program)gbe_prog, &llvm_binary, 1);
oss.write(llvm_binary, bin_length);
sz += bin_length;
+ free(llvm_binary);
}
for (size_t i = 0; i < sz; i++) {
@@ -242,8 +243,8 @@ void program_build_instance::serialize_program(void) throw(int)
} else {
if(gen_pci_id){
//add header to differeciate from llvm bitcode binary.
- // (5 bytes: 1 byte for binary type, 4 byte for bc code, 'GENC' is for gen binary.)
- char gen_header[6] = "\0GENC";
+ // (5 bytes: 1 byte for binary version, 4 byte for bc code, 'GENC' is for gen binary.)
+ char gen_header[6] = "\1GENC";
OUTF_UPDATE_SZ(gen_header[0]);
OUTF_UPDATE_SZ(gen_header[1]);
OUTF_UPDATE_SZ(gen_header[2]);
@@ -258,6 +259,7 @@ void program_build_instance::serialize_program(void) throw(int)
size_t bin_length = gbe_program_serialize_to_binary((gbe_program)gbe_prog, &llvm_binary, 1);
ofs.write(llvm_binary, bin_length);
sz+=bin_length;
+ free(llvm_binary);
}
}
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
index 4594a0a..34d04dd 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -22,6 +22,7 @@
#include "sys/platform.cpp"
#include "ir/constant.cpp"
#include "ir/printf.cpp"
+#include "ir/profiling.cpp"
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wunused-variable"
@@ -64,11 +65,12 @@ struct BinInterpCallBackInitializer
gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
gbe_kernel_get_image_data = gbe::kernelGetImageData;
gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
+ gbe_get_profiling_bti = gbe::kernelGetProfilingBTI;
+ gbe_dup_profiling = gbe::kernelDupProfiling;
+ gbe_output_profiling = gbe::kernelOutputProfiling;
gbe_get_printf_num = gbe::kernelGetPrintfNum;
gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
- gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
gbe_dup_printfset = gbe::kernelDupPrintfSet;
- gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
gbe_release_printf_info = gbe::kernelReleasePrintfSet;
gbe_output_printf = gbe::kernelOutputPrintf;
}
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
index fa4e14a..54ae3f1 100644
--- a/backend/src/ir/constant.cpp
+++ b/backend/src/ir/constant.cpp
@@ -43,31 +43,34 @@ namespace ir {
#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
- size_t ConstantSet::serializeToBin(std::ostream& outs) {
- size_t ret_size = 0;
+ uint32_t ConstantSet::serializeToBin(std::ostream& outs) {
+ uint32_t ret_size = 0;
OUT_UPDATE_SZ(magic_begin);
/* output the const data. */
- OUT_UPDATE_SZ((data.size()*sizeof(char)));
+ uint32_t sz = data.size()*sizeof(char);
+ OUT_UPDATE_SZ(sz);
if(data.size() > 0) {
- outs.write(data.data(), data.size()*sizeof(char));
- ret_size += data.size()*sizeof(char);
+ outs.write(data.data(), sz);
+ ret_size += sz;
}
- OUT_UPDATE_SZ(constants.size());
- for (size_t i = 0; i < constants.size(); ++i) {
+ sz = constants.size();
+ OUT_UPDATE_SZ(sz);
+ for (uint32_t i = 0; i < constants.size(); ++i) {
Constant& cnst = constants[i];
- size_t bytes = sizeof(cnst.getName().size()) //name length self
- + cnst.getName().size()*sizeof(char) //name
- + sizeof(cnst.getSize()) //size
- + sizeof(cnst.getAlignment()) //alignment
- + sizeof(cnst.getOffset()); //offset
+ sz = cnst.getName().size()*sizeof(char);
+ uint32_t bytes = sizeof(sz) //name length self
+ + sz //name
+ + sizeof(cnst.getSize()) //size
+ + sizeof(cnst.getAlignment()) //alignment
+ + sizeof(cnst.getOffset()); //offset
OUT_UPDATE_SZ(bytes);
- OUT_UPDATE_SZ(cnst.getName().size());
- outs.write(cnst.getName().c_str(), cnst.getName().size());
- ret_size += sizeof(char)*cnst.getName().size();
+ OUT_UPDATE_SZ(sz);
+ outs.write(cnst.getName().c_str(), sz);
+ ret_size += sz;
OUT_UPDATE_SZ(cnst.getSize());
OUT_UPDATE_SZ(cnst.getAlignment());
OUT_UPDATE_SZ(cnst.getOffset());
@@ -79,10 +82,10 @@ namespace ir {
return ret_size;
}
- size_t ConstantSet::deserializeFromBin(std::istream& ins) {
- size_t total_size = 0;
- size_t global_data_sz = 0;
- size_t const_num;
+ uint32_t ConstantSet::deserializeFromBin(std::istream& ins) {
+ uint32_t total_size = 0;
+ uint32_t global_data_sz = 0;
+ uint32_t const_num;
uint32_t magic;
IN_UPDATE_SZ(magic);
@@ -90,18 +93,18 @@ namespace ir {
return 0;
IN_UPDATE_SZ(global_data_sz);
- for (size_t i = 0; i < global_data_sz; i++) {
+ for (uint32_t i = 0; i < global_data_sz; i++) {
char elt;
IN_UPDATE_SZ(elt);
data.push_back(elt);
}
IN_UPDATE_SZ(const_num);
- for (size_t i = 0; i < const_num; i++) {
- size_t bytes;
+ for (uint32_t i = 0; i < const_num; i++) {
+ uint32_t bytes;
IN_UPDATE_SZ(bytes);
- size_t name_len;
+ uint32_t name_len;
IN_UPDATE_SZ(name_len);
char* c_name = new char[name_len+1];
@@ -129,7 +132,7 @@ namespace ir {
if (magic != magic_end)
return 0;
- size_t total_bytes;
+ uint32_t total_bytes;
IN_UPDATE_SZ(total_bytes);
if (total_bytes + sizeof(total_size) != total_size)
return 0;
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
index 0891d7b..c9080b8 100644
--- a/backend/src/ir/constant.hpp
+++ b/backend/src/ir/constant.hpp
@@ -119,8 +119,8 @@ namespace ir {
*/
/*! Implements the serialization. */
- virtual size_t serializeToBin(std::ostream& outs);
- virtual size_t deserializeFromBin(std::istream& ins);
+ virtual uint32_t serializeToBin(std::ostream& outs);
+ virtual uint32_t deserializeFromBin(std::istream& ins);
private:
vector<char> data; //!< The constant data serialized in one array
diff --git a/backend/src/ir/context.cpp b/backend/src/ir/context.cpp
index 2412fe9..e4aac08 100644
--- a/backend/src/ir/context.cpp
+++ b/backend/src/ir/context.cpp
@@ -93,9 +93,11 @@ namespace ir {
usedLabels = elem.usedLabels;
}
- Register Context::reg(RegisterFamily family, bool uniform) {
+ Register Context::reg(RegisterFamily family, bool uniform,
+ gbe_curbe_type curbeType,
+ int subType) {
GBE_ASSERTM(fn != NULL, "No function currently defined");
- return fn->newRegister(family, uniform);
+ return fn->newRegister(family, uniform, curbeType, subType);
}
LabelIndex Context::label(void) {
@@ -113,6 +115,7 @@ namespace ir {
GBE_ASSERTM(fn != NULL, "No function currently defined");
GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align, info, bti);
+ fn->setRegPayloadType(arg->reg, GBE_CURBE_KERNEL_ARGUMENT, fn->args.size());
fn->args.push_back(arg);
}
@@ -159,6 +162,7 @@ namespace ir {
// Append the instruction in the stream
Instruction *insnPtr = fn->newInstruction(insn);
bb->append(*insnPtr);
+ insnPtr->setDBGInfo(this->DBGInfo);
#if GBE_DEBUG
std::string whyNot;
if(getUnit().getValid())
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index 54265d0..877d639 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -63,7 +63,8 @@ namespace ir {
/*! Append a new pushed constant */
void appendPushedConstant(Register reg, const PushLocation &pushed);
/*! Create a new register with the given family for the current function */
- Register reg(RegisterFamily family, bool uniform = false);
+ Register reg(RegisterFamily family, bool uniform = false,
+ gbe_curbe_type curbeType = GBE_GEN_REG, int subType = 0);
/*! Create a new immediate value */
template <typename T> INLINE ImmediateIndex newImmediate(T value) {
const Immediate imm(value);
@@ -148,6 +149,11 @@ namespace ir {
GBE_ASSERTM(fn != NULL, "No function currently defined");
return fn->file.appendArrayTuple(reg, regNum);
}
+ /*! Make a tuple from an array of types */
+ INLINE Tuple arrayTypeTuple(const ir::Type *type, uint32_t num) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ return fn->file.appendArrayTypeTuple((uint8_t*)type, num);
+ }
/*! We just use variadic templates to forward instruction functions */
#define DECL_INSN(NAME, FAMILY) \
template <typename... Args> INLINE void NAME(Args...args);
@@ -174,6 +180,7 @@ namespace ir {
DECL_THREE_SRC_INSN(SEL);
DECL_THREE_SRC_INSN(I64MADSAT);
DECL_THREE_SRC_INSN(MAD);
+ DECL_THREE_SRC_INSN(LRP);
#undef DECL_THREE_SRC_INSN
/*! For all nullary functions */
@@ -188,26 +195,8 @@ namespace ir {
this->append(insn);
}
- /*! LOAD with the destinations directly specified */
- template <typename... Args>
- void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
- {
- const Tuple index = this->tuple(values...);
- const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
- GBE_ASSERT(valueNum > 0);
- this->LOAD(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
- }
-
- /*! STORE with the sources directly specified */
- template <typename... Args>
- void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
- {
- const Tuple index = this->tuple(values...);
- const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
- GBE_ASSERT(valueNum > 0);
- this->STORE(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
- }
void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti, reg); }
+ void setDBGInfo(DebugInfo in) { DBGInfo = in; }
protected:
/*! A block must be started with a label */
@@ -232,6 +221,7 @@ namespace ir {
vector<uint8_t> *usedLabels; //!< Store all labels that are defined
};
vector<StackElem> fnStack; //!< Stack of functions still to finish
+ DebugInfo DBGInfo;
GBE_CLASS(Context);
};
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index f87f23a..2fe080a 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -43,7 +43,8 @@ namespace ir {
///////////////////////////////////////////////////////////////////////////
Function::Function(const std::string &name, const Unit &unit, Profile profile) :
- name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0), stackSize(0)
+ name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0), stackSize(0),
+ wgBroadcastSLM(-1), tidMapSLM(-1)
{
initProfile(*this);
samplerSet = GBE_NEW(SamplerSet);
@@ -61,8 +62,38 @@ namespace ir {
return unit.getPointerFamily();
}
- void Function::addLoop(const vector<LabelIndex> &bbs, const vector<std::pair<LabelIndex, LabelIndex>> &exits) {
- loops.push_back(GBE_NEW(Loop, bbs, exits));
+ void Function::addLoop(LabelIndex preheader,
+ int parent,
+ const vector<LabelIndex> &bbs,
+ const vector<std::pair<LabelIndex, LabelIndex>> &exits) {
+ loops.push_back(GBE_NEW(Loop, preheader, parent, bbs, exits));
+ }
+
+ int Function::getLoopDepth(LabelIndex Block) const{
+ if (loops.size() == 0) return 0;
+
+ int LoopIndex = -1;
+ int LoopDepth = 0;
+ // get innermost loop
+ for (int Idx = loops.size()-1; Idx >= 0; Idx--) {
+ Loop *lp = loops[Idx];
+ vector<LabelIndex> &Blocks = lp->bbs;
+ bool Found = (std::find(Blocks.begin(), Blocks.end(), Block) != Blocks.end());
+ if (Found) {
+ LoopIndex = Idx;
+ break;
+ }
+ }
+
+ if (LoopIndex != -1) {
+ int LoopId = LoopIndex;
+ do {
+ LoopId = loops[LoopId]->parent;
+ LoopDepth++;
+ } while(LoopId != -1);
+ }
+
+ return LoopDepth;
}
void Function::checkEmptyLabels(void) {
@@ -119,6 +150,8 @@ namespace ir {
for (auto &y : x->bbs)
y = labelMap[y];
+ x->preheader = labelMap[x->preheader];
+
for (auto &z : x->exits) {
z.first = labelMap[z.first];
z.second = labelMap[z.second];
@@ -392,7 +425,7 @@ namespace ir {
LabelIndex BasicBlock::getLabelIndex(void) const {
const Instruction *first = this->getFirstInstruction();
const LabelInstruction *label = cast<LabelInstruction>(first);
- return label->getLabelIndex();
+ return label?label->getLabelIndex():LabelIndex(-1);
}
} /* namespace ir */
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 5d00cca..ae0a702 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -177,10 +177,14 @@ namespace ir {
struct InfoFromLLVM { // All the info about passed by llvm, using -cl-kernel-arg-info
uint32_t addrSpace;
std::string typeName;
+ std::string typeBaseName;
std::string accessQual;
std::string typeQual;
std::string argName; // May differ from arg->getName()
+
+ // only llvm-3.6 or later has kernel_arg_base_type in metadata.
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 5)
bool isImage1dT() const {
return typeName.compare("image1d_t") == 0;
}
@@ -199,16 +203,38 @@ namespace ir {
bool isImage3dT() const {
return typeName.compare("image3d_t") == 0;
}
+ bool isSamplerType() const {
+ return typeName.compare("sampler_t") == 0;
+ }
+#else
+ bool isImage1dT() const {
+ return typeBaseName.compare("image1d_t") == 0;
+ }
+ bool isImage1dArrayT() const {
+ return typeBaseName.compare("image1d_array_t") == 0;
+ }
+ bool isImage1dBufferT() const {
+ return typeBaseName.compare("image1d_buffer_t") == 0;
+ }
+ bool isImage2dT() const {
+ return typeBaseName.compare("image2d_t") == 0;
+ }
+ bool isImage2dArrayT() const {
+ return typeBaseName.compare("image2d_array_t") == 0;
+ }
+ bool isImage3dT() const {
+ return typeBaseName.compare("image3d_t") == 0;
+ }
+ bool isSamplerType() const {
+ return typeBaseName.compare("sampler_t") == 0;
+ }
+#endif
bool isImageType() const {
return isImage1dT() || isImage1dArrayT() || isImage1dBufferT() ||
isImage2dT() || isImage2dArrayT() || isImage3dT();
}
- bool isSamplerType() const {
- return typeName.compare("sampler_t") == 0;
- }
-
};
/*! Create a function input argument */
@@ -247,8 +273,14 @@ namespace ir {
struct Loop : public NonCopyable
{
public:
- Loop(const vector<LabelIndex> &in, const vector<std::pair<LabelIndex, LabelIndex>> &exit) :
- bbs(in), exits(exit) {}
+ Loop(LabelIndex pre,
+ int paren,
+ const vector<LabelIndex> &in,
+ const vector<std::pair<LabelIndex, LabelIndex>> &exit) :
+ preheader(pre), parent(paren), bbs(in), exits(exit) {}
+
+ LabelIndex preheader;
+ int parent;
vector<LabelIndex> bbs;
vector<std::pair<LabelIndex, LabelIndex>> exits;
GBE_STRUCT(Loop);
@@ -273,8 +305,11 @@ namespace ir {
/*! Get the function profile */
INLINE Profile getProfile(void) const { return profile; }
/*! Get a new valid register */
- INLINE Register newRegister(RegisterFamily family, bool uniform = false) {
- return this->file.append(family, uniform);
+ INLINE Register newRegister(RegisterFamily family,
+ bool uniform = false,
+ gbe_curbe_type curbeType = GBE_GEN_REG,
+ int subType = 0) {
+ return this->file.append(family, uniform, curbeType, subType);
}
/*! Get the function name */
const std::string &getName(void) const { return name; }
@@ -288,6 +323,18 @@ namespace ir {
INLINE void setRegisterUniform(Register reg, bool uniform) { file.setUniform(reg, uniform); }
/*! return true if the specified register is uniform type */
INLINE bool isUniformRegister(Register reg) { return file.isUniform(reg); }
+ /*! set register as specified payload type */
+ INLINE void setRegPayloadType(Register reg, gbe_curbe_type curbeType, int subType) {
+ file.setPayloadType(reg, curbeType, subType);
+ }
+ /*! get register's payload type. */
+ INLINE void getRegPayloadType(Register reg, gbe_curbe_type &curbeType, int &subType) const {
+ file.getPayloadType(reg, curbeType, subType);
+ }
+ /*! check whether a register is a payload register */
+ INLINE bool isPayloadReg(Register reg) const{
+ return file.isPayloadReg(reg);
+ }
/*! Get the register family from the register itself */
INLINE RegisterFamily getRegisterFamily(Register reg) const {
return this->getRegisterData(reg).family;
@@ -300,6 +347,14 @@ namespace ir {
INLINE void setRegister(Tuple ID, uint32_t which, Register reg) {
file.set(ID, which, reg);
}
+ /*! Get the type from the tuple vector */
+ INLINE uint8_t getType(Tuple ID, uint32_t which) const {
+ return file.getType(ID, which);
+ }
+ /*! Set the type into the tuple vector */
+ INLINE void setType(Tuple ID, uint32_t which, uint8_t type) {
+ file.setType(ID, which, type);
+ }
/*! Get the register file */
INLINE const RegisterFile &getRegisterFile(void) const { return file; }
/*! Get the given value ie immediate from the function */
@@ -438,6 +493,14 @@ namespace ir {
block->foreach(functor);
}
}
+ /*! Get wgBroadcastSLM in this function */
+ int32_t getwgBroadcastSLM(void) const { return wgBroadcastSLM; }
+ /*! Set wgBroadcastSLM for this function */
+ void setwgBroadcastSLM(int32_t v) { wgBroadcastSLM = v; }
+ /*! Get tidMapSLM in this function */
+ int32_t gettidMapSLM(void) const { return tidMapSLM; }
+ /*! Set tidMapSLM for this function */
+ void settidMapSLM(int32_t v) { tidMapSLM = v; }
/*! Does it use SLM */
INLINE bool getUseSLM(void) const { return this->useSLM; }
/*! Change the SLM config for the function */
@@ -465,12 +528,27 @@ namespace ir {
/*! Push stack size. */
INLINE void pushStackSize(uint32_t step) { this->stackSize += step; }
/*! add the loop info for later liveness analysis */
- void addLoop(const vector<LabelIndex> &bbs, const vector<std::pair<LabelIndex, LabelIndex>> &exits);
+ void addLoop(LabelIndex preheader,
+ int parent,
+ const vector<LabelIndex> &bbs,
+ const vector<std::pair<LabelIndex, LabelIndex>> &exits);
INLINE const vector<Loop * > &getLoops() { return loops; }
+ int getLoopDepth(LabelIndex Block) const;
vector<BasicBlock *> &getBlocks() { return blocks; }
/*! Get surface starting address register from bti */
Register getSurfaceBaseReg(uint8_t bti) const;
void appendSurface(uint8_t bti, Register reg);
+ /*! Get instruction distance between two BBs, including both b0 and b1,
+ where b0 must be less than or equal to b1. */
+ INLINE uint32_t getDistance(LabelIndex b0, LabelIndex b1) const {
+ uint32_t insnNum = 0;
+ GBE_ASSERT(b0.value() <= b1.value());
+ for(uint32_t i = b0.value(); i <= b1.value(); i++) {
+ BasicBlock &bb = getBlock(LabelIndex(i));
+ insnNum += bb.size();
+ }
+ return insnNum;
+ }
/*! Output the control flow graph to .dot file */
void outputCFG();
private:
@@ -498,6 +576,8 @@ namespace ir {
size_t compileWgSize[3]; //!< required work group size specified by
// __attribute__((reqd_work_group_size(X, Y, Z))).
std::string functionAttributes; //!< function attribute qualifiers combined.
+ int32_t wgBroadcastSLM; //!< Used for broadcast the workgroup value.
+ int32_t tidMapSLM; //!< Used to store the map between groupid and hw thread.
GBE_CLASS(Function); //!< Use custom allocator
};
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index 8976a68..eaf6be7 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -107,12 +107,14 @@ namespace ir {
#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
/*! Implements the serialization. */
- size_t ImageSet::serializeToBin(std::ostream& outs) {
- size_t ret_size = 0;
+ uint32_t ImageSet::serializeToBin(std::ostream& outs) {
+ uint32_t ret_size = 0;
+ uint32_t sz = 0;
OUT_UPDATE_SZ(magic_begin);
- OUT_UPDATE_SZ(regMap.size());
+ sz = regMap.size();
+ OUT_UPDATE_SZ(sz);
for (map<Register, struct ImageInfo *>::const_iterator it = regMap.begin(); it != regMap.end(); ++it) {
OUT_UPDATE_SZ(it->first);
OUT_UPDATE_SZ(it->second->arg_idx);
@@ -125,7 +127,8 @@ namespace ir {
OUT_UPDATE_SZ(it->second->dimOrderSlot);
}
- OUT_UPDATE_SZ(indexMap.size());
+ sz = indexMap.size();
+ OUT_UPDATE_SZ(sz);
for (map<uint32_t, struct ImageInfo *>::iterator it = indexMap.begin(); it != indexMap.end(); ++it) {
OUT_UPDATE_SZ(it->first);
OUT_UPDATE_SZ(it->second->arg_idx);
@@ -144,17 +147,17 @@ namespace ir {
return ret_size;
}
- size_t ImageSet::deserializeFromBin(std::istream& ins) {
- size_t total_size = 0;
+ uint32_t ImageSet::deserializeFromBin(std::istream& ins) {
+ uint32_t total_size = 0;
uint32_t magic;
- size_t image_map_sz = 0;
+ uint32_t image_map_sz = 0;
IN_UPDATE_SZ(magic);
if (magic != magic_begin)
return 0;
IN_UPDATE_SZ(image_map_sz); //regMap
- for (size_t i = 0; i < image_map_sz; i++) {
+ for (uint32_t i = 0; i < image_map_sz; i++) {
ir::Register reg;
ImageInfo *img_info = GBE_NEW(struct ImageInfo);;
@@ -193,7 +196,7 @@ namespace ir {
if (magic != magic_end)
return 0;
- size_t total_bytes;
+ uint32_t total_bytes;
IN_UPDATE_SZ(total_bytes);
if (total_bytes + sizeof(total_size) != total_size)
return 0;
@@ -247,7 +250,7 @@ namespace ir {
auto it = infoRegMap.find(key.data);
if (it != infoRegMap.end())
return it->second;
- Register reg = ctx->reg(FAMILY_DWORD);
+ Register reg = ctx->reg(FAMILY_DWORD, false, GBE_CURBE_IMAGE_INFO, key.data);
infoRegMap.insert(std::make_pair(key.data, reg));
return reg;
}
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
index a93a4b6..ca616c1 100644
--- a/backend/src/ir/image.hpp
+++ b/backend/src/ir/image.hpp
@@ -86,8 +86,8 @@ namespace ir {
*/
/*! Implements the serialization. */
- virtual size_t serializeToBin(std::ostream& outs);
- virtual size_t deserializeFromBin(std::istream& ins);
+ virtual uint32_t serializeToBin(std::ostream& outs);
+ virtual uint32_t deserializeFromBin(std::istream& ins);
virtual void printStatus(int indent, std::ostream& outs);
private:
diff --git a/backend/src/ir/immediate.cpp b/backend/src/ir/immediate.cpp
index 35fc965..30db8e7 100644
--- a/backend/src/ir/immediate.cpp
+++ b/backend/src/ir/immediate.cpp
@@ -276,6 +276,7 @@ using namespace ir;
for(uint32_t i = 0; i < immVec.size(); i++)
this->data.immVec[i] = immVec[i];
}
+ defaultData = 0ull;
}
diff --git a/backend/src/ir/immediate.hpp b/backend/src/ir/immediate.hpp
index 3141643..6bc60d5 100644
--- a/backend/src/ir/immediate.hpp
+++ b/backend/src/ir/immediate.hpp
@@ -343,7 +343,7 @@ namespace ir {
float *f32;
double *f64;
half *f16;
- const Immediate *immVec[];
+ const Immediate **immVec;
void *p;
} data; //!< Value to store
Immediate operator+ (const Immediate &) const;
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index f93c528..ed64580 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -316,60 +316,104 @@ namespace ir {
Type srcType; //!< Type to convert from
};
+ class ALIGNED_INSTRUCTION MemInstruction :
+ public BasePolicy
+ {
+ public:
+ MemInstruction(AddressMode _AM,
+ AddressSpace _AS,
+ bool _dwAligned,
+ Type _type,
+ Register _offset)
+ : AM(_AM),
+ AS(_AS),
+ dwAligned(_dwAligned),
+ type(_type),
+ SurfaceIndex(0),
+ offset(_offset) {
+ }
+ AddressMode getAddressMode() const { return AM; }
+ AddressSpace getAddressSpace() const { return AS; }
+ /*! MemInstruction may have one possible btiReg */
+ Register getBtiReg() const { assert(AM == AM_DynamicBti); return BtiReg; }
+ unsigned getSurfaceIndex() const { assert(AM != AM_DynamicBti); return SurfaceIndex; }
+ Register getAddressRegister()const { return offset; }
+ unsigned getAddressIndex() const { return 0; }
+ Type getValueType() const { return type; }
+ INLINE bool isAligned(void) const { return !!dwAligned; }
+
+ void setSurfaceIndex (unsigned id) { SurfaceIndex = id; }
+ void setBtiReg(Register reg) { BtiReg = reg; }
+ protected:
+ /*! including address reg + optional bti reg */
+ int getBaseSrcNum() const { return AM == AM_DynamicBti ? 2 : 1; }
+ bool hasExtraBtiReg() const { return AM == AM_DynamicBti; }
+ AddressMode AM;
+ AddressSpace AS;
+ uint8_t dwAligned : 1;
+ Type type;
+ union {
+ Register BtiReg;
+ unsigned SurfaceIndex;
+ };
+ Register offset;
+ };
+
class ALIGNED_INSTRUCTION AtomicInstruction :
- public BasePolicy,
+ public MemInstruction,
public NDstPolicy<AtomicInstruction, 1>
{
public:
AtomicInstruction(AtomicOps atomicOp,
+ Type type,
Register dst,
AddressSpace addrSpace,
- Register bti,
- bool fixedBTI,
- Tuple src)
+ Register address,
+ Tuple payload,
+ AddressMode AM)
+ : MemInstruction(AM, addrSpace, true, type, address)
{
this->opcode = OP_ATOMIC;
this->atomicOp = atomicOp;
this->dst[0] = dst;
- this->src = src;
- this->addrSpace = addrSpace;
- this->bti = bti;
- this->fixedBTI = fixedBTI ? 1: 0;
- srcNum = 2;
+ this->payload = payload;
+
+ int payloadNum = 1;
if((atomicOp == ATOMIC_OP_INC) ||
(atomicOp == ATOMIC_OP_DEC))
- srcNum = 1;
+ payloadNum = 0;
if(atomicOp == ATOMIC_OP_CMPXCHG)
- srcNum = 3;
- srcNum++;
+ payloadNum = 2;
+
+ srcNum = payloadNum + getBaseSrcNum();
}
INLINE Register getSrc(const Function &fn, uint32_t ID) const {
- GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
- if (ID == 0u)
- return bti;
- else
- return fn.getRegister(src, ID -1);
+ GBE_ASSERTM((int)ID < (int)srcNum, "Out-of-bound source register for atomic");
+ if (ID == 0) {
+ return offset;
+ } else if (hasExtraBtiReg() && (int)ID == (int)srcNum-1) {
+ return getBtiReg();
+ } else {
+ return fn.getRegister(payload, ID - 1);
+ }
}
INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
- GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
- if (ID == 0u)
- bti = reg;
- else
- fn.setRegister(src, ID - 1, reg);
+ GBE_ASSERTM((int)ID < (int)srcNum, "Out-of-bound source register for atomic");
+ if (ID == 0) {
+ offset = reg;
+ } else if (hasExtraBtiReg() && (int)ID == (int)srcNum - 1) {
+ setBtiReg(reg);
+ } else {
+ fn.setRegister(payload, ID - 1, reg);
+ }
}
INLINE uint32_t getSrcNum(void) const { return srcNum; }
- INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
- INLINE Register getBTI(void) const { return bti; }
- INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
INLINE void out(std::ostream &out, const Function &fn) const;
Register dst[1];
- Tuple src;
- AddressSpace addrSpace; //!< Address space
- Register bti; //!< bti
- uint8_t fixedBTI:1; //!< fixed bti or not
+ Tuple payload;
uint8_t srcNum:3; //!<Source Number
AtomicOps atomicOp:6; //!<Atomic operation
};
@@ -428,119 +472,111 @@ namespace ir {
Register dst[0]; //!< No destination
};
+
class ALIGNED_INSTRUCTION LoadInstruction :
- public BasePolicy,
- public NSrcPolicy<LoadInstruction, 2>
+ public MemInstruction
{
- public:
- LoadInstruction(Type type,
- Tuple dstValues,
- Register offset,
- AddressSpace addrSpace,
- uint32_t valueNum,
- bool dwAligned,
- bool fixedBTI,
- Register bti)
- {
- GBE_ASSERT(valueNum < 128);
- this->opcode = OP_LOAD;
- this->type = type;
- this->offset = offset;
- this->values = dstValues;
- this->addrSpace = addrSpace;
- this->valueNum = valueNum;
- this->dwAligned = dwAligned ? 1 : 0;
- this->fixedBTI = fixedBTI ? 1 : 0;
- this->bti = bti;
- }
- INLINE Register getDst(const Function &fn, uint32_t ID) const {
- GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
- return fn.getRegister(values, ID);
- }
- INLINE void setDst(Function &fn, uint32_t ID, Register reg) {
- GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
- fn.setRegister(values, ID, reg);
- }
- INLINE uint32_t getDstNum(void) const { return valueNum; }
- INLINE Type getValueType(void) const { return type; }
- INLINE uint32_t getValueNum(void) const { return valueNum; }
- INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
- INLINE Register getBTI(void) const { return bti; }
- INLINE bool wellFormed(const Function &fn, std::string &why) const;
- INLINE void out(std::ostream &out, const Function &fn) const;
- INLINE bool isAligned(void) const { return !!dwAligned; }
- INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
- Type type; //!< Type to store
- Register src[0]; //!< Address where to load from
- Register bti;
- Register offset; //!< Alias to make it similar to store
- Tuple values; //!< Values to load
- AddressSpace addrSpace; //!< Where to load
- uint8_t fixedBTI:1;
- uint8_t valueNum:7; //!< Number of values to load
- uint8_t dwAligned:1; //!< DWORD aligned is what matters with GEN
- };
+ public:
+ LoadInstruction(Type type,
+ Tuple dstValues,
+ Register offset,
+ AddressSpace AS,
+ uint32_t _valueNum,
+ bool dwAligned,
+ AddressMode AM,
+ bool ifBlock = false)
+ : MemInstruction(AM, AS, dwAligned, type, offset),
+ valueNum(_valueNum),
+ values(dstValues),
+ ifBlock(ifBlock)
+ {
+ this->opcode = OP_LOAD;
+ }
+ INLINE unsigned getSrcNum() const { return getBaseSrcNum(); }
+ INLINE Register getSrc(const Function &fn, unsigned id) const {
+ if (id == 0) return offset;
+ if (hasExtraBtiReg() && id == 1) return BtiReg;
+ assert(0 && "LoadInstruction::getSrc() out-of-range");
+ return ir::Register(0);
+ }
+ INLINE void setSrc(Function &fn, unsigned id, Register reg) {
+ assert(id < getSrcNum());
+ if (id == 0) { offset = reg; return; }
+ if (id == 1) { setBtiReg(reg); return; }
+ }
+ INLINE unsigned getDstNum() const { return valueNum; }
+ INLINE Register getDst(const Function &fn, unsigned id) const {
+ assert(id < valueNum);
+ return fn.getRegister(values, id);
+ }
+ INLINE void setDst(Function &fn, unsigned id, Register reg) {
+ assert(id < getDstNum());
+ fn.setRegister(values, id, reg);
+ }
+ INLINE uint32_t getValueNum(void) const { return valueNum; }
+ INLINE Register getValue(const Function &fn, unsigned id) const {
+ assert(id < valueNum);
+ return fn.getRegister(values, id);
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ INLINE bool isBlock() const { return ifBlock; }
+
+ uint8_t valueNum;
+ Tuple values;
+ bool ifBlock;
+ };
class ALIGNED_INSTRUCTION StoreInstruction :
- public BasePolicy, public NDstPolicy<StoreInstruction, 0>
+ public MemInstruction,
+ public NDstPolicy<StoreInstruction, 0>
{
- public:
- StoreInstruction(Type type,
- Tuple values,
- Register offset,
- AddressSpace addrSpace,
- uint32_t valueNum,
- bool dwAligned,
- bool fixedBTI,
- Register bti)
- {
- GBE_ASSERT(valueNum < 255);
- this->opcode = OP_STORE;
- this->type = type;
- this->offset = offset;
- this->values = values;
- this->addrSpace = addrSpace;
- this->valueNum = valueNum;
- this->dwAligned = dwAligned ? 1 : 0;
- this->fixedBTI = fixedBTI ? 1 : 0;
- this->bti = bti;
- }
- INLINE Register getSrc(const Function &fn, uint32_t ID) const {
- GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
- if (ID == 0u)
- return bti;
- else if (ID == 1u)
- return offset;
- else
- return fn.getRegister(values, ID - 2);
- }
- INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
- GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
- if (ID == 0u)
- bti = reg;
- else if (ID == 1u)
- offset = reg;
- else
- fn.setRegister(values, ID - 2, reg);
- }
- INLINE uint32_t getSrcNum(void) const { return valueNum + 2u; }
- INLINE uint32_t getValueNum(void) const { return valueNum; }
- INLINE Type getValueType(void) const { return type; }
- INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
- INLINE Register getBTI(void) const { return bti; }
- INLINE bool wellFormed(const Function &fn, std::string &why) const;
- INLINE void out(std::ostream &out, const Function &fn) const;
- INLINE bool isAligned(void) const { return !!dwAligned; }
- INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
- Type type; //!< Type to store
- Register bti;
- Register offset; //!< First source is the offset where to store
- Tuple values; //!< Values to store
- AddressSpace addrSpace; //!< Where to store
- uint8_t fixedBTI:1; //!< Which btis need access
- uint8_t valueNum:7; //!< Number of values to store
- uint8_t dwAligned:1; //!< DWORD aligned is what matters with GEN
- Register dst[0]; //!< No destination
+ public:
+ StoreInstruction(Type type,
+ Tuple values,
+ Register offset,
+ AddressSpace addrSpace,
+ uint32_t valueNum,
+ bool dwAligned,
+ AddressMode AM,
+ bool ifBlock = false)
+ : MemInstruction(AM, addrSpace, dwAligned, type, offset)
+ {
+ this->opcode = OP_STORE;
+ this->values = values;
+ this->valueNum = valueNum;
+ this->ifBlock = ifBlock;
+ }
+ INLINE unsigned getValueNum() const { return valueNum; }
+ INLINE Register getValue(const Function &fn, unsigned id) const {
+ return fn.getRegister(values, id);
+ }
+ INLINE unsigned getSrcNum() const { return getBaseSrcNum() + valueNum; }
+ INLINE Register getSrc(const Function &fn, unsigned id) const {
+ if (id == 0) return offset;
+ if (id <= valueNum) return fn.getRegister(values, id-1);
+ if (hasExtraBtiReg() && (int)id == (int)valueNum+1) return getBtiReg();
+ assert(0 && "StoreInstruction::getSrc() out-of-range");
+ return Register(0);
+ }
+ INLINE void setSrc(Function &fn, unsigned id, Register reg) {
+ if (id == 0) { offset = reg; return; }
+ if (id > 0 && id <= valueNum) { fn.setRegister(values, id-1, reg); return; }
+ if (hasExtraBtiReg() &&
+ (int)id == (int)valueNum + 1) {
+ setBtiReg(reg);
+ return;
+ }
+ assert(0 && "StoreInstruction::setSrc() index out-of-range");
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ INLINE bool isBlock() const { return ifBlock; }
+
+ Register dst[0];
+ uint8_t valueNum;
+ Tuple values;
+ bool ifBlock;
};
class ALIGNED_INSTRUCTION SampleInstruction : // TODO
@@ -595,6 +631,58 @@ namespace ir {
static const uint32_t dstNum = 4;
};
+ class ALIGNED_INSTRUCTION VmeInstruction :
+ public BasePolicy,
+ public TupleSrcPolicy<VmeInstruction>,
+ public TupleDstPolicy<VmeInstruction>
+ {
+ public:
+ VmeInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple,
+ uint32_t dstNum, uint32_t srcNum, int msg_type,
+ int vme_search_path_lut, int lut_sub) {
+ this->opcode = OP_VME;
+ this->dst = dstTuple;
+ this->src = srcTuple;
+ this->dstNum = dstNum;
+ this->srcNum = srcNum;
+ this->imageIdx = imageIdx;
+ this->msg_type = msg_type;
+ this->vme_search_path_lut = vme_search_path_lut;
+ this->lut_sub = lut_sub;
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << " src_surface id " << (int)this->getImageIndex()
+ << " ref_surface id " << (int)this->getImageIndex() + 1;
+ for(uint32_t i = 0; i < dstNum; i++){
+ out<< " %" << this->getDst(fn, i);
+ }
+ for(uint32_t i = 0; i < srcNum; i++){
+ out<< " %" << this->getSrc(fn, i);
+ }
+ out
+ << " msg_type " << (int)this->getMsgType()
+ << " vme_search_path_lut " << (int)this->vme_search_path_lut
+ << " lut_sub " << (int)this->lut_sub;
+ }
+ Tuple src;
+ Tuple dst;
+
+ INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
+ INLINE uint8_t getMsgType(void) const { return this->msg_type; }
+
+ INLINE Type getSrcType(void) const { return TYPE_U32; }
+ INLINE Type getDstType(void) const { return TYPE_U32; }
+ uint8_t imageIdx;
+ uint8_t msg_type;
+ uint8_t vme_search_path_lut;
+ uint8_t lut_sub;
+ uint32_t srcNum;
+ uint32_t dstNum;
+ };
+
+
class ALIGNED_INSTRUCTION TypedWriteInstruction : // TODO
public BasePolicy,
public TupleSrcPolicy<TypedWriteInstruction>,
@@ -677,6 +765,58 @@ namespace ir {
static const uint32_t dstNum = 1;
};
+ class ALIGNED_INSTRUCTION CalcTimestampInstruction :
+ public BasePolicy,
+ public NSrcPolicy<CalcTimestampInstruction, 0>,
+ public NDstPolicy<CalcTimestampInstruction, 0>
+ {
+ public:
+ CalcTimestampInstruction(uint32_t pointNum, uint32_t timestampType) {
+ this->opcode = OP_CALC_TIMESTAMP;
+ this->timestampType = static_cast<uint8_t>(timestampType);
+ this->pointNum = static_cast<uint8_t>(pointNum);
+ }
+
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "TimeStamp pointer " << static_cast<uint32_t>(pointNum)
+ << " (Type " << static_cast<uint32_t>(timestampType) << ")";
+ }
+ uint32_t getPointNum(void) const { return this->pointNum; }
+ uint32_t getTimestamptType(void) const { return this->timestampType; }
+ uint8_t timestampType; //!< Type of the time stamp, 16bits or 32bits, eg.
+ uint8_t pointNum; //!< The insert point number.
+ Register dst[0], src[0];
+ };
+
+ class ALIGNED_INSTRUCTION StoreProfilingInstruction :
+ public BasePolicy,
+ public NSrcPolicy<StoreProfilingInstruction, 0>,
+ public NDstPolicy<StoreProfilingInstruction, 0>
+ {
+ public:
+ StoreProfilingInstruction(uint32_t bti, uint32_t profilingType) {
+ this->opcode = OP_STORE_PROFILING;
+ this->profilingType = static_cast<uint8_t>(profilingType);
+ this->bti = static_cast<uint8_t>(bti);
+ }
+
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << " BTI " << static_cast<uint32_t>(this->bti)
+ << " (Type " << static_cast<uint32_t>(this->profilingType) << ")";
+ }
+
+ uint32_t getProfilingType(void) const { return this->profilingType; }
+ uint32_t getBTI(void) const { return this->bti; }
+ uint8_t profilingType; //!< Type format of profiling, 16bits or 32bits, eg.
+ uint8_t bti;
+ Register src[0];
+ Register dst[0];
+ };
+
class ALIGNED_INSTRUCTION LoadImmInstruction :
public BasePolicy,
public NSrcPolicy<LoadImmInstruction, 0>,
@@ -818,6 +958,184 @@ namespace ir {
Register dst[0], src[0];
};
+ /*! Wait instructions */
+ class ALIGNED_INSTRUCTION WaitInstruction :
+ public BasePolicy,
+ public NSrcPolicy<WaitInstruction, 0>,
+ public NDstPolicy<WaitInstruction, 0>
+ {
+ public:
+ INLINE WaitInstruction() {
+ this->opcode = OP_WAIT;
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ Register dst[0], src[0];
+ };
+
+ class ALIGNED_INSTRUCTION WorkGroupInstruction :
+ public BasePolicy,
+ public TupleSrcPolicy<WorkGroupInstruction>,
+ public NDstPolicy<WorkGroupInstruction, 1>
+ {
+ public:
+ INLINE WorkGroupInstruction(WorkGroupOps opcode, uint32_t slmAddr, Register dst,
+ Tuple srcTuple, uint8_t srcNum, Type type) {
+ this->opcode = OP_WORKGROUP;
+ this->workGroupOp = opcode;
+ this->type = type;
+ this->dst[0] = dst;
+ this->src = srcTuple;
+ this->srcNum = srcNum;
+ this->slmAddr = slmAddr;
+ }
+ INLINE Type getType(void) const { return this->type; }
+ INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ INLINE WorkGroupOps getWorkGroupOpcode(void) const { return this->workGroupOp; }
+ uint32_t getSlmAddr(void) const { return this->slmAddr; }
+
+ WorkGroupOps workGroupOp:5;
+ uint32_t srcNum:3; //!< Source Number
+ uint32_t slmAddr:24; //!< Thread Map in SLM.
+ Type type; //!< Type of the instruction
+ Tuple src;
+ Register dst[1];
+ };
+
+ class ALIGNED_INSTRUCTION SubGroupInstruction :
+ public BasePolicy,
+ public TupleSrcPolicy<SubGroupInstruction>,
+ public NDstPolicy<SubGroupInstruction, 1>
+ {
+ public:
+ INLINE SubGroupInstruction(WorkGroupOps opcode, Register dst,
+ Tuple srcTuple, uint8_t srcNum, Type type) {
+ this->opcode = OP_SUBGROUP;
+ this->workGroupOp = opcode;
+ this->type = type;
+ this->dst[0] = dst;
+ this->src = srcTuple;
+ this->srcNum = srcNum;
+ }
+ INLINE Type getType(void) const { return this->type; }
+ INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ INLINE WorkGroupOps getWorkGroupOpcode(void) const { return this->workGroupOp; }
+
+ WorkGroupOps workGroupOp:5;
+ uint32_t srcNum:3; //!< Source Number
+ Type type; //!< Type of the instruction
+ Tuple src;
+ Register dst[1];
+ };
+
+ class ALIGNED_INSTRUCTION PrintfInstruction :
+ public BasePolicy,
+ public TupleSrcPolicy<PrintfInstruction>,
+ public NDstPolicy<PrintfInstruction, 1>
+ {
+ public:
+ INLINE PrintfInstruction(Register dst, Tuple srcTuple, Tuple typeTuple,
+ uint8_t srcNum, uint8_t bti, uint16_t num) {
+ this->opcode = OP_PRINTF;
+ this->dst[0] = dst;
+ this->src = srcTuple;
+ this->type = typeTuple;
+ this->srcNum = srcNum;
+ this->bti = bti;
+ this->num = num;
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+
+ uint32_t getNum(void) const { return this->num; }
+ uint32_t getBti(void) const { return this->bti; }
+ Type getType(const Function& fn, uint32_t ID) const {
+ GBE_ASSERTM(ID < this->srcNum, "Out-of-bound types");
+ return (Type)fn.getType(type, ID);
+ }
+
+ uint32_t srcNum:8; //!< Source Number
+ uint32_t bti:8; //!< The BTI
+ uint32_t num:16; //!< The printf statement number of one kernel.
+ Tuple src;
+ Tuple type;
+ Register dst[1];
+ };
+
+ class ALIGNED_INSTRUCTION MediaBlockReadInstruction :
+ public BasePolicy,
+ public TupleSrcPolicy<MediaBlockReadInstruction>,
+ public TupleDstPolicy<MediaBlockReadInstruction>
+ {
+ public:
+ INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum) {
+ this->opcode = OP_MBREAD;
+ this->dst = dst;
+ this->dstNum = vec_size;
+ this->src = srcTuple;
+ this->srcNum = srcNum;
+ this->imageIdx = imageIdx;
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << (int)this->getVectorSize();
+ out << " {";
+ for (uint32_t i = 0; i < dstNum; ++i)
+ out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : "");
+ out << "}";
+ out << " 2D surface id " << (int)this->getImageIndex()
+ << " byte coord x %" << this->getSrc(fn, 0)
+ << " row coord y %" << this->getSrc(fn, 1);
+ }
+ INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
+ INLINE uint8_t getVectorSize(void) const { return this->dstNum; }
+
+ Tuple src;
+ Tuple dst;
+ uint8_t imageIdx;
+ uint8_t srcNum;
+ uint8_t dstNum;
+ };
+
+ class ALIGNED_INSTRUCTION MediaBlockWriteInstruction :
+ public BasePolicy,
+ public TupleSrcPolicy<MediaBlockWriteInstruction>,
+ public NDstPolicy<MediaBlockWriteInstruction, 0>
+ {
+ public:
+
+ INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) {
+ this->opcode = OP_MBWRITE;
+ this->src = srcTuple;
+ this->srcNum = srcNum;
+ this->imageIdx = imageIdx;
+ this->vec_size = vec_size;
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << (int)this->getVectorSize()
+ << " 2D surface id " << (int)this->getImageIndex()
+ << " byte coord x %" << this->getSrc(fn, 0)
+ << " row coord y %" << this->getSrc(fn, 1);
+ out << " {";
+ for (uint32_t i = 0; i < vec_size; ++i)
+ out << "%" << this->getSrc(fn, i + 2) << (i != (vec_size-1u) ? " " : "");
+ out << "}";
+ }
+ INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
+ INLINE uint8_t getVectorSize(void) const { return this->vec_size; }
+
+ Tuple src;
+ Register dst[0];
+ uint8_t imageIdx;
+ uint8_t srcNum;
+ uint8_t vec_size;
+ };
+
#undef ALIGNED_INSTRUCTION
/////////////////////////////////////////////////////////////////////////
@@ -1037,8 +1355,6 @@ namespace ir {
if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
return false;
- if (UNLIKELY(checkRegisterData(FAMILY_DWORD, bti, fn, whyNot) == false))
- return false;
return true;
}
@@ -1065,7 +1381,7 @@ namespace ir {
template <typename T>
INLINE bool wellFormedLoadStore(const T &insn, const Function &fn, std::string &whyNot)
{
- if (UNLIKELY(insn.offset >= fn.regNum())) {
+ if (UNLIKELY(insn.getAddressRegister() >= fn.regNum())) {
whyNot = "Out-of-bound offset register index";
return false;
}
@@ -1073,10 +1389,11 @@ namespace ir {
whyNot = "Out-of-bound tuple index";
return false;
}
+
// Check all registers
- const RegisterFamily family = getFamily(insn.type);
- for (uint32_t valueID = 0; valueID < insn.valueNum; ++valueID) {
- const Register regID = fn.getRegister(insn.values, valueID);
+ const RegisterFamily family = getFamily(insn.getValueType());
+ for (uint32_t valueID = 0; valueID < insn.getValueNum(); ++valueID) {
+ const Register regID = insn.getValue(fn, valueID);;
if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
return false;
}
@@ -1111,10 +1428,14 @@ namespace ir {
// TODO
INLINE bool SampleInstruction::wellFormed(const Function &fn, std::string &why) const
{ return true; }
+ INLINE bool VmeInstruction::wellFormed(const Function &fn, std::string &why) const
+ { return true; }
INLINE bool TypedWriteInstruction::wellFormed(const Function &fn, std::string &why) const
{ return true; }
INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
{ return true; }
+ INLINE bool WaitInstruction::wellFormed(const Function &fn, std::string &why) const
+ { return true; }
// Ensure that types and register family match
@@ -1226,6 +1547,138 @@ namespace ir {
return true;
}
+ INLINE bool CalcTimestampInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+ if (UNLIKELY(this->timestampType != 1)) {
+ whyNot = "Wrong time stamp type";
+ return false;
+ }
+ if (UNLIKELY(this->pointNum >= 20 && this->pointNum != 0xff && this->pointNum != 0xfe)) {
+ whyNot = "To much Insert pointer";
+ return false;
+ }
+ return true;
+ }
+
+ INLINE bool StoreProfilingInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+ if (UNLIKELY(this->profilingType != 1)) {
+ whyNot = "Wrong profiling format";
+ return false;
+ }
+ return true;
+ }
+
+ INLINE bool WorkGroupInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+ const RegisterFamily family = getFamily(this->type);
+
+ if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+ return false;
+
+ switch (this->workGroupOp) {
+ case WORKGROUP_OP_ANY:
+ case WORKGROUP_OP_ALL:
+ case WORKGROUP_OP_REDUCE_ADD:
+ case WORKGROUP_OP_REDUCE_MIN:
+ case WORKGROUP_OP_REDUCE_MAX:
+ case WORKGROUP_OP_INCLUSIVE_ADD:
+ case WORKGROUP_OP_INCLUSIVE_MIN:
+ case WORKGROUP_OP_INCLUSIVE_MAX:
+ case WORKGROUP_OP_EXCLUSIVE_ADD:
+ case WORKGROUP_OP_EXCLUSIVE_MIN:
+ case WORKGROUP_OP_EXCLUSIVE_MAX:
+ if (this->srcNum != 3) {
+ whyNot = "Wrong number of source.";
+ return false;
+ }
+ break;
+ case WORKGROUP_OP_BROADCAST:
+ if (this->srcNum <= 1) {
+ whyNot = "Wrong number of source.";
+ return false;
+ } else {
+ const RegisterFamily fam = fn.getPointerFamily();
+ for (uint32_t srcID = 1; srcID < this->srcNum; ++srcID) {
+ const Register regID = fn.getRegister(src, srcID);
+ if (UNLIKELY(checkRegisterData(fam, regID, fn, whyNot) == false))
+ return false;
+ }
+ }
+ break;
+ default:
+ whyNot = "No such work group function.";
+ return false;
+ }
+
+ return true;
+ }
+
+ INLINE bool SubGroupInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+ const RegisterFamily family = getFamily(this->type);
+
+ if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+ return false;
+
+ switch (this->workGroupOp) {
+ case WORKGROUP_OP_ANY:
+ case WORKGROUP_OP_ALL:
+ case WORKGROUP_OP_REDUCE_ADD:
+ case WORKGROUP_OP_REDUCE_MIN:
+ case WORKGROUP_OP_REDUCE_MAX:
+ case WORKGROUP_OP_INCLUSIVE_ADD:
+ case WORKGROUP_OP_INCLUSIVE_MIN:
+ case WORKGROUP_OP_INCLUSIVE_MAX:
+ case WORKGROUP_OP_EXCLUSIVE_ADD:
+ case WORKGROUP_OP_EXCLUSIVE_MIN:
+ case WORKGROUP_OP_EXCLUSIVE_MAX:
+ if (this->srcNum != 1) {
+ whyNot = "Wrong number of source.";
+ return false;
+ }
+ break;
+ case WORKGROUP_OP_BROADCAST:
+ if (this->srcNum != 2) {
+ whyNot = "Wrong number of source.";
+ return false;
+ } else {
+ const RegisterFamily fam = fn.getPointerFamily();
+ for (uint32_t srcID = 1; srcID < this->srcNum; ++srcID) {
+ const Register regID = fn.getRegister(src, srcID);
+ if (UNLIKELY(checkRegisterData(fam, regID, fn, whyNot) == false))
+ return false;
+ }
+ }
+ break;
+ default:
+ whyNot = "No such sub group function.";
+ return false;
+ }
+
+ return true;
+ }
+
+ INLINE bool PrintfInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+ return true;
+ }
+
+ INLINE bool MediaBlockReadInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+ if (this->srcNum != 2) {
+ whyNot = "Wrong number of source.";
+ return false;
+ }
+ return true;
+ }
+
+ INLINE bool MediaBlockWriteInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+ if (this->srcNum != 2 + this->vec_size) {
+ whyNot = "Wrong number of source.";
+ return false;
+ }
+ return true;
+ }
+
#undef CHECK_TYPE
/////////////////////////////////////////////////////////////////////////
@@ -1260,12 +1713,18 @@ namespace ir {
INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
this->outOpcode(out);
- out << "." << addrSpace;
+ out << "." << AS;
out << " %" << this->getDst(fn, 0);
- out << " {" << "%" << this->getSrc(fn, 1) << "}";
- for (uint32_t i = 2; i < srcNum; ++i)
+ out << " {" << "%" << this->getSrc(fn, 0) << "}";
+ for (uint32_t i = 1; i < srcNum; ++i)
out << " %" << this->getSrc(fn, i);
- out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
+ AddressMode am = this->getAddressMode();
+ out << " bti:";
+ if ( am == AM_DynamicBti) {
+ out << " %" << this->getBtiReg();
+ } else {
+ out << this->getSurfaceIndex();
+ }
}
@@ -1293,24 +1752,40 @@ namespace ir {
}
INLINE void LoadInstruction::out(std::ostream &out, const Function &fn) const {
+ if(ifBlock)
+ out<< "BLOCK";
this->outOpcode(out);
- out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
+ out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
out << " {";
for (uint32_t i = 0; i < valueNum; ++i)
out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
out << "}";
- out << " %" << this->getSrc(fn, 1);
- out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
+ out << " %" << this->getSrc(fn, 0);
+ AddressMode am = this->getAddressMode();
+ out << " bti:";
+ if ( am == AM_DynamicBti) {
+ out << " %" << this->getBtiReg();
+ } else {
+ out << this->getSurfaceIndex();
+ }
}
INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
+ if(ifBlock)
+ out<< "BLOCK";
this->outOpcode(out);
- out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
- out << " %" << this->getSrc(fn, 1) << " {";
+ out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
+ out << " %" << this->getSrc(fn, 0) << " {";
for (uint32_t i = 0; i < valueNum; ++i)
- out << "%" << this->getSrc(fn, i+2) << (i != (valueNum-1u) ? " " : "");
+ out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
out << "}";
- out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
+ AddressMode am = this->getAddressMode();
+ out << " bti:";
+ if ( am == AM_DynamicBti) {
+ out << " %" << this->getBtiReg();
+ } else {
+ out << this->getSurfaceIndex();
+ }
}
INLINE void ReadARFInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1361,6 +1836,147 @@ namespace ir {
out << "." << syncStr[field];
}
+ INLINE void WaitInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ }
+
+ INLINE void WorkGroupInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+
+ switch (this->workGroupOp) {
+ case WORKGROUP_OP_ANY:
+ out << "_" << "ANY";
+ break;
+ case WORKGROUP_OP_ALL:
+ out << "_" << "ALL";
+ break;
+ case WORKGROUP_OP_REDUCE_ADD:
+ out << "_" << "REDUCE_ADD";
+ break;
+ case WORKGROUP_OP_REDUCE_MIN:
+ out << "_" << "REDUCE_MIN";
+ break;
+ case WORKGROUP_OP_REDUCE_MAX:
+ out << "_" << "REDUCE_MAX";
+ break;
+ case WORKGROUP_OP_INCLUSIVE_ADD:
+ out << "_" << "INCLUSIVE_ADD";
+ break;
+ case WORKGROUP_OP_INCLUSIVE_MIN:
+ out << "_" << "INCLUSIVE_MIN";
+ break;
+ case WORKGROUP_OP_INCLUSIVE_MAX:
+ out << "_" << "INCLUSIVE_MAX";
+ break;
+ case WORKGROUP_OP_EXCLUSIVE_ADD:
+ out << "_" << "EXCLUSIVE_ADD";
+ break;
+ case WORKGROUP_OP_EXCLUSIVE_MIN:
+ out << "_" << "EXCLUSIVE_MIN";
+ break;
+ case WORKGROUP_OP_EXCLUSIVE_MAX:
+ out << "_" << "EXCLUSIVE_MAX";
+ break;
+ case WORKGROUP_OP_BROADCAST:
+ out << "_" << "BROADCAST";
+ break;
+ default:
+ GBE_ASSERT(0);
+ }
+
+ out << " %" << this->getDst(fn, 0);
+ out << " %" << this->getSrc(fn, 0);
+
+ if (this->workGroupOp == WORKGROUP_OP_BROADCAST) {
+ do {
+ int localN = srcNum - 1;
+ GBE_ASSERT(localN);
+ out << " Local X:";
+ out << " %" << this->getSrc(fn, 1);
+ localN--;
+ if (!localN)
+ break;
+
+ out << " Local Y:";
+ out << " %" << this->getSrc(fn, 2);
+ localN--;
+ if (!localN)
+ break;
+
+ out << " Local Z:";
+ out << " %" << this->getSrc(fn, 3);
+ localN--;
+ GBE_ASSERT(!localN);
+ } while(0);
+ }
+
+ out << "TheadID Map at SLM: " << this->slmAddr;
+ }
+
+ INLINE void SubGroupInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+
+ switch (this->workGroupOp) {
+ case WORKGROUP_OP_ANY:
+ out << "_" << "ANY";
+ break;
+ case WORKGROUP_OP_ALL:
+ out << "_" << "ALL";
+ break;
+ case WORKGROUP_OP_REDUCE_ADD:
+ out << "_" << "REDUCE_ADD";
+ break;
+ case WORKGROUP_OP_REDUCE_MIN:
+ out << "_" << "REDUCE_MIN";
+ break;
+ case WORKGROUP_OP_REDUCE_MAX:
+ out << "_" << "REDUCE_MAX";
+ break;
+ case WORKGROUP_OP_INCLUSIVE_ADD:
+ out << "_" << "INCLUSIVE_ADD";
+ break;
+ case WORKGROUP_OP_INCLUSIVE_MIN:
+ out << "_" << "INCLUSIVE_MIN";
+ break;
+ case WORKGROUP_OP_INCLUSIVE_MAX:
+ out << "_" << "INCLUSIVE_MAX";
+ break;
+ case WORKGROUP_OP_EXCLUSIVE_ADD:
+ out << "_" << "EXCLUSIVE_ADD";
+ break;
+ case WORKGROUP_OP_EXCLUSIVE_MIN:
+ out << "_" << "EXCLUSIVE_MIN";
+ break;
+ case WORKGROUP_OP_EXCLUSIVE_MAX:
+ out << "_" << "EXCLUSIVE_MAX";
+ break;
+ case WORKGROUP_OP_BROADCAST:
+ out << "_" << "BROADCAST";
+ break;
+ default:
+ GBE_ASSERT(0);
+ }
+
+ out << " %" << this->getDst(fn, 0);
+ out << " %" << this->getSrc(fn, 0);
+
+ if (this->workGroupOp == WORKGROUP_OP_BROADCAST) {
+ do {
+ int localN = srcNum - 1;
+ GBE_ASSERT(localN);
+ out << " Local ID:";
+ out << " %" << this->getSrc(fn, 1);
+ localN--;
+ if (!localN)
+ break;
+ } while(0);
+ }
+
+ }
+
+ INLINE void PrintfInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ }
} /* namespace internal */
@@ -1466,6 +2082,14 @@ START_INTROSPECTION(GetImageInfoInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(GetImageInfoInstruction)
+START_INTROSPECTION(CalcTimestampInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(CalcTimestampInstruction)
+
+START_INTROSPECTION(StoreProfilingInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(StoreProfilingInstruction)
+
START_INTROSPECTION(LoadImmInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(LoadImmInstruction)
@@ -1502,6 +2126,34 @@ START_INTROSPECTION(LabelInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(LabelInstruction)
+START_INTROSPECTION(WaitInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(WaitInstruction)
+
+START_INTROSPECTION(VmeInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(VmeInstruction)
+
+START_INTROSPECTION(WorkGroupInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(WorkGroupInstruction)
+
+START_INTROSPECTION(SubGroupInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SubGroupInstruction)
+
+START_INTROSPECTION(PrintfInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(PrintfInstruction)
+
+START_INTROSPECTION(MediaBlockReadInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(MediaBlockReadInstruction)
+
+START_INTROSPECTION(MediaBlockWriteInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(MediaBlockWriteInstruction)
+
#undef END_INTROSPECTION
#undef START_INTROSPECTION
#undef DECL_INSN
@@ -1645,7 +2297,12 @@ END_FUNCTION(Instruction, Register)
return opcode == OP_STORE ||
opcode == OP_TYPED_WRITE ||
opcode == OP_SYNC ||
- opcode == OP_ATOMIC;
+ opcode == OP_ATOMIC ||
+ opcode == OP_CALC_TIMESTAMP ||
+ opcode == OP_STORE_PROFILING ||
+ opcode == OP_WAIT ||
+ opcode == OP_PRINTF ||
+ opcode == OP_MBWRITE;
}
#define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
@@ -1664,19 +2321,19 @@ DECL_MEM_FN(BitCastInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
-DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(MemInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(MemInstruction, AddressMode, getAddressMode(void), getAddressMode())
+DECL_MEM_FN(MemInstruction, Register, getAddressRegister(void), getAddressRegister())
+DECL_MEM_FN(MemInstruction, Register, getBtiReg(void), getBtiReg())
+DECL_MEM_FN(MemInstruction, unsigned, getSurfaceIndex(void), getSurfaceIndex())
+DECL_MEM_FN(MemInstruction, Type, getValueType(void), getValueType())
+DECL_MEM_FN(MemInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(MemInstruction, unsigned, getAddressIndex(void), getAddressIndex())
DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
-DECL_MEM_FN(AtomicInstruction, bool, isFixedBTI(void), isFixedBTI())
-DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
-DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
-DECL_MEM_FN(StoreInstruction, bool, isFixedBTI(void), isFixedBTI())
-DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
+DECL_MEM_FN(StoreInstruction, bool, isBlock(void), isBlock())
DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
-DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
-DECL_MEM_FN(LoadInstruction, bool, isFixedBTI(void), isFixedBTI())
+DECL_MEM_FN(LoadInstruction, bool, isBlock(void), isBlock())
DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
@@ -1694,11 +2351,40 @@ DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
DECL_MEM_FN(SampleInstruction, uint8_t, getSamplerIndex(void), getSamplerIndex())
DECL_MEM_FN(SampleInstruction, uint8_t, getSamplerOffset(void), getSamplerOffset())
DECL_MEM_FN(SampleInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(VmeInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(VmeInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(VmeInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(VmeInstruction, uint8_t, getMsgType(void), getMsgType())
DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
DECL_MEM_FN(TypedWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(CalcTimestampInstruction, uint32_t, getPointNum(void), getPointNum())
+DECL_MEM_FN(CalcTimestampInstruction, uint32_t, getTimestamptType(void), getTimestamptType())
+DECL_MEM_FN(StoreProfilingInstruction, uint32_t, getProfilingType(void), getProfilingType())
+DECL_MEM_FN(StoreProfilingInstruction, uint32_t, getBTI(void), getBTI())
+DECL_MEM_FN(WorkGroupInstruction, Type, getType(void), getType())
+DECL_MEM_FN(WorkGroupInstruction, WorkGroupOps, getWorkGroupOpcode(void), getWorkGroupOpcode())
+DECL_MEM_FN(WorkGroupInstruction, uint32_t, getSlmAddr(void), getSlmAddr())
+DECL_MEM_FN(SubGroupInstruction, Type, getType(void), getType())
+DECL_MEM_FN(SubGroupInstruction, WorkGroupOps, getWorkGroupOpcode(void), getWorkGroupOpcode())
+DECL_MEM_FN(PrintfInstruction, uint32_t, getNum(void), getNum())
+DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti())
+DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), getType(fn, ID))
+DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), getVectorSize())
+DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize())
+
+#undef DECL_MEM_FN
+
+#define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
+ RET CLASS::PROTOTYPE { \
+ return reinterpret_cast<internal::CLASS*>(this)->CALL; \
+ }
+DECL_MEM_FN(MemInstruction, void, setSurfaceIndex(unsigned id), setSurfaceIndex(id))
+DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg))
#undef DECL_MEM_FN
@@ -1800,6 +2486,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
Instruction MAD(Type type, Register dst, Tuple src) {
return internal::TernaryInstruction(OP_MAD, type, dst, src).convert();
}
+
+ Instruction LRP(Type type, Register dst, Tuple src) {
+ return internal::TernaryInstruction(OP_LRP, type, dst, src).convert();
+ }
// All compare functions
#define DECL_EMIT_FUNCTION(NAME) \
Instruction NAME(Type type, Register dst, Register src0, Register src1) { \
@@ -1843,8 +2533,16 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
}
// For all unary functions with given opcode
- Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src) {
- return internal::AtomicInstruction(atomicOp, dst, space, bti, fixedBTI, src).convert();
+ Instruction ATOMIC(AtomicOps atomicOp, Type type, Register dst, AddressSpace space, Register address, Tuple payload, AddressMode AM, Register bti) {
+ internal::AtomicInstruction insn = internal::AtomicInstruction(atomicOp, type, dst, space, address, payload, AM);
+ insn.setBtiReg(bti);
+ return insn.convert();
+ }
+
+ Instruction ATOMIC(AtomicOps atomicOp, Type type, Register dst, AddressSpace space, Register address, Tuple payload, AddressMode AM, unsigned SurfaceIndex) {
+ internal::AtomicInstruction insn = internal::AtomicInstruction(atomicOp, type, dst, space, address, payload, AM);
+ insn.setSurfaceIndex(SurfaceIndex);
+ return insn.convert();
}
// BRA
@@ -1892,10 +2590,26 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
AddressSpace space, \
uint32_t valueNum, \
bool dwAligned, \
- bool fixedBTI, \
+ AddressMode AM, \
+ unsigned SurfaceIndex, \
+ bool isBlock) \
+ { \
+ internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM, isBlock); \
+ insn.setSurfaceIndex(SurfaceIndex);\
+ return insn.convert(); \
+ } \
+ Instruction NAME(Type type, \
+ Tuple tuple, \
+ Register offset, \
+ AddressSpace space, \
+ uint32_t valueNum, \
+ bool dwAligned, \
+ AddressMode AM, \
Register bti) \
{ \
- return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,fixedBTI,bti).convert(); \
+ internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \
+ insn.setBtiReg(bti); \
+ return insn.convert(); \
}
DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
@@ -1932,6 +2646,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
return internal::SampleInstruction(imageIndex, dst, src, srcNum, dstIsFloat, srcIsFloat, sampler, samplerOffset).convert();
}
+ Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub) {
+ return internal::VmeInstruction(imageIndex, dst, src, dstNum, srcNum, msg_type, vme_search_path_lut, lut_sub).convert();
+ }
+
Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType) {
return internal::TypedWriteInstruction(imageIndex, src, srcNum, srcType, coordType).convert();
}
@@ -1940,6 +2658,40 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
return internal::GetImageInfoInstruction(infoType, dst, imageIndex, infoReg).convert();
}
+ Instruction CALC_TIMESTAMP(uint32_t pointNum, uint32_t tsType) {
+ return internal::CalcTimestampInstruction(pointNum, tsType).convert();
+ }
+
+ Instruction STORE_PROFILING(uint32_t bti, uint32_t profilingType) {
+ return internal::StoreProfilingInstruction(bti, profilingType).convert();
+ }
+
+ // WAIT
+ Instruction WAIT(void) {
+ return internal::WaitInstruction().convert();
+ }
+
+ Instruction WORKGROUP(WorkGroupOps opcode, uint32_t slmAddr, Register dst, Tuple srcTuple, uint8_t srcNum, Type type) {
+ return internal::WorkGroupInstruction(opcode, slmAddr, dst, srcTuple, srcNum, type).convert();
+ }
+
+ Instruction SUBGROUP(WorkGroupOps opcode, Register dst, Tuple srcTuple, uint8_t srcNum, Type type) {
+ return internal::SubGroupInstruction(opcode, dst, srcTuple, srcNum, type).convert();
+ }
+
+ Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num) {
+ return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, num).convert();
+ }
+
+ Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum) {
+ return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum).convert();
+ }
+
+ Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) {
+ return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size).convert();
+ }
+
+
std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
const Function &fn = insn.getFunction();
const BasicBlock *bb = insn.getParent();
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index cf8d839..b2b0b49 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -65,6 +65,13 @@ namespace ir {
MEM_INVALID
};
+ enum AddressMode : uint8_t {
+ AM_DynamicBti = 0,
+ AM_Stateless,
+ AM_StaticBti,
+ AM_INVALID
+ };
+
enum AtomicOps {
ATOMIC_OP_AND = 1,
ATOMIC_OP_OR = 2,
@@ -82,6 +89,22 @@ namespace ir {
ATOMIC_OP_INVALID
};
+ enum WorkGroupOps {
+ WORKGROUP_OP_ANY = 1,
+ WORKGROUP_OP_ALL = 2,
+ WORKGROUP_OP_BROADCAST = 3,
+ WORKGROUP_OP_REDUCE_ADD = 4,
+ WORKGROUP_OP_REDUCE_MIN = 5,
+ WORKGROUP_OP_REDUCE_MAX = 6,
+ WORKGROUP_OP_INCLUSIVE_ADD = 7,
+ WORKGROUP_OP_INCLUSIVE_MIN = 8,
+ WORKGROUP_OP_INCLUSIVE_MAX = 9,
+ WORKGROUP_OP_EXCLUSIVE_ADD = 10,
+ WORKGROUP_OP_EXCLUSIVE_MIN = 11,
+ WORKGROUP_OP_EXCLUSIVE_MAX = 12,
+ WORKGROUP_OP_INVALID
+ };
+
/* Vote function per hardware thread */
enum VotePredicate : uint8_t {
VOTE_ALL = 0,
@@ -111,10 +134,10 @@ namespace ir {
{
public:
/*! Initialize the instruction from a 8 bytes stream */
- INLINE InstructionBase(const char *stream) {
- opcode = Opcode(stream[0]);
+ INLINE InstructionBase(Opcode op, const char* opaque) {
+ opcode = op;
for (uint32_t byte = 0; byte < opaqueSize; ++byte)
- opaque[byte] = stream[byte+1];
+ this->opaque[byte] = opaque[byte];
}
/*! Uninitialized instruction */
INLINE InstructionBase(void) {}
@@ -132,12 +155,12 @@ namespace ir {
{
public:
/*! Initialize the instruction from a 8 bytes stream */
- INLINE Instruction(const char *stream) : InstructionBase(stream) {
+ INLINE Instruction(const char *stream) : InstructionBase(Opcode(stream[0]), &stream[1]) {
parent = NULL;
}
/*! Copy the private fields and give it the same parent */
INLINE Instruction(const Instruction &other) :
- InstructionBase(reinterpret_cast<const char*>(&other.opcode)) {
+ InstructionBase(other.opcode, other.opaque) {
parent = other.parent;
}
@@ -185,15 +208,17 @@ namespace ir {
void remove(void);
/* Insert the instruction after the previous one. */
void insert(Instruction *prev, Instruction ** new_ins = NULL);
+ void setDBGInfo(DebugInfo in) { DBGInfo = in; }
/*! Indicates if the instruction belongs to instruction type T. Typically, T
* can be BinaryInstruction, UnaryInstruction, LoadInstruction and so on
*/
template <typename T> INLINE bool isMemberOf(void) const {
return T::isClassOf(*this);
}
- /*! max_src for store instruction (vec16 + addr) */
- static const uint32_t MAX_SRC_NUM = 32;
+ /*! max_src used by vme for payload passing and setting */
+ static const uint32_t MAX_SRC_NUM = 40;
static const uint32_t MAX_DST_NUM = 32;
+ DebugInfo DBGInfo;
protected:
BasicBlock *parent; //!< The basic block containing the instruction
GBE_CLASS(Instruction); //!< Use internal allocators
@@ -288,20 +313,30 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
};
+ class MemInstruction : public Instruction {
+ public:
+ unsigned getSurfaceIndex() const;
+ unsigned getAddressIndex() const;
+ /*! Address space that is manipulated here */
+ AddressMode getAddressMode() const;
+ Register getBtiReg() const;
+ /*! Return the register that contains the addresses */
+ Register getAddressRegister() const;
+ AddressSpace getAddressSpace() const;
+ /*! Return the types of the values */
+ Type getValueType() const;
+ bool isAligned(void) const;
+ void setBtiReg(Register reg);
+ void setSurfaceIndex(unsigned idx);
+ };
+
/*! Atomic instruction */
- class AtomicInstruction : public Instruction {
+ class AtomicInstruction : public MemInstruction {
public:
/*! Where the address register goes */
- static const uint32_t btiIndex = 0;
- static const uint32_t addressIndex = 1;
- /*! Address space that is manipulated here */
- AddressSpace getAddressSpace(void) const;
- Register getBTI(void) const { return this->getSrc(btiIndex); }
- bool isFixedBTI(void) const;
+ static const uint32_t addressIndex = 0;
/*! Return the atomic function code */
AtomicOps getAtomicOpcode(void) const;
- /*! Return the register that contains the addresses */
- INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
/*! Return true if the given instruction is an instance of this class */
static bool isClassOf(const Instruction &insn);
};
@@ -309,56 +344,38 @@ namespace ir {
/*! Store instruction. First source is the address. Next sources are the
* values to store contiguously at the given address
*/
- class StoreInstruction : public Instruction {
+ class StoreInstruction : public MemInstruction {
public:
/*! Where the address register goes */
- static const uint32_t btiIndex = 0;
- static const uint32_t addressIndex = 1;
- /*! Return the types of the values to store */
- Type getValueType(void) const;
- /*! Give the number of values the instruction is storing (srcNum-1) */
+ static const uint32_t addressIndex = 0;
uint32_t getValueNum(void) const;
- Register getBTI(void) const { return this->getSrc(btiIndex); }
- bool isFixedBTI(void) const;
- /*! Address space that is manipulated here */
- AddressSpace getAddressSpace(void) const;
- /*! DWORD aligned means untyped read for Gen. That is what matters */
- bool isAligned(void) const;
- /*! Return the register that contains the addresses */
- INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
/*! Return the register that contain value valueID */
INLINE Register getValue(uint32_t valueID) const {
GBE_ASSERT(valueID < this->getValueNum());
- return this->getSrc(valueID + 2u);
+ return this->getSrc(valueID + 1u);
}
/*! Return true if the given instruction is an instance of this class */
static bool isClassOf(const Instruction &insn);
+ /*! Return true if the given instruction is block write */
+ bool isBlock() const;
};
/*! Load instruction. The source is simply the address where to get the data.
* The multiple destinations are the contiguous values loaded at the given
* address
*/
- class LoadInstruction : public Instruction {
+ class LoadInstruction : public MemInstruction {
public:
- /*! Type of the loaded values (ie type of all the destinations) */
- Type getValueType(void) const;
/*! Number of values loaded (ie number of destinations) */
uint32_t getValueNum(void) const;
- /*! Address space that is manipulated here */
- AddressSpace getAddressSpace(void) const;
- /*! DWORD aligned means untyped read for Gen. That is what matters */
- bool isAligned(void) const;
- /*! Return the register that contains the addresses */
- INLINE Register getAddress(void) const { return this->getSrc(1u); }
- Register getBTI(void) const {return this->getSrc(0u);}
- bool isFixedBTI(void) const;
/*! Return the register that contain value valueID */
INLINE Register getValue(uint32_t valueID) const {
return this->getDst(valueID);
}
/*! Return true if the given instruction is an instance of this class */
static bool isClassOf(const Instruction &insn);
+ /*! Return true if the given instruction is block read */
+ bool isBlock() const;
};
/*! Load immediate instruction loads an typed immediate value into the given
@@ -399,8 +416,20 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
};
+ /*! Video motion estimation */
+ class VmeInstruction : public Instruction {
+ public:
+ uint8_t getImageIndex() const;
+ uint8_t getMsgType() const;
+ Type getSrcType(void) const;
+ Type getDstType(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
typedef union _ImageInfoKey{
_ImageInfoKey(uint8_t i, uint8_t t) : index(i), type(t) {};
+ _ImageInfoKey(int key) : data(key) {};
struct {
uint8_t index; /*! the allocated image index */
uint8_t type; /*! the information type */
@@ -440,6 +469,28 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
};
+ /*! calculate the exec time and store it. */
+ class CalcTimestampInstruction : public Instruction {
+ public:
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ /*! Get the point number of timestamp point */
+ uint32_t getPointNum(void) const;
+ /*! Get the timestamp type */
+ uint32_t getTimestamptType(void) const;
+ };
+
+ /*! store the profiling information. */
+ class StoreProfilingInstruction : public Instruction {
+ public:
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ /*! Get the profiling info type */
+ uint32_t getProfilingType(void) const;
+ /*! Get the BTI index*/
+ uint32_t getBTI(void) const;
+ };
+
/*! Branch instruction is the unified way to branch (with or without
* predicate)
*/
@@ -547,6 +598,61 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
};
+ /*! Indirect Move instruction */
+ class WaitInstruction : public Instruction {
+ public:
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Related to Work Group. */
+ class WorkGroupInstruction : public Instruction {
+ public:
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ Type getType(void) const;
+ WorkGroupOps getWorkGroupOpcode(void) const;
+ uint32_t getSlmAddr(void) const;
+ };
+
+ /*! Related to Sub Group. */
+ class SubGroupInstruction : public Instruction {
+ public:
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ Type getType(void) const;
+ WorkGroupOps getWorkGroupOpcode(void) const;
+ };
+
+ /*! Printf instruction. */
+ class PrintfInstruction : public Instruction {
+ public:
+ uint32_t getNum(void) const;
+ uint32_t getBti(void) const;
+ Type getType(const Function& fn, uint32_t ID) const;
+ Type getType(uint32_t ID) const { return this->getType(this->getFunction(), ID); };
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Media Block Read. */
+ class MediaBlockReadInstruction : public Instruction {
+ public:
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ uint8_t getImageIndex() const;
+ uint8_t getVectorSize() const;
+ };
+
+ /*! Media Block Write. */
+ class MediaBlockWriteInstruction : public Instruction {
+ public:
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ uint8_t getImageIndex() const;
+ uint8_t getVectorSize() const;
+ };
+
/*! Specialize the instruction. Also performs typechecking first based on the
* opcode. Crashes if it fails
*/
@@ -615,6 +721,8 @@ namespace ir {
Instruction I64MADSAT(Type type, Register dst, Tuple src);
/*! mad.type dst src */
Instruction MAD(Type type, Register dst, Tuple src);
+ /*! lrp.type dst src */
+ Instruction LRP(Type type, Register dst, Tuple src);
/*! upsample_short.type dst src */
Instruction UPSAMPLE_SHORT(Type type, Register dst, Register src0, Register src1);
/*! upsample_int.type dst src */
@@ -724,7 +832,8 @@ namespace ir {
/*! F32TO16.{dstType <- srcType} dst src */
Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
/*! atomic dst addr.space {src1 {src2}} */
- Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src);
+ Instruction ATOMIC(AtomicOps opcode, Type, Register dst, AddressSpace space, Register ptr, Tuple payload, AddressMode, unsigned);
+ Instruction ATOMIC(AtomicOps opcode, Type, Register dst, AddressSpace space, Register ptr, Tuple src, AddressMode, Register);
/*! bra labelIndex */
Instruction BRA(LabelIndex labelIndex);
/*! (pred) bra labelIndex */
@@ -739,10 +848,12 @@ namespace ir {
Instruction WHILE(LabelIndex labelIndex, Register pred);
/*! ret */
Instruction RET(void);
- /*! load.type.space {dst1,...,dst_valueNum} offset value */
- Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
- /*! store.type.space offset {src1,...,src_valueNum} value */
- Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
+ /*! load.type.space {dst1,...,dst_valueNum} offset value, {bti} */
+ Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex, bool isBlock = false);
+ Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
+ /*! store.type.space offset {src1,...,src_valueNum} value {bti}*/
+ Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex, bool isBlock = false);
+ Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
/*! loadi.type dst value */
Instruction LOADI(Type type, Register dst, ImmediateIndex value);
/*! sync.params... (see Sync instruction) */
@@ -755,11 +866,29 @@ namespace ir {
Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType);
/*! sample textures */
Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, uint8_t srcNum, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset);
+ /*! video motion estimation */
+ Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub);
/*! get image information , such as width/height/depth/... */
Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg);
/*! label labelIndex */
Instruction LABEL(LabelIndex labelIndex);
-
+ /*! calculate the execute timestamp for profiling */
+ Instruction CALC_TIMESTAMP(uint32_t pointNum, uint32_t tsType);
+ /*! calculate the execute timestamp for profiling */
+ Instruction STORE_PROFILING(uint32_t bti, uint32_t Type);
+ /*! wait */
+ Instruction WAIT(void);
+
+ /*! work group */
+ Instruction WORKGROUP(WorkGroupOps opcode, uint32_t slmAddr, Register dst, Tuple srcTuple, uint8_t srcNum, Type type);
+ /*! sub group */
+ Instruction SUBGROUP(WorkGroupOps opcode, Register dst, Tuple srcTuple, uint8_t srcNum, Type type);
+ /*! printf */
+ Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num);
+ /*! media block read */
+ Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum);
+ /*! media block write */
+ Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size);
} /* namespace ir */
} /* namespace gbe */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 81548c9..7d755ae 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -85,6 +85,7 @@ DECL_INSN(SYNC, SyncInstruction)
DECL_INSN(LABEL, LabelInstruction)
DECL_INSN(READ_ARF, ReadARFInstruction)
DECL_INSN(REGION, RegionInstruction)
+DECL_INSN(VME, VmeInstruction)
DECL_INSN(INDIRECT_MOV, IndirectMovInstruction)
DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
DECL_INSN(MUL_HI, BinaryInstruction)
@@ -102,7 +103,16 @@ DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
DECL_INSN(UPSAMPLE_LONG, BinaryInstruction)
DECL_INSN(I64MADSAT, TernaryInstruction)
DECL_INSN(MAD, TernaryInstruction)
+DECL_INSN(LRP, TernaryInstruction)
DECL_INSN(IF, BranchInstruction)
DECL_INSN(ENDIF, BranchInstruction)
DECL_INSN(ELSE, BranchInstruction)
DECL_INSN(WHILE, BranchInstruction)
+DECL_INSN(CALC_TIMESTAMP, CalcTimestampInstruction)
+DECL_INSN(STORE_PROFILING, StoreProfilingInstruction)
+DECL_INSN(WAIT, WaitInstruction)
+DECL_INSN(WORKGROUP, WorkGroupInstruction)
+DECL_INSN(SUBGROUP, SubGroupInstruction)
+DECL_INSN(PRINTF, PrintfInstruction)
+DECL_INSN(MBREAD, MediaBlockReadInstruction)
+DECL_INSN(MBWRITE, MediaBlockWriteInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index 9fa7ac3..dbb5c33 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -27,7 +27,7 @@
namespace gbe {
namespace ir {
- Liveness::Liveness(Function &fn) : fn(fn) {
+ Liveness::Liveness(Function &fn, bool isInGenBackend) : fn(fn) {
// Initialize UEVar and VarKill for each block
fn.foreachBlock([this](const BasicBlock &bb) {
this->initBlock(bb);
@@ -48,12 +48,58 @@ namespace ir {
}
// extend register (def in loop, use out-of-loop) liveness to the whole loop
set<Register> extentRegs;
- this->computeExtraLiveInOut(extentRegs);
- // analyze uniform values. The extentRegs contains all the values which is
- // defined in a loop and use out-of-loop which could not be a uniform. The reason
- // is that when it reenter the second time, it may active different lanes. So
- // reenter many times may cause it has different values in different lanes.
- this->analyzeUniform(&extentRegs);
+ // Only in Gen backend we need to take care of extra live out analysis.
+ if (isInGenBackend) {
+ this->computeExtraLiveInOut(extentRegs);
+ // analyze uniform values. The extentRegs contains all the values which is
+ // defined in a loop and use out-of-loop which could not be a uniform. The reason
+ // is that when it reenter the second time, it may active different lanes. So
+ // reenter many times may cause it has different values in different lanes.
+ this->analyzeUniform(&extentRegs);
+ }
+ }
+
+ void Liveness::removeRegs(const set<Register> &removes) {
+ for (auto &pair : liveness) {
+ BlockInfo &info = *(pair.second);
+ for (auto reg : removes) {
+ if (info.liveOut.contains(reg))
+ info.liveOut.erase(reg);
+ if (info.upwardUsed.contains(reg))
+ info.upwardUsed.erase(reg);
+ }
+ }
+ }
+
+ void Liveness::replaceRegs(const map<Register, Register> &replaceMap) {
+
+ for (auto &pair : liveness) {
+ BlockInfo &info = *pair.second;
+ BasicBlock *bb = const_cast<BasicBlock *>(&info.bb);
+ for (auto &pair : replaceMap) {
+ Register from = pair.first;
+ Register to = pair.second;
+ if (info.liveOut.contains(from)) {
+ info.liveOut.erase(from);
+ info.liveOut.insert(to);
+ // FIXME, a hack method to avoid the "to" register be treated as
+ // uniform value.
+ bb->definedPhiRegs.insert(to);
+ }
+ if (info.upwardUsed.contains(from)) {
+ info.upwardUsed.erase(from);
+ info.upwardUsed.insert(to);
+ }
+ if (info.varKill.contains(from)) {
+ info.varKill.erase(from);
+ info.varKill.insert(to);
+ }
+ if (bb->undefPhiRegs.contains(from)) {
+ bb->undefPhiRegs.erase(from);
+ bb->undefPhiRegs.insert(to);
+ }
+ }
+ }
}
Liveness::~Liveness(void) {
@@ -71,11 +117,17 @@ namespace ir {
if (insn.getOpcode() == ir::OP_SIMD_ID)
uniform = false;
+ // do not change dst uniform for block read
+ if ((insn.getOpcode() == ir::OP_LOAD && ir::cast<ir::LoadInstruction>(insn).isBlock()) ||
+ insn.getOpcode() == ir::OP_MBREAD)
+ uniform = false;
+
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const Register reg = insn.getSrc(srcID);
if (!fn.isUniformRegister(reg))
uniform = false;
}
+
// A destination is a killed value
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const Register reg = insn.getDst(dstID);
@@ -154,25 +206,6 @@ namespace ir {
workSet.insert(prevInfo);
}
};
-#if 0
- fn.foreachBlock([this](const BasicBlock &bb){
- printf("label %d:\n", bb.getLabelIndex());
- BlockInfo *info = liveness[&bb];
- auto &outVarSet = info->liveOut;
- auto &inVarSet = info->upwardUsed;
- printf("\n\tin Lives: ");
- for (auto inVar : inVarSet) {
- printf("%d ", inVar);
- }
- printf("\n");
- printf("\tout Lives: ");
- for (auto outVar : outVarSet) {
- printf("%d ", outVar);
- }
- printf("\n");
-
- });
-#endif
}
/*
As we run in SIMD mode with prediction mask to indicate active lanes.
@@ -190,19 +223,38 @@ namespace ir {
if(loops.size() == 0) return;
for (auto l : loops) {
+ const BasicBlock &preheader = fn.getBlock(l->preheader);
+ BlockInfo *preheaderInfo = liveness[&preheader];
for (auto x : l->exits) {
const BasicBlock &a = fn.getBlock(x.first);
const BasicBlock &b = fn.getBlock(x.second);
BlockInfo * exiting = liveness[&a];
BlockInfo * exit = liveness[&b];
std::vector<Register> toExtend;
+ std::vector<Register> toExtendCand;
- if(b.getPredecessorSet().size() > 1) {
+ if(b.getPredecessorSet().size() <= 1) {
+ // the exits only have one predecessor
for (auto p : exit->upwardUsed)
- toExtend.push_back(p);
+ toExtendCand.push_back(p);
} else {
- std::set_intersection(exiting->liveOut.begin(), exiting->liveOut.end(), exit->upwardUsed.begin(), exit->upwardUsed.end(), std::back_inserter(toExtend));
+ // the exits have more than one predecessors
+ std::set_intersection(exiting->liveOut.begin(),
+ exiting->liveOut.end(),
+ exit->upwardUsed.begin(),
+ exit->upwardUsed.end(),
+ std::back_inserter(toExtendCand));
}
+ // toExtendCand may contain some virtual register defined before loop,
+ // which need to be excluded. Because what we need is registers defined
+ // in the loop. Such kind of registers must be in live-out of the loop's
+ // preheader. So we do the subtraction here.
+ std::set_difference(toExtendCand.begin(),
+ toExtendCand.end(),
+ preheaderInfo->liveOut.begin(),
+ preheaderInfo->liveOut.end(),
+ std::back_inserter(toExtend));
+
if (toExtend.size() == 0) continue;
for(auto r : toExtend)
extentRegs.insert(r);
@@ -216,27 +268,28 @@ namespace ir {
}
}
}
-#if 0
- fn.foreachBlock([this](const BasicBlock &bb){
- printf("label %d:\n", bb.getLabelIndex());
- BlockInfo *info = liveness[&bb];
- auto &outVarSet = info->liveOut;
- auto &inVarSet = info->upwardUsed;
- printf("\n\tLive Ins: ");
- for (auto inVar : inVarSet) {
- printf("%d ", inVar);
- }
- printf("\n");
- printf("\tLive outs: ");
- for (auto outVar : outVarSet) {
- printf("%d ", outVar);
- }
- printf("\n");
-
- });
-#endif
}
+ std::ostream &operator<< (std::ostream &out, const Liveness &live) {
+ const Function &fn = live.getFunction();
+ fn.foreachBlock([&] (const BasicBlock &bb) {
+ out << std::endl;
+ out << "Label $" << bb.getLabelIndex() << std::endl;
+ const Liveness::BlockInfo &bbInfo = live.getBlockInfo(&bb);
+ out << "liveIn:" << std::endl;
+ for (auto &x: bbInfo.upwardUsed) {
+ out << x << " ";
+ }
+ out << std::endl << "liveOut:" << std::endl;
+ for (auto &x : bbInfo.liveOut)
+ out << x << " ";
+ out << std::endl << "varKill:" << std::endl;
+ for (auto &x : bbInfo.varKill)
+ out << x << " ";
+ out << std::endl;
+ });
+ return out;
+ }
/*! To pretty print the livfeness info */
static const uint32_t prettyInsnStrSize = 48;
diff --git a/backend/src/ir/liveness.hpp b/backend/src/ir/liveness.hpp
index 4a7dc4e..df889e6 100644
--- a/backend/src/ir/liveness.hpp
+++ b/backend/src/ir/liveness.hpp
@@ -48,7 +48,7 @@ namespace ir {
class Liveness : public NonCopyable
{
public:
- Liveness(Function &fn);
+ Liveness(Function &fn, bool isInGenBackend = false);
~Liveness(void);
/*! Set of variables used upwards in the block (before a definition) */
typedef set<Register> UEVar;
@@ -116,6 +116,13 @@ namespace ir {
}
}
}
+
+ // remove some registers from the liveness information.
+ void removeRegs(const set<Register> &removes);
+
+ // replace some registers according to (from, to) register map.
+ void replaceRegs(const map<Register, Register> &replaceMap);
+
private:
/*! Store the liveness of all blocks */
Info liveness;
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 9fcdf74..654a3bb 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -52,6 +52,13 @@ namespace ir {
const LabelIndex index = this->label();
this->LABEL(index);
const BasicBlock *lastBlock = this->bb;
+
+ /* Append the STORE_PROFILING just before return. */
+ if (unit.getInProfilingMode() == true) {
+ this->STORE_PROFILING(this->getUnit().getProfilingInfo()->getBTI(),
+ this->getUnit().getProfilingInfo()->getProfilingType());
+ }
+
this->RET();
// Now traverse all instructions and replace all returns by GOTO index
@@ -163,6 +170,8 @@ namespace ir {
if (opcode == OP_LOAD) {
LoadInstruction *load = cast<LoadInstruction>(insn);
+ if(!load)
+ return false;
if (load->getAddressSpace() != MEM_PRIVATE)
return false;
loadAddImm.load = insn;
@@ -243,6 +252,7 @@ namespace ir {
set<PushLocation> inserted;
for (const auto &loadAddImm : seq) {
LoadInstruction *load = cast<LoadInstruction>(loadAddImm.load);
+ if(!load) continue;
const uint32_t valueNum = load->getValueNum();
bool replaced = false;
Instruction *ins_after = load; // the instruction to insert after.
@@ -316,11 +326,13 @@ namespace ir {
derivedRegs.push_back(dst);
} else if(opcode == OP_LOAD) {
LoadInstruction *load = cast<LoadInstruction>(insn);
+ if(!load)
+ continue;
if (load->getAddressSpace() != MEM_PRIVATE)
continue;
IndirectLoad indirectLoad;
- Register addr = load->getAddress();
+ Register addr = load->getAddressRegister();
indirectLoad.argID = argID;
indirectLoad.load = insn;
@@ -357,7 +369,7 @@ namespace ir {
const Register arg = fn->getArg(indirectLoad.argID).reg;
if(dead.contains(indirectLoad.load)) continue; //repetitive load in the indirectSeq, skip.
LoadInstruction *load = cast<LoadInstruction>(indirectLoad.load);
- const uint32_t valueNum = load->getValueNum();
+ const uint32_t valueNum = load ? load->getValueNum() : 0;
bool replaced = false;
Instruction *ins_after = load; // the instruction to insert after.
for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
@@ -368,7 +380,7 @@ namespace ir {
const Register reg = load->getValue(valueID);
- Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load->getAddress(), offset);
+ Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load->getAddressRegister(), offset);
mov.insert(ins_after, &ins_after);
replaced = true;
}
@@ -381,7 +393,7 @@ namespace ir {
vector<Instruction *> adds = indirectLoad.adds;
for (uint32_t i=0; i<adds.size(); i++) {
BinaryInstruction *add = cast<BinaryInstruction>(adds[i]);
- if (!dead.contains(add)) {
+ if (add && !dead.contains(add)) {
Register dst = add->getDst();
const Register src0 = add->getSrc(0);
const Register src1 = add->getSrc(1);
@@ -444,6 +456,7 @@ namespace ir {
// add.ptr_type dst ptr other
if (opcode != OP_ADD) return false;
BinaryInstruction *add = cast<BinaryInstruction>(insn);
+ if(!add) return false;
const Type addType = add->getType();
const RegisterFamily family = getFamily(addType);
if (family != unit.getPointerFamily()) return false;
@@ -460,6 +473,7 @@ namespace ir {
Instruction *otherInsn = const_cast<Instruction*>(otherDef->getInstruction());
if (otherInsn->getOpcode() != OP_LOADI) return false;
LoadImmInstruction *loadImm = cast<LoadImmInstruction>(otherInsn);
+ if(!loadImm) return false;
const Immediate imm = loadImm->getImmediate();
const uint64_t offset = getOffsetFromImm(imm);
diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
index eb1c199..7ca127d 100644
--- a/backend/src/ir/printf.cpp
+++ b/backend/src/ir/printf.cpp
@@ -23,6 +23,7 @@
#include <stdarg.h>
#include "printf.hpp"
+#include "ir/unit.hpp"
namespace gbe
{
@@ -31,47 +32,10 @@ namespace gbe
pthread_mutex_t PrintfSet::lock = PTHREAD_MUTEX_INITIALIZER;
- PrintfSlot::~PrintfSlot(void)
- {
- if (ptr)
- {
- if (type == PRINTF_SLOT_TYPE_STRING) {
- free(ptr);
- ptr = NULL;
- } else if (type == PRINTF_SLOT_TYPE_STATE) {
- delete state;
- state = NULL;
- } else {
- type = PRINTF_SLOT_TYPE_NONE;
- ptr = NULL;
- }
- }
- }
-
- uint32_t PrintfSet::append(PrintfFmt* fmt, Unit& unit)
- {
- fmts.push_back(*fmt);
- vector<PrintfSlot>& vp = fmts.back().first;
-
- for (vector<PrintfSlot>::iterator f = vp.begin(); f != vp.end(); ++f) {
- if (f->type == PRINTF_SLOT_TYPE_STRING)
- continue;
-
- slots.push_back(*f);
- }
-
- /* Update the total size of size. */
- if (slots.size() > 0)
- sizeOfSize = slots.back().state->out_buf_sizeof_offset
- + getPrintfBufferElementSize(slots.size() - 1);
-
- return (uint32_t)fmts.size();
- }
-
static void generatePrintfFmtString(PrintfState& state, std::string& str)
{
char num_str[16];
- str += "%";
+ str = "%";
if (state.left_justified) {
str += "-";
@@ -123,152 +87,120 @@ namespace gbe
#define PRINT_SOMETHING(target_ty, conv) do { \
if (!vec_i) \
pf_str = pf_str + std::string(#conv); \
- char *ptr = ((char *)buf_addr + sizeOfSize * global_wk_sz0 * global_wk_sz1 * global_wk_sz2 * n \
- + slot.state->out_buf_sizeof_offset * \
- global_wk_sz0 * global_wk_sz1 * global_wk_sz2); \
- target_ty* obj_ptr = ((target_ty *)ptr) + (k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i) * vec_num + vec_i; \
- if ((char *)obj_ptr + sizeof(target_ty) > (char *)buf_addr + output_sz) { \
- printf("\n\n!!!The printf message is out of range because of the limited buffer, ignore.\n"); \
- return; \
- } \
- printf(pf_str.c_str(), *obj_ptr); \
+ printf(pf_str.c_str(), log.getData<target_ty>()); \
} while (0)
-
- void PrintfSet::outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
- size_t global_wk_sz1, size_t global_wk_sz2, size_t output_sz)
+ static void printOutOneStatement(PrintfSet::PrintfFmt& fmt, PrintfLog& log)
{
- LockOutput lock;
- size_t i, j, k;
- std::string pf_str;
- int stmt = 0;
-
- for (size_t count = 0; count < fmts.size(); ++count) {
- for (i = 0; i < global_wk_sz0; i++) {
- for (j = 0; j < global_wk_sz1; j++) {
- for (k = 0; k < global_wk_sz2; k++) {
- int loop_num = ((int *)index_addr)[(stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
- + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i)*2];
- int printf_num = ((int *)index_addr)[(stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
- + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i)*2 + 1];
- if (!loop_num) continue;
-
- PrintfFmt* ppf = NULL;
- for (auto& f : fmts) {
- if (f.second == printf_num) {
- ppf = &f;
- break;
- }
- }
-
- PrintfFmt& pf = *ppf;
-
- for (int n = 0; n < loop_num; n++) {
- for (vector<PrintfSlot>::iterator pfit = pf.first.begin(); pfit != pf.first.end(); ++pfit) {
- PrintfSlot& slot = *pfit;
- pf_str = "";
- int vec_num;
-
- if (slot.type == PRINTF_SLOT_TYPE_STRING) {
- printf("%s", slot.str);
- continue;
- }
- assert(slot.type == PRINTF_SLOT_TYPE_STATE);
-
- generatePrintfFmtString(*slot.state, pf_str);
-
-
- vec_num = slot.state->vector_n > 0 ? slot.state->vector_n : 1;
-
- for (int vec_i = 0; vec_i < vec_num; vec_i++) {
- if (vec_i)
- printf(",");
-
- switch (slot.state->conversion_specifier) {
- case PRINTF_CONVERSION_D:
- case PRINTF_CONVERSION_I:
- if (slot.state->length_modifier == PRINTF_LM_L)
- PRINT_SOMETHING(uint64_t, d);
- else
- PRINT_SOMETHING(int, d);
- break;
-
- case PRINTF_CONVERSION_O:
- if (slot.state->length_modifier == PRINTF_LM_L)
- PRINT_SOMETHING(uint64_t, o);
- else
- PRINT_SOMETHING(int, o);
- break;
- case PRINTF_CONVERSION_U:
- if (slot.state->length_modifier == PRINTF_LM_L)
- PRINT_SOMETHING(uint64_t, u);
- else
- PRINT_SOMETHING(int, u);
- break;
- case PRINTF_CONVERSION_X:
- if (slot.state->length_modifier == PRINTF_LM_L)
- PRINT_SOMETHING(uint64_t, X);
- else
- PRINT_SOMETHING(int, X);
- break;
- case PRINTF_CONVERSION_x:
- if (slot.state->length_modifier == PRINTF_LM_L)
- PRINT_SOMETHING(uint64_t, x);
- else
- PRINT_SOMETHING(int, x);
- break;
-
- case PRINTF_CONVERSION_C:
- PRINT_SOMETHING(char, c);
- break;
-
- case PRINTF_CONVERSION_F:
- PRINT_SOMETHING(float, F);
- break;
- case PRINTF_CONVERSION_f:
- PRINT_SOMETHING(float, f);
- break;
- case PRINTF_CONVERSION_E:
- PRINT_SOMETHING(float, E);
- break;
- case PRINTF_CONVERSION_e:
- PRINT_SOMETHING(float, e);
- break;
- case PRINTF_CONVERSION_G:
- PRINT_SOMETHING(float, G);
- break;
- case PRINTF_CONVERSION_g:
- PRINT_SOMETHING(float, g);
- break;
- case PRINTF_CONVERSION_A:
- PRINT_SOMETHING(float, A);
- break;
- case PRINTF_CONVERSION_a:
- PRINT_SOMETHING(float, a);
- break;
- case PRINTF_CONVERSION_P:
- PRINT_SOMETHING(int, p);
- break;
-
- case PRINTF_CONVERSION_S:
- pf_str = pf_str + "s";
- printf(pf_str.c_str(), slot.state->str.c_str());
- break;
-
- default:
- assert(0);
- return;
- }
- }
-
- pf_str = "";
- }
-
- }
- }
+ std::string pf_str = "";
+ for (auto& slot : fmt) {
+ if (slot.type == PRINTF_SLOT_TYPE_STRING) {
+ printf("%s", slot.str.c_str());
+ continue;
+ }
+ assert(slot.type == PRINTF_SLOT_TYPE_STATE);
+
+ generatePrintfFmtString(slot.state, pf_str);
+
+ int vec_num;
+ vec_num = slot.state.vector_n > 0 ? slot.state.vector_n : 1;
+
+ for (int vec_i = 0; vec_i < vec_num; vec_i++) {
+ if (vec_i)
+ printf(",");
+
+ switch (slot.state.conversion_specifier) {
+ case PRINTF_CONVERSION_D:
+ case PRINTF_CONVERSION_I:
+ if (slot.state.length_modifier == PRINTF_LM_L)
+ PRINT_SOMETHING(uint64_t, d);
+ else
+ PRINT_SOMETHING(int, d);
+ break;
+
+ case PRINTF_CONVERSION_O:
+ if (slot.state.length_modifier == PRINTF_LM_L)
+ PRINT_SOMETHING(uint64_t, o);
+ else
+ PRINT_SOMETHING(int, o);
+ break;
+ case PRINTF_CONVERSION_U:
+ if (slot.state.length_modifier == PRINTF_LM_L)
+ PRINT_SOMETHING(uint64_t, u);
+ else
+ PRINT_SOMETHING(int, u);
+ break;
+ case PRINTF_CONVERSION_X:
+ if (slot.state.length_modifier == PRINTF_LM_L)
+ PRINT_SOMETHING(uint64_t, X);
+ else
+ PRINT_SOMETHING(int, X);
+ break;
+ case PRINTF_CONVERSION_x:
+ if (slot.state.length_modifier == PRINTF_LM_L)
+ PRINT_SOMETHING(uint64_t, x);
+ else
+ PRINT_SOMETHING(int, x);
+ break;
+
+ case PRINTF_CONVERSION_C:
+ PRINT_SOMETHING(char, c);
+ break;
+
+ case PRINTF_CONVERSION_F:
+ PRINT_SOMETHING(float, F);
+ break;
+ case PRINTF_CONVERSION_f:
+ PRINT_SOMETHING(float, f);
+ break;
+ case PRINTF_CONVERSION_E:
+ PRINT_SOMETHING(float, E);
+ break;
+ case PRINTF_CONVERSION_e:
+ PRINT_SOMETHING(float, e);
+ break;
+ case PRINTF_CONVERSION_G:
+ PRINT_SOMETHING(float, G);
+ break;
+ case PRINTF_CONVERSION_g:
+ PRINT_SOMETHING(float, g);
+ break;
+ case PRINTF_CONVERSION_A:
+ PRINT_SOMETHING(float, A);
+ break;
+ case PRINTF_CONVERSION_a:
+ PRINT_SOMETHING(float, a);
+ break;
+ case PRINTF_CONVERSION_P:
+ PRINT_SOMETHING(int, p);
+ break;
+
+ case PRINTF_CONVERSION_S:
+ pf_str = pf_str + "s";
+ printf(pf_str.c_str(), slot.state.str.c_str());
+ break;
+
+ default:
+ assert(0);
+ return;
}
}
- stmt++;
+
+ }
+ }
+
+ void PrintfSet::outputPrintf(void* buf_addr)
+ {
+ LockOutput lock;
+ uint32_t totalSZ = ((uint32_t *)buf_addr)[0];
+ char* p = (char*)buf_addr + sizeof(uint32_t);
+
+ for (uint32_t parsed = 4; parsed < totalSZ; ) {
+ PrintfLog log(p);
+ GBE_ASSERT(fmts.find(log.statementNum) != fmts.end());
+ printOutOneStatement(fmts[log.statementNum], log);
+ parsed += log.size;
+ p += log.size;
}
}
} /* namespace ir */
diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
index df58437..728aa68 100644
--- a/backend/src/ir/printf.hpp
+++ b/backend/src/ir/printf.hpp
@@ -26,12 +26,12 @@
#include <string.h>
#include "sys/map.hpp"
#include "sys/vector.hpp"
-#include "unit.hpp"
namespace gbe
{
namespace ir
{
+ class Unit;
/* Things about printf info. */
enum {
@@ -111,55 +111,62 @@ namespace gbe
};
struct PrintfSlot {
- int type;
- union {
- char* str;
- PrintfState* state;
- void *ptr;
- };
+ uint32_t type;
+ std::string str;
+ PrintfState state;
PrintfSlot(void) {
type = PRINTF_SLOT_TYPE_NONE;
- ptr = NULL;
}
- PrintfSlot(const char * s) {
+ PrintfSlot(std::string& s) : str(s) {
type = PRINTF_SLOT_TYPE_STRING;
- int len = strlen(s);
- str = (char*)malloc((len + 1) * sizeof(char));
- memcpy(str, s, (len + 1) * sizeof(char));
- str[len] = 0;
}
- PrintfSlot(PrintfState * st) {
+ PrintfSlot(PrintfState& st) {
type = PRINTF_SLOT_TYPE_STATE;
- state = new PrintfState(*st);
+ state = st;
}
PrintfSlot(const PrintfSlot & other) {
if (other.type == PRINTF_SLOT_TYPE_STRING) {
- int len = strlen(other.str);
- str = (char*)malloc((len + 1) * sizeof(char));
- memcpy(str, other.str, (len + 1) * sizeof(char));
- str[len] = 0;
type = PRINTF_SLOT_TYPE_STRING;
+ str = other.str;
} else if (other.type == PRINTF_SLOT_TYPE_STATE) {
type = PRINTF_SLOT_TYPE_STATE;
- state = new PrintfState(*other.state);
+ state = other.state;
} else {
type = PRINTF_SLOT_TYPE_NONE;
- ptr = NULL;
}
}
- PrintfSlot(PrintfSlot && other) {
- void *p = other.ptr;
- type = other.type;
- other.ptr = ptr;
- ptr = p;
+ ~PrintfSlot(void) {
+ }
+ };
+
+ struct PrintfLog {
+ uint32_t magic; // 0xAABBCCDD as magic for ASSERT.
+ uint32_t size; // Size of this printf log, include header.
+ uint32_t statementNum; // which printf within one kernel.
+ const char* content;
+
+ PrintfLog(const char* p) {
+ GBE_ASSERT(*((uint32_t *)p) == 0xAABBCCDD);
+ magic = *((uint32_t *)p);
+ p += sizeof(uint32_t);
+ size = *((uint32_t *)p);
+ p += sizeof(uint32_t);
+ statementNum = *((uint32_t *)p);
+ p += sizeof(uint32_t);
+ content = p;
}
- ~PrintfSlot(void);
+ template <typename T>
+ T getData(void) {
+ T D = *((T *)content);
+ content += sizeof(T);
+ return D;
+ }
};
class Context;
@@ -168,19 +175,8 @@ namespace gbe
{
public:
PrintfSet(const PrintfSet& other) {
- for (size_t i = 0; i < other.fmts.size(); ++i) {
- const PrintfFmt& f = other.fmts[i];
- fmts.push_back(f);
- }
-
- for (size_t i = 0; i < other.slots.size(); ++i) {
- PrintfSlot s = other.slots[i];
- slots.push_back(s);
- }
-
- sizeOfSize = other.sizeOfSize;
+ fmts = other.fmts;
btiBuf = other.btiBuf;
- btiIndexBuf = other.btiIndexBuf;
}
PrintfSet(void) = default;
@@ -195,32 +191,30 @@ namespace gbe
}
};
- typedef std::pair<vector<PrintfSlot>, int> PrintfFmt;
- uint32_t append(PrintfFmt* fmt, Unit &unit);
+ typedef vector<PrintfSlot> PrintfFmt;
- uint32_t getPrintfNum(void) const {
- return fmts.size();
+ void append(uint32_t num, PrintfFmt* fmt) {
+ GBE_ASSERT(fmts.find(num) == fmts.end());
+ fmts.insert(std::pair<uint32_t, PrintfFmt>(num, *fmt));
}
- uint32_t getPrintfSizeOfSize(void) const {
- return sizeOfSize;
+ uint32_t getPrintfNum(void) const {
+ return fmts.size();
}
void setBufBTI(uint8_t b) { btiBuf = b; }
- void setIndexBufBTI(uint8_t b) { btiIndexBuf = b; }
uint8_t getBufBTI() const { return btiBuf; }
- uint8_t getIndexBufBTI() const { return btiIndexBuf; }
uint32_t getPrintfBufferElementSize(uint32_t i) {
- PrintfSlot& slot = slots[i];
+ PrintfSlot slot;
int vec_num = 1;
- if (slot.state->vector_n > 0) {
- vec_num = slot.state->vector_n;
+ if (slot.state.vector_n > 0) {
+ vec_num = slot.state.vector_n;
}
assert(vec_num > 0 && vec_num <= 16);
- switch (slot.state->conversion_specifier) {
+ switch (slot.state.conversion_specifier) {
case PRINTF_CONVERSION_I:
case PRINTF_CONVERSION_D:
case PRINTF_CONVERSION_O:
@@ -249,16 +243,12 @@ namespace gbe
return 0;
}
- void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
- size_t global_wk_sz1, size_t global_wk_sz2, size_t output_sz);
+ void outputPrintf(void* buf_addr);
private:
- vector<PrintfFmt> fmts;
- vector<PrintfSlot> slots;
- uint32_t sizeOfSize; // Total sizeof size.
+ std::map<uint32_t, PrintfFmt> fmts;
friend struct LockOutput;
uint8_t btiBuf;
- uint8_t btiIndexBuf;
static pthread_mutex_t lock;
GBE_CLASS(PrintfSet);
};
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index af9f698..b16319a 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -41,58 +41,60 @@ namespace ir {
"block_ip",
"barrier_id", "thread_number", "work_dimension",
"zero", "one",
- "retVal", "slm_offset",
- "printf_buffer_pointer", "printf_index_buffer_pointer",
+ "retVal",
"dwblockip",
- "lane_id",
- "invalid",
- "bti_utility"
+ "profiling_buffer_pointer",
+ "profiling_timestamps0", "profiling_timestamps1",
+ "profiling_timestamps2", "profiling_timestamps3",
+ "profiling_timestamps4",
+ "threadid"
};
#if GBE_DEBUG
-#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
- r = fn.newRegister(FAMILY, UNIFORM); \
+#define DECL_NEW_REG(FAMILY, REG, ...) \
+ r = fn.newRegister(FAMILY, __VA_ARGS__); \
GBE_ASSERT(r == REG);
#else
-#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
- fn.newRegister(FAMILY, UNIFORM);
+#define DECL_NEW_REG(FAMILY, REG, ...) \
+ fn.newRegister(FAMILY, __VA_ARGS__);
#endif /* GBE_DEBUG */
static void init(Function &fn) {
IF_DEBUG(Register r);
- DECL_NEW_REG(FAMILY_DWORD, lid0, 0);
- DECL_NEW_REG(FAMILY_DWORD, lid1, 0);
- DECL_NEW_REG(FAMILY_DWORD, lid2, 0);
+ DECL_NEW_REG(FAMILY_DWORD, lid0, 0, GBE_CURBE_LOCAL_ID_X);
+ DECL_NEW_REG(FAMILY_DWORD, lid1, 0, GBE_CURBE_LOCAL_ID_Y);
+ DECL_NEW_REG(FAMILY_DWORD, lid2, 0, GBE_CURBE_LOCAL_ID_Z);
DECL_NEW_REG(FAMILY_DWORD, groupid0, 1);
DECL_NEW_REG(FAMILY_DWORD, groupid1, 1);
DECL_NEW_REG(FAMILY_DWORD, groupid2, 1);
- DECL_NEW_REG(FAMILY_DWORD, numgroup0, 1);
- DECL_NEW_REG(FAMILY_DWORD, numgroup1, 1);
- DECL_NEW_REG(FAMILY_DWORD, numgroup2, 1);
- DECL_NEW_REG(FAMILY_DWORD, lsize0, 1);
- DECL_NEW_REG(FAMILY_DWORD, lsize1, 1);
- DECL_NEW_REG(FAMILY_DWORD, lsize2, 1);
- DECL_NEW_REG(FAMILY_DWORD, gsize0, 1);
- DECL_NEW_REG(FAMILY_DWORD, gsize1, 1);
- DECL_NEW_REG(FAMILY_DWORD, gsize2, 1);
- DECL_NEW_REG(FAMILY_DWORD, goffset0, 1);
- DECL_NEW_REG(FAMILY_DWORD, goffset1, 1);
- DECL_NEW_REG(FAMILY_DWORD, goffset2, 1);
+ DECL_NEW_REG(FAMILY_DWORD, numgroup0, 1, GBE_CURBE_GROUP_NUM_X);
+ DECL_NEW_REG(FAMILY_DWORD, numgroup1, 1, GBE_CURBE_GROUP_NUM_Y);
+ DECL_NEW_REG(FAMILY_DWORD, numgroup2, 1, GBE_CURBE_GROUP_NUM_Z);
+ DECL_NEW_REG(FAMILY_DWORD, lsize0, 1, GBE_CURBE_LOCAL_SIZE_X);
+ DECL_NEW_REG(FAMILY_DWORD, lsize1, 1, GBE_CURBE_LOCAL_SIZE_Y);
+ DECL_NEW_REG(FAMILY_DWORD, lsize2, 1, GBE_CURBE_LOCAL_SIZE_Z);
+ DECL_NEW_REG(FAMILY_DWORD, gsize0, 1, GBE_CURBE_GLOBAL_SIZE_X);
+ DECL_NEW_REG(FAMILY_DWORD, gsize1, 1, GBE_CURBE_GLOBAL_SIZE_Y);
+ DECL_NEW_REG(FAMILY_DWORD, gsize2, 1, GBE_CURBE_GLOBAL_SIZE_Z);
+ DECL_NEW_REG(FAMILY_DWORD, goffset0, 1, GBE_CURBE_GLOBAL_OFFSET_X);
+ DECL_NEW_REG(FAMILY_DWORD, goffset1, 1, GBE_CURBE_GLOBAL_OFFSET_Y);
+ DECL_NEW_REG(FAMILY_DWORD, goffset2, 1, GBE_CURBE_GLOBAL_OFFSET_Z);
DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
- DECL_NEW_REG(FAMILY_QWORD, stackbuffer, 1);
- DECL_NEW_REG(FAMILY_WORD, blockip, 0);
+ DECL_NEW_REG(FAMILY_QWORD, stackbuffer, 1, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
+ DECL_NEW_REG(FAMILY_WORD, blockip, 0, GBE_CURBE_BLOCK_IP);
DECL_NEW_REG(FAMILY_DWORD, barrierid, 1);
- DECL_NEW_REG(FAMILY_DWORD, threadn, 1);
- DECL_NEW_REG(FAMILY_DWORD, workdim, 1);
+ DECL_NEW_REG(FAMILY_DWORD, threadn, 1, GBE_CURBE_THREAD_NUM);
+ DECL_NEW_REG(FAMILY_DWORD, workdim, 1, GBE_CURBE_WORK_DIM);
DECL_NEW_REG(FAMILY_DWORD, zero, 1);
DECL_NEW_REG(FAMILY_DWORD, one, 1);
DECL_NEW_REG(FAMILY_WORD, retVal, 1);
- DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1);
- DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
- DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
- DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0);
- DECL_NEW_REG(FAMILY_DWORD, laneid, 0);
- DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
- DECL_NEW_REG(FAMILY_DWORD, btiUtil, 1);
+ DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0, GBE_CURBE_DW_BLOCK_IP);
+ DECL_NEW_REG(FAMILY_QWORD, profilingbptr, 1, GBE_CURBE_PROFILING_BUF_POINTER);
+ DECL_NEW_REG(FAMILY_DWORD, profilingts0, 0, GBE_CURBE_PROFILING_TIMESTAMP0);
+ DECL_NEW_REG(FAMILY_DWORD, profilingts1, 0, GBE_CURBE_PROFILING_TIMESTAMP1);
+ DECL_NEW_REG(FAMILY_DWORD, profilingts2, 0, GBE_CURBE_PROFILING_TIMESTAMP2);
+ DECL_NEW_REG(FAMILY_DWORD, profilingts3, 0, GBE_CURBE_PROFILING_TIMESTAMP3);
+ DECL_NEW_REG(FAMILY_DWORD, profilingts4, 0, GBE_CURBE_PROFILING_TIMESTAMP4);
+ DECL_NEW_REG(FAMILY_DWORD, threadid, 1, GBE_CURBE_THREAD_ID);
}
#undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 9323824..eab7892 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -68,14 +68,15 @@ namespace ir {
static const Register zero = Register(24); // scalar register holds zero.
static const Register one = Register(25); // scalar register holds one.
static const Register retVal = Register(26); // helper register to do data flow analysis.
- static const Register slmoffset = Register(27); // Group's SLM offset in total 64K SLM
- static const Register printfbptr = Register(28); // printf buffer address .
- static const Register printfiptr = Register(29); // printf index buffer address.
- static const Register dwblockip = Register(30); // blockip
- static const Register laneid = Register(31); // lane id.
- static const Register invalid = Register(32); // used for valid comparation.
- static const Register btiUtil = Register(33); // used for mixed pointer as bti utility.
- static const uint32_t regNum = 34; // number of special registers
+ static const Register dwblockip = Register(27); // blockip
+ static const Register profilingbptr = Register(28); // buffer addr for profiling.
+ static const Register profilingts0 = Register(29); // timestamp for profiling.
+ static const Register profilingts1 = Register(30); // timestamp for profiling.
+ static const Register profilingts2 = Register(31); // timestamp for profiling.
+ static const Register profilingts3 = Register(32); // timestamp for profiling.
+ static const Register profilingts4 = Register(33); // timestamp for profiling.
+ static const Register threadid = Register(34); // the thread id of this thread.
+ static const uint32_t regNum = 35; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
diff --git a/backend/src/ir/profiling.cpp b/backend/src/ir/profiling.cpp
new file mode 100644
index 0000000..09537fa
--- /dev/null
+++ b/backend/src/ir/profiling.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+/**
+ * \file profiling.cpp
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "ir/profiling.hpp"
+#include "src/cl_device_data.h"
+
+namespace gbe
+{
+namespace ir
+{
+ pthread_mutex_t ProfilingInfo::lock = PTHREAD_MUTEX_INITIALIZER;
+
+ void ProfilingInfo::outputProfilingInfo(void * logBuf)
+ {
+ LockOutput lock;
+ uint32_t logNum = *reinterpret_cast<uint32_t*>(logBuf);
+ printf("Total log number is %u\n", logNum);
+ ProfilingReportItem* log = reinterpret_cast<ProfilingReportItem*>((char*)logBuf + 4);
+ for (int i = 0; i < (int)logNum; i++) {
+ GBE_ASSERT(log->simdType == ProfilingSimdType8 || log->simdType == ProfilingSimdType16);
+ uint32_t simd = log->simdType == ProfilingSimdType16 ? 16 : 8;
+ printf(" ------------------------ Log %-6d -----------------------\n", i);
+ printf(" | fix functions id:%4d simd: %4d kernel id: %4d |\n", log->fixedFunctionID,
+ simd, log->kernelID);
+ if (IS_IVYBRIDGE(deviceID)) {
+ printf(" | thread id: %4d EU id:%4d half slice id:%2d |\n", log->genInfo.gen7.thread_id,
+ log->genInfo.gen7.eu_id, log->genInfo.gen7.half_slice_id);
+ } else if (IS_HASWELL(deviceID)) {
+ printf(" | thread id: %4d EU id:%4d half slice id:%2d slice id%2d |\n", log->genInfo.gen7.thread_id,
+ log->genInfo.gen7.eu_id, log->genInfo.gen7.half_slice_id, log->genInfo.gen7.slice_id);
+ } else if (IS_BROADWELL(deviceID)) {
+ printf(" | thread id: %4d EU id:%4d sub slice id:%2d slice id%2d |\n", log->genInfo.gen8.thread_id,
+ log->genInfo.gen8.eu_id, log->genInfo.gen8.subslice_id, log->genInfo.gen8.slice_id);
+ }
+
+ uint64_t proLog = log->timestampPrologHi;
+ proLog = ((proLog << 32) & 0xffffffff00000000) + log->timestampPrologLo;
+ uint64_t epiLog = log->timestampEpilogHi;
+ epiLog = ((epiLog << 32) & 0xffffffff00000000) + log->timestampEpilogLo;
+ printf(" | dispatch Mask:%4x prolog:%10lu epilog:%10lu |\n", log->dispatchMask, proLog, epiLog);
+
+ printf(" | globalX:%4d~%4d globalY:%4d~%4d globalZ:%4d~%4d |\n", log->gidXStart, log->gidXEnd,
+ log->gidYStart, log->gidYEnd, log->gidZStart, log->gidZEnd);
+ for (uint32_t i = 0; i < MaxTimestampProfilingPoints - 2; i += 3) {
+ printf(" | ts%-2d:%10u | ts%-2d:%10u | ts%-2d:%10u |\n", i, log->userTimestamp[i],
+ i + 1, log->userTimestamp[i + 1], i + 2, log->userTimestamp[i + 2]);
+ }
+ printf(" | ts18:%10u | ts19:%10u | |\n", log->userTimestamp[18], log->userTimestamp[19]);
+ log++;
+ }
+ }
+}
+}
diff --git a/backend/src/ir/profiling.hpp b/backend/src/ir/profiling.hpp
new file mode 100644
index 0000000..ce9866f
--- /dev/null
+++ b/backend/src/ir/profiling.hpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+/**
+ * \file profiling.hpp
+ *
+ */
+#ifndef __GBE_IR_PROFILING_HPP__
+#define __GBE_IR_PROFILING_HPP__
+
+#include <string.h>
+#include "sys/map.hpp"
+#include "sys/vector.hpp"
+#include "unit.hpp"
+
+namespace gbe
+{
+ namespace ir
+ {
+ class Context;
+ class ProfilingInfo //: public Serializable
+ {
+ public:
+ const static uint32_t MaxTimestampProfilingPoints = 20;
+ enum {
+ ProfilingSimdType1,
+ ProfilingSimdType8,
+ ProfilingSimdType16,
+ };
+
+ typedef struct {
+ uint32_t fixedFunctionID:4;
+ uint32_t simdType:4;
+ uint32_t kernelID:24;
+ union GenInfo {
+ struct Gen7Info {
+ uint16_t thread_id:3;
+ uint16_t reserved1:5;
+ uint16_t eu_id:4;
+ uint16_t half_slice_id:1;
+ uint16_t slice_id:2;
+ uint16_t reserved0:1;
+ } gen7;
+ struct Gen8Info {
+ uint16_t thread_id:3;
+ uint16_t reserved1:5;
+ uint16_t eu_id:4;
+ uint16_t subslice_id:2;
+ uint16_t slice_id:2;
+ } gen8;
+ } genInfo;
+ uint16_t dispatchMask;
+ uint32_t gidXStart;
+ uint32_t gidXEnd;
+ uint32_t gidYStart;
+ uint32_t gidYEnd;
+ uint32_t gidZStart;
+ uint32_t gidZEnd;
+ uint32_t userTimestamp[MaxTimestampProfilingPoints];
+ uint32_t timestampPrologLo;
+ uint32_t timestampPrologHi;
+ uint32_t timestampEpilogLo;
+ uint32_t timestampEpilogHi;
+ } ProfilingReportItem;
+
+ ProfilingInfo(const ProfilingInfo& other) {
+ this->bti = other.bti;
+ this->profilingType = other.profilingType;
+ this->deviceID = other.deviceID;
+ }
+
+ ProfilingInfo(void) {
+ this->bti = 0;
+ this->profilingType = 0;
+ this->deviceID = 0;
+ }
+ struct LockOutput {
+ LockOutput(void) {
+ pthread_mutex_lock(&lock);
+ }
+
+ ~LockOutput(void) {
+ pthread_mutex_unlock(&lock);
+ }
+ };
+
+ void setBTI(uint32_t b) {
+ bti = b;
+ }
+ uint32_t getBTI() const {
+ return bti;
+ }
+ void setProfilingType(uint32_t t) {
+ profilingType = t;
+ }
+ uint32_t getProfilingType() const {
+ return profilingType;
+ }
+ void setDeviceID(uint32_t id) {
+ deviceID = id;
+ }
+ uint32_t getDeviceID() const {
+ return deviceID;
+ }
+ void outputProfilingInfo(void* logBuf);
+
+ private:
+ uint32_t bti;
+ uint32_t profilingType;
+ uint32_t deviceID;
+ friend struct LockOutput;
+ static pthread_mutex_t lock;
+ GBE_CLASS(ProfilingInfo);
+ };
+ } /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_PROFILING_HPP__ */
diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp
index 48d6875..8200c31 100644
--- a/backend/src/ir/register.cpp
+++ b/backend/src/ir/register.cpp
@@ -62,6 +62,14 @@ namespace ir {
return index;
}
+ Tuple RegisterFile::appendArrayTypeTuple(const uint8_t *types, uint32_t num) {
+ const Tuple index = Tuple(typeTuples.size());
+ for (uint32_t id = 0; id < num; id++) {
+ typeTuples.push_back(types[id]);
+ }
+ return index;
+ }
+
} /* namespace ir */
} /* namespace gbe */
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index d8df7b0..11ab756 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -26,6 +26,7 @@
#include "sys/vector.hpp"
#include "sys/platform.hpp"
+#include "../backend/program.h"
namespace gbe {
namespace ir {
@@ -78,21 +79,38 @@ namespace ir {
ARF_TM
};
+ /*! Register is the position of the index of the register data in the register
+ * file. We enforce type safety with this class
+ */
+ TYPE_SAFE(Register, uint32_t)
+
/*! A register can be either a byte, a word, a dword or a qword. We store this
* value into a register data (which makes the register file)
*/
class RegisterData
{
public:
+ struct PayloadRegisterData {
+ gbe_curbe_type curbeType;
+ int subType;
+ };
+
/*! Build a register. All fields will be immutable */
INLINE RegisterData(RegisterFamily family,
- bool uniform = false) : family(family), uniform(uniform) {}
+ bool uniform,
+ gbe_curbe_type curbeType,
+ int subType) : family(family), uniform(uniform) {
+ payloadData.curbeType = curbeType;
+ payloadData.subType = subType;
+ }
+
/*! Copy constructor */
- INLINE RegisterData(const RegisterData &other) : family(other.family), uniform(other.uniform) {}
+ INLINE RegisterData(const RegisterData &other) : family(other.family), uniform(other.uniform), payloadData(other.payloadData) {}
/*! Copy operator */
INLINE RegisterData &operator= (const RegisterData &other) {
this->family = other.family;
this->uniform = other.uniform;
+ this->payloadData = other.payloadData;
return *this;
}
/*! Nothing really happens here */
@@ -100,18 +118,26 @@ namespace ir {
RegisterFamily family; //!< Register size or if it is a flag
INLINE bool isUniform() const { return uniform; }
INLINE void setUniform(bool uni) { uniform = uni; }
+ INLINE void setPayloadType(gbe_curbe_type curbeType, int subType) {
+ payloadData.curbeType = curbeType;
+ payloadData.subType = subType;
+ }
+ INLINE void getPayloadType(gbe_curbe_type &curbeType, int &subType) const {
+ curbeType = payloadData.curbeType;
+ subType = payloadData.subType;
+ }
+ INLINE bool isPayloadType(void) const {
+ return payloadData.curbeType != GBE_GEN_REG;
+ }
private:
bool uniform;
+ PayloadRegisterData payloadData;
GBE_CLASS(RegisterData);
};
/*! Output the register file string in the given stream */
std::ostream &operator<< (std::ostream &out, const RegisterData ®Data);
- /*! Register is the position of the index of the register data in the register
- * file. We enforce type safety with this class
- */
- TYPE_SAFE(Register, uint32_t)
INLINE bool operator< (const Register &r0, const Register &r1) {
return r0.value() < r1.value();
}
@@ -128,14 +154,18 @@ namespace ir {
{
public:
/*! Return the index of a newly allocated register */
- INLINE Register append(RegisterFamily family, bool uniform = false) {
+ INLINE Register append(RegisterFamily family,
+ bool uniform = false,
+ gbe_curbe_type curbeType = GBE_GEN_REG,
+ int subType = 0) {
GBE_ASSERTM((uint64_t)regNum() < MAX_INDEX,
"Too many defined registers (only 4G are supported)");
const uint32_t index = regNum();
- const RegisterData reg(family, uniform);
+ const RegisterData reg(family, uniform, curbeType, subType);
regs.push_back(reg);
return Register(index);
}
+
/*! Make a tuple from an array of register */
Tuple appendArrayTuple(const Register *reg, uint32_t regNum);
/*! Make a tuple and return the index to the first element of the tuple */
@@ -149,12 +179,36 @@ namespace ir {
}
/*! To terminate variadic recursion */
INLINE void appendTuple(void) {}
+ /*! Make a tuple from an array of Type */
+ Tuple appendArrayTypeTuple(const uint8_t *types, uint32_t num);
+ /*! Make a tuple and return the index to the first element of the tuple */
+ template <typename First, typename... Rest>
+ INLINE Tuple appendTypeTuple(First first, Rest... rest) {
+ const Tuple index = Tuple(typeTuples.size());
+ typeTuples.push_back(first);
+ appendTuple(rest...);
+ return index;
+ }
+ /*! To terminate variadic recursion */
+ INLINE void appendTypeTuple(void) {}
/*! Return a copy of the register at index */
INLINE RegisterData get(Register index) const { return regs[index]; }
/*! Return true if the specified register is uniform type. */
INLINE bool isUniform(Register index) { return regs[index].isUniform(); }
/*! Set a register to uniform or varying data type*/
INLINE void setUniform(Register index, bool uniform) { regs[index].setUniform(uniform); }
+ /*! Set payload type of a register */
+ INLINE void setPayloadType(Register index, gbe_curbe_type curbeType, int subType) {
+ regs[index].setPayloadType(curbeType, subType);
+ }
+ /*! Get payload type of a register */
+ INLINE void getPayloadType(Register index, gbe_curbe_type &curbeType, int &subType) const {
+ regs[index].getPayloadType(curbeType, subType);
+ }
+ /*! Check whether the register is a payload register */
+ INLINE bool isPayloadReg(Register index) const {
+ return regs[index].isPayloadType();
+ }
/*! Get the register index from the tuple */
INLINE Register get(Tuple index, uint32_t which) const {
return regTuples[index.value() + which];
@@ -163,6 +217,14 @@ namespace ir {
INLINE void set(Tuple index, uint32_t which, Register reg) {
regTuples[index.value() + which] = reg;
}
+ /*! Get the type from the tuple */
+ INLINE uint8_t getType(Tuple index, uint32_t which) const {
+ return typeTuples[index.value() + which];
+ }
+ /*! Set the type to the tuple */
+ INLINE void setType(Tuple index, uint32_t which, uint8_t type) {
+ typeTuples[index.value() + which] = type;
+ }
/*! Number of registers in the register file */
INLINE uint32_t regNum(void) const { return regs.size(); }
/*! Number of tuples in the register file */
@@ -172,6 +234,7 @@ namespace ir {
private:
vector<RegisterData> regs; //!< All the registers together
vector<Register> regTuples; //!< Tuples are used for many src / dst
+ vector<uint8_t> typeTuples; //!< Type tuples, used when one instruction has multiple src/dst types.
GBE_CLASS(RegisterFile);
};
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
index a4e1ddd..1987344 100644
--- a/backend/src/ir/sampler.cpp
+++ b/backend/src/ir/sampler.cpp
@@ -65,12 +65,14 @@ namespace ir {
#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
/*! Implements the serialization. */
- size_t SamplerSet::serializeToBin(std::ostream& outs) {
- size_t ret_size = 0;
+ uint32_t SamplerSet::serializeToBin(std::ostream& outs) {
+ uint32_t ret_size = 0;
+ uint32_t sz = 0;
OUT_UPDATE_SZ(magic_begin);
- OUT_UPDATE_SZ(samplerMap.size());
+ sz = samplerMap.size();
+ OUT_UPDATE_SZ(sz);
for (map<uint32_t, uint32_t>::iterator it = samplerMap.begin(); it != samplerMap.end(); ++it) {
OUT_UPDATE_SZ(it->first);
OUT_UPDATE_SZ(it->second);
@@ -82,10 +84,10 @@ namespace ir {
return ret_size;
}
- size_t SamplerSet::deserializeFromBin(std::istream& ins) {
- size_t total_size = 0;
+ uint32_t SamplerSet::deserializeFromBin(std::istream& ins) {
+ uint32_t total_size = 0;
uint32_t magic;
- size_t sampler_map_sz = 0;
+ uint32_t sampler_map_sz = 0;
IN_UPDATE_SZ(magic);
if (magic != magic_begin)
@@ -105,7 +107,7 @@ namespace ir {
if (magic != magic_end)
return 0;
- size_t total_bytes;
+ uint32_t total_bytes;
IN_UPDATE_SZ(total_bytes);
if (total_bytes + sizeof(total_size) != total_size)
return 0;
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
index 85e6d54..036fa69 100644
--- a/backend/src/ir/sampler.hpp
+++ b/backend/src/ir/sampler.hpp
@@ -79,8 +79,8 @@ namespace ir {
*/
/*! Implements the serialization. */
- virtual size_t serializeToBin(std::ostream& outs);
- virtual size_t deserializeFromBin(std::istream& ins);
+ virtual uint32_t serializeToBin(std::ostream& outs);
+ virtual uint32_t deserializeFromBin(std::istream& ins);
virtual void printStatus(int indent, std::ostream& outs);
private:
diff --git a/backend/src/ir/structurizer.cpp b/backend/src/ir/structurizer.cpp
index 38d3dd1..749cb94 100644
--- a/backend/src/ir/structurizer.cpp
+++ b/backend/src/ir/structurizer.cpp
@@ -57,7 +57,7 @@ namespace ir {
Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
pbb->insertAt(it, *p_new_insn);
pbb->whileLabel = whileLabel;
- pbb->erase(it);
+ it->remove();
}
/* recursive mark the bbs' variable needEndif*/
@@ -122,7 +122,7 @@ namespace ir {
/* since this block is an if block, so we remove the BRA instruction at the bottom of the exit BB of 'block',
* and insert IF instead
*/
- pbb->erase(it);
+ it->remove();
Instruction insn = IF(matchingElseLabel, reg, block->inversePredicate);
Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
pbb->append(*p_new_insn);
@@ -160,7 +160,7 @@ namespace ir {
BasicBlock::iterator it = pbb->end();
it--;
if((*it).getOpcode() == OP_BRA)
- pbb->erase(it);
+ it->remove();
if(block->getExit()->getNextBlock() == elseblock->getEntry())
return;
@@ -321,8 +321,7 @@ namespace ir {
{
BasicBlock::iterator it= bbs[i]->end();
it--;
-
- bbs[i]->erase(it);
+ it->remove();
if (bbs[i]->hasExtraBra)
bbs[i]->hasExtraBra = false;
diff --git a/backend/src/ir/type.cpp b/backend/src/ir/type.cpp
index 450ba61..682757b 100644
--- a/backend/src/ir/type.cpp
+++ b/backend/src/ir/type.cpp
@@ -32,11 +32,11 @@ namespace ir {
case TYPE_S8: return out << "int8";
case TYPE_U8: return out << "uint8";
case TYPE_S16: return out << "int16";
- case TYPE_U16: return out << "uin16";
+ case TYPE_U16: return out << "uint16";
case TYPE_S32: return out << "int32";
- case TYPE_U32: return out << "uin32";
+ case TYPE_U32: return out << "uint32";
case TYPE_S64: return out << "int64";
- case TYPE_U64: return out << "uin64";
+ case TYPE_U64: return out << "uint64";
case TYPE_HALF: return out << "half";
case TYPE_FLOAT: return out << "float";
case TYPE_DOUBLE: return out << "double";
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
index 84208e5..c9cb15e 100644
--- a/backend/src/ir/unit.cpp
+++ b/backend/src/ir/unit.cpp
@@ -27,9 +27,14 @@
namespace gbe {
namespace ir {
- Unit::Unit(PointerSize pointerSize) : pointerSize(pointerSize), valid(true) {}
+ Unit::Unit(PointerSize pointerSize) : pointerSize(pointerSize), valid(true) {
+ profilingInfo = GBE_NEW(ProfilingInfo);
+ inProfilingMode = false;
+ }
Unit::~Unit(void) {
for (const auto &pair : functions) GBE_DELETE(pair.second);
+ for (const auto &pair : printfs) GBE_DELETE(pair.second);
+ delete profilingInfo;
}
Function *Unit::getFunction(const std::string &name) const {
auto it = functions.find(name);
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 8ff858d..10a1af6 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -26,13 +26,18 @@
#include "ir/constant.hpp"
#include "ir/register.hpp"
+#include "ir/profiling.hpp"
+#include "ir/printf.hpp"
#include "sys/map.hpp"
+#include "llvm/IR/Instructions.h"
+
namespace gbe {
namespace ir {
// A unit contains a set of functions
class Function;
+ class ProfilingInfo;
/*! Complete unit of compilation. It contains a set of functions and a set of
* constant the functions may refer to.
@@ -41,6 +46,8 @@ namespace ir {
{
public:
typedef map<std::string, Function*> FunctionSet;
+ /*! Moved from printf pass */
+ map<llvm::CallInst*, PrintfSet::PrintfFmt*> printfs;
/*! Create an empty unit */
Unit(PointerSize pointerSize = POINTER_32_BITS);
/*! Release everything (*including* the function pointers) */
@@ -72,6 +79,12 @@ namespace ir {
ConstantSet& getConstantSet(void) { return constantSet; }
/*! Return the constant set */
const ConstantSet& getConstantSet(void) const { return constantSet; }
+ /*! Get profiling info in this function */
+ ProfilingInfo* getProfilingInfo(void) const { return profilingInfo; }
+ /*! Set in profiling mode */
+ void setInProfilingMode(bool b) { inProfilingMode = b; }
+ /*! Get in profiling mode */
+ bool getInProfilingMode(void) const { return inProfilingMode; }
void setValid(bool value) { valid = value; }
bool getValid() { return valid; }
private:
@@ -79,8 +92,10 @@ namespace ir {
FunctionSet functions; //!< All the defined functions
ConstantSet constantSet; //!< All the constants defined in the unit
PointerSize pointerSize; //!< Size shared by all pointers
+ ProfilingInfo *profilingInfo; //!< profilingInfo store the information for profiling.
GBE_CLASS(Unit);
bool valid;
+ bool inProfilingMode;
};
/*! Output the unit string in the given stream */
diff --git a/backend/src/ir/value.cpp b/backend/src/ir/value.cpp
index 840fb5c..d2f0c2e 100644
--- a/backend/src/ir/value.cpp
+++ b/backend/src/ir/value.cpp
@@ -558,6 +558,203 @@ namespace ir {
return it->second;
}
+ void FunctionDAG::getRegUDBBs(Register r, set<const BasicBlock *> &BBs) const{
+ auto dSet = getRegDef(r);
+ for (auto &def : *dSet)
+ BBs.insert(def->getInstruction()->getParent());
+ auto uSet = getRegUse(r);
+ for (auto &use : *uSet)
+ BBs.insert(use->getInstruction()->getParent());
+ }
+
+ static void getLivenessBBs(const Liveness &liveness, Register r, const set<const BasicBlock *> &useDefSet,
+ set<const BasicBlock *> &liveInSet, set<const BasicBlock *> &liveOutSet){
+ for (auto bb : useDefSet) {
+ if (liveness.getLiveOut(bb).contains(r))
+ liveOutSet.insert(bb);
+ if (liveness.getLiveIn(bb).contains(r))
+ liveInSet.insert(bb);
+ }
+ }
+
+ static void getBlockDefInsns(const BasicBlock *bb, const DefSet *dSet, Register r, set <const Instruction *> &defInsns) {
+ for (auto def : *dSet) {
+ auto defInsn = def->getInstruction();
+ if (defInsn->getParent() == bb)
+ defInsns.insert(defInsn);
+ }
+ }
+
+ static bool liveinInterfere(const BasicBlock *bb, const Instruction *defInsn, Register r1) {
+ BasicBlock::const_iterator iter = BasicBlock::const_iterator(defInsn);
+ BasicBlock::const_iterator iterE = bb->end();
+
+ if (defInsn->getOpcode() == OP_MOV &&
+ defInsn->getSrc(0) == r1)
+ return false;
+ while (iter != iterE) {
+ const Instruction *insn = iter.node();
+ for (unsigned i = 0; i < insn->getDstNum(); i++) {
+ Register dst = insn->getDst(i);
+ if (dst == r1)
+ return false;
+ }
+ for (unsigned i = 0; i < insn->getSrcNum(); i++) {
+ ir::Register src = insn->getSrc(i);
+ if (src == r1)
+ return true;
+ }
+ ++iter;
+ }
+
+ return false;
+ }
+
+ // r0 and r1 are both in the livein set.
+ // They interfere only if r0/r1 is used after r1/r0 has been modified.
+ bool FunctionDAG::interfereLivein(const BasicBlock *bb, Register r0, Register r1) const {
+ set <const Instruction *> defInsns0, defInsns1;
+ auto defSet0 = getRegDef(r0);
+ auto defSet1 = getRegDef(r1);
+ getBlockDefInsns(bb, defSet0, r0, defInsns0);
+ getBlockDefInsns(bb, defSet1, r1, defInsns1);
+ if (defInsns0.size() == 0 && defInsns1.size() == 0)
+ return false;
+
+ for (auto insn : defInsns0) {
+ if (liveinInterfere(bb, insn, r1))
+ return true;
+ }
+
+ for (auto insn : defInsns1) {
+ if (liveinInterfere(bb, insn, r0))
+ return true;
+ }
+ return false;
+ }
+
+ // r0 and r1 are both in the liveout set.
+ // Only if the last definition of r0/r1 is a MOV r0, r1 or MOV r1, r0
+ // will it not introduce interference in this BB.
+ bool FunctionDAG::interfereLiveout(const BasicBlock *bb, Register r0, Register r1) const {
+ set <const Instruction *> defInsns0, defInsns1;
+ auto defSet0 = getRegDef(r0);
+ auto defSet1 = getRegDef(r1);
+ getBlockDefInsns(bb, defSet0, r0, defInsns0);
+ getBlockDefInsns(bb, defSet1, r1, defInsns1);
+ if (defInsns0.size() == 0 && defInsns1.size() == 0)
+ return false;
+
+ BasicBlock::const_iterator iter = --bb->end();
+ BasicBlock::const_iterator iterE = bb->begin();
+ do {
+ const Instruction *insn = iter.node();
+ for (unsigned i = 0; i < insn->getDstNum(); i++) {
+ Register dst = insn->getDst(i);
+ if (dst == r0 || dst == r1) {
+ if (insn->getOpcode() != OP_MOV)
+ return true;
+ if (dst == r0 && insn->getSrc(0) != r1)
+ return true;
+ if (dst == r1 && insn->getSrc(0) != r0)
+ return true;
+ return false;
+ }
+ }
+ --iter;
+ } while (iter != iterE);
+ return false;
+ }
+
+ // Check instructions after the def of outReg: if there is any def of inReg, there is no interference
+ // for this range. Otherwise, if there is any use of inReg, then return true.
+ bool FunctionDAG::interfere(const BasicBlock *bb, Register inReg, Register outReg) const {
+ auto dSet = getRegDef(outReg);
+ for (auto &def : *dSet) {
+ auto defInsn = def->getInstruction();
+ if (defInsn->getParent() == bb) {
+ if (defInsn->getOpcode() == OP_MOV && defInsn->getSrc(0) == inReg)
+ continue;
+ BasicBlock::const_iterator iter = BasicBlock::const_iterator(defInsn);
+ BasicBlock::const_iterator iterE = bb->end();
+ iter++;
+ // check no use of phi in this basicblock between [phiCopySrc def, bb end]
+ while (iter != iterE) {
+ const ir::Instruction *insn = iter.node();
+ // check phiUse
+ for (unsigned i = 0; i < insn->getSrcNum(); i++) {
+ ir::Register src = insn->getSrc(i);
+ if (src == inReg)
+ return true;
+ }
+ ++iter;
+ }
+ }
+ }
+ return false;
+ }
+
+ bool FunctionDAG::interfere(const Liveness &liveness, Register r0, Register r1) const {
+ // If there are no any intersection BB, they are not interfering to each other.
+ // There are three different interfering cases which need further checking:
+ // 1. Both registers are in the LiveIn register set.
+ // 2. Both registers are in the LiveOut register set.
+ // 3. One is in LiveIn set and the Other is in LiveOut set.
+ // For the above 3 cases, we need 3 different ways to check whether they really
+ // interfering to each other.
+ set<const BasicBlock *> bbSet0;
+ set<const BasicBlock *> bbSet1;
+ getRegUDBBs(r0, bbSet0);
+ getRegUDBBs(r1, bbSet1);
+
+ set<const BasicBlock *> liveInBBSet0, liveInBBSet1;
+ set<const BasicBlock *> liveOutBBSet0, liveOutBBSet1;
+ getLivenessBBs(liveness, r0, bbSet0, liveInBBSet0, liveOutBBSet0);
+ getLivenessBBs(liveness, r1, bbSet1, liveInBBSet1, liveOutBBSet1);
+ GBE_ASSERT(liveInBBSet0.size() + liveOutBBSet0.size() > 0);
+ GBE_ASSERT(liveInBBSet1.size() + liveOutBBSet1.size() > 0);
+
+ set<const BasicBlock *> intersect;
+ set_intersection(liveInBBSet0.begin(), liveInBBSet0.end(),
+ liveInBBSet1.begin(), liveInBBSet1.end(),
+ std::inserter(intersect, intersect.begin()));
+ for (auto bb : intersect) {
+ if (interfereLivein(bb, r0, r1))
+ return true;
+ }
+ intersect.clear();
+ for (auto &bb: liveOutBBSet0) {
+ if (liveness.getBlockInfo(bb).inLiveOut(r1))
+ intersect.insert(bb);
+ }
+
+ for (auto bb: liveOutBBSet1) {
+ if (liveness.getBlockInfo(bb).inLiveOut(r0))
+ intersect.insert(bb);
+ }
+ for (auto bb : intersect) {
+ if (interfereLiveout(bb, r0, r1))
+ return true;
+ }
+ set<const BasicBlock *> OIIntersect, IOIntersect;
+ set_intersection(liveOutBBSet0.begin(), liveOutBBSet0.end(),
+ liveInBBSet1.begin(), liveInBBSet1.end(),
+ std::inserter(OIIntersect, OIIntersect.begin()));
+
+ for (auto bb : OIIntersect) {
+ if (interfere(bb, r1, r0))
+ return true;
+ }
+ set_intersection(liveInBBSet0.begin(), liveInBBSet0.end(),
+ liveOutBBSet1.begin(), liveOutBBSet1.end(),
+ std::inserter(IOIntersect, IOIntersect.begin()));
+ for (auto bb : IOIntersect) {
+ if (interfere(bb, r0, r1))
+ return true;
+ }
+ return false;
+ }
+
std::ostream &operator<< (std::ostream &out, const FunctionDAG &dag) {
const Function &fn = dag.getFunction();
diff --git a/backend/src/ir/value.hpp b/backend/src/ir/value.hpp
index a9e5108..b23dc0b 100644
--- a/backend/src/ir/value.hpp
+++ b/backend/src/ir/value.hpp
@@ -238,6 +238,22 @@ namespace ir {
typedef map<ValueUse, DefSet*> UDGraph;
/*! The UseSet for each definition */
typedef map<ValueDef, UseSet*> DUGraph;
+ /*! get register's use and define BB set */
+ void getRegUDBBs(Register r, set<const BasicBlock *> &BBs) const;
+ // Check whether two registers interfere in the specific BB.
+ // This function must be called under the following conditions:
+ // 1. The outReg is in the BB's liveout set and not in the livein set.
+ // 2. The inReg is in the BB's livein set but not in the liveout set.
+ bool interfere(const BasicBlock *bb, Register inReg, Register outReg) const;
+ // Check whether two registers interfere with each other.
+ // This function must be called under the following conditions:
+ // r0 and r1 are both non-local variables, which means they have
+ // information in the liveness object.
+ bool interfere(const Liveness &liveness, Register r0, Register r1) const;
+ /*! check whether two registers which are both in liveout set interfering in the current BB. */
+ bool interfereLiveout(const BasicBlock *bb, Register r0, Register r1) const;
+ /*! check whether two registers which are both in livein set interfering in the current BB. */
+ bool interfereLivein(const BasicBlock *bb, Register r0, Register r1) const;
private:
UDGraph udGraph; //!< All the UD chains
DUGraph duGraph; //!< All the DU chains
diff --git a/backend/src/libocl/Android.mk b/backend/src/libocl/Android.mk
new file mode 100644
index 0000000..8e45c12
--- /dev/null
+++ b/backend/src/libocl/Android.mk
@@ -0,0 +1,89 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libgbe
+LOCAL_MODULE_TAGS := optional
+LOCAL_MODULE_CLASS := SHARED_LIBRARIES
+
+generated_sources := $(call local-generated-sources-dir)/libocl
+
+$(shell mkdir -p ${generated_sources}/include/)
+$(shell mkdir -p ${generated_sources}/src/)
+#$(shell echo "cat $(LOCAL_PATH)/tmpl/ocl_defines.tmpl.h \\> ${LIBOCL_BINARY_DIR}/include/ocl_defines.h")
+$(shell cat $(LOCAL_PATH)/tmpl/ocl_defines.tmpl.h > ${generated_sources}/include/ocl_defines.h)
+#$(shell echo "cat $(LOCAL_PATH)/../ocl_common_defines.h \\>\\> ${LIBOCL_BINARY_DIR}/include/ocl_defines.h")
+$(shell cat ${LOCAL_PATH}/../ocl_common_defines.h >> ${generated_sources}/include/ocl_defines.h)
+$(shell echo "Generate the header: ${generated_sources}/include/ocl_defines.h")
+
+define COPY_THE_HEADER
+ # Copy the header file into the generated include directory.
+ $(shell cp ${LOCAL_PATH}/include/$(1).h ${generated_sources}/include/$(1).h)
+endef
+define COPY_THE_SOURCE
+ # Copy the source file into the generated src directory.
+ $(shell cp ${LOCAL_PATH}/src/$(1).cl ${generated_sources}/src/$(1).cl)
+endef
+
+OCL_COPY_MODULES := ocl ocl_types ocl_float ocl_printf
+$(foreach _M_, ${OCL_COPY_MODULES}, $(eval $(call COPY_THE_HEADER,$(_M_))))
+
+OCL_COPY_MODULES := ocl_workitem ocl_atom ocl_async ocl_sync ocl_memcpy ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image ocl_work_group
+OCL_SOURCE_FILES := $(OCL_COPY_MODULES)
+$(foreach _M_, ${OCL_COPY_MODULES}, $(eval $(call COPY_THE_HEADER,$(_M_))))
+$(foreach _M_, ${OCL_COPY_MODULES}, $(eval $(call COPY_THE_SOURCE,$(_M_))))
+
+define GENERATE_HEADER_PY
+ # Use the python script to generate the header files.
+ $(shell cat ${LOCAL_PATH}/tmpl/$(1).tmpl.h > ${generated_sources}/include/$(1).h)
+ $(shell /usr/bin/python ${LOCAL_PATH}/script/gen_vector.py ${LOCAL_PATH}/script/$(1).def ${generated_sources}/include/$(1).h 1)
+ $(shell echo "#endif" >> ${generated_sources}/include/$(1).h)
+endef
+define GENERATE_SOURCE_PY
+ # Use the python script to generate the source file.
+ $(shell cat ${LOCAL_PATH}/tmpl/$(1).tmpl.cl > ${generated_sources}/src/$(1).cl)
+ $(shell /usr/bin/python ${LOCAL_PATH}/script/gen_vector.py ${LOCAL_PATH}/script/$(1).def ${generated_sources}/src/$(1).cl 0)
+endef
+
+OCL_COPY_MODULES_PY := ocl_common ocl_relational ocl_integer ocl_math ocl_simd
+OCL_SOURCE_FILES += $(OCL_COPY_MODULES_PY)
+$(foreach _M_, ${OCL_COPY_MODULES_PY}, $(eval $(call GENERATE_HEADER_PY,$(_M_))))
+$(foreach _M_, ${OCL_COPY_MODULES_PY}, $(eval $(call GENERATE_SOURCE_PY,$(_M_))))
+
+define GENERATE_HEADER_BASH
+ # Use the shell script to generate the header file.
+ $(shell ${LOCAL_PATH}/script/$(1).sh -p > ${generated_sources}/include/$(1).h)
+endef
+define GENERATE_SOURCE_BASH
+ # Use the shell script to generate the source file.
+ $(shell ${LOCAL_PATH}/script/$(1).sh > ${generated_sources}/src/$(1).cl)
+endef
+OCL_COPY_MODULES_SH := ocl_as ocl_convert
+OCL_SOURCE_FILES += $(OCL_COPY_MODULES_SH)
+$(foreach _M_, ${OCL_COPY_MODULES_SH}, $(eval $(call GENERATE_HEADER_BASH,$(_M_))))
+$(foreach _M_, ${OCL_COPY_MODULES_SH}, $(eval $(call GENERATE_SOURCE_BASH,$(_M_))))
+
+CLANG_OCL_FLAGS := -fno-builtin -ffp-contract=off -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2"
+define ADD_CL_TO_BC_TARGET
+ # Compile the OpenCL source file to an LLVM bitcode file with clang.
+ $(shell $(HOST_OUT)/bin/clang -cc1 ${CLANG_OCL_FLAGS} -I ${generated_sources}/include/ -emit-llvm-bc -triple spir -o ${generated_sources}/$(1).bc -x cl ${generated_sources}/src/$(1).cl)
+endef
+$(foreach _M_, ${OCL_SOURCE_FILES}, $(eval $(call ADD_CL_TO_BC_TARGET,$(_M_))))
+
+define COPY_THE_LL
+ # Copy the LLVM assembly file into the generated src directory.
+ $(shell cp ${LOCAL_PATH}/src/$(1).ll ${generated_sources}/src/$(1).ll)
+endef
+define ADD_LL_TO_BC_TARGET
+ # Assemble the LLVM assembly file to a bitcode file with llvm-as.
+ $(shell $(HOST_OUT)/bin/llvm-as -o ${generated_sources}/$(1).bc ${generated_sources}/src/$(1).ll)
+endef
+OCL_LL_MODULES := ocl_barrier ocl_clz
+OCL_SOURCE_FILES += $(OCL_LL_MODULES)
+$(foreach _M_, ${OCL_LL_MODULES}, $(eval $(call COPY_THE_LL,$(_M_))))
+$(foreach _M_, ${OCL_LL_MODULES}, $(eval $(call ADD_LL_TO_BC_TARGET,$(_M_))))
+
+$(shell $(HOST_OUT)/bin/llvm-link -o ${generated_sources}/../beignet.bc $(addprefix ${generated_sources}/, $(addsuffix .bc, ${OCL_SOURCE_FILES})))
+
+$(shell $(HOST_OUT)/bin/clang -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${generated_sources}/include/ --relocatable-pch -emit-pch -isysroot ${generated_sources} -x cl ${generated_sources}/include/ocl.h -o ${generated_sources}/../beignet.pch)
+
+
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 0fffd9b..1d1ec68 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -53,7 +53,7 @@ FOREACH(M ${OCL_COPY_HEADERS})
ENDFOREACH(M)
SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_memcpy
- ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image)
+ ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image ocl_work_group)
FOREACH(M ${OCL_COPY_MODULES})
COPY_THE_HEADER(${M})
COPY_THE_SOURCE(${M})
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 7897567..abb2bd4 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -31,6 +31,7 @@
#include "ocl_integer.h"
#include "ocl_math.h"
#include "ocl_memcpy.h"
+#include "ocl_memset.h"
#include "ocl_misc.h"
#include "ocl_printf.h"
#include "ocl_relational.h"
@@ -38,6 +39,7 @@
#include "ocl_vload.h"
#include "ocl_workitem.h"
#include "ocl_simd.h"
+#include "ocl_work_group.h"
#pragma OPENCL EXTENSION cl_khr_fp64 : disable
#pragma OPENCL EXTENSION cl_khr_fp16 : disable
#endif
diff --git a/backend/src/libocl/include/ocl_float.h b/backend/src/libocl/include/ocl_float.h
index e63eaf9..6be6c7c 100644
--- a/backend/src/libocl/include/ocl_float.h
+++ b/backend/src/libocl/include/ocl_float.h
@@ -81,6 +81,7 @@ INLINE_OVERLOADABLE int __ocl_finitef (float x){
#define M_E_F 2.718281828459045F
#define M_LOG2E_F 1.4426950408889634F
#define M_LOG10E_F 0.43429448190325176F
+#define M_LOG210_F 3.3219280948873626F
#define M_LN2_F 0.6931471805599453F
#define M_LN10_F 2.302585092994046F
#define M_PI_F 3.141592653589793F
diff --git a/backend/src/libocl/include/ocl_geometric.h b/backend/src/libocl/include/ocl_geometric.h
index 86d543b..1713f8f 100644
--- a/backend/src/libocl/include/ocl_geometric.h
+++ b/backend/src/libocl/include/ocl_geometric.h
@@ -24,6 +24,10 @@ OVERLOADABLE float dot(float p0, float p1);
OVERLOADABLE float dot(float2 p0, float2 p1);
OVERLOADABLE float dot(float3 p0, float3 p1);
OVERLOADABLE float dot(float4 p0, float4 p1);
+OVERLOADABLE half dot(half p0, half p1);
+OVERLOADABLE half dot(half2 p0, half2 p1);
+OVERLOADABLE half dot(half3 p0, half3 p1);
+OVERLOADABLE half dot(half4 p0, half4 p1);
OVERLOADABLE float length(float x);
OVERLOADABLE float length(float2 x);
OVERLOADABLE float length(float3 x);
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 359025b..7d4abab 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -136,5 +136,20 @@ struct time_stamp {
uint event;
};
+uint __gen_ocl_region(ushort offset, uint data);
+
struct time_stamp __gen_ocl_get_timestamp(void);
+
+uint8 __gen_ocl_vme(image2d_t, image2d_t,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ int, int, int);
#endif
diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h
index 18090d5..312928e 100644
--- a/backend/src/libocl/include/ocl_sync.h
+++ b/backend/src/libocl/include/ocl_sync.h
@@ -28,8 +28,9 @@
typedef uint cl_mem_fence_flags;
OVERLOADABLE void barrier(cl_mem_fence_flags flags);
-void mem_fence(cl_mem_fence_flags flags);
-void read_mem_fence(cl_mem_fence_flags flags);
-void write_mem_fence(cl_mem_fence_flags flags);
+OVERLOADABLE void debugwait(void);
+OVERLOADABLE void mem_fence(cl_mem_fence_flags flags);
+OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags);
+OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags);
#endif /* __OCL_SYNC_H__ */
diff --git a/backend/src/libocl/include/ocl_vload.h b/backend/src/libocl/include/ocl_vload.h
index b1b1a32..c26f640 100644
--- a/backend/src/libocl/include/ocl_vload.h
+++ b/backend/src/libocl/include/ocl_vload.h
@@ -88,6 +88,7 @@ DECL_BYTE_RW_ALL(char)
DECL_BYTE_RW_ALL(uchar)
DECL_BYTE_RW_ALL(short)
DECL_BYTE_RW_ALL(ushort)
+DECL_BYTE_RW_ALL(half)
DECL_UNTYPED_RW_ALL(int)
DECL_UNTYPED_RW_ALL(uint)
DECL_UNTYPED_RW_ALL(long)
@@ -109,12 +110,17 @@ DECL_UNTYPED_RW_ALL(double)
#define DECL_HALF_LD_SPACE(SPACE) \
OVERLOADABLE float vload_half(size_t offset, const SPACE half *p); \
+OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p); \
OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p); \
+OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p); \
OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p); \
OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p); \
OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p); \
+OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p); \
OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p); \
-OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p);
+OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p); \
+OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p); \
+OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p); \
#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p); \
@@ -151,10 +157,4 @@ DECL_HALF_ST_SPACE(__private)
#undef DECL_HALF_ST_SPACE
#undef DECL_HALF_ST_SPACE_ROUND
-#define vloada_half vload_half
-#define vloada_half2 vload_half2
-#define vloada_half4 vload_half4
-#define vloada_half8 vload_half8
-#define vloada_half16 vload_half16
-
#endif /* __OCL_VLOAD_H__ */
diff --git a/backend/src/libocl/include/ocl_work_group.h b/backend/src/libocl/include/ocl_work_group.h
new file mode 100644
index 0000000..ebd264f
--- /dev/null
+++ b/backend/src/libocl/include/ocl_work_group.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_WORK_GROUP_H__
+#define __OCL_WORK_GROUP_H__
+#include "ocl_types.h"
+
+int work_group_all(int predicate);
+int work_group_any(int predicate);
+
+/* broadcast */
+OVERLOADABLE int work_group_broadcast(int a, size_t local_id);
+OVERLOADABLE uint work_group_broadcast(uint a, size_t local_id);
+OVERLOADABLE long work_group_broadcast(long a, size_t local_id);
+OVERLOADABLE ulong work_group_broadcast(ulong a, size_t local_id);
+OVERLOADABLE float work_group_broadcast(float a, size_t local_id);
+OVERLOADABLE double work_group_broadcast(double a, size_t local_id);
+
+OVERLOADABLE int work_group_broadcast(int a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE uint work_group_broadcast(uint a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE long work_group_broadcast(long a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE ulong work_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE float work_group_broadcast(float a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE double work_group_broadcast(double a, size_t local_id_x, size_t local_id_y);
+
+OVERLOADABLE int work_group_broadcast(int a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE uint work_group_broadcast(uint a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE long work_group_broadcast(long a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE ulong work_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE float work_group_broadcast(float a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE double work_group_broadcast(double a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+
+/* reduce add */
+OVERLOADABLE int work_group_reduce_add(int x);
+OVERLOADABLE uint work_group_reduce_add(uint x);
+OVERLOADABLE long work_group_reduce_add(long x);
+OVERLOADABLE ulong work_group_reduce_add(ulong x);
+OVERLOADABLE float work_group_reduce_add(float x);
+OVERLOADABLE double work_group_reduce_add(double x);
+
+/* reduce min */
+OVERLOADABLE int work_group_reduce_min(int x);
+OVERLOADABLE uint work_group_reduce_min(uint x);
+OVERLOADABLE long work_group_reduce_min(long x);
+OVERLOADABLE ulong work_group_reduce_min(ulong x);
+OVERLOADABLE float work_group_reduce_min(float x);
+OVERLOADABLE double work_group_reduce_min(double x);
+
+/* reduce max */
+OVERLOADABLE int work_group_reduce_max(int x);
+OVERLOADABLE uint work_group_reduce_max(uint x);
+OVERLOADABLE long work_group_reduce_max(long x);
+OVERLOADABLE ulong work_group_reduce_max(ulong x);
+OVERLOADABLE float work_group_reduce_max(float x);
+OVERLOADABLE double work_group_reduce_max(double x);
+
+/* scan_inclusive add */
+OVERLOADABLE int work_group_scan_inclusive_add(int x);
+OVERLOADABLE uint work_group_scan_inclusive_add(uint x);
+OVERLOADABLE long work_group_scan_inclusive_add(long x);
+OVERLOADABLE ulong work_group_scan_inclusive_add(ulong x);
+OVERLOADABLE float work_group_scan_inclusive_add(float x);
+OVERLOADABLE double work_group_scan_inclusive_add(double x);
+
+/* scan_inclusive min */
+OVERLOADABLE int work_group_scan_inclusive_min(int x);
+OVERLOADABLE uint work_group_scan_inclusive_min(uint x);
+OVERLOADABLE long work_group_scan_inclusive_min(long x);
+OVERLOADABLE ulong work_group_scan_inclusive_min(ulong x);
+OVERLOADABLE float work_group_scan_inclusive_min(float x);
+OVERLOADABLE double work_group_scan_inclusive_min(double x);
+
+/* scan_inclusive max */
+OVERLOADABLE int work_group_scan_inclusive_max(int x);
+OVERLOADABLE uint work_group_scan_inclusive_max(uint x);
+OVERLOADABLE long work_group_scan_inclusive_max(long x);
+OVERLOADABLE ulong work_group_scan_inclusive_max(ulong x);
+OVERLOADABLE float work_group_scan_inclusive_max(float x);
+OVERLOADABLE double work_group_scan_inclusive_max(double x);
+
+/* scan_exclusive add */
+OVERLOADABLE int work_group_scan_exclusive_add(int x);
+OVERLOADABLE uint work_group_scan_exclusive_add(uint x);
+OVERLOADABLE long work_group_scan_exclusive_add(long x);
+OVERLOADABLE ulong work_group_scan_exclusive_add(ulong x);
+OVERLOADABLE float work_group_scan_exclusive_add(float x);
+OVERLOADABLE double work_group_scan_exclusive_add(double x);
+
+/* scan_exclusive min */
+OVERLOADABLE int work_group_scan_exclusive_min(int x);
+OVERLOADABLE uint work_group_scan_exclusive_min(uint x);
+OVERLOADABLE long work_group_scan_exclusive_min(long x);
+OVERLOADABLE ulong work_group_scan_exclusive_min(ulong x);
+OVERLOADABLE float work_group_scan_exclusive_min(float x);
+OVERLOADABLE double work_group_scan_exclusive_min(double x);
+
+/* scan_exclusive max */
+OVERLOADABLE int work_group_scan_exclusive_max(int x);
+OVERLOADABLE uint work_group_scan_exclusive_max(uint x);
+OVERLOADABLE long work_group_scan_exclusive_max(long x);
+OVERLOADABLE ulong work_group_scan_exclusive_max(ulong x);
+OVERLOADABLE float work_group_scan_exclusive_max(float x);
+OVERLOADABLE double work_group_scan_exclusive_max(double x);
+#endif /* __OCL_WORK_GROUP_H__ */
diff --git a/backend/src/libocl/include/ocl_workitem.h b/backend/src/libocl/include/ocl_workitem.h
index 84bb1fb..c3b0bdb 100644
--- a/backend/src/libocl/include/ocl_workitem.h
+++ b/backend/src/libocl/include/ocl_workitem.h
@@ -24,9 +24,12 @@ OVERLOADABLE uint get_work_dim(void);
OVERLOADABLE uint get_global_size(uint dimindx);
OVERLOADABLE uint get_global_id(uint dimindx);
OVERLOADABLE uint get_local_size(uint dimindx);
+OVERLOADABLE uint get_enqueued_local_size(uint dimindx);
OVERLOADABLE uint get_local_id(uint dimindx);
OVERLOADABLE uint get_num_groups(uint dimindx);
OVERLOADABLE uint get_group_id(uint dimindx);
OVERLOADABLE uint get_global_offset(uint dimindx);
+OVERLOADABLE uint get_global_linear_id(void);
+OVERLOADABLE uint get_local_linear_id(void);
#endif /* __OCL_WORKITEM_H__ */
diff --git a/backend/src/libocl/script/gen_vector.py b/backend/src/libocl/script/gen_vector.py
index cb562a2..10e8634 100755
--- a/backend/src/libocl/script/gen_vector.py
+++ b/backend/src/libocl/script/gen_vector.py
@@ -327,9 +327,9 @@ class builtinProto():
formatStr += '({0} {1} *)param{2} + {3:2d}'.format(ptype[2], ptype[0], n, j)
else:
if (self.functionName == 'select' and n == 2):
- formatStr += '({0})(param{1}.s{2:x} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
+ formatStr += '({0})(param{1}.s{2:X} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
else:
- formatStr += 'param{0}.s{1:x}'.format(n, j)
+ formatStr += 'param{0}.s{1:X}'.format(n, j)
formatStr += ')'
diff --git a/backend/src/libocl/script/ocl_math.def b/backend/src/libocl/script/ocl_math.def
index 9c65af7..b5c1e47 100644
--- a/backend/src/libocl/script/ocl_math.def
+++ b/backend/src/libocl/script/ocl_math.def
@@ -177,17 +177,17 @@ gentype native_tan (gentype x)
##half_native_math
-#gentype half_cos (gentype x)
-#gentype half_divide (gentype x, gentype y)
-#gentype half_exp (gentype x)
-#gentype half_exp2 (gentype x)
-#gentype half_exp10 (gentype x)
-#gentype half_log (gentype x)
-#gentype half_log2 (gentype x)
-#gentype half_log10 (gentype x)
-#gentype half_powr (gentype x, gentype y)
-#gentype half_recip (gentype x)
-#gentype half_rsqrt (gentype x)
-#gentype half_sin (gentype x)
-#gentype half_sqrt (gentype x)
-#gentype half_tan (gentype x)
+gentype half_cos (gentype x)
+gentype half_divide (gentype x, gentype y)
+gentype half_exp (gentype x)
+gentype half_exp2 (gentype x)
+gentype half_exp10 (gentype x)
+gentype half_log (gentype x)
+gentype half_log2 (gentype x)
+gentype half_log10 (gentype x)
+gentype half_powr (gentype x, gentype y)
+gentype half_recip (gentype x)
+gentype half_rsqrt (gentype x)
+gentype half_sin (gentype x)
+gentype half_sqrt (gentype x)
+gentype half_tan (gentype x)
diff --git a/backend/src/libocl/script/ocl_simd.def b/backend/src/libocl/script/ocl_simd.def
index e26243e..aa47735 100644
--- a/backend/src/libocl/script/ocl_simd.def
+++ b/backend/src/libocl/script/ocl_simd.def
@@ -2,3 +2,12 @@
floatn intel_sub_group_shuffle(floatn x, uint c)
intn intel_sub_group_shuffle(intn x, uint c)
uintn intel_sub_group_shuffle(uintn x, uint c)
+floatn intel_sub_group_shuffle_down(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_down(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_down(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_up(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_up(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_up(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_xor(floatn x, uint c)
+intn intel_sub_group_shuffle_xor(intn x, uint c)
+uintn intel_sub_group_shuffle_xor(uintn x, uint c)
diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
index 2765a71..9416f80 100644
--- a/backend/src/libocl/src/ocl_barrier.ll
+++ b/backend/src/libocl/src/ocl_barrier.ll
@@ -12,6 +12,7 @@ declare i32 @_get_global_mem_fence() nounwind alwaysinline
declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_debugwait() nounwind alwaysinline noduplicate
define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
%1 = icmp eq i32 %flags, 3
@@ -40,3 +41,8 @@ barrier_global:
done:
ret void
}
+
+define void @_Z9debugwaitv() nounwind noduplicate alwaysinline {
+ call void @__gen_ocl_debugwait()
+ ret void
+}
diff --git a/backend/src/libocl/src/ocl_geometric.cl b/backend/src/libocl/src/ocl_geometric.cl
index 886e88c..cf98503 100644
--- a/backend/src/libocl/src/ocl_geometric.cl
+++ b/backend/src/libocl/src/ocl_geometric.cl
@@ -35,6 +35,18 @@ OVERLOADABLE float dot(float3 p0, float3 p1) {
OVERLOADABLE float dot(float4 p0, float4 p1) {
return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z + p0.w * p1.w;
}
+OVERLOADABLE half dot(half p0, half p1) {
+ return p0 * p1;
+}
+OVERLOADABLE half dot(half2 p0, half2 p1) {
+ return p0.x * p1.x + p0.y * p1.y;
+}
+OVERLOADABLE half dot(half3 p0, half3 p1) {
+ return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z;
+}
+OVERLOADABLE half dot(half4 p0, half4 p1) {
+ return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z + p0.w * p1.w;
+}
OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
#define BODY \
diff --git a/backend/src/libocl/src/ocl_memset.cl b/backend/src/libocl/src/ocl_memset.cl
index b41851a..d8bc5df 100644
--- a/backend/src/libocl/src/ocl_memset.cl
+++ b/backend/src/libocl/src/ocl_memset.cl
@@ -21,7 +21,7 @@
void __gen_memset_ ##NAME## _align (DST_SPACE uchar* dst, uchar val, size_t size) { \
size_t index = 0; \
uint v = (val << 24) | (val << 16) | (val << 8) | val; \
- while((index + 4) >= size) { \
+ while((index + 4) <= size) { \
*((DST_SPACE uint *)(dst + index)) = v; \
index += 4; \
} \
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index 7f40054..94bf178 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -62,12 +62,12 @@
y.s7 = ((TYPE *) &x)[mask.s7 & (vec_step(x) - 1)]; \
y.s8 = ((TYPE *) &x)[mask.s8 & (vec_step(x) - 1)]; \
y.s9 = ((TYPE *) &x)[mask.s9 & (vec_step(x) - 1)]; \
- y.sa = ((TYPE *) &x)[mask.sa & (vec_step(x) - 1)]; \
- y.sb = ((TYPE *) &x)[mask.sb & (vec_step(x) - 1)]; \
- y.sc = ((TYPE *) &x)[mask.sc & (vec_step(x) - 1)]; \
- y.sd = ((TYPE *) &x)[mask.sd & (vec_step(x) - 1)]; \
- y.se = ((TYPE *) &x)[mask.se & (vec_step(x) - 1)]; \
- y.sf = ((TYPE *) &x)[mask.sf & (vec_step(x) - 1)]; \
+ y.sA = ((TYPE *) &x)[mask.sA & (vec_step(x) - 1)]; \
+ y.sB = ((TYPE *) &x)[mask.sB & (vec_step(x) - 1)]; \
+ y.sC = ((TYPE *) &x)[mask.sC & (vec_step(x) - 1)]; \
+ y.sD = ((TYPE *) &x)[mask.sD & (vec_step(x) - 1)]; \
+ y.sE = ((TYPE *) &x)[mask.sE & (vec_step(x) - 1)]; \
+ y.sF = ((TYPE *) &x)[mask.sF & (vec_step(x) - 1)]; \
return y; \
}
@@ -164,12 +164,12 @@ DEF(ulong)
z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
z.s8 = mask.s8 < 16 ? ((TYPE *)&x)[mask.s8] : ((TYPE *)&y)[mask.s8 & 15]; \
z.s9 = mask.s9 < 16 ? ((TYPE *)&x)[mask.s9] : ((TYPE *)&y)[mask.s9 & 15]; \
- z.sa = mask.sa < 16 ? ((TYPE *)&x)[mask.sa] : ((TYPE *)&y)[mask.sa & 15]; \
- z.sb = mask.sb < 16 ? ((TYPE *)&x)[mask.sb] : ((TYPE *)&y)[mask.sb & 15]; \
- z.sc = mask.sc < 16 ? ((TYPE *)&x)[mask.sc] : ((TYPE *)&y)[mask.sc & 15]; \
- z.sd = mask.sd < 16 ? ((TYPE *)&x)[mask.sd] : ((TYPE *)&y)[mask.sd & 15]; \
- z.se = mask.se < 16 ? ((TYPE *)&x)[mask.se] : ((TYPE *)&y)[mask.se & 15]; \
- z.sf = mask.sf < 16 ? ((TYPE *)&x)[mask.sf] : ((TYPE *)&y)[mask.sf & 15]; \
+ z.sA = mask.sA < 16 ? ((TYPE *)&x)[mask.sA] : ((TYPE *)&y)[mask.sA & 15]; \
+ z.sB = mask.sB < 16 ? ((TYPE *)&x)[mask.sB] : ((TYPE *)&y)[mask.sB & 15]; \
+ z.sC = mask.sC < 16 ? ((TYPE *)&x)[mask.sC] : ((TYPE *)&y)[mask.sC & 15]; \
+ z.sD = mask.sD < 16 ? ((TYPE *)&x)[mask.sD] : ((TYPE *)&y)[mask.sD & 15]; \
+ z.sE = mask.sE < 16 ? ((TYPE *)&x)[mask.sE] : ((TYPE *)&y)[mask.sE & 15]; \
+ z.sF = mask.sF < 16 ? ((TYPE *)&x)[mask.sF] : ((TYPE *)&y)[mask.sF & 15]; \
return z; \
}
diff --git a/backend/src/libocl/src/ocl_sync.cl b/backend/src/libocl/src/ocl_sync.cl
index d008639..b6efef8 100644
--- a/backend/src/libocl/src/ocl_sync.cl
+++ b/backend/src/libocl/src/ocl_sync.cl
@@ -20,12 +20,13 @@
void __gen_ocl_barrier_local(void);
void __gen_ocl_barrier_global(void);
void __gen_ocl_barrier_local_and_global(void);
+void __gen_ocl_debugwait(void);
-void mem_fence(cl_mem_fence_flags flags) {
+OVERLOADABLE void mem_fence(cl_mem_fence_flags flags) {
}
-void read_mem_fence(cl_mem_fence_flags flags) {
+OVERLOADABLE void read_mem_fence(cl_mem_fence_flags flags) {
}
-void write_mem_fence(cl_mem_fence_flags flags) {
+OVERLOADABLE void write_mem_fence(cl_mem_fence_flags flags) {
}
diff --git a/backend/src/libocl/src/ocl_vload.cl b/backend/src/libocl/src/ocl_vload.cl
index fa5e04f..bac0ed7 100644
--- a/backend/src/libocl/src/ocl_vload.cl
+++ b/backend/src/libocl/src/ocl_vload.cl
@@ -120,6 +120,7 @@ OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
DECL_BYTE_WR_SPACE(TYPE, __private)
DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(half)
DECL_BYTE_RW_ALL(uchar)
DECL_BYTE_RW_ALL(short)
DECL_BYTE_RW_ALL(ushort)
@@ -179,10 +180,17 @@ OVERLOADABLE short f32to16_rtz(float f) {
OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
} \
+OVERLOADABLE float vloada_half(size_t offset, const SPACE half *p) { \
+ return vload_half(offset, p); \
+} \
OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
return (float2)(vload_half(offset*2, p), \
vload_half(offset*2 + 1, p)); \
} \
+OVERLOADABLE float2 vloada_half2(size_t offset, const SPACE half *p) { \
+ return (float2)(vloada_half(offset*2, p), \
+ vloada_half(offset*2 + 1, p)); \
+} \
OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
return (float3)(vload_half(offset*3, p), \
vload_half(offset*3 + 1, p), \
@@ -197,14 +205,26 @@ OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
return (float4)(vload_half2(offset*2, p), \
vload_half2(offset*2 + 1, p)); \
} \
+OVERLOADABLE float4 vloada_half4(size_t offset, const SPACE half *p) { \
+ return (float4)(vloada_half2(offset*2, p), \
+ vloada_half2(offset*2 + 1, p)); \
+} \
OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
return (float8)(vload_half4(offset*2, p), \
vload_half4(offset*2 + 1, p)); \
} \
+OVERLOADABLE float8 vloada_half8(size_t offset, const SPACE half *p) { \
+ return (float8)(vloada_half4(offset*2, p), \
+ vloada_half4(offset*2 + 1, p)); \
+} \
OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
return (float16)(vload_half8(offset*2, p), \
vload_half8(offset*2 + 1, p)); \
-}
+}\
+OVERLOADABLE float16 vloada_half16(size_t offset, const SPACE half *p) { \
+ return (float16)(vloada_half8(offset*2, p), \
+ vloada_half8(offset*2 + 1, p)); \
+}\
#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
diff --git a/backend/src/libocl/src/ocl_work_group.cl b/backend/src/libocl/src/ocl_work_group.cl
new file mode 100644
index 0000000..2c43d6d
--- /dev/null
+++ b/backend/src/libocl/src/ocl_work_group.cl
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_work_group.h"
+
+int __gen_ocl_work_group_all(int);
+int work_group_all(int predicate) {
+ return __gen_ocl_work_group_all(predicate);
+}
+
+int __gen_ocl_work_group_any(int);
+int work_group_any(int predicate) {
+ return __gen_ocl_work_group_any(predicate);
+}
+
+/* broadcast */
+#define BROADCAST_IMPL(GEN_TYPE) \
+ OVERLOADABLE GEN_TYPE __gen_ocl_work_group_broadcast(GEN_TYPE a, size_t local_id); \
+ OVERLOADABLE GEN_TYPE work_group_broadcast(GEN_TYPE a, size_t local_id) { \
+ return __gen_ocl_work_group_broadcast(a, local_id); \
+ } \
+ OVERLOADABLE GEN_TYPE __gen_ocl_work_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y); \
+ OVERLOADABLE GEN_TYPE work_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y) { \
+ return __gen_ocl_work_group_broadcast(a, local_id_x, local_id_y); \
+ } \
+ OVERLOADABLE GEN_TYPE __gen_ocl_work_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z); \
+ OVERLOADABLE GEN_TYPE work_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z) { \
+ return __gen_ocl_work_group_broadcast(a, local_id_x, local_id_y, local_id_z); \
+ }
+
+BROADCAST_IMPL(int)
+BROADCAST_IMPL(uint)
+BROADCAST_IMPL(long)
+BROADCAST_IMPL(ulong)
+BROADCAST_IMPL(float)
+BROADCAST_IMPL(double)
+#undef BROADCAST_IMPL
+
+
+#define RANGE_OP(RANGE, OP, GEN_TYPE, SIGN) \
+ OVERLOADABLE GEN_TYPE __gen_ocl_work_group_##RANGE##_##OP(bool sign, GEN_TYPE x); \
+ OVERLOADABLE GEN_TYPE work_group_##RANGE##_##OP(GEN_TYPE x) { \
+ return __gen_ocl_work_group_##RANGE##_##OP(SIGN, x); \
+ }
+
+/* reduce add */
+RANGE_OP(reduce, add, int, true)
+RANGE_OP(reduce, add, uint, false)
+RANGE_OP(reduce, add, long, true)
+RANGE_OP(reduce, add, ulong, false)
+RANGE_OP(reduce, add, float, true)
+RANGE_OP(reduce, add, double, true)
+/* reduce min */
+RANGE_OP(reduce, min, int, true)
+RANGE_OP(reduce, min, uint, false)
+RANGE_OP(reduce, min, long, true)
+RANGE_OP(reduce, min, ulong, false)
+RANGE_OP(reduce, min, float, true)
+RANGE_OP(reduce, min, double, true)
+/* reduce max */
+RANGE_OP(reduce, max, int, true)
+RANGE_OP(reduce, max, uint, false)
+RANGE_OP(reduce, max, long, true)
+RANGE_OP(reduce, max, ulong, false)
+RANGE_OP(reduce, max, float, true)
+RANGE_OP(reduce, max, double, true)
+
+/* scan_inclusive add */
+RANGE_OP(scan_inclusive, add, int, true)
+RANGE_OP(scan_inclusive, add, uint, false)
+RANGE_OP(scan_inclusive, add, long, true)
+RANGE_OP(scan_inclusive, add, ulong, false)
+RANGE_OP(scan_inclusive, add, float, true)
+RANGE_OP(scan_inclusive, add, double, true)
+/* scan_inclusive min */
+RANGE_OP(scan_inclusive, min, int, true)
+RANGE_OP(scan_inclusive, min, uint, false)
+RANGE_OP(scan_inclusive, min, long, true)
+RANGE_OP(scan_inclusive, min, ulong, false)
+RANGE_OP(scan_inclusive, min, float, true)
+RANGE_OP(scan_inclusive, min, double, true)
+/* scan_inclusive max */
+RANGE_OP(scan_inclusive, max, int, true)
+RANGE_OP(scan_inclusive, max, uint, false)
+RANGE_OP(scan_inclusive, max, long, true)
+RANGE_OP(scan_inclusive, max, ulong, false)
+RANGE_OP(scan_inclusive, max, float, true)
+RANGE_OP(scan_inclusive, max, double, true)
+
+/* scan_exclusive add */
+RANGE_OP(scan_exclusive, add, int, true)
+RANGE_OP(scan_exclusive, add, uint, false)
+RANGE_OP(scan_exclusive, add, long, true)
+RANGE_OP(scan_exclusive, add, ulong, false)
+RANGE_OP(scan_exclusive, add, float, true)
+RANGE_OP(scan_exclusive, add, double, true)
+/* scan_exclusive min */
+RANGE_OP(scan_exclusive, min, int, true)
+RANGE_OP(scan_exclusive, min, uint, false)
+RANGE_OP(scan_exclusive, min, long, true)
+RANGE_OP(scan_exclusive, min, ulong, false)
+RANGE_OP(scan_exclusive, min, float, true)
+RANGE_OP(scan_exclusive, min, double, true)
+/* scan_exclusive max */
+RANGE_OP(scan_exclusive, max, int, true)
+RANGE_OP(scan_exclusive, max, uint, false)
+RANGE_OP(scan_exclusive, max, long, true)
+RANGE_OP(scan_exclusive, max, ulong, false)
+RANGE_OP(scan_exclusive, max, float, true)
+RANGE_OP(scan_exclusive, max, double, true)
+
+#undef RANGE_OP
diff --git a/backend/src/libocl/src/ocl_workitem.cl b/backend/src/libocl/src/ocl_workitem.cl
index 6ddc406..235f12b 100644
--- a/backend/src/libocl/src/ocl_workitem.cl
+++ b/backend/src/libocl/src/ocl_workitem.cl
@@ -55,3 +55,33 @@ DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
OVERLOADABLE uint get_global_id(uint dim) {
return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
}
+
+OVERLOADABLE uint get_enqueued_local_size (uint dimindx)
+{
+  //TODO: should differ from get_local_size once non-uniform
+  //work-group sizes are supported
+ return get_local_size(dimindx);
+}
+
+OVERLOADABLE uint get_global_linear_id(void)
+{
+ uint dim = __gen_ocl_get_work_dim();
+ if (dim == 1) return get_global_id(0) - get_global_offset(0);
+ else if (dim == 2) return (get_global_id(1) - get_global_offset(1)) * get_global_size(0) +
+ get_global_id(0) -get_global_offset(0);
+ else if (dim == 3) return ((get_global_id(2) - get_global_offset(2)) *
+ get_global_size(1) * get_global_size(0)) +
+ ((get_global_id(1) - get_global_offset(1)) * get_global_size (0)) +
+ (get_global_id(0) - get_global_offset(0));
+ else return 0;
+}
+
+OVERLOADABLE uint get_local_linear_id(void)
+{
+ uint dim = __gen_ocl_get_work_dim();
+ if (dim == 1) return get_local_id(0);
+ else if (dim == 2) return get_local_id(1) * get_local_size (0) + get_local_id(0);
+ else if (dim == 3) return (get_local_id(2) * get_local_size(1) * get_local_size(0)) +
+ (get_local_id(1) * get_local_size(0)) + get_local_id(0);
+ else return 0;
+}
diff --git a/backend/src/libocl/tmpl/ocl_common.tmpl.cl b/backend/src/libocl/tmpl/ocl_common.tmpl.cl
index b6b09b5..0b6a8fb 100644
--- a/backend/src/libocl/tmpl/ocl_common.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_common.tmpl.cl
@@ -24,6 +24,7 @@
/////////////////////////////////////////////////////////////////////////////
PURE CONST OVERLOADABLE float __gen_ocl_fmax(float a, float b);
PURE CONST OVERLOADABLE float __gen_ocl_fmin(float a, float b);
+PURE CONST OVERLOADABLE float __gen_ocl_lrp(float a, float b, float c);
OVERLOADABLE float step(float edge, float x) {
return x < edge ? 0.0 : 1.0;
@@ -36,7 +37,7 @@ OVERLOADABLE float min(float a, float b) {
return __gen_ocl_fmin(a, b);
}
OVERLOADABLE float mix(float x, float y, float a) {
- return x + (y-x)*a;
+  return __gen_ocl_lrp(a,y,x); //Note: lrp takes its arguments in a different order than mix
}
OVERLOADABLE float clamp(float v, float l, float u) {
return max(min(v, u), l);
diff --git a/backend/src/libocl/tmpl/ocl_defines.tmpl.h b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
index 9c53093..8fb5d2b 100644
--- a/backend/src/libocl/tmpl/ocl_defines.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
@@ -36,5 +36,6 @@
#define cl_khr_gl_sharing
#define cl_khr_spir
#define cl_khr_fp16
+#define cl_khr_3d_image_writes
#endif /* end of __OCL_COMMON_DEF_H__ */
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 12408eb..7e7f4ae 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -139,8 +139,8 @@ DEF(long)
DEF(ulong)
#undef DEF
-OVERLOADABLE int mul24(int a, int b) { return ((a << 8) >> 8) * ((b << 8) >> 8); }
-OVERLOADABLE uint mul24(uint a, uint b) { return (a & 0xFFFFFF) * (b & 0xFFFFFF); }
+OVERLOADABLE int mul24(int a, int b) { return a*b; }
+OVERLOADABLE uint mul24(uint a, uint b) { return a*b; }
OVERLOADABLE int mad24(int a, int b, int c) { return mul24(a, b) + c; }
OVERLOADABLE uint mad24(uint a, uint b, uint c) { return mul24(a, b) + c; }
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index dc0363d..9f10713 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -57,7 +57,7 @@ OVERLOADABLE float native_tan(float x) {
}
OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
-OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+OVERLOADABLE float native_exp10(float x) { return __gen_ocl_exp(M_LOG210_F*x); }
OVERLOADABLE float native_divide(float x, float y) { return x/y; }
/* Fast path */
@@ -164,7 +164,7 @@ OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
return ux.f;
}
-OVERLOADABLE float __gen_ocl_internal_log(float x) {
+OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
/*
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
* ====================================================
@@ -178,192 +178,105 @@ OVERLOADABLE float __gen_ocl_internal_log(float x) {
*/
union { unsigned int i; float f; } u;
const float
- ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
- ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
- two25 = 3.355443200e+07, /* 0x4c000000 */
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ two25 = 3.355443200e+07, /* 0x4c000000 */
Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
Lg3 = 2.8571429849e-01, /* 3E924925 */
- Lg4 = 2.2222198546e-01, /* 3E638E29 */
- Lg5 = 1.8183572590e-01, /* 3E3A3325 */
- Lg6 = 1.5313838422e-01, /* 3E1CD04F */
- Lg7 = 1.4798198640e-01; /* 3E178897 */
+ Lg4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
- float hfsq,f,s,z,R,w,t1,t2,dk;
- int k,ix,i,j;
+ float fsq, f, s, z, R, w, t1, t2, partial;
+ int k, ix, i, j;
u.f = x; ix = u.i;
- k=0;
- if (ix < 0x00800000) { /* x < 2**-126 */
- if ((ix&0x7fffffff)==0)
- return -two25/zero; /* log(+-0)=-inf */
- if (ix<0) return (x-x)/zero; /* log(-#) = NaN */
- return -INFINITY; /* Gen does not support subnormal number now */
- //k -= 25; x *= two25; /* subnormal number, scale up x */
- //u.f = x; ix = u.i;
- }
- if (ix >= 0x7f800000) return x+x;
- k += (ix>>23)-127;
+ k = 0;
+
+ k += (ix>>23) - 127;
ix &= 0x007fffff;
- i = (ix+(0x95f64<<3))&0x800000;
- u.i = ix|(i^0x3f800000); x = u.f;
+ i = (ix + (0x95f64<<3)) & 0x800000;
+ u.i = ix | (i^0x3f800000); x = u.f;
k += (i>>23);
- f = x-(float)1.0;
- if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
- if(f==zero) {
- if(k==0) return zero;
- else {
- dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
- }
- }
- R = f*f*((float)0.5-(float)0.33333333333333333*f);
- if(k==0)
- return f-R;
- else {
- dk=(float)k; return dk*ln2_hi-((R-dk*ln2_lo)-f);
- }
+ f = x - 1.0f;
+ fsq = f * f;
+
+ if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
+ R = fsq * (0.5f - 0.33333333333333333f * f);
+ return k * ln2_hi + k * ln2_lo + f - R;
}
- s = f/((float)2.0+f);
- dk = (float)k;
- z = s*s;
- i = ix-(0x6147a<<3);
- w = z*z;
- j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*(Lg4+w*Lg6));
- t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+
+ s = f / (2.0f + f);
+ z = s * s;
+ i = ix - (0x6147a << 3);
+ w = z * z;
+ j = (0x6b851 << 3) - ix;
+ t1= w * mad(w, Lg4, Lg2);
+ t2= z * mad(w, Lg3, Lg1);
i |= j;
- R = t2+t1;
- if(i>0) {
- hfsq=(float)0.5*f*f;
- if(k==0) return f-(hfsq-s*(hfsq+R)); else
- return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
- } else {
- if(k==0) return f-s*(f-R); else
- return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
- }
+ R = t2 + t1;
+ partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
+
+ return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
}
+OVERLOADABLE float __gen_ocl_internal_log(float x)
+{
+ union { unsigned int i; float f; } u;
+ u.f = x;
+ int ix = u.i;
-OVERLOADABLE float __gen_ocl_internal_log10(float x) {
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
- union {float f; unsigned i; }u;
+ if (ix < 0 )
+ return NAN; /* log(-#) = NaN */
+ if (ix >= 0x7f800000)
+ return NAN;
+
+ return __gen_ocl_internal_log_valid(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_log10(float x)
+{
+ union { float f; unsigned i; } u;
const float
- zero = 0.0,
- two25 = 3.3554432000e+07, /* 0x4c000000 */
ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */
log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */
log10_2lo = 7.9034151668e-07; /* 0x355427db */
- float y,z;
- int i,k,hx;
+ float y, z;
+ int i, k, hx;
u.f = x; hx = u.i;
- k=0;
- if (hx < 0x00800000) { /* x < 2**-126 */
- if ((hx&0x7fffffff)==0)
- return -two25/zero; /* log(+-0)=-inf */
- if (hx<0) return NAN; /* log(-#) = NaN */
- return -INFINITY; /* Gen does not support subnormal now */
- //k -= 25; x *= two25; /* subnormal number, scale up x */
- //u.f = x; hx = u.i;
- }
- if (hx >= 0x7f800000) return x+x;
- k += (hx>>23)-127;
- i = ((unsigned)k&0x80000000)>>31;
- hx = (hx&0x007fffff)|((0x7f-i)<<23);
- y = (float)(k+i);
- u.i = hx; x = u.f;
- z = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
- return z+y*log10_2hi;
-}
+ if (hx<0)
+ return NAN; /* log(-#) = NaN */
+ if (hx >= 0x7f800000)
+ return NAN;
-OVERLOADABLE float __gen_ocl_internal_log2(float x) {
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
- const float zero = 0.0,
- ln2 = 0.69314718055994530942,
- two25 = 3.355443200e+07, /** 0x4c000000 */
- Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
- Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
- Lg3 = 2.8571429849e-01, /** 3E924925 */
- Lg4 = 2.2222198546e-01, /** 3E638E29 */
- Lg5 = 1.8183572590e-01, /** 3E3A3325 */
- Lg6 = 1.5313838422e-01, /** 3E1CD04F */
- Lg7 = 1.4798198640e-01; /** 3E178897 */
-
- float hfsq,f,s,z,R,w,t1,t2,dk;
- int k,ix,i,j;
-
- union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
- u.f = x; ix = u.i;
-
- k=0;
- if (ix < 0x00800000) { /** x < 2**-126 */
- if ((ix&0x7fffffff)==0)
- return -two25/(x-x); /** log(+-0)=-inf */
-
- if (ix<0) return (x-x)/(x-x); /** log(-#) = NaN */
- return -INFINITY;
- k -= 25; x *= two25; /** subnormal number, scale up x */
- u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
- }
-
- if (ix >= 0x7f800000) return x+x;
+ k = (hx >> 23) - 127;
+ i = ((unsigned)k & 0x80000000) >> 31;
+ hx = (hx&0x007fffff) | ((0x7f-i) << 23);
+ y = (float)(k + i);
+ u.i = hx; x = u.f;
- k += (ix>>23)-127;
- ix &= 0x007fffff;
- i = (ix+(0x95f64<<3))&0x800000;
+ return y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
+}
- u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000)); /** normalize x or x/2 */
- k += (i>>23);
- dk = (float)k;
- f = x-(float)1.0;
- if((0x007fffff&(15+ix))<16) { /** |f| < 2**-20 */
- if(f==zero) return dk;
+OVERLOADABLE float __gen_ocl_internal_log2(float x)
+{
+ const float zero = 0.0,
+ invln2 = 0x1.715476p+0f;
+ int ix;
- R = f*f*((float)0.5-(float)0.33333333333333333*f);
- return dk-(R-f)/ln2;
- }
+ union { float f; int i; } u;
+ u.f = x; ix = u.i;
- s = f/((float)2.0+f);
- z = s*s;
- i = ix-(0x6147a<<3);
- w = z*z;
- j = (0x6b851<<3)-ix;
- t1= w*(Lg2+w*(Lg4+w*Lg6));
- t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
- i |= j;
- R = t2+t1;
+ if (ix < 0)
+ return NAN; /** log(-#) = NaN */
+ if (ix >= 0x7f800000)
+ return NAN;
- if(i>0) {
- hfsq=(float)0.5*f*f;
- return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
- } else {
- return dk-((s*(f-R))-f)/ln2;
- }
+ return invln2 * __gen_ocl_internal_log_valid(x);
}
@@ -543,18 +456,16 @@ OVERLOADABLE float __kernel_sinf(float x)
{
/* copied from fdlibm */
const float
- half_value = 5.0000000000e-01,/* 0x3f000000 */
S1 = -1.6666667163e-01, /* 0xbe2aaaab */
S2 = 8.3333337680e-03, /* 0x3c088889 */
S3 = -1.9841270114e-04, /* 0xb9500d01 */
- S4 = 2.7557314297e-06, /* 0x3638ef1b */
- S5 = -2.5050759689e-08, /* 0xb2d72f34 */
- S6 = 1.5896910177e-10; /* 0x2f2ec9d3 */
+ S4 = 2.7557314297e-06; /* 0x3638ef1b */
float z,r,v;
z = x*x;
v = z*x;
- r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
- return x+v*(S1+z*r);
+ r = mad(z, mad(z, mad(z, S4, S3), S2), S1);
+
+ return mad(v, r, x);
}
float __kernel_cosf(float x, float y)
@@ -564,16 +475,14 @@ float __kernel_cosf(float x, float y)
one = 1.0000000000e+00, /* 0x3f800000 */
C1 = 4.1666667908e-02, /* 0x3d2aaaab */
C2 = -1.3888889225e-03, /* 0xbab60b61 */
- C3 = 2.4801587642e-05, /* 0x37d00d01 */
- C4 = -2.7557314297e-07, /* 0xb493f27c */
- C5 = 2.0875723372e-09, /* 0x310f74f6 */
- C6 = -1.1359647598e-11; /* 0xad47d74e */
+ C3 = 2.4801587642e-05; /* 0x37d00d01 */
float a,hz,z,r,qx;
int ix;
GEN_OCL_GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff; /* ix = |x|'s high word*/
z = x*x;
- r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+ r = z * mad(z, mad(z, C3, C2), C1);
+
if(ix < 0x3e99999a) /* if |x| < 0.3 */
return one - ((float)0.5*z - (z*r - x*y));
else {
@@ -584,24 +493,27 @@ float __kernel_cosf(float x, float y)
}
}
-OVERLOADABLE float sin(float x) {
+OVERLOADABLE float sin(float x)
+{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_sin(x);
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
float y,z=0.0;
int n, ix;
float negative = x < 0.0f? -1.0f : 1.0f;
- x = negative * x;
+ x = fabs(x);
GEN_OCL_GET_FLOAT_WORD(ix,x);
-
ix &= 0x7fffffff;
/* sin(Inf or NaN) is NaN */
- if (ix>=0x7f800000) return x-x;
+ if (ix >= 0x7f800000) return x-x;
- /* argument reduction needed */
+ if(x <= pio4)
+ return negative * __kernel_sinf(x);
+ /* argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,&y);
float s = __kernel_sinf(y);
@@ -611,10 +523,12 @@ OVERLOADABLE float sin(float x) {
}
}
-OVERLOADABLE float cos(float x) {
+OVERLOADABLE float cos(float x)
+{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_cos(x);
+ const float pio4 = 7.8539812565e-01; /* 0x3f490fda */
float y,z=0.0;
int n, ix;
x = __gen_ocl_fabs(x);
@@ -623,9 +537,11 @@ OVERLOADABLE float cos(float x) {
ix &= 0x7fffffff;
/* cos(Inf or NaN) is NaN */
- if (ix>=0x7f800000) return x-x;
+ if (ix >= 0x7f800000) return x-x;
- /* argument reduction needed */
+ if(x <= pio4)
+ return __kernel_cosf(x, 0.f);
+ /* argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,&y);
n &= 3;
@@ -662,12 +578,6 @@ float __kernel_tanf(float x, float y, int iy)
T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
T[6] = 1.4562094584e-03; /* 0x3abede48 */
T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
- T[8] = 2.4646313977e-04; /* 0x398137b9 */
- T[9] = 7.8179444245e-05; /* 0x38a3f445 */
- T[10] = 7.1407252108e-05; /* 0x3895c07a */
- T[11] = -1.8558637748e-05; /* 0xb79bae5f */
- T[12] = 2.5907305826e-05; /* 0x37d95384 */
-
GEN_OCL_GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff; /* high word of |x| */
@@ -679,22 +589,22 @@ float __kernel_tanf(float x, float y, int iy)
}
if(ix>=0x3f2ca140) { /* |x|>=0.6744 */
if(hx<0) {x = -x; y = -y;}
-
-
z = pio4-x;
w = pio4lo-y;
x = z+w; y = 0.0;
}
z = x*x;
w = z*z;
- /* Break x^5*(T[1]+x^2*T[2]+...) into
- * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
- * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
- */
- r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
- v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
+ /* Break x^5*(T[1]+x^2*T[2]+...) into
+ * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+ * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
+ */
+
+ r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]);
+ v = z* mad(w, mad(w, T[6], T[4]), T[2]);
+
s = z*x;
- r = y + z*(s*(r+v)+y);
+ r = mad(z, mad(s, r + v, y), y);
r += T[0]*s;
w = x+r;
if(ix>=0x3f2ca140) {
@@ -702,21 +612,8 @@ float __kernel_tanf(float x, float y, int iy)
return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
}
if(iy==1) return w;
- else { /* if allow error up to 2 ulp
- simply return -1.0/(x+r) here */
- /* compute -1.0/(x+r) accurately */
- float a,t;
- int i;
- z = w;
- GEN_OCL_GET_FLOAT_WORD(i,z);
- GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
- v = r-(z - x); /* z+v = r+x */
- t = a = -(float)1.0/w; /* a = -1.0/w */
- GEN_OCL_GET_FLOAT_WORD(i,t);
- GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
- s = (float)1.0+t*z;
- return t+a*(s+t*v);
- }
+ else
+ return -1.0/(x+r);
}
OVERLOADABLE float tan(float x)
@@ -937,44 +834,46 @@ OVERLOADABLE float lgamma(float x) {
switch (i) {
case 0:
z = y * y;
- p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
- p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
- p = y * p1 + p2;
+ p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
+ p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
+ p = mad(y, p1, p2);
r += (p - (float) 0.5 * y);
break;
case 1:
z = y * y;
w = z * y;
- p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
- p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
- p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
- p = z * p1 - (tt - w * (p2 + y * p3));
+ p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
+ p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
+ p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
+ p = mad(p1, z, mad(w, mad(y, p3, p2), -tt));
r += (tf + p);
break;
case 2:
- p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
- p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+ p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
+ p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one);
r += (-(float) 0.5 * y + p1 / p2);
}
} else if (ix < 0x41000000) {
i = (int) x;
t = zero;
y = x - (float) i;
- p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
- q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+
+ p =y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
+ q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one);
r = .5f * y + p / q;
z = one;
+
switch (i) {
case 7:
- z *= (y + (float) 6.0);
+ z *= (y + 6.0f);
case 6:
- z *= (y + (float) 5.0);
+ z *= (y + 5.0f);
case 5:
- z *= (y + (float) 4.0);
+ z *= (y + 4.0f);
case 4:
- z *= (y + (float) 3.0);
+ z *= (y + 3.0f);
case 3:
- z *= (y + (float) 2.0);
+ z *= (y + 2.0f);
r += native_log(z);
break;
}
@@ -983,7 +882,7 @@ OVERLOADABLE float lgamma(float x) {
t = native_log(x);
z = one / x;
y = z * z;
- w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+ w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
r = (x - .5f) * (t - one) + w;
} else
r = x * (native_log(x) - one);
@@ -1129,32 +1028,32 @@ OVERLOADABLE float lgamma(float x) {
switch (i) { \
case 0: \
z = y * y; \
- p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); \
- p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); \
- p = y * p1 + p2; \
- r += (p - (float) 0.5 * y); \
+ p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0); \
+ p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1); \
+ p = mad(y, p1, p2); \
+ r = r - mad(y, 0.5f, -p); \
break; \
case 1: \
z = y * y; \
w = z * y; \
- p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); \
- p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); \
- p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); \
- p = z * p1 - (tt - w * (p2 + y * p3)); \
+ p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0); \
+ p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1); \
+ p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2); \
+ p = z * p1 + mad(w, mad(y, p3, p2), -tt); \
r += (tf + p); \
break; \
case 2: \
- p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); \
- p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); \
- r += (-(float) 0.5 * y + p1 / p2); \
+ p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0); \
+ p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one); \
+ r = r + mad(y, -0.5f, p1 / p2); \
} \
} else if (ix < 0x41000000) { \
i = (int) x; \
t = zero; \
y = x - (float) i; \
- p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); \
- q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); \
- r = .5f * y + p / q; \
+ p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0); \
+ q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one); \
+ r = mad(y, 0.5f, p / q); \
z = one; \
switch (i) { \
case 7: \
@@ -1175,10 +1074,10 @@ OVERLOADABLE float lgamma(float x) {
t = native_log(x); \
z = one / x; \
y = z * z; \
- w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); \
+ w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0); \
r = (x - .5f) * (t - one) + w; \
} else \
- r = x * (native_log(x) - one); \
+ r = x * (native_log(x) - one); \
if (hx < 0) \
r = nadj - r; \
return r;
@@ -1208,10 +1107,7 @@ OVERLOADABLE float log1p(float x) {
Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
Lp3 = 2.8571429849e-01, /* 3E924925 */
- Lp4 = 2.2222198546e-01, /* 3E638E29 */
- Lp5 = 1.8183572590e-01, /* 3E3A3325 */
- Lp6 = 1.5313838422e-01, /* 3E1CD04F */
- Lp7 = 1.4798198640e-01; /* 3E178897 */
+ Lp4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
float hfsq,f,c,s,z,R,u;
int k,hx,hu,ax;
@@ -1262,20 +1158,26 @@ OVERLOADABLE float log1p(float x) {
f = u-(float)1.0;
}
hfsq=(float)0.5*f*f;
- if(hu==0) { /* |f| < 2**-20 */
- if(f==zero) { if(k==0) return zero;
- else {c += k*ln2_lo; return k*ln2_hi+c;} }
- R = hfsq*((float)1.0-(float)0.66666666666666666*f);
+ if(hu==0)
+ { /* |f| < 2**-20 */
+ if(f==zero)
+ {
+ if(k==0) return zero;
+ else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
+ }
+ R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
if(k==0) return f-R; else
- return k*ln2_hi-((R-(k*ln2_lo+c))-f);
+ return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
}
s = f/((float)2.0+f);
z = s*s;
- R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
- if(k==0) return f-(hfsq-s*(hfsq+R)); else
- return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
-
+ R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
+ if(k==0)
+ return f + mad(hfsq + R, s, -hfsq);
+ else
+ return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
}
+
OVERLOADABLE float logb(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_logb(x);
@@ -1387,14 +1289,14 @@ OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
/* new cbrt to 23 bits */
r=t*t/x;
- s=C+r*t;
+ s=mad(r, t, C);
t*=G+F/(s+E+D/s);
/* one step newton iteration to 53 bits with error less than 0.667 ulps */
s=t*t; /* t*t is exact */
r=x/s;
w=t+t;
r=(r-t)/(w+r); /* r-s is exact */
- t=t+t*r;
+ t=mad(t, r, t);
/* retore the sign bit */
GEN_OCL_GET_FLOAT_WORD(high,t);
@@ -1440,17 +1342,16 @@ INLINE float __gen_ocl_asin_util(float x) {
pS2 = 2.01212532134862925881e-01,
pS3 = -4.00555345006794114027e-02,
pS4 = 7.91534994289814532176e-04,
- pS5 = 3.47933107596021167570e-05,
qS1 = -2.40339491173441421878e+00,
qS2 = 2.02094576023350569471e+00,
qS3 = -6.88283971605453293030e-01,
qS4 = 7.70381505559019352791e-02;
float t = x*x;
- float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*(pS4+t*pS5)))));
- float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
+ float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1), pS0);
+ float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1), 1.0f);
float w = p / q;
- return x + x*w;
+ return mad(x, w, x);
}
OVERLOADABLE float __gen_ocl_internal_asin(float x) {
@@ -1512,10 +1413,6 @@ OVERLOADABLE float __gen_ocl_internal_atan(float x) {
aT[4] = 9.0908870101e-02; /* 0x3dba2e6e */
aT[5] = -7.6918758452e-02; /* 0xbd9d8795 */
aT[6] = 6.6610731184e-02; /* 0x3d886b35 */
- aT[7] = -5.8335702866e-02; /* 0xbd6ef16b */
- aT[8] = 4.9768779427e-02; /* 0x3d4bda59 */
- aT[9] = -3.6531571299e-02; /* 0xbd15a221 */
- aT[10] = 1.6285819933e-02; /* 0x3c8569d7 */
const float one = 1.0, huge = 1.0e30;
float w,s1,s2,z;
@@ -1552,8 +1449,8 @@ OVERLOADABLE float __gen_ocl_internal_atan(float x) {
z = x*x;
w = z*z;
/* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
- s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
- s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
+ s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]);
+ s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]);
if (id<0) return x - x*(s1+s2);
else {
z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
@@ -1666,12 +1563,6 @@ OVERLOADABLE float __gen_ocl_internal_rint(float x) {
}
OVERLOADABLE float __gen_ocl_internal_exp(float x) {
- //use native instruction when it has enough precision
- if (x > -0x1.6p1 && x < 0x1.6p1)
- {
- return native_exp(x);
- }
-
float o_threshold = 8.8721679688e+01, /* 0x42b17180 */
u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */
twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */
@@ -1679,10 +1570,7 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
one = 1.0,
huge = 1.0e+30,
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
- P2 = -2.7777778450e-03, /* 0xbb360b61 */
- P3 = 6.6137559770e-05, /* 0x388ab355 */
- P4 = -1.6533901999e-06, /* 0xb5ddea0e */
- P5 = 4.1381369442e-08; /* 0x3331bb4c */
+ P2 = -2.7777778450e-03; /* 0xbb360b61 */
float y,hi=0.0,lo=0.0,c,t;
int k=0,xsb;
unsigned hx;
@@ -1726,7 +1614,7 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
/* x is now in primary range */
t = x*x;
- c = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ c = x - t*(P1+t*P2);
if(k==0)
return one-((x*c)/(c-(float)2.0)-x);
else
@@ -1852,15 +1740,15 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
return x + efx*x;
}
z = x*x;
- r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
- s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
- y = r/s;
- return x + x*y;
+ r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+ s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1), one);
+ y = r / s;
+ return mad(x, y, x);
}
if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
s = __gen_ocl_internal_fabs(x)-one;
- P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
- Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
+ P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+ Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
if(hx>=0) return erx + P/Q; else return -erx - P/Q;
}
if (ix >= 0x40c00000) { /* inf>|x|>=6 */
@@ -1869,15 +1757,15 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
x = __gen_ocl_internal_fabs(x);
s = one/(x*x);
if(ix< 0x4036DB6E) { /* |x| < 1/0.35 */
- R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
- ra5+s*(ra6+s*ra7))))));
- S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
- sa5+s*(sa6+s*(sa7+s*sa8)))))));
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
} else { /* |x| >= 1/0.35 */
- R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
- rb5+s*rb6)))));
- S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
- sb5+s*(sb6+s*sb7))))));
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
}
GEN_OCL_GET_FLOAT_WORD(ix,x);
GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
@@ -1972,8 +1860,8 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
if(ix < 0x23800000) /* |x|<2**-56 */
return one-x;
z = x*x;
- r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
- s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
+ r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+ s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2), qq1), one);
y = r/s;
if(hx < 0x3e800000) { /* x<1/4 */
return one-(x+x*y);
@@ -1985,8 +1873,8 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
}
if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
s = __gen_ocl_internal_fabs(x)-one;
- P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
- Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
+ P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+ Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
if(hx>=0) {
z = one-erx; return z - P/Q;
} else {
@@ -1997,16 +1885,16 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */
x = __gen_ocl_internal_fabs(x);
s = one/(x*x);
if(ix< 0x4036DB6D) { /* |x| < 1/.35 ~ 2.857143*/
- R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
- ra5+s*(ra6+s*ra7))))));
- S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
- sa5+s*(sa6+s*(sa7+s*sa8)))))));
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
} else { /* |x| >= 1/.35 ~ 2.857143 */
if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
- R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
- rb5+s*rb6)))));
- S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
- sb5+s*(sb6+s*sb7))))));
+ R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+ S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+ sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
}
GEN_OCL_GET_FLOAT_WORD(ix,x);
GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
@@ -2107,9 +1995,6 @@ OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
- Q3 = -7.9365076090e-05, /* 0xb8a670cd */
- Q4 = 4.0082177293e-06, /* 0x36867e54 */
- Q5 = -2.0109921195e-07, /* 0xb457edbb */
huge = 1.0e30,
tiny = 1.0e-30,
ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
@@ -2166,7 +2051,7 @@ OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
/* x is now in primary range */
hfx = (float)0.5*x;
hxs = x*hfx;
- r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+ r1 = one+hxs*(Q1+hxs*Q2);
t = (float)3.0-r1*hfx;
e = hxs*((r1-t)/((float)6.0 - x*t));
if(k==0)
@@ -2250,7 +2135,7 @@ OVERLOADABLE float __gen_ocl_internal_asinh(float x){
} else {
float xa = __gen_ocl_internal_fabs(x);
if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
- w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
+ w = __gen_ocl_internal_log(mad(xa, 2.0f, one / (__gen_ocl_sqrt(mad(xa, xa, one)) + xa)));
} else { /* 2.0 > |x| > 2**-14 */
float t = xa*xa;
w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
@@ -2609,7 +2494,8 @@ OVERLOADABLE float ldexp(float x, int n) {
return __gen_ocl_internal_ldexp(x, n);
}
-CONST float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32");
+CONST OVERLOADABLE float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32");
+CONST OVERLOADABLE half __gen_ocl_mad(half a, half b, half c) __asm("llvm.fma" ".f16");
PURE CONST float __gen_ocl_fmax(float a, float b);
PURE CONST float __gen_ocl_fmin(float a, float b);
@@ -2652,6 +2538,10 @@ OVERLOADABLE float nextafter(float x, float y) {
hy = as_int(y);
ix = hx & 0x7fffffff;
iy = hy & 0x7fffffff;
+ if(ix == 0)
+ ix = hx & 0x7fffff;
+ if(iy == 0)
+ iy = hy & 0x7fffff;
if(ix>0x7f800000 || iy>0x7f800000)
return x+y;
if(hx == hy)
@@ -2745,15 +2635,8 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 6.0000002384e-01, /* 0x3f19999a */
L2 = 4.2857143283e-01, /* 0x3edb6db7 */
- L3 = 3.3333334327e-01, /* 0x3eaaaaab */
- L4 = 2.7272811532e-01, /* 0x3e8ba305 */
- L5 = 2.3066075146e-01, /* 0x3e6c3255 */
- L6 = 2.0697501302e-01, /* 0x3e53f142 */
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03, /* 0xbb360b61 */
- P3 = 6.6137559770e-05, /* 0x388ab355 */
- P4 = -1.6533901999e-06, /* 0xb5ddea0e */
- P5 = 4.1381369442e-08, /* 0x3331bb4c */
lg2 = 6.9314718246e-01, /* 0x3f317218 */
lg2_h = 6.93145752e-01, /* 0x3f317200 */
lg2_l = 1.42860654e-06, /* 0x35bfbe8c */
@@ -2881,7 +2764,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
/* compute log(ax) */
s2 = s*s;
- r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+ r = s2*s2*(L1+s2*L2);
r += s_l*(s_h+s);
s2 = s_h*s_h;
t_h = 3.0f+s2+r;
@@ -2946,7 +2829,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
z = u+v;
w = v-(z-u);
t = z*z;
- t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ t1 = z - t*(P1+t*P2);
r = (z*t1)/(t1-two)-(w+z*w);
z = one-(r-z);
GEN_OCL_GET_FLOAT_WORD(j,z);
@@ -3059,15 +2942,8 @@ float __gen_ocl_internal_pown(float x, int y) {
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 6.0000002384e-01, /* 0x3f19999a */
L2 = 4.2857143283e-01, /* 0x3edb6db7 */
- L3 = 3.3333334327e-01, /* 0x3eaaaaab */
- L4 = 2.7272811532e-01, /* 0x3e8ba305 */
- L5 = 2.3066075146e-01, /* 0x3e6c3255 */
- L6 = 2.0697501302e-01, /* 0x3e53f142 */
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03, /* 0xbb360b61 */
- P3 = 6.6137559770e-05, /* 0x388ab355 */
- P4 = -1.6533901999e-06, /* 0xb5ddea0e */
- P5 = 4.1381369442e-08, /* 0x3331bb4c */
lg2 = 6.9314718246e-01, /* 0x3f317218 */
lg2_h = 0x1.62ep-1,
lg2_l = 0x1.0bfbe8p-15,
@@ -3167,7 +3043,7 @@ float __gen_ocl_internal_pown(float x, int y) {
/* compute log(ax) */
s2 = s*s;
- r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+ r = s2*s2*(L1+s2*L2);
r += s_l*(s_h+s);
s2 = s_h*s_h;
t_h = (float)3.0+s2+r;
@@ -3239,7 +3115,7 @@ float __gen_ocl_internal_pown(float x, int y) {
z = u+v;
w = v-(z-u);
t = z*z;
- t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ t1 = z - t*(P1+t*P2);
r = (z*t1)/(t1-two)-(w+z*w);
z = one-(r-z);
GEN_OCL_GET_FLOAT_WORD(j,z);
@@ -3527,6 +3403,10 @@ OVERLOADABLE float log(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log(x);
+ /* Use native instruction when it has enough precision */
+ if((x > 0x1.1p0) || (x <= 0))
+ return __gen_ocl_internal_fastpath_log(x);
+
return __gen_ocl_internal_log(x);
}
@@ -3534,6 +3414,10 @@ OVERLOADABLE float log2(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log2(x);
+ /* Use native instruction when it has enough precision */
+ if((x > 0x1.1p0) || (x <= 0))
+ return __gen_ocl_internal_fastpath_log2(x);
+
return __gen_ocl_internal_log2(x);
}
@@ -3541,6 +3425,10 @@ OVERLOADABLE float log10(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log10(x);
+ /* Use native instruction when it has enough precision */
+ if((x > 0x1.1p0) || (x <= 0))
+ return __gen_ocl_internal_fastpath_log10(x);
+
return __gen_ocl_internal_log10(x);
}
@@ -3548,10 +3436,15 @@ OVERLOADABLE float exp(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_exp(x);
+ /* Use native instruction when it has enough precision */
+ if (x > -0x1.6p1 && x < 0x1.6p1)
+ return __gen_ocl_internal_fastpath_exp(x);
+
return __gen_ocl_internal_exp(x);
}
OVERLOADABLE float exp2(float x) {
+ /* Use native instruction when it has enough precision, exp2 always */
return native_exp2(x);
}
@@ -3608,6 +3501,48 @@ OVERLOADABLE half acos(half x) {
float _x = (float)x;
return (half)acos(_x);
}
+OVERLOADABLE float half_cos(float x) {
+ return (float)cos(x);
+}
+OVERLOADABLE float half_divide(float x, float y) {
+ return (float)native_divide(x, y);
+}
+OVERLOADABLE float half_exp(float x) {
+ return (float)native_exp(x);
+}
+OVERLOADABLE float half_exp2(float x){
+ return (float)native_exp2(x);
+}
+OVERLOADABLE float half_exp10(float x){
+ return (float)native_exp10(x);
+}
+OVERLOADABLE float half_log(float x){
+ return (float)native_log(x);
+}
+OVERLOADABLE float half_log2(float x){
+ return (float)native_log2(x);
+}
+OVERLOADABLE float half_log10(float x){
+ return (float)native_log10(x);
+}
+OVERLOADABLE float half_powr(float x, float y){
+ return (float)powr(x, y);
+}
+OVERLOADABLE float half_recip(float x){
+ return (float)native_recip(x);
+}
+OVERLOADABLE float half_rsqrt(float x){
+ return (float)native_rsqrt(x);
+}
+OVERLOADABLE float half_sin(float x){
+ return (float)sin(x);
+}
+OVERLOADABLE float half_sqrt(float x){
+ return (float)native_sqrt(x);
+}
+OVERLOADABLE float half_tan(float x){
+ return (float)tan(x);
+}
OVERLOADABLE half acospi(half x) {
float _x = (float)x;
return (half)acospi(_x);
@@ -3788,10 +3723,7 @@ OVERLOADABLE half exp2(half x) {
return (half)exp2(_x);
}
OVERLOADABLE half mad(half a, half b, half c) {
- float _a = (float)a;
- float _b = (float)b;
- float _c = (float)c;
- return (half)mad(_a, _b, _c);
+ return __gen_ocl_mad(a,b,c);
}
OVERLOADABLE half sin(half x) {
float _x = (float)x;
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h
index 90dad1f..0de3642 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
@@ -216,19 +216,18 @@ OVERLOADABLE half native_sin(half x);
OVERLOADABLE half native_sqrt(half x);
OVERLOADABLE half native_tan(half x);
-
// half accuracy
-#define half_cos cos
-#define half_divide native_divide
-#define half_exp native_exp
-#define half_exp2 native_exp2
-#define half_exp10 native_exp10
-#define half_log native_log
-#define half_log2 native_log2
-#define half_log10 native_log10
-#define half_powr powr
-#define half_recip native_recip
-#define half_rsqrt native_rsqrt
-#define half_sin sin
-#define half_sqrt native_sqrt
-#define half_tan tan
+OVERLOADABLE float half_cos(float x);
+OVERLOADABLE float half_divide(float x, float y);
+OVERLOADABLE float half_exp(float x);
+OVERLOADABLE float half_exp2(float x);
+OVERLOADABLE float half_exp10(float x);
+OVERLOADABLE float half_log(float x);
+OVERLOADABLE float half_log2(float x);
+OVERLOADABLE float half_log10(float x);
+OVERLOADABLE float half_powr(float x, float y);
+OVERLOADABLE float half_recip(float x);
+OVERLOADABLE float half_rsqrt(float x);
+OVERLOADABLE float half_sin(float x);
+OVERLOADABLE float half_sqrt(float x);
+OVERLOADABLE float half_tan(float x);
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index b9da5e2..9c09b21 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -17,3 +17,246 @@
*/
#include "ocl_simd.h"
+#include "ocl_workitem.h"
+
+uint get_max_sub_group_size(void)
+{
+ uint local_sz = get_local_size(0)*get_local_size(1)*get_local_size(2);
+ uint simd_sz = get_simd_size();
+ return local_sz > simd_sz ? simd_sz : local_sz;
+}
+
+uint get_sub_group_size(void)
+{
+ uint threadn = get_num_sub_groups();
+ uint threadid = get_sub_group_id();
+ if((threadid == (threadn - 1)) && (threadn > 1))
+ return (get_local_size(0)*get_local_size(1)*get_local_size(2)) % get_max_sub_group_size();
+ else
+ return get_max_sub_group_size();
+}
+
+/* broadcast */
+#define BROADCAST_IMPL(GEN_TYPE) \
+ OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, size_t local_id); \
+ OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, size_t local_id) { \
+ return __gen_ocl_sub_group_broadcast(a, local_id); \
+ } \
+ OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y); \
+ OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y) { \
+ return __gen_ocl_sub_group_broadcast(a, local_id_x, local_id_y); \
+ } \
+ OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z); \
+ OVERLOADABLE GEN_TYPE sub_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z) { \
+ return __gen_ocl_sub_group_broadcast(a, local_id_x, local_id_y, local_id_z); \
+ }
+
+BROADCAST_IMPL(int)
+BROADCAST_IMPL(uint)
+BROADCAST_IMPL(long)
+BROADCAST_IMPL(ulong)
+BROADCAST_IMPL(half)
+BROADCAST_IMPL(float)
+BROADCAST_IMPL(double)
+#undef BROADCAST_IMPL
+
+
+#define RANGE_OP(RANGE, OP, GEN_TYPE, SIGN) \
+ OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_##RANGE##_##OP(bool sign, GEN_TYPE x); \
+ OVERLOADABLE GEN_TYPE sub_group_##RANGE##_##OP(GEN_TYPE x) { \
+ return __gen_ocl_sub_group_##RANGE##_##OP(SIGN, x); \
+ }
+
+/* reduce add */
+RANGE_OP(reduce, add, int, true)
+RANGE_OP(reduce, add, uint, false)
+RANGE_OP(reduce, add, long, true)
+RANGE_OP(reduce, add, ulong, false)
+RANGE_OP(reduce, add, half, true)
+RANGE_OP(reduce, add, float, true)
+RANGE_OP(reduce, add, double, true)
+/* reduce min */
+RANGE_OP(reduce, min, int, true)
+RANGE_OP(reduce, min, uint, false)
+RANGE_OP(reduce, min, long, true)
+RANGE_OP(reduce, min, ulong, false)
+RANGE_OP(reduce, min, half, true)
+RANGE_OP(reduce, min, float, true)
+RANGE_OP(reduce, min, double, true)
+/* reduce max */
+RANGE_OP(reduce, max, int, true)
+RANGE_OP(reduce, max, uint, false)
+RANGE_OP(reduce, max, long, true)
+RANGE_OP(reduce, max, ulong, false)
+RANGE_OP(reduce, max, half, true)
+RANGE_OP(reduce, max, float, true)
+RANGE_OP(reduce, max, double, true)
+
+/* scan_inclusive add */
+RANGE_OP(scan_inclusive, add, int, true)
+RANGE_OP(scan_inclusive, add, uint, false)
+RANGE_OP(scan_inclusive, add, long, true)
+RANGE_OP(scan_inclusive, add, ulong, false)
+RANGE_OP(scan_inclusive, add, half, true)
+RANGE_OP(scan_inclusive, add, float, true)
+RANGE_OP(scan_inclusive, add, double, true)
+/* scan_inclusive min */
+RANGE_OP(scan_inclusive, min, int, true)
+RANGE_OP(scan_inclusive, min, uint, false)
+RANGE_OP(scan_inclusive, min, long, true)
+RANGE_OP(scan_inclusive, min, ulong, false)
+RANGE_OP(scan_inclusive, min, half, true)
+RANGE_OP(scan_inclusive, min, float, true)
+RANGE_OP(scan_inclusive, min, double, true)
+/* scan_inclusive max */
+RANGE_OP(scan_inclusive, max, int, true)
+RANGE_OP(scan_inclusive, max, uint, false)
+RANGE_OP(scan_inclusive, max, long, true)
+RANGE_OP(scan_inclusive, max, ulong, false)
+RANGE_OP(scan_inclusive, max, half, true)
+RANGE_OP(scan_inclusive, max, float, true)
+RANGE_OP(scan_inclusive, max, double, true)
+
+/* scan_exclusive add */
+RANGE_OP(scan_exclusive, add, int, true)
+RANGE_OP(scan_exclusive, add, uint, false)
+RANGE_OP(scan_exclusive, add, long, true)
+RANGE_OP(scan_exclusive, add, ulong, false)
+RANGE_OP(scan_exclusive, add, half, true)
+RANGE_OP(scan_exclusive, add, float, true)
+RANGE_OP(scan_exclusive, add, double, true)
+/* scan_exclusive min */
+RANGE_OP(scan_exclusive, min, int, true)
+RANGE_OP(scan_exclusive, min, uint, false)
+RANGE_OP(scan_exclusive, min, long, true)
+RANGE_OP(scan_exclusive, min, ulong, false)
+RANGE_OP(scan_exclusive, min, half, true)
+RANGE_OP(scan_exclusive, min, float, true)
+RANGE_OP(scan_exclusive, min, double, true)
+/* scan_exclusive max */
+RANGE_OP(scan_exclusive, max, int, true)
+RANGE_OP(scan_exclusive, max, uint, false)
+RANGE_OP(scan_exclusive, max, long, true)
+RANGE_OP(scan_exclusive, max, ulong, false)
+RANGE_OP(scan_exclusive, max, half, true)
+RANGE_OP(scan_exclusive, max, float, true)
+RANGE_OP(scan_exclusive, max, double, true)
+
+#undef RANGE_OP
+PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global uint* p);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global uint* p);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global uint* p);
+OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
+{
+ return __gen_ocl_sub_group_block_read_mem(p);
+}
+OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
+{
+ return __gen_ocl_sub_group_block_read_mem2(p);
+}
+OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
+{
+ return __gen_ocl_sub_group_block_read_mem4(p);
+
+}
+OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
+{
+ return __gen_ocl_sub_group_block_read_mem8(p);
+}
+void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint data);
+void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2 data);
+void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4 data);
+void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint data)
+{
+ __gen_ocl_sub_group_block_write_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data)
+{
+ __gen_ocl_sub_group_block_write_mem2(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data)
+{
+ __gen_ocl_sub_group_block_write_mem4(p, data);
+
+}
+OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data)
+{
+ __gen_ocl_sub_group_block_write_mem8(p, data);
+}
+
+PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_image2(image2d_t p, int x, int y);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_image4(image2d_t p, int x, int y);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_image8(image2d_t p, int x, int y);
+OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_image(p, cord.x, cord.y);
+}
+OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_image2(p, cord.x, cord.y);
+}
+OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_image4(p, cord.x, cord.y);
+}
+OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_image8(p, cord.x, cord.y);
+}
+void __gen_ocl_sub_group_block_write_image(image2d_t p, int x, int y, uint data);
+void __gen_ocl_sub_group_block_write_image2(image2d_t p, int x, int y, uint2 data);
+void __gen_ocl_sub_group_block_write_image4(image2d_t p, int x, int y, uint4 data);
+void __gen_ocl_sub_group_block_write_image8(image2d_t p, int x, int y, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(image2d_t p, int2 cord, uint data)
+{
+ __gen_ocl_sub_group_block_write_image(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write2(image2d_t p, int2 cord, uint2 data)
+{
+ __gen_ocl_sub_group_block_write_image2(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write4(image2d_t p, int2 cord, uint4 data)
+{
+ __gen_ocl_sub_group_block_write_image4(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 data)
+{
+ __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data);
+}
+
+#define SHUFFLE_DOWN(TYPE) \
+OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
+ TYPE res0, res1; \
+ res0 = intel_sub_group_shuffle(x, (get_sub_group_local_id() + c)%get_max_sub_group_size()); \
+ res1 = intel_sub_group_shuffle(y, (get_sub_group_local_id() + c)%get_max_sub_group_size()); \
+ bool inRange = ((int)c + (int)get_sub_group_local_id() > 0) && (((int)c + (int)get_sub_group_local_id() < (int) get_max_sub_group_size())); \
+ return inRange ? res0 : res1; \
+}
+SHUFFLE_DOWN(float)
+SHUFFLE_DOWN(int)
+SHUFFLE_DOWN(uint)
+#undef SHUFFLE_DOWN
+
+#define SHUFFLE_UP(TYPE) \
+OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) { \
+ TYPE res0, res1; \
+ res0 = intel_sub_group_shuffle(x, (get_max_sub_group_size() + get_sub_group_local_id() - c)%get_max_sub_group_size()); \
+ res1 = intel_sub_group_shuffle(y, (get_max_sub_group_size() + get_sub_group_local_id() - c)%get_max_sub_group_size()); \
+ bool inRange = ((int)c - (int)get_sub_group_local_id() > 0) && (((int)c - (int)get_sub_group_local_id() < (int) get_max_sub_group_size())); \
+ return inRange ? res0 : res1; \
+}
+SHUFFLE_UP(float)
+SHUFFLE_UP(int)
+SHUFFLE_UP(uint)
+#undef SHUFFLE_UP
+#define SHUFFLE_XOR(TYPE) \
+OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
+ return intel_sub_group_shuffle(x, (get_sub_group_local_id() ^ c) % get_max_sub_group_size()); \
+}
+SHUFFLE_XOR(float)
+SHUFFLE_XOR(int)
+SHUFFLE_XOR(uint)
+#undef SHUFFLE_XOR
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 67a1cee..ae3b379 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -26,9 +26,152 @@
int sub_group_any(int);
int sub_group_all(int);
+uint get_simd_size(void);
+
uint get_sub_group_size(void);
+uint get_max_sub_group_size(void);
+uint get_num_sub_groups(void);
uint get_sub_group_id(void);
+uint get_sub_group_local_id(void);
+
+/* broadcast */
+OVERLOADABLE int sub_group_broadcast(int a, size_t local_id);
+OVERLOADABLE uint sub_group_broadcast(uint a, size_t local_id);
+OVERLOADABLE long sub_group_broadcast(long a, size_t local_id);
+OVERLOADABLE ulong sub_group_broadcast(ulong a, size_t local_id);
+OVERLOADABLE half sub_group_broadcast(half a, size_t local_id);
+OVERLOADABLE float sub_group_broadcast(float a, size_t local_id);
+OVERLOADABLE double sub_group_broadcast(double a, size_t local_id);
+
+OVERLOADABLE int sub_group_broadcast(int a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE uint sub_group_broadcast(uint a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE long sub_group_broadcast(long a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE ulong sub_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE half sub_group_broadcast(half a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE float sub_group_broadcast(float a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE double sub_group_broadcast(double a, size_t local_id_x, size_t local_id_y);
+
+OVERLOADABLE int sub_group_broadcast(int a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE uint sub_group_broadcast(uint a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE long sub_group_broadcast(long a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE ulong sub_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE half sub_group_broadcast(half a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE float sub_group_broadcast(float a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE double sub_group_broadcast(double a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+
+/* reduce add */
+OVERLOADABLE int sub_group_reduce_add(int x);
+OVERLOADABLE uint sub_group_reduce_add(uint x);
+OVERLOADABLE long sub_group_reduce_add(long x);
+OVERLOADABLE ulong sub_group_reduce_add(ulong x);
+OVERLOADABLE half sub_group_reduce_add(half x);
+OVERLOADABLE float sub_group_reduce_add(float x);
+OVERLOADABLE double sub_group_reduce_add(double x);
+
+/* reduce min */
+OVERLOADABLE int sub_group_reduce_min(int x);
+OVERLOADABLE uint sub_group_reduce_min(uint x);
+OVERLOADABLE long sub_group_reduce_min(long x);
+OVERLOADABLE ulong sub_group_reduce_min(ulong x);
+OVERLOADABLE half sub_group_reduce_min(half x);
+OVERLOADABLE float sub_group_reduce_min(float x);
+OVERLOADABLE double sub_group_reduce_min(double x);
+
+/* reduce max */
+OVERLOADABLE int sub_group_reduce_max(int x);
+OVERLOADABLE uint sub_group_reduce_max(uint x);
+OVERLOADABLE long sub_group_reduce_max(long x);
+OVERLOADABLE ulong sub_group_reduce_max(ulong x);
+OVERLOADABLE half sub_group_reduce_max(half x);
+OVERLOADABLE float sub_group_reduce_max(float x);
+OVERLOADABLE double sub_group_reduce_max(double x);
+
+/* scan_inclusive add */
+OVERLOADABLE int sub_group_scan_inclusive_add(int x);
+OVERLOADABLE uint sub_group_scan_inclusive_add(uint x);
+OVERLOADABLE long sub_group_scan_inclusive_add(long x);
+OVERLOADABLE ulong sub_group_scan_inclusive_add(ulong x);
+OVERLOADABLE half sub_group_scan_inclusive_add(half x);
+OVERLOADABLE float sub_group_scan_inclusive_add(float x);
+OVERLOADABLE double sub_group_scan_inclusive_add(double x);
+
+/* scan_inclusive min */
+OVERLOADABLE int sub_group_scan_inclusive_min(int x);
+OVERLOADABLE uint sub_group_scan_inclusive_min(uint x);
+OVERLOADABLE long sub_group_scan_inclusive_min(long x);
+OVERLOADABLE ulong sub_group_scan_inclusive_min(ulong x);
+OVERLOADABLE half sub_group_scan_inclusive_min(half x);
+OVERLOADABLE float sub_group_scan_inclusive_min(float x);
+OVERLOADABLE double sub_group_scan_inclusive_min(double x);
+/* scan_inclusive max */
+OVERLOADABLE int sub_group_scan_inclusive_max(int x);
+OVERLOADABLE uint sub_group_scan_inclusive_max(uint x);
+OVERLOADABLE long sub_group_scan_inclusive_max(long x);
+OVERLOADABLE ulong sub_group_scan_inclusive_max(ulong x);
+OVERLOADABLE half sub_group_scan_inclusive_max(half x);
+OVERLOADABLE float sub_group_scan_inclusive_max(float x);
+OVERLOADABLE double sub_group_scan_inclusive_max(double x);
+
+/* scan_exclusive add */
+OVERLOADABLE int sub_group_scan_exclusive_add(int x);
+OVERLOADABLE uint sub_group_scan_exclusive_add(uint x);
+OVERLOADABLE long sub_group_scan_exclusive_add(long x);
+OVERLOADABLE ulong sub_group_scan_exclusive_add(ulong x);
+OVERLOADABLE half sub_group_scan_exclusive_add(half x);
+OVERLOADABLE float sub_group_scan_exclusive_add(float x);
+OVERLOADABLE double sub_group_scan_exclusive_add(double x);
+
+/* scan_exclusive min */
+OVERLOADABLE int sub_group_scan_exclusive_min(int x);
+OVERLOADABLE uint sub_group_scan_exclusive_min(uint x);
+OVERLOADABLE long sub_group_scan_exclusive_min(long x);
+OVERLOADABLE ulong sub_group_scan_exclusive_min(ulong x);
+OVERLOADABLE half sub_group_scan_exclusive_min(half x);
+OVERLOADABLE float sub_group_scan_exclusive_min(float x);
+OVERLOADABLE double sub_group_scan_exclusive_min(double x);
+
+/* scan_exclusive max */
+OVERLOADABLE int sub_group_scan_exclusive_max(int x);
+OVERLOADABLE uint sub_group_scan_exclusive_max(uint x);
+OVERLOADABLE long sub_group_scan_exclusive_max(long x);
+OVERLOADABLE ulong sub_group_scan_exclusive_max(ulong x);
+OVERLOADABLE half sub_group_scan_exclusive_max(half x);
+OVERLOADABLE float sub_group_scan_exclusive_max(float x);
+OVERLOADABLE double sub_group_scan_exclusive_max(double x);
+
+/* shuffle */
+OVERLOADABLE half intel_sub_group_shuffle(half x, uint c);
OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);
+
+/* block read/write */
+OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
+OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
+OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
+OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);
+
+OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data);
+
+OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t image, int2 byte_coord);
+
+OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2 byte_coord, uint data);
+OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2 byte_coord, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2 byte_coord, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2 byte_coord, uint8 data);
diff --git a/backend/src/llvm/ExpandConstantExpr.cpp b/backend/src/llvm/ExpandConstantExpr.cpp
index c6f57b8..e9ec3ab 100644
--- a/backend/src/llvm/ExpandConstantExpr.cpp
+++ b/backend/src/llvm/ExpandConstantExpr.cpp
@@ -115,7 +115,7 @@ static Value *expandConstantVector(Instruction *InsertPt, ConstantVector *CV) {
Type *IntTy = IntegerType::get(CV->getContext(), 32);
BasicBlock::iterator InsertPos(InsertPt);
- IRBuilder<> IRB(InsertPos);
+ IRBuilder<> IRB(&*InsertPos);
Value *vec = UndefValue::get(CV->getType());
for (int i = 0; i < elemNum; i++) {
Value *idx = ConstantInt::get(IntTy, i);
@@ -177,7 +177,7 @@ bool ExpandConstantExpr::runOnFunction(Function &Func) {
for (BasicBlock::InstListType::iterator Inst = BB->begin(), E = BB->end();
Inst != E;
++Inst) {
- Modified |= expandInstruction(Inst);
+ Modified |= expandInstruction(&*Inst);
}
}
return Modified;
diff --git a/backend/src/llvm/ExpandLargeIntegers.cpp b/backend/src/llvm/ExpandLargeIntegers.cpp
index 20fdda9..1ee294f 100644
--- a/backend/src/llvm/ExpandLargeIntegers.cpp
+++ b/backend/src/llvm/ExpandLargeIntegers.cpp
@@ -156,7 +156,7 @@ static TypePair getExpandedIntTypes(Type *Ty) {
// Return true if Val is an int which should be converted.
static bool shouldConvert(const Value *Val) {
- Type *Ty = Val->getType();
+ Type *Ty = Val ? Val->getType() : NULL;
if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
return !isLegalBitSize(ITy->getBitWidth());
return false;
@@ -388,7 +388,7 @@ static void convertInstruction(Instruction *Inst, ConversionState &State,
// Set the insert point *after* Inst, so that any instructions inserted here
// will be visited again. That allows iterative expansion of types > i128.
BasicBlock::iterator InsertPos(Inst);
- IRBuilder<> IRB(++InsertPos);
+ IRBuilder<> IRB(&*++InsertPos);
StringRef Name = Inst->getName();
if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
@@ -398,6 +398,8 @@ static void convertInstruction(Instruction *Inst, ConversionState &State,
PHINode *Hi = IRB.CreatePHI(OpTys.Hi, N, Twine(Name + ".hi"));
for (unsigned I = 0; I != N; ++I) {
Value *InVal = Phi->getIncomingValue(I);
+ if(!InVal)
+ continue;
BasicBlock *InBB = Phi->getIncomingBlock(I);
// If the value hasn't already been converted then this is a
// forward-reference PHI which needs to be patched up after RPO traversal.
diff --git a/backend/src/llvm/ExpandUtils.cpp b/backend/src/llvm/ExpandUtils.cpp
index 801f969..a09d990 100644
--- a/backend/src/llvm/ExpandUtils.cpp
+++ b/backend/src/llvm/ExpandUtils.cpp
@@ -101,7 +101,7 @@ namespace llvm {
Function *RecreateFunction(Function *Func, FunctionType *NewType) {
Function *NewFunc = Function::Create(NewType, Func->getLinkage());
NewFunc->copyAttributesFrom(Func);
- Func->getParent()->getFunctionList().insert(Func, NewFunc);
+ Func->getParent()->getFunctionList().insert(ilist_iterator<Function>(Func), NewFunc);
NewFunc->takeName(Func);
NewFunc->getBasicBlockList().splice(NewFunc->begin(),
Func->getBasicBlockList());
diff --git a/backend/src/llvm/PromoteIntegers.cpp b/backend/src/llvm/PromoteIntegers.cpp
index b65440f..adba004 100644
--- a/backend/src/llvm/PromoteIntegers.cpp
+++ b/backend/src/llvm/PromoteIntegers.cpp
@@ -129,7 +129,8 @@ static Type *getPromotedType(Type *Ty) {
// Return true if Val is an int which should be converted.
static bool shouldConvert(Value *Val) {
- if (IntegerType *ITy = dyn_cast<IntegerType>(Val->getType())) {
+ Type *Ty = Val ? Val->getType() : NULL;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
if (!isLegalSize(ITy->getBitWidth())) {
return true;
}
@@ -338,6 +339,8 @@ static Value *splitStore(StoreInst *Inst, ConversionState &State) {
// original type cleared.
static Value *getClearConverted(Value *Operand, Instruction *InsertPt,
ConversionState &State) {
+ if(!Operand)
+ return Operand;
Type *OrigType = Operand->getType();
Instruction *OrigInst = dyn_cast<Instruction>(Operand);
Operand = State.getConverted(Operand);
@@ -615,7 +618,7 @@ bool PromoteIntegers::runOnFunction(Function &F) {
// Don't support changing the function arguments. This should not be
// generated by clang.
for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
- Value *Arg = I;
+ Value *Arg = &*I;
if (shouldConvert(Arg)) {
errs() << "Function " << F.getName() << ": " << *Arg << "\n";
llvm_unreachable("Function has illegal integer/pointer argument");
@@ -626,7 +629,7 @@ bool PromoteIntegers::runOnFunction(Function &F) {
bool Modified = false;
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
for (BasicBlock::iterator BBI = FI->begin(), BBE = FI->end(); BBI != BBE;) {
- Instruction *Inst = BBI++;
+ Instruction *Inst = &*BBI++;
// Only attempt to convert an instruction if its result or any of its
// operands are illegal.
bool ShouldConvert = shouldConvert(Inst);
diff --git a/backend/src/llvm/StripAttributes.cpp b/backend/src/llvm/StripAttributes.cpp
index e6df312..3bf3853 100644
--- a/backend/src/llvm/StripAttributes.cpp
+++ b/backend/src/llvm/StripAttributes.cpp
@@ -98,7 +98,7 @@ bool StripAttributes::runOnFunction(Function &Func) {
BB != E; ++BB) {
for (BasicBlock::iterator Inst = BB->begin(), E = BB->end();
Inst != E; ++Inst) {
- CallSite Call(Inst);
+ CallSite Call(&*Inst);
if (Call)
Call.setCallingConv(CallingConv::C);
}
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 56205bb..748a7fe 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -36,6 +36,8 @@ namespace gbe
static Module* createOclBitCodeModule(LLVMContext& ctx, bool strictMath)
{
std::string bitCodeFiles = OCL_BITCODE_LIB_PATH;
+ if(bitCodeFiles == "")
+ bitCodeFiles = OCL_BITCODE_BIN;
std::istringstream bitCodeFilePath(bitCodeFiles);
std::string FilePath;
bool findBC = false;
@@ -48,7 +50,10 @@ namespace gbe
break;
}
}
- assert(findBC);
+ if (!findBC) {
+ printf("Fatal Error: ocl lib %s does not exist\n", bitCodeFiles.c_str());
+ return NULL;
+ }
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
oclLib = getLazyIRFileModule(FilePath, Err, ctx);
@@ -68,8 +73,9 @@ namespace gbe
return oclLib;
}
- static bool materializedFuncCall(Module& src, Module& lib, llvm::Function &KF, std::set<std::string>& MFS)
- {
+ static bool materializedFuncCall(Module& src, Module& lib, llvm::Function& KF,
+ std::set<std::string>& MFS,
+ std::vector<GlobalValue *>&Gvs) {
bool fromSrc = false;
for (llvm::Function::iterator B = KF.begin(), BE = KF.end(); B != BE; B++) {
for (BasicBlock::iterator instI = B->begin(),
@@ -79,8 +85,12 @@ namespace gbe
continue;
}
- if (call->getCalledFunction() &&
- call->getCalledFunction()->getIntrinsicID() != 0)
+ llvm::Function * callFunc = call->getCalledFunction();
+ if(!callFunc) {
+ continue;
+ }
+
+ if (callFunc->getIntrinsicID() != 0)
continue;
std::string fnName = call->getCalledValue()->stripPointerCasts()->getName();
@@ -112,9 +122,10 @@ namespace gbe
printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), EC.message().c_str());
return false;
}
+ Gvs.push_back((GlobalValue *)newMF);
#endif
}
- if (!materializedFuncCall(src, lib, *newMF, MFS))
+ if (!materializedFuncCall(src, lib, *newMF, MFS, Gvs))
return false;
}
@@ -128,8 +139,10 @@ namespace gbe
{
LLVMContext& ctx = mod->getContext();
std::set<std::string> materializedFuncs;
+ std::vector<GlobalValue *> Gvs;
Module* clonedLib = createOclBitCodeModule(ctx, strictMath);
- assert(clonedLib && "Can not create the beignet bitcode\n");
+ if (clonedLib == NULL)
+ return NULL;
std::vector<const char *> kernels;
std::vector<const char *> builtinFuncs;
@@ -173,10 +186,11 @@ namespace gbe
if (!isKernelFunction(*SF)) continue;
kernels.push_back(SF->getName().data());
- if (!materializedFuncCall(*mod, *clonedLib, *SF, materializedFuncs)) {
+ if (!materializedFuncCall(*mod, *clonedLib, *SF, materializedFuncs, Gvs)) {
delete clonedLib;
return NULL;
}
+ Gvs.push_back((GlobalValue *)&*SF);
}
if (kernels.empty()) {
@@ -215,14 +229,43 @@ namespace gbe
}
#endif
- if (!materializedFuncCall(*mod, *clonedLib, *newMF, materializedFuncs)) {
+ if (!materializedFuncCall(*mod, *clonedLib, *newMF, materializedFuncs, Gvs)) {
delete clonedLib;
return NULL;
}
+ Gvs.push_back((GlobalValue *)newMF);
kernels.push_back(f);
}
+  /* LLVM 3.8 now performs a strict materialization check on every value by
+   * verifying that the module is fully materialized. If we want to keep the
+   * old style of using the library, where we materialize only what we need,
+   * we must remove what we do not need before materializing the whole
+   * module. To do this, we record in Gvs all of the builtin functions and
+   * everything the kernel functions require as they are materialized; global
+   * values such as PI are also needed and are added. Since we cannot use
+   * use_empty to check whether the GVs are needed before the module is
+   * marked as fully materialized, and there are only 7 GVs, we simply
+   * materialize all of them. Then we use the GVExtraction pass to extract
+   * the functions and values recorded in Gvs from the library module. After
+   * extracting what we need and removing what we do not, we call
+   * materializeAll to mark the module as materialized. */
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=8
+ /* Get all GlobalValue from module. */
+ Module::GlobalListType &GVlist = clonedLib->getGlobalList();
+ for(Module::global_iterator GVitr = GVlist.begin();GVitr != GVlist.end();++GVitr) {
+ GlobalValue * GV = &*GVitr;
+ clonedLib->materialize(GV);
+ Gvs.push_back(GV);
+ }
+ llvm::legacy::PassManager Extract;
+ /* Extract all values we need using GVExtractionPass. */
+ Extract.add(createGVExtractionPass(Gvs, false));
+ Extract.run(*clonedLib);
+ /* Mark the library module as materialized for later use. */
+ clonedLib->materializeAll();
+#endif
+
+  /* the SPIR binary data layout may be different from beignet's bitcode */
if(clonedLib->getDataLayout() != mod->getDataLayout())
mod->setDataLayout(clonedLib->getDataLayout());
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 17b65a1..5135950 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -83,6 +83,8 @@
#include "sys/cvar.hpp"
#include "backend/program.h"
#include <sstream>
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DebugInfo.h"
/* Not defined for LLVM 3.0 */
#if !defined(LLVM_VERSION_MAJOR)
@@ -101,6 +103,7 @@ using namespace llvm;
namespace gbe
{
+ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
/*! Gen IR manipulates only scalar types */
static bool isScalarType(const Type *type)
{
@@ -111,6 +114,46 @@ namespace gbe
type->isPointerTy();
}
+ static std::string getTypeName(ir::Context &ctx, const Type *type, int sign)
+ {
+ GBE_ASSERT(isScalarType(type));
+ if (type->isFloatTy() == true)
+ return "float";
+ if (type->isHalfTy() == true)
+ return "half";
+ if (type->isDoubleTy() == true)
+ return "double";
+
+ GBE_ASSERT(type->isIntegerTy() == true);
+ if(sign) {
+ if (type == Type::getInt1Ty(type->getContext()))
+ return "char";
+ if (type == Type::getInt8Ty(type->getContext()))
+ return "char";
+ if (type == Type::getInt16Ty(type->getContext()))
+ return "short";
+ if (type == Type::getInt32Ty(type->getContext()))
+ return "int";
+ if (type == Type::getInt64Ty(type->getContext()))
+ return "long";
+ }
+ else
+ {
+ if (type == Type::getInt1Ty(type->getContext()))
+ return "uchar";
+ if (type == Type::getInt8Ty(type->getContext()))
+ return "uchar";
+ if (type == Type::getInt16Ty(type->getContext()))
+ return "ushort";
+ if (type == Type::getInt32Ty(type->getContext()))
+ return "uint";
+ if (type == Type::getInt64Ty(type->getContext()))
+ return "ulong";
+ }
+ GBE_ASSERTM(false, "Unsupported type.");
+ return "";
+ }
+
/*! LLVM IR Type to Gen IR type translation */
static ir::Type getType(ir::Context &ctx, const Type *type)
{
@@ -386,6 +429,36 @@ namespace gbe
ir::Context &ctx;
};
+ class GenWriter;
+ class MemoryInstHelper {
+ public:
+ MemoryInstHelper(ir::Context &c, ir::Unit &u, GenWriter *w, bool l)
+ : ctx(c),
+ unit(u),
+ writer(w),
+ legacyMode(l)
+ { }
+ void emitUnalignedDQLoadStore(Value *llvmValues);
+ ir::Tuple getValueTuple(llvm::Value *llvmValues, llvm::Type *elemType, unsigned start, unsigned elemNum);
+ void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum, Value *llvmValues, Type * elemType);
+ ir::Register getOffsetAddress(ir::Register basePtr, unsigned offset);
+ void shootMessage(ir::Type type, ir::Register offset, ir::Tuple value, unsigned elemNum);
+ template <bool IsLoad, typename T>
+ void emitLoadOrStore(T &I);
+ private:
+ ir::Context &ctx;
+ ir::Unit &unit;
+ GenWriter *writer;
+ bool legacyMode;
+ ir::AddressSpace addrSpace;
+ ir::Register mBTI;
+ ir::Register mPtr;
+ ir::AddressMode mAddressMode;
+ unsigned SurfaceIndex;
+ bool isLoad;
+ bool dwAligned;
+ };
+
/*! Translate LLVM IR code to Gen IR code */
class GenWriter : public FunctionPass, public InstVisitor<GenWriter>
{
@@ -413,6 +486,9 @@ namespace gbe
typedef map<Value *, SmallVector<Value *, 4>>::iterator PtrOrigMapIter;
// map pointer source to bti
map<Value *, unsigned> BtiMap;
+ // map printf pointer source to bti
+ int printfBti;
+ uint32_t printfNum;
// map ptr to its bti register
map<Value *, Value *> BtiValueMap;
// map ptr to it's base
@@ -437,6 +513,10 @@ namespace gbe
Function *Func;
const Module *TheModule;
int btiBase;
+ bool has_errors;
+ /*! legacyMode is for hardware before BDW,
+ * which does not support stateless memory access */
+ bool legacyMode;
public:
static char ID;
explicit GenWriter(ir::Unit &unit)
@@ -444,9 +524,13 @@ namespace gbe
unit(unit),
ctx(unit),
regTranslator(ctx),
+ printfBti(-1),
+ printfNum(0),
LI(0),
TheModule(0),
- btiBase(BTI_RESERVED_NUM)
+ btiBase(BTI_RESERVED_NUM),
+ has_errors(false),
+ legacyMode(true)
{
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
@@ -491,7 +575,8 @@ namespace gbe
Func = &F;
assignBti(F);
- analyzePointerOrigin(F);
+ if (legacyMode)
+ analyzePointerOrigin(F);
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=7
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -508,6 +593,7 @@ namespace gbe
addrStoreInst.clear();
// Reset for next function
btiBase = BTI_RESERVED_NUM;
+ printfBti = -1;
return false;
}
/*! Given a possible pointer value, find out the interested escape like
@@ -516,7 +602,7 @@ namespace gbe
/*! For all possible pointers, GlobalVariable, function pointer argument,
alloca instruction, find their pointer escape points */
void analyzePointerOrigin(Function &F);
- unsigned getNewBti(Value *origin, bool isImage);
+ unsigned getNewBti(Value *origin, bool force);
void assignBti(Function &F);
bool isSingleBti(Value *Val);
Value *getBtiRegister(Value *v);
@@ -558,11 +644,19 @@ namespace gbe
*/
INLINE void simplifyTerminator(BasicBlock *bb);
/*! Helper function to emit loads and stores */
- template <bool isLoad, typename T> void emitLoadOrStore(T &I);
+ template <bool IsLoad, typename T> void emitLoadOrStore(T &I);
/*! Will try to remove MOVs due to PHI resolution */
void removeMOVs(const ir::Liveness &liveness, ir::Function &fn);
/*! Optimize phi move based on liveness information */
- void optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn);
+ void optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn,
+ map <ir::Register, ir::Register> &replaceMap,
+ map <ir::Register, ir::Register> &redundantPhiCopyMap);
+ /*! further optimization after phi copy optimization.
+ * Global liveness interfering checking based redundant phi value
+ * elimination. */
+ void postPhiCopyOptimization(ir::Liveness &liveness, ir::Function &fn,
+ map <ir::Register, ir::Register> &replaceMap,
+ map <ir::Register, ir::Register> &redundantPhiCopyMap);
/*! Will try to remove redundants LOADI in basic blocks */
void removeLOADIs(const ir::Liveness &liveness, ir::Function &fn);
/*! To avoid lost copy, we need two values for PHI. This function create a
@@ -601,6 +695,13 @@ namespace gbe
void emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode, ir::Type = ir::TYPE_FLOAT);
// Emit unary instructions from gen native function
void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
+ // Emit workgroup instructions
+ void emitWorkGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
+ // Emit subgroup instructions
+ void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
+ // Emit subgroup instructions
+ void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
+ void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
uint8_t appendSampler(CallSite::arg_iterator AI);
uint8_t getImageID(CallInst &I);
@@ -615,10 +716,10 @@ namespace gbe
void visitResumeInst(ResumeInst &I) {NOT_SUPPORTED;}
void visitInlineAsm(CallInst &I) {NOT_SUPPORTED;}
void visitIndirectBrInst(IndirectBrInst &I) {NOT_SUPPORTED;}
- void visitUnreachableInst(UnreachableInst &I) {NOT_SUPPORTED;}
+ void visitUnreachableInst(UnreachableInst &I) {;}
void visitGetElementPtrInst(GetElementPtrInst &I) {NOT_SUPPORTED;}
void visitInsertValueInst(InsertValueInst &I) {NOT_SUPPORTED;}
- template <bool isLoad, typename T> void visitLoadOrStore(T &I);
+ template <bool IsLoad, typename T> void visitLoadOrStore(T &I);
INLINE void gatherBTI(Value *pointer, ir::BTI &bti);
// batch vec4/8/16 load/store
@@ -629,12 +730,19 @@ namespace gbe
// handle load of dword/qword with unaligned address
void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI);
void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
+ ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) {
+ if (unit.printfs.find(inst) == unit.printfs.end())
+ return NULL;
+ return unit.printfs[inst];
+ }
private:
+ void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug information in context for subsequently passing to Gen insn
ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
template <typename T, typename P = T>
ir::ImmediateIndex processSeqConstant(ConstantDataSequential *seq,
int index, ConstTypeId tid);
ir::ImmediateIndex processConstantVector(ConstantVector *cv, int index);
+ friend class MemoryInstHelper;
};
char GenWriter::ID = 0;
@@ -642,13 +750,13 @@ namespace gbe
static void updatePointerSource(Value *parent, Value *theUser, Value *source, SmallVector<Value *, 4> &pointers) {
if (isa<SelectInst>(theUser)) {
SelectInst *si = dyn_cast<SelectInst>(theUser);
- if (si->getTrueValue() == parent)
+ if (si && si->getTrueValue() == parent)
pointers[0] = source;
else
pointers[1] = source;
} else if (isa<PHINode>(theUser)) {
PHINode *phi = dyn_cast<PHINode>(theUser);
- unsigned opNum = phi->getNumIncomingValues();
+ unsigned opNum = phi ? phi->getNumIncomingValues() : 0;
for (unsigned j = 0; j < opNum; j++) {
if (phi->getIncomingValue(j) == parent) {
pointers[j] = source;
@@ -728,7 +836,7 @@ namespace gbe
if (isa<SelectInst>(theUser)) capacity = 2;
if (isa<PHINode>(theUser)) {
PHINode *phi = dyn_cast<PHINode>(theUser);
- capacity = phi->getNumIncomingValues();
+ capacity = phi ? phi->getNumIncomingValues() : 1;
}
SmallVector<Value *, 4> pointers;
@@ -814,7 +922,7 @@ namespace gbe
} else if (isa<CallInst>(theUser)) {
// atomic/read(write)image
CallInst *ci = dyn_cast<CallInst>(theUser);
- pointer = ci->getArgOperand(0);
+ pointer = ci ? ci->getArgOperand(0) : NULL;
} else {
theUser->dump();
GBE_ASSERT(0 && "Unknown instruction operating on pointers\n");
@@ -878,6 +986,12 @@ namespace gbe
if (baseIter != pointerBaseMap.end()) {
return baseIter->second;
}
+
+ if (isa<ConstantPointerNull>(ptr)) {
+ PointerType *ty = PointerType::get(ptr->getType(), 0);
+ return ConstantPointerNull::get(ty);
+ }
+
typedef std::map<Value *, unsigned>::iterator BtiIter;
// for pointers that already assigned a bti, it is the base pointer,
BtiIter found = BtiMap.find(ptr);
@@ -897,6 +1011,13 @@ namespace gbe
}
PtrOrigMapIter iter = pointerOrigMap.find(ptr);
+
+ // we may not find the ptr, as it may be uninitialized
+ if (iter == pointerOrigMap.end()) {
+ PointerType *ty = PointerType::get(ptr->getType(), 0);
+ return ConstantPointerNull::get(ty);
+ }
+
SmallVector<Value *, 4> &pointers = (*iter).second;
if (isSingleBti(ptr)) {
Value *base = getPointerBase(pointers[0]);
@@ -951,7 +1072,7 @@ namespace gbe
return basePhi;
} else {
ptr->dump();
- GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n");
+ GBE_ASSERT(0 && "Unhandled instruction in getPointerBase\n");
return ptr;
}
}
@@ -976,15 +1097,24 @@ namespace gbe
if (valueIter != BtiValueMap.end())
return valueIter->second;
+ if (isa<ConstantPointerNull>(Val)) {
+ return ConstantInt::get(Type::getInt32Ty(Val->getContext()), BTI_PRIVATE);
+ }
+
if (found != BtiMap.end()) {
// the Val already got assigned an BTI, return it
- Value *bti = ConstantInt::get(IntegerType::get(Val->getContext(), 32), found->second);
+ Value *bti = ConstantInt::get(IntegerType::get(Val->getContext(), 32),
+ found->second);
BtiValueMap.insert(std::make_pair(Val, bti));
return bti;
} else {
+ PtrOrigMapIter iter = pointerOrigMap.find(Val);
+ // the pointer may access an uninitialized pointer,
+ // in this case, we will not find it in pointerOrigMap
+ if (iter == pointerOrigMap.end())
+ return ConstantInt::get(Type::getInt32Ty(Val->getContext()), BTI_PRIVATE);
+
if (isSingleBti(Val)) {
- PtrOrigMapIter iter = pointerOrigMap.find(Val);
- GBE_ASSERT(iter != pointerOrigMap.end());
Value * bti = getBtiRegister((*iter).second[0]);
BtiValueMap.insert(std::make_pair(Val, bti));
return bti;
@@ -993,12 +1123,11 @@ namespace gbe
SelectInst *si = dyn_cast<SelectInst>(Val);
IRBuilder<> Builder(si->getParent());
- PtrOrigMapIter iter = pointerOrigMap.find(Val);
- GBE_ASSERT(iter != pointerOrigMap.end());
Value *trueVal = getBtiRegister((*iter).second[0]);
Value *falseVal = getBtiRegister((*iter).second[1]);
Builder.SetInsertPoint(si);
- Value *bti = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
+ Value *bti = Builder.CreateSelect(si->getCondition(),
+ trueVal, falseVal);
BtiValueMap.insert(std::make_pair(Val, bti));
return bti;
} else if (isa<PHINode>(Val)) {
@@ -1006,9 +1135,9 @@ namespace gbe
IRBuilder<> Builder(phi->getParent());
Builder.SetInsertPoint(phi);
- PHINode *btiPhi = Builder.CreatePHI(IntegerType::get(Val->getContext(), 32), phi->getNumIncomingValues());
- PtrOrigMapIter iter = pointerOrigMap.find(Val);
- GBE_ASSERT(iter != pointerOrigMap.end());
+ PHINode *btiPhi = Builder.CreatePHI(
+ IntegerType::get(Val->getContext(), 32),
+ phi->getNumIncomingValues());
SmallVector<Value *, 4> &pointers = (*iter).second;
unsigned srcNum = pointers.size();
for (unsigned x = 0; x < srcNum; x++) {
@@ -1031,18 +1160,15 @@ namespace gbe
}
}
- unsigned GenWriter::getNewBti(Value *origin, bool isImage) {
+ unsigned GenWriter::getNewBti(Value *origin, bool force) {
unsigned new_bti = 0;
- if (isImage) {
+ if (force) {
new_bti = btiBase;
incBtiBase();
return new_bti;
}
- if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
- new_bti = btiBase;
- incBtiBase();
- } else if (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
+ if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
new_bti = btiBase;
incBtiBase();
}
@@ -1103,8 +1229,9 @@ namespace gbe
BtiMap.insert(std::make_pair(&v, getNewBti(&v, false)));
}
MDNode *typeNameNode = NULL;
+ MDNode *typeBaseNameNode = NULL;
MDNode *node = getKernelFunctionMetadata(&F);
- for(uint j = 0; j < node->getNumOperands() - 1; j++) {
+ for(uint j = 0;node && j < node->getNumOperands() - 1; j++) {
MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
if (attrNode == NULL) break;
MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
@@ -1112,15 +1239,23 @@ namespace gbe
if (attrName->getString() == "kernel_arg_type") {
typeNameNode = attrNode;
}
+ if (attrName->getString() == "kernel_arg_base_type") {
+ typeBaseNameNode = attrNode;
+ }
}
unsigned argID = 0;
ir::FunctionArgument::InfoFromLLVM llvmInfo;
for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, argID++) {
- llvmInfo.typeName= (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+ if(typeNameNode) {
+ llvmInfo.typeName= (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+ }
+ if(typeBaseNameNode) {
+ llvmInfo.typeBaseName= (cast<MDString>(typeBaseNameNode->getOperand(1 + argID)))->getString();
+ }
bool isImage = llvmInfo.isImageType();
if (I->getType()->isPointerTy() || isImage) {
- BtiMap.insert(std::make_pair(I, getNewBti(I, isImage)));
+ BtiMap.insert(std::make_pair(&*I, getNewBti(&*I, isImage)));
}
}
@@ -1234,7 +1369,7 @@ namespace gbe
// function argument
for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
if (I->getType()->isPointerTy()) {
- findPointerEscape(I, mixedPtr, true, revisit);
+ findPointerEscape(&*I, mixedPtr, true, revisit);
}
}
// alloca
@@ -1283,7 +1418,7 @@ namespace gbe
while (isa<AllocaInst>(bbIter)) ++bbIter;
IRBuilder<> Builder(&entry);
- Builder.SetInsertPoint(bbIter);
+ Builder.SetInsertPoint(&*bbIter);
PointerType * AITy = cast<AllocaInst>(base)->getType();
Value * btiArray = Builder.CreateAlloca(AITy->getElementType(), ArraySize, base->getName() + ".bti");
@@ -1322,7 +1457,7 @@ namespace gbe
const StructType * strTy = cast<StructType>(c->getType());
uint32_t size = 0;
- for(uint32_t op=0; op < strTy->getNumElements(); op++)
+ for(uint32_t op=0; strTy && op < strTy->getNumElements(); op++)
{
Type* elementType = strTy->getElementType(op);
uint32_t align = 8 * getAlignmentByte(unit, elementType);
@@ -1344,6 +1479,8 @@ namespace gbe
getSequentialData(cds, mem, offset);
else {
const ConstantArray *ca = dyn_cast<ConstantArray>(c);
+ if(!ca)
+ return;
const ArrayType *arrTy = ca->getType();
Type* elemTy = arrTy->getElementType();
uint32_t elemSize = getTypeBitSize(unit, elemTy);
@@ -1402,7 +1539,7 @@ namespace gbe
const GlobalVariable &v = *i;
if(!v.isConstantUsed()) continue;
const char *name = v.getName().data();
- unsigned addrSpace = v.getType()->getAddressSpace();
+ ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace());
if(addrSpace == ir::AddressSpace::MEM_CONSTANT || v.isConstant()) {
GBE_ASSERT(v.hasInitializer());
const Constant *c = v.getInitializer();
@@ -1708,7 +1845,7 @@ namespace gbe
}
void GenWriter::simplifyTerminator(BasicBlock *bb) {
- Value *value = --bb->end();
+ Value *value = bb->getTerminator();
BranchInst *I = NULL;
if ((I = dyn_cast<BranchInst>(value)) != NULL) {
if (I->isConditional() == false)
@@ -1737,7 +1874,13 @@ namespace gbe
void GenWriter::emitBasicBlock(BasicBlock *BB) {
GBE_ASSERT(labelMap.find(BB) != labelMap.end());
ctx.LABEL(labelMap[BB]);
- for (auto II = BB->begin(), E = BB->end(); II != E; ++II) visit(*II);
+ for (auto II = BB->begin(), E = BB->end(); II != E; ++II) {
+ if(OCL_DEBUGINFO) {
+ llvm::Instruction * It = dyn_cast<llvm::Instruction>(II);
+ setDebugInfo_CTX(It);
+ }
+ visit(*II);
+ }
}
void GenWriter::emitMovForPHI(BasicBlock *curr, BasicBlock *succ) {
@@ -1804,6 +1947,15 @@ namespace gbe
}
}
+ void GenWriter::setDebugInfo_CTX(llvm::Instruction * insn)
+ {
+ llvm::DebugLoc dg = insn->getDebugLoc();
+ DebugInfo dbginfo;
+ dbginfo.line = dg.getLine();
+ dbginfo.col = dg.getCol();
+ ctx.setDBGInfo(dbginfo);
+ }
+
void GenWriter::emitFunctionPrototype(Function &F)
{
GBE_ASSERTM(F.hasStructRetAttr() == false,
@@ -1815,6 +1967,7 @@ namespace gbe
ir::FunctionArgument::InfoFromLLVM llvmInfo;
MDNode *addrSpaceNode = NULL;
MDNode *typeNameNode = NULL;
+ MDNode *typeBaseNameNode = NULL;
MDNode *accessQualNode = NULL;
MDNode *typeQualNode = NULL;
MDNode *argNameNode = NULL;
@@ -1829,7 +1982,7 @@ namespace gbe
assert(node);
- for(uint j = 0; j < node->getNumOperands() - 1; j++) {
+ for(uint j = 0; node && j < node->getNumOperands() - 1; j++) {
MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
if (attrNode == NULL) break;
MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
@@ -1852,7 +2005,7 @@ namespace gbe
reqd_wg_sz[2] = z->getZExtValue();
functionAttributes += attrName->getString();
std::stringstream param;
- char buffer[100];
+ char buffer[100] = {0};
param <<"(";
param << reqd_wg_sz[0];
param << ",";
@@ -1870,6 +2023,8 @@ namespace gbe
accessQualNode = attrNode;
} else if (attrName->getString() == "kernel_arg_type") {
typeNameNode = attrNode;
+ } else if (attrName->getString() == "kernel_arg_base_type") {
+ typeBaseNameNode = attrNode;
} else if (attrName->getString() == "kernel_arg_type_qual") {
typeQualNode = attrNode;
} else if (attrName->getString() == "kernel_arg_name") {
@@ -1877,6 +2032,38 @@ namespace gbe
} else if (attrName->getString() == "vec_type_hint") {
GBE_ASSERT(attrNode->getNumOperands() == 3);
functionAttributes += attrName->getString();
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+ Value* V = attrNode->getOperand(1);
+#else
+ auto *Op1 = cast<ValueAsMetadata>(attrNode->getOperand(1));
+ Value *V = Op1 ? Op1->getValue() : NULL;
+#endif
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+ ConstantInt *sign = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+#else
+ ConstantInt *sign = mdconst::extract<ConstantInt>(attrNode->getOperand(2));
+#endif
+ size_t signValue = sign->getZExtValue();
+ Type* vtype = V->getType();
+ Type* stype = vtype;
+ uint32_t elemNum = 0;
+ if(vtype->isVectorTy()) {
+ VectorType *vectorType = cast<VectorType>(vtype);
+ stype = vectorType->getElementType();
+ elemNum = vectorType->getNumElements();
+ }
+
+ std::string typeName = getTypeName(ctx, stype, signValue);
+
+ std::stringstream param;
+ char buffer[100] = {0};
+ param <<"(";
+ param << typeName;
+ if(vtype->isVectorTy())
+ param << elemNum;
+ param <<")";
+ param >> buffer;
+ functionAttributes += buffer;
functionAttributes += " ";
} else if (attrName->getString() == "work_group_size_hint") {
GBE_ASSERT(attrNode->getNumOperands() == 4);
@@ -1895,7 +2082,7 @@ namespace gbe
hint_wg_sz[2] = z->getZExtValue();
functionAttributes += attrName->getString();
std::stringstream param;
- char buffer[100];
+ char buffer[100] = {0};
param <<"(";
param << hint_wg_sz[0];
param << ",";
@@ -1925,25 +2112,36 @@ namespace gbe
for (; I != E; ++I, ++argID) {
const std::string &argName = I->getName().str();
Type *type = I->getType();
+ if(addrSpaceNode) {
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
- llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
+ llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
#else
- llvmInfo.addrSpace = (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
+ llvmInfo.addrSpace = (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
#endif
- llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
- llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
- llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
+ }
+ if(typeNameNode) {
+ llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+ }
+ if(typeBaseNameNode){
+ llvmInfo.typeBaseName = (cast<MDString>(typeBaseNameNode->getOperand(1 + argID)))->getString();
+ }
+ if(accessQualNode) {
+ llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
+ }
+ if(typeQualNode) {
+ llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
+ }
if(argNameNode){
llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
}
// function arguments are uniform values.
- this->newRegister(I, NULL, true);
+ this->newRegister(&*I, NULL, true);
// add support for vector argument.
if(type->isVectorTy()) {
VectorType *vectorType = cast<VectorType>(type);
- ir::Register reg = getRegister(I, 0);
+ ir::Register reg = getRegister(&*I, 0);
Type *elemType = vectorType->getElementType();
const uint32_t elemSize = getTypeByteSize(unit, elemType);
const uint32_t elemNum = vectorType->getNumElements();
@@ -1953,7 +2151,7 @@ namespace gbe
ir::Function& fn = ctx.getFunction();
for(uint32_t i=1; i < elemNum; i++) {
ir::PushLocation argLocation(fn, argID, elemSize*i);
- reg = getRegister(I, i);
+ reg = getRegister(&*I, i);
ctx.appendPushedConstant(reg, argLocation); //add to push map for reg alloc
}
continue;
@@ -1961,10 +2159,10 @@ namespace gbe
GBE_ASSERTM(isScalarType(type) == true,
"vector type in the function argument is not supported yet");
- const ir::Register reg = getRegister(I);
+ const ir::Register reg = getRegister(&*I);
if (llvmInfo.isImageType()) {
ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, 4, 4, 0);
- ctx.getFunction().getImageSet()->append(reg, &ctx, BtiMap.find(I)->second);
+ ctx.getFunction().getImageSet()->append(reg, &ctx, BtiMap.find(&*I)->second);
collectImageArgs(llvmInfo.accessQual, imageArgsInfo);
continue;
}
@@ -1979,6 +2177,8 @@ namespace gbe
ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
else {
PointerType *pointerType = dyn_cast<PointerType>(type);
+ if(!pointerType)
+ continue;
Type *pointed = pointerType->getElementType();
// By value structure
#if LLVM_VERSION_MINOR <= 1
@@ -1997,7 +2197,7 @@ namespace gbe
const uint32_t align = getAlignmentByte(unit, pointed);
switch (addrSpace) {
case ir::MEM_GLOBAL:
- ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, BtiMap.find(I)->second);
+ ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, BtiMap.find(&*I)->second);
break;
case ir::MEM_LOCAL:
ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, llvmInfo, ptrSize, align, BTI_LOCAL);
@@ -2074,6 +2274,11 @@ namespace gbe
// destinations)
uint32_t insnID = 2;
bb.foreach([&](ir::Instruction &insn) {
+ if (insn.getOpcode() == ir::OP_MOV &&
+ insn.getDst(0) == insn.getSrc(0)) {
+ insn.remove();
+ return;
+ }
const uint32_t dstNum = insn.getDstNum();
const uint32_t srcNum = insn.getSrcNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
@@ -2090,7 +2295,9 @@ namespace gbe
});
}
- void GenWriter::optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn)
+ void GenWriter::optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn,
+ map<ir::Register, ir::Register> &replaceMap,
+ map<ir::Register, ir::Register> &redundantPhiCopyMap)
{
// The overall idea behind is we check whether there is any interference
// between phi and phiCopy live range. If there is no point that
@@ -2101,7 +2308,6 @@ namespace gbe
using namespace ir;
ir::FunctionDAG *dag = new ir::FunctionDAG(liveness);
-
for (auto &it : phiMap) {
const Register phi = it.first;
const Register phiCopy = it.second;
@@ -2153,6 +2359,8 @@ namespace gbe
ir::BasicBlock::const_iterator iter = ir::BasicBlock::const_iterator(phiCopySrcDefInsn);
ir::BasicBlock::const_iterator iterE = bb->end();
+
+ iter++;
// check no use of phi in this basicblock between [phiCopySrc def, bb end]
bool phiPhiCopySrcInterfere = false;
while (iter != iterE) {
@@ -2167,8 +2375,7 @@ namespace gbe
++iter;
}
if (!phiPhiCopySrcInterfere) {
- // phiCopy source can be coaleased with phiCopy
- const_cast<Instruction *>(phiCopyDefInsn)->remove();
+ replaceSrc(const_cast<Instruction *>(phiCopyDefInsn), phiCopySrc, phiCopy);
for (auto &s : *phiCopySrcDef) {
const Instruction *phiSrcDefInsn = s->getInstruction();
@@ -2179,8 +2386,15 @@ namespace gbe
const Instruction *phiSrcUseInsn = s->getInstruction();
replaceSrc(const_cast<Instruction *>(phiSrcUseInsn), phiCopySrc, phiCopy);
}
+ replaceMap.insert(std::make_pair(phiCopySrc, phiCopy));
}
}
+ } else {
+ // FIXME, if the phiCopySrc is a phi value and has been used for more than one phiCopySrc
+ // This 1:1 map will ignore the second one.
+ if (((*(phiCopySrcDef->begin()))->getType() == ValueDef::DEF_INSN_DST) &&
+ redundantPhiCopyMap.find(phiCopySrc) == redundantPhiCopyMap.end())
+ redundantPhiCopyMap.insert(std::make_pair(phiCopySrc, phiCopy));
}
// If phi is used in the same BB that define the phiCopy,
@@ -2212,20 +2426,117 @@ namespace gbe
}
}
- // coalease phi and phiCopy
+ // coalease phi and phiCopy
if (isOpt) {
for (auto &x : *phiDef) {
- const_cast<Instruction *>(x->getInstruction())->remove();
+ replaceDst(const_cast<Instruction *>(x->getInstruction()), phi, phiCopy);
}
for (auto &x : *phiUse) {
const Instruction *phiUseInsn = x->getInstruction();
replaceSrc(const_cast<Instruction *>(phiUseInsn), phi, phiCopy);
+ replaceMap.insert(std::make_pair(phi, phiCopy));
}
}
}
delete dag;
}
+ void GenWriter::postPhiCopyOptimization(ir::Liveness &liveness,
+ ir::Function &fn, map <ir::Register, ir::Register> &replaceMap,
+ map <ir::Register, ir::Register> &redundantPhiCopyMap)
+ {
+ // When doing the first pass phi copy optimization, we skip all the phi src MOV cases
+ // whose phiCopySrcDefs are also a phi value. We handle them here, after all phi copy
+ // optimizations have been done, so we don't need to worry that reducible phi copies remain.
+ // We only need to check whether those possible redundant phi copy pairs interfere with
+ // each other globally, by leveraging the DAG information.
+ using namespace ir;
+
+ // Firstly, validate all possible redundant phi copy map and update liveness information
+ // accordingly.
+ if (replaceMap.size() != 0) {
+ for (auto pair : replaceMap) {
+ if (redundantPhiCopyMap.find(pair.first) != redundantPhiCopyMap.end()) {
+ auto it = redundantPhiCopyMap.find(pair.first);
+ Register phiCopy = it->second;
+ Register newPhiCopySrc = pair.second;
+ redundantPhiCopyMap.erase(it);
+ redundantPhiCopyMap.insert(std::make_pair(newPhiCopySrc, phiCopy));
+ }
+ }
+ liveness.replaceRegs(replaceMap);
+ replaceMap.clear();
+ }
+ if (redundantPhiCopyMap.size() == 0)
+ return;
+ auto dag = new FunctionDAG(liveness);
+
+ map<Register, Register> newRedundant;
+ map<Register, Register> *curRedundant = &redundantPhiCopyMap;
+ map<Register, Register> *nextRedundant = &newRedundant, tmp;
+ map<Register, Register> replacedRegs, revReplacedRegs;
+ // Do multi pass redundant phi copy elimination based on the global interfering information.
+ // FIXME, we don't need to re-compute the whole DAG for each pass.
+ while (curRedundant->size() > 0) {
+ //for (auto &pair = *curRedundant) {
+ for (auto pair = curRedundant->begin(); pair != curRedundant->end(); ) {
+ auto phiCopySrc = pair->first;
+ auto phiCopy = pair->second;
+ if (replacedRegs.find(phiCopy) != replacedRegs.end() ||
+ revReplacedRegs.find(phiCopy) != revReplacedRegs.end() ||
+ revReplacedRegs.find(phiCopySrc) != revReplacedRegs.end()) {
+ pair++;
+ continue;
+ }
+ if (!dag->interfere(liveness, phiCopySrc, phiCopy)) {
+ const ir::DefSet *phiCopySrcDef = dag->getRegDef(phiCopySrc);
+ const ir::UseSet *phiCopySrcUse = dag->getRegUse(phiCopySrc);
+ for (auto &s : *phiCopySrcDef) {
+ const Instruction *phiSrcDefInsn = s->getInstruction();
+ replaceDst(const_cast<Instruction *>(phiSrcDefInsn), phiCopySrc, phiCopy);
+ }
+
+ for (auto &s : *phiCopySrcUse) {
+ const Instruction *phiSrcUseInsn = s->getInstruction();
+ replaceSrc(const_cast<Instruction *>(phiSrcUseInsn), phiCopySrc, phiCopy);
+ }
+
+ replacedRegs.insert(std::make_pair(phiCopySrc, phiCopy));
+ revReplacedRegs.insert(std::make_pair(phiCopy, phiCopySrc));
+ curRedundant->erase(pair++);
+ } else
+ pair++;
+ }
+
+ if (replacedRegs.size() != 0) {
+ liveness.replaceRegs(replacedRegs);
+ for (auto &pair : *curRedundant) {
+ auto from = pair.first;
+ auto to = pair.second;
+ bool revisit = false;
+ if (replacedRegs.find(pair.second) != replacedRegs.end()) {
+ to = replacedRegs.find(to)->second;
+ revisit = true;
+ }
+ if (revReplacedRegs.find(from) != revReplacedRegs.end() ||
+ revReplacedRegs.find(to) != revReplacedRegs.end())
+ revisit = true;
+ if (revisit)
+ nextRedundant->insert(std::make_pair(from, to));
+ }
+ std::swap(curRedundant, nextRedundant);
+ } else
+ break;
+
+ nextRedundant->clear();
+ replacedRegs.clear();
+ revReplacedRegs.clear();
+ delete dag;
+ dag = new ir::FunctionDAG(liveness);
+ }
+ delete dag;
+ }
+
void GenWriter::removeMOVs(const ir::Liveness &liveness, ir::Function &fn)
{
// We store the last write and last read for each register
@@ -2376,7 +2687,6 @@ namespace gbe
{
// Allocate a address register for each global variable
const Module::GlobalListType &globalList = TheModule->getGlobalList();
- size_t j = 0;
for(auto i = globalList.begin(); i != globalList.end(); i ++) {
const GlobalVariable &v = *i;
if(!v.isConstantUsed()) continue;
@@ -2408,16 +2718,12 @@ namespace gbe
GBE_ASSERT(v.hasInitializer());
this->newRegister(const_cast<GlobalVariable*>(&v));
ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
- ir::Constant &con = unit.getConstantSet().getConstant(j ++);
- GBE_ASSERT(con.getName() == v.getName());
+ ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
} else {
- if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
- ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
- regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
- } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
- ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
- regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
+ if(v.getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
+ ctx.getUnit().getProfilingInfo()->setBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
+ regTranslator.newScalarProxy(ir::ocl::profilingbptr, const_cast<GlobalVariable*>(&v));
} else if(v.getName().str().substr(0, 4) == ".str") {
/* When there are multi printf statements in multi kernel fucntions within the same
translate unit, if they have the same sting parameter, such as
@@ -2459,35 +2765,6 @@ namespace gbe
std::vector<std::pair<Loop*, int>> lp;
findAllLoops(LI, lp);
-#if GBE_DEBUG
- // check two loops' interference
- for(unsigned int i = 0; i < lp.size(); i++) {
- SmallVector<Loop::Edge, 8> exitBBs;
- lp[i].first->getExitEdges(exitBBs);
-
- const std::vector<BasicBlock*> &inBBs = lp[i].first->getBlocks();
- std::vector<ir::LabelIndex> bbs1;
- for(auto x : inBBs) {
- bbs1.push_back(labelMap[x]);
- }
- std::sort(bbs1.begin(), bbs1.end());
- for(unsigned int j = i+1; j < lp.size(); j++) {
- if(! lp[i].first->contains(lp[j].first)) {
- const std::vector<BasicBlock*> &inBBs2 = lp[j].first->getBlocks();
- std::vector<ir::LabelIndex> bbs2;
- std::vector<ir::LabelIndex> bbs3;
-
- for(auto x : inBBs2) {
- bbs2.push_back(labelMap[x]);
- }
-
- std::sort(bbs2.begin(), bbs2.end());
- std::set_intersection(bbs1.begin(), bbs1.end(), bbs2.begin(), bbs2.end(), std::back_inserter(bbs3));
- GBE_ASSERT(bbs3.size() < 1);
- }
- }
- }
-#endif
for (auto loop : lp) {
loopBBs.clear();
@@ -2498,6 +2775,11 @@ namespace gbe
GBE_ASSERT(labelMap.find(b) != labelMap.end());
loopBBs.push_back(labelMap[b]);
}
+ BasicBlock *preheader = loop.first->getLoopPredecessor();
+ ir::LabelIndex preheaderBB(0);
+ if (preheader) {
+ preheaderBB = labelMap[preheader];
+ }
SmallVector<Loop::Edge, 8> exitBBs;
loop.first->getExitEdges(exitBBs);
@@ -2506,7 +2788,7 @@ namespace gbe
GBE_ASSERT(labelMap.find(b.second) != labelMap.end());
loopExits.push_back(std::make_pair(labelMap[b.first], labelMap[b.second]));
}
- fn.addLoop(loopBBs, loopExits);
+ fn.addLoop(preheaderBB, loop.second, loopBBs, loopExits);
}
}
@@ -2662,15 +2944,18 @@ namespace gbe
pass = PASS_EMIT_REGISTERS;
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
visit(*I);
+
+ // Abort if this found an error (otherwise emitBasicBlock will assert)
+ if(has_errors){return;}
// First create all the labels (one per block) ...
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
- this->newLabelIndex(BB);
+ this->newLabelIndex(&*BB);
// Then, for all branch instructions that have conditions, see if we can
// simplify the code by inverting condition code
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
- this->simplifyTerminator(BB);
+ this->simplifyTerminator(&*BB);
// gather loop info, which is useful for liveness analysis
gatherLoopInfo(fn);
@@ -2678,15 +2963,19 @@ namespace gbe
// ... then, emit the instructions for all basic blocks
pass = PASS_EMIT_INSTRUCTIONS;
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
- emitBasicBlock(BB);
+ emitBasicBlock(&*BB);
ctx.endFunction();
// Liveness can be shared when we optimized the immediates and the MOVs
ir::Liveness liveness(fn);
if (OCL_OPTIMIZE_LOADI) this->removeLOADIs(liveness, fn);
- if (OCL_OPTIMIZE_PHI_MOVES) this->optimizePhiCopy(liveness, fn);
- if (OCL_OPTIMIZE_PHI_MOVES) this->removeMOVs(liveness, fn);
+ if (OCL_OPTIMIZE_PHI_MOVES) {
+ map <ir::Register, ir::Register> replaceMap, redundantPhiCopyMap;
+ this->optimizePhiCopy(liveness, fn, replaceMap, redundantPhiCopyMap);
+ this->postPhiCopyOptimization(liveness, fn, replaceMap, redundantPhiCopyMap);
+ this->removeMOVs(liveness, fn);
+ }
}
void GenWriter::regAllocateReturnInst(ReturnInst &I) {}
@@ -3204,7 +3493,7 @@ namespace gbe
Value *Callee = I.getCalledValue();
GBE_ASSERT(ctx.getFunction().getProfile() == ir::PROFILE_OCL);
GBE_ASSERT(isa<InlineAsm>(I.getCalledValue()) == false);
- GBE_ASSERT(I.hasStructRetAttr() == false);
+ if(I.getNumArgOperands()) GBE_ASSERT(I.hasStructRetAttr() == false);
// We only support a small number of intrinsics right now
if (Function *F = I.getCalledFunction()) {
@@ -3225,6 +3514,7 @@ namespace gbe
break;
#endif /* LLVM_VERSION_MINOR >= 2 */
case Intrinsic::debugtrap:
+ case Intrinsic::trap:
case Intrinsic::dbg_value:
case Intrinsic::dbg_declare:
break;
@@ -3300,6 +3590,10 @@ namespace gbe
regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break;
case GEN_OCL_GET_GLOBAL_OFFSET2:
regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
+ case GEN_OCL_GET_THREAD_NUM:
+ regTranslator.newScalarProxy(ir::ocl::threadn, dst); break;
+ case GEN_OCL_GET_THREAD_ID:
+ regTranslator.newScalarProxy(ir::ocl::threadid, dst); break;
case GEN_OCL_GET_WORK_DIM:
regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
case GEN_OCL_FBH:
@@ -3434,17 +3728,67 @@ namespace gbe
case GEN_OCL_REGION:
case GEN_OCL_SIMD_ID:
case GEN_OCL_SIMD_SHUFFLE:
+ case GEN_OCL_VME:
+ case GEN_OCL_WORK_GROUP_ALL:
+ case GEN_OCL_WORK_GROUP_ANY:
+ case GEN_OCL_WORK_GROUP_BROADCAST:
+ case GEN_OCL_WORK_GROUP_REDUCE_ADD:
+ case GEN_OCL_WORK_GROUP_REDUCE_MAX:
+ case GEN_OCL_WORK_GROUP_REDUCE_MIN:
+ case GEN_OCL_WORK_GROUP_SCAN_EXCLUSIVE_ADD:
+ case GEN_OCL_WORK_GROUP_SCAN_EXCLUSIVE_MAX:
+ case GEN_OCL_WORK_GROUP_SCAN_EXCLUSIVE_MIN:
+ case GEN_OCL_WORK_GROUP_SCAN_INCLUSIVE_ADD:
+ case GEN_OCL_WORK_GROUP_SCAN_INCLUSIVE_MAX:
+ case GEN_OCL_WORK_GROUP_SCAN_INCLUSIVE_MIN:
+ case GEN_OCL_SUB_GROUP_BROADCAST:
+ case GEN_OCL_SUB_GROUP_REDUCE_ADD:
+ case GEN_OCL_SUB_GROUP_REDUCE_MAX:
+ case GEN_OCL_SUB_GROUP_REDUCE_MIN:
+ case GEN_OCL_SUB_GROUP_SCAN_EXCLUSIVE_ADD:
+ case GEN_OCL_SUB_GROUP_SCAN_EXCLUSIVE_MAX:
+ case GEN_OCL_SUB_GROUP_SCAN_EXCLUSIVE_MIN:
+ case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_ADD:
+ case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MAX:
+ case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN:
+ case GEN_OCL_LRP:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
this->newRegister(&I);
break;
case GEN_OCL_PRINTF:
+ this->newRegister(&I); // fall through
+ case GEN_OCL_PUTS:
+ {
+ // We need a new BTI as printf output.
+ if (printfBti < 0) {
+ printfBti = this->getNewBti(&I, true);
+ ctx.getFunction().getPrintfSet()->setBufBTI(printfBti);
+ }
+ break;
+ }
+ case GEN_OCL_CALC_TIMESTAMP:
+ case GEN_OCL_STORE_PROFILING:
+ case GEN_OCL_DEBUGWAIT:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
break;
case GEN_OCL_NOT_FOUND:
default:
- std::cerr << "Caller instruction: " << std::endl;
- I.dump();
- std::cerr << "Callee function: " << std::endl;
- Callee->dump();
- GBE_ASSERT(0);
+ has_errors = true;
+ Func->getContext().emitError(&I,"function '" + fnName + "' not found or cannot be inlined");
};
}
@@ -3463,49 +3807,264 @@ namespace gbe
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
GBE_ASSERT(AI != AE);
-
- ir::AddressSpace addrSpace;
-
Value *llvmPtr = *AI;
- Value *bti = getBtiRegister(llvmPtr);
- Value *ptrBase = getPointerBase(llvmPtr);
+ ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
ir::Register pointer = this->getRegister(llvmPtr);
- ir::Register baseReg = this->getRegister(ptrBase);
+ ir::Register ptr;
ir::Register btiReg;
- bool fixedBTI = false;
- if (isa<ConstantInt>(bti)) {
- fixedBTI = true;
- unsigned index = cast<ConstantInt>(bti)->getZExtValue();
- addrSpace = btiToGen(index);
- ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
- btiReg = ctx.reg(ir::FAMILY_DWORD);
- ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
+ unsigned SurfaceIndex = 0xff;;
+
+ ir::AddressMode AM;
+ if (legacyMode) {
+ Value *bti = getBtiRegister(llvmPtr);
+ Value *ptrBase = getPointerBase(llvmPtr);
+ ir::Register baseReg = this->getRegister(ptrBase);
+ if (isa<ConstantInt>(bti)) {
+ AM = ir::AM_StaticBti;
+ SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+ addrSpace = btiToGen(SurfaceIndex);
+ } else {
+ AM = ir::AM_DynamicBti;
+ addrSpace = ir::MEM_MIXED;
+ btiReg = this->getRegister(bti);
+ }
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ ptr = ctx.reg(pointerFamily);
+ ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
} else {
- addrSpace = ir::MEM_MIXED;
- btiReg = this->getRegister(bti);
+ AM = ir::AM_Stateless;
+ ptr = pointer;
}
- const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
- const ir::Register ptr = ctx.reg(pointerFamily);
- ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
-
const ir::Register dst = this->getRegister(&I);
- uint32_t srcNum = 0;
- vector<ir::Register> src;
- src.push_back(ptr);
- srcNum++;
+ uint32_t payloadNum = 0;
+ vector<ir::Register> payload;
AI++;
while(AI != AE) {
- src.push_back(this->getRegister(*(AI++)));
- srcNum++;
+ payload.push_back(this->getRegister(*(AI++)));
+ payloadNum++;
+ }
+ ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType());
+ const ir::Tuple payloadTuple = payloadNum == 0 ?
+ ir::Tuple(0) :
+ ctx.arrayTuple(&payload[0], payloadNum);
+ if (AM == ir::AM_DynamicBti) {
+ ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, btiReg);
+ } else {
+ ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, SurfaceIndex);
}
- const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
- ctx.ATOMIC(opcode, dst, addrSpace, btiReg, fixedBTI, srcTuple);
}
+ void GenWriter::emitWorkGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode) {
+ ir::Function &f = ctx.getFunction();
+
+ if (f.getwgBroadcastSLM() < 0 && opcode == ir::WORKGROUP_OP_BROADCAST) {
+ uint32_t mapSize = 8;
+ f.setUseSLM(true);
+ uint32_t oldSlm = f.getSLMSize();
+ f.setSLMSize(oldSlm + mapSize);
+ f.setwgBroadcastSLM(oldSlm);
+ GBE_ASSERT(f.getwgBroadcastSLM() >= 0);
+ }
+
+ else if (f.gettidMapSLM() < 0 && opcode >= ir::WORKGROUP_OP_ANY && opcode <= ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
+ /* 1. For thread SLM based communication (default):
+ * Threads will use SLM to write partial results computed individually
+ and then read the whole set. Because the read is done in chunks of 4,
+ extra padding is required.
+
+ When we come to here, the global thread local vars should have all been
+ allocated, so it's safe for us to steal a piece of SLM for this usage. */
+
+ // at most 64 thread for one subslice, along with extra padding
+ uint32_t mapSize = sizeof(uint32_t) * (64 + 4);
+ f.setUseSLM(true);
+ uint32_t oldSlm = f.getSLMSize();
+ f.setSLMSize(oldSlm + mapSize);
+ f.settidMapSLM(oldSlm);
+ GBE_ASSERT(f.gettidMapSLM() >= 0);
+ }
+
+ CallSite::arg_iterator AI = CS.arg_begin();
+ CallSite::arg_iterator AE = CS.arg_end();
+ GBE_ASSERT(AI != AE);
+
+ if (opcode == ir::WORKGROUP_OP_ALL || opcode == ir::WORKGROUP_OP_ANY) {
+ GBE_ASSERT(getType(ctx, (*AI)->getType()) == ir::TYPE_S32);
+ ir::Register src[3];
+ src[0] = ir::ocl::threadn;
+ src[1] = ir::ocl::threadid;
+ src[2] = this->getRegister(*(AI++));
+ const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3);
+ ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(), getRegister(&I), srcTuple, 3, ir::TYPE_S32);
+ } else if (opcode == ir::WORKGROUP_OP_BROADCAST) {
+ int argNum = CS.arg_size();
+ std::vector<ir::Register> src(argNum);
+ for (int i = 0; i < argNum; i++) {
+ src[i] = this->getRegister(*(AI++));
+ }
+ const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], argNum);
+ ctx.WORKGROUP(ir::WORKGROUP_OP_BROADCAST, (uint32_t)f.getwgBroadcastSLM(), getRegister(&I), srcTuple, argNum,
+ getType(ctx, (*CS.arg_begin())->getType()));
+ } else {
+ ConstantInt *sign = dyn_cast<ConstantInt>(AI);
+ GBE_ASSERT(sign);
+ bool isSign = sign->getZExtValue();
+ AI++;
+ ir::Type ty;
+ if (isSign) {
+ ty = getType(ctx, (*AI)->getType());
+
+ } else {
+ ty = getUnsignedType(ctx, (*AI)->getType());
+ }
+
+ ir::Register src[3];
+ src[0] = ir::ocl::threadn;
+ src[1] = ir::ocl::threadid;
+ src[2] = this->getRegister(*(AI++));
+ const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3);
+ ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(), getRegister(&I), srcTuple, 3, ty);
+ }
+
+ GBE_ASSERT(AI == AE);
+ }
+
+ void GenWriter::emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode) {
+ CallSite::arg_iterator AI = CS.arg_begin();
+ CallSite::arg_iterator AE = CS.arg_end();
+ GBE_ASSERT(AI != AE);
+
+ if (opcode == ir::WORKGROUP_OP_ALL || opcode == ir::WORKGROUP_OP_ANY) {
+ GBE_ASSERT(getType(ctx, (*AI)->getType()) == ir::TYPE_S32);
+ ir::Register src[3];
+ src[0] = this->getRegister(*(AI++));
+ const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 1);
+ ctx.SUBGROUP(opcode, getRegister(&I), srcTuple, 1, ir::TYPE_S32);
+ } else if (opcode == ir::WORKGROUP_OP_BROADCAST) {
+ int argNum = CS.arg_size();
+ std::vector<ir::Register> src(argNum);
+ for (int i = 0; i < argNum; i++) {
+ src[i] = this->getRegister(*(AI++));
+ }
+ const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], argNum);
+ ctx.SUBGROUP(ir::WORKGROUP_OP_BROADCAST, getRegister(&I), srcTuple, argNum,
+ getType(ctx, (*CS.arg_begin())->getType()));
+ } else {
+ ConstantInt *sign = dyn_cast<ConstantInt>(AI);
+ GBE_ASSERT(sign);
+ bool isSign = sign->getZExtValue();
+ AI++;
+ ir::Type ty;
+ if (isSign) {
+ ty = getType(ctx, (*AI)->getType());
+
+ } else {
+ ty = getUnsignedType(ctx, (*AI)->getType());
+ }
+
+ ir::Register src[3];
+ src[0] = this->getRegister(*(AI++));
+ const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 1);
+ ctx.SUBGROUP(opcode, getRegister(&I), srcTuple, 1, ty);
+ }
+
+ GBE_ASSERT(AI == AE);
+ }
+
+ void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+ CallSite::arg_iterator AI = CS.arg_begin();
+ CallSite::arg_iterator AE = CS.arg_end();
+ GBE_ASSERT(AI != AE);
+
+ Value *llvmPtr = *(AI++);
+ ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
+ GBE_ASSERT(addrSpace == ir::MEM_GLOBAL);
+ ir::Register pointer = this->getRegister(llvmPtr);
+
+ ir::Register ptr;
+ ir::Register btiReg;
+ unsigned SurfaceIndex = 0xff;
+
+ ir::AddressMode AM;
+ if (legacyMode) {
+ Value *bti = getBtiRegister(llvmPtr);
+ Value *ptrBase = getPointerBase(llvmPtr);
+ ir::Register baseReg = this->getRegister(ptrBase);
+ if (isa<ConstantInt>(bti)) {
+ AM = ir::AM_StaticBti;
+ SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+ addrSpace = btiToGen(SurfaceIndex);
+ } else {
+ AM = ir::AM_DynamicBti;
+ addrSpace = ir::MEM_MIXED;
+ btiReg = this->getRegister(bti);
+ }
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ ptr = ctx.reg(pointerFamily);
+ ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+ } else {
+ AM = ir::AM_Stateless;
+ ptr = pointer;
+ }
+
+ ir::Type type = ir::TYPE_U32;
+ GBE_ASSERT(AM != ir::AM_DynamicBti);
+
+ if(isWrite){
+ Value *llvmValues = *(AI++);
+ vector<ir::Register> srcTupleData;
+ for(int i = 0;i < vec_size; i++)
+ srcTupleData.push_back(getRegister(llvmValues, i));
+ const ir::Tuple tuple = ctx.arrayTuple(&srcTupleData[0], vec_size);
+ ctx.STORE(type, tuple, ptr, addrSpace, vec_size, true, AM, SurfaceIndex, true);
+ } else {
+ vector<ir::Register> dstTupleData;
+ for(int i = 0;i < vec_size; i++)
+ dstTupleData.push_back(getRegister(&I, i));
+ const ir::Tuple tuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
+ ctx.LOAD(type, tuple, ptr, addrSpace, vec_size, true, AM, SurfaceIndex, true);
+ }
+
+ GBE_ASSERT(AI == AE);
+ }
+
+ void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+ CallSite::arg_iterator AI = CS.arg_begin();
+ CallSite::arg_iterator AE = CS.arg_end();
+ GBE_ASSERT(AI != AE);
+
+ const uint8_t imageID = getImageID(I);
+ AI++;
+
+ if(isWrite){
+ vector<ir::Register> srcTupleData;
+ srcTupleData.push_back(getRegister(*(AI++)));
+ srcTupleData.push_back(getRegister(*(AI++)));
+ for(int i = 0;i < vec_size; i++)
+ srcTupleData.push_back(getRegister(*(AI), i));
+ AI++;
+ const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size);
+ ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size);
+ } else {
+ ir::Register src[2];
+ src[0] = getRegister(*(AI++));
+ src[1] = getRegister(*(AI++));
+ vector<ir::Register> dstTupleData;
+ for(int i = 0;i < vec_size; i++)
+ dstTupleData.push_back(getRegister(&I, i));
+ const ir::Tuple srctuple = ctx.arrayTuple(src, 2);
+ const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
+ ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2);
+ }
+
+ GBE_ASSERT(AI == AE);
+ }
+
+
/* append a new sampler. should be called before any reference to
* a sampler_t value. */
uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
@@ -3560,22 +4119,12 @@ namespace gbe
}
break;
#if LLVM_VERSION_MINOR >= 2
- case Intrinsic::fmuladd:
- {
- const ir::Register tmp = ctx.reg(ir::FAMILY_DWORD);
- const ir::Register dst = this->getRegister(&I);
- const ir::Register src0 = this->getRegister(I.getOperand(0));
- const ir::Register src1 = this->getRegister(I.getOperand(1));
- const ir::Register src2 = this->getRegister(I.getOperand(2));
- ctx.MUL(ir::TYPE_FLOAT, tmp, src0, src1);
- ctx.ADD(ir::TYPE_FLOAT, dst, tmp, src2);
- }
- break;
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
break;
#endif /* LLVM_VERSION_MINOR >= 2 */
case Intrinsic::debugtrap:
+ case Intrinsic::trap:
case Intrinsic::dbg_value:
case Intrinsic::dbg_declare:
break;
@@ -3657,6 +4206,7 @@ namespace gbe
}
break;
case Intrinsic::fma:
+ case Intrinsic::fmuladd:
{
ir::Type srcType = getType(ctx, I.getType());
const ir::Register dst = this->getRegister(&I);
@@ -3668,7 +4218,6 @@ namespace gbe
break;
case Intrinsic::sqrt: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
case Intrinsic::ceil: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
- case Intrinsic::fabs: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
case Intrinsic::trunc: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
case Intrinsic::rint: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
case Intrinsic::floor: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
@@ -3686,6 +4235,13 @@ namespace gbe
ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
break;
}
+ case Intrinsic::fabs:
+ {
+ const ir::Register src = this->getRegister(*AI);
+ const ir::Register dst = this->getRegister(&I);
+ ctx.ALU1(ir::OP_ABS, getType(ctx, (*AI)->getType()), dst, src);
+ break;
+ }
default: NOT_IMPLEMENTED;
}
} else {
@@ -3732,6 +4288,52 @@ namespace gbe
ctx.READ_ARF(ir::TYPE_U32, dst, ir::ARF_TM);
break;
}
+ case GEN_OCL_VME:
+ {
+
+ const uint8_t imageID = getImageID(I);
+
+ AI++;
+ AI++;
+
+ uint32_t src_length = 40;
+
+ vector<ir::Register> dstTupleData, srcTupleData;
+ for (uint32_t i = 0; i < src_length; i++, AI++){
+ srcTupleData.push_back(this->getRegister(*AI));
+ }
+
+ const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], src_length);
+
+ Constant *msg_type_cpv = dyn_cast<Constant>(*AI);
+ assert(msg_type_cpv);
+ const ir::Immediate &msg_type_x = processConstantImm(msg_type_cpv);
+ int msg_type = msg_type_x.getIntegerValue();
+ uint32_t dst_length;
+ //msg_type == 1 indicates inter search only in the Gen VME shared function
+ GBE_ASSERT(msg_type == 1);
+ if(msg_type == 1)
+ dst_length = 6;
+ for (uint32_t elemID = 0; elemID < dst_length; ++elemID) {
+ const ir::Register reg = this->getRegister(&I, elemID);
+ dstTupleData.push_back(reg);
+ }
+ const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], dst_length);
+ ++AI;
+ Constant *vme_search_path_lut_cpv = dyn_cast<Constant>(*AI);
+ assert(vme_search_path_lut_cpv);
+ const ir::Immediate &vme_search_path_lut_x = processConstantImm(vme_search_path_lut_cpv);
+ ++AI;
+ Constant *lut_sub_cpv = dyn_cast<Constant>(*AI);
+ assert(lut_sub_cpv);
+ const ir::Immediate &lut_sub_x = processConstantImm(lut_sub_cpv);
+
+ ctx.VME(imageID, dstTuple, srcTuple, dst_length, src_length,
+ msg_type, vme_search_path_lut_x.getIntegerValue(),
+ lut_sub_x.getIntegerValue());
+
+ break;
+ }
case GEN_OCL_REGION:
{
const ir::Register dst = this->getRegister(&I);
@@ -4117,9 +4719,96 @@ namespace gbe
case GEN_OCL_PRINTF:
{
- ir::PrintfSet::PrintfFmt* fmt = (ir::PrintfSet::PrintfFmt*)getPrintfInfo(&I);
- ctx.getFunction().getPrintfSet()->append(fmt, unit);
- assert(fmt);
+ ir::PrintfSet::PrintfFmt* fmt = getPrintfInfo(&I);
+ if (fmt == NULL)
+ break;
+
+ ctx.getFunction().getPrintfSet()->append(printfNum, fmt);
+
+ vector<ir::Register> tupleData;
+ vector<ir::Type> tupleTypeData;
+ int argNum = static_cast<int>(I.getNumOperands());
+ argNum -= 2; // no fmt and last NULL.
+ int realArgNum = argNum;
+
+ for (int n = 0; n < argNum; n++) {
+ /* First, ignore %s, the strings are recorded and not passed to GPU. */
+ llvm::Constant* args = dyn_cast<llvm::ConstantExpr>(I.getOperand(n + 1));
+ llvm::Constant* args_ptr = NULL;
+ if (args)
+ args_ptr = dyn_cast<llvm::Constant>(args->getOperand(0));
+
+ if (args_ptr) {
+ ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(args_ptr->getOperand(0));
+ if (fmt_arg && fmt_arg->isCString()) {
+ realArgNum--;
+ continue;
+ }
+ }
+
+ Type * type = I.getOperand(n + 1)->getType();
+ if (type->isVectorTy()) {
+ uint32_t srcElemNum = 0;
+ Value *srcValue = I.getOperand(n + 1);
+ ir::Type srcType = getVectorInfo(ctx, srcValue, srcElemNum);
+ GBE_ASSERT(!(srcType == ir::TYPE_DOUBLE));
+
+ uint32_t elemID = 0;
+ for (elemID = 0; elemID < srcElemNum; ++elemID) {
+ ir::Register reg = getRegister(srcValue, elemID);
+ tupleData.push_back(reg);
+ tupleTypeData.push_back(srcType);
+ }
+ realArgNum += srcElemNum - 1;
+ } else {
+ ir::Register reg = getRegister(I.getOperand(n + 1));
+ tupleData.push_back(reg);
+ tupleTypeData.push_back(getType(ctx, I.getOperand(n + 1)->getType()));
+ }
+ }
+
+ ir::Tuple tuple;
+ ir::Tuple typeTuple;
+ if (realArgNum > 0) {
+ tuple = ctx.arrayTuple(&tupleData[0], realArgNum);
+ typeTuple = ctx.arrayTypeTuple(&tupleTypeData[0], realArgNum);
+ }
+ ctx.PRINTF(getRegister(&I), tuple, typeTuple, realArgNum, printfBti, printfNum);
+ printfNum++;
+ break;
+ }
+ case GEN_OCL_CALC_TIMESTAMP:
+ {
+ GBE_ASSERT(AI != AE);
+ ConstantInt *CI = dyn_cast<ConstantInt>(*AI);
+ GBE_ASSERT(CI);
+ uint32_t pointNum = CI->getZExtValue();
+ AI++;
+ GBE_ASSERT(AI != AE);
+ CI = dyn_cast<ConstantInt>(*AI);
+ GBE_ASSERT(CI);
+ uint32_t tsType = CI->getZExtValue();
+ ctx.CALC_TIMESTAMP(pointNum, tsType);
+ break;
+ }
+ case GEN_OCL_STORE_PROFILING:
+ {
+ /* The profiling log always begins at offset 0, so we
+ never need the buffer ptr value or ptrBase, and
+ have no need for SUB to calculate the real address either.
+ We just pass down the BTI value to the instruction. */
+ GBE_ASSERT(AI != AE);
+ Value* llvmPtr = *AI;
+ Value *bti = getBtiRegister(llvmPtr);
+ GBE_ASSERT(isa<ConstantInt>(bti)); //Should never be mixed pointer.
+ uint32_t index = cast<ConstantInt>(bti)->getZExtValue();
+ GBE_ASSERT(btiToGen(index) == ir::MEM_GLOBAL);
+ ++AI;
+ GBE_ASSERT(AI != AE);
+ ConstantInt *CI = dyn_cast<ConstantInt>(*AI);
+ GBE_ASSERT(CI);
+ uint32_t ptype = CI->getZExtValue();
+ ctx.getUnit().getProfilingInfo()->setProfilingType(ptype);
break;
}
case GEN_OCL_SIMD_SIZE:
@@ -4142,6 +4831,97 @@ namespace gbe
ctx.SIMD_SHUFFLE(getType(ctx, I.getType()), dst, src0, src1);
break;
}
+ case GEN_OCL_DEBUGWAIT:
+ {
+ ctx.WAIT();
+ break;
+ }
+ case GEN_OCL_WORK_GROUP_ALL: this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_ALL); break;
+ case GEN_OCL_WORK_GROUP_ANY: this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_ANY); break;
+ case GEN_OCL_WORK_GROUP_BROADCAST:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_BROADCAST); break;
+ case GEN_OCL_WORK_GROUP_REDUCE_ADD:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_REDUCE_ADD); break;
+ case GEN_OCL_WORK_GROUP_REDUCE_MAX:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_REDUCE_MAX); break;
+ case GEN_OCL_WORK_GROUP_REDUCE_MIN:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_REDUCE_MIN); break;
+ case GEN_OCL_WORK_GROUP_SCAN_EXCLUSIVE_ADD:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_EXCLUSIVE_ADD); break;
+ case GEN_OCL_WORK_GROUP_SCAN_EXCLUSIVE_MAX:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_EXCLUSIVE_MAX); break;
+ case GEN_OCL_WORK_GROUP_SCAN_EXCLUSIVE_MIN:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_EXCLUSIVE_MIN); break;
+ case GEN_OCL_WORK_GROUP_SCAN_INCLUSIVE_ADD:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_INCLUSIVE_ADD); break;
+ case GEN_OCL_WORK_GROUP_SCAN_INCLUSIVE_MAX:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_INCLUSIVE_MAX); break;
+ case GEN_OCL_WORK_GROUP_SCAN_INCLUSIVE_MIN:
+ this->emitWorkGroupInst(I, CS, ir::WORKGROUP_OP_INCLUSIVE_MIN); break;
+ case GEN_OCL_SUB_GROUP_BROADCAST:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_BROADCAST); break;
+ case GEN_OCL_SUB_GROUP_REDUCE_ADD:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_REDUCE_ADD); break;
+ case GEN_OCL_SUB_GROUP_REDUCE_MAX:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_REDUCE_MAX); break;
+ case GEN_OCL_SUB_GROUP_REDUCE_MIN:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_REDUCE_MIN); break;
+ case GEN_OCL_SUB_GROUP_SCAN_EXCLUSIVE_ADD:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_EXCLUSIVE_ADD); break;
+ case GEN_OCL_SUB_GROUP_SCAN_EXCLUSIVE_MAX:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_EXCLUSIVE_MAX); break;
+ case GEN_OCL_SUB_GROUP_SCAN_EXCLUSIVE_MIN:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_EXCLUSIVE_MIN); break;
+ case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_ADD:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_INCLUSIVE_ADD); break;
+ case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MAX:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_INCLUSIVE_MAX); break;
+ case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN:
+ this->emitSubGroupInst(I, CS, ir::WORKGROUP_OP_INCLUSIVE_MIN); break;
+ case GEN_OCL_LRP:
+ {
+ const ir::Register dst = this->getRegister(&I);
+ GBE_ASSERT(AI != AE);
+ const ir::Register src0 = this->getRegister(*(AI++));
+ GBE_ASSERT(AI != AE);
+ const ir::Register src1 = this->getRegister(*(AI++));
+ GBE_ASSERT(AI != AE);
+ const ir::Register src2 = this->getRegister(*(AI++));
+ ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2);
+ break;
+ }
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+ this->emitBlockReadWriteMemInst(I, CS, false, 1); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
+ this->emitBlockReadWriteMemInst(I, CS, false, 2); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
+ this->emitBlockReadWriteMemInst(I, CS, false, 4); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
+ this->emitBlockReadWriteMemInst(I, CS, false, 8); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+ this->emitBlockReadWriteMemInst(I, CS, true, 1); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
+ this->emitBlockReadWriteMemInst(I, CS, true, 2); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
+ this->emitBlockReadWriteMemInst(I, CS, true, 4); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+ this->emitBlockReadWriteMemInst(I, CS, true, 8); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
+ this->emitBlockReadWriteImageInst(I, CS, false, 1); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
+ this->emitBlockReadWriteImageInst(I, CS, false, 2); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
+ this->emitBlockReadWriteImageInst(I, CS, false, 4); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+ this->emitBlockReadWriteImageInst(I, CS, false, 8); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
+ this->emitBlockReadWriteImageInst(I, CS, true, 1); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
+ this->emitBlockReadWriteImageInst(I, CS, true, 2); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
+ this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+ this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
default: break;
}
}
@@ -4216,65 +4996,82 @@ namespace gbe
this->newRegister(&I);
}
void GenWriter::regAllocateStoreInst(StoreInst &I) {}
+ void GenWriter::emitLoadInst(LoadInst &I) {
+ MemoryInstHelper *h = new MemoryInstHelper(ctx, unit, this, legacyMode);
+ h->emitLoadOrStore<true>(I);
+ delete h;
+ }
- void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
- Value *llvmValues, const ir::Register ptr,
- const ir::AddressSpace addrSpace,
- Type * elemType, bool isLoad, ir::Register bti,
- bool dwAligned, bool fixedBTI) {
- const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
- uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
- uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
- const uint32_t perMsgNum = elemNum / msgNum;
+ void GenWriter::emitStoreInst(StoreInst &I) {
+ MemoryInstHelper *h = new MemoryInstHelper(ctx, unit, this, legacyMode);
+ h->emitLoadOrStore<false>(I);
+ delete h;
+ }
- for (uint32_t msg = 0; msg < msgNum; ++msg) {
- // Build the tuple data in the vector
+ llvm::FunctionPass *createGenPass(ir::Unit &unit) {
+ return new GenWriter(unit);
+ }
+
+ ir::Tuple MemoryInstHelper::getValueTuple(llvm::Value *llvmValues, llvm::Type *elemType, unsigned start, unsigned elemNum) {
vector<ir::Register> tupleData; // put registers here
- for (uint32_t elemID = 0; elemID < perMsgNum; ++elemID) {
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
ir::Register reg;
- if(regTranslator.isUndefConst(llvmValues, elemID)) {
+ if(writer->regTranslator.isUndefConst(llvmValues, elemID)) {
Value *v = Constant::getNullValue(elemType);
- reg = this->getRegister(v);
+ reg = writer->getRegister(v);
} else
- reg = this->getRegister(llvmValues, perMsgNum*msg+elemID);
+ reg = writer->getRegister(llvmValues, start + elemID);
tupleData.push_back(reg);
}
- const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], perMsgNum);
-
- // We may need to update to offset the pointer
- ir::Register addr;
- if (msg == 0)
- addr = ptr;
- else {
- const ir::Register offset = ctx.reg(pointerFamily);
- ir::ImmediateIndex immIndex;
- ir::Type immType;
+ const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
+ return tuple;
+ }
+
+ void MemoryInstHelper::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+ Value *llvmValues,
+ Type * elemType) {
+ uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
+ uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
+ const uint32_t perMsgNum = elemNum / msgNum;
+
+ for (uint32_t msg = 0; msg < msgNum; ++msg) {
+ // Build the tuple data in the vector
+ ir::Tuple tuple = getValueTuple(llvmValues, elemType, perMsgNum*msg, perMsgNum);
// each message can read/write 16 byte
const int32_t stride = 16;
- if (pointerFamily == ir::FAMILY_DWORD) {
- immIndex = ctx.newImmediate(int32_t(msg*stride));
- immType = ir::TYPE_S32;
- } else {
- immIndex = ctx.newImmediate(int64_t(msg*stride));
- immType = ir::TYPE_S64;
- }
+ ir::Register addr = getOffsetAddress(mPtr, msg*stride);
+ shootMessage(type, addr, tuple, perMsgNum);
+ }
+ }
+
+ ir::Register MemoryInstHelper::getOffsetAddress(ir::Register basePtr, unsigned offset) {
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ ir::Register addr;
+ if (offset == 0)
+ addr = basePtr;
+ else {
+ const ir::Register offsetReg = ctx.reg(pointerFamily);
+ ir::ImmediateIndex immIndex;
+ ir::Type immType;
- addr = ctx.reg(pointerFamily);
- ctx.LOADI(immType, offset, immIndex);
- ctx.ADD(immType, addr, ptr, offset);
+ if (pointerFamily == ir::FAMILY_DWORD) {
+ immIndex = ctx.newImmediate(int32_t(offset));
+ immType = ir::TYPE_S32;
+ } else {
+ immIndex = ctx.newImmediate(int64_t(offset));
+ immType = ir::TYPE_S64;
}
- // Emit the instruction
- if (isLoad)
- ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
- else
- ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
+ addr = ctx.reg(pointerFamily);
+ ctx.LOADI(immType, offsetReg, immIndex);
+ ctx.ADD(immType, addr, basePtr, offsetReg);
}
+ return addr;
}
// handle load of dword/qword with unaligned address
- void GenWriter::emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI)
+ void MemoryInstHelper::emitUnalignedDQLoadStore(Value *llvmValues)
{
Type *llvmType = llvmValues->getType();
unsigned byteSize = getTypeByteSize(unit, llvmType);
@@ -4288,19 +5085,7 @@ namespace gbe
}
const ir::Type type = getType(ctx, elemType);
- vector<ir::Register> tupleData;
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- ir::Register reg;
- if(regTranslator.isUndefConst(llvmValues, elemID)) {
- Value *v = Constant::getNullValue(elemType);
- reg = this->getRegister(v);
- } else
- reg = this->getRegister(llvmValues, elemID);
-
- tupleData.push_back(reg);
- }
- const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
-
+ ir::Tuple tuple = getValueTuple(llvmValues, elemType, 0, elemNum);
vector<ir::Register> byteTupleData;
for (uint32_t elemID = 0; elemID < byteSize; ++elemID) {
byteTupleData.push_back(ctx.reg(ir::FAMILY_BYTE));
@@ -4308,97 +5093,83 @@ namespace gbe
const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
if (isLoad) {
- ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
+ shootMessage(ir::TYPE_U8, mPtr, byteTuple, byteSize);
ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
} else {
ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
// FIXME: byte scatter does not handle correctly vector store, after fix that,
// we can directly use on store instruction like:
// ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
- const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
- const ir::Register reg = byteTupleData[elemID];
- ir::Register addr;
- if (elemID == 0)
- addr = ptr;
- else {
- const ir::Register offset = ctx.reg(pointerFamily);
- ir::ImmediateIndex immIndex;
- immIndex = ctx.newImmediate(int32_t(elemID));
- addr = ctx.reg(pointerFamily);
- ctx.LOADI(ir::TYPE_S32, offset, immIndex);
- ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
- }
- ctx.STORE(ir::TYPE_U8, addr, addrSpace, dwAligned, fixedBTI, bti, reg);
+ const ir::Register addr = getOffsetAddress(mPtr, elemID);
+ const ir::Tuple value = ctx.arrayTuple(&byteTupleData[elemID], 1);
+ shootMessage(ir::TYPE_U8, addr, value, 1);
}
}
}
- extern int OCL_SIMD_WIDTH;
- template <bool isLoad, typename T>
- INLINE void GenWriter::emitLoadOrStore(T &I)
- {
+ template <bool IsLoad, typename T>
+ void MemoryInstHelper::emitLoadOrStore(T &I) {
Value *llvmPtr = I.getPointerOperand();
Value *llvmValues = getLoadOrStoreValue(I);
Type *llvmType = llvmValues->getType();
- const bool dwAligned = (I.getAlignment() % 4) == 0;
- ir::AddressSpace addrSpace;
- const ir::Register pointer = this->getRegister(llvmPtr);
+ dwAligned = (I.getAlignment() % 4) == 0;
+ addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
+ const ir::Register pointer = writer->getRegister(llvmPtr);
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
- Value *bti = getBtiRegister(llvmPtr);
- Value *ptrBase = getPointerBase(llvmPtr);
- ir::Register baseReg = this->getRegister(ptrBase);
- bool zeroBase = false;
- if (isa<ConstantPointerNull>(ptrBase)) {
- zeroBase = true;
- }
-
- ir::Register btiReg;
- bool fixedBTI = false;
- if (isa<ConstantInt>(bti)) {
- fixedBTI = true;
- unsigned index = cast<ConstantInt>(bti)->getZExtValue();
- addrSpace = btiToGen(index);
- ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
- btiReg = ctx.reg(ir::FAMILY_DWORD);
- ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
- } else {
- addrSpace = ir::MEM_MIXED;
- btiReg = this->getRegister(bti);
- }
-
+ this->isLoad = IsLoad;
Type *scalarType = llvmType;
if (!isScalarType(llvmType)) {
VectorType *vectorType = cast<VectorType>(llvmType);
scalarType = vectorType->getElementType();
}
- ir::Register ptr = ctx.reg(pointerFamily);
- // FIXME: avoid subtraction zero at this stage is not a good idea,
- // but later ArgumentLower pass need to match exact load/addImm pattern
- // so, I avoid subtracting zero base to satisfy ArgumentLower pass.
- if (!zeroBase)
- ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
- else
- ptr = pointer;
+ // calculate bti and pointer operand
+ if (legacyMode) {
+ Value *bti = writer->getBtiRegister(llvmPtr);
+ Value *ptrBase = writer->getPointerBase(llvmPtr);
+ ir::Register baseReg = writer->getRegister(ptrBase);
+ bool zeroBase = isa<ConstantPointerNull>(ptrBase) ? true : false;
+
+ if (isa<ConstantInt>(bti)) {
+ SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+ addrSpace = btiToGen(SurfaceIndex);
+ mAddressMode = ir::AM_StaticBti;
+ } else {
+ addrSpace = ir::MEM_MIXED;
+ mBTI = writer->getRegister(bti);
+ mAddressMode = ir::AM_DynamicBti;
+ }
+ mPtr = ctx.reg(pointerFamily);
+
+ // FIXME: avoid subtraction zero at this stage is not a good idea,
+ // but later ArgumentLower pass need to match exact load/addImm pattern
+ // so, I avoid subtracting zero base to satisfy ArgumentLower pass.
+ if (!zeroBase)
+ ctx.SUB(ir::TYPE_U32, mPtr, pointer, baseReg);
+ else
+ mPtr = pointer;
+ } else {
+ mPtr = pointer;
+ SurfaceIndex = 0xff;
+ mAddressMode = ir::AM_Stateless;
+ }
unsigned primitiveBits = scalarType->getPrimitiveSizeInBits();
if (!dwAligned
&& (primitiveBits == 64
|| primitiveBits == 32)
) {
- emitUnalignedDQLoadStore(ptr, llvmValues, addrSpace, btiReg, isLoad, dwAligned, fixedBTI);
+ emitUnalignedDQLoadStore(llvmValues);
return;
}
// Scalar is easy. We neednot build register tuples
if (isScalarType(llvmType) == true) {
const ir::Type type = getType(ctx, llvmType);
- const ir::Register values = this->getRegister(llvmValues);
- if (isLoad)
- ctx.LOAD(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
- else
- ctx.STORE(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
+ const ir::Register values = writer->getRegister(llvmValues);
+ const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
+ shootMessage(type, mPtr, tuple, 1);
}
// A vector type requires to build a tuple
else {
@@ -4409,13 +5180,6 @@ namespace gbe
uint32_t elemNum = vectorType->getNumElements();
GBE_ASSERTM(elemNum == 2 || elemNum == 3 || elemNum == 4 || elemNum == 8 || elemNum == 16,
"Only vectors of 2,3,4,8 or 16 elements are supported");
- // Per OPenCL 1.2 spec 6.1.5:
- // For 3-component vector data types, the size of the data type is 4 * sizeof(component).
- // And the llvm does cast a type3 data to type4 for load/store instruction,
- // so a 4 elements vector may only have 3 valid elements. We need to fix it to correct element
- // count here.
- if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3))
- elemNum = 3;
// The code is going to be fairly different from types to types (based on
// size of each vector element)
@@ -4425,72 +5189,44 @@ namespace gbe
if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
// One message is enough here. Nothing special to do
if (elemNum <= 4) {
- // Build the tuple data in the vector
- vector<ir::Register> tupleData; // put registers here
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- ir::Register reg;
- if(regTranslator.isUndefConst(llvmValues, elemID)) {
- Value *v = Constant::getNullValue(elemType);
- reg = this->getRegister(v);
- } else
- reg = this->getRegister(llvmValues, elemID);
-
- tupleData.push_back(reg);
- }
- const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
-
- // Emit the instruction
- if (isLoad)
- ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
- else
- ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
+ ir::Tuple tuple = getValueTuple(llvmValues, elemType, 0, elemNum);
+ shootMessage(type, mPtr, tuple, elemNum);
}
- // Not supported by the hardware. So, we split the message and we use
- // strided loads and stores
else {
- emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
+ emitBatchLoadOrStore(type, elemNum, llvmValues, elemType);
}
}
else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
(dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
- emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
+ emitBatchLoadOrStore(type, elemNum, llvmValues, elemType);
} else {
for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
- if(regTranslator.isUndefConst(llvmValues, elemID))
+ if(writer->regTranslator.isUndefConst(llvmValues, elemID))
continue;
- const ir::Register reg = this->getRegister(llvmValues, elemID);
- ir::Register addr;
- if (elemID == 0)
- addr = ptr;
- else {
- const ir::Register offset = ctx.reg(pointerFamily);
- ir::ImmediateIndex immIndex;
- int elemSize = getTypeByteSize(unit, elemType);
- immIndex = ctx.newImmediate(int32_t(elemID * elemSize));
- addr = ctx.reg(pointerFamily);
- ctx.LOADI(ir::TYPE_S32, offset, immIndex);
- ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
- }
- if (isLoad)
- ctx.LOAD(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
- else
- ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
+ const ir::Register reg = writer->getRegister(llvmValues, elemID);
+ int elemSize = getTypeByteSize(unit, elemType);
+
+ ir::Register addr = getOffsetAddress(mPtr, elemID*elemSize);
+ const ir::Tuple tuple = ctx.arrayTuple(&reg, 1);
+ shootMessage(type, addr, tuple, 1);
}
}
}
}
- void GenWriter::emitLoadInst(LoadInst &I) {
- this->emitLoadOrStore<true>(I);
- }
-
- void GenWriter::emitStoreInst(StoreInst &I) {
- this->emitLoadOrStore<false>(I);
- }
-
- llvm::FunctionPass *createGenPass(ir::Unit &unit) {
- return new GenWriter(unit);
+ void MemoryInstHelper::shootMessage(ir::Type type, ir::Register offset, ir::Tuple value, unsigned elemNum) {
+ if (mAddressMode == ir::AM_DynamicBti) {
+ if (isLoad)
+ ctx.LOAD(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, mBTI);
+ else
+ ctx.STORE(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, mBTI);
+ } else {
+ if (isLoad)
+ ctx.LOAD(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, SurfaceIndex);
+ else
+ ctx.STORE(type, value, offset, addrSpace, elemNum, dwAligned, mAddressMode, SurfaceIndex);
+ }
}
} /* namespace gbe */
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 94a377b..f2a278e 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -83,12 +83,13 @@ namespace gbe
if (it == map.end()) {
int status;
- const char *realName = abi::__cxa_demangle(symbol.c_str(), NULL, NULL, &status);
+ char *realName = abi::__cxa_demangle(symbol.c_str(), NULL, NULL, &status);
if (status == 0) {
std::string realFnName(realName), stripName;
stripName = realFnName.substr(0, realFnName.find("("));
it = map.find(stripName);
}
+ free(realName);
}
// FIXME, should create a complete error reporting mechanism
// when found error in beignet managed passes including Gen pass.
@@ -140,7 +141,10 @@ namespace gbe
llvm::BasicBlockPass *createIntrinsicLoweringPass();
/*! Passer the printf function call. */
- llvm::FunctionPass* createPrintfParserPass();
+ llvm::FunctionPass* createPrintfParserPass(ir::Unit &unit);
+
+ /*! Insert the time stamp for profiling. */
+ llvm::FunctionPass* createProfilingInserterPass(int profilingType, ir::Unit &unit);
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
/* customized loop unrolling pass. */
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index cabb225..48a72d1 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -161,12 +161,78 @@ DECL_LLVM_GEN_FUNCTION(SAT_CONV_F16_TO_U32, _Z16convert_uint_satDh)
// SIMD level function for internal usage
DECL_LLVM_GEN_FUNCTION(SIMD_ANY, sub_group_any)
DECL_LLVM_GEN_FUNCTION(SIMD_ALL, sub_group_all)
-DECL_LLVM_GEN_FUNCTION(SIMD_SIZE, get_sub_group_size)
-DECL_LLVM_GEN_FUNCTION(SIMD_ID, get_sub_group_id)
+DECL_LLVM_GEN_FUNCTION(SIMD_SIZE, get_simd_size)
+DECL_LLVM_GEN_FUNCTION(SIMD_ID, get_sub_group_local_id)
+DECL_LLVM_GEN_FUNCTION(GET_THREAD_NUM, get_num_sub_groups)
+DECL_LLVM_GEN_FUNCTION(GET_THREAD_ID, get_sub_group_id)
DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle)
DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
+DECL_LLVM_GEN_FUNCTION(VME, __gen_ocl_vme)
+
// printf function
-DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
+DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf_stub)
+DECL_LLVM_GEN_FUNCTION(PUTS, __gen_ocl_puts_stub)
+
+// store timestamp function
+DECL_LLVM_GEN_FUNCTION(CALC_TIMESTAMP, __gen_ocl_calc_timestamp)
+// store profiling info to the mem.
+DECL_LLVM_GEN_FUNCTION(STORE_PROFILING, __gen_ocl_store_profiling)
+
+// debug wait function
+DECL_LLVM_GEN_FUNCTION(DEBUGWAIT, __gen_ocl_debugwait)
+
+// work group function
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST, __gen_ocl_work_group_broadcast)
+
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD, __gen_ocl_work_group_reduce_add)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAX, __gen_ocl_work_group_reduce_max)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MIN, __gen_ocl_work_group_reduce_min)
+
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD, __gen_ocl_work_group_scan_exclusive_add)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX, __gen_ocl_work_group_scan_exclusive_max)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN, __gen_ocl_work_group_scan_exclusive_min)
+
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_work_group_scan_inclusive_add)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_work_group_scan_inclusive_max)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_work_group_scan_inclusive_min)
+
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ALL, __gen_ocl_work_group_all)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ANY, __gen_ocl_work_group_any)
+
+// sub group function
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BROADCAST, __gen_ocl_sub_group_broadcast)
+
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_REDUCE_ADD, __gen_ocl_sub_group_reduce_add)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_REDUCE_MAX, __gen_ocl_sub_group_reduce_max)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_REDUCE_MIN, __gen_ocl_sub_group_reduce_min)
+
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_EXCLUSIVE_ADD, __gen_ocl_sub_group_scan_exclusive_add)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_EXCLUSIVE_MAX, __gen_ocl_sub_group_scan_exclusive_max)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_EXCLUSIVE_MIN, __gen_ocl_sub_group_scan_exclusive_min)
+
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_sub_group_scan_inclusive_add)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_inclusive_max)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min)
+
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2, __gen_ocl_sub_group_block_read_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4, __gen_ocl_sub_group_block_read_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8, __gen_ocl_sub_group_block_read_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2, __gen_ocl_sub_group_block_write_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4, __gen_ocl_sub_group_block_write_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8, __gen_ocl_sub_group_block_write_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE8, __gen_ocl_sub_group_block_read_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE, __gen_ocl_sub_group_block_write_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_write_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8)
+
+// common function
+DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
diff --git a/backend/src/llvm/llvm_includes.hpp b/backend/src/llvm/llvm_includes.hpp
index fed3a18..d2deb90 100644
--- a/backend/src/llvm/llvm_includes.hpp
+++ b/backend/src/llvm/llvm_includes.hpp
@@ -122,4 +122,9 @@
#include <clang/CodeGen/CodeGenAction.h>
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=8
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#endif
+
#endif /* __GBE_IR_LLVM_INCLUDES_HPP__ */
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index b35d1e6..c26e96a 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -73,7 +73,7 @@ namespace gbe {
Constant* FCache = M->getOrInsertFunction(NewFn,
FunctionType::get(RetTy, ParamTys, false));
- IRBuilder<> Builder(CI->getParent(), CI);
+ IRBuilder<> Builder(CI->getParent(), BasicBlock::iterator(CI));
SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
CallInst *NewCI = Builder.CreateCall(FCache, Args);
NewCI->setName(CI->getName());
@@ -90,12 +90,12 @@ namespace gbe {
DataLayout TD(M);
LLVMContext &Context = BB.getContext();
for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
- Instruction *Inst = DI++;
+ Instruction *Inst = &*DI++;
CallInst* CI = dyn_cast<CallInst>(Inst);
if(CI == NULL)
continue;
- IRBuilder<> Builder(&BB, CI);
+ IRBuilder<> Builder(&BB, BasicBlock::iterator(CI));
// only support memcpy and memset
if (Function *F = CI->getCalledFunction()) {
const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index 698fdc2..e797e98 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -35,13 +35,22 @@ namespace gbe {
GenLoadStoreOptimization() : BasicBlockPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+#else
AU.addRequired<ScalarEvolution>();
AU.addPreserved<ScalarEvolution>();
+#endif
AU.setPreservesCFG();
}
virtual bool runOnBasicBlock(BasicBlock &BB) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+#else
SE = &getAnalysis<ScalarEvolution>();
+#endif
#if LLVM_VERSION_MINOR >= 7
TD = &BB.getModule()->getDataLayout();
#elif LLVM_VERSION_MINOR >= 5
@@ -61,11 +70,11 @@ namespace gbe {
bool isLoadStoreCompatible(Value *A, Value *B);
void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
- BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
- SmallVector<Instruction*, 16> &merged,
- BasicBlock::iterator &start,
- unsigned maxVecSize,
- bool isLoad);
+ bool findConsecutiveAccess(BasicBlock &BB,
+ SmallVector<Instruction*, 16> &merged,
+ const BasicBlock::iterator &start,
+ unsigned maxVecSize,
+ bool isLoad);
virtual const char *getPassName() const {
return "Merge compatible Load/stores for Gen";
@@ -150,38 +159,58 @@ namespace gbe {
values[i]->replaceAllUsesWith(S);
}
}
-
- BasicBlock::iterator
+ // When searching for consecutive memory accesses, we do it in a small window;
+ // if the window is too large, it would take up too much compile time.
+ // An important rule we have followed is: don't try to change load/store order.
+ // But an exception is a load & store that are from different address spaces. The
+ // return value will indicate whether such a reorder happens.
+ bool
GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
SmallVector<Instruction*, 16> &merged,
- BasicBlock::iterator &start,
+ const BasicBlock::iterator &start,
unsigned maxVecSize,
bool isLoad) {
- BasicBlock::iterator stepForward = start;
- if(!isSimpleLoadStore(start)) return stepForward;
+ if(!isSimpleLoadStore(&*start)) return false;
- merged.push_back(start);
+ merged.push_back(&*start);
+ unsigned targetAddrSpace = getAddressSpace(&*start);
BasicBlock::iterator E = BB.end();
- BasicBlock::iterator J = ++start;
+ BasicBlock::iterator J = start;
+ ++J;
- unsigned maxLimit = maxVecSize * 3;
+ unsigned maxLimit = maxVecSize * 8;
+ bool reordered = false;
for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
- if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
- merged.push_back(J);
- stepForward = ++J;
+ if(isLoadStoreCompatible(merged[merged.size()-1], &*J)) {
+ merged.push_back(&*J);
}
- } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+ } else if((isLoad && isa<StoreInst>(*J))) {
// simple stop to keep read/write order
- break;
+ StoreInst *st = cast<StoreInst>(&*J);
+ unsigned addrSpace = st->getPointerAddressSpace();
+ if (addrSpace != targetAddrSpace) {
+ reordered = true;
+ } else {
+ break;
+ }
+ } else if ((!isLoad && isa<LoadInst>(*J))) {
+ LoadInst *ld = cast<LoadInst>(&*J);
+ unsigned addrSpace = ld->getPointerAddressSpace();
+ if (addrSpace != targetAddrSpace) {
+ reordered = true;
+ } else {
+ break;
+ }
}
if(merged.size() >= maxVecSize) break;
}
- return stepForward;
+
+ return reordered;
}
void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
@@ -193,6 +222,9 @@ namespace gbe {
values.push_back(cast<StoreInst>(merged[i])->getValueOperand());
}
StoreInst *st = cast<StoreInst>(merged[0]);
+ if(!st)
+ return;
+
unsigned addrSpace = st->getPointerAddressSpace();
unsigned align = st->getAlignment();
@@ -206,28 +238,61 @@ namespace gbe {
parent = Builder.CreateInsertElement(parent, values[i], ConstantInt::get(IntegerType::get(st->getContext(), 32), i));
}
- Value *newPtr = Builder.CreateBitCast(st->getPointerOperand(), PointerType::get(vecTy, addrSpace));
+ Value * stPointer = st->getPointerOperand();
+ if(!stPointer)
+ return;
+ Value *newPtr = Builder.CreateBitCast(stPointer, PointerType::get(vecTy, addrSpace));
StoreInst *newST = Builder.CreateStore(parent, newPtr);
newST->setAlignment(align);
}
+ // Find the safe iterator we can point to. If reorder happens, we need to
+ // point to the instruction after the first of toBeDeleted. If no reorder,
+ // we are safe to point to the instruction after the last of toBeDeleted
+ static BasicBlock::iterator
+ findSafeInstruction(SmallVector<Instruction*, 16> &toBeDeleted,
+ const BasicBlock::iterator ¤t,
+ bool reorder) {
+ BasicBlock::iterator safe = current;
+ unsigned size = toBeDeleted.size();
+ if (reorder) {
+ unsigned i = 0;
+ while (i < size && toBeDeleted[i] == &*safe) {
+ ++i;
+ ++safe;
+ }
+ } else {
+ safe = BasicBlock::iterator(toBeDeleted[size - 1]);
+ ++safe;
+ }
+ return safe;
+ }
+
bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
bool changed = false;
SmallVector<Instruction*, 16> merged;
for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
bool isLoad = isa<LoadInst>(*BBI) ? true: false;
- Type *ty = getValueType(BBI);
+ Type *ty = getValueType(&*BBI);
+ if(!ty) continue;
if(ty->isVectorTy()) continue;
// TODO Support DWORD/WORD/BYTE LOAD for store support DWORD only now.
if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
continue;
+
unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
(ty->isIntegerTy(16) ? 8 : 16);
- BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
+ bool reorder = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
uint32_t size = merged.size();
uint32_t pos = 0;
+ bool doDeleting = size > 1;
+ if (doDeleting) {
+ // choose next undeleted instruction
+ BBI = findSafeInstruction(merged, BBI, reorder);
+ }
+
while(size > 1) {
unsigned vecSize = (size >= 16) ? 16 :
(size >= 8 ? 8 :
@@ -244,6 +309,12 @@ namespace gbe {
pos += vecSize;
size -= vecSize;
}
+ if (doDeleting) {
+ // Adjust BBI back by one, as the for loop will increment it.
+ // Don't do this if BBI points to the very first instruction.
+ if (BBI != BB.begin())
+ --BBI;
+ }
merged.clear();
}
}
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index d5d965b..b925e5f 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -222,7 +222,9 @@ namespace gbe
{
const uint32_t ptrSize = unit.getPointerSize();
Value* parentPointer = GEPInst->getOperand(0);
- CompositeType* CompTy = cast<CompositeType>(parentPointer->getType());
+ CompositeType* CompTy = parentPointer ? cast<CompositeType>(parentPointer->getType()) : NULL;
+ if(!CompTy)
+ return false;
Value* currentAddrInst =
new PtrToIntInst(parentPointer, IntegerType::get(GEPInst->getContext(), ptrSize), "", GEPInst);
@@ -254,6 +256,9 @@ namespace gbe
Value *operand = GEPInst->getOperand(op);
+ if(!operand)
+ continue;
+
//HACK TODO: Inserted by type replacement.. this code could break something????
if(getTypeByteSize(unit, operand->getType())>4)
{
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 47688f7..800f343 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -38,6 +38,7 @@
#include "llvm/llvm_gen_backend.hpp"
#include "sys/map.hpp"
#include "ir/printf.hpp"
+#include "ir/unit.hpp"
using namespace llvm;
@@ -235,8 +236,8 @@ again:
}
if (p != begin) {
- std::string s = std::string(begin, size_t(p - begin));
- printf_fmt->first.push_back(PrintfSlot(s.c_str()));
+ std::string s(begin, size_t(p - begin));
+ printf_fmt->push_back(PrintfSlot(s));
}
if (p == end) // finish
@@ -247,7 +248,7 @@ again:
if (ret_char < 0)
goto error;
- printf_fmt->first.push_back(&state);
+ printf_fmt->push_back(state);
num++;
if (rend == end)
@@ -292,58 +293,21 @@ error:
public:
static char ID;
typedef std::pair<Instruction*, bool> PrintfInst;
- std::vector<PrintfInst> deadprintfs;
Module* module;
IRBuilder<>* builder;
Type* intTy;
- llvm::Constant * pbuf_global;
- llvm::Constant * index_buf_global;
- Value* pbuf_ptr;
- Value* index_buf_ptr;
- Value* g1Xg2Xg3;
- Value* wg_offset;
- int out_buf_sizeof_offset;
- static map<CallInst*, PrintfSet::PrintfFmt*> printfs;
- int printf_num;
- int totalSizeofSize;
-
- struct PrintfParserInfo {
- llvm::CallInst* call;
- PrintfSet::PrintfFmt* printf_fmt;
- };
-
- void stateInit(void) {
+ ir::Unit &unit;
+
+ PrintfParser(ir::Unit &unit) : FunctionPass(ID),
+ unit(unit)
+ {
module = NULL;
builder = NULL;
intTy = NULL;
- out_buf_sizeof_offset = 0;
- pbuf_ptr = NULL;
- index_buf_ptr = NULL;
- g1Xg2Xg3 = NULL;
- wg_offset = NULL;
- printf_num = 0;
- totalSizeofSize = 0;
- }
-
- PrintfParser(void) : FunctionPass(ID)
- {
- stateInit();
- pbuf_global = NULL;
- index_buf_global = NULL;
}
- ~PrintfParser(void)
- {
- for (auto &s : printfs) {
- delete s.second;
- s.second = NULL;
- }
- printfs.clear();
- }
-
- bool parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, int& sizeof_size);
- bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size);
- bool generateOnePrintfInstruction(PrintfParserInfo& pInfo);
+ bool parseOnePrintfInstruction(CallInst * call);
+ bool generateOneParameterInst(PrintfSlot& slot, Value* arg, Value*& new_arg);
virtual const char *getPassName() const
{
@@ -353,102 +317,16 @@ error:
virtual bool runOnFunction(llvm::Function &F);
};
- bool PrintfParser::generateOnePrintfInstruction(PrintfParserInfo& pInfo)
- {
- Value* op0 = NULL;
- Value* val = NULL;
-
- /////////////////////////////////////////////////////
- /* calculate index address.
- index_addr = (index_offset + wg_offset )* sizeof(int) * 2 + index_buf_ptr
- index_offset = global_size2 * global_size1 * global_size0 * printf_num */
-
- Value* index_offset = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, printf_num));
- // index_offset + offset
- op0 = builder->CreateAdd(index_offset, wg_offset);
- // (index_offset + offset)* sizeof(int) * 2
- op0 = builder->CreateMul(op0, ConstantInt::get(intTy, sizeof(int)*2));
- // Final index address = index_buf_ptr + (index_offset + offset)* sizeof(int)
- op0 = builder->CreateAdd(index_buf_ptr, op0);
- Value* index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1));
- // Load the printf num first, printf may be in loop.
- Value* loop_num = builder->CreateLoad(index_addr);
- val = builder->CreateAdd(loop_num, ConstantInt::get(intTy, 1));
- builder->CreateStore(val, index_addr);// The loop number.
-
- op0 = builder->CreateAdd(op0, ConstantInt::get(intTy, sizeof(int)));
- index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1));
- builder->CreateStore(ConstantInt::get(intTy, printf_num), index_addr);// The printf number.
-
- int i = 1;
- Value* data_addr = NULL;
- for (auto &s : (*pInfo.printf_fmt).first) {
- if (s.type == PRINTF_SLOT_TYPE_STRING)
- continue;
-
- assert(i < static_cast<int>(pInfo.call->getNumOperands()) - 1);
-
- Value *out_arg = pInfo.call->getOperand(i);
- Type *dst_type = NULL;
- int sizeof_size = 0;
- if (!generateOneParameterInst(s, out_arg, dst_type, sizeof_size)) {
- printf("Printf: %d, parameter %d may have no result because some error\n",
- printf_num, i - 1);
- i++;
- continue;
- }
-
- s.state->out_buf_sizeof_offset = out_buf_sizeof_offset;
- if (!sizeof_size) {
- i++;
- continue;
- }
-
- assert(dst_type);
-
- /////////////////////////////////////////////////////
- /* Calculate the data address.
- data_addr = (data_offset + pbuf_ptr + offset * sizeof(specify)) +
- totalSizeofSize * global_size2 * global_size1 * global_size0 * loop_num
- data_offset = global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset
-
- //global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset */
- op0 = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
- //offset * sizeof(specify)
- val = builder->CreateMul(wg_offset, ConstantInt::get(intTy, sizeof_size));
- //data_offset + pbuf_ptr
- op0 = builder->CreateAdd(pbuf_ptr, op0);
- op0 = builder->CreateAdd(op0, val);
- //totalSizeofSize * global_size2 * global_size1 * global_size0
- val = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, totalSizeofSize));
- //totalSizeofSize * global_size2 * global_size1 * global_size0 * loop_num
- val = builder->CreateMul(val, loop_num);
- //final
- op0 = builder->CreateAdd(op0, val);
- data_addr = builder->CreateIntToPtr(op0, dst_type);
- builder->CreateStore(out_arg, data_addr);
-
- out_buf_sizeof_offset += ((sizeof_size + 3) / 4) * 4;
- i++;
- }
-
- CallInst* printf_inst = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
- "__gen_ocl_printf", Type::getVoidTy(module->getContext()),
- NULL)));
- assert(printfs[printf_inst] == NULL);
- printfs[printf_inst] = pInfo.printf_fmt;
- printfs[printf_inst]->second = printf_num;
- printf_num++;
- return true;
- }
-
- bool PrintfParser::parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, int& sizeof_size)
+ bool PrintfParser::parseOnePrintfInstruction(CallInst * call)
{
CallSite CS(call);
CallSite::arg_iterator CI_FMT = CS.arg_begin();
int param_num = 0;
llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT);
+ if(!arg0) {
+ return false;
+ }
llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
if (!arg0_ptr) {
return false;
@@ -460,77 +338,55 @@ error:
}
std::string fmt = fmt_arg->getAsCString();
+ if (fmt.size() == 0)
+ return false;
PrintfSet::PrintfFmt* printf_fmt = NULL;
if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) {//at lease print something
+ printf("Warning: Parse the printf inst %s failed, no output for it\n", fmt.c_str());
return false;
}
/* iff parameter more than %, error. */
/* str_fmt arg0 arg1 ... NULL */
- if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
+ if (param_num + 2 != static_cast<int>(call->getNumOperands())) {
delete printf_fmt;
+ printf("Warning: Parse the printf inst %s failed, parameters do not match the %% number, no output for it\n",
+ fmt.c_str());
return false;
}
- info.call = call;
- info.printf_fmt = printf_fmt;
-
- sizeof_size = 0;
+ /* Insert some conversion if types do not match. */
+ builder->SetInsertPoint(call);
int i = 1;
- for (auto &s : (*printf_fmt).first) {
- int sz = 0;
+ for (auto &s : *printf_fmt) {
if (s.type == PRINTF_SLOT_TYPE_STRING)
continue;
assert(i < static_cast<int>(call->getNumOperands()) - 1);
-
- switch (s.state->conversion_specifier) {
- case PRINTF_CONVERSION_I:
- case PRINTF_CONVERSION_D:
- case PRINTF_CONVERSION_O:
- case PRINTF_CONVERSION_U:
- case PRINTF_CONVERSION_x:
- case PRINTF_CONVERSION_X:
- case PRINTF_CONVERSION_P:
- if (s.state->length_modifier == PRINTF_LM_L)
- sz = sizeof(int64_t);
- else
- sz = sizeof(int);
- break;
- case PRINTF_CONVERSION_C:
- sz = sizeof(char);
- break;
- case PRINTF_CONVERSION_F:
- case PRINTF_CONVERSION_f:
- case PRINTF_CONVERSION_E:
- case PRINTF_CONVERSION_e:
- case PRINTF_CONVERSION_G:
- case PRINTF_CONVERSION_g:
- case PRINTF_CONVERSION_A:
- case PRINTF_CONVERSION_a:
- sz = sizeof(float);
- break;
- default:
- sz = 0;
- break;
+ Value* new_arg = NULL;
+ Value *arg = call->getOperand(i);
+ if (generateOneParameterInst(s, arg, new_arg) == false) {
+ delete printf_fmt;
+ printf("Warning: Parse the printf inst %s failed, the %d parameter format is wrong, no output for it\n",
+ fmt.c_str(), i);
+ return false;
}
- if (s.state->vector_n) {
- sz = sz * s.state->vector_n;
+ if (new_arg) { // replace the according argument.
+ call->setArgOperand(i, new_arg);
}
-
- sizeof_size += ((sz + 3) / 4) * 4;
+ ++i;
}
+ GBE_ASSERT(unit.printfs.find(call) == unit.printfs.end());
+ unit.printfs.insert(std::pair<llvm::CallInst*, PrintfSet::PrintfFmt*>(call, printf_fmt));
return true;
}
bool PrintfParser::runOnFunction(llvm::Function &F)
{
- stateInit();
- bool changed = false;
bool hasPrintf = false;
switch (F.getCallingConv()) {
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
@@ -547,8 +403,6 @@ error:
GBE_ASSERTM(false, "Unsupported calling convention");
}
- std::vector<PrintfParserInfo> infoVect;
- totalSizeofSize = 0;
module = F.getParent();
intTy = IntegerType::get(module->getContext(), 32);
@@ -572,15 +426,17 @@ error:
for (BasicBlock::iterator instI = B->begin(),
instE = B->end(); instI != instE; ++instI) {
- PrintfParserInfo pInfo;
- int sizeof_size = 0;
-
llvm::CallInst* call = dyn_cast<llvm::CallInst>(instI);
if (!call) {
continue;
}
- if (call->getCalledFunction() && call->getCalledFunction()->getIntrinsicID() != 0)
+ llvm::Function * callFunc = call->getCalledFunction();
+ if(!callFunc) {
+ continue;
+ }
+
+ if ( callFunc->getIntrinsicID() != 0)
continue;
Value *Callee = call->getCalledValue();
@@ -589,154 +445,20 @@ error:
if (fnName != "__gen_ocl_printf_stub" && fnName != "__gen_ocl_puts_stub")
continue;
- if (!parseOnePrintfInstruction(call, pInfo, sizeof_size)) {
- printf("Parse One printf inst failed, may have some error\n");
- // Just kill this printf instruction.
- deadprintfs.push_back(PrintfInst(cast<Instruction>(call),0));
+ if (!parseOnePrintfInstruction(call)) {
+ // Just skip this printf instruction.
continue;
}
hasPrintf = true;
-
- infoVect.push_back(pInfo);
- totalSizeofSize += sizeof_size;
}
}
- if (!hasPrintf)
- return changed;
-
- if (!pbuf_global) {
- /* alloc a new buffer ptr to collect the print output. */
- Type *ptrTy = Type::getInt32PtrTy(module->getContext(), 1);
- pbuf_global= new GlobalVariable(*module, ptrTy, false,
- GlobalVariable::ExternalLinkage,
- nullptr,
- StringRef("__gen_ocl_printf_buf"),
- nullptr,
- GlobalVariable::NotThreadLocal,
- 1);
- }
- pbuf_ptr = builder->CreatePtrToInt(pbuf_global, Type::getInt32Ty(module->getContext()));
-
- if (!index_buf_global) {
- Type *ptrTy = Type::getInt32PtrTy(module->getContext(), 1);
- index_buf_global = new GlobalVariable(*module, ptrTy, false,
- GlobalVariable::ExternalLinkage,
- nullptr,
- StringRef("__gen_ocl_printf_index_buf"),
- nullptr,
- GlobalVariable::NotThreadLocal,
- 1);
- }
- index_buf_ptr = builder->CreatePtrToInt(index_buf_global, Type::getInt32Ty(module->getContext()));
-
- if (!wg_offset || !g1Xg2Xg3) {
- Value* op0 = NULL;
- Value* val = NULL;
-
- builder->SetInsertPoint(F.begin()->begin());// Insert the common var in the begin.
-
- /* FIXME: Because the OpenCL language do not support va macro, and we do not want
- to introduce the va_list, va_start and va_end into our code, we just simulate
- the function calls to caculate the offset caculation here. */
-#define BUILD_CALL_INST(name) \
- CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
- "__gen_ocl_get_"#name, \
- IntegerType::getInt32Ty(module->getContext()), \
- NULL)))
-
- BUILD_CALL_INST(group_id2);
- BUILD_CALL_INST(group_id1);
- BUILD_CALL_INST(group_id0);
- BUILD_CALL_INST(global_size2);
- BUILD_CALL_INST(global_size1);
- BUILD_CALL_INST(global_size0);
- BUILD_CALL_INST(local_id2);
- BUILD_CALL_INST(local_id1);
- BUILD_CALL_INST(local_id0);
- BUILD_CALL_INST(local_size2);
- BUILD_CALL_INST(local_size1);
- BUILD_CALL_INST(local_size0);
-
-#undef BUILD_CALL_INST
-
- /* calculate offset for later usage.
- wg_offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
- + (local_id1 + local_size1 * group_id1) * global_size0
- + (local_id0 + local_size0 * group_id0)) */
-
- // local_size2 * group_id2
- val = builder->CreateMul(local_size2, group_id2);
- // local_id2 + local_size2 * group_id2
- val = builder->CreateAdd(local_id2, val);
- // global_size1 * global_size0
- op0 = builder->CreateMul(global_size1, global_size0);
- // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
- Value* offset1 = builder->CreateMul(val, op0);
- // local_size1 * group_id1
- val = builder->CreateMul(local_size1, group_id1);
- // local_id1 + local_size1 * group_id1
- val = builder->CreateAdd(local_id1, val);
- // (local_id1 + local_size1 * group_id1) * global_size_0
- Value* offset2 = builder->CreateMul(val, global_size0);
- // local_size0 * group_id0
- val = builder->CreateMul(local_size0, group_id0);
- // local_id0 + local_size0 * group_id0
- val = builder->CreateAdd(local_id0, val);
- // The total sum
- val = builder->CreateAdd(val, offset1);
- wg_offset = builder->CreateAdd(val, offset2);
-
- // global_size2 * global_size1
- op0 = builder->CreateMul(global_size2, global_size1);
- // global_size2 * global_size1 * global_size0
- g1Xg2Xg3 = builder->CreateMul(op0, global_size0);
- }
-
-
- /* Now generate the instructions. */
- for (auto pInfo : infoVect) {
- builder->SetInsertPoint(pInfo.call);
- deadprintfs.push_back(PrintfInst(cast<Instruction>(pInfo.call), generateOnePrintfInstruction(pInfo)));
- }
-
- assert(out_buf_sizeof_offset == totalSizeofSize);
-
- /* Replace the instruction's operand if using printf's return value. */
- for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
- for (BasicBlock::iterator instI = B->begin(),
- instE = B->end(); instI != instE; ++instI) {
-
- for (unsigned i = 0; i < instI->getNumOperands(); i++) {
- for (auto &prf : deadprintfs) {
- if (instI->getOperand(i) == prf.first) {
-
- if (prf.second == true) {
- instI->setOperand(i, ConstantInt::get(intTy, 0));
- } else {
- instI->setOperand(i, ConstantInt::get(intTy, -1));
- }
- }
- }
- }
- }
- }
-
- /* Kill the dead printf instructions. */
- for (auto &prf : deadprintfs) {
- prf.first->dropAllReferences();
- if (prf.first->use_empty())
- prf.first->eraseFromParent();
- }
-
- deadprintfs.clear();
delete builder;
-
- return changed;
+ return hasPrintf;
}
- bool PrintfParser::generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size)
+ bool PrintfParser::generateOneParameterInst(PrintfSlot& slot, Value* arg, Value*& new_arg)
{
assert(slot.type == PRINTF_SLOT_TYPE_STATE);
assert(builder);
@@ -746,7 +468,7 @@ error:
switch (arg->getType()->getTypeID()) {
case Type::IntegerTyID: {
bool sign = false;
- switch (slot.state->conversion_specifier) {
+ switch (slot.state.conversion_specifier) {
case PRINTF_CONVERSION_I:
case PRINTF_CONVERSION_D:
sign = true;
@@ -754,29 +476,21 @@ error:
case PRINTF_CONVERSION_U:
case PRINTF_CONVERSION_x:
case PRINTF_CONVERSION_X:
- if (slot.state->length_modifier == PRINTF_LM_L) { /* we would rather print long. */
+ if (slot.state.length_modifier == PRINTF_LM_L) { /* we would rather print long. */
if (arg->getType() != Type::getInt64Ty(module->getContext())) {
- arg = builder->CreateIntCast(arg, Type::getInt64Ty(module->getContext()), sign);
+ new_arg = builder->CreateIntCast(arg, Type::getInt64Ty(module->getContext()), sign);
}
- dst_type = Type::getInt64PtrTy(module->getContext(), 1);
- sizeof_size = sizeof(int64_t);
} else {
/* If the bits change, we need to consider the signed. */
if (arg->getType() != Type::getInt32Ty(module->getContext())) {
- arg = builder->CreateIntCast(arg, Type::getInt32Ty(module->getContext()), sign);
+ new_arg = builder->CreateIntCast(arg, Type::getInt32Ty(module->getContext()), sign);
}
-
- /* Int to Int, just store. */
- dst_type = Type::getInt32PtrTy(module->getContext(), 1);
- sizeof_size = sizeof(int);
}
return true;
case PRINTF_CONVERSION_C:
/* Int to Char, add a conversion. */
- arg = builder->CreateIntCast(arg, Type::getInt8Ty(module->getContext()), false);
- dst_type = Type::getInt8PtrTy(module->getContext(), 1);
- sizeof_size = sizeof(char);
+ new_arg = builder->CreateIntCast(arg, Type::getInt8Ty(module->getContext()), false);
return true;
case PRINTF_CONVERSION_F:
@@ -788,15 +502,12 @@ error:
case PRINTF_CONVERSION_A:
case PRINTF_CONVERSION_a:
printf("Warning: Have a float parameter for %%d like specifier, take care of it\n");
- arg = builder->CreateSIToFP(arg, Type::getFloatTy(module->getContext()));
- dst_type = Type::getFloatPtrTy(module->getContext(), 1);
- sizeof_size = sizeof(float);
+ new_arg = builder->CreateSIToFP(arg, Type::getFloatTy(module->getContext()));
return true;
case PRINTF_CONVERSION_S:
/* Here, the case is printf("xxx%s", 0); we should output the null. */
- sizeof_size = 0;
- slot.state->str = "(null)";
+ slot.state.str = "(null)";
return true;
default:
@@ -811,20 +522,18 @@ error:
/* llvm 3.6 will give a undef value for NAN. */
if (dyn_cast<llvm::UndefValue>(arg)) {
APFloat nan = APFloat::getNaN(APFloat::IEEEsingle, false);
- arg = ConstantFP::get(module->getContext(), nan);
+ new_arg = ConstantFP::get(module->getContext(), nan);
}
/* Because the printf is a variable parameter function, it does not have the
function prototype, so the compiler will always promote the arg to the
longest precise type for float. So here, we can always find it is double. */
- switch (slot.state->conversion_specifier) {
+ switch (slot.state.conversion_specifier) {
case PRINTF_CONVERSION_I:
case PRINTF_CONVERSION_D:
/* Float to Int, add a conversion. */
printf("Warning: Have a int parameter for %%f like specifier, take care of it\n");
- arg = builder->CreateFPToSI(arg, Type::getInt32Ty(module->getContext()));
- dst_type = Type::getInt32PtrTy(module->getContext(), 1);
- sizeof_size = sizeof(int);
+ new_arg = builder->CreateFPToSI(arg, Type::getInt32Ty(module->getContext()));
return true;
case PRINTF_CONVERSION_O:
@@ -833,9 +542,7 @@ error:
case PRINTF_CONVERSION_X:
/* Float to uint, add a conversion. */
printf("Warning: Have a uint parameter for %%f like specifier, take care of it\n");
- arg = builder->CreateFPToUI(arg, Type::getInt32Ty(module->getContext()));
- dst_type = Type::getInt32PtrTy(module->getContext(), 1);
- sizeof_size = sizeof(int);
+ new_arg = builder->CreateFPToUI(arg, Type::getInt32Ty(module->getContext()));
return true;
case PRINTF_CONVERSION_F:
@@ -846,9 +553,7 @@ error:
case PRINTF_CONVERSION_g:
case PRINTF_CONVERSION_A:
case PRINTF_CONVERSION_a:
- arg = builder->CreateFPCast(arg, Type::getFloatTy(module->getContext()));
- dst_type = Type::getFloatPtrTy(module->getContext(), 1);
- sizeof_size = sizeof(float);
+ new_arg = builder->CreateFPCast(arg, Type::getFloatTy(module->getContext()));
return true;
default:
@@ -860,9 +565,12 @@ error:
/* %p and %s */
case Type::PointerTyID:
- switch (slot.state->conversion_specifier) {
+ switch (slot.state.conversion_specifier) {
case PRINTF_CONVERSION_S: {
llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(arg);
+ if(!arg0) {
+ return false;
+ }
llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
if (!arg0_ptr) {
return false;
@@ -872,14 +580,11 @@ error:
if (!fmt_arg || !fmt_arg->isCString()) {
return false;
}
- sizeof_size = 0;
- slot.state->str = fmt_arg->getAsCString();
+ slot.state.str = fmt_arg->getAsCString();
return true;
}
case PRINTF_CONVERSION_P: {
- arg = builder->CreatePtrToInt(arg, Type::getInt32Ty(module->getContext()));
- dst_type = arg->getType()->getPointerTo(1);
- sizeof_size = sizeof(int);
+ new_arg = builder->CreatePtrToInt(arg, Type::getInt32Ty(module->getContext()));
return true;
}
default:
@@ -894,12 +599,12 @@ error:
int vec_num = vect_type->getVectorNumElements();
bool sign = false;
- if (vec_num != slot.state->vector_n) {
+ if (vec_num != slot.state.vector_n) {
printf("Error The printf vector number is not match!\n");
return false;
}
- switch (slot.state->conversion_specifier) {
+ switch (slot.state.conversion_specifier) {
case PRINTF_CONVERSION_I:
case PRINTF_CONVERSION_D:
sign = true;
@@ -913,7 +618,7 @@ error:
}
Type* elt_dst_type = NULL;
- if (slot.state->length_modifier == PRINTF_LM_L) {
+ if (slot.state.length_modifier == PRINTF_LM_L) {
elt_dst_type = Type::getInt64Ty(elt_type->getContext());
} else {
elt_dst_type = Type::getInt32Ty(elt_type->getContext());
@@ -929,12 +634,9 @@ error:
Value *cvt = builder->CreateIntCast(org, elt_dst_type, sign);
II = builder->CreateInsertElement(vec, cvt, cv);
}
- arg = II;
+ new_arg = II;
}
- dst_type = arg->getType()->getPointerTo(1);
- sizeof_size = (elt_dst_type == Type::getInt32Ty(elt_type->getContext()) ?
- sizeof(int) * vec_num : sizeof(int64_t) * vec_num);
return true;
}
@@ -960,11 +662,9 @@ error:
Value* cvt = builder->CreateFPCast(org, Type::getFloatTy(module->getContext()));
II = builder->CreateInsertElement(vec, cvt, cv);
}
- arg = II;
+ new_arg = II;
}
- dst_type = arg->getType()->getPointerTo(1);
- sizeof_size = sizeof(int) * vec_num;
return true;
default:
@@ -979,18 +679,9 @@ error:
return false;
}
- map<CallInst*, PrintfSet::PrintfFmt*> PrintfParser::printfs;
-
- void* getPrintfInfo(CallInst* inst)
- {
- if (PrintfParser::printfs[inst])
- return (void*)PrintfParser::printfs[inst];
- return NULL;
- }
-
- FunctionPass* createPrintfParserPass()
+ FunctionPass* createPrintfParserPass(ir::Unit &unit)
{
- return new PrintfParser();
+ return new PrintfParser(unit);
}
char PrintfParser::ID = 0;
diff --git a/backend/src/llvm/llvm_profiling.cpp b/backend/src/llvm/llvm_profiling.cpp
new file mode 100644
index 0000000..96c95ee
--- /dev/null
+++ b/backend/src/llvm/llvm_profiling.cpp
@@ -0,0 +1,214 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file llvm_profiling.cpp
+ * This file will insert some instructions for each profiling point.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#endif
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Attributes.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+#include "ir/unit.hpp"
+
+#include <iostream>
+#include <vector>
+
+
+using namespace llvm;
+using std::vector;
+
+
+namespace gbe
+{
+ using namespace ir;
+
+ class ProfilingInserter : public FunctionPass
+ {
+ public:
+ static char ID;
+ Module* module;
+ IRBuilder<>* builder;
+ Type* intTy;
+ Type *ptrTy;
+ int profilingType;
+
+ ProfilingInserter(int profiling) : FunctionPass(ID), profilingType(profiling)
+ {
+ module = NULL;
+ builder = NULL;
+ intTy = NULL;
+ ptrTy = NULL;
+ }
+
+ ~ProfilingInserter(void)
+ {
+ }
+
+ virtual const char *getPassName() const
+ {
+ return "Timestamp Parser";
+ }
+
+ virtual bool runOnFunction(llvm::Function &F);
+ };
+
+ bool ProfilingInserter::runOnFunction(llvm::Function &F)
+ {
+ bool changed = false;
+ int pointNum = 0;
+
+ switch (F.getCallingConv()) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+ case CallingConv::PTX_Device:
+ return false;
+ case CallingConv::PTX_Kernel:
+#else
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::SPIR_KERNEL:
+#endif
+ break;
+ default:
+ GBE_ASSERTM(false, "Unsupported calling convention");
+ }
+
+ // As we inline all function calls, skip non-kernel functions
+ bool bKernel = isKernelFunction(F);
+ if (!bKernel) return changed;
+
+ module = F.getParent();
+ intTy = IntegerType::get(module->getContext(), 32);
+ ptrTy = Type::getInt32PtrTy(module->getContext(), 1);
+ builder = new IRBuilder<>(module->getContext());
+
+ /* alloc a new buffer ptr to collect the timestamps. */
+ builder->SetInsertPoint(&*F.begin()->begin());
+ llvm::Constant *profilingBuf = module->getGlobalVariable("__gen_ocl_profiling_buf");
+ if (!profilingBuf) {
+ profilingBuf = new GlobalVariable(*module, intTy, false,
+ GlobalVariable::ExternalLinkage, nullptr, StringRef("__gen_ocl_profiling_buf"),
+ nullptr, GlobalVariable::NotThreadLocal, 1);
+ }
+
+ changed = true;
+
+ for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
+ /* Skip the empty blocks. */
+ if (B->empty())
+ continue;
+
+ BasicBlock::iterator instI = B->begin();
+ for ( ; instI != B->end(); instI++) {
+ if (dyn_cast<llvm::PHINode>(instI))
+ continue;
+ if (dyn_cast<llvm::ReturnInst>(instI)) {
+ instI++;
+ GBE_ASSERT(instI == B->end());
+ break;
+ }
+ if (dyn_cast<llvm::BranchInst>(instI)) {
+ instI++;
+ GBE_ASSERT(instI == B->end());
+ break;
+ }
+ break;
+ }
+
+ if (instI == B->end())
+ continue;
+
+ if (pointNum >= 20) // Too many timestamps.
+ continue;
+
+ // Insert the first one at beginning of not PHI.
+ builder->SetInsertPoint(&*instI);
+ /* Add the timestamp store function call. */
+ // __gen_ocl_store_timestamp(int nth, int type);
+ Value *Args[2] = {ConstantInt::get(intTy, pointNum++), ConstantInt::get(intTy, profilingType)};
+ builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
+ "__gen_ocl_calc_timestamp", Type::getVoidTy(module->getContext()),
+ IntegerType::getInt32Ty(module->getContext()),
+ IntegerType::getInt32Ty(module->getContext()),
+ NULL)),
+ ArrayRef<Value*>(Args));
+ }
+ /* We insert one store_profiling at the end of the last block to hold the place. */
+ llvm::Function::iterator BE = F.end();
+ BE--;
+ BasicBlock::iterator retInst = BE->end();
+ retInst--;
+ builder->SetInsertPoint(&*retInst);
+ Value *Args2[2] = {profilingBuf, ConstantInt::get(intTy, profilingType)};
+
+ builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
+ "__gen_ocl_store_profiling", Type::getVoidTy(module->getContext()),
+ ptrTy,
+ IntegerType::getInt32Ty(module->getContext()),
+ NULL)),
+ ArrayRef<Value*>(Args2));
+
+ delete builder;
+ return changed;
+ }
+
+ FunctionPass* createProfilingInserterPass(int profilingType, ir::Unit &unit)
+ {
+ unit.setInProfilingMode(true);
+ return new ProfilingInserter(profilingType);
+ }
+ char ProfilingInserter::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_sampler_fix.cpp b/backend/src/llvm/llvm_sampler_fix.cpp
index 01db8fe..de7ebdb 100644
--- a/backend/src/llvm/llvm_sampler_fix.cpp
+++ b/backend/src/llvm/llvm_sampler_fix.cpp
@@ -41,6 +41,8 @@ namespace gbe {
}
bool visitCallInst(CallInst *I) {
+ if(!I)
+ return false;
Value *Callee = I->getCalledValue();
const std::string fnName = Callee->getName();
bool changed = false;
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 7ee5259..615fb50 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -173,6 +173,8 @@ namespace gbe {
}
Type* GetBasicType(Type* type) {
+ if(!type)
+ return type;
switch(type->getTypeID()) {
case Type::VectorTyID:
case Type::ArrayTyID:
@@ -184,20 +186,20 @@ namespace gbe {
}
int GetComponentCount(const Type* type) {
- if (type->getTypeID() == Type::VectorTyID)
+ if (type && type->getTypeID() == Type::VectorTyID)
return llvm::dyn_cast<VectorType>(type)->getNumElements();
else
return 1;
}
int GetComponentCount(const Value* value) {
- return GetComponentCount(value->getType());
+ return GetComponentCount(value ? value->getType() : NULL);
}
/* set to insert new instructions after the specified instruction.*/
void setAppendPoint(Instruction *insn) {
BasicBlock::iterator next(insn);
- builder->SetInsertPoint(++next);
+ builder->SetInsertPoint(&*++next);
}
DenseMap<Value*, VectorValues> vectorVals;
@@ -227,13 +229,16 @@ namespace gbe {
// of their operands hadn't before been visited (i.e. loop variant
// variables)
SmallVector<PHINode*, 16> incompletePhis;
+
+ // Map for the alloca used for ExtractElement < vec, alloca >
+ std::map<Value*, Value*> vectorAlloca;
};
Value* Scalarize::getComponent(int component, Value* v)
{
assert(canGetComponent(v) && "getComponent called on unhandled vector");
- if (v->getType()->isVectorTy()) {
+ if (v && v->getType() && v->getType()->isVectorTy()) {
if (ConstantDataVector* c = dyn_cast<ConstantDataVector>(v)) {
return c->getElementAsConstant(component);
} else if (ConstantVector* c = dyn_cast<ConstantVector>(v)) {
@@ -266,6 +271,7 @@ namespace gbe {
case Intrinsic::sqrt:
case Intrinsic::ceil:
case Intrinsic::trunc:
+ case Intrinsic::fmuladd:
return true;
}
}
@@ -333,7 +339,7 @@ namespace gbe {
}
bool Scalarize::canGetComponent(Value* v)
{
- if (v->getType()->isVectorTy()) {
+ if (v && v->getType() && v->getType()->isVectorTy()) {
if (isa<ConstantDataVector>(v) || isa<ConstantVector>(v) || isa<ConstantAggregateZero>(v) || isa<UndefValue>(v)) {
return true;
} else {
@@ -537,13 +543,18 @@ namespace gbe {
VectorValues& vVals = vectorVals[sv];
int size = GetComponentCount(sv);
- int srcSize = GetComponentCount(sv->getOperand(0)->getType());
+
+ Value* Op0 = sv->getOperand(0);
+ if(!Op0)
+ return false;
+
+ int srcSize = GetComponentCount(Op0->getType());
for (int i = 0; i < size; ++i) {
int select = sv->getMaskValue(i);
if (select < 0) {
- setComponent(vVals, i, UndefValue::get(GetBasicType(sv->getOperand(0))));
+ setComponent(vVals, i, UndefValue::get(GetBasicType(Op0)));
continue;
}
@@ -671,6 +682,41 @@ namespace gbe {
*CI = InsertToVector(call, *CI);
break;
}
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+ {
+ ++CI;
+ ++CI;
+ if ((*CI)->getType()->isVectorTy())
+ *CI = InsertToVector(call, *CI);
+ break;
+ }
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+ {
+ if ((*CI)->getType()->isVectorTy())
+ *CI = InsertToVector(call, *CI);
+ break;
+ }
+ case GEN_OCL_VME:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+ setAppendPoint(call);
+ extractFromVector(call);
+ break;
+ case GEN_OCL_PRINTF:
+ for (; CI != CS.arg_end(); ++CI)
+ if ((*CI)->getType()->isVectorTy())
+ *CI = InsertToVector(call, *CI);
+ break;
}
}
}
@@ -719,17 +765,52 @@ namespace gbe {
if (! isa<Constant>(extr->getOperand(1))) {
// TODO: Variably referenced components. Probably handle/emulate through
// a series of selects.
- NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components");
+ //NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components");
+ //TODO: This is an implementation for the non-constant index: we use a newly allocated vector
+ //to store the needed vector elements.
+ Value* foo = extr->getOperand(0);
+ Type* fooTy = foo ? foo->getType() : NULL;
+
+ Value* Alloc;
+ if(vectorAlloca.find(foo) == vectorAlloca.end())
+ {
+ BasicBlock &entry = extr->getParent()->getParent()->getEntryBlock();
+ BasicBlock::iterator bbIter = entry.begin();
+ while (isa<AllocaInst>(bbIter)) ++bbIter;
+
+ IRBuilder<> allocBuilder(&entry);
+ allocBuilder.SetInsertPoint(&*bbIter);
+
+ Alloc = allocBuilder.CreateAlloca(fooTy, nullptr, "");
+ for (int i = 0; i < GetComponentCount(foo); ++i)
+ {
+ Value* foo_i = getComponent(i, foo);
+ assert(foo_i && "There is unhandled vector component");
+ Value* idxs_i[] = {ConstantInt::get(intTy,0), ConstantInt::get(intTy,i)};
+ Value* storePtr_i = builder->CreateGEP(Alloc, idxs_i);
+ builder->CreateStore(foo_i, storePtr_i);
+ }
+ vectorAlloca[foo] = Alloc;
+ }
+ else Alloc = vectorAlloca[foo];
+
+ Value* Idxs[] = {ConstantInt::get(intTy,0), extr->getOperand(1)};
+ Value* getPtr = builder->CreateGEP(Alloc, Idxs);
+ Value* loadComp = builder->CreateLoad(getPtr);
+ extr->replaceAllUsesWith(loadComp);
+ return true;
}
//if (isa<Argument>(extr->getOperand(0)))
// return false;
- int component = GetConstantInt(extr->getOperand(1));
- Value* v = getComponent(component, extr->getOperand(0));
- if(extr == v)
- return false;
- replaceAllUsesOfWith(dyn_cast<Instruction>(extr), dyn_cast<Instruction>(v));
+ else{
+ int component = GetConstantInt(extr->getOperand(1));
+ Value* v = getComponent(component, extr->getOperand(0));
+ if(extr == v)
+ return false;
+ replaceAllUsesOfWith(dyn_cast<Instruction>(extr), dyn_cast<Instruction>(v));
- return true;
+ return true;
+ }
}
bool Scalarize::scalarizeInsert(InsertElementInst* ins)
@@ -759,7 +840,10 @@ namespace gbe {
return;
ReversePostOrderTraversal<Function*> rpot(&F);
BasicBlock::iterator instI = (*rpot.begin())->begin();
- builder->SetInsertPoint(instI);
+ Instruction* instVal = &*instI;
+ if(instVal == nullptr)
+ return;
+ builder->SetInsertPoint(instVal);
Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
@@ -767,7 +851,7 @@ namespace gbe {
Type *type = I->getType();
if(type->isVectorTy())
- extractFromVector(I);
+ extractFromVector(&*I);
}
return;
}
@@ -804,11 +888,11 @@ namespace gbe {
RPOTType rpot(&F);
for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) {
for (BasicBlock::iterator instI = (*bbI)->begin(), instE = (*bbI)->end(); instI != instE; ++instI) {
- bool scalarized = scalarize(instI);
+ bool scalarized = scalarize(&*instI);
if (scalarized) {
changed = true;
// TODO: uncomment when done
- deadList.push_back(instI);
+ deadList.push_back(&*instI);
}
}
}
@@ -836,6 +920,7 @@ namespace gbe {
incompletePhis.clear();
vectorVals.clear();
usedVecVals.clear();
+ vectorAlloca.clear();
delete builder;
builder = 0;
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 24d4be7..02a69ec 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -26,6 +26,8 @@
#include "llvm/llvm_gen_backend.hpp"
#include "llvm/llvm_to_gen.hpp"
+#include <llvm/IR/DiagnosticInfo.h>
+#include <llvm/IR/DiagnosticPrinter.h>
#include "sys/cvar.hpp"
#include "sys/platform.hpp"
#include "ir/unit.hpp"
@@ -45,7 +47,6 @@ namespace gbe
using namespace llvm;
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
- using namespace llvm::legacy;
#define TARGETLIBRARY TargetLibraryInfoImpl
#else
#define TARGETLIBRARY TargetLibraryInfo
@@ -53,7 +54,11 @@ namespace gbe
void runFuntionPass(Module &mod, TARGETLIBRARY *libraryInfo, const DataLayout &DL)
{
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+ legacy::FunctionPassManager FPM(&mod);
+#else
FunctionPassManager FPM(&mod);
+#endif
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
@@ -74,8 +79,13 @@ namespace gbe
#else
FPM.add(new TargetLibraryInfo(*libraryInfo));
#endif
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ FPM.add(createTypeBasedAAWrapperPass());
+ FPM.add(createBasicAAWrapperPass());
+#else
FPM.add(createTypeBasedAliasAnalysisPass());
FPM.add(createBasicAliasAnalysisPass());
+#endif
FPM.add(createCFGSimplificationPass());
FPM.add(createSROAPass());
FPM.add(createEarlyCSEPass());
@@ -91,7 +101,11 @@ namespace gbe
void runModulePass(Module &mod, TARGETLIBRARY *libraryInfo, const DataLayout &DL, int optLevel, bool strictMath)
{
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+ legacy::PassManager MPM;
+#else
PassManager MPM;
+#endif
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
@@ -107,8 +121,13 @@ namespace gbe
#else
MPM.add(new TargetLibraryInfo(*libraryInfo));
#endif
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ MPM.add(createTypeBasedAAWrapperPass());
+ MPM.add(createBasicAAWrapperPass());
+#else
MPM.add(createTypeBasedAliasAnalysisPass());
MPM.add(createBasicAliasAnalysisPass());
+#endif
MPM.add(createIntrinsicLoweringPass());
MPM.add(createStripAttributesPass()); // Strip unsupported attributes and calling conventions.
MPM.add(createSamplerFixPass());
@@ -123,11 +142,19 @@ namespace gbe
MPM.add(createBarrierNodupPass(false)); // remove noduplicate fnAttr before inlining.
MPM.add(createFunctionInliningPass(20000));
MPM.add(createBarrierNodupPass(true)); // restore noduplicate fnAttr after inlining.
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ MPM.add(createPostOrderFunctionAttrsPass()); // Set readonly/readnone attrs
+#else
MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs
+#endif
//MPM.add(createScalarReplAggregatesPass(64, true, -1, -1, 64))
if(optLevel > 0)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ MPM.add(createSROAPass());
+#else
MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+#endif
MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
MPM.add(createJumpThreadingPass()); // Thread jumps.
MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
@@ -144,9 +171,13 @@ namespace gbe
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
MPM.add(createLoopDeletionPass()); // Delete dead loops
- MPM.add(createLoopUnrollPass(1024)); //1024, 32, 1024, 512)); //Unroll loops
+ MPM.add(createLoopUnrollPass(640)); //1024, 32, 1024, 512)); //Unroll loops
if(optLevel > 0) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ MPM.add(createSROAPass());
+#else
MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+#endif
MPM.add(createGVNPass()); // Remove redundancies
}
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
@@ -154,10 +185,16 @@ namespace gbe
// and it may even make som cl kernel cannot compile because of limited scratch memory for spill.
// As we observe this under strict math. So we disable CustomLoopUnroll if strict math is enabled.
if (!strictMath) {
+#if !defined(__ANDROID__)
MPM.add(createCustomLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
+#endif
MPM.add(createLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
if(optLevel > 0) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ MPM.add(createSROAPass());
+#else
MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+#endif
MPM.add(createGVNPass()); // Remove redundancies
}
}
@@ -184,7 +221,15 @@ namespace gbe
}
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#define OUTPUT_BITCODE(STAGE, MOD) do { \
+ legacy::PassManager passes__; \
+ if (OCL_OUTPUT_LLVM_##STAGE) { \
+ passes__.add(createPrintModulePass(*o)); \
+ passes__.run(MOD); \
+ } \
+ }while(0)
+#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
#define OUTPUT_BITCODE(STAGE, MOD) do { \
PassManager passes__; \
if (OCL_OUTPUT_LLVM_##STAGE) { \
@@ -206,7 +251,36 @@ namespace gbe
BVAR(OCL_OUTPUT_LLVM_AFTER_LINK, false);
BVAR(OCL_OUTPUT_LLVM_AFTER_GEN, false);
- bool llvmToGen(ir::Unit &unit, const char *fileName,const void* module, int optLevel, bool strictMath)
+ class gbeDiagnosticContext
+ {
+ public:
+ gbeDiagnosticContext() : _str(""), messages(_str), printer(messages), _has_errors(false) {}
+ void process(const llvm::DiagnosticInfo &diagnostic)
+ {
+ if (diagnostic.getSeverity() != DS_Remark) { // avoid noise from function inlining remarks
+ diagnostic.print(printer);
+ }
+ if (diagnostic.getSeverity() == DS_Error) {
+ _has_errors = true;
+ }
+ }
+ std::string str(){return messages.str();}
+ bool has_errors(){return _has_errors;}
+ private:
+ std::string _str;
+ llvm::raw_string_ostream messages;
+ llvm::DiagnosticPrinterRawOStream printer;
+ bool _has_errors;
+ };
+
+ void gbeDiagnosticHandler(const llvm::DiagnosticInfo &diagnostic, void *context)
+ {
+ gbeDiagnosticContext *dc = reinterpret_cast<gbeDiagnosticContext*>(context);
+ dc->process(diagnostic);
+ }
+
+ bool llvmToGen(ir::Unit &unit, const char *fileName,const void* module,
+ int optLevel, bool strictMath, int profiling, std::string &errors)
{
std::string errInfo;
std::unique_ptr<llvm::raw_fd_ostream> o = NULL;
@@ -243,6 +317,9 @@ namespace gbe
Module &mod = *M.get();
DataLayout DL(&mod);
+
+ gbeDiagnosticContext dc;
+ mod.getContext().setDiagnosticHandler(&gbeDiagnosticHandler,&dc);
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
mod.setDataLayout(DL);
@@ -255,7 +332,11 @@ namespace gbe
runFuntionPass(mod, libraryInfo, DL);
runModulePass(mod, libraryInfo, DL, optLevel, strictMath);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+ legacy::PassManager passes;
+#else
PassManager passes;
+#endif
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
passes.add(new DataLayoutPass());
@@ -274,7 +355,7 @@ namespace gbe
passes.add(createPromoteMemoryToRegisterPass());
if(optLevel > 0)
passes.add(createGVNPass()); // Remove redundancies
- passes.add(createPrintfParserPass());
+ passes.add(createPrintfParserPass(unit));
passes.add(createExpandConstantExprPass()); // expand ConstantExpr
passes.add(createScalarizePass()); // Expand all vector ops
passes.add(createExpandLargeIntegersPass()); // legalize large integer operation
@@ -286,6 +367,9 @@ namespace gbe
passes.add(createDeadInstEliminationPass()); // Remove simplified instructions
passes.add(createCFGSimplificationPass()); // Merge & remove BBs
passes.add(createLowerSwitchPass()); // simplify cfg will generate switch-case instruction
+ if (profiling) {
+ passes.add(createProfilingInserterPass(profiling, unit)); // insert the time stamp for profiling.
+ }
passes.add(createScalarizePass()); // Expand all vector ops
if(OCL_OUTPUT_CFG)
@@ -294,6 +378,12 @@ namespace gbe
passes.add(createCFGOnlyPrinterPass());
passes.add(createGenPass(unit));
passes.run(mod);
+ errors = dc.str();
+ if(dc.has_errors()){
+ unit.setValid(false);
+ delete libraryInfo;
+ return true;
+ }
// Print the code extra optimization passes
OUTPUT_BITCODE(AFTER_GEN, mod);
diff --git a/backend/src/llvm/llvm_to_gen.hpp b/backend/src/llvm/llvm_to_gen.hpp
index 22ffcb4..e0a6145 100644
--- a/backend/src/llvm/llvm_to_gen.hpp
+++ b/backend/src/llvm/llvm_to_gen.hpp
@@ -32,7 +32,8 @@ namespace gbe {
/*! Convert the LLVM IR code to a GEN IR code,
optLevel 0 equal to clang -O1 and 1 equal to clang -O2*/
- bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module, int optLevel, bool strictMath);
+ bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module,
+ int optLevel, bool strictMath, int profiling, std::string &errors);
} /* namespace gbe */
diff --git a/backend/src/llvm/llvm_unroll.cpp b/backend/src/llvm/llvm_unroll.cpp
index 6990e39..a289c11 100644
--- a/backend/src/llvm/llvm_unroll.cpp
+++ b/backend/src/llvm/llvm_unroll.cpp
@@ -47,8 +47,13 @@ namespace gbe {
AU.addPreservedID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+#else
AU.addRequired<ScalarEvolution>();
AU.addPreserved<ScalarEvolution>();
+#endif
// FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
// If loop unroll does not preserve dom info then LCSSA pass on next
// loop will receive invalid dom info.
@@ -156,7 +161,12 @@ namespace gbe {
// be unrolled.
bool handleParentLoops(Loop *L, LPPassManager &LPM) {
Loop *currL = L;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+#else
ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
+#endif
BasicBlock *ExitBlock = currL->getLoopLatch();
if (!ExitBlock || !L->isLoopExiting(ExitBlock))
ExitBlock = currL->getExitingBlock();
@@ -166,6 +176,12 @@ namespace gbe {
if (ExitBlock)
currTripCount = SE->getSmallConstantTripCount(L, ExitBlock);
+ if (currTripCount > 32) {
+ shouldUnroll = false;
+ setUnrollID(currL, false);
+ return shouldUnroll;
+ }
+
while(currL) {
Loop *parentL = currL->getParentLoop();
unsigned parentTripCount = 0;
@@ -177,16 +193,17 @@ namespace gbe {
if (parentExitBlock)
parentTripCount = SE->getSmallConstantTripCount(parentL, parentExitBlock);
}
- if ((parentTripCount != 0 && currTripCount / parentTripCount > 16) ||
- (currTripCount > 32)) {
- if (currL == L)
- shouldUnroll = false;
- setUnrollID(currL, false);
- if (currL != L)
- LPM.deleteLoopFromQueue(currL);
+ if (parentTripCount != 0 && currTripCount * parentTripCount > 32) {
+ setUnrollID(parentL, false);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+ loopInfo.markAsRemoved(parentL);
+#else
+ LPM.deleteLoopFromQueue(parentL);
+#endif
+ return shouldUnroll;
}
currL = parentL;
- currTripCount = parentTripCount;
+ currTripCount = parentTripCount * currTripCount;
}
return shouldUnroll;
}
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index 52f5365..42e6cc4 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -67,12 +67,19 @@ typedef enum clk_channel_type {
}clk_channel_type;
typedef enum clk_sampler_type {
+ __CLK_NORMALIZED_BASE = 0,
+ CLK_NORMALIZED_COORDS_FALSE = 0,
+ CLK_NORMALIZED_COORDS_TRUE = (1 << __CLK_NORMALIZED_BASE),
+ __CLK_NORMALIZED_MASK = (CLK_NORMALIZED_COORDS_FALSE |
+ CLK_NORMALIZED_COORDS_TRUE),
+ __CLK_NORMALIZED_BITS = 1, // number of bits required to
+ // represent normalization
__CLK_ADDRESS_BASE = 0,
CLK_ADDRESS_NONE = (0 << __CLK_ADDRESS_BASE),
- CLK_ADDRESS_CLAMP = (1 << __CLK_ADDRESS_BASE),
CLK_ADDRESS_CLAMP_TO_EDGE = (2 << __CLK_ADDRESS_BASE),
- CLK_ADDRESS_REPEAT = (3 << __CLK_ADDRESS_BASE),
- CLK_ADDRESS_MIRROR = (4 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_CLAMP = (4 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_REPEAT = (6 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_MIRROR = (8 << __CLK_ADDRESS_BASE),
#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
@@ -80,20 +87,12 @@ typedef enum clk_sampler_type {
__CLK_ADDRESS_MASK = (CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR),
- __CLK_ADDRESS_BITS = 3, // number of bits required to
+ __CLK_ADDRESS_BITS = 4, // number of bits required to
// represent address info
-
- __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
- CLK_NORMALIZED_COORDS_FALSE = 0,
- CLK_NORMALIZED_COORDS_TRUE = (1 << __CLK_NORMALIZED_BASE),
- __CLK_NORMALIZED_MASK = (CLK_NORMALIZED_COORDS_FALSE |
- CLK_NORMALIZED_COORDS_TRUE),
- __CLK_NORMALIZED_BITS = 1, // number of bits required to
- // represent normalization
- __CLK_FILTER_BASE = (__CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS),
- CLK_FILTER_NEAREST = (0 << __CLK_FILTER_BASE),
- CLK_FILTER_LINEAR = (1 << __CLK_FILTER_BASE),
- CLK_FILTER_ANISOTROPIC = (2 << __CLK_FILTER_BASE),
+ __CLK_FILTER_BASE = (__CLK_ADDRESS_BASE + __CLK_ADDRESS_BITS),
+ CLK_FILTER_ANISOTROPIC = (0 << __CLK_FILTER_BASE),
+ CLK_FILTER_NEAREST = (1 << __CLK_FILTER_BASE),
+ CLK_FILTER_LINEAR = (2 << __CLK_FILTER_BASE),
__CLK_FILTER_MASK = (CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
CLK_FILTER_ANISOTROPIC),
__CLK_FILTER_BITS = 2, // number of bits required to
@@ -123,4 +122,4 @@ typedef enum clk_sampler_type {
#define CLK_LOCAL_MEM_FENCE (1 << 0)
#define CLK_GLOBAL_MEM_FENCE (1 << 1)
-#endif /* __OCL_COMMON_DEFINES__ */
\ No newline at end of file
+#endif /* __OCL_COMMON_DEFINES__ */
diff --git a/backend/src/sys/alloc.hpp b/backend/src/sys/alloc.hpp
index 6ee4e69..cc8aaac 100644
--- a/backend/src/sys/alloc.hpp
+++ b/backend/src/sys/alloc.hpp
@@ -121,7 +121,8 @@ namespace gbe
{
/*! STL compliant allocator to intercept all memory allocations */
template<typename T>
- class Allocator {
+ class Allocator : public std::allocator<T>
+ {
public:
typedef T value_type;
typedef value_type* pointer;
@@ -156,7 +157,6 @@ namespace gbe
INLINE size_type max_size(void) const {
return std::numeric_limits<size_type>::max() / sizeof(T);
}
- INLINE void construct(pointer p, const T& t = T()) { ::new(p) T(t); }
INLINE void destroy(pointer p) { p->~T(); }
INLINE bool operator==(Allocator const&) { return true; }
INLINE bool operator!=(Allocator const& a) { return !operator==(a); }
diff --git a/backend/src/sys/platform.hpp b/backend/src/sys/platform.hpp
index 803ce21..9ed9b2d 100644
--- a/backend/src/sys/platform.hpp
+++ b/backend/src/sys/platform.hpp
@@ -339,12 +339,12 @@ public:
INLINE Serializable(const Serializable&) = default;
INLINE Serializable& operator= (const Serializable&) = default;
- virtual size_t serializeToBin(std::ostream& outs) = 0;
- virtual size_t deserializeFromBin(std::istream& ins) = 0;
+ virtual uint32_t serializeToBin(std::ostream& outs) = 0;
+ virtual uint32_t deserializeFromBin(std::istream& ins) = 0;
/* These two will follow LLVM's ABI. */
- virtual size_t serializeToLLVM(void) { return 0;/* not implemented now. */}
- virtual size_t deserializeFromLLVM(void) { return 0;/* not implemented now. */}
+ virtual uint32_t serializeToLLVM(void) { return 0;/* not implemented now. */}
+ virtual uint32_t deserializeFromLLVM(void) { return 0;/* not implemented now. */}
virtual void printStatus(int indent = 0, std::ostream& outs = std::cout) { }
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 3e43a21..f9b246b 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -16,7 +16,11 @@ set (benchmark_sources
benchmark_read_buffer.cpp
benchmark_read_image.cpp
benchmark_copy_buffer_to_image.cpp
- benchmark_copy_image_to_buffer.cpp)
+ benchmark_copy_image_to_buffer.cpp
+ benchmark_copy_buffer.cpp
+ benchmark_copy_image.cpp
+ benchmark_workgroup.cpp
+ benchmark_math.cpp)
SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
@@ -29,3 +33,4 @@ TARGET_LINK_LIBRARIES(benchmarks cl m)
ADD_EXECUTABLE(benchmark_run benchmark_run.cpp)
TARGET_LINK_LIBRARIES(benchmark_run benchmarks)
+ADD_CUSTOM_TARGET(benchmark DEPENDS benchmarks benchmark_run)
diff --git a/benchmark/benchmark_copy_buf.cpp b/benchmark/benchmark_copy_buf.cpp
index e21c936..a85af8c 100644
--- a/benchmark/benchmark_copy_buf.cpp
+++ b/benchmark/benchmark_copy_buf.cpp
@@ -16,6 +16,8 @@ double benchmark_copy_buf(void)
buf0 = (cl_char *)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 0, sizeof(char), 0, NULL, NULL, NULL);
+ OCL_ASSERT(buf0 != NULL);
+
for (i=0; i < sz; i++) {
buf0[i]=(rand() & 0xFF);
}
@@ -48,4 +50,4 @@ double benchmark_copy_buf(void)
return BANDWIDTH(sz * sizeof(char) * 100, elapsed);
}
-MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buf);
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buf, "GB/S");
diff --git a/benchmark/benchmark_copy_buffer.cpp b/benchmark/benchmark_copy_buffer.cpp
new file mode 100644
index 0000000..b56dcdf
--- /dev/null
+++ b/benchmark/benchmark_copy_buffer.cpp
@@ -0,0 +1,57 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+#define BENCH_COPY_BUFFER(J, T, K, M) \
+double benchmark_ ##J ##_buffer_ ##T(void) \
+{ \
+ struct timeval start,stop; \
+ \
+ const size_t w = 1920; \
+ const size_t h = 1080; \
+ const size_t sz = 4 * w * h; \
+ \
+ OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(M), NULL); \
+ OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(M), NULL); \
+ \
+ OCL_CREATE_KERNEL_FROM_FILE("bench_copy_buffer",K); \
+ \
+ OCL_MAP_BUFFER(0); \
+ for (size_t i = 0; i < sz; i ++) { \
+ ((M *)(buf_data[0]))[i] = rand(); \
+ } \
+ OCL_UNMAP_BUFFER(0); \
+ \
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); \
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \
+ \
+ globals[0] = w; \
+ globals[1] = h; \
+ locals[0] = 16; \
+ locals[1] = 4; \
+ \
+ gettimeofday(&start,0); \
+ for (size_t i=0; i<1000; i++) { \
+ OCL_NDRANGE(2); \
+ } \
+ OCL_FINISH(); \
+ \
+ OCL_MAP_BUFFER(1); \
+ OCL_UNMAP_BUFFER(1); \
+ gettimeofday(&stop,0); \
+ \
+ free(buf_data[0]); \
+ buf_data[0] = NULL; \
+ \
+ double elapsed = time_subtract(&stop, &start, 0); \
+ \
+ return (double)(1000 / (elapsed * 1e-3)); \
+} \
+ \
+MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(benchmark_ ##J ##_buffer_ ##T, true, "FPS");
+
+BENCH_COPY_BUFFER(copy, uchar, "bench_copy_buffer_uchar", unsigned char)
+BENCH_COPY_BUFFER(copy, ushort, "bench_copy_buffer_ushort", unsigned short)
+BENCH_COPY_BUFFER(copy, uint, "bench_copy_buffer_uint", unsigned int)
+BENCH_COPY_BUFFER(filter, uchar, "bench_filter_buffer_uchar", unsigned char)
+BENCH_COPY_BUFFER(filter, ushort, "bench_filter_buffer_ushort", unsigned short)
+BENCH_COPY_BUFFER(filter, uint, "bench_filter_buffer_uint", unsigned int)
diff --git a/benchmark/benchmark_copy_buffer_to_image.cpp b/benchmark/benchmark_copy_buffer_to_image.cpp
index 2177cfe..befca6b 100644
--- a/benchmark/benchmark_copy_buffer_to_image.cpp
+++ b/benchmark/benchmark_copy_buffer_to_image.cpp
@@ -63,4 +63,4 @@ double benchmark_copy_buffer_to_image(void)
return BANDWIDTH(sz * 100, elapsed);
}
-MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buffer_to_image);
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buffer_to_image, "GB/S");
diff --git a/benchmark/benchmark_copy_image.cpp b/benchmark/benchmark_copy_image.cpp
new file mode 100644
index 0000000..dc82a36
--- /dev/null
+++ b/benchmark/benchmark_copy_image.cpp
@@ -0,0 +1,72 @@
+#include <string.h>
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+#define BENCH_COPY_IMAGE(J, T, K, M, Q) \
+double benchmark_ ##J ##_image_ ##T(void) \
+{ \
+ struct timeval start,stop; \
+\
+ const size_t w = 1920; \
+ const size_t h = 1080; \
+ const size_t sz = 4 * w * h; \
+ cl_image_format format; \
+ cl_image_desc desc; \
+\
+ memset(&desc, 0x0, sizeof(cl_image_desc)); \
+ memset(&format, 0x0, sizeof(cl_image_format)); \
+\
+ OCL_CREATE_KERNEL_FROM_FILE("bench_copy_image",K); \
+ buf_data[0] = (uint32_t*) malloc(sizeof(M) * sz); \
+ for (uint32_t i = 0; i < sz; ++i) { \
+ ((M*)buf_data[0])[i] = rand(); \
+ } \
+\
+ format.image_channel_order = CL_RGBA; \
+ format.image_channel_data_type = Q; \
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D; \
+ desc.image_width = w; \
+ desc.image_height = h; \
+ desc.image_row_pitch = desc.image_width * sizeof(M) * 4; \
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]); \
+\
+ desc.image_row_pitch = 0; \
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL); \
+\
+ free(buf_data[0]); \
+ buf_data[0] = NULL; \
+\
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); \
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \
+\
+ globals[0] = w; \
+ globals[1] = h; \
+ locals[0] = 16; \
+ locals[1] = 4; \
+\
+ gettimeofday(&start,0); \
+ for (size_t i=0; i<1000; i++) { \
+ OCL_NDRANGE(2); \
+ } \
+ OCL_FINISH(); \
+\
+ OCL_MAP_BUFFER_GTT(1); \
+ OCL_UNMAP_BUFFER_GTT(1); \
+ gettimeofday(&stop,0); \
+\
+ free(buf_data[0]); \
+ buf_data[0] = NULL; \
+\
+ double elapsed = time_subtract(&stop, &start, 0); \
+\
+ return (double)(1000 / (elapsed * 1e-3)); \
+} \
+\
+MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(benchmark_ ##J ##_image_ ##T, true, "FPS");
+
+BENCH_COPY_IMAGE(copy,uchar, "bench_copy_image", unsigned char, CL_UNSIGNED_INT8)
+BENCH_COPY_IMAGE(copy,ushort, "bench_copy_image", unsigned short, CL_UNSIGNED_INT16)
+BENCH_COPY_IMAGE(copy,uint, "bench_copy_image", unsigned int,CL_UNSIGNED_INT32)
+BENCH_COPY_IMAGE(filter,uchar, "bench_filter_image", unsigned char,CL_UNSIGNED_INT8)
+BENCH_COPY_IMAGE(filter,ushort, "bench_filter_image", unsigned short,CL_UNSIGNED_INT16)
+BENCH_COPY_IMAGE(filter,uint, "bench_filter_image", unsigned int,CL_UNSIGNED_INT32)
diff --git a/benchmark/benchmark_copy_image_to_buffer.cpp b/benchmark/benchmark_copy_image_to_buffer.cpp
index debed09..35e2e0d 100644
--- a/benchmark/benchmark_copy_image_to_buffer.cpp
+++ b/benchmark/benchmark_copy_image_to_buffer.cpp
@@ -61,4 +61,4 @@ double benchmark_copy_image_to_buffer(void)
return BANDWIDTH(sz * 100, elapsed);
}
-MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_image_to_buffer);
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_image_to_buffer, "GB/S");
diff --git a/benchmark/benchmark_math.cpp b/benchmark/benchmark_math.cpp
new file mode 100644
index 0000000..72bc316
--- /dev/null
+++ b/benchmark/benchmark_math.cpp
@@ -0,0 +1,126 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+double benchmark_generic_math(const char* str_filename,
+ const char* str_kernel)
+{
+ double elapsed = 0;
+ struct timeval start,stop;
+ const size_t global_size = 1024 * 1024;
+ const size_t local_size = 64;
+
+ /* Compute math OP, loop times on global size */
+ cl_float base = 1.000002;
+ cl_float pwr = 1.0102003;
+ uint32_t loop = 1000;
+
+ /* Input set will be generated */
+ float* src = (float*)calloc(sizeof(float), global_size);
+ OCL_ASSERT(src != NULL);
+ for(uint32_t i = 0; i < global_size; i++)
+ src[i] = base + i * (base - 1);
+
+ /* Setup kernel and buffers */
+ OCL_CALL(cl_kernel_init, str_filename, str_kernel, SOURCE, "");
+
+ OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL);
+
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], src, global_size * sizeof(float));
+ OCL_UNMAP_BUFFER(0);
+
+ globals[0] = global_size;
+ locals[0] = local_size;
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_float), &pwr);
+ OCL_SET_ARG(3, sizeof(cl_uint), &loop);
+
+ /* Measure performance */
+ gettimeofday(&start,0);
+ OCL_NDRANGE(1);
+ clFinish(queue);
+ gettimeofday(&stop,0);
+ elapsed = time_subtract(&stop, &start, 0);
+
+ /* Show compute results */
+ OCL_MAP_BUFFER(1);
+ for(uint32_t i = 0; i < global_size; i += 8192)
+ printf("\t%.3f", ((float*)buf_data[1])[i]);
+ OCL_UNMAP_BUFFER(1);
+
+ return BANDWIDTH(global_size * loop, elapsed);
+}
+
+double benchmark_math_pow(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_pow");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_pow, "Mop/s");
+
+double benchmark_math_exp2(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_exp2");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp2, "Mop/s");
+
+double benchmark_math_exp(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_exp");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp, "Mop/s");
+
+double benchmark_math_exp10(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_exp10");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp10, "Mop/s");
+
+double benchmark_math_log2(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_log2");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log2, "Mop/s");
+
+double benchmark_math_log(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_log");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log, "Mop/s");
+
+double benchmark_math_log10(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_log10");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log10, "Mop/s");
+
+double benchmark_math_sqrt(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_sqrt");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_sqrt, "Mop/s");
+
+double benchmark_math_sin(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_sin");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_sin, "Mop/s");
+
+double benchmark_math_cos(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_cos");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_cos, "Mop/s");
+
+double benchmark_math_tan(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_tan");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_tan, "Mop/s");
+
+double benchmark_math_asin(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_asin");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_asin, "Mop/s");
+
+double benchmark_math_acos(void){
+ return benchmark_generic_math("bench_math.cl", "bench_math_acos");
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_acos, "Mop/s");
diff --git a/benchmark/benchmark_read_buffer.cpp b/benchmark/benchmark_read_buffer.cpp
index 431f42a..0eb8c7a 100644
--- a/benchmark/benchmark_read_buffer.cpp
+++ b/benchmark/benchmark_read_buffer.cpp
@@ -39,7 +39,6 @@ double benchmark_read_buffer(void)
OCL_FINISH();
gettimeofday(&stop,0);
- clReleaseMemObject(buf[0]);
free(buf_data[0]);
buf_data[0] = NULL;
@@ -48,4 +47,4 @@ double benchmark_read_buffer(void)
return BANDWIDTH(sz * sizeof(float) * 2 * 100, elapsed);
}
-MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_buffer);
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_buffer, "GB/S");
diff --git a/benchmark/benchmark_read_image.cpp b/benchmark/benchmark_read_image.cpp
index e3aa5bd..2f92024 100644
--- a/benchmark/benchmark_read_image.cpp
+++ b/benchmark/benchmark_read_image.cpp
@@ -57,7 +57,6 @@ double benchmark_read_image(void)
OCL_FINISH();
gettimeofday(&stop,0);
- clReleaseMemObject(buf[0]);
free(buf_data[0]);
buf_data[0] = NULL;
@@ -66,4 +65,4 @@ double benchmark_read_image(void)
return BANDWIDTH(sz * sizeof(float) * 2 * 100, elapsed);
}
-MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_image);
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_image, "GB/S");
diff --git a/benchmark/benchmark_run.cpp b/benchmark/benchmark_run.cpp
index 01748ce..d51bf23 100644
--- a/benchmark/benchmark_run.cpp
+++ b/benchmark/benchmark_run.cpp
@@ -114,4 +114,5 @@ int main(int argc, char *argv[])
} while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1);
cl_ocl_destroy();
+ return 0;
}
diff --git a/benchmark/benchmark_use_host_ptr_buffer.cpp b/benchmark/benchmark_use_host_ptr_buffer.cpp
index 9e3d155..6a8cdd7 100644
--- a/benchmark/benchmark_use_host_ptr_buffer.cpp
+++ b/benchmark/benchmark_use_host_ptr_buffer.cpp
@@ -28,7 +28,6 @@ double benchmark_use_host_ptr_buffer(void)
}
gettimeofday(&stop,0);
- clReleaseMemObject(buf[0]);
free(buf_data[0]);
buf_data[0] = NULL;
@@ -37,4 +36,4 @@ double benchmark_use_host_ptr_buffer(void)
return BANDWIDTH(n*sizeof(uint32_t)*100*2, elapsed);
}
-MAKE_BENCHMARK_FROM_FUNCTION(benchmark_use_host_ptr_buffer);
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_use_host_ptr_buffer, "GB/S");
diff --git a/benchmark/benchmark_workgroup.cpp b/benchmark/benchmark_workgroup.cpp
new file mode 100644
index 0000000..3f073bb
--- /dev/null
+++ b/benchmark/benchmark_workgroup.cpp
@@ -0,0 +1,370 @@
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include <sys/time.h>
+#include <iomanip>
+#include <algorithm>
+
+using namespace std;
+
+/* work-group general settings */
+#define WG_GLOBAL_SIZE (512 * 256)
+#define WG_LOCAL_SIZE 128
+#define WG_LOOP_COUNT 1000
+
+/* work-group broadcast only */
+#define WG_GLOBAL_SIZE_X 1024
+#define WG_GLOBAL_SIZE_Y 1024
+
+#define WG_LOCAL_SIZE_X 32
+#define WG_LOCAL_SIZE_Y 2
+
+#define WG_LOCAL_X 5
+#define WG_LOCAL_Y 0
+
+
+enum WG_FUNCTION
+{
+ WG_BROADCAST_1D,
+ WG_BROADCAST_2D,
+ WG_REDUCE_ADD,
+ WG_REDUCE_MIN,
+ WG_REDUCE_MAX,
+ WG_SCAN_EXCLUSIVE_ADD,
+ WG_SCAN_EXCLUSIVE_MAX,
+ WG_SCAN_EXCLUSIVE_MIN,
+ WG_SCAN_INCLUSIVE_ADD,
+ WG_SCAN_INCLUSIVE_MAX,
+ WG_SCAN_INCLUSIVE_MIN
+};
+
+/*
+ * Generic compute-expected on CPU function for any workgroup type
+ * and any variable type
+ */
+template<class T>
+static void benchmark_expected(WG_FUNCTION wg_func,
+ T* input,
+ T* expected,
+ uint32_t wg_global_size,
+ uint32_t wg_local_size)
+{
+ if(wg_func == WG_BROADCAST_1D)
+ {
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] = input[WG_LOCAL_X];
+ }
+ else if(wg_func == WG_BROADCAST_2D)
+ {
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] =
+ input[WG_LOCAL_X +
+ WG_LOCAL_Y * WG_LOCAL_SIZE_X];
+ }
+ else if(wg_func == WG_REDUCE_ADD)
+ {
+ T wg_sum = input[0];
+ for(uint32_t i = 1; i < wg_local_size; i++)
+ wg_sum += input[i];
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] = wg_sum;
+ }
+ else if(wg_func == WG_REDUCE_MAX)
+ {
+ T wg_max = input[0];
+ for(uint32_t i = 1; i < wg_local_size; i++)
+ wg_max = max(input[i], wg_max);
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] = wg_max;
+ }
+ else if(wg_func == WG_REDUCE_MIN)
+ {
+ T wg_min = input[0];
+ for(uint32_t i = 1; i < wg_local_size; i++)
+ wg_min = min(input[i], wg_min);
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] = wg_min;
+ }
+ else if(wg_func == WG_SCAN_INCLUSIVE_ADD)
+ {
+ expected[0] = input[0];
+ for(uint32_t i = 1; i < wg_local_size; i++)
+ expected[i] = input[i] + expected[i - 1];
+ }
+ else if(wg_func == WG_SCAN_INCLUSIVE_MAX)
+ {
+ expected[0] = input[0];
+ for(uint32_t i = 1; i < wg_local_size; i++)
+ expected[i] = max(input[i], expected[i - 1]);
+ }
+ else if(wg_func == WG_SCAN_INCLUSIVE_MIN)
+ {
+ expected[0] = input[0];
+ for(uint32_t i = 1; i < wg_local_size; i++)
+ expected[i] = min(input[i], expected[i - 1]);
+ }
+}
+
+/*
+ * Generic input-expected generate function for any workgroup type
+ * and any variable type
+ */
+template<class T>
+static void benchmark_data(WG_FUNCTION wg_func,
+ T* &input,
+ T* &expected,
+ uint32_t &wg_global_size,
+ uint32_t &wg_local_size)
+{
+ if(wg_func == WG_BROADCAST_1D)
+ {
+ wg_global_size = WG_GLOBAL_SIZE_X;
+ wg_local_size = WG_LOCAL_SIZE_X;
+ }
+ else if(wg_func == WG_BROADCAST_2D)
+ {
+ wg_global_size = WG_GLOBAL_SIZE_X * WG_GLOBAL_SIZE_Y;
+ wg_local_size = WG_LOCAL_SIZE_X * WG_LOCAL_SIZE_Y;
+ }
+ else
+ {
+ wg_global_size = WG_GLOBAL_SIZE;
+ wg_local_size = WG_LOCAL_SIZE;
+ }
+
+ input = new T[wg_global_size];
+ expected = new T[wg_global_size];
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < wg_global_size; gid += wg_local_size)
+ {
+ /* input values */
+ for(uint32_t lid = 0; lid < wg_local_size; lid++)
+ input[gid + lid] = (rand() % 512) / 3.1415f;
+
+ /* expected values */
+ benchmark_expected(wg_func, input + gid, expected + gid,
+ wg_global_size, wg_local_size);
+ }
+}
+
+/*
+ * Generic benchmark function for any workgroup type
+ * and any variable type
+ */
+template<class T>
+static double benchmark_generic(WG_FUNCTION wg_func,
+ T* input,
+ T* expected)
+{
+ double elapsed = 0;
+ const uint32_t reduce_loop = WG_LOOP_COUNT;
+ struct timeval start,stop;
+
+ uint32_t wg_global_size = 0;
+ uint32_t wg_local_size = 0;
+
+ /* input and expected data */
+ benchmark_data(wg_func, input, expected, wg_global_size, wg_local_size);
+
+ /* prepare input for datatype */
+ OCL_CREATE_BUFFER(buf[0], 0, wg_global_size * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, wg_global_size * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+
+ if(wg_func == WG_BROADCAST_1D ||
+ wg_func == WG_BROADCAST_2D)
+ {
+ cl_uint wg_local_x = WG_LOCAL_X;
+ cl_uint wg_local_y = WG_LOCAL_Y;
+ OCL_SET_ARG(3, sizeof(cl_uint), &wg_local_x);
+ OCL_SET_ARG(4, sizeof(cl_uint), &wg_local_y);
+ }
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, wg_global_size * sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ gettimeofday(&start,0);
+
+ if(wg_func == WG_BROADCAST_1D)
+ {
+ globals[0] = WG_GLOBAL_SIZE_X;
+ locals[0] = WG_LOCAL_SIZE_X;
+ OCL_NDRANGE(1);
+ }
+ else if(wg_func == WG_BROADCAST_2D)
+ {
+ globals[0] = WG_GLOBAL_SIZE_X;
+ locals[0] = WG_LOCAL_SIZE_X;
+ globals[1] = WG_GLOBAL_SIZE_Y;
+ locals[1] = WG_LOCAL_SIZE_Y;
+ OCL_NDRANGE(2);
+ }
+ else
+ { /* reduce, scan inclulsive, scan exclusive */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ OCL_NDRANGE(1);
+ }
+
+ clFinish(queue);
+ gettimeofday(&stop,0);
+ elapsed = time_subtract(&stop, &start, 0);
+
+ /* check if mistmatch, display execution time */
+ OCL_MAP_BUFFER(1);
+ uint32_t mistmatches = 0;
+ for (uint32_t i = 0; i < wg_global_size; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i)){
+ /* uncomment bellow for DEBUG */
+ /* cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl; */
+ mistmatches++;
+ }
+ cout << endl << endl << "Mistmatches " << mistmatches << endl;
+ cout << "Exec time " << elapsed << endl << endl;
+ OCL_UNMAP_BUFFER(1);
+
+ return BANDWIDTH(sizeof(T) * wg_global_size * reduce_loop, elapsed);
+}
+
+/*
+ * Benchmark workgroup broadcast
+ */
+double benchmark_workgroup_broadcast_1D_int(void)
+{
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_broadcast_1D_int");
+ return benchmark_generic(WG_BROADCAST_1D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_1D_int, "GB/S");
+double benchmark_workgroup_broadcast_1D_long(void)
+{
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_broadcast_1D_long");
+ return benchmark_generic(WG_BROADCAST_1D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_1D_long, "GB/S");
+double benchmark_workgroup_broadcast_2D_int(void)
+{
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_broadcast_2D_int");
+ return benchmark_generic(WG_BROADCAST_2D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_2D_int, "GB/S");
+double benchmark_workgroup_broadcast_2D_long(void)
+{
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_broadcast_2D_long");
+ return benchmark_generic(WG_BROADCAST_2D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_2D_long, "GB/S");
+
+/*
+ * Benchmark workgroup reduce add
+ */
+double benchmark_workgroup_reduce_add_int(void)
+{
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_reduce_add_int");
+ return benchmark_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_add_int, "GB/S");
+double benchmark_workgroup_reduce_add_long(void)
+{
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_reduce_add_long");
+ return benchmark_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_add_long, "GB/S");
+
+/*
+ * Benchmark workgroup reduce min
+ */
+double benchmark_workgroup_reduce_min_int(void)
+{
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_reduce_min_int");
+ return benchmark_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_min_int, "GB/S");
+double benchmark_workgroup_reduce_min_long(void)
+{
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_reduce_min_long");
+ return benchmark_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_min_long, "GB/S");
+
+/*
+ * Benchmark workgroup scan inclusive add
+ */
+double benchmark_workgroup_scan_inclusive_add_int(void)
+{
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_scan_inclusive_add_int");
+ return benchmark_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_add_int, "GB/S");
+double benchmark_workgroup_scan_inclusive_add_long(void)
+{
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_scan_inclusive_add_long");
+ return benchmark_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_add_long, "GB/S");
+
+/*
+ * Benchmark workgroup scan inclusive min
+ */
+double benchmark_workgroup_scan_inclusive_min_int(void)
+{
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_scan_inclusive_min_int");
+ return benchmark_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_min_int, "GB/S");
+double benchmark_workgroup_scan_inclusive_min_long(void)
+{
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_scan_inclusive_min_long");
+ return benchmark_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_min_long, "GB/S");
+
+
+
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index 9a2b516..407886a 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -32,26 +32,29 @@ If you don't want to enable ICD, or your system doesn't have ICD OpenCL support,
you can still link to the beignet OpenCL library. You can find the beignet/libcl.so
in your system's library installation directories.
-Note that the compiler depends on LLVM (Low-Level Virtual Machine project).
-Right now, the code has been compiled with LLVM 3.3/3.4. It will not compile
-with anything older.
+Note that the compiler depends on LLVM (Low-Level Virtual Machine project); the
+project normally supports the 3 latest released LLVM versions.
+Right now, the code has been compiled with LLVM 3.6, 3.7 and 3.8. Older LLVM
+versions back to 3.3 can still be built against, but are not fully covered by tests.
A simple command to install all the above dependencies for ubuntu or debian is:
-`sudo apt-get install cmake pkg-config python ocl-icd-dev`
-` ocl-icd-opencl-dev libdrm-dev libxfixes-dev libxext-dev llvm-3.5-dev`
-` clang-3.5 libclang-3.5-dev libtinfo-dev libedit-dev zlib1g-dev`
+`sudo apt-get install cmake pkg-config python ocl-icd-dev libegl1-mesa-dev`
+` ocl-icd-opencl-dev libdrm-dev libxfixes-dev libxext-dev llvm-3.6-dev`
+` clang-3.6 libclang-3.6-dev libtinfo-dev libedit-dev zlib1g-dev`
[http://llvm.org/releases/](http://llvm.org/releases/)
-**The recommended LLVM/CLANG version is 3.5 and/or 3.6**
+**The recommended LLVM/CLANG version is 3.6 and/or 3.7**
-Based on our test result, LLVM 3.5 has best pass rate on all the test suites. Compare
-to LLVM 3.5, LLVM 3.6 has slightly lower pass rate(caused by one front end bug at clang
-3.6) but has better performance(3% to 8% up).
+Based on our test results, LLVM 3.6 and 3.7 have the best pass rate on all the test suites. Compared
+to LLVM 3.6 and 3.7, if you use LLVM 3.8, you should pay attention to float immediates. For example,
+if you use 1.0 in a kernel, LLVM 3.6 will treat it as 1.0f, a single-precision float, because the project
+doesn't support double precision; but LLVM 3.8 will treat it as 1.0, a double, which may ultimately cause
+an error. So we recommend using 1.0f instead of 1.0 if you don't need double precision.
-For LLVM 3.3 and 3.4, Beignet still support them, but it may be limited to support the
+For LLVM 3.4 and 3.5, Beignet still supports them, but support may be limited to the
build and major functions.
How to build and install
@@ -86,8 +89,11 @@ The cmake will build the backend firstly. Please refer to:
[[OpenCL Gen Backend|Beignet/Backend]] to get more dependencies.
Once built, the run-time produces a shared object libcl.so which basically
-directly implements the OpenCL API. A set of tests are also produced. They may
-be found in `utests/`.
+directly implements the OpenCL API.
+
+`> make`
+
+A set of tests are also produced. They may be found in `utests/`.
Simply invoke:
@@ -104,6 +110,10 @@ your library installation directory.
It installs the OCL icd vendor files to /etc/OpenCL/vendors, if the system support ICD.
- intel-beignet.icd
+`> make package`
+
+It packages the driver binaries; you may copy and install the package to another machine with a similar system.
+
How to run
----------
@@ -123,9 +133,9 @@ Then in `utests/`:
will run all the unit tests one after the others
-`> ./utest_run some_unit_test0 some_unit_test1`
+`> ./utest_run some_unit_test`
-will only run `some_unit_test0` and `some_unit_test1` tests
+will only run `some_unit_test` test.
On all supported target platform, the pass rate should be 100%. If it is not, you may
need to refer the "Known Issues" section. Please be noted, the `. setenv.sh` is only
@@ -140,10 +150,13 @@ beignet provides two alternative to run:
Supported Targets
-----------------
- * 3rd Generation Intel Core Processors
- * Intel “Bay Trail” platforms with Intel HD Graphics
- * 4th Generation Intel Core Processors "Haswell", need kernel patch currently, see the "Known Issues" section.
+ * 3rd Generation Intel Core Processors "Ivybridge".
+ * 3rd Generation Intel Atom Processors "BayTrail".
+ * 4th Generation Intel Core Processors "Haswell", need kernel patch if your linux kernel older than 4.2, see the "Known Issues" section.
* 5th Generation Intel Core Processors "Broadwell".
+ * 5th Generation Intel Atom Processors "Braswell".
+ * 6th Generation Intel Core Processors "Skylake" and "Kabylake".
+ * 5th Generation Intel Atom Processors "Broxton" or "Apollo Lake".
Known Issues
------------
@@ -173,17 +186,12 @@ Known Issues
`# echo 0 > /sys/module/i915/parameters/enable_cmd_parser`
- On Haswell hardware, Beignet 1.0.1 to 1.0.3 also required the
- above workaround on later Linux versions, but this _should not_ be
- required in current (after [83f8739](http://cgit.freedesktop.org/beignet/commit/?id=83f8739b6fc4893fac60145326052ccb5cf653dc))
- git master.
-
* "Beignet: self-test failed" and 15-30 unit tests fail on 4th Generation (Haswell) hardware.
On Haswell, shared local memory (\_\_local) does not work at all on
Linux <= 4.0, and requires the i915.enable_ppgtt=2 [boot parameter](https://wiki.ubuntu.com/Kernel/KernelBootParameters)
on Linux 4.1.
- This will be fixed in Linux 4.2; older versions can be fixed with
+ This is fixed in Linux 4.2; older versions can be fixed with
[this patch](https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support).
If you do not need \_\_local, you can override the self-test with
@@ -207,49 +215,6 @@ Known Issues
extension. This feature used to work with a previous mesa git version. But now, it's
simply broken.
-* Illegal pointer issue.
- If you met the following error message:
-
- `Illegal pointer which is not from a valid memory space.`
- `Aborting...`
-
- That means the computing kernel is running into an unsupported feature which is to
- store/load pointers to/from memory. As we know, this feature has been used in Luxmark
- 3.0 and maybe the latest BOINC. Beignet could support them currently. We plan to fix
- it in next major release 1.1.0.
-
-TODO
-----
-
-In terms of the OpenCL 1.2 spec, beignet is quite complete now. We can pass almost
-all the piglit OpenCL test cases now. And the pass rate for the OpenCV test suite
-is also good which is about 99%. There are still some remains work items listed as below,
-most of them are extension support and performance related.
-
-- Performance tuning. There are some major optimizations need to be done,
- Peephole optimization, futher tuning the structurized BB transformation to
- support more pattern such as self loop/while loop. And optimize the slow
- software based sin/cos/... math functions due to the native math instruction
- lack of necessary precision. And all the code is inlined which will increase
- the icache miss rate significantly. And many other things which are specified
- partially in [[here|Beignet/Backend/TODO]].
-
-- Complete cl\_khr\_gl\_sharing support. We lack of some APIs implementation such
- as clCreateFromGLBuffer,clCreateFromGLRenderbuffer,clGetGLObjectInfo... Currently,
- the working APIs are clCreateFromGLTexture,clCreateFromGLTexture2D. We may need to
- find a graceful way to co-work with mesa.
-
-- Check that NDRangeKernels can be pushed into _different_ queues from several
- threads.
-
-- No state tracking at all. One batch buffer is created at each "draw call"
- (i.e. for each NDRangeKernels). This is really inefficient since some
- expensive pipe controls are issued for each batch buffer.
-
-More generally, everything in the run-time that triggers the "FATAL" macro means
-that something that must be supported is not implemented properly (either it
-does not comply with the standard or it is just missing)
-
Project repository
------------------
Right now, we host our project on fdo at:
@@ -275,7 +240,7 @@ Developers from Intel:
* Luo, Xionghu
* Wen, Chuanbo
* Guo, Yejun
-* Lv, Meng
+* Pan, Xiuli
Debian Maintainer:
@@ -306,6 +271,8 @@ Documents for OpenCL application developers
- [[Kernel Optimization Guide|Beignet/optimization-guide]]
- [[Libva Buffer Sharing|Beignet/howto/libva-buffer-sharing-howto]]
- [[V4l2 Buffer Sharing|Beignet/howto/v4l2-buffer-sharing-howto]]
+- [[Video Motion Estimation|Beignet/howto/video-motion-estimation-howto]]
+- [[Stand Alone Unit Test|Beignet/howto/stand-alone-utest-howto]]
The wiki URL is as below:
[http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index eddbe95..d1e94ef 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,5 +1,8 @@
# News
+## Aug 30, 2016
+[Beignet 1.2.0](https://01.org/beignet/downloads/beignet-1.2.0-2016-08-30) is released. This is a major release. Please see the release notes for more information.
+
## Apr 19, 2016
[Beignet 1.1.2](https://01.org/beignet/downloads/beignet-1.1.2-2016-04-19) is released. This is a bug-fix release.
diff --git a/docs/howto/stand-alone-utest-howto.mdwn b/docs/howto/stand-alone-utest-howto.mdwn
new file mode 100644
index 0000000..bca23d3
--- /dev/null
+++ b/docs/howto/stand-alone-utest-howto.mdwn
@@ -0,0 +1,45 @@
+Stand Alone Unit Test HowTo
+====================
+
+Beignet provides an independent unit test suite covering most OpenCL language features,
+including more than 800 cases which can run in a few minutes; it should be useful for
+testing and comparing different OpenCL implementations.
+
+Prerequisite
+------------
+
+OpenCL ICD. Please check that an OpenCL ICD is present with the command
+`pkg-config --libs OpenCL`.
+
+Build Stand Alone Unit Test
+-----------------------------------
+The project uses CMake with three profiles:
+
+1. Debug (-g)
+2. RelWithDebInfo (-g with optimizations)
+3. Release (only optimizations)
+
+Basically, from the root directory of the project
+
+`> cd utest`
+
+`> cmake . # to configure`
+
+`> make`
+
+Once built, the 'utest_run' binary is generated in the current directory.
+
+How to run
+----------
+
+You need to call setenv.sh in the utests/ directory to set some environment variables
+firstly as below:
+
+`> . setenv.sh`
+
+Then in `utests/`:
+
+`> ./utest_run`
+
+If utest_run fails to run, please check /etc/OpenCL/vendors to confirm it calls the expected
+OpenCL driver, or export LD_LIBRARY_PATH to establish the correct link.
diff --git a/docs/howto/video-motion-estimation-howto.mdwn b/docs/howto/video-motion-estimation-howto.mdwn
new file mode 100644
index 0000000..8deaa61
--- /dev/null
+++ b/docs/howto/video-motion-estimation-howto.mdwn
@@ -0,0 +1,71 @@
+Video Motion Vector HowTo
+==========================
+
+Beignet now supports cl_intel_accelerator and cl_intel_motion_estimation, which are
+Khronos official extensions. It provides a hardware acceleration of video motion
+vector to users.
+
+Supported hardware platform
+---------------------------
+
+Only 3rd Generation Intel Core Processors are supported for vme now. We will consider
+supporting more platforms if necessary.
+
+Steps
+-----
+
+In order to use video motion estimation provided by Beignet in your program, please follow
+the steps as below:
+
+- Create a cl_accelerator_intel object using extension API clCreateAcceleratorINTEL, like
+ this:
+ cl_accelerator_type_intel accelerator_type = CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL;
+ cl_motion_estimation_desc_intel vmedesc = {CL_ME_MB_TYPE_16x16_INTEL,
+ CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,
+ CL_ME_SAD_ADJUST_MODE_NONE_INTEL,
+ CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
+ };
+
+- Invoke clCreateProgramWithBuiltInKernels to create a program object with built-in kernels
+ information, and invoke clCreateKernel to create a kernel object whose kernel name is
+ block_motion_estimate_intel.
+
+- The prototype of built-in kernel block_motion_estimate_intel is as following:
+ __kernel void
+ block_motion_estimate_intel
+ (
+ accelerator_intel_t accelerator,
+ __read_only image2d_t src_image,
+ __read_only image2d_t ref_image,
+ __global short2 * prediction_motion_vector_buffer,
+ __global short2 * motion_vector_buffer,
+ __global ushort * residuals
+ );
+ So you should create related objects and setup these kernel arguments by clSetKernelArg.
+ Create source and reference image object, on which you want to do video motion estimation.
+ The image_channel_order should be CL_R and image_channel_data_type should be CL_UNORM_INT8.
+ Create a buffer object to get the motion vector result. This motion vector buffer representing
+ a vector field of pixel block motion vectors, stored linearly in row-major order. The elements
+ (pixels) of this image contain a motion vector for the corresponding pixel block, with its x/y
+ components packed as two 16-bit integer values. Each component is encoded as a S13.2 fixed
+ point value(two's complement).
+
+- Use clEnqueueNDRangeKernel to enqueue this kernel. The only thing you need to setup is global_work_size:
+ global_work_size[0] equal to width of source image, global_work_size[1] equal to height of source
+ image.
+
+- Use clEnqueueReadBuffer or clEnqueueMapBuffer to get motion vector result.
+
+
+Sample code
+-----------
+
+We have developed a utest case of using video motion vectors in utests/builtin_kernel_block_motion_estimate_intel.cpp.
+Please go through it for details.
+
+More references
+---------------
+
+https://www.khronos.org/registry/cl/extensions/intel/cl_intel_accelerator.txt
+https://www.khronos.org/registry/cl/extensions/intel/cl_intel_motion_estimation.txt
+https://software.intel.com/en-us/articles/intro-to-motion-estimation-extension-for-opencl
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 710bea8..0a66d70 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -184,6 +184,109 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
#define CL_PRINTF_CALLBACK_ARM 0x40B0
#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
+/*********************************
+* cl_intel_accelerator extension *
+*********************************/
+#define cl_intel_accelerator 1
+#define cl_intel_motion_estimation 1
+
+typedef struct _cl_accelerator_intel* cl_accelerator_intel;
+typedef cl_uint cl_accelerator_type_intel;
+typedef cl_uint cl_accelerator_info_intel;
+
+typedef struct _cl_motion_estimation_desc_intel {
+ cl_uint mb_block_type;
+ cl_uint subpixel_mode;
+ cl_uint sad_adjust_mode;
+ cl_uint search_path_type;
+} cl_motion_estimation_desc_intel;
+
+/* Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL -1094
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
+
+/* Deprecated Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL_DEPRECATED -6000
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL_DEPRECATED -6001
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL_DEPRECATED -6002
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL_DEPRECATED -6003
+
+/* cl_accelerator_type_intel */
+#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
+
+/* cl_accelerator_info_intel */
+#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
+#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
+#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
+#define CL_ACCELERATOR_TYPE_INTEL 0x4093
+
+/*cl_motion_detect_desc_intel flags */
+#define CL_ME_MB_TYPE_16x16_INTEL 0x0
+#define CL_ME_MB_TYPE_8x8_INTEL 0x1
+#define CL_ME_MB_TYPE_4x4_INTEL 0x2
+
+#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
+#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
+#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
+
+#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
+#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
+
+#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
+#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
+#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
+
+extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
+clCreateAcceleratorINTEL(
+ cl_context /* context */,
+ cl_accelerator_type_intel /* accelerator_type */,
+ size_t /* descriptor_size */,
+ const void* /* descriptor */,
+ cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_accelerator_intel
+ (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
+ cl_context /* context */,
+ cl_accelerator_type_intel /* accelerator_type */,
+ size_t /* descriptor_size */,
+ const void* /* descriptor */,
+ cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetAcceleratorInfoINTEL
+(
+ cl_accelerator_intel /* accelerator */,
+ cl_accelerator_info_intel /* param_name */,
+ size_t /* param_value_size */,
+ void* /* param_value */,
+ size_t* /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
+ cl_accelerator_intel /* accelerator */,
+ cl_accelerator_info_intel /* param_name */,
+ size_t /* param_value_size */,
+ void* /* param_value */,
+ size_t* /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainAcceleratorINTEL(
+ cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
+ cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseAcceleratorINTEL(
+ cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
+ cl_accelerator_intel /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
#ifdef CL_VERSION_1_1
/***********************************
* cl_ext_device_fission extension *
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index 28bcb62..47bae46 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -133,6 +133,70 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetMemObjectFdIntel_fn)(
cl_mem /* Memory Obejct */,
int* /* returned fd */);
+typedef struct _cl_import_buffer_info_intel {
+ int fd;
+ int size;
+} cl_import_buffer_info_intel;
+
+typedef struct _cl_import_image_info_intel {
+ int fd;
+ int size;
+ cl_mem_object_type type;
+ cl_image_format fmt;
+ uint32_t offset;
+ uint32_t width;
+ uint32_t height;
+ uint32_t row_pitch;
+} cl_import_image_info_intel;
+
+/* Create memory object from external buffer object by fd */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferFromFdINTEL(cl_context /* context */,
+ const cl_import_buffer_info_intel * /* info */,
+ cl_int * /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateBufferFromFdINTEL_fn)(
+ cl_context /* context */,
+ const cl_import_buffer_info_intel * /* info */,
+ cl_int * /* errcode_ret */);
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImageFromFdINTEL(cl_context /* context */,
+ const cl_import_image_info_intel * /* info */,
+ cl_int * /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateImageFromFdINTEL_fn)(
+ cl_context /* context */,
+ const cl_import_image_info_intel * /* info */,
+ cl_int * /* errcode_ret */);
+
+#ifndef CL_VERSION_2_0
+typedef cl_uint cl_kernel_sub_group_info;
+
+/* cl_khr_sub_group_info */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
+ cl_device_id /*in_device*/,
+ cl_kernel_sub_group_info /* param_name */,
+ size_t /*input_value_size*/,
+ const void * /*input_value*/,
+ size_t /*param_value_size*/,
+ void* /*param_value*/,
+ size_t* /*param_value_size_ret*/ );
+
+typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
+ cl_device_id /*in_device*/,
+ cl_kernel_sub_group_info /* param_name */,
+ size_t /*input_value_size*/,
+ const void * /*input_value*/,
+ size_t /*param_value_size*/,
+ void* /*param_value*/,
+ size_t* /*param_value_size_ret*/ );
+#endif
#ifdef __cplusplus
}
#endif
diff --git a/kernels/bench_copy_buffer.cl b/kernels/bench_copy_buffer.cl
new file mode 100644
index 0000000..8d8afd8
--- /dev/null
+++ b/kernels/bench_copy_buffer.cl
@@ -0,0 +1,90 @@
+const constant float filter_flag = 0.111111f;
+__kernel void
+bench_copy_buffer_uchar(__global uchar4* src, __global uchar4* dst)
+{
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+ int x_sz = (int)get_global_size(0);
+ dst[y * x_sz + x] = src[y * x_sz + x];
+}
+
+__kernel void
+bench_copy_buffer_ushort(__global ushort4* src, __global ushort4* dst)
+{
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+ int x_sz = (int)get_global_size(0);
+ dst[y * x_sz + x] = src[y * x_sz + x];
+}
+
+__kernel void
+bench_copy_buffer_uint(__global uint4* src, __global uint4* dst)
+{
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+ int x_sz = (int)get_global_size(0);
+ dst[y * x_sz + x] = src[y * x_sz + x];
+}
+
+__kernel void
+bench_filter_buffer_uchar(__global uchar4* src, __global uchar4* dst)
+{
+ float4 result;
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+ int x_sz = (int)get_global_size(0);
+ int y_sz = (int)get_global_size(1);
+
+ int x0 = x - 1; int x1 = x + 1;
+ int y0 = y - 1; int y1 = y + 1 ;
+ int x_left = (x0 > 0)?x0:x; int x_right = (x1 > x_sz - 1)?x:x1;
+ int y_top = (y0 > 0)?y0:y; int y_bottom = (y1 > y_sz - 1)?y:y1;
+
+ result = convert_float4(src[y_top * x_sz + x_left]) + convert_float4(src[y_top * x_sz + x]) + convert_float4(src[y_top * x_sz + x_right])
+ + convert_float4(src[y * x_sz + x_left]) + convert_float4(src[y * x_sz + x]) + convert_float4(src[y * x_sz + x_right])
+ + convert_float4(src[y_bottom * x_sz + x_left]) + convert_float4(src[y_bottom * x_sz + x]) + convert_float4(src[y_bottom * x_sz +x_right]);
+
+ dst[y * x_sz + x] = convert_uchar4(result * filter_flag);
+}
+
+__kernel void
+bench_filter_buffer_ushort(__global ushort4* src, __global ushort4* dst)
+{
+ float4 result;
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+ int x_sz = (int)get_global_size(0);
+ int y_sz = (int)get_global_size(1);
+
+ int x0 = x - 1; int x1 = x + 1;
+ int y0 = y - 1; int y1 = y + 1 ;
+ int x_left = (x0 > 0)?x0:x; int x_right = (x1 > x_sz - 1)?x:x1;
+ int y_top = (y0 > 0)?y0:y; int y_bottom = (y1 > y_sz - 1)?y:y1;
+
+ result = convert_float4(src[y_top * x_sz + x_left]) + convert_float4(src[y_top * x_sz + x]) + convert_float4(src[y_top * x_sz + x_right])
+ + convert_float4(src[y * x_sz + x_left]) + convert_float4(src[y * x_sz + x]) + convert_float4(src[y * x_sz + x_right])
+ + convert_float4(src[y_bottom * x_sz + x_left]) + convert_float4(src[y_bottom * x_sz + x]) + convert_float4(src[y_bottom * x_sz +x_right]);
+
+ dst[y * x_sz + x] = convert_ushort4(result * filter_flag);
+}
+
+__kernel void
+bench_filter_buffer_uint(__global uint4* src, __global uint4* dst)
+{
+ float4 result;
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+ int x_sz = (int)get_global_size(0);
+ int y_sz = (int)get_global_size(1);
+
+ int x0 = x - 1; int x1 = x + 1;
+ int y0 = y - 1; int y1 = y + 1 ;
+ int x_left = (x0 > 0)?x0:x; int x_right = (x1 > x_sz - 1)?x:x1;
+ int y_top = (y0 > 0)?y0:y; int y_bottom = (y1 > y_sz - 1)?y:y1;
+
+ result = convert_float4(src[y_top * x_sz + x_left]) + convert_float4(src[y_top * x_sz + x]) + convert_float4(src[y_top * x_sz + x_right])
+ + convert_float4(src[y * x_sz + x_left]) + convert_float4(src[y * x_sz + x]) + convert_float4(src[y * x_sz + x_right])
+ + convert_float4(src[y_bottom * x_sz + x_left]) + convert_float4(src[y_bottom * x_sz + x]) + convert_float4(src[y_bottom * x_sz +x_right]);
+
+ dst[y * x_sz + x] = convert_uint4(result * filter_flag);
+}
diff --git a/kernels/bench_copy_image.cl b/kernels/bench_copy_image.cl
new file mode 100644
index 0000000..e6548f3
--- /dev/null
+++ b/kernels/bench_copy_image.cl
@@ -0,0 +1,52 @@
+const constant float filter_flag = 0.111111f;
+__kernel void
+bench_copy_image(__read_only image2d_t src, __write_only image2d_t dst)
+{
+ uint4 color = 0;
+ int2 coord;
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE| CLK_ADDRESS_CLAMP| CLK_FILTER_NEAREST;
+
+ coord.x = x;
+ coord.y = y;
+ color=read_imageui(src, sampler, coord);
+ write_imageui(dst, coord, color);
+}
+
+__kernel void
+bench_filter_image(__read_only image2d_t src, __write_only image2d_t dst)
+{
+ float4 color = 0;
+ int2 coord_00, coord_01, coord_02, coord_10, coord_11, coord_12, coord_20, coord_21, coord_22;
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+ int x_sz = (int)get_global_size(0);
+ int y_sz = (int)get_global_size(1);
+
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE| CLK_ADDRESS_CLAMP| CLK_FILTER_NEAREST;
+
+ int x0 = x - 1; int x1 = x + 1;
+ int y0 = y - 1; int y1 = y + 1 ;
+ int x_left = (x > 0)?x0:x; int x_right = (x > x_sz - 2)?x:x1;
+ int y_top = (y > 0)?y0:y; int y_bottom = (y > y_sz - 2)?y:y1;
+
+ coord_00.x = x_left; coord_00.y = y_top;
+ coord_01.x = x; coord_01.y = y_top;
+ coord_02.x = x_right; coord_02.y = y_top;
+
+ coord_10.x = x_left; coord_10.y = y;
+ coord_11.x = x; coord_11.y = y;
+ coord_12.x = x_right; coord_12.y = y;
+
+ coord_20.x = x_left; coord_20.y = y_bottom;
+ coord_21.x = x; coord_21.y = y_bottom;
+ coord_22.x = x_right; coord_22.y = y_bottom;
+
+ color = convert_float4(read_imageui(src, sampler, coord_00)) + convert_float4(read_imageui(src, sampler, coord_01)) + convert_float4(read_imageui(src, sampler, coord_02))
+ + convert_float4(read_imageui(src, sampler, coord_10)) + convert_float4(read_imageui(src, sampler, coord_11)) + convert_float4(read_imageui(src, sampler, coord_12))
+ + convert_float4(read_imageui(src, sampler, coord_20)) + convert_float4(read_imageui(src, sampler, coord_21)) + convert_float4(read_imageui(src, sampler, coord_22));
+
+ write_imageui(dst, coord_11, convert_uint4(color * filter_flag));
+}
diff --git a/kernels/bench_math.cl b/kernels/bench_math.cl
new file mode 100644
index 0000000..8d85d51
--- /dev/null
+++ b/kernels/bench_math.cl
@@ -0,0 +1,272 @@
+//#define BENCHMARK_NATIVE 1
+//#define BENCHMARK_INTERNAL_FAST 2
+
+/* benchmark pow performance */
+kernel void bench_math_pow(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ {
+#if defined(BENCHMARK_NATIVE)
+ result = native_powr(result, pwr); /* calls native */
+#else
+ result = pow(result, pwr); /* calls internal slow */
+#endif
+ }
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark exp2 performance, exp2 is native */
+kernel void bench_math_exp2(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ result = exp2(result) * 0.1f;
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark exp performance */
+/* calls internal fast (native) if (x > -0x1.6p1 && x < 0x1.6p1) */
+kernel void bench_math_exp(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ {
+#if defined(BENCHMARK_NATIVE)
+ result = native_exp((float)-0x1.6p1 - result * 0.1f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+ result = exp((float)-0x1.6p1 + result * 0.1f); /* calls internal fast */
+#else
+ result = exp((float)-0x1.6p1 - result * 0.1f); /* calls internal slow */
+#endif
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark exp10 performance */
+/* calls internal fast (native) if (x < -0x1.4p+5) || (x > +0x1.4p+5) */
+kernel void bench_math_exp10(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ {
+#if defined(BENCHMARK_NATIVE)
+ result = native_exp10((float)0x1.4p+5 + result * 0.1f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+ result = exp10((float)-0x1.4p+5 - result * 0.1f); /* calls internal fast */
+#else
+ result = exp10((float)-0x1.2p+5 - result * 0.1f); /* calls internal slow */
+#endif
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark log2 performance */
+/* calls internal fast (native) if (x > 0x1.1p0) */
+kernel void bench_math_log2(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ {
+#if defined(BENCHMARK_NATIVE)
+ result = native_log2((float)0x1.1p0 + result * 0.0001f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+ result = log2((float)0x1.1p0 + result * 0.0001f); /* calls internal fast */
+#else
+ result = log2((float)0x1.1p0 - result * 0.0001f); /* calls internal slow */
+#endif
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark log performance */
+/* calls internal fast (native) if (x > 0x1.1p0) */
+kernel void bench_math_log(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ {
+#if defined(BENCHMARK_NATIVE)
+ result = native_log((float)0x1.1p0 + result * 0.0001f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+ result = log((float)0x1.1p0 + result * 0.0001f); /* calls internal fast */
+#else
+ result = log((float)0x1.1p0 - result * 0.0001f); /* calls internal slow */
+#endif
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark log10 performance */
+/* calls internal fast (native) if (x > 0x1.1p0) */
+kernel void bench_math_log10(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ {
+#if defined(BENCHMARK_NATIVE)
+ result = native_log10((float)0x1.1p0 + result * 0.0001f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+ result = log10((float)0x1.1p0 + result * 0.0001f); /* calls internal fast */
+#else
+ result = log10((float)0x1.1p0 - result * 0.0001f); /* calls internal slow */
+#endif
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark sqrt performance */
+kernel void bench_math_sqrt(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ result = sqrt(result) + sqrt(pwr + result);
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark sin performance */
+kernel void bench_math_sin(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ {
+#if defined(BENCHMARK_NATIVE)
+ result = native_sin(result); /* calls native */
+#else
+ result = sin(result); /* calls internal, random complexity */
+ //result = sin(0.1f + result); /* calls internal, (1) no reduction */
+ //result = sin(2.f + result); /* calls internal, (2) fast reduction */
+ //result = sin(4001 + result); /* calls internal, (3) slow reduction */
+ result *= 0x1p-16;
+#endif
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark cos performance */
+kernel void bench_math_cos(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ {
+#if defined(BENCHMARK_NATIVE)
+ result = native_cos(result); /* calls native */
+#else
+ result = cos(result); /* calls internal, random complexity */
+ //result = cos(0.1f + result); /* calls internal, (1) no reduction */
+ //result = cos(2.f + result); /* calls internal, (2) fast reduction */
+ //result = cos(4001.f + result); /* calls internal, (3) slow reduction */
+ result *= 0x1p-16;
+#endif
+ }
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark native tan performance */
+kernel void bench_math_tan(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ {
+#if defined(BENCHMARK_NATIVE)
+ result = native_tan(result); /* calls native */
+#else
+ result = tan(result); /* calls internal slow */
+#endif
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark asin performance */
+kernel void bench_math_asin(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ result = asin(pwr - 1);
+
+ dst[get_global_id(0)] = result;
+}
+
+/* benchmark acos performance */
+kernel void bench_math_acos(
+ global float *src,
+ global float *dst,
+ float pwr,
+ uint loop)
+{
+ float result = src[get_global_id(0)];
+
+ for(; loop > 0; loop--)
+ result = acos(pwr - 1);
+
+ dst[get_global_id(0)] = result;
+}
diff --git a/kernels/bench_workgroup.cl b/kernels/bench_workgroup.cl
new file mode 100644
index 0000000..87986fc
--- /dev/null
+++ b/kernels/bench_workgroup.cl
@@ -0,0 +1,239 @@
+/*
+ * Benchmark broadcast 1D
+ */
+kernel void bench_workgroup_broadcast_1D_int(global int *src,
+ global int *dst,
+ int reduce_loop,
+ uint wg_local_x,
+ uint wg_local_y)
+{
+ uint offset = 0;
+ uint index = offset + get_global_id(0);
+
+ int val = src[index];
+ /* depending on generated ASM, volatile may be removed */
+ volatile int result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ result = work_group_broadcast(val,
+ wg_local_x);
+ }
+
+ dst[index] = result;
+}
+
+kernel void bench_workgroup_broadcast_1D_long(global long *src,
+ global long *dst,
+ int reduce_loop,
+ uint wg_local_x,
+ uint wg_local_y)
+{
+ uint offset = 0;
+ uint index = offset + get_global_id(0);
+
+ long val = src[index];
+ /* depending on generated ASM, volatile may be removed */
+ volatile long result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ result = work_group_broadcast(val,
+ wg_local_x);
+ }
+
+ dst[index] = result;
+}
+
+
+/*
+ * Benchmark broadcast 2D
+ */
+kernel void bench_workgroup_broadcast_2D_int(global int *src,
+ global int *dst,
+ int reduce_loop,
+ uint wg_local_x,
+ uint wg_local_y)
+{
+ uint lsize = get_local_size(0) * get_local_size(1);
+ uint offset = get_group_id(0) * lsize +
+ get_group_id(1) * get_num_groups(0) * lsize;
+ uint index = offset + get_local_id(0) +
+ get_local_id(1) * get_local_size(0);
+
+ int val = src[index];
+ /* depending on generated ASM, volatile may be removed */
+ int result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ result = work_group_broadcast(val,
+ wg_local_x,
+ wg_local_y);
+ }
+
+ dst[index] = result;
+}
+
+kernel void bench_workgroup_broadcast_2D_long(global long *src,
+ global long *dst,
+ int reduce_loop,
+ uint wg_local_x,
+ uint wg_local_y)
+{
+ uint lsize = get_local_size(0) * get_local_size(1);
+ uint offset = get_group_id(0) * lsize +
+ get_group_id(1) * get_num_groups(0) * lsize;
+ uint index = offset + get_local_id(0) +
+ get_local_id(1) * get_local_size(0);
+
+ long val = src[index];
+ /* depending on generated ASM, volatile may be removed */
+ long result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ result = work_group_broadcast(val,
+ wg_local_x,
+ wg_local_y);
+ }
+
+ dst[index] = result;
+}
+
+/*
+ * Benchmark workgroup reduce add
+ */
+kernel void bench_workgroup_reduce_add_int(
+ global int *src,
+ global int *dst,
+ int reduce_loop)
+{
+ int val;
+ int result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ val = src[get_global_id(0)];
+ result = work_group_reduce_add(val);
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_reduce_add_long(
+ global long *src,
+ global long *dst,
+ int reduce_loop)
+{
+ long val;
+ long result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ val = src[get_global_id(0)];
+ result = work_group_reduce_add(val);
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/*
+ * Benchmark workgroup reduce min
+ */
+kernel void bench_workgroup_reduce_min_int(
+ global int *src,
+ global int *dst,
+ int reduce_loop)
+{
+ int val;
+ int result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ val = src[get_global_id(0)];
+ result = work_group_reduce_min(val);
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_reduce_min_long(
+ global long *src,
+ global long *dst,
+ int reduce_loop)
+{
+ long val;
+ long result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ val = src[get_global_id(0)];
+ result = work_group_reduce_min(val);
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/*
+ * Benchmark workgroup scan inclusive add
+ */
+kernel void bench_workgroup_scan_inclusive_add_int(
+ global int *src,
+ global int *dst,
+ int reduce_loop)
+{
+ int val;
+ int result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ val = src[get_global_id(0)];
+ result = work_group_scan_inclusive_add(val);
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_scan_inclusive_add_long(
+ global long *src,
+ global long *dst,
+ int reduce_loop)
+{
+ long val;
+ long result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ val = src[get_global_id(0)];
+ result = work_group_scan_inclusive_add(val);
+ }
+
+ dst[get_global_id(0)] = result;
+}
+
+/*
+ * Benchmark workgroup scan inclusive min
+ */
+kernel void bench_workgroup_scan_inclusive_min_int(
+    global int *src,
+    global int *dst,
+    int reduce_loop)
+{
+  int val;
+  int result;
+
+  for(; reduce_loop > 0; reduce_loop--){
+    val = src[get_global_id(0)];
+    result = work_group_scan_inclusive_min(val); /* fix copy-paste: kernel benchmarks _min, not _add */
+  }
+
+  dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_scan_inclusive_min_long(
+    global long *src,
+    global long *dst,
+    int reduce_loop)
+{
+  long val;
+  long result;
+
+  for(; reduce_loop > 0; reduce_loop--){
+    val = src[get_global_id(0)];
+    result = work_group_scan_inclusive_min(val); /* fix copy-paste: kernel benchmarks _min, not _add */
+  }
+
+  dst[get_global_id(0)] = result;
+}
+
diff --git a/kernels/builtin_global_linear_id.cl b/kernels/builtin_global_linear_id.cl
new file mode 100644
index 0000000..6810ffd
--- /dev/null
+++ b/kernels/builtin_global_linear_id.cl
@@ -0,0 +1,4 @@
+kernel void builtin_global_linear_id( __global int *ret) {
+ int id = get_global_linear_id();
+ ret[id] = id;
+}
diff --git a/kernels/builtin_local_linear_id.cl b/kernels/builtin_local_linear_id.cl
new file mode 100644
index 0000000..0ddcc73
--- /dev/null
+++ b/kernels/builtin_local_linear_id.cl
@@ -0,0 +1,6 @@
+kernel void builtin_local_linear_id( __global int *ret) {
+ int id = get_local_linear_id() + (get_group_id(0) + \
+ get_group_id(1) * 2 + get_group_id(2) * 6) * \
+ get_local_size(0) * get_local_size(1) * get_local_size(2);
+ ret[id] = id;
+}
diff --git a/kernels/builtin_max_sub_group_size.cl b/kernels/builtin_max_sub_group_size.cl
new file mode 100644
index 0000000..c2f3b5e
--- /dev/null
+++ b/kernels/builtin_max_sub_group_size.cl
@@ -0,0 +1,7 @@
+__kernel void builtin_max_sub_group_size(global int *dst)
+{
+ int lid = get_local_linear_id();
+ int lsz = get_local_size(0) * get_local_size(1) * get_local_size(2);
+ int gid = lid + lsz*(get_num_groups(1) * get_num_groups(0) * get_group_id(2) + get_num_groups(0) * get_group_id(1) + get_group_id(0));
+ dst[gid] = get_max_sub_group_size();
+}
diff --git a/kernels/builtin_num_sub_groups.cl b/kernels/builtin_num_sub_groups.cl
new file mode 100644
index 0000000..08b5673
--- /dev/null
+++ b/kernels/builtin_num_sub_groups.cl
@@ -0,0 +1,7 @@
+__kernel void builtin_num_sub_groups(global int *dst)
+{
+ int lid = get_local_linear_id();
+ int lsz = get_local_size(0) * get_local_size(1) * get_local_size(2);
+ int gid = lid + lsz*(get_num_groups(1) * get_num_groups(0) * get_group_id(2) + get_num_groups(0) * get_group_id(1) + get_group_id(0));
+ dst[gid] = get_num_sub_groups();
+}
diff --git a/kernels/builtin_sub_group_id.cl b/kernels/builtin_sub_group_id.cl
new file mode 100644
index 0000000..accf3ad
--- /dev/null
+++ b/kernels/builtin_sub_group_id.cl
@@ -0,0 +1,7 @@
+__kernel void builtin_sub_group_id(global int *dst)
+{
+ int lid = get_local_linear_id();
+ int lsz = get_local_size(0) * get_local_size(1) * get_local_size(2);
+ int gid = lid + lsz*(get_num_groups(1) * get_num_groups(0) * get_group_id(2) + get_num_groups(0) * get_group_id(1) + get_group_id(0));
+ dst[gid] = get_sub_group_id();
+}
diff --git a/kernels/builtin_sub_group_size.cl b/kernels/builtin_sub_group_size.cl
new file mode 100644
index 0000000..1e034bb
--- /dev/null
+++ b/kernels/builtin_sub_group_size.cl
@@ -0,0 +1,7 @@
+__kernel void builtin_sub_group_size(global int *dst)
+{
+ int lid = get_local_linear_id();
+ int lsz = get_local_size(0) * get_local_size(1) * get_local_size(2);
+ int gid = lid + lsz*(get_num_groups(1) * get_num_groups(0) * get_group_id(2) + get_num_groups(0) * get_group_id(1) + get_group_id(0));
+ dst[gid] = get_sub_group_size();
+}
diff --git a/kernels/cmrt_utest_genx.isa b/kernels/cmrt_utest_genx.isa
new file mode 100644
index 0000000..ab0781e
Binary files /dev/null and b/kernels/cmrt_utest_genx.isa differ
diff --git a/kernels/compiler_bsort.cl b/kernels/compiler_bsort.cl
new file mode 100644
index 0000000..fbec427
--- /dev/null
+++ b/kernels/compiler_bsort.cl
@@ -0,0 +1,47 @@
+#define UP 0
+#define DOWN -1
+
+/* Sort elements in a vector */
+#define SORT_VECTOR(input, dir) \
+ comp = (input < shuffle(input, mask1)) ^ dir; \
+ input = shuffle(input, as_uint4(comp + add1)); \
+ comp = (input < shuffle(input, mask2)) ^ dir; \
+ input = shuffle(input, as_uint4(comp * 2 + add2)); \
+ comp = (input < shuffle(input, mask3)) ^ dir; \
+ input = shuffle(input, as_uint4(comp + add3)); \
+
+/* Sort elements between two vectors */
+#define SWAP_VECTORS(input1, input2, dir) \
+ temp = input1; \
+ comp = ((input1 < input2) ^ dir) * 4 + add4; \
+ input1 = shuffle2(input1, input2, as_uint4(comp)); \
+ input2 = shuffle2(input2, temp, as_uint4(comp)); \
+
+__kernel void compiler_bsort(__global float4 *data) {
+
+ float4 input1, input2, temp;
+ int4 comp;
+
+ uint4 mask1 = (uint4)(1, 0, 3, 2);
+ uint4 mask2 = (uint4)(2, 3, 0, 1);
+ uint4 mask3 = (uint4)(3, 2, 1, 0);
+
+ int4 add1 = (int4)(1, 1, 3, 3);
+ int4 add2 = (int4)(2, 3, 2, 3);
+ int4 add3 = (int4)(1, 2, 2, 3);
+ int4 add4 = (int4)(4, 5, 6, 7);
+
+ input1 = data[0];
+ input2 = data[1];
+
+ SORT_VECTOR(input1, UP)
+ SORT_VECTOR(input2, DOWN)
+
+ SWAP_VECTORS(input1, input2, UP)
+
+ SORT_VECTOR(input1, UP)
+ SORT_VECTOR(input2, UP)
+
+ data[0] = input1;
+ data[1] = input2;
+}
diff --git a/kernels/compiler_bswap.cl b/kernels/compiler_bswap.cl
index 3a0a373..b1432b2 100644
--- a/kernels/compiler_bswap.cl
+++ b/kernels/compiler_bswap.cl
@@ -1,5 +1,15 @@
+#define SWAP64(A) \
+((((A) & 0xff00000000000000) >> 56) | \
+ (((A) & 0x00ff000000000000) >> 40) | \
+ (((A) & 0x0000ff0000000000) >> 24) | \
+ (((A) & 0x000000ff00000000) >> 8) | \
+ (((A) & 0x00000000ff000000) << 8) | \
+ (((A) & 0x0000000000ff0000) << 24) | \
+ (((A) & 0x000000000000ff00) << 40) | \
+ (((A) & 0x00000000000000ff) << 56) )
+
kernel void compiler_bswap(global uint * src0, global uint * dst0, global ushort * src1, global ushort * dst1,
- int src2, global int * dst2, short src3, global short * dst3) {
+ int src2, global int * dst2, short src3, global short * dst3, global ulong* src4, global ulong* dst4, long src5, global long* dst5) {
if (get_global_id(0) % 2 == 0) {
dst0[get_global_id(0)] = __builtin_bswap32(src0[get_global_id(0)]);
} else {
@@ -13,5 +23,7 @@ kernel void compiler_bswap(global uint * src0, global uint * dst0, global ushort
dst2[get_global_id(0)] = __builtin_bswap32(src2);
dst3[get_global_id(0)] = __builtin_bswap16(src3);
+ dst4[get_global_id(0)] = SWAP64(src4[get_global_id(0)]);
+ dst5[get_global_id(0)] = SWAP64(src5);
}
diff --git a/kernels/compiler_double_2.cl b/kernels/compiler_double_2.cl
deleted file mode 100644
index 20ee614..0000000
--- a/kernels/compiler_double_2.cl
+++ /dev/null
@@ -1,9 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-kernel void compiler_double_2(global float *src, global double *dst) {
- int i = get_global_id(0);
- float d = 1.234567890123456789f;
- if (i < 14)
- dst[i] = d * (d + src[i]);
- else
- dst[i] = 14;
-}
diff --git a/kernels/compiler_double_4.cl b/kernels/compiler_double_4.cl
deleted file mode 100644
index e5e46f9..0000000
--- a/kernels/compiler_double_4.cl
+++ /dev/null
@@ -1,5 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-kernel void compiler_double_4(global double *src1, global double *src2, global double *dst) {
- int i = get_global_id(0);
- dst[i] = src1[i] + src2[i];
-}
diff --git a/kernels/compiler_double_convert.cl b/kernels/compiler_double_convert.cl
new file mode 100644
index 0000000..344f24e
--- /dev/null
+++ b/kernels/compiler_double_convert.cl
@@ -0,0 +1,102 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_convert_int(global double *src, global int *dst0, global uint* dst1) {
+ int i = get_global_id(0);
+
+ if (i%3) {
+ int i32 = src[i];
+ dst0[i] = i32;
+
+ uint u32 = src[i];
+ dst1[i] = u32;
+ }
+}
+
+kernel void compiler_double_convert_float(global double *src, global float *dst) {
+ int i = get_global_id(0);
+
+ float f = src[i];
+ dst[i] = f;
+}
+
+kernel void compiler_double_convert_short(global double *src, global short *dst0, global ushort * dst1) {
+ int i = get_global_id(0);
+
+ if (i%3) {
+ short i16 = src[i];
+ dst0[i] = i16;
+
+ ushort u16 = src[i];
+ dst1[i] = u16;
+ }
+}
+
+kernel void compiler_double_convert_long(global double *src, global long *dst0, global ulong * dst1) {
+ int i = get_global_id(0);
+
+ if (i%3) {
+ long i64 = src[i];
+ dst0[i] = i64;
+
+ ulong u64 = src[i];
+ dst1[i] = u64;
+ }
+}
+
+kernel void compiler_double_convert_char(global double *src, global char *dst0, global uchar * dst1) {
+ int i = get_global_id(0);
+
+ if (i%3) {
+ char i8 = src[i];
+ dst0[i] = i8;
+
+ uchar u8 = src[i];
+ dst1[i] = u8;
+ }
+}
+
+kernel void compiler_long_convert_double(global long *src0, global ulong *src1, global double * dst0, global double *dst1) {
+ int i = get_global_id(0);
+
+ double d = src0[i];
+ dst0[i] = d;
+
+ d = src1[i];
+ dst1[i] = d;
+}
+
+kernel void compiler_int_convert_double(global int *src0, global uint *src1, global double * dst0, global double *dst1) {
+ int i = get_global_id(0);
+
+ double d = src0[i];
+ dst0[i] = d;
+
+ d = src1[i];
+ dst1[i] = d;
+}
+
+kernel void compiler_short_convert_double(global short *src0, global ushort *src1, global double * dst0, global double *dst1) {
+ int i = get_global_id(0);
+
+ double d = src0[i];
+ dst0[i] = d;
+
+ d = src1[i];
+ dst1[i] = d;
+}
+
+kernel void compiler_char_convert_double(global char *src0, global uchar *src1, global double * dst0, global double *dst1) {
+ int i = get_global_id(0);
+
+ double d = src0[i];
+ dst0[i] = d;
+
+ d = src1[i];
+ dst1[i] = d;
+}
+
+kernel void compiler_float_convert_double(global float *src, global double *dst) {
+ int i = get_global_id(0);
+
+ double d = src[i];
+ dst[i] = d;
+}
diff --git a/kernels/compiler_double_div.cl b/kernels/compiler_double_div.cl
new file mode 100644
index 0000000..3022f51
--- /dev/null
+++ b/kernels/compiler_double_div.cl
@@ -0,0 +1,13 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_div(global double *src1, global double *src2, global double *dst) {
+ int i = get_global_id(0);
+ if (i % 3 != 0)
+ dst[i] = src1[i] / src2[i];
+ else
+ dst[i] = 0.0;
+}
+
+kernel void compiler_double_div_uniform(double src1, double src2, double tmp, global double *dst) {
+ tmp = src1 / src2;
+ dst[0] = tmp;
+}
diff --git a/kernels/compiler_get_max_sub_group_size.cl b/kernels/compiler_get_max_sub_group_size.cl
new file mode 100644
index 0000000..8fb263b
--- /dev/null
+++ b/kernels/compiler_get_max_sub_group_size.cl
@@ -0,0 +1,5 @@
+__kernel void compiler_get_max_sub_group_size(global int *dst)
+{
+ int i = get_global_id(0);
+ dst[i] = get_max_sub_group_size();
+}
diff --git a/kernels/compiler_get_sub_group_id.cl b/kernels/compiler_get_sub_group_id.cl
deleted file mode 100644
index 10033ff..0000000
--- a/kernels/compiler_get_sub_group_id.cl
+++ /dev/null
@@ -1,8 +0,0 @@
-__kernel void compiler_get_sub_group_id(global int *dst)
-{
- int i = get_global_id(0);
- if (i == 0)
- dst[0] = get_sub_group_size();
-
- dst[i+1] = get_sub_group_id();
-}
diff --git a/kernels/compiler_get_sub_group_local_id.cl b/kernels/compiler_get_sub_group_local_id.cl
new file mode 100644
index 0000000..0a28285
--- /dev/null
+++ b/kernels/compiler_get_sub_group_local_id.cl
@@ -0,0 +1,8 @@
+__kernel void compiler_get_sub_group_local_id(global int *dst)
+{
+ int i = get_global_id(0);
+ if (i == 0)
+ dst[0] = get_max_sub_group_size();
+
+ dst[i+1] = get_sub_group_local_id();
+}
diff --git a/kernels/compiler_get_sub_group_size.cl b/kernels/compiler_get_sub_group_size.cl
deleted file mode 100644
index 4d5e3eb..0000000
--- a/kernels/compiler_get_sub_group_size.cl
+++ /dev/null
@@ -1,5 +0,0 @@
-__kernel void compiler_get_sub_group_size(global int *dst)
-{
- int i = get_global_id(0);
- dst[i] = get_sub_group_size();
-}
diff --git a/kernels/compiler_half_convert.cl b/kernels/compiler_half_convert.cl
index c28921e..3587e19 100644
--- a/kernels/compiler_half_convert.cl
+++ b/kernels/compiler_half_convert.cl
@@ -1,5 +1,4 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
kernel void compiler_half_to_long_sat(global half *src, global long *dst) {
int i = get_global_id(0);
dst[i] = convert_long_sat(src[i]);
@@ -54,3 +53,13 @@ kernel void compiler_half_to_float(global half4 *src, global float4 *dst) {
int i = get_global_id(0);
dst[i] = convert_float4(src[i]);
}
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_half_to_double(global half *src, global double *dst) {
+ int i = get_global_id(0);
+ dst[i] = src[i];
+}
+kernel void compiler_double_to_half(global double *src, global half *dst) {
+ int i = get_global_id(0);
+ dst[i] = src[i];
+}
diff --git a/kernels/compiler_math_3op.cl b/kernels/compiler_math_3op.cl
index 95b0398..1a43e1b 100644
--- a/kernels/compiler_math_3op.cl
+++ b/kernels/compiler_math_3op.cl
@@ -1,9 +1,25 @@
-kernel void compiler_math_3op(global float *dst, global float *src1, global float *src2, global float *src3) {
+#ifdef HALF
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_math_3op_half(global half *dst, global half *src1, global half *src2, global half *src3) {
+ int i = get_global_id(0);
+ const half x = src1[i], y = src2[i], z = src3[i];
+ switch (i%2) {
+ case 0: dst[i] = mad(x, y, z); break;
+ case 1: dst[i] = fma(x, y, z); break;
+ default: dst[i] = 1.f; break;
+ };
+ dst[0] = mad(src1[0],src2[0],src3[0]);
+}
+#else
+kernel void compiler_math_3op_float(global float *dst, global float *src1, global float *src2, global float *src3) {
+
int i = get_global_id(0);
const float x = src1[i], y = src2[i], z = src3[i];
- switch (i) {
+ switch (i%2) {
case 0: dst[i] = mad(x, y, z); break;
case 1: dst[i] = fma(x, y, z); break;
default: dst[i] = 1.f; break;
};
+ dst[0] = mad(src1[0],src2[0],src3[0]);
}
+#endif
diff --git a/kernels/compiler_mix.cl b/kernels/compiler_mix.cl
new file mode 100644
index 0000000..2164b81
--- /dev/null
+++ b/kernels/compiler_mix.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mix(global float *src1, global float *src2, global float *src3, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = mix(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/compiler_sub_group_all.cl b/kernels/compiler_sub_group_all.cl
deleted file mode 100644
index 30db5bc..0000000
--- a/kernels/compiler_sub_group_all.cl
+++ /dev/null
@@ -1,12 +0,0 @@
-__kernel void compiler_sub_group_all(global int *src, global int *dst)
-{
- int i = get_global_id(0);
- if (i % 2 == 1) {
- if (sub_group_all((src[i] < 12) && (src[i] > 0)))
- dst[i] = 1;
- else
- dst[i] = 2;
- }
- else
- dst[i] = 3;
-}
diff --git a/kernels/compiler_sub_group_any.cl b/kernels/compiler_sub_group_any.cl
deleted file mode 100644
index 15702db..0000000
--- a/kernels/compiler_sub_group_any.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-__kernel void compiler_sub_group_any(global int *src, global int *dst)
-{
- int i = get_global_id(0);
-
- if (i % 2 == 1) {
- if (sub_group_any(src[i] == 5) || sub_group_any(src[i] == 9))
- dst[i] = 1;
- else if (sub_group_any(src[i] == 6))
- dst[i] = 0;
- else
- dst[i] = 2;
- }
- else
- dst[i] = 3;
-}
diff --git a/kernels/compiler_sub_group_shuffle.cl b/kernels/compiler_sub_group_shuffle.cl
index 75adde3..322da74 100644
--- a/kernels/compiler_sub_group_shuffle.cl
+++ b/kernels/compiler_sub_group_shuffle.cl
@@ -2,12 +2,12 @@ __kernel void compiler_sub_group_shuffle(global int *dst, int c)
{
int i = get_global_id(0);
if (i == 0)
- dst[0] = get_sub_group_size();
+ dst[0] = get_max_sub_group_size();
dst++;
int from = i;
- int j = get_sub_group_size() - get_sub_group_id() - 1;
- int o0 = get_sub_group_id();
+ int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+ int o0 = get_sub_group_local_id();
int o1 = intel_sub_group_shuffle(from, c);
int o2 = intel_sub_group_shuffle(from, 5);
int o3 = intel_sub_group_shuffle(from, j);
diff --git a/kernels/compiler_sub_group_shuffle_down.cl b/kernels/compiler_sub_group_shuffle_down.cl
new file mode 100644
index 0000000..769fc3f
--- /dev/null
+++ b/kernels/compiler_sub_group_shuffle_down.cl
@@ -0,0 +1,19 @@
+__kernel void compiler_sub_group_shuffle_down(global int *dst, int c)
+{
+ int i = get_global_id(0);
+ if (i == 0)
+ dst[0] = get_max_sub_group_size();
+ dst++;
+
+ int from = i;
+ int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+ int k = get_sub_group_local_id() + 1;
+ int o0 = intel_sub_group_shuffle_down(123, 456, c);
+ int o1 = intel_sub_group_shuffle_down(123, from, c);
+ int o2 = intel_sub_group_shuffle_down(from, -from, k);
+ int o3 = intel_sub_group_shuffle_down(from, 321, j);
+ dst[i*4] = o0;
+ dst[i*4+1] = o1;
+ dst[i*4+2] = o2;
+ dst[i*4+3] = o3;
+}
diff --git a/kernels/compiler_sub_group_shuffle_up.cl b/kernels/compiler_sub_group_shuffle_up.cl
new file mode 100644
index 0000000..5c5cee1
--- /dev/null
+++ b/kernels/compiler_sub_group_shuffle_up.cl
@@ -0,0 +1,19 @@
+__kernel void compiler_sub_group_shuffle_up(global int *dst, int c)
+{
+ int i = get_global_id(0);
+ if (i == 0)
+ dst[0] = get_max_sub_group_size();
+ dst++;
+
+ int from = i;
+ int j = get_sub_group_local_id() + 1;
+ int k = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+ int o0 = intel_sub_group_shuffle_up(123, 456, c);
+ int o1 = intel_sub_group_shuffle_up(123, from, c);
+ int o2 = intel_sub_group_shuffle_up(from, -from, k);
+ int o3 = intel_sub_group_shuffle_up(from, 321, j);
+ dst[i*4] = o0;
+ dst[i*4+1] = o1;
+ dst[i*4+2] = o2;
+ dst[i*4+3] = o3;
+}
diff --git a/kernels/compiler_sub_group_shuffle_xor.cl b/kernels/compiler_sub_group_shuffle_xor.cl
new file mode 100644
index 0000000..8bc15d3
--- /dev/null
+++ b/kernels/compiler_sub_group_shuffle_xor.cl
@@ -0,0 +1,19 @@
+__kernel void compiler_sub_group_shuffle_xor(global int *dst, int c)
+{
+ int i = get_global_id(0);
+ if (i == 0)
+ dst[0] = get_max_sub_group_size();
+ dst++;
+
+ int from = i;
+ int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+ int k = get_sub_group_local_id() + 1;
+ int o0 = get_sub_group_local_id();
+ int o1 = intel_sub_group_shuffle_xor(from, c);
+ int o2 = intel_sub_group_shuffle_xor(from, j);
+ int o3 = intel_sub_group_shuffle_xor(from, k);
+ dst[i*4] = o0;
+ dst[i*4+1] = o1;
+ dst[i*4+2] = o2;
+ dst[i*4+3] = o3;
+}
diff --git a/kernels/compiler_subgroup_broadcast.cl b/kernels/compiler_subgroup_broadcast.cl
new file mode 100644
index 0000000..4f21cf5
--- /dev/null
+++ b/kernels/compiler_subgroup_broadcast.cl
@@ -0,0 +1,34 @@
+/*
+ * Subgroup broadcast 1D functions
+ */
+
+kernel void compiler_subgroup_broadcast_imm_int(global int *src,
+ global int *dst,
+ uint simd_id)
+{
+ uint index = get_global_id(0);
+
+ int val = src[index];
+ int broadcast_val = sub_group_broadcast(val, 10);
+ dst[index] = broadcast_val;
+}
+kernel void compiler_subgroup_broadcast_int(global int *src,
+ global int *dst,
+ uint simd_id)
+{
+ uint index = get_global_id(0);
+
+ int val = src[index];
+ int broadcast_val = sub_group_broadcast(val, simd_id);
+ dst[index] = broadcast_val;
+}
+kernel void compiler_subgroup_broadcast_long(global long *src,
+ global long *dst,
+ uint simd_id)
+{
+ uint index = get_global_id(0);
+
+ long val = src[index];
+ long broadcast_val = sub_group_broadcast(val, simd_id);
+ dst[index] = broadcast_val;
+}
diff --git a/kernels/compiler_subgroup_buffer_block_read.cl b/kernels/compiler_subgroup_buffer_block_read.cl
new file mode 100644
index 0000000..9edaa2e
--- /dev/null
+++ b/kernels/compiler_subgroup_buffer_block_read.cl
@@ -0,0 +1,31 @@
+__kernel void compiler_subgroup_buffer_block_read1(global uint *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = src + get_sub_group_id() * get_max_sub_group_size();
+ uint tmp = intel_sub_group_block_read(p);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_buffer_block_read2(global uint *src, global uint2 *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*2;
+ uint2 tmp = intel_sub_group_block_read2(p);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_buffer_block_read4(global uint *src, global uint4 *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*4;
+ uint4 tmp = intel_sub_group_block_read4(p);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_buffer_block_read8(global uint *src, global uint8 *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*8;
+ uint8 tmp = intel_sub_group_block_read8(p);
+ dst[id] = tmp;
+}
diff --git a/kernels/compiler_subgroup_buffer_block_write.cl b/kernels/compiler_subgroup_buffer_block_write.cl
new file mode 100644
index 0000000..f735855
--- /dev/null
+++ b/kernels/compiler_subgroup_buffer_block_write.cl
@@ -0,0 +1,27 @@
+__kernel void compiler_subgroup_buffer_block_write1(global uint *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = dst + get_sub_group_id() * get_max_sub_group_size();
+ intel_sub_group_block_write(p,src[id]);
+}
+
+__kernel void compiler_subgroup_buffer_block_write2(global uint2 *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*2;
+ intel_sub_group_block_write2(p,src[id]);
+}
+
+__kernel void compiler_subgroup_buffer_block_write4(global uint4 *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*4;
+ intel_sub_group_block_write4(p,src[id]);
+}
+
+__kernel void compiler_subgroup_buffer_block_write8(global uint8 *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*8;
+ intel_sub_group_block_write8(p,src[id]);
+}
diff --git a/kernels/compiler_subgroup_image_block_read.cl b/kernels/compiler_subgroup_image_block_read.cl
new file mode 100644
index 0000000..d5df6db
--- /dev/null
+++ b/kernels/compiler_subgroup_image_block_read.cl
@@ -0,0 +1,31 @@
+__kernel void compiler_subgroup_image_block_read1(image2d_t src, global uint *dst)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
+ uint tmp = intel_sub_group_block_read(src,coord);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_image_block_read2(image2d_t src, global uint2 *dst)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
+ uint2 tmp = intel_sub_group_block_read2(src,coord);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_image_block_read4(image2d_t src, global uint4 *dst)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
+ uint4 tmp = intel_sub_group_block_read4(src,coord);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_image_block_read8(image2d_t src, global uint8 *dst)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
+ uint8 tmp = intel_sub_group_block_read8(src,coord);
+ dst[id] = tmp;
+}
diff --git a/kernels/compiler_subgroup_image_block_write.cl b/kernels/compiler_subgroup_image_block_write.cl
new file mode 100644
index 0000000..d9b3717
--- /dev/null
+++ b/kernels/compiler_subgroup_image_block_write.cl
@@ -0,0 +1,27 @@
+__kernel void compiler_subgroup_image_block_write1(image2d_t dst, global uint *src)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
+ intel_sub_group_block_write(dst,coord, src[id]);
+}
+
+__kernel void compiler_subgroup_image_block_write2(image2d_t dst, global uint2 *src)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
+ intel_sub_group_block_write2(dst,coord, src[id]);
+}
+
+__kernel void compiler_subgroup_image_block_write4(image2d_t dst, global uint4 *src)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
+ intel_sub_group_block_write4(dst,coord, src[id]);
+}
+
+__kernel void compiler_subgroup_image_block_write8(image2d_t dst, global uint8 *src)
+{
+ int id = get_global_id(0);
+ int2 coord = (int2)(get_simd_size()*get_sub_group_id()*sizeof(uint),0);
+ intel_sub_group_block_write8(dst,coord, src[id]);
+}
diff --git a/kernels/compiler_subgroup_reduce.cl b/kernels/compiler_subgroup_reduce.cl
new file mode 100644
index 0000000..77ffb07
--- /dev/null
+++ b/kernels/compiler_subgroup_reduce.cl
@@ -0,0 +1,136 @@
+/*
+ * Subgroup any all functions
+ */
+kernel void compiler_subgroup_any(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int predicate = sub_group_any(val);
+ dst[get_global_id(0)] = predicate;
+}
+kernel void compiler_subgroup_all(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int predicate = sub_group_all(val);
+ dst[get_global_id(0)] = predicate;
+}
+
+/*
+ * Subgroup reduce add functions
+ */
+kernel void compiler_subgroup_reduce_add_char(global char *src, global char *dst) {
+ char val = src[get_global_id(0)];
+ char sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_uchar(global uchar *src, global uchar *dst) {
+ uchar val = src[get_global_id(0)];
+ uchar sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ ushort sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = sub_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup reduce max functions
+ */
+kernel void compiler_subgroup_reduce_max_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = sub_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_max_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = sub_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_max_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = sub_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_max_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = sub_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_max_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = sub_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup reduce min functions
+ */
+kernel void compiler_subgroup_reduce_min_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = sub_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_min_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = sub_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_min_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = sub_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_min_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = sub_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_min_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = sub_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
diff --git a/kernels/compiler_subgroup_scan_exclusive.cl b/kernels/compiler_subgroup_scan_exclusive.cl
new file mode 100644
index 0000000..afc00d0
--- /dev/null
+++ b/kernels/compiler_subgroup_scan_exclusive.cl
@@ -0,0 +1,98 @@
+/*
+ * Subgroup scan exclusive add functions
+ */
+kernel void compiler_subgroup_scan_exclusive_add_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = sub_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = sub_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = sub_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = sub_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = sub_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup scan exclusive max functions
+ */
+kernel void compiler_subgroup_scan_exclusive_max_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = sub_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = sub_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = sub_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = sub_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = sub_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup scan exclusive min functions
+ */
+kernel void compiler_subgroup_scan_exclusive_min_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = sub_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = sub_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = sub_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = sub_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = sub_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
diff --git a/kernels/compiler_subgroup_scan_inclusive.cl b/kernels/compiler_subgroup_scan_inclusive.cl
new file mode 100644
index 0000000..da1a6e6
--- /dev/null
+++ b/kernels/compiler_subgroup_scan_inclusive.cl
@@ -0,0 +1,98 @@
+/*
+ * Subgroup scan inclusive add functions
+ */
+kernel void compiler_subgroup_scan_inclusive_add_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = sub_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = sub_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = sub_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = sub_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = sub_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup scan inclusive max functions
+ */
+kernel void compiler_subgroup_scan_inclusive_max_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = sub_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = sub_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = sub_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = sub_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = sub_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup scan inclusive min functions
+ */
+kernel void compiler_subgroup_scan_inclusive_min_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = sub_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = sub_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = sub_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = sub_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = sub_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
diff --git a/kernels/compiler_vector_load_store.cl b/kernels/compiler_vector_load_store.cl
index aec38b1..d423174 100644
--- a/kernels/compiler_vector_load_store.cl
+++ b/kernels/compiler_vector_load_store.cl
@@ -17,6 +17,7 @@ __kernel void test_##type ##n(__global type *pin, \
vstore ##n(value, x, pout); \
}
+#ifndef HALF
#define TEST_ALL_TYPE(n) \
TEST_TYPE(char,n) \
TEST_TYPE(uchar,n) \
@@ -27,11 +28,12 @@ __kernel void test_##type ##n(__global type *pin, \
TEST_TYPE(float,n) \
TEST_TYPE(long,n) \
TEST_TYPE(ulong,n)
-// TEST_TYPE(double,n)
-
-#if 0
+#else
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#define TEST_ALL_TYPE(n) \
TEST_TYPE(half,n)
#endif
+// TEST_TYPE(double,n)
TEST_ALL_TYPE(2)
TEST_ALL_TYPE(3)
diff --git a/kernels/compiler_workgroup_broadcast.cl b/kernels/compiler_workgroup_broadcast.cl
new file mode 100644
index 0000000..118cc9e
--- /dev/null
+++ b/kernels/compiler_workgroup_broadcast.cl
@@ -0,0 +1,122 @@
+/*
+ * Workgroup broadcast 1D functions
+ */
+
+kernel void compiler_workgroup_broadcast_1D_int(global int *src,
+ global int *dst,
+ uint wg_local_x,
+ uint wg_local_y,
+ uint wg_local_z)
+{
+ uint offset = 0;
+ uint index = offset + get_global_id(0);
+
+ int val = src[index];
+ int broadcast_val = work_group_broadcast(val,
+ wg_local_x);
+ dst[index] = broadcast_val;
+}
+#if 0
+kernel void compiler_workgroup_broadcast_1D_long(global long *src,
+ global long *dst,
+ uint wg_local_x,
+ uint wg_local_y,
+ uint wg_local_z)
+{
+ uint offset = 0;
+ uint index = offset + get_global_id(0);
+
+ long val = src[index];
+ long broadcast_val = work_group_broadcast(val,
+ wg_local_x);
+ dst[index] = broadcast_val;
+}
+#endif
+/*
+ * Workgroup broadcast 2D functions
+ */
+kernel void compiler_workgroup_broadcast_2D_int(global int *src,
+ global int *dst,
+ uint wg_local_x,
+ uint wg_local_y,
+ uint wg_local_z)
+{
+ uint lsize = get_local_size(0) * get_local_size(1);
+ uint offset = get_group_id(0) * lsize +
+ get_group_id(1) * get_num_groups(0) * lsize;
+ uint index = offset + get_local_id(0) +
+ get_local_id(1) * get_local_size(0);
+
+ int val = src[index];
+ int broadcast_val = work_group_broadcast(val,
+ wg_local_x,
+ wg_local_y);
+ dst[index] = broadcast_val;
+}
+#if 0
+kernel void compiler_workgroup_broadcast_2D_long(global long *src,
+ global long *dst,
+ uint wg_local_x,
+ uint wg_local_y,
+ uint wg_local_z)
+{
+ uint lsize = get_local_size(0) * get_local_size(1);
+ uint offset = get_group_id(0) * lsize +
+ get_group_id(1) * get_num_groups(0) * lsize;
+ uint index = offset + get_local_id(0) +
+ get_local_id(1) * get_local_size(0);
+
+ long val = src[index];
+ long broadcast_val = work_group_broadcast(val,
+ wg_local_x,
+ wg_local_y);
+ dst[index] = broadcast_val;
+}
+#endif
+/*
+ * Workgroup broadcast 3D functions
+ */
+kernel void compiler_workgroup_broadcast_3D_int(global int *src,
+ global int *dst,
+ uint wg_local_x,
+ uint wg_local_y,
+ uint wg_local_z)
+{
+ uint lsize = get_local_size(0) * get_local_size(1) * get_local_size(2);
+ uint offset = get_group_id(0) * lsize +
+ get_group_id(1) * get_num_groups(0) * lsize +
+ get_group_id(2) * get_num_groups(1) * get_num_groups(0) * lsize;
+ uint index = offset + get_local_id(0) +
+ get_local_id(1) * get_local_size(0) +
+ get_local_id(2) * get_local_size(1) * get_local_size(0);
+
+ int val = src[index];
+ int broadcast_val = work_group_broadcast(val,
+ wg_local_x,
+ wg_local_y,
+ wg_local_z);
+ dst[index] = broadcast_val;
+}
+#if 0
+kernel void compiler_workgroup_broadcast_3D_long(global long *src,
+ global long *dst,
+ uint wg_local_x,
+ uint wg_local_y,
+ uint wg_local_z)
+{
+ uint lsize = get_local_size(0) * get_local_size(1) * get_local_size(2);
+ uint offset = get_group_id(0) * lsize +
+ get_group_id(1) * get_num_groups(0) * lsize +
+ get_group_id(2) * get_num_groups(0) * get_num_groups(1) * lsize;
+ uint index = offset + get_local_id(0) +
+ get_local_id(1) * get_local_size(0) +
+ get_local_id(2) * get_local_size(1) * get_local_size(0);
+
+ long val = src[index];
+ long broadcast_val = work_group_broadcast(val,
+ wg_local_x,
+ wg_local_y,
+ wg_local_z);
+ dst[index] = broadcast_val;
+}
+#endif
diff --git a/kernels/compiler_workgroup_reduce.cl b/kernels/compiler_workgroup_reduce.cl
new file mode 100644
index 0000000..69dcea8
--- /dev/null
+++ b/kernels/compiler_workgroup_reduce.cl
@@ -0,0 +1,137 @@
+/*
+ * Workgroup any all functions
+ */
+kernel void compiler_workgroup_any(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int predicate = work_group_any(val);
+ dst[get_global_id(0)] = predicate;
+}
+kernel void compiler_workgroup_all(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int predicate = work_group_all(val);
+ dst[get_global_id(0)] = predicate;
+}
+
+/*
+ * Workgroup reduce add functions
+ */
+kernel void compiler_workgroup_reduce_add_char(global char *src, global char *dst) {
+ char val = src[get_global_id(0)];
+ char sum = work_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_uchar(global uchar *src, global uchar *dst) {
+ uchar val = src[get_global_id(0)];
+ uchar sum = work_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_short(global short *src, global short *dst) {
+ short val = src[get_global_id(0)];
+ short sum = work_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_ushort(global ushort *src, global ushort *dst) {
+ ushort val = src[get_global_id(0)];
+ ushort sum = work_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = work_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = work_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = work_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = work_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = work_group_reduce_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Workgroup reduce max functions
+ */
+kernel void compiler_workgroup_reduce_max_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = work_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = work_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = work_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = work_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = work_group_reduce_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Workgroup reduce min functions
+ */
+kernel void compiler_workgroup_reduce_min_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = work_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = work_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = work_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = work_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = work_group_reduce_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
diff --git a/kernels/compiler_workgroup_scan_exclusive.cl b/kernels/compiler_workgroup_scan_exclusive.cl
new file mode 100644
index 0000000..14c1c61
--- /dev/null
+++ b/kernels/compiler_workgroup_scan_exclusive.cl
@@ -0,0 +1,98 @@
+/*
+ * Workgroup scan exclusive add functions
+ */
+kernel void compiler_workgroup_scan_exclusive_add_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = work_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_add_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = work_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_add_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = work_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_add_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = work_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_add_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = work_group_scan_exclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Workgroup scan exclusive max functions
+ */
+kernel void compiler_workgroup_scan_exclusive_max_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = work_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_max_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = work_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_max_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = work_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_max_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = work_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_max_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = work_group_scan_exclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Workgroup scan exclusive min functions
+ */
+kernel void compiler_workgroup_scan_exclusive_min_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = work_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_min_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = work_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_min_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = work_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_min_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = work_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_exclusive_min_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = work_group_scan_exclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
diff --git a/kernels/compiler_workgroup_scan_inclusive.cl b/kernels/compiler_workgroup_scan_inclusive.cl
new file mode 100644
index 0000000..915251e
--- /dev/null
+++ b/kernels/compiler_workgroup_scan_inclusive.cl
@@ -0,0 +1,98 @@
+/*
+ * Workgroup scan inclusive add functions
+ */
+kernel void compiler_workgroup_scan_inclusive_add_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = work_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_add_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = work_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_add_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = work_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_add_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = work_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_add_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = work_group_scan_inclusive_add(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Workgroup scan inclusive max functions
+ */
+kernel void compiler_workgroup_scan_inclusive_max_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = work_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_max_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = work_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_max_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = work_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_max_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = work_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_max_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = work_group_scan_inclusive_max(val);
+ dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Workgroup scan inclusive min functions
+ */
+kernel void compiler_workgroup_scan_inclusive_min_int(global int *src, global int *dst) {
+ int val = src[get_global_id(0)];
+ int sum = work_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_min_uint(global uint *src, global uint *dst) {
+ uint val = src[get_global_id(0)];
+ uint sum = work_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_min_long(global long *src, global long *dst) {
+ long val = src[get_global_id(0)];
+ long sum = work_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_min_ulong(global ulong *src, global ulong *dst) {
+ ulong val = src[get_global_id(0)];
+ ulong sum = work_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_scan_inclusive_min_float(global float *src, global float *dst) {
+ float val = src[get_global_id(0)];
+ float sum = work_group_scan_inclusive_min(val);
+ dst[get_global_id(0)] = sum;
+}
diff --git a/kernels/image_1D_buffer.cl b/kernels/image_1D_buffer.cl
index 2c1da69..e82aa08 100644
--- a/kernels/image_1D_buffer.cl
+++ b/kernels/image_1D_buffer.cl
@@ -1,4 +1,4 @@
-__kernel void image_1D_buffer(image1d_buffer_t image1, image1d_buffer_t image2)
+__kernel void image_1D_buffer(__read_only image1d_buffer_t image1, __write_only image1d_buffer_t image2)
{
int x = get_global_id(0);
diff --git a/kernels/image_from_buffer.cl b/kernels/image_from_buffer.cl
new file mode 100644
index 0000000..f970c85
--- /dev/null
+++ b/kernels/image_from_buffer.cl
@@ -0,0 +1,12 @@
+__kernel void image_from_buffer(__read_only image2d_t src, __write_only image2d_t dst)
+{
+ int2 coord;
+ int4 color;
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+
+ color = read_imagei(src, sampler, coord);
+ write_imagei(dst, coord, color);
+}
diff --git a/kernels/runtime_use_host_ptr_image.cl b/kernels/runtime_use_host_ptr_image.cl
new file mode 100644
index 0000000..7596ec8
--- /dev/null
+++ b/kernels/runtime_use_host_ptr_image.cl
@@ -0,0 +1,10 @@
+__kernel void
+runtime_use_host_ptr_image(__read_only image2d_t src, __write_only image2d_t dst)
+{
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int2 coord;
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+ float4 data = read_imagef(src, sampler, coord);
+ write_imagef(dst, coord, data);
+}
diff --git a/kernels/test_fill_image_2d_array.cl b/kernels/test_fill_image_2d_array.cl
index e756010..e66359f 100644
--- a/kernels/test_fill_image_2d_array.cl
+++ b/kernels/test_fill_image_2d_array.cl
@@ -9,5 +9,5 @@ test_fill_image_2d_array(__write_only image2d_array_t dst)
coordz = (int)get_global_id(2);
uint4 color4 = {0, 1, 2 ,3};
if (coordz < 7)
- write_imageui(dst, (int3)(coordx, coordy, coordz), color4);
+ write_imageui(dst, (int4)(coordx, coordy, coordz, 0), color4);
}
diff --git a/kernels/test_get_arg_info.cl b/kernels/test_get_arg_info.cl
index 43a804b..ae08887 100644
--- a/kernels/test_get_arg_info.cl
+++ b/kernels/test_get_arg_info.cl
@@ -3,6 +3,6 @@ typedef struct _test_arg_struct {
int b;
}test_arg_struct;
-kernel void test_get_arg_info(read_only global float const volatile *src, read_write local int read_only *dst, test_arg_struct extra) {
+kernel void test_get_arg_info(read_only global float const volatile *src, read_write local int *dst, test_arg_struct extra) {
}
diff --git a/kernels/test_printf.cl b/kernels/test_printf.cl
index 0a59e88..a2d3af9 100644
--- a/kernels/test_printf.cl
+++ b/kernels/test_printf.cl
@@ -44,3 +44,49 @@ test_printf(void)
printf("--- End to the printf test ---\n");
}
}
+
+__kernel void
+test_printf_1(void)
+{
+ printf("");// just test null printf
+}
+
+__kernel void
+test_printf_2(void)
+{
+ printf("float %f\n", 2.0);// just test a uniform const
+ printf("long %lx\n", 0xABCD1234CCCCDDDD);
+}
+
+__kernel void
+test_printf_3(char arg)
+{
+ printf("@@ arg from func arg is %c\n", arg);
+}
+
+__kernel void
+test_printf_4(void)
+{
+ int a = get_global_size(0);
+ int b = get_local_size(0);
+ int c = a + 1;
+ int d = b + 2;
+ int e = b * 2;
+ int f = c + 1;
+ int g = d + 2;
+ int h = e * 2;
+ int i = a + 1;
+ int j = c / 2;
+ int k = a * 2;
+ int l = c + 1;
+ int m = f + 2;
+ int n = g * 2;
+ int o = e * 2;
+ int p = a + 1;
+ int q = c / 2;
+ int r = a * 2;
+ int s = c + 1;
+ int t = f + 2;
+ printf("@@ Long result is %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n",
+ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t);
+}
diff --git a/src/Android.mk b/src/Android.mk
new file mode 100644
index 0000000..9b63f7e
--- /dev/null
+++ b/src/Android.mk
@@ -0,0 +1,124 @@
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+
+include $(LOCAL_PATH)/../Android.common.mk
+
+ocl_config_file = $(LOCAL_PATH)/OCLConfig.h
+$(shell echo "// the configured options and settings for LIBCL" > $(ocl_config_file))
+$(shell echo "#define LIBCL_DRIVER_VERSION_MAJOR 1" >> $(ocl_config_file))
+$(shell echo "#define LIBCL_DRIVER_VERSION_MINOR 2" >> $(ocl_config_file))
+$(shell echo "#define LIBCL_C_VERSION_MAJOR 1" >> $(ocl_config_file))
+$(shell echo "#define LIBCL_C_VERSION_MINOR 2" >> $(ocl_config_file))
+
+LOCAL_C_INCLUDES := $(TOP_C_INCLUDE) $(BEIGNET_ROOT_PATH)/backend/src/backend/ $(BEIGNET_ROOT_PATH)
+LOCAL_C_INCLUDES += $(DRM_INCLUDE_PATH)
+LOCAL_C_INCLUDES += $(LLVM_INCLUDE_DIRS)
+LOCAL_C_INCLUDES += hardware/drm_gralloc
+LOCAL_CPPFLAGS := $(TOP_CPPFLAGS) -std=c++11 -DHAS_USERPTR
+LOCAL_CFLAGS := $(TOP_CFLAGS) -DHAS_USERPTR
+OPTIONAL_EGL_LIBRARY :=
+LOCAL_LDFLAGS := -Wl,-Bsymbolic
+
+LOCAL_LDLIBS := -lm -ldl
+LOCAL_SHARED_LIBRARIES += liblog libcutils
+LOCAL_ADDITIONAL_DEPENDENCIES := $(GBE_BIN_GENERATER)
+LOCAL_MODULE := libcl
+
+LOCAL_REQUIRED_MODULES := $(HOST_OUT_EXECUTABLES)/gbe_bin_generater
+LOCAL_ADDITIONAL_DEPENDENCIES += $(BEIGNET_ROOT_PATH)/backend/src/Android.mk
+
+KERNEL_PATH := $(BEIGNET_ROOT_PATH)/src/kernels
+KERNEL_NAMES := cl_internal_copy_buf_align4 \
+ cl_internal_copy_buf_align16 \
+ cl_internal_copy_buf_unalign_same_offset \
+ cl_internal_copy_buf_unalign_dst_offset \
+ cl_internal_copy_buf_unalign_src_offset \
+ cl_internal_copy_buf_rect \
+ cl_internal_copy_buf_rect_align4 \
+ cl_internal_copy_image_1d_to_1d \
+ cl_internal_copy_image_2d_to_2d \
+ cl_internal_copy_image_3d_to_2d \
+ cl_internal_copy_image_2d_to_3d \
+ cl_internal_copy_image_3d_to_3d \
+ cl_internal_copy_image_2d_to_2d_array \
+ cl_internal_copy_image_1d_array_to_1d_array \
+ cl_internal_copy_image_2d_array_to_2d_array \
+ cl_internal_copy_image_2d_array_to_2d \
+ cl_internal_copy_image_2d_array_to_3d \
+ cl_internal_copy_image_3d_to_2d_array \
+ cl_internal_copy_image_2d_to_buffer \
+ cl_internal_copy_image_2d_to_buffer_align16 \
+ cl_internal_copy_image_3d_to_buffer \
+ cl_internal_copy_buffer_to_image_2d \
+ cl_internal_copy_buffer_to_image_2d_align16 \
+ cl_internal_copy_buffer_to_image_3d \
+ cl_internal_fill_buf_align8 \
+ cl_internal_fill_buf_align4 \
+ cl_internal_fill_buf_align2 \
+ cl_internal_fill_buf_unalign \
+ cl_internal_fill_buf_align128 \
+ cl_internal_fill_image_1d \
+ cl_internal_fill_image_1d_array \
+ cl_internal_fill_image_2d \
+ cl_internal_fill_image_2d_array \
+ cl_internal_fill_image_3d
+BUILT_IN_NAME := cl_internal_built_in_kernel
+
+GBE_BIN_GENERATER := $(HOST_OUT_EXECUTABLES)/gbe_bin_generater
+
+$(shell rm -f $(KERNEL_PATH)/$(BUILT_IN_NAME).cl)
+define GEN_INTERNAL_KER
+ # Use the python script to generate the header files.
+ $(shell $(GBE_BIN_GENERATER) -s $(KERNEL_PATH)/$(1).cl -o $(KERNEL_PATH)/$(1)_str.c)
+ $(shell cat $(KERNEL_PATH)/$(1).cl >> $(KERNEL_PATH)/$(BUILT_IN_NAME).cl)
+endef
+$(foreach KERNEL_NAME, ${KERNEL_NAMES}, $(eval $(call GEN_INTERNAL_KER,$(KERNEL_NAME))))
+
+$(shell $(GBE_BIN_GENERATER) -s $(KERNEL_PATH)/$(BUILT_IN_NAME).cl -o $(KERNEL_PATH)/$(BUILT_IN_NAME)_str.c)
+
+GIT_SHA1 = git_sha1.h
+$(shell chmod +x $(LOCAL_PATH)/git_sha1.sh)
+$(shell $(LOCAL_PATH)/git_sha1.sh $(LOCAL_PATH) ${GIT_SHA1})
+
+LOCAL_SRC_FILES:= \
+ $(addprefix kernels/,$(addsuffix _str.c, $(KERNEL_NAMES))) \
+ $(addprefix kernels/,$(addsuffix _str.c, $(BUILT_IN_NAME))) \
+ cl_api.c \
+ cl_alloc.c \
+ cl_kernel.c \
+ cl_program.c \
+ cl_gbe_loader.cpp \
+ cl_sampler.c \
+ cl_event.c \
+ cl_enqueue.c \
+ cl_image.c \
+ cl_mem.c \
+ cl_platform_id.c \
+ cl_extensions.c \
+ cl_device_id.c \
+ cl_context.c \
+ cl_command_queue.c \
+ cl_command_queue.h \
+ cl_command_queue_gen7.c \
+ cl_thread.c \
+ cl_driver.h \
+ cl_driver.cpp \
+ cl_driver_defs.c \
+ intel/intel_gpgpu.c \
+ intel/intel_batchbuffer.c \
+ intel/intel_driver.c \
+ performance.c \
+ cl_accelerator_intel.c
+
+LOCAL_SHARED_LIBRARIES := \
+libgbe \
+libdl \
+$(DRM_INTEL_LIBRARY) \
+$(DRM_LIBRARY) \
+$(OPTIONAL_EGL_LIBRARY) \
+libhardware
+
+#LOCAL_CLANG := true
+include external/libcxx/libcxx.mk
+include $(BUILD_SHARED_LIBRARY)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 40a9afb..a002865 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -56,7 +56,8 @@ cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
cl_internal_fill_buf_align128 cl_internal_fill_image_1d
cl_internal_fill_image_1d_array cl_internal_fill_image_2d
-cl_internal_fill_image_2d_array cl_internal_fill_image_3d)
+cl_internal_fill_image_2d_array cl_internal_fill_image_3d
+cl_internal_block_motion_estimate_intel)
set (BUILT_IN_NAME cl_internal_built_in_kernel)
MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
@@ -70,6 +71,7 @@ set(OPENCL_SRC
cl_program.c
cl_gbe_loader.cpp
cl_sampler.c
+ cl_accelerator_intel.c
cl_event.c
cl_enqueue.c
cl_image.c
@@ -99,35 +101,52 @@ if (X11_FOUND)
x11/va_dri2.c)
endif (X11_FOUND)
+if (CMRT_FOUND)
+ set(CMAKE_CXX_FLAGS "-DHAS_CMRT ${CMAKE_CXX_FLAGS}")
+ set(CMAKE_CXX_FLAGS "-DCMRT_PATH=${CMRT_LIBRARY_DIRS}/libcmrt.so ${CMAKE_CXX_FLAGS}")
+ set(CMAKE_C_FLAGS "-DHAS_CMRT ${CMAKE_C_FLAGS}")
+ set(OPENCL_SRC ${OPENCL_SRC} cl_cmrt.cpp)
+endif (CMRT_FOUND)
+
if (EGL_FOUND AND MESA_SOURCE_FOUND)
-set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
-SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
-SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
-SET(OPTIONAL_EGL_LIBRARY "${EGL_LIBRARY}")
+ set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
+ SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
+ SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
+ SET(OPTIONAL_EGL_LIBRARY "${EGL_LIBRARY}")
else(EGL_FOUND AND MESA_SOURCE_FOUND)
-SET(OPTIONAL_EGL_LIBRARY "")
+ SET(OPTIONAL_EGL_LIBRARY "")
endif (EGL_FOUND AND MESA_SOURCE_FOUND)
if (OCLIcd_FOUND)
-set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
-SET(CMAKE_CXX_FLAGS "-DHAS_OCLIcd ${CMAKE_CXX_FLAGS}")
-SET(CMAKE_C_FLAGS "-DHAS_OCLIcd ${CMAKE_C_FLAGS}")
+ set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
+ SET(CMAKE_CXX_FLAGS "-DHAS_OCLIcd ${CMAKE_CXX_FLAGS}")
+ SET(CMAKE_C_FLAGS "-DHAS_OCLIcd ${CMAKE_C_FLAGS}")
endif (OCLIcd_FOUND)
-if (DRM_INTEL_USERPTR)
-SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR ${CMAKE_CXX_FLAGS}")
-SET(CMAKE_C_FLAGS "-DHAS_USERPTR ${CMAKE_C_FLAGS}")
-endif (DRM_INTEL_USERPTR)
+if (HAVE_DRM_INTEL_USERPTR)
+ SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR ${CMAKE_CXX_FLAGS}")
+ SET(CMAKE_C_FLAGS "-DHAS_USERPTR ${CMAKE_C_FLAGS}")
+endif (HAVE_DRM_INTEL_USERPTR)
+
+if (HAVE_DRM_INTEL_EU_TOTAL)
+ SET(CMAKE_CXX_FLAGS "-DHAS_EU_TOTAL ${CMAKE_CXX_FLAGS}")
+ SET(CMAKE_C_FLAGS "-DHAS_EU_TOTAL ${CMAKE_C_FLAGS}")
+endif (HAVE_DRM_INTEL_EU_TOTAL)
+
+if (HAVE_DRM_INTEL_SUBSLICE_TOTAL)
+ SET(CMAKE_CXX_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_CXX_FLAGS}")
+ SET(CMAKE_C_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_C_FLAGS}")
+endif (HAVE_DRM_INTEL_SUBSLICE_TOTAL)
-if (DRM_INTEL_EU_TOTAL)
-SET(CMAKE_CXX_FLAGS "-DHAS_EU_TOTAL ${CMAKE_CXX_FLAGS}")
-SET(CMAKE_C_FLAGS "-DHAS_EU_TOTAL ${CMAKE_C_FLAGS}")
-endif (DRM_INTEL_EU_TOTAL)
+if (HAVE_DRM_INTEL_POOLED_EU)
+ SET(CMAKE_CXX_FLAGS "-DHAS_POOLED_EU ${CMAKE_CXX_FLAGS}")
+ SET(CMAKE_C_FLAGS "-DHAS_POOLED_EU ${CMAKE_C_FLAGS}")
+endif (HAVE_DRM_INTEL_POOLED_EU)
-if (DRM_INTEL_SUBSLICE_TOTAL)
-SET(CMAKE_CXX_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_CXX_FLAGS}")
-SET(CMAKE_C_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_C_FLAGS}")
-endif (DRM_INTEL_SUBSLICE_TOTAL)
+if (HAVE_DRM_INTEL_MIN_EU_IN_POOL)
+ SET(CMAKE_CXX_FLAGS "-DHAS_MIN_EU_IN_POOL ${CMAKE_CXX_FLAGS}")
+ SET(CMAKE_C_FLAGS "-DHAS_MIN_EU_IN_POOL ${CMAKE_C_FLAGS}")
+endif (HAVE_DRM_INTEL_MIN_EU_IN_POOL)
set(GIT_SHA1 "git_sha1.h")
add_custom_target(${GIT_SHA1} ALL
@@ -142,6 +161,7 @@ add_library(cl SHARED ${OPENCL_SRC})
ADD_DEPENDENCIES(cl ${GIT_SHA1})
target_link_libraries(
cl
+ rt
${X11_LIBRARIES}
${XEXT_LIBRARIES}
${XFIXES_LIBRARIES}
diff --git a/src/OCLConfig.h.in b/src/OCLConfig.h.in
index 71de4b3..8662584 100644
--- a/src/OCLConfig.h.in
+++ b/src/OCLConfig.h.in
@@ -1,6 +1,5 @@
// the configured options and settings for LIBCL
#define LIBCL_DRIVER_VERSION_MAJOR @LIBCL_DRIVER_VERSION_MAJOR@
#define LIBCL_DRIVER_VERSION_MINOR @LIBCL_DRIVER_VERSION_MINOR@
-#define LIBCL_DRIVER_VERSION_PATCH @LIBCL_DRIVER_VERSION_PATCH@
#define LIBCL_C_VERSION_MAJOR @LIBCL_C_VERSION_MAJOR@
#define LIBCL_C_VERSION_MINOR @LIBCL_C_VERSION_MINOR@
diff --git a/src/cl_accelerator_intel.c b/src/cl_accelerator_intel.c
new file mode 100644
index 0000000..cda8963
--- /dev/null
+++ b/src/cl_accelerator_intel.c
@@ -0,0 +1,86 @@
+#include "cl_context.h"
+#include "cl_accelerator_intel.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+
+#include <assert.h>
+
+LOCAL cl_accelerator_intel
+cl_accelerator_intel_new(cl_context ctx,
+ cl_accelerator_type_intel accel_type,
+ size_t desc_sz,
+ const void* desc,
+ cl_int* errcode_ret)
+{
+ cl_accelerator_intel accel = NULL;
+ cl_int err = CL_SUCCESS;
+
+ /* Allocate and initialize the structure itself */
+ TRY_ALLOC(accel, CALLOC(struct _cl_accelerator_intel));
+ SET_ICD(accel->dispatch)
+ accel->ref_n = 1;
+ accel->magic = CL_MAGIC_ACCELERATOR_INTEL_HEADER;
+
+ if (accel_type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL) {
+ err = CL_INVALID_ACCELERATOR_TYPE_INTEL;
+ goto error;
+ }
+ accel->type = accel_type;
+
+ if (desc == NULL) { // and check inside desc
+ err = CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
+ goto error;
+ }
+ accel->desc.me = *(cl_motion_estimation_desc_intel*)desc;
+
+ /* Append the accelerator_intel in the context accelerator_intel list */
+ /* is this really needed? */
+ pthread_mutex_lock(&ctx->accelerator_intel_lock);
+ accel->next = ctx->accels;
+ if (ctx->accels != NULL)
+ ctx->accels->prev = accel;
+ ctx->accels = accel;
+ pthread_mutex_unlock(&ctx->accelerator_intel_lock);
+
+ accel->ctx = ctx;
+ cl_context_add_ref(ctx);
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return accel;
+error:
+ cl_accelerator_intel_delete(accel);
+ accel = NULL;
+ goto exit;
+}
+
+LOCAL void
+cl_accelerator_intel_add_ref(cl_accelerator_intel accel)
+{
+ atomic_inc(&accel->ref_n);
+}
+
+LOCAL void
+cl_accelerator_intel_delete(cl_accelerator_intel accel)
+{
+ if (UNLIKELY(accel == NULL))
+ return;
+ if (atomic_dec(&accel->ref_n) > 1)
+ return;
+
+ /* Remove the accelerator_intel in the context accelerator_intel list */
+ pthread_mutex_lock(&accel->ctx->accelerator_intel_lock);
+ if (accel->prev)
+ accel->prev->next = accel->next;
+ if (accel->next)
+ accel->next->prev = accel->prev;
+ if (accel->ctx->accels == accel)
+ accel->ctx->accels = accel->next;
+ pthread_mutex_unlock(&accel->ctx->accelerator_intel_lock);
+
+ cl_context_delete(accel->ctx);
+ cl_free(accel);
+}
diff --git a/src/cl_accelerator_intel.h b/src/cl_accelerator_intel.h
new file mode 100644
index 0000000..cecfd2a
--- /dev/null
+++ b/src/cl_accelerator_intel.h
@@ -0,0 +1,29 @@
+#ifndef __CL_ACCELERATOR_INTEL_H__
+#define __CL_ACCELERATOR_INTEL_H__
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include <stdint.h>
+
+struct _cl_accelerator_intel {
+ DEFINE_ICD(dispatch)
+ uint64_t magic; /* To identify it as an accelerator_intel object */
+ volatile int ref_n; /* This object is reference counted */
+ cl_accelerator_intel prev, next; /* We chain in the allocator, why chain? */
+ cl_context ctx; /* Context it belongs to */
+ cl_accelerator_type_intel type;
+ union {
+ cl_motion_estimation_desc_intel me;
+ }desc; /* save desc before we decide how to handle it */
+};
+
+cl_accelerator_intel cl_accelerator_intel_new(cl_context ctx,
+ cl_accelerator_type_intel accel_type,
+ size_t desc_sz,
+ const void* desc,
+ cl_int* errcode_ret);
+
+void cl_accelerator_intel_add_ref(cl_accelerator_intel accel);
+void cl_accelerator_intel_delete(cl_accelerator_intel accel);
+
+#endif
diff --git a/src/cl_api.c b/src/cl_api.c
index 5c47f81..d0d4dc5 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -28,8 +28,10 @@
#include "cl_mem.h"
#include "cl_image.h"
#include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
#include "cl_alloc.h"
#include "cl_utils.h"
+#include "cl_cmrt.h"
#include "CL/cl.h"
#include "CL/cl_ext.h"
@@ -56,6 +58,7 @@ typedef intptr_t cl_device_partition_property;
if (param_value && param_value_size < sizeof(TYPE)*ELT) \
return CL_INVALID_VALUE; \
if (param_value) { \
+ memset(param_value, 0, param_value_size); \
memcpy(param_value, (VAL), sizeof(TYPE)*ELT); \
} \
\
@@ -64,7 +67,7 @@ typedef intptr_t cl_device_partition_property;
return RET; \
} while(0)
-inline cl_int
+static inline cl_int
handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
cl_event* event, enqueue_data* data, cl_command_type type)
{
@@ -275,6 +278,11 @@ clRetainDevice(cl_device_id device)
cl_int
clReleaseDevice(cl_device_id device)
{
+#ifdef HAS_CMRT
+ if (device->cmrt_device != NULL)
+ cmrt_destroy_device(device);
+#endif
+
// XXX stub for C++ Bindings
return CL_SUCCESS;
}
@@ -550,8 +558,9 @@ clCreateImage(cl_context context,
goto error;
}
/* buffer refers to a valid buffer memory object if image_type is
- CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */
+ CL_MEM_OBJECT_IMAGE1D_BUFFER or CL_MEM_OBJECT_IMAGE2D. Otherwise it must be NULL. */
if (image_desc->image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
+ image_desc->image_type != CL_MEM_OBJECT_IMAGE2D &&
image_desc->buffer) {
err = CL_INVALID_IMAGE_DESCRIPTOR;
goto error;
@@ -679,6 +688,7 @@ clGetSupportedImageFormats(cl_context ctx,
}
if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D &&
image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
+ image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY &&
image_type != CL_MEM_OBJECT_IMAGE2D &&
image_type != CL_MEM_OBJECT_IMAGE3D)) {
@@ -939,11 +949,11 @@ clBuildProgram(cl_program program,
INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
}
- /* TODO support create program from binary */
assert(program->source_type == FROM_LLVM ||
program->source_type == FROM_SOURCE ||
program->source_type == FROM_LLVM_SPIR ||
- program->source_type == FROM_BINARY);
+ program->source_type == FROM_BINARY ||
+ program->source_type == FROM_CMRT);
if((err = cl_program_build(program, options)) != CL_SUCCESS) {
goto error;
}
@@ -984,6 +994,7 @@ clCompileProgram(cl_program program ,
/* TODO support create program from binary */
assert(program->source_type == FROM_LLVM ||
program->source_type == FROM_SOURCE ||
+ program->source_type == FROM_LLVM_SPIR ||
program->source_type == FROM_BINARY);
if((err = cl_program_compile(program, num_input_headers, input_headers, header_include_names, options)) != CL_SUCCESS) {
goto error;
@@ -1242,7 +1253,13 @@ clSetKernelArg(cl_kernel kernel,
{
cl_int err = CL_SUCCESS;
CHECK_KERNEL(kernel);
- err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
+
+#ifdef HAS_CMRT
+ if (kernel->cmrt_kernel != NULL)
+ err = cmrt_set_kernel_arg(kernel, arg_index, arg_size, arg_value);
+ else
+#endif
+ err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
error:
return err;
}
@@ -1331,6 +1348,26 @@ clGetKernelWorkGroupInfo(cl_kernel kernel,
}
cl_int
+clGetKernelSubGroupInfoKHR(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t input_value_size,
+ const void * input_value,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ return cl_get_kernel_subgroup_info(kernel,
+ device,
+ param_name,
+ input_value_size,
+ input_value,
+ param_value_size,
+ param_value,
+ param_value_size_ret);
+}
+
+cl_int
clWaitForEvents(cl_uint num_events,
const cl_event * event_list)
{
@@ -1530,6 +1567,12 @@ clFinish(cl_command_queue command_queue)
cl_int err = CL_SUCCESS;
CHECK_QUEUE (command_queue);
+
+#ifdef HAS_CMRT
+ if (command_queue->cmrt_event != NULL)
+ return cmrt_wait_for_task_finished(command_queue);
+#endif
+
err = cl_command_queue_finish(command_queue);
error:
@@ -2653,6 +2696,11 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
goto error;
}
+#ifdef HAS_CMRT
+ if (command_queue->cmrt_event != NULL)
+ cmrt_wait_for_task_finished(command_queue);
+#endif
+
TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
data = &no_wait_data;
@@ -2741,6 +2789,11 @@ clEnqueueMapImage(cl_command_queue command_queue,
goto error;
}
+#ifdef HAS_CMRT
+ if (command_queue->cmrt_event != NULL)
+ cmrt_wait_for_task_finished(command_queue);
+#endif
+
TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
data = &no_wait_data;
@@ -2912,6 +2965,17 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
goto error;
}
+ if (kernel->vme) {
+ if (work_dim != 2) {
+ err = CL_INVALID_WORK_DIMENSION;
+ goto error;
+ }
+ if (local_work_size != NULL) {
+ err = CL_INVALID_WORK_GROUP_SIZE;
+ goto error;
+ }
+ }
+
if (global_work_offset != NULL)
for (i = 0; i < work_dim; ++i) {
if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
@@ -2935,6 +2999,12 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
goto error;
}
+#ifdef HAS_CMRT
+ if (kernel->cmrt_kernel != NULL) {
+ err = cmrt_enqueue(command_queue, kernel, global_work_size, local_work_size);
+ goto error;
+ }
+#endif
/* XXX No event right now */
//FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
@@ -2945,22 +3015,35 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
for (i = 0; i < work_dim; ++i)
fixed_local_sz[i] = local_work_size[i];
} else {
- uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
- for (i = 0; i< work_dim; i++) {
- for (j = maxDimSize; j > 1; j--) {
- if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
- fixed_local_sz[i] = j;
- maxGroupSize = maxGroupSize /j;
- maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
- break; //choose next work_dim
+ if (kernel->vme) {
+ fixed_local_sz[0] = 16;
+ fixed_local_sz[1] = 1;
+ } else {
+ uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may be too large
+ size_t realGroupSize = 1;
+ for (i = 0; i< work_dim; i++) {
+ for (j = maxDimSize; j > 1; j--) {
+ if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+ fixed_local_sz[i] = j;
+ maxGroupSize = maxGroupSize /j;
+ maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+ break; //choose next work_dim
+ }
}
+ realGroupSize *= fixed_local_sz[i];
}
+ if (realGroupSize % 8 != 0)
+ DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide local_work_size[] explicitly, you can find good values with trial-and-error method.");
}
}
- if (global_work_size != NULL)
+ if (kernel->vme) {
+ fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;
+ fixed_global_sz[1] = (global_work_size[1]+15) / 16;
+ } else {
for (i = 0; i < work_dim; ++i)
fixed_global_sz[i] = global_work_size[i];
+ }
if (global_work_offset != NULL)
for (i = 0; i < work_dim; ++i)
fixed_global_off[i] = global_work_offset[i];
@@ -3190,6 +3273,13 @@ internal_clGetExtensionFunctionAddress(const char *func_name)
EXTFUNC(clCreateBufferFromLibvaIntel)
EXTFUNC(clCreateImageFromLibvaIntel)
EXTFUNC(clGetMemObjectFdIntel)
+ EXTFUNC(clCreateBufferFromFdINTEL)
+ EXTFUNC(clCreateImageFromFdINTEL)
+ EXTFUNC(clCreateAcceleratorINTEL)
+ EXTFUNC(clRetainAcceleratorINTEL)
+ EXTFUNC(clReleaseAcceleratorINTEL)
+ EXTFUNC(clGetAcceleratorInfoINTEL)
+ EXTFUNC(clGetKernelSubGroupInfoKHR)
return NULL;
}
@@ -3358,3 +3448,126 @@ clGetMemObjectFdIntel(cl_context context,
error:
return err;
}
+
+cl_mem
+clCreateBufferFromFdINTEL(cl_context context,
+ const cl_import_buffer_info_intel* info,
+ cl_int *errorcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+
+ if (!info) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ mem = cl_mem_new_buffer_from_fd(context, info->fd, info->size, &err);
+
+error:
+ if (errorcode_ret)
+ *errorcode_ret = err;
+ return mem;
+}
+
+cl_mem
+clCreateImageFromFdINTEL(cl_context context,
+ const cl_import_image_info_intel* info,
+ cl_int *errorcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+
+ if (!info) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ /* Create image object from fd.
+ * We just support creating CL_MEM_OBJECT_IMAGE2D image object now.
+ * Other image type will be supported later if necessary.
+ */
+ if(info->type == CL_MEM_OBJECT_IMAGE2D){
+ mem = cl_mem_new_image_from_fd(context,
+ info->fd, info->size,
+ info->offset,
+ info->width, info->height,
+ info->fmt, info->row_pitch,
+ &err);
+ }
+ else{
+ err = CL_INVALID_ARG_VALUE;
+ goto error;
+ }
+
+error:
+ if (errorcode_ret)
+ *errorcode_ret = err;
+ return mem;
+}
+
+cl_accelerator_intel
+clCreateAcceleratorINTEL(cl_context context,
+ cl_accelerator_type_intel accel_type,
+ size_t desc_sz,
+ const void* desc,
+ cl_int* errcode_ret)
+{
+
+ cl_accelerator_intel accel = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT(context);
+ accel = cl_accelerator_intel_new(context, accel_type, desc_sz, desc, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return accel;
+}
+
+cl_int
+clRetainAcceleratorINTEL(cl_accelerator_intel accel)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_ACCELERATOR_INTEL(accel);
+ cl_accelerator_intel_add_ref(accel);
+error:
+ return err;
+}
+
+cl_int
+clReleaseAcceleratorINTEL(cl_accelerator_intel accel)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_ACCELERATOR_INTEL(accel);
+ cl_accelerator_intel_delete(accel);
+error:
+ return err;
+}
+
+cl_int
+clGetAcceleratorInfoINTEL(cl_accelerator_intel accel,
+ cl_accelerator_info_intel param_name,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_ACCELERATOR_INTEL(accel);
+
+ if (param_name == CL_ACCELERATOR_REFERENCE_COUNT_INTEL) {
+ FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&accel->ref_n, CL_SUCCESS);
+ } else if (param_name == CL_ACCELERATOR_CONTEXT_INTEL) {
+ FILL_GETINFO_RET (cl_context, 1, &accel->ctx, CL_SUCCESS);
+ } else if (param_name == CL_ACCELERATOR_TYPE_INTEL) {
+ FILL_GETINFO_RET (cl_uint, 1, &accel->type, CL_SUCCESS);
+ } else if (param_name == CL_ACCELERATOR_DESCRIPTOR_INTEL) {
+ FILL_GETINFO_RET (cl_motion_estimation_desc_intel, 1, &(accel->desc.me), CL_SUCCESS);
+ } else{
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+}
diff --git a/src/cl_cmrt.cpp b/src/cl_cmrt.cpp
new file mode 100644
index 0000000..25e4d82
--- /dev/null
+++ b/src/cl_cmrt.cpp
@@ -0,0 +1,311 @@
+#include "cl_cmrt.h"
+#include "cl_device_id.h"
+#include "intel/intel_defines.h"
+#include "cl_command_queue.h"
+
+#include "cm_rt.h" //header file of libcmrt.so
+typedef INT (*CreateCmDeviceFunc)(CmDevice * &pDevice, UINT & version,
+ CmDriverContext * drivercontext, UINT DevCreateOption);
+typedef INT (*DestroyCmDeviceFunc)(CmDevice * &pDevice);
+
+#include <dlfcn.h>
+
+static void* dlhCMRT = NULL;
+static CreateCmDeviceFunc pfnCreateCmDevice = NULL;
+static DestroyCmDeviceFunc pfnDestroyCmDevice = NULL;
+
+#define XSTR(x) #x
+#define STR(x) XSTR(x)
+
+class CmrtCleanup
+{
+public:
+ CmrtCleanup(){}
+ ~CmrtCleanup()
+ {
+ if (dlhCMRT != NULL)
+ dlclose(dlhCMRT);
+ }
+};
+
+enum CMRT_MEM_TYPE
+{
+ CMRT_BUFFER,
+ CMRT_SURFACE2D,
+};
+
+static CmrtCleanup cmrtCleanup;
+
+static bool LoadCmrtLibrary()
+{
+ if (dlhCMRT == NULL) {
+ dlhCMRT = dlopen(STR(CMRT_PATH), RTLD_LAZY | RTLD_LOCAL);
+
+ if (dlhCMRT == NULL)
+ return false;
+
+ pfnCreateCmDevice = (CreateCmDeviceFunc)dlsym(dlhCMRT, "CreateCmDevice");
+ if (pfnCreateCmDevice == NULL)
+ return false;
+
+ pfnDestroyCmDevice = (DestroyCmDeviceFunc)dlsym(dlhCMRT, "DestroyCmDevice");
+ if (pfnDestroyCmDevice == NULL)
+ return false;
+ }
+ return true;
+}
+
+cl_int cmrt_build_program(cl_program p, const char *options)
+{
+ CmDevice*& cmrt_device = (CmDevice*&)(p->ctx->device->cmrt_device);
+ int result;
+ if (cmrt_device == NULL)
+ {
+ if (!LoadCmrtLibrary())
+ return CL_DEVICE_NOT_AVAILABLE; //yes, the error is not accurate, but I do not find a better one
+
+ CmDriverContext ctx;
+ ctx.shared_bufmgr = 1;
+ ctx.bufmgr = (drm_intel_bufmgr*)cl_context_get_bufmgr(p->ctx);
+ ctx.userptr_enabled = 0;
+ ctx.deviceid = p->ctx->device->device_id;
+ ctx.device_rev = -1;
+ UINT version = 0;
+ result = (*pfnCreateCmDevice)(cmrt_device, version, &ctx, CM_DEVICE_CREATE_OPTION_DEFAULT);
+ if (result != CM_SUCCESS)
+ return CL_DEVICE_NOT_AVAILABLE;
+ }
+
+ CmProgram* cmrt_program = NULL;
+ result = cmrt_device->LoadProgram(p->binary, p->binary_sz, cmrt_program, options);
+ if (result != CM_SUCCESS)
+ return CL_COMPILE_PROGRAM_FAILURE;
+
+ p->cmrt_program = cmrt_program;
+ cmrt_program->GetKernelCount(p->ker_n);
+ return CL_SUCCESS;
+}
+
+cl_int cmrt_destroy_program(cl_program p)
+{
+ CmDevice* cmrt_device = (CmDevice*)(p->ctx->device->cmrt_device);
+ CmProgram*& cmrt_program = (CmProgram*&)(p->cmrt_program);
+ if (cmrt_device->DestroyProgram(cmrt_program) != CM_SUCCESS)
+ return CL_INVALID_PROGRAM;
+ return CL_SUCCESS;
+}
+
+cl_int cmrt_destroy_device(cl_device_id device)
+{
+ CmDevice*& cmrt_device = (CmDevice*&)(device->cmrt_device);
+ if ((*pfnDestroyCmDevice)(cmrt_device) != CM_SUCCESS)
+ return CL_INVALID_DEVICE;
+ return CL_SUCCESS;
+}
+
+void* cmrt_create_kernel(cl_program p, const char *name)
+{
+ CmDevice* cmrt_device = (CmDevice*)(p->ctx->device->cmrt_device);
+ CmKernel* cmrt_kernel = NULL;
+ int result = cmrt_device->CreateKernel((CmProgram*)(p->cmrt_program), name, cmrt_kernel);
+ if (result != CM_SUCCESS)
+ return NULL;
+
+ return cmrt_kernel;
+}
+
+cl_int cmrt_destroy_kernel(cl_kernel k)
+{
+ CmDevice* cmrt_device = (CmDevice*)(k->program->ctx->device->cmrt_device);
+ CmKernel*& cmrt_kernel = (CmKernel*&)(k->cmrt_kernel);
+ if (cmrt_device->DestroyKernel(cmrt_kernel) != CM_SUCCESS)
+ return CL_INVALID_KERNEL;
+ return CL_SUCCESS;
+}
+
+cl_int cmrt_enqueue(cl_command_queue cq, cl_kernel k, const size_t* global_work_size, const size_t* local_work_size)
+{
+ CmDevice* cmrt_device = (CmDevice*)(k->program->ctx->device->cmrt_device);
+ CmKernel* cmrt_kernel = (CmKernel*)(k->cmrt_kernel);
+
+ int result = 0;
+
+ cmrt_kernel->SetThreadCount(global_work_size[0]*global_work_size[1]);
+
+ //no need to destroy queue explicitly,
+ //and there is only one queue instance within each device,
+ //CreateQueue always returns the same instance
+ CmQueue* pCmQueue = NULL;
+ cmrt_device->CreateQueue(pCmQueue);
+
+ CmTask *pKernelArray = NULL;
+ cmrt_device->CreateTask(pKernelArray);
+
+ pKernelArray->AddKernel(cmrt_kernel);
+
+ CmEvent* e = NULL;
+
+ if (local_work_size == NULL) {
+ CmThreadSpace* pTS = NULL;
+ cmrt_device->CreateThreadSpace(global_work_size[0], global_work_size[1], pTS);
+ result = pCmQueue->Enqueue(pKernelArray, e, pTS);
+ } else {
+ CmThreadGroupSpace* pTGS = NULL;
+ cmrt_device->CreateThreadGroupSpace(global_work_size[0], global_work_size[1], local_work_size[0], local_work_size[1], pTGS);
+ result = pCmQueue->EnqueueWithGroup(pKernelArray, e, pTGS);
+ cmrt_device->DestroyThreadGroupSpace(pTGS);
+ }
+
+ if (result != CM_SUCCESS)
+ return CL_INVALID_OPERATION;
+
+ cmrt_device->DestroyTask(pKernelArray);
+
+ CmEvent*& olde = (CmEvent*&)cq->cmrt_event;
+ if (olde != NULL)
+ pCmQueue->DestroyEvent(e);
+
+ cq->cmrt_event = e;
+
+ return CL_SUCCESS;
+}
+
+static VA_CM_FORMAT GetCmrtFormat(_cl_mem_image* image)
+{
+ switch (image->intel_fmt)
+ {
+ case I965_SURFACEFORMAT_B8G8R8A8_UNORM:
+ return VA_CM_FMT_A8R8G8B8;
+ case I965_SURFACEFORMAT_B8G8R8X8_UNORM:
+ return VA_CM_FMT_X8R8G8B8;
+ case I965_SURFACEFORMAT_A8_UNORM:
+ return VA_CM_FMT_A8;
+ case I965_SURFACEFORMAT_R10G10B10A2_UNORM:
+ return VA_CM_FMT_A2B10G10R10;
+ case I965_SURFACEFORMAT_R16G16B16A16_UNORM:
+ return VA_CM_FMT_A16B16G16R16;
+ case I965_SURFACEFORMAT_L8_UNORM:
+ return VA_CM_FMT_L8;
+ case I965_SURFACEFORMAT_R16_UINT:
+ return VA_CM_FMT_R16U;
+ case I965_SURFACEFORMAT_R8_UNORM:
+ return VA_CM_FMT_R8U;
+ case I965_SURFACEFORMAT_L16_UNORM:
+ return VA_CM_FMT_L16;
+ case I965_SURFACEFORMAT_R32_FLOAT:
+ return VA_CM_FMT_R32F;
+ default:
+ return VA_CM_FMT_UNKNOWN;
+ }
+}
+
+static bool CreateCmrtMemory(cl_mem mem)
+{
+ if (mem->cmrt_mem != NULL)
+ return true;
+
+ CmDevice* cmrt_device = (CmDevice*)(mem->ctx->device->cmrt_device);
+ int result;
+ CmOsResource osResource;
+ osResource.bo_size = mem->size;
+ osResource.bo_flags = DRM_BO_HANDLE;
+ osResource.bo = (drm_intel_bo*)mem->bo;
+ if (IS_IMAGE(mem)) {
+ _cl_mem_image* image = cl_mem_image(mem);
+ if (CL_MEM_OBJECT_IMAGE2D != image->image_type)
+ return CL_INVALID_ARG_VALUE;
+ osResource.format = GetCmrtFormat(image);
+ if (osResource.format == VA_CM_FMT_UNKNOWN)
+ return false;
+ osResource.aligned_width = image->row_pitch;
+ osResource.aligned_height = mem->size / image->row_pitch;
+ osResource.pitch = image->row_pitch;
+ osResource.tile_type = image->tiling;
+ osResource.orig_width = image->w;
+ osResource.orig_height = image->h;
+ CmSurface2D*& cmrt_surface2d = (CmSurface2D*&)(mem->cmrt_mem);
+ result = cmrt_device->CreateSurface2D(&osResource, cmrt_surface2d);
+ mem->cmrt_mem_type = CMRT_SURFACE2D;
+ } else {
+ osResource.format = VA_CM_FMT_BUFFER;
+ osResource.buf_bytes = mem->size;
+ CmBuffer*& cmrt_buffer = (CmBuffer*&)(mem->cmrt_mem);
+ result = cmrt_device->CreateBuffer(&osResource, cmrt_buffer);
+ mem->cmrt_mem_type = CMRT_BUFFER;
+ }
+
+ if (result != CM_SUCCESS)
+ return false;
+
+ return true;
+}
+
+cl_int cmrt_set_kernel_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
+{
+ if(value == NULL)
+ return CL_INVALID_ARG_VALUE;
+
+ CmKernel* cmrt_kernel = (CmKernel*)(k->cmrt_kernel);
+
+ WORD argKind = -1;
+ if (cmrt_kernel->GetArgKind(index, argKind) != CM_SUCCESS)
+ return CL_INVALID_ARG_INDEX;
+
+ int result;
+ if (argKind == ARG_KIND_GENERAL)
+ result = cmrt_kernel->SetKernelArg(index, sz, value);
+ else {
+ cl_mem mem = *(cl_mem*)value;
+ if (mem->magic == CL_MAGIC_MEM_HEADER) {
+ if (!CreateCmrtMemory(mem))
+ return CL_INVALID_ARG_VALUE;
+
+ SurfaceIndex * memIndex = NULL;
+ if (mem->cmrt_mem_type == CMRT_BUFFER) {
+ CmBuffer* cmrt_buffer = (CmBuffer*)(mem->cmrt_mem);
+ cmrt_buffer->GetIndex(memIndex);
+ } else {
+ CmSurface2D* cmrt_surface2d = (CmSurface2D*)(mem->cmrt_mem);
+ cmrt_surface2d->GetIndex(memIndex);
+ }
+ result = cmrt_kernel->SetKernelArg(index, sizeof(SurfaceIndex), memIndex);
+ } else
+ return CL_INVALID_ARG_VALUE;
+ }
+
+ if (result != CM_SUCCESS)
+ return CL_INVALID_KERNEL_ARGS;
+
+ return CL_SUCCESS;
+}
+
+cl_int cmrt_destroy_memory(cl_mem mem)
+{
+ CmDevice* cmrt_device = (CmDevice*)(mem->ctx->device->cmrt_device);
+ if (mem->cmrt_mem_type == CMRT_BUFFER) {
+ CmBuffer*& cmrt_buffer = (CmBuffer*&)(mem->cmrt_mem);
+ cmrt_device->DestroySurface(cmrt_buffer);
+ } else {
+ CmSurface2D*& cmrt_surface2d = (CmSurface2D*&)(mem->cmrt_mem);
+ cmrt_device->DestroySurface(cmrt_surface2d);
+ }
+ return CL_SUCCESS;
+}
+
+cl_int cmrt_destroy_event(cl_command_queue cq)
+{
+ CmEvent*& cmrt_event = (CmEvent*&)(cq->cmrt_event);
+ CmDevice* cmrt_device = (CmDevice*)(cq->ctx->device->cmrt_device);
+ CmQueue* pCmQueue = NULL;
+ cmrt_event->WaitForTaskFinished();
+ cmrt_device->CreateQueue(pCmQueue);
+ pCmQueue->DestroyEvent(cmrt_event);
+ return CL_SUCCESS;
+}
+
+cl_int cmrt_wait_for_task_finished(cl_command_queue cq)
+{
+ CmEvent* cmrt_event = (CmEvent*)(cq->cmrt_event);
+ cmrt_event->WaitForTaskFinished();
+ return CL_SUCCESS;
+}
diff --git a/src/cl_cmrt.h b/src/cl_cmrt.h
new file mode 100644
index 0000000..316095c
--- /dev/null
+++ b/src/cl_cmrt.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright @2015 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Guo Yejun <yejun.guo at intel.com>
+ */
+
+#ifndef __CL_CMRT_H__
+#define __CL_CMRT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "cl_kernel.h"
+#include "cl_program.h"
+
+cl_int cmrt_build_program(cl_program p, const char *options);
+cl_int cmrt_destroy_program(cl_program p);
+cl_int cmrt_destroy_device(cl_device_id device);
+void* cmrt_create_kernel(cl_program p, const char *name);
+cl_int cmrt_destroy_kernel(cl_kernel k);
+cl_int cmrt_enqueue(cl_command_queue cq, cl_kernel k, const size_t* global_work_size, const size_t* local_work_size);
+cl_int cmrt_set_kernel_arg(cl_kernel k, cl_uint index, size_t sz, const void *value);
+cl_int cmrt_destroy_memory(cl_mem mem);
+cl_int cmrt_destroy_event(cl_command_queue cq);
+cl_int cmrt_wait_for_task_finished(cl_command_queue cq);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 50436fc..b66928f 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -31,6 +31,7 @@
#include "cl_khr_icd.h"
#include "cl_event.h"
#include "performance.h"
+#include "cl_cmrt.h"
#include <assert.h>
#include <stdio.h>
@@ -47,6 +48,7 @@ cl_command_queue_new(cl_context ctx)
queue->magic = CL_MAGIC_QUEUE_HEADER;
queue->ref_n = 1;
queue->ctx = ctx;
+ queue->cmrt_event = NULL;
if ((queue->thread_data = cl_thread_data_create()) == NULL) {
goto error;
}
@@ -76,6 +78,11 @@ cl_command_queue_delete(cl_command_queue queue)
assert(queue);
if (atomic_dec(&queue->ref_n) != 1) return;
+#ifdef HAS_CMRT
+ if (queue->cmrt_event != NULL)
+ cmrt_destroy_event(queue);
+#endif
+
// If there is a list of valid events, we need to give them
// a chance to call the call-back function.
cl_event_update_last_events(queue,1);
@@ -95,6 +102,7 @@ cl_command_queue_delete(cl_command_queue queue)
cl_mem_delete(queue->perf);
cl_context_delete(queue->ctx);
cl_free(queue->wait_events);
+ cl_free(queue->barrier_events);
queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
cl_free(queue);
}
@@ -133,19 +141,25 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
struct _cl_mem_image *image;
assert(interp_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
- //currently, user ptr is not supported for cl image, so offset should be always zero
- assert(k->args[id].mem->offset == 0);
-
image = cl_mem_image(k->args[id].mem);
set_image_info(k->curbe, &k->images[i], image);
- cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
- image->intel_fmt, image->image_type, image->bpp,
- image->w, image->h, image->depth,
- image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+ if(k->vme){
+ if( (image->fmt.image_channel_order != CL_R) || (image->fmt.image_channel_data_type != CL_UNORM_INT8) )
+ return CL_IMAGE_FORMAT_NOT_SUPPORTED;
+ cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
+ image->intel_fmt, image->image_type, image->bpp,
+ image->w, image->h, image->depth,
+ image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+ }
+ else
+ cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
+ image->intel_fmt, image->image_type, image->bpp,
+ image->w, image->h, image->depth,
+ image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
// TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
// on demand.
if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
- cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset,
+ cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset + k->args[id].mem->offset,
image->intel_fmt, image->image_type, image->bpp,
image->w, image->h, image->depth,
image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
@@ -162,11 +176,13 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
uint32_t i;
enum gbe_arg_type arg_type; /* kind of argument */
for (i = 0; i < k->arg_n; ++i) {
- uint32_t offset; // location of the address in the curbe
+ int32_t offset; // location of the address in the curbe
arg_type = interp_kernel_get_arg_type(k->opaque, i);
if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
continue;
offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
+ if (offset < 0)
+ continue;
if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
@@ -218,31 +234,28 @@ error:
LOCAL int
cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
{
- size_t global_wk_sz[3];
- size_t outbuf_sz = 0;
- void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz, &outbuf_sz);
+ void* printf_info = cl_gpgpu_get_printf_info(gpgpu);
+ void* profiling_info;
if (cl_gpgpu_flush(gpgpu) < 0)
return CL_OUT_OF_RESOURCES;
if (printf_info && interp_get_printf_num(printf_info)) {
- void *index_addr = cl_gpgpu_map_printf_buffer(gpgpu, 0);
- void *buf_addr = NULL;
- if (interp_get_printf_sizeof_size(printf_info))
- buf_addr = cl_gpgpu_map_printf_buffer(gpgpu, 1);
-
- interp_output_printf(printf_info, index_addr, buf_addr, global_wk_sz[0],
- global_wk_sz[1], global_wk_sz[2], outbuf_sz);
-
- cl_gpgpu_unmap_printf_buffer(gpgpu, 0);
- if (interp_get_printf_sizeof_size(printf_info))
- cl_gpgpu_unmap_printf_buffer(gpgpu, 1);
+ void *addr = cl_gpgpu_map_printf_buffer(gpgpu);
+ interp_output_printf(printf_info, addr);
+ cl_gpgpu_unmap_printf_buffer(gpgpu);
}
if (printf_info) {
interp_release_printf_info(printf_info);
- global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
- cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
+ cl_gpgpu_set_printf_info(gpgpu, NULL);
+ }
+
+ /* If we have profiling info, output it. */
+ profiling_info = cl_gpgpu_get_profiling_info(gpgpu);
+ if (profiling_info) {
+ interp_output_profiling(profiling_info, cl_gpgpu_map_profiling_buffer(gpgpu));
+ cl_gpgpu_unmap_profiling_buffer(gpgpu);
}
return CL_SUCCESS;
}
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 2cd6739..d1b8c44 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -44,6 +44,8 @@ struct _cl_command_queue {
cl_command_queue prev, next; /* We chain the command queues together */
void *thread_data; /* Used to store thread context data */
cl_mem perf; /* Where to put the perf counters */
+
+ void* cmrt_event; /* the latest CmEvent* of the command queue */
};
/* The macro to get the thread specified gpgpu struct. */
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index bbb04ab..6a9cf1f 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -46,8 +46,9 @@ cl_set_varying_payload(const cl_kernel ker,
{
uint32_t *ids[3] = {NULL,NULL,NULL};
uint16_t *block_ips = NULL;
+ uint32_t *thread_ids = NULL;
size_t i, j, k, curr = 0;
- int32_t id_offset[3], ip_offset;
+ int32_t id_offset[3], ip_offset, tid_offset;
cl_int err = CL_SUCCESS;
int32_t dw_ip_offset = -1;
@@ -55,18 +56,23 @@ cl_set_varying_payload(const cl_kernel ker,
id_offset[1] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
id_offset[2] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
+ tid_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_THREAD_ID, 0);
if (ip_offset < 0)
dw_ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_DW_BLOCK_IP, 0);
assert(ip_offset < 0 || dw_ip_offset < 0);
- assert(id_offset[0] >= 0 &&
- id_offset[1] >= 0 &&
- id_offset[2] >= 0 &&
- (ip_offset >= 0 || dw_ip_offset >= 0));
-
- TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
- TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
- TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+ assert(ip_offset >= 0 || dw_ip_offset >= 0);
+
+ if (id_offset[0] >= 0)
+ TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+ if (id_offset[1] >= 0)
+ TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+ if (id_offset[2] >= 0)
+ TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));
+ if (tid_offset >= 0) {
+ TRY_ALLOC(thread_ids, (uint32_t*) alloca(sizeof(uint32_t)*thread_n));
+ memset(thread_ids, 0, sizeof(uint32_t)*thread_n);
+ }
/* 0xffff means that the lane is inactivated */
memset(block_ips, 0xff, sizeof(int16_t)*thread_n*simd_sz);
@@ -75,10 +81,15 @@ cl_set_varying_payload(const cl_kernel ker,
for (k = 0; k < local_wk_sz[2]; ++k)
for (j = 0; j < local_wk_sz[1]; ++j)
for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
- ids[0][curr] = i;
- ids[1][curr] = j;
- ids[2][curr] = k;
+ if (id_offset[0] >= 0)
+ ids[0][curr] = i;
+ if (id_offset[1] >= 0)
+ ids[1][curr] = j;
+ if (id_offset[2] >= 0)
+ ids[2][curr] = k;
block_ips[curr] = 0;
+ if (thread_ids)
+ thread_ids[curr/simd_sz] = curr/simd_sz;
}
/* Copy them to the curbe buffer */
@@ -89,10 +100,17 @@ cl_set_varying_payload(const cl_kernel ker,
uint32_t *ids2 = (uint32_t *) (data + id_offset[2]);
uint16_t *ips = (uint16_t *) (data + ip_offset);
uint32_t *dw_ips = (uint32_t *) (data + dw_ip_offset);
+
+ if (thread_ids)
+ *(uint32_t *)(data + tid_offset) = thread_ids[i];
+
for (j = 0; j < simd_sz; ++j, ++curr) {
- ids0[j] = ids[0][curr];
- ids1[j] = ids[1][curr];
- ids2[j] = ids[2][curr];
+ if (id_offset[0] >= 0)
+ ids0[j] = ids[0][curr];
+ if (id_offset[1] >= 0)
+ ids1[j] = ids[1][curr];
+ if (id_offset[2] >= 0)
+ ids2[j] = ids[2][curr];
if (ip_offset >= 0)
ips[j] = block_ips[curr];
if (dw_ip_offset >= 0)
@@ -167,7 +185,8 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
uint32_t alignment = interp_kernel_get_arg_align(ker->opaque, arg);
offset = ALIGN(offset, alignment);
curbe_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
- assert(curbe_offset >= 0);
+ if (curbe_offset < 0)
+ continue;
*(uint32_t *) (ker->curbe + curbe_offset) = offset;
cl_buffer_map(mem->bo, 1);
@@ -210,23 +229,6 @@ cl_curbe_fill(cl_kernel ker,
UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
#undef UPLOAD
- /* get_sub_group_id needs it */
- if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LANE_ID, 0)) >= 0) {
- const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
- uint32_t *laneid = (uint32_t *) (ker->curbe + offset);
- int32_t i;
- for (i = 0; i < (int32_t) simd_sz; ++i) laneid[i] = i;
- }
-
- /* Write identity for the stack pointer. This is required by the stack pointer
- * computation in the kernel
- */
- if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) {
- const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
- uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
- int32_t i;
- for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
- }
/* Handle the various offsets to SLM */
const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque);
@@ -239,7 +241,8 @@ cl_curbe_fill(cl_kernel ker,
assert(align != 0);
slm_offset = ALIGN(slm_offset, align);
offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
- assert(offset >= 0);
+ if (offset < 0)
+ continue;
uint32_t *slmptr = (uint32_t *) (ker->curbe + offset);
*slmptr = slm_offset;
slm_offset += ker->args[arg].local_sz;
@@ -279,30 +282,48 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
}
static int
-cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num, size_t global_sz) {
- int32_t value = GBE_CURBE_PRINTF_INDEX_POINTER;
- int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
- size_t buf_size = global_sz * sizeof(int) * printf_num;
- if (offset > 0) {
- if (cl_gpgpu_set_printf_buffer(gpgpu, 0, buf_size*2, offset, interp_get_printf_indexbuf_bti(printf_info)) != 0)
- return -1;
+cl_bind_profiling(cl_gpgpu gpgpu, uint32_t simd_sz, cl_kernel ker, size_t global_sz, size_t local_sz, uint32_t bti) {
+ int32_t offset;
+ int i = 0;
+ int thread_num;
+ if (simd_sz == 16) {
+ for(i = 0; i < 3; i++) {
+ offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_PROFILING_TIMESTAMP0 + i, 0);
+ assert(offset >= 0);
+ memset(ker->curbe + offset, 0x0, sizeof(uint32_t)*8*2);
+ thread_num = (local_sz + 15)/16;
+ }
+ } else {
+ assert(simd_sz == 8);
+ for(i = 0; i < 5; i++) {
+ offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_PROFILING_TIMESTAMP0 + i, 0);
+ assert(offset >= 0);
+ memset(ker->curbe + offset, 0x0, sizeof(uint32_t)*8);
+ thread_num = (local_sz + 7)/8;
+ }
}
- value = GBE_CURBE_PRINTF_BUF_POINTER;
- offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
- buf_size = interp_get_printf_sizeof_size(printf_info) * global_sz;
- /* because of the printf may exist in a loop, which loop number can not be gotten by
- static analysis. So we set the data buffer as big as we can. Out of bound printf
- info will be discarded. */
- if (buf_size < 1*1024)
+ offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_PROFILING_BUF_POINTER, 0);
+ thread_num = thread_num*(global_sz/local_sz);
+ if (cl_gpgpu_set_profiling_buffer(gpgpu, thread_num*128 + 4, offset, bti))
+ return -1;
+
+ return 0;
+}
+
+
+static int
+cl_alloc_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num, size_t global_sz) {
+ /* A guessed size. */
+ size_t buf_size = global_sz * sizeof(int) * 16 * printf_num;
+ if (buf_size > 16*1024*1024) //at most.
+ buf_size = 16*1024*1024;
+ if (buf_size < 1*1024*1024) // at least.
buf_size = 1*1024*1024;
- else
- buf_size = 16*1024*1024; //at most.
- if (offset > 0) {
- if (cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset, interp_get_printf_buf_bti(printf_info)) != 0)
- return -1;
- }
+ if (cl_gpgpu_set_printf_buffer(gpgpu, buf_size, interp_get_printf_buf_bti(printf_info)) != 0)
+ return -1;
+
return 0;
}
@@ -338,27 +359,27 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Compute the number of HW threads we need */
if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz) != CL_SUCCESS)) {
- fprintf(stderr, "Beignet: Work group size exceed Kernel's work group size.\n");
+ DEBUGP(DL_ERROR, "Work group size exceed Kernel's work group size.");
return err;
}
kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
kernel.curbe_sz = cst_sz;
if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
- fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz);
+ DEBUGP(DL_ERROR, "Out of scratch memory %d.", scratch_sz);
return CL_OUT_OF_RESOURCES;
}
/* Curbe step 1: fill the constant urb buffer data shared by all threads */
if (ker->curbe) {
kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
- fprintf(stderr, "Beignet: Out of shared local memory %d.\n", kernel.slm_sz);
+ DEBUGP(DL_ERROR, "Out of shared local memory %d.", kernel.slm_sz);
return CL_OUT_OF_RESOURCES;
}
}
printf_info = interp_dup_printfset(ker->opaque);
- cl_gpgpu_set_printf_info(gpgpu, printf_info, (size_t *)global_wk_sz);
+ cl_gpgpu_set_printf_info(gpgpu, printf_info);
/* Setup the kernel */
if (queue->props & CL_QUEUE_PROFILING_ENABLE)
@@ -369,16 +390,27 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
goto error;
printf_num = interp_get_printf_num(printf_info);
if (printf_num) {
- if (cl_bind_printf(gpgpu, ker, printf_info, printf_num, global_size) != 0)
+ if (cl_alloc_printf(gpgpu, ker, printf_info, printf_num, global_size) != 0)
+ goto error;
+ }
+ if (interp_get_profiling_bti(ker->opaque) != 0) {
+ if (cl_bind_profiling(gpgpu, simd_sz, ker, global_size, local_sz, interp_get_profiling_bti(ker->opaque)))
goto error;
+ cl_gpgpu_set_profiling_info(gpgpu, interp_dup_profiling(ker->opaque));
+ } else {
+ cl_gpgpu_set_profiling_info(gpgpu, NULL);
}
/* Bind user buffers */
cl_command_queue_bind_surface(queue, ker);
/* Bind user images */
- cl_command_queue_bind_image(queue, ker);
+ if(UNLIKELY(err = cl_command_queue_bind_image(queue, ker) != CL_SUCCESS))
+ return err;
/* Bind all samplers */
- cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
+ if (ker->vme)
+ cl_gpgpu_bind_vme_state(gpgpu, ker->accel);
+ else
+ cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
if (cl_gpgpu_set_scratch(gpgpu, scratch_sz) != 0)
goto error;
diff --git a/src/cl_context.c b/src/cl_context.c
index c45e0aa..a6bde7d 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -177,6 +177,7 @@ cl_context_new(struct _cl_context_prop *props)
pthread_mutex_init(&ctx->queue_lock, NULL);
pthread_mutex_init(&ctx->buffer_lock, NULL);
pthread_mutex_init(&ctx->sampler_lock, NULL);
+ pthread_mutex_init(&ctx->accelerator_intel_lock, NULL);
exit:
return ctx;
diff --git a/src/cl_context.h b/src/cl_context.h
index ef94823..489e5d7 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -21,6 +21,7 @@
#define __CL_CONTEXT_H__
#include "CL/cl.h"
+#include "CL/cl_ext.h"
#include "cl_internals.h"
#include "cl_driver.h"
#include "cl_khr_icd.h"
@@ -107,11 +108,13 @@ struct _cl_context {
cl_program programs; /* All programs currently allocated */
cl_mem buffers; /* All memory object currently allocated */
cl_sampler samplers; /* All sampler object currently allocated */
+ cl_accelerator_intel accels; /* All accelerator_intel object currently allocated */
cl_event events; /* All event object currently allocated */
pthread_mutex_t queue_lock; /* To allocate and deallocate queues */
pthread_mutex_t program_lock; /* To allocate and deallocate programs */
pthread_mutex_t buffer_lock; /* To allocate and deallocate buffers */
pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
+ pthread_mutex_t accelerator_intel_lock; /* To allocate and deallocate accelerator_intel */
pthread_mutex_t event_lock; /* To allocate and deallocate events */
cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
/* All programs internal used, for example clEnqueuexxx api use */
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index 63e078f..f680219 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -253,6 +253,7 @@
#define PCI_CHIP_SKYLAKE_ULX_GT2 0x191E /* Intel(R) Skylake ULX - GT2 */
#define PCI_CHIP_SKYLAKE_DT_GT1 0x1902 /* Intel(R) Skylake Desktop - GT1 */
#define PCI_CHIP_SKYLAKE_DT_GT2 0x1912 /* Intel(R) Skylake Desktop - GT2 */
+#define PCI_CHIP_SKYLAKE_DT_GT4 0x1932 /* Intel(R) Skylake Desktop - GT4 */
#define PCI_CHIP_SKYLAKE_HALO_GT1 0x190B /* Intel(R) Skylake HALO - GT1 */
#define PCI_CHIP_SKYLAKE_HALO_GT2 0x191B /* Intel(R) Skylake HALO - GT2 */
#define PCI_CHIP_SKYLAKE_HALO_GT3 0x192B /* Intel(R) Skylake HALO - GT3 */
@@ -261,6 +262,9 @@
#define PCI_CHIP_SKYLAKE_SRV_GT2 0x191A /* Intel(R) Skylake Server - GT2 */
#define PCI_CHIP_SKYLAKE_SRV_GT3 0x192A /* Intel(R) Skylake Server - GT3 */
#define PCI_CHIP_SKYLAKE_SRV_GT4 0x193A /* Intel(R) Skylake Server - GT4 */
+#define PCI_CHIP_SKYLAKE_WKS_GT2 0x191D /* Intel(R) Skylake WKS - GT2 */
+#define PCI_CHIP_SKYLAKE_MEDIA_SRV_GT3 0x192D /* Intel(R) Skylake Media Server - GT3 */
+#define PCI_CHIP_SKYLAKE_WKS_GT4 0x193D /* Intel(R) Skylake WKS - GT4 */
#define IS_SKL_GT1(devid) \
(devid == PCI_CHIP_SKYLAKE_ULT_GT1 || \
@@ -275,16 +279,20 @@
devid == PCI_CHIP_SKYLAKE_ULX_GT2 || \
devid == PCI_CHIP_SKYLAKE_DT_GT2 || \
devid == PCI_CHIP_SKYLAKE_HALO_GT2 || \
- devid == PCI_CHIP_SKYLAKE_SRV_GT2)
+ devid == PCI_CHIP_SKYLAKE_SRV_GT2 || \
+ devid == PCI_CHIP_SKYLAKE_WKS_GT2)
#define IS_SKL_GT3(devid) \
(devid == PCI_CHIP_SKYLAKE_ULT_GT3 || \
devid == PCI_CHIP_SKYLAKE_HALO_GT3 || \
- devid == PCI_CHIP_SKYLAKE_SRV_GT3)
+ devid == PCI_CHIP_SKYLAKE_SRV_GT3 || \
+ devid == PCI_CHIP_SKYLAKE_MEDIA_SRV_GT3)
#define IS_SKL_GT4(devid) \
- (devid == PCI_CHIP_SKYLAKE_HALO_GT4 || \
- devid == PCI_CHIP_SKYLAKE_SRV_GT4)
+ (devid == PCI_CHIP_SKYLAKE_DT_GT4 || \
+ devid == PCI_CHIP_SKYLAKE_HALO_GT4 || \
+ devid == PCI_CHIP_SKYLAKE_SRV_GT4 || \
+ devid == PCI_CHIP_SKYLAKE_WKS_GT4)
#define IS_SKYLAKE(devid) (IS_SKL_GT1(devid) || IS_SKL_GT2(devid) || IS_SKL_GT3(devid) || IS_SKL_GT4(devid))
@@ -294,7 +302,60 @@
#define IS_BROXTON(devid) \
(devid == PCI_CHIP_BROXTON_P)
-#define IS_GEN9(devid) (IS_SKYLAKE(devid) || IS_BROXTON(devid))
+#define PCI_CHIP_KABYLAKE_ULT_GT1 0x5906
+#define PCI_CHIP_KABYLAKE_ULT_GT2 0x5916
+#define PCI_CHIP_KABYLAKE_ULT_GT3 0x5926
+#define PCI_CHIP_KABYLAKE_ULT_GT15 0x5913
+#define PCI_CHIP_KABYLAKE_ULT_GT2_1 0x5921
+#define PCI_CHIP_KABYLAKE_ULT_GT3_1 0x5923
+#define PCI_CHIP_KABYLAKE_ULT_GT3_2 0x5927
+#define PCI_CHIP_KABYLAKE_DT_GT1 0x5902
+#define PCI_CHIP_KABYLAKE_DT_GT2 0x5912
+#define PCI_CHIP_KABYLAKE_DT_GT15 0x5917
+#define PCI_CHIP_KABYLAKE_HALO_GT1 0x590B
+#define PCI_CHIP_KABYLAKE_HALO_GT2 0x591B
+#define PCI_CHIP_KABYLAKE_HALO_GT4 0x593B
+#define PCI_CHIP_KABYLAKE_HALO_GT15 0x5908
+#define PCI_CHIP_KABYLAKE_ULX_GT1 0x590E
+#define PCI_CHIP_KABYLAKE_ULX_GT2 0x591E
+#define PCI_CHIP_KABYLAKE_ULX_GT15 0x5915
+#define PCI_CHIP_KABYLAKE_SRV_GT1 0x590A
+#define PCI_CHIP_KABYLAKE_SRV_GT2 0x591A
+#define PCI_CHIP_KABYLAKE_WKS_GT2 0x591D
+
+#define IS_KBL_GT1(devid) \
+ (devid == PCI_CHIP_KABYLAKE_ULT_GT1 || \
+ devid == PCI_CHIP_KABYLAKE_DT_GT1 || \
+ devid == PCI_CHIP_KABYLAKE_HALO_GT1 || \
+ devid == PCI_CHIP_KABYLAKE_ULX_GT1 || \
+ devid == PCI_CHIP_KABYLAKE_SRV_GT1)
+
+#define IS_KBL_GT15(devid) \
+ (devid == PCI_CHIP_KABYLAKE_ULT_GT15 || \
+ devid == PCI_CHIP_KABYLAKE_DT_GT15 || \
+ devid == PCI_CHIP_KABYLAKE_HALO_GT15 || \
+ devid == PCI_CHIP_KABYLAKE_ULX_GT15)
+
+#define IS_KBL_GT2(devid) \
+ (devid == PCI_CHIP_KABYLAKE_ULT_GT2 || \
+ devid == PCI_CHIP_KABYLAKE_ULT_GT2_1 || \
+ devid == PCI_CHIP_KABYLAKE_DT_GT2 || \
+ devid == PCI_CHIP_KABYLAKE_HALO_GT2 || \
+ devid == PCI_CHIP_KABYLAKE_ULX_GT2 || \
+ devid == PCI_CHIP_KABYLAKE_SRV_GT2 || \
+ devid == PCI_CHIP_KABYLAKE_WKS_GT2)
+
+#define IS_KBL_GT3(devid) \
+ (devid == PCI_CHIP_KABYLAKE_ULT_GT3 || \
+ devid == PCI_CHIP_KABYLAKE_ULT_GT3_1 || \
+ devid == PCI_CHIP_KABYLAKE_ULT_GT3_2)
+
+#define IS_KBL_GT4(devid) \
+ (devid == PCI_CHIP_KABYLAKE_HALO_GT4)
+
+#define IS_KABYLAKE(devid) (IS_KBL_GT1(devid) || IS_KBL_GT15(devid) || IS_KBL_GT2(devid) || IS_KBL_GT3(devid) || IS_KBL_GT4(devid))
+
+#define IS_GEN9(devid) (IS_SKYLAKE(devid) || IS_BROXTON(devid) || IS_KABYLAKE(devid))
#endif /* __CL_DEVICE_DATA_H__ */
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 7b47c21..d29138d 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -27,6 +27,7 @@
#include "cl_thread.h"
#include "CL/cl.h"
#include "CL/cl_ext.h"
+#include "CL/cl_intel.h"
#include "cl_gbe_loader.h"
#include "cl_alloc.h"
@@ -116,7 +117,7 @@ static struct _cl_device_id intel_brw_gt1_device = {
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
+#include "cl_gen8_device.h"
};
static struct _cl_device_id intel_brw_gt2_device = {
@@ -127,7 +128,7 @@ static struct _cl_device_id intel_brw_gt2_device = {
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
+#include "cl_gen8_device.h"
};
static struct _cl_device_id intel_brw_gt3_device = {
@@ -138,7 +139,7 @@ static struct _cl_device_id intel_brw_gt3_device = {
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
+#include "cl_gen8_device.h"
};
//Cherryview has the same pciid, must get the max_compute_unit and max_thread_per_unit from drm
@@ -162,7 +163,7 @@ static struct _cl_device_id intel_skl_gt1_device = {
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
+#include "cl_gen9_device.h"
};
static struct _cl_device_id intel_skl_gt2_device = {
@@ -173,7 +174,7 @@ static struct _cl_device_id intel_skl_gt2_device = {
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
+#include "cl_gen9_device.h"
};
static struct _cl_device_id intel_skl_gt3_device = {
@@ -184,7 +185,7 @@ static struct _cl_device_id intel_skl_gt3_device = {
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
+#include "cl_gen9_device.h"
};
static struct _cl_device_id intel_skl_gt4_device = {
@@ -195,7 +196,7 @@ static struct _cl_device_id intel_skl_gt4_device = {
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
+#include "cl_gen9_device.h"
};
static struct _cl_device_id intel_bxt_device = {
@@ -206,7 +207,62 @@ static struct _cl_device_id intel_bxt_device = {
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
+#include "cl_gen9_device.h"
+};
+
+static struct _cl_device_id intel_kbl_gt1_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 12,
+ .max_thread_per_unit = 7,
+ .sub_slice_count = 2,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
+static struct _cl_device_id intel_kbl_gt15_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 18,
+ .max_thread_per_unit = 7,
+ .sub_slice_count = 3,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
+static struct _cl_device_id intel_kbl_gt2_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 24,
+ .max_thread_per_unit = 7,
+ .sub_slice_count = 3,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
+static struct _cl_device_id intel_kbl_gt3_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 48,
+ .max_thread_per_unit = 7,
+ .sub_slice_count = 6,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
+};
+
+static struct _cl_device_id intel_kbl_gt4_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 72,
+ .max_thread_per_unit = 7,
+ .sub_slice_count = 9,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+#include "cl_gen9_device.h"
};
LOCAL cl_device_id
@@ -417,7 +473,11 @@ brw_gt1_break:
intel_brw_gt1_device.device_id = device_id;
intel_brw_gt1_device.platform = cl_get_platform_default();
ret = &intel_brw_gt1_device;
- cl_intel_platform_enable_fp16_extension(ret);
+ cl_intel_platform_get_default_extension(ret);
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
case PCI_CHIP_BROADWLL_M_GT2:
@@ -434,7 +494,11 @@ brw_gt2_break:
intel_brw_gt2_device.device_id = device_id;
intel_brw_gt2_device.platform = cl_get_platform_default();
ret = &intel_brw_gt2_device;
- cl_intel_platform_enable_fp16_extension(ret);
+ cl_intel_platform_get_default_extension(ret);
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
case PCI_CHIP_BROADWLL_M_GT3:
@@ -453,7 +517,11 @@ brw_gt3_break:
intel_brw_gt3_device.device_id = device_id;
intel_brw_gt3_device.platform = cl_get_platform_default();
ret = &intel_brw_gt3_device;
- cl_intel_platform_enable_fp16_extension(ret);
+ cl_intel_platform_get_default_extension(ret);
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
case PCI_CHIP_CHV_0:
@@ -465,7 +533,11 @@ chv_break:
intel_chv_device.device_id = device_id;
intel_chv_device.platform = cl_get_platform_default();
ret = &intel_chv_device;
- cl_intel_platform_enable_fp16_extension(ret);
+ cl_intel_platform_get_default_extension(ret);
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
@@ -483,7 +555,11 @@ skl_gt1_break:
intel_skl_gt1_device.device_id = device_id;
intel_skl_gt1_device.platform = cl_get_platform_default();
ret = &intel_skl_gt1_device;
- cl_intel_platform_enable_fp16_extension(ret);
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
case PCI_CHIP_SKYLAKE_ULT_GT2:
@@ -498,11 +574,17 @@ skl_gt1_break:
DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Halo GT2");
case PCI_CHIP_SKYLAKE_SRV_GT2:
DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Server GT2");
+ case PCI_CHIP_SKYLAKE_WKS_GT2:
+ DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Workstation GT2");
skl_gt2_break:
intel_skl_gt2_device.device_id = device_id;
intel_skl_gt2_device.platform = cl_get_platform_default();
ret = &intel_skl_gt2_device;
- cl_intel_platform_enable_fp16_extension(ret);
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
case PCI_CHIP_SKYLAKE_ULT_GT3:
@@ -511,22 +593,36 @@ skl_gt2_break:
DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Halo GT3");
case PCI_CHIP_SKYLAKE_SRV_GT3:
DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Server GT3");
+ case PCI_CHIP_SKYLAKE_MEDIA_SRV_GT3:
+ DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Media Server GT3");
skl_gt3_break:
intel_skl_gt3_device.device_id = device_id;
intel_skl_gt3_device.platform = cl_get_platform_default();
ret = &intel_skl_gt3_device;
- cl_intel_platform_enable_fp16_extension(ret);
+ cl_intel_platform_get_default_extension(ret);
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
+ case PCI_CHIP_SKYLAKE_DT_GT4:
+ DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Desktop GT4");
case PCI_CHIP_SKYLAKE_HALO_GT4:
DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Halo GT4");
case PCI_CHIP_SKYLAKE_SRV_GT4:
DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Server GT4");
+ case PCI_CHIP_SKYLAKE_WKS_GT4:
+ DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Workstation GT4");
skl_gt4_break:
intel_skl_gt4_device.device_id = device_id;
intel_skl_gt4_device.platform = cl_get_platform_default();
ret = &intel_skl_gt4_device;
- cl_intel_platform_enable_fp16_extension(ret);
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
case PCI_CHIP_BROXTON_P:
@@ -535,7 +631,100 @@ bxt_break:
intel_bxt_device.device_id = device_id;
intel_bxt_device.platform = cl_get_platform_default();
ret = &intel_bxt_device;
- cl_intel_platform_enable_fp16_extension(ret);
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ break;
+
+ case PCI_CHIP_KABYLAKE_ULT_GT1:
+ DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake ULT GT1");
+ case PCI_CHIP_KABYLAKE_DT_GT1:
+ DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake Desktop GT1");
+ case PCI_CHIP_KABYLAKE_HALO_GT1:
+ DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake Halo GT1");
+ case PCI_CHIP_KABYLAKE_ULX_GT1:
+ DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake ULX GT1");
+ case PCI_CHIP_KABYLAKE_SRV_GT1:
+ DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake Server GT1");
+kbl_gt1_break:
+ intel_kbl_gt1_device.device_id = device_id;
+ intel_kbl_gt1_device.platform = cl_get_platform_default();
+ ret = &intel_kbl_gt1_device;
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ break;
+
+ case PCI_CHIP_KABYLAKE_ULT_GT15:
+ DECL_INFO_STRING(kbl_gt15_break, intel_kbl_gt15_device, name, "Intel(R) HD Graphics Kabylake ULT GT1.5");
+ case PCI_CHIP_KABYLAKE_DT_GT15:
+ DECL_INFO_STRING(kbl_gt15_break, intel_kbl_gt15_device, name, "Intel(R) HD Graphics Kabylake Desktop GT1.5");
+ case PCI_CHIP_KABYLAKE_HALO_GT15:
+ DECL_INFO_STRING(kbl_gt15_break, intel_kbl_gt15_device, name, "Intel(R) HD Graphics Kabylake Halo GT1.5");
+ case PCI_CHIP_KABYLAKE_ULX_GT15:
+ DECL_INFO_STRING(kbl_gt15_break, intel_kbl_gt15_device, name, "Intel(R) HD Graphics Kabylake ULX GT1.5");
+kbl_gt15_break:
+ intel_kbl_gt15_device.device_id = device_id;
+ intel_kbl_gt15_device.platform = cl_get_platform_default();
+ ret = &intel_kbl_gt15_device;
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ break;
+
+ case PCI_CHIP_KABYLAKE_ULT_GT2:
+ case PCI_CHIP_KABYLAKE_ULT_GT2_1:
+ DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake ULT GT2");
+ case PCI_CHIP_KABYLAKE_DT_GT2:
+ DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake Desktop GT2");
+ case PCI_CHIP_KABYLAKE_HALO_GT2:
+ DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake Halo GT2");
+ case PCI_CHIP_KABYLAKE_ULX_GT2:
+ DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake ULX GT2");
+ case PCI_CHIP_KABYLAKE_SRV_GT2:
+ DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake Server GT2");
+ case PCI_CHIP_KABYLAKE_WKS_GT2:
+ DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake Workstation GT2");
+kbl_gt2_break:
+ intel_kbl_gt2_device.device_id = device_id;
+ intel_kbl_gt2_device.platform = cl_get_platform_default();
+ ret = &intel_kbl_gt2_device;
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ break;
+
+ case PCI_CHIP_KABYLAKE_ULT_GT3:
+ case PCI_CHIP_KABYLAKE_ULT_GT3_1:
+ case PCI_CHIP_KABYLAKE_ULT_GT3_2:
+ DECL_INFO_STRING(kbl_gt3_break, intel_kbl_gt3_device, name, "Intel(R) HD Graphics Kabylake ULT GT3");
+kbl_gt3_break:
+ intel_kbl_gt3_device.device_id = device_id;
+ intel_kbl_gt3_device.platform = cl_get_platform_default();
+ ret = &intel_kbl_gt3_device;
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ break;
+
+ case PCI_CHIP_KABYLAKE_HALO_GT4:
+ DECL_INFO_STRING(kbl_gt4_break, intel_kbl_gt4_device, name, "Intel(R) HD Graphics Kabylake ULT GT4");
+kbl_gt4_break:
+ intel_kbl_gt4_device.device_id = device_id;
+ intel_kbl_gt4_device.platform = cl_get_platform_default();
+ ret = &intel_kbl_gt4_device;
+#ifdef ENABLE_FP64
+ cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
+#endif
+ cl_intel_platform_get_default_extension(ret);
+ cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
break;
case PCI_CHIP_SANDYBRIDGE_BRIDGE:
@@ -568,13 +757,24 @@ bxt_break:
/* Apply any driver-dependent updates to the device info */
cl_driver_update_device_info(ret);
+ #define toMB(size) (size)&(0xfffffffffffffff<<20)
+ /* Get the global_mem_size and max_mem_alloc size from the
+ * driver, system RAM and hardware */
struct sysinfo info;
if (sysinfo(&info) == 0) {
- uint64_t two_gb = 2 * 1024 * 1024 * 1024ul;
+ uint64_t totalgpumem = ret->global_mem_size;
+ uint64_t maxallocmem = ret->max_mem_alloc_size;
uint64_t totalram = info.totalram * info.mem_unit;
- ret->global_mem_size = (totalram > two_gb) ?
- two_gb : info.totalram;
- ret->max_mem_alloc_size = ret->global_mem_size / 2;
+ /* To keep the system stable we just use half
+ * of the RAM as global mem */
+ ret->global_mem_size = toMB((totalram / 2 > totalgpumem) ?
+ totalgpumem: totalram / 2);
+ /* The hardware has some limit on the alloc size
+ * and the execution of a kernel needs some global mem,
+ * so we make sure a single allocation does not use more
+ * than 3/4 of global mem */
+ ret->max_mem_alloc_size = toMB((ret->global_mem_size * 3 / 4 > maxallocmem) ?
+ maxallocmem: ret->global_mem_size * 3 / 4);
}
return ret;
@@ -609,6 +809,8 @@ cl_self_test(cl_device_id device, cl_self_test_res atomic_in_l3_flag)
return ret;
tested = 1;
ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
+ if(!ctx)
+ return ret;
cl_driver_set_atomic_flag(ctx->drv, atomic_in_l3_flag);
if (status == CL_SUCCESS) {
queue = clCreateCommandQueue(ctx, device, 0, &status);
@@ -732,6 +934,29 @@ cl_get_device_ids(cl_platform_id platform,
memcpy(param_value, device->FIELD, device->JOIN(FIELD,_sz)); \
return CL_SUCCESS;
+LOCAL cl_bool is_gen_device(cl_device_id device) {
+ return device == &intel_ivb_gt1_device ||
+ device == &intel_ivb_gt2_device ||
+ device == &intel_baytrail_t_device ||
+ device == &intel_hsw_gt1_device ||
+ device == &intel_hsw_gt2_device ||
+ device == &intel_hsw_gt3_device ||
+ device == &intel_brw_gt1_device ||
+ device == &intel_brw_gt2_device ||
+ device == &intel_brw_gt3_device ||
+ device == &intel_chv_device ||
+ device == &intel_skl_gt1_device ||
+ device == &intel_skl_gt2_device ||
+ device == &intel_skl_gt3_device ||
+ device == &intel_skl_gt4_device ||
+ device == &intel_bxt_device ||
+ device == &intel_kbl_gt1_device ||
+ device == &intel_kbl_gt15_device ||
+ device == &intel_kbl_gt2_device ||
+ device == &intel_kbl_gt3_device ||
+ device == &intel_kbl_gt4_device;
+}
+
LOCAL cl_int
cl_get_device_info(cl_device_id device,
cl_device_info param_name,
@@ -739,22 +964,7 @@ cl_get_device_info(cl_device_id device,
void * param_value,
size_t * param_value_size_ret)
{
- if (UNLIKELY(device != &intel_ivb_gt1_device &&
- device != &intel_ivb_gt2_device &&
- device != &intel_baytrail_t_device &&
- device != &intel_hsw_gt1_device &&
- device != &intel_hsw_gt2_device &&
- device != &intel_hsw_gt3_device &&
- device != &intel_brw_gt1_device &&
- device != &intel_brw_gt2_device &&
- device != &intel_brw_gt3_device &&
- device != &intel_chv_device &&
- device != &intel_skl_gt1_device &&
- device != &intel_skl_gt2_device &&
- device != &intel_skl_gt3_device &&
- device != &intel_skl_gt4_device &&
- device != &intel_bxt_device
- ))
+ if (UNLIKELY(is_gen_device(device) == CL_FALSE))
return CL_INVALID_DEVICE;
/* Find the correct parameter */
@@ -833,6 +1043,8 @@ cl_get_device_info(cl_device_id device,
DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain)
DECL_FIELD(PARTITION_TYPE, partition_type)
DECL_FIELD(REFERENCE_COUNT, device_reference_count)
+ DECL_FIELD(IMAGE_PITCH_ALIGNMENT, image_pitch_alignment)
+ DECL_FIELD(IMAGE_BASE_ADDRESS_ALIGNMENT, image_base_address_alignment)
case CL_DRIVER_VERSION:
if (param_value_size_ret) {
@@ -852,22 +1064,7 @@ cl_get_device_info(cl_device_id device,
LOCAL cl_int
cl_device_get_version(cl_device_id device, cl_int *ver)
{
- if (UNLIKELY(device != &intel_ivb_gt1_device &&
- device != &intel_ivb_gt2_device &&
- device != &intel_baytrail_t_device &&
- device != &intel_hsw_gt1_device &&
- device != &intel_hsw_gt2_device &&
- device != &intel_hsw_gt3_device &&
- device != &intel_brw_gt1_device &&
- device != &intel_brw_gt2_device &&
- device != &intel_brw_gt3_device &&
- device != &intel_chv_device &&
- device != &intel_skl_gt1_device &&
- device != &intel_skl_gt2_device &&
- device != &intel_skl_gt3_device &&
- device != &intel_skl_gt4_device &&
- device != &intel_bxt_device
- ))
+ if (UNLIKELY(is_gen_device(device) == CL_FALSE))
return CL_INVALID_DEVICE;
if (ver == NULL)
return CL_SUCCESS;
@@ -883,7 +1080,9 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
*ver = 8;
} else if (device == &intel_skl_gt1_device || device == &intel_skl_gt2_device
|| device == &intel_skl_gt3_device || device == &intel_skl_gt4_device
- || device == &intel_bxt_device) {
+ || device == &intel_bxt_device || device == &intel_kbl_gt1_device
+ || device == &intel_kbl_gt2_device || device == &intel_kbl_gt3_device
+ || device == &intel_kbl_gt4_device || device == &intel_kbl_gt15_device) {
*ver = 9;
} else
return CL_INVALID_VALUE;
@@ -958,21 +1157,7 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
{
int err = CL_SUCCESS;
int dimension = 0;
- if (UNLIKELY(device != &intel_ivb_gt1_device &&
- device != &intel_ivb_gt2_device &&
- device != &intel_baytrail_t_device &&
- device != &intel_hsw_gt1_device &&
- device != &intel_hsw_gt2_device &&
- device != &intel_hsw_gt3_device &&
- device != &intel_brw_gt1_device &&
- device != &intel_brw_gt2_device &&
- device != &intel_brw_gt3_device &&
- device != &intel_chv_device &&
- device != &intel_skl_gt1_device &&
- device != &intel_skl_gt2_device &&
- device != &intel_skl_gt3_device &&
- device != &intel_skl_gt4_device &&
- device != &intel_bxt_device))
+ if (UNLIKELY(is_gen_device(device) == CL_FALSE))
return CL_INVALID_DEVICE;
CHECK_KERNEL(kernel);
@@ -1032,3 +1217,85 @@ error:
return err;
}
+LOCAL cl_int
+cl_get_kernel_subgroup_info(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t input_value_size,
+ const void* input_value,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_value_size_ret)
+{
+ int err = CL_SUCCESS;
+ if(device != NULL)
+ if (kernel->program->ctx->device != device)
+ return CL_INVALID_DEVICE;
+
+ CHECK_KERNEL(kernel);
+ switch (param_name) {
+ case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR:
+ {
+ int i, dim = 0;
+ size_t local_sz = 1;
+ if (param_value && param_value_size < sizeof(size_t))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(size_t);
+ switch (input_value_size)
+ {
+ case sizeof(size_t)*1:
+ case sizeof(size_t)*2:
+ case sizeof(size_t)*3:
+ dim = input_value_size/sizeof(size_t);
+ break;
+ default: return CL_INVALID_VALUE;
+ }
+ if (input_value == NULL )
+ return CL_INVALID_VALUE;
+ for(i = 0; i < dim; i++)
+ local_sz *= ((size_t*)input_value)[i];
+ if (param_value) {
+ size_t simd_sz = cl_kernel_get_simd_width(kernel);
+ size_t sub_group_size = local_sz >= simd_sz? simd_sz : local_sz;
+ *(size_t*)param_value = sub_group_size;
+ return CL_SUCCESS;
+ }
+ break;
+ }
+ case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR:
+ {
+ int i, dim = 0;
+ size_t local_sz = 1;
+ if (param_value && param_value_size < sizeof(size_t))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(size_t);
+ switch (input_value_size)
+ {
+ case sizeof(size_t)*1:
+ case sizeof(size_t)*2:
+ case sizeof(size_t)*3:
+ dim = input_value_size/sizeof(size_t);
+ break;
+ default: return CL_INVALID_VALUE;
+ }
+ if (input_value == NULL )
+ return CL_INVALID_VALUE;
+ for(i = 0; i < dim; i++)
+ local_sz *= ((size_t*)input_value)[i];
+ if (param_value) {
+ size_t simd_sz = cl_kernel_get_simd_width(kernel);
+ size_t sub_group_num = (local_sz + simd_sz - 1) / simd_sz;
+ *(size_t*)param_value = sub_group_num;
+ return CL_SUCCESS;
+ }
+ break;
+ }
+ default:
+ return CL_INVALID_VALUE;
+ };
+
+error:
+ return err;
+}
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 46f9810..7db125b 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -20,6 +20,9 @@
#ifndef __CL_DEVICE_ID_H__
#define __CL_DEVICE_ID_H__
+#define EXTENSTION_LENGTH 512
+
+#include "cl_khr_icd.h"
/* Store complete information about the device */
struct _cl_device_id {
DEFINE_ICD(dispatch)
@@ -95,7 +98,7 @@ struct _cl_device_id {
const char *version;
const char *profile;
const char *opencl_c_version;
- const char extensions[256];
+ const char extensions[EXTENSTION_LENGTH];
const char *driver_version;
const char *spir_versions;
const char *built_in_kernels;
@@ -116,6 +119,11 @@ struct _cl_device_id {
cl_device_partition_property partition_type[3];
cl_uint device_reference_count;
uint32_t atomic_test_result;
+ uint32_t image_pitch_alignment;
+ uint32_t image_base_address_alignment;
+
+ //initialized as NULL, created only when a cmrt kernel is used
+ void* cmrt_device; //realtype: CmDevice*
};
/* Get a device from the given platform */
@@ -141,6 +149,15 @@ extern cl_int cl_get_kernel_workgroup_info(cl_kernel kernel,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
+
+extern cl_int cl_get_kernel_subgroup_info(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t input_value_size,
+ const void * input_value,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret);
/* Returns the Gen device ID */
extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
extern size_t cl_get_kernel_max_wg_sz(cl_kernel);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 4ffca09..16730db 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -23,9 +23,11 @@
#include <stdint.h>
#include <stdlib.h>
#include "cl_driver_type.h"
+#include "CL/cl_ext.h"
/* Various limitations we should remove actually */
#define GEN_MAX_SURFACES 256
#define GEN_MAX_SAMPLERS 16
+#define GEN_MAX_VME_STATES 8
/**************************************************************************
* cl_driver:
@@ -145,6 +147,9 @@ extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
+typedef void (cl_gpgpu_bind_vme_state_cb)(cl_gpgpu, cl_accelerator_intel accel);
+extern cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state;
+
/* get the default cache control value. */
typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
@@ -165,6 +170,22 @@ typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
+typedef void (cl_gpgpu_bind_image_for_vme_cb)(cl_gpgpu state,
+ uint32_t id,
+ cl_buffer obj_bo,
+ uint32_t obj_bo_offset,
+ uint32_t format,
+ uint32_t bpp,
+ uint32_t type,
+ int32_t w,
+ int32_t h,
+ int32_t depth,
+ int pitch,
+ int32_t slice_pitch,
+ cl_gpgpu_tiling tiling);
+
+extern cl_gpgpu_bind_image_for_vme_cb *cl_gpgpu_bind_image_for_vme;
+
/* Setup a stack */
typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
@@ -252,8 +273,24 @@ extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
+/* Set the profiling buffer */
+typedef int (cl_gpgpu_set_profiling_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, uint8_t);
+extern cl_gpgpu_set_profiling_buffer_cb *cl_gpgpu_set_profiling_buffer;
+
+typedef int (cl_gpgpu_set_profiling_info_cb)(cl_gpgpu, void *);
+extern cl_gpgpu_set_profiling_info_cb *cl_gpgpu_set_profiling_info;
+
+typedef void* (cl_gpgpu_get_profiling_info_cb)(cl_gpgpu);
+extern cl_gpgpu_get_profiling_info_cb *cl_gpgpu_get_profiling_info;
+
+typedef void* (cl_gpgpu_map_profiling_buffer_cb)(cl_gpgpu);
+extern cl_gpgpu_map_profiling_buffer_cb *cl_gpgpu_map_profiling_buffer;
+
+typedef void (cl_gpgpu_unmap_profiling_buffer_cb)(cl_gpgpu);
+extern cl_gpgpu_unmap_profiling_buffer_cb *cl_gpgpu_unmap_profiling_buffer;
+
/* Set the printf buffer */
-typedef int (cl_gpgpu_set_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, uint32_t, uint8_t);
+typedef int (cl_gpgpu_set_printf_buffer_cb)(cl_gpgpu, uint32_t, uint8_t);
extern cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer;
/* get the printf buffer offset in the apeture*/
@@ -261,23 +298,23 @@ typedef unsigned long (cl_gpgpu_reloc_printf_buffer_cb)(cl_gpgpu, uint32_t, uint
extern cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer;
/* map the printf buffer */
-typedef void* (cl_gpgpu_map_printf_buffer_cb)(cl_gpgpu, uint32_t);
+typedef void* (cl_gpgpu_map_printf_buffer_cb)(cl_gpgpu);
extern cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer;
/* unmap the printf buffer */
-typedef void (cl_gpgpu_unmap_printf_buffer_cb)(cl_gpgpu, uint32_t);
+typedef void (cl_gpgpu_unmap_printf_buffer_cb)(cl_gpgpu);
extern cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer;
/* release the printf buffer */
-typedef unsigned long (cl_gpgpu_release_printf_buffer_cb)(cl_gpgpu, uint32_t);
+typedef unsigned long (cl_gpgpu_release_printf_buffer_cb)(cl_gpgpu);
extern cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer;
/* Set the last printfset pointer */
-typedef int (cl_gpgpu_set_printf_info_cb)(cl_gpgpu, void *, size_t*);
+typedef int (cl_gpgpu_set_printf_info_cb)(cl_gpgpu, void *);
extern cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info;
/* Get the last printfset pointer */
-typedef void* (cl_gpgpu_get_printf_info_cb)(cl_gpgpu, size_t*, size_t*);
+typedef void* (cl_gpgpu_get_printf_info_cb)(cl_gpgpu);
extern cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info;
/* Will spawn all threads */
@@ -381,6 +418,12 @@ extern cl_buffer_get_fd_cb *cl_buffer_get_fd;
typedef int (cl_buffer_get_tiling_align_cb)(cl_context ctx, uint32_t tiling_mode, uint32_t dim);
extern cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align;
+typedef cl_buffer (cl_buffer_get_buffer_from_fd_cb)(cl_context ctx, int fd, int size);
+extern cl_buffer_get_buffer_from_fd_cb *cl_buffer_get_buffer_from_fd;
+
+typedef cl_buffer (cl_buffer_get_image_from_fd_cb)(cl_context ctx, int fd, int size, struct _cl_mem_image *image);
+extern cl_buffer_get_image_from_fd_cb *cl_buffer_get_image_from_fd;
+
/* Get the device id */
typedef int (cl_driver_get_device_id_cb)(void);
extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index b77acdc..31176a4 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -53,6 +53,8 @@ LOCAL cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva = NULL
LOCAL cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva = NULL;
LOCAL cl_buffer_get_fd_cb *cl_buffer_get_fd = NULL;
LOCAL cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align = NULL;
+LOCAL cl_buffer_get_buffer_from_fd_cb *cl_buffer_get_buffer_from_fd = NULL;
+LOCAL cl_buffer_get_image_from_fd_cb *cl_buffer_get_image_from_fd = NULL;
/* cl_khr_gl_sharing */
LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
@@ -69,6 +71,7 @@ LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image_for_vme = NULL;
LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
@@ -82,6 +85,7 @@ LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
+LOCAL cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state = NULL;
LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL;
@@ -90,6 +94,11 @@ LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp =
LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
LOCAL cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf = NULL;
LOCAL cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf = NULL;
+LOCAL cl_gpgpu_set_profiling_buffer_cb *cl_gpgpu_set_profiling_buffer = NULL;
+LOCAL cl_gpgpu_set_profiling_info_cb *cl_gpgpu_set_profiling_info = NULL;
+LOCAL cl_gpgpu_get_profiling_info_cb *cl_gpgpu_get_profiling_info = NULL;
+LOCAL cl_gpgpu_map_profiling_buffer_cb *cl_gpgpu_map_profiling_buffer = NULL;
+LOCAL cl_gpgpu_unmap_profiling_buffer_cb *cl_gpgpu_unmap_profiling_buffer = NULL;
LOCAL cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer = NULL;
LOCAL cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer = NULL;
LOCAL cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer = NULL;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 9e34bb8..081ffce 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -48,7 +48,10 @@ cl_int cl_enqueue_read_buffer(enqueue_data* data)
if (src_ptr == NULL)
err = CL_MAP_FAILURE;
else {
- memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
+ //sometimes an application invokes read buffer instead of map buffer, even if userptr is enabled;
+ //memcpy is not necessary for this case
+ if (data->ptr != (char*)src_ptr + data->offset + buffer->sub_offset)
+ memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
cl_mem_unmap_auto(mem);
}
}
@@ -316,8 +319,9 @@ cl_int cl_enqueue_map_image(enqueue_data *data)
if(mem->flags & CL_MEM_USE_HOST_PTR) {
assert(mem->host_ptr);
- //src and dst need add offset in function cl_mem_copy_image_region
- cl_mem_copy_image_region(data->origin, data->region,
+ if (!mem->is_userptr)
+ //src and dst need add offset in function cl_mem_copy_image_region
+ cl_mem_copy_image_region(data->origin, data->region,
mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
}
@@ -374,8 +378,9 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
row_pitch = image->slice_pitch;
else
row_pitch = image->row_pitch;
- //v_ptr have added offset, host_ptr have not added offset.
- cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
+ if (!memobj->is_userptr)
+ //v_ptr have added offset, host_ptr have not added offset.
+ cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
image, CL_FALSE, CL_TRUE);
}
diff --git a/src/cl_event.c b/src/cl_event.c
index 3391669..a2aacea 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -55,7 +55,7 @@ void cl_event_insert_last_events(cl_command_queue queue,cl_event event)
else set_last_event(queue,event);
}
-inline cl_bool
+static inline cl_bool
cl_event_is_gpu_command_type(cl_command_type type)
{
switch(type) {
@@ -76,6 +76,11 @@ cl_event_is_gpu_command_type(cl_command_type type)
int cl_event_flush(cl_event event)
{
int err = CL_SUCCESS;
+ if(!event) {
+ err = CL_INVALID_VALUE;
+ return err;
+ }
+
assert(event->gpgpu_event != NULL);
if (event->gpgpu) {
err = cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
@@ -303,7 +308,7 @@ void cl_event_new_enqueue_callback(cl_event event,
{
enqueue_callback *cb, *node;
user_event *user_events, *u_ev;
- cl_command_queue queue = event->queue;
+ cl_command_queue queue = event ? event->queue : NULL;
cl_int i;
cl_int err = CL_SUCCESS;
@@ -362,9 +367,10 @@ void cl_event_new_enqueue_callback(cl_event event,
/* Insert the user event to enqueue_callback's wait_user_events */
TRY(cl_event_insert_user_event, &cb->wait_user_events, event_wait_list[i]);
cl_event_add_ref(event_wait_list[i]);
- cl_command_queue_insert_event(event->queue, event_wait_list[i]);
- if(data->type == EnqueueBarrier){
- cl_command_queue_insert_barrier_event(event->queue, event_wait_list[i]);
+ if(queue)
+ cl_command_queue_insert_event(queue, event_wait_list[i]);
+ if(queue && data->type == EnqueueBarrier){
+ cl_command_queue_insert_barrier_event(queue, event_wait_list[i]);
}
} else if(event_wait_list[i]->enqueue_cb != NULL) {
user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
@@ -386,20 +392,22 @@ void cl_event_new_enqueue_callback(cl_event event,
/* Insert the user event to enqueue_callback's wait_user_events */
TRY(cl_event_insert_user_event, &cb->wait_user_events, user_events->event);
cl_event_add_ref(user_events->event);
- cl_command_queue_insert_event(event->queue, user_events->event);
- if(data->type == EnqueueBarrier){
+ if(queue)
+ cl_command_queue_insert_event(event->queue, user_events->event);
+ if(queue && data->type == EnqueueBarrier){
cl_command_queue_insert_barrier_event(event->queue, user_events->event);
}
user_events = user_events->next;
}
}
}
- if(data->queue != NULL && event->gpgpu_event != NULL) {
+ if(event != NULL && event->queue != NULL && event->gpgpu_event != NULL) {
event->gpgpu = cl_thread_gpgpu_take(event->queue);
data->ptr = (void *)event->gpgpu_event;
}
cb->data = *data;
- event->enqueue_cb = cb;
+ if(event)
+ event->enqueue_cb = cb;
exit:
return;
@@ -595,12 +603,12 @@ cl_int cl_event_marker_with_wait_list(cl_command_queue queue,
if(num_events_in_wait_list > 0){
if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
data.type = EnqueueMarker;
- cl_event_new_enqueue_callback(*event, &data, num_events_in_wait_list, event_wait_list);
+ cl_event_new_enqueue_callback(event?*event:NULL, &data, num_events_in_wait_list, event_wait_list);
return CL_SUCCESS;
}
} else if(queue->wait_events_num > 0) {
data.type = EnqueueMarker;
- cl_event_new_enqueue_callback(*event, &data, queue->wait_events_num, queue->wait_events);
+ cl_event_new_enqueue_callback(event?*event:NULL, &data, queue->wait_events_num, queue->wait_events);
return CL_SUCCESS;
}
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 3eb303f..183aafc 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -46,6 +46,10 @@ void check_opt1_extension(cl_extensions_t *extensions)
if (id == EXT_ID(khr_spir))
extensions->extensions[id].base.ext_enabled = 1;
#endif
+ if (id == EXT_ID(khr_image2d_from_buffer))
+ extensions->extensions[id].base.ext_enabled = 1;
+ if (id == EXT_ID(khr_3d_image_writes))
+ extensions->extensions[id].base.ext_enabled = 1;
}
}
@@ -63,7 +67,9 @@ check_gl_extension(cl_extensions_t *extensions) {
void
check_intel_extension(cl_extensions_t *extensions)
{
- /* Should put those map/unmap extensions here. */
+ int id;
+ for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)
+ extensions->extensions[id].base.ext_enabled = 1;
}
void
@@ -104,24 +110,48 @@ cl_intel_platform_get_default_extension(cl_device_id device)
}
LOCAL void
-cl_intel_platform_enable_fp16_extension(cl_device_id device)
+cl_intel_platform_enable_extension(cl_device_id device, uint32_t ext)
{
- cl_extensions_t new_ext;
- cl_platform_id pf = device->platform;
int id;
+ char* ext_str = NULL;
+ cl_platform_id pf = device->platform;
assert(pf);
- memcpy(&new_ext, pf->internal_extensions, sizeof(new_ext));
-
for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++) {
- if (id == EXT_ID(khr_fp16))
- new_ext.extensions[id].base.ext_enabled = 1;
+ if (id == ext) {
+ if (!pf->internal_extensions->extensions[id].base.ext_enabled)
+ ext_str = pf->internal_extensions->extensions[id].base.ext_name;
+
+ break;
+ }
}
- process_extension_str(&new_ext);
+ for(id = BASE_EXT_START_ID; id <= BASE_EXT_END_ID; id++) {
+ if (id == ext) {
+ if (!pf->internal_extensions->extensions[id].base.ext_enabled)
+ ext_str = pf->internal_extensions->extensions[id].base.ext_name;
+
+ break;
+ }
+ }
- memcpy((char*)device->extensions, new_ext.ext_str, sizeof(device->extensions));
- device->extensions_sz = strlen(new_ext.ext_str) + 1;
+ /* already enabled, skip. */
+ if (ext_str && strstr(device->extensions, ext_str))
+ ext_str = NULL;
+
+ if (ext_str) {
+ if (device->extensions_sz <= 1) {
+ memcpy((char*)device->extensions, ext_str, strlen(ext_str));
+ device->extensions_sz = strlen(ext_str) + 1;
+ } else {
+ assert(device->extensions_sz + 1 + strlen(ext_str) < EXTENSTION_LENGTH);
+ *(char*)(device->extensions + device->extensions_sz - 1) = ' ';
+ memcpy((char*)device->extensions + device->extensions_sz, ext_str, strlen(ext_str));
+ device->extensions_sz = device->extensions_sz + strlen(ext_str) + 1;
+ }
+
+ *(char*)(device->extensions + device->extensions_sz - 1) = 0;
+ }
}
LOCAL void
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index b4544e2..1139775 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -1,3 +1,5 @@
+#ifndef __CL_EXTENSIONS_H__
+#define __CL_EXTENSIONS_H__
/* The following approved Khronos extension
* names must be returned by all device that
* support OpenCL C 1.2. */
@@ -23,6 +25,11 @@
DECL_EXT(khr_spir) \
DECL_EXT(khr_icd)
+#define DECL_INTEL_EXTENSIONS \
+ DECL_EXT(intel_accelerator) \
+ DECL_EXT(intel_motion_estimation) \
+ DECL_EXT(intel_subgroups)
+
#define DECL_GL_EXTENSIONS \
DECL_EXT(khr_gl_sharing)\
DECL_EXT(khr_gl_event)\
@@ -37,6 +44,7 @@
#define DECL_ALL_EXTENSIONS \
DECL_BASE_EXTENSIONS \
DECL_OPT1_EXTENSIONS \
+ DECL_INTEL_EXTENSIONS \
DECL_GL_EXTENSIONS \
DECL_D3D_EXTENSIONS
@@ -54,6 +62,8 @@ cl_khr_extension_id_max
#define BASE_EXT_END_ID EXT_ID(khr_fp64)
#define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
#define OPT1_EXT_END_ID EXT_ID(khr_icd)
+#define INTEL_EXT_START_ID EXT_ID(intel_accelerator)
+#define INTEL_EXT_END_ID EXT_ID(intel_subgroups)
#define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
#define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
@@ -75,6 +85,7 @@ struct EXT_STRUCT_NAME(name) { \
DECL_BASE_EXTENSIONS
DECL_OPT1_EXTENSIONS
+DECL_INTEL_EXTENSIONS
DECL_D3D_EXTENSIONS
DECL_GL_EXTENSIONS
#undef DECL_EXT
@@ -87,14 +98,16 @@ typedef union {
#undef DECL_EXT
} extension_union;
+#include "cl_device_id.h"
typedef struct cl_extensions {
extension_union extensions[cl_khr_extension_id_max];
- char ext_str[256];
+ char ext_str[EXTENSTION_LENGTH];
} cl_extensions_t;
extern void
cl_intel_platform_extension_init(cl_platform_id intel_platform);
extern void
-cl_intel_platform_enable_fp16_extension(cl_device_id device);
+cl_intel_platform_enable_extension(cl_device_id device, uint32_t name);
extern void
cl_intel_platform_get_default_extension(cl_device_id device);
+#endif /* __CL_EXTENSIONS_H__ */
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index e832a53..aa13a3d 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -64,11 +64,12 @@ gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data = NULL;
gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size = NULL;
gbe_kernel_get_image_size_cb *interp_kernel_get_image_size = NULL;
gbe_kernel_get_image_data_cb *interp_kernel_get_image_data = NULL;
+gbe_output_profiling_cb* interp_output_profiling = NULL;
+gbe_get_profiling_bti_cb* interp_get_profiling_bti = NULL;
+gbe_dup_profiling_cb* interp_dup_profiling = NULL;
gbe_get_printf_num_cb* interp_get_printf_num = NULL;
gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti = NULL;
-gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti = NULL;
gbe_dup_printfset_cb* interp_dup_printfset = NULL;
-gbe_get_printf_sizeof_size_cb* interp_get_printf_sizeof_size = NULL;
gbe_release_printf_info_cb* interp_release_printf_info = NULL;
gbe_output_printf_cb* interp_output_printf = NULL;
gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info = NULL;
@@ -87,7 +88,7 @@ struct GbeLoaderInitializer
bool LoadInterp(const char*& path)
{
const char* interpPath = getenv("OCL_INTERP_PATH");
- if (interpPath == NULL)
+ if (interpPath == NULL|| !strcmp(interpPath, ""))
interpPath = INTERP_OBJECT_DIR;
path = interpPath;
@@ -213,6 +214,18 @@ struct GbeLoaderInitializer
if (interp_kernel_get_image_data == NULL)
return false;
+ interp_output_profiling = *(gbe_output_profiling_cb**)dlsym(dlhInterp, "gbe_output_profiling");
+ if (interp_output_profiling == NULL)
+ return false;
+
+ interp_get_profiling_bti = *(gbe_get_profiling_bti_cb**)dlsym(dlhInterp, "gbe_get_profiling_bti");
+ if (interp_get_profiling_bti == NULL)
+ return false;
+
+ interp_dup_profiling = *(gbe_dup_profiling_cb**)dlsym(dlhInterp, "gbe_dup_profiling");
+ if (interp_dup_profiling == NULL)
+ return false;
+
interp_get_printf_num = *(gbe_get_printf_num_cb**)dlsym(dlhInterp, "gbe_get_printf_num");
if (interp_get_printf_num == NULL)
return false;
@@ -221,18 +234,10 @@ struct GbeLoaderInitializer
if (interp_get_printf_buf_bti == NULL)
return false;
- interp_get_printf_indexbuf_bti = *(gbe_get_printf_indexbuf_bti_cb**)dlsym(dlhInterp, "gbe_get_printf_indexbuf_bti");
- if (interp_get_printf_indexbuf_bti == NULL)
- return false;
-
interp_dup_printfset = *(gbe_dup_printfset_cb**)dlsym(dlhInterp, "gbe_dup_printfset");
if (interp_dup_printfset == NULL)
return false;
- interp_get_printf_sizeof_size = *(gbe_get_printf_sizeof_size_cb**)dlsym(dlhInterp, "gbe_get_printf_sizeof_size");
- if (interp_get_printf_sizeof_size == NULL)
- return false;
-
interp_release_printf_info = *(gbe_release_printf_info_cb**)dlsym(dlhInterp, "gbe_release_printf_info");
if (interp_release_printf_info == NULL)
return false;
@@ -259,7 +264,7 @@ struct GbeLoaderInitializer
}
const char* gbePath = getenv("OCL_GBE_PATH");
- if (gbePath == NULL)
+ if (gbePath == NULL || !strcmp(gbePath, ""))
gbePath = GBE_OBJECT_DIR;
dlhCompiler = dlopen(gbePath, RTLD_LAZY | RTLD_LOCAL);
@@ -315,6 +320,11 @@ struct GbeLoaderInitializer
if (dlhInterp != NULL)
dlclose(dlhInterp);
+
+ //On destruction, set the release-related functions
+ //to NULL to avoid dangling pointer access.
+ compiler_program_clean_llvm_resource = NULL;
+ interp_program_delete = NULL;
}
bool compilerLoaded;
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index de91c85..df808a5 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -64,11 +64,12 @@ extern gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data;
extern gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size;
extern gbe_kernel_get_image_size_cb *interp_kernel_get_image_size;
extern gbe_kernel_get_image_data_cb *interp_kernel_get_image_data;
+extern gbe_output_profiling_cb* interp_output_profiling;
+extern gbe_get_profiling_bti_cb* interp_get_profiling_bti;
+extern gbe_dup_profiling_cb* interp_dup_profiling;
extern gbe_get_printf_num_cb* interp_get_printf_num;
extern gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti;
-extern gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti;
extern gbe_dup_printfset_cb* interp_dup_printfset;
-extern gbe_get_printf_sizeof_size_cb* interp_get_printf_sizeof_size;
extern gbe_release_printf_info_cb* interp_release_printf_info;
extern gbe_output_printf_cb* interp_output_printf;
extern gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info;
diff --git a/src/cl_gen75_device.h b/src/cl_gen75_device.h
index 43f6e8f..7ef2b82 100644
--- a/src/cl_gen75_device.h
+++ b/src/cl_gen75_device.h
@@ -17,14 +17,15 @@
* Author: Benjamin Segovia <benjamin.segovia at intel.com>
*/
-/* Common fields for both SNB devices (either GT1 or GT2)
- */
+/* Common fields for both CHV,VLV and HSW devices */
.max_parameter_size = 1024,
.global_mem_cache_line_size = 64, /* XXX */
.global_mem_cache_size = 8 << 10, /* XXX */
.local_mem_type = CL_GLOBAL,
.local_mem_size = 64 << 10,
.scratch_mem_size = 2 << 20,
+.max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
+.global_mem_size = 2 * 1024 * 1024 * 1024ul,
#include "cl_gt_device.h"
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 4ad5d96..e755cad 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -24,6 +24,11 @@
.local_mem_type = CL_GLOBAL,
.local_mem_size = 64 << 10,
.scratch_mem_size = 12 << 10,
+.max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
+.global_mem_size = 2 * 1024 * 1024 * 1024ul,
+//temporarily defined to export builtin kernel block_motion_estimate_intel only for Gen7;
+//will be removed once HSW+ also supports it
+#define GEN7_DEVICE
#include "cl_gt_device.h"
-
+#undef GEN7_DEVICE
diff --git a/src/cl_gen75_device.h b/src/cl_gen8_device.h
similarity index 87%
copy from src/cl_gen75_device.h
copy to src/cl_gen8_device.h
index 43f6e8f..08fde48 100644
--- a/src/cl_gen75_device.h
+++ b/src/cl_gen8_device.h
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -17,14 +17,14 @@
* Author: Benjamin Segovia <benjamin.segovia at intel.com>
*/
-/* Common fields for both SNB devices (either GT1 or GT2)
- */
+/* Common fields for both BDW devices */
.max_parameter_size = 1024,
.global_mem_cache_line_size = 64, /* XXX */
.global_mem_cache_size = 8 << 10, /* XXX */
.local_mem_type = CL_GLOBAL,
.local_mem_size = 64 << 10,
.scratch_mem_size = 2 << 20,
+.max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul,
+.global_mem_size = 4 * 1024 * 1024 * 1024ul,
#include "cl_gt_device.h"
-
diff --git a/src/cl_gen75_device.h b/src/cl_gen9_device.h
similarity index 88%
copy from src/cl_gen75_device.h
copy to src/cl_gen9_device.h
index 43f6e8f..f50f9c7 100644
--- a/src/cl_gen75_device.h
+++ b/src/cl_gen9_device.h
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -17,14 +17,15 @@
* Author: Benjamin Segovia <benjamin.segovia at intel.com>
*/
-/* Common fields for both SNB devices (either GT1 or GT2)
- */
+/* Common fields for both SKL devices */
.max_parameter_size = 1024,
.global_mem_cache_line_size = 64, /* XXX */
.global_mem_cache_size = 8 << 10, /* XXX */
.local_mem_type = CL_GLOBAL,
.local_mem_size = 64 << 10,
.scratch_mem_size = 2 << 20,
+.max_mem_alloc_size = 4 * 1024 * 1024 * 1024ul,
+.global_mem_size = 4 * 1024 * 1024 * 1024ul,
#include "cl_gt_device.h"
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index f523228..70a0a54 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -40,7 +40,6 @@
.native_vector_width_double = 2,
.native_vector_width_half = 8,
.address_bits = 32,
-.max_mem_alloc_size = 512 * 1024 * 1024,
.image_support = CL_TRUE,
.max_read_image_args = BTI_MAX_READ_IMAGE_ARGS,
.max_write_image_args = BTI_MAX_WRITE_IMAGE_ARGS,
@@ -56,7 +55,6 @@
.min_data_type_align_size = sizeof(cl_long) * 16,
.double_fp_config = 0,
.global_mem_cache_type = CL_READ_WRITE_CACHE,
-.global_mem_size = 1024 * 1024 * 1024,
.max_constant_buffer_size = 128 * 1024 * 1024,
.max_constant_args = 8,
.error_correction_support = CL_FALSE,
@@ -115,7 +113,11 @@ DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
"__cl_fill_image_1d_array;"
"__cl_fill_image_2d;"
"__cl_fill_image_2d_array;"
- "__cl_fill_image_3d;")
+ "__cl_fill_image_3d;"
+#ifdef GEN7_DEVICE
+ "block_motion_estimate_intel;"
+#endif
+ )
DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
DECL_INFO_STRING(spir_versions, "1.2")
@@ -126,4 +128,6 @@ DECL_INFO_STRING(spir_versions, "1.2")
.affinity_domain = 0,
.partition_type = {0},
.device_reference_count = 1,
-
+.image_pitch_alignment = 1,
+.image_base_address_alignment = 4096,
+.cmrt_device = NULL
diff --git a/src/cl_internals.h b/src/cl_internals.h
index cb3fc23..9aeb8c1 100644
--- a/src/cl_internals.h
+++ b/src/cl_internals.h
@@ -31,6 +31,7 @@
#define CL_MAGIC_EVENT_HEADER 0x8324a9c810ebf90fLL
#define CL_MAGIC_MEM_HEADER 0x381a27b9ce6504dfLL
#define CL_MAGIC_DEAD_HEADER 0xdeaddeaddeaddeadLL
+#define CL_MAGIC_ACCELERATOR_INTEL_HEADER 0x7c6a08c9a7ac3e3fLL
#endif /* __CL_INTERNALS_H__ */
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 286e57c..b380abe 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -27,6 +27,8 @@
#include "cl_khr_icd.h"
#include "CL/cl.h"
#include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
+#include "cl_cmrt.h"
#include <stdio.h>
#include <string.h>
@@ -40,6 +42,15 @@ cl_kernel_delete(cl_kernel k)
uint32_t i;
if (k == NULL) return;
+#ifdef HAS_CMRT
+ if (k->cmrt_kernel != NULL) {
+ cmrt_destroy_kernel(k);
+ k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+ cl_free(k);
+ return;
+ }
+#endif
+
/* We are not done with the kernel */
if (atomic_dec(&k->ref_n) > 1) return;
/* Release one reference on all bos we own */
@@ -70,6 +81,7 @@ cl_kernel_new(cl_program p)
k->ref_n = 1;
k->magic = CL_MAGIC_KERNEL_HEADER;
k->program = p;
+ k->cmrt_kernel = NULL;
exit:
return k;
@@ -102,7 +114,7 @@ cl_kernel_add_ref(cl_kernel k)
LOCAL cl_int
cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
{
- uint32_t offset; /* where to patch */
+ int32_t offset; /* where to patch */
enum gbe_arg_type arg_type; /* kind of argument */
size_t arg_sz; /* size of the argument */
cl_mem mem = NULL; /* for __global, __constant and image arguments */
@@ -113,10 +125,22 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
arg_type = interp_kernel_get_arg_type(k->opaque, index);
arg_sz = interp_kernel_get_arg_size(k->opaque, index);
- if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
- if (arg_type != GBE_ARG_SAMPLER ||
- (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+ if (k->vme && index == 0) {
+ //the best method is to return the arg type of GBE_ARG_ACCELERATOR_INTEL
+ //but it is not straightforward since clang does not support it now
+ //the easy way is to consider typedef accelerator_intel_t as a struct,
+ //this easy way makes the size mismatched, so use another size check method.
+ if (sz != sizeof(cl_accelerator_intel) || arg_sz != sizeof(cl_motion_estimation_desc_intel))
return CL_INVALID_ARG_SIZE;
+ cl_accelerator_intel* accel = (cl_accelerator_intel*)value;
+ if ((*accel)->type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL)
+ return CL_INVALID_ACCELERATOR_TYPE_INTEL;
+ } else {
+ if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
+ if (arg_type != GBE_ARG_SAMPLER ||
+ (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+ return CL_INVALID_ARG_SIZE;
+ }
}
if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
@@ -152,13 +176,30 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
/* Copy the structure or the value directly into the curbe */
if (arg_type == GBE_ARG_VALUE) {
- offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
- assert(offset + sz <= k->curbe_sz);
- memcpy(k->curbe + offset, value, sz);
- k->args[index].local_sz = 0;
- k->args[index].is_set = 1;
- k->args[index].mem = NULL;
- return CL_SUCCESS;
+ if (k->vme && index == 0) {
+ cl_accelerator_intel accel;
+ memcpy(&accel, value, sz);
+ offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+ if (offset >= 0) {
+ assert(offset + sz <= k->curbe_sz);
+ memcpy(k->curbe + offset, &(accel->desc.me), arg_sz);
+ }
+ k->args[index].local_sz = 0;
+ k->args[index].is_set = 1;
+ k->args[index].mem = NULL;
+ k->accel = accel;
+ return CL_SUCCESS;
+ } else {
+ offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+ if (offset >= 0) {
+ assert(offset + sz <= k->curbe_sz);
+ memcpy(k->curbe + offset, value, sz);
+ }
+ k->args[index].local_sz = 0;
+ k->args[index].is_set = 1;
+ k->args[index].mem = NULL;
+ return CL_SUCCESS;
+ }
}
/* For a local pointer just save the size */
@@ -179,9 +220,10 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
k->args[index].sampler = sampler;
cl_set_sampler_arg_slot(k, index, sampler);
offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
- //assert(arg_sz == 4);
- assert(offset + 4 <= k->curbe_sz);
- memcpy(k->curbe + offset, &sampler->clkSamplerValue, 4);
+ if (offset >= 0) {
+ assert(offset + 4 <= k->curbe_sz);
+ memcpy(k->curbe + offset, &sampler->clkSamplerValue, 4);
+ }
return CL_SUCCESS;
}
@@ -191,7 +233,8 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
if(value == NULL || mem == NULL) {
/* for buffer object GLOBAL_PTR CONSTANT_PTR, it maybe NULL */
int32_t offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
- *((uint32_t *)(k->curbe + offset)) = 0;
+ if (offset >= 0)
+ *((uint32_t *)(k->curbe + offset)) = 0;
assert(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR);
if (k->args[index].mem)
@@ -327,6 +370,12 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
cl_buffer_subdata(k->bo, 0, code_sz, code);
k->opaque = opaque;
+ const char* kname = cl_kernel_get_name(k);
+ if (strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
+ k->vme = 1;
+ else
+ k->vme = 0;
+
/* Create the curbe */
k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);
@@ -363,6 +412,7 @@ cl_kernel_dup(cl_kernel from)
SET_ICD(to->dispatch)
to->bo = from->bo;
to->opaque = from->opaque;
+ to->vme = from->vme;
to->ref_n = 1;
to->magic = CL_MAGIC_KERNEL_HEADER;
to->program = from->program;
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 140bbb1..05a882e 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -24,6 +24,7 @@
#include "cl_driver.h"
#include "cl_gbe_loader.h"
#include "CL/cl.h"
+#include "CL/cl_ext.h"
#include <stdint.h>
#include <stdlib.h>
@@ -37,6 +38,7 @@ struct _gbe_kernel;
typedef struct cl_argument {
cl_mem mem; /* For image and regular buffers */
cl_sampler sampler; /* For sampler. */
+ cl_accelerator_intel accel;
unsigned char bti;
uint32_t local_sz:31; /* For __local size specification */
uint32_t is_set:1; /* All args must be set before NDRange */
@@ -50,6 +52,7 @@ struct _cl_kernel {
cl_buffer bo; /* The code itself */
cl_program program; /* Owns this structure (and pointers) */
gbe_kernel opaque; /* (Opaque) compiler structure for the OCL kernel */
+ cl_accelerator_intel accel; /* accelerator */
char *curbe; /* One curbe per kernel */
size_t curbe_sz; /* Size of it */
uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
@@ -63,8 +66,11 @@ struct _cl_kernel {
(i.e. global_work_size argument to clEnqueueNDRangeKernel.)*/
size_t stack_size; /* stack size per work item. */
cl_argument *args; /* To track argument setting */
- uint32_t arg_n:31; /* Number of arguments */
+ uint32_t arg_n:30; /* Number of arguments */
uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
+ uint32_t vme:1; /* True only if it is a built-in kernel for VME */
+
+ void* cmrt_kernel; /* CmKernel* */
};
/* Allocate an empty kernel */
diff --git a/src/cl_mem.c b/src/cl_mem.c
index b5671bd..06e7c18 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -27,6 +27,7 @@
#include "cl_khr_icd.h"
#include "cl_kernel.h"
#include "cl_command_queue.h"
+#include "cl_cmrt.h"
#include "CL/cl.h"
#include "CL/cl_intel.h"
@@ -232,7 +233,8 @@ cl_mem_allocate(enum cl_mem_type type,
cl_mem_flags flags,
size_t sz,
cl_int is_tiled,
- void *host_ptr,
+ void *host_ptr, //pointer from application
+ cl_mem buffer, //image2D from buffer
cl_int *errcode)
{
cl_buffer_mgr bufmgr = NULL;
@@ -267,6 +269,10 @@ cl_mem_allocate(enum cl_mem_type type,
mem->flags = flags;
mem->is_userptr = 0;
mem->offset = 0;
+ mem->cmrt_mem = NULL;
+ if (mem->type == CL_MEM_IMAGE_TYPE) {
+ cl_mem_image(mem)->is_image_from_buffer = 0;
+ }
if (sz != 0) {
/* Pinning will require stricter alignment rules */
@@ -278,23 +284,25 @@ cl_mem_allocate(enum cl_mem_type type,
assert(bufmgr);
#ifdef HAS_USERPTR
+ uint8_t bufCreated = 0;
if (ctx->device->host_unified_memory) {
int page_size = getpagesize();
int cacheline_size = 0;
cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
- /* currently only cl buf is supported, will add cl image support later */
if (type == CL_MEM_BUFFER_TYPE) {
if (flags & CL_MEM_USE_HOST_PTR) {
assert(host_ptr != NULL);
/* userptr not support tiling */
if (!is_tiled) {
- if (ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr) {
+ if ((ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr) &&
+ (ALIGN((unsigned long)sz, cacheline_size) == (unsigned long)sz)) {
void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & (~(page_size - 1)));
mem->offset = host_ptr - aligned_host_ptr;
mem->is_userptr = 1;
size_t aligned_sz = ALIGN((mem->offset + sz), page_size);
mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", aligned_host_ptr, aligned_sz, 0);
+ bufCreated = 1;
}
}
}
@@ -304,14 +312,48 @@ cl_mem_allocate(enum cl_mem_type type,
mem->host_ptr = internal_host_ptr;
mem->is_userptr = 1;
mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", internal_host_ptr, alignedSZ, 0);
+ bufCreated = 1;
+ }
+ } else if (type == CL_MEM_IMAGE_TYPE) {
+ if (host_ptr != NULL) {
+ assert(flags & CL_MEM_USE_HOST_PTR);
+ assert(!is_tiled);
+ assert(ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr);
+ void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & (~(page_size - 1)));
+ mem->offset = host_ptr - aligned_host_ptr;
+ mem->is_userptr = 1;
+ size_t aligned_sz = ALIGN((mem->offset + sz), page_size);
+ mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", aligned_host_ptr, aligned_sz, 0);
+ bufCreated = 1;
}
}
}
- if (!mem->is_userptr)
+ if(type == CL_MEM_IMAGE_TYPE && buffer != NULL) {
+ // if create image from USE_HOST_PTR buffer, the buffer's base address need be aligned.
+ if(buffer->is_userptr) {
+ int base_alignement = 0;
+ cl_get_device_info(ctx->device, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, sizeof(base_alignement), &base_alignement, NULL);
+ if(ALIGN((unsigned long)buffer->host_ptr, base_alignement) != (unsigned long)buffer->host_ptr) {
+ err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ goto error;
+ }
+ }
+ // if the image if created from buffer, should use the bo directly to share same bo.
+ mem->bo = buffer->bo;
+ cl_mem_image(mem)->is_image_from_buffer = 1;
+ bufCreated = 1;
+ }
+
+ if (!bufCreated)
mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
#else
- mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+ if(type == CL_MEM_IMAGE_TYPE && buffer != NULL) {
+ // if the image if created from buffer, should use the bo directly to share same bo.
+ mem->bo = buffer->bo;
+ cl_mem_image(mem)->is_image_from_buffer = 1;
+ } else
+ mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
#endif
if (UNLIKELY(mem->bo == NULL)) {
@@ -424,7 +466,7 @@ cl_mem_new_buffer(cl_context ctx,
sz = ALIGN(sz, 4);
/* Create the buffer in video memory */
- mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, data, &err);
+ mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, data, NULL, &err);
if (mem == NULL || err != CL_SUCCESS)
goto error;
@@ -692,7 +734,8 @@ _cl_mem_new_image(cl_context ctx,
size_t depth,
size_t pitch,
size_t slice_pitch,
- void *data,
+ void *data, //pointer from application
+ cl_mem buffer, //for image2D from buffer
cl_int *errcode_ret)
{
cl_int err = CL_SUCCESS;
@@ -702,6 +745,13 @@ _cl_mem_new_image(cl_context ctx,
size_t sz = 0, aligned_pitch = 0, aligned_slice_pitch = 0, aligned_h = 0;
size_t origin_width = w; // for image1d buffer work around.
cl_image_tiling_t tiling = CL_NO_TILE;
+ int enable_true_hostptr = 0;
+
+ // can't use BVAR (backend/src/sys/cvar.hpp) here as it's C++
+ const char *env = getenv("OCL_IMAGE_HOSTPTR");
+ if (env != NULL) {
+ sscanf(env, "%i", &enable_true_hostptr);
+ }
/* Check flags consistency */
if (UNLIKELY((flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) && data == NULL)) {
@@ -756,6 +806,8 @@ _cl_mem_new_image(cl_context ctx,
h = (w + ctx->device->image2d_max_width - 1) / ctx->device->image2d_max_width;
w = w > ctx->device->image2d_max_width ? ctx->device->image2d_max_width : w;
tiling = CL_NO_TILE;
+ } else if(image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL) {
+ tiling = CL_NO_TILE;
} else if (cl_driver_get_ver(ctx->drv) != 6) {
/* Pick up tiling mode (we do only linear on SNB) */
tiling = cl_get_default_tiling(ctx->drv);
@@ -768,7 +820,7 @@ _cl_mem_new_image(cl_context ctx,
if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
if (UNLIKELY(h > ctx->device->image2d_max_height)) DO_IMAGE_ERROR;
if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
- if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
+ if (UNLIKELY(!data && pitch != 0 && buffer == NULL)) DO_IMAGE_ERROR;
depth = 1;
} else if (image_type == CL_MEM_OBJECT_IMAGE3D ||
@@ -801,10 +853,34 @@ _cl_mem_new_image(cl_context ctx,
#undef DO_IMAGE_ERROR
+ uint8_t enableUserptr = 0;
+ if (enable_true_hostptr && ctx->device->host_unified_memory && data != NULL && (flags & CL_MEM_USE_HOST_PTR)) {
+ int cacheline_size = 0;
+ cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
+ if (ALIGN((unsigned long)data, cacheline_size) == (unsigned long)data &&
+ ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)) == h &&
+ ALIGN(h * pitch * depth, cacheline_size) == h * pitch * depth && //h and pitch should same as aligned_h and aligned_pitch if enable userptr
+ ((image_type != CL_MEM_OBJECT_IMAGE3D && image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY && image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) || pitch * h == slice_pitch)) {
+ tiling = CL_NO_TILE;
+ enableUserptr = 1;
+ }
+ }
+
/* Tiling requires to align both pitch and height */
if (tiling == CL_NO_TILE) {
aligned_pitch = w * bpp;
- aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
+ if (aligned_pitch < pitch && enableUserptr)
+ aligned_pitch = pitch;
+ //no need align the height if 2d image from buffer.
+ //the pitch should be same with buffer's pitch as they share same bo.
+ if (image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL) {
+ if(aligned_pitch < pitch) {
+ aligned_pitch = pitch;
+ }
+ aligned_h = h;
+ }
+ else
+ aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
} else if (tiling == CL_TILE_X) {
aligned_pitch = ALIGN(w * bpp, cl_buffer_get_tiling_align(ctx, CL_TILE_X, 0));
aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_TILE_X, 1));
@@ -814,6 +890,15 @@ _cl_mem_new_image(cl_context ctx,
}
sz = aligned_pitch * aligned_h * depth;
+ if (image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL) {
+ //image 2d created from buffer: per spec, the buffer sz maybe larger than the image 2d.
+ if (buffer->size >= sz)
+ sz = buffer->size;
+ else {
+ err = CL_INVALID_IMAGE_SIZE;
+ goto error;
+ }
+ }
/* If sz is large than 128MB, map gtt may fail in some system.
Because there is no obviours performance drop, disable tiling. */
@@ -824,10 +909,17 @@ _cl_mem_new_image(cl_context ctx,
sz = aligned_pitch * aligned_h * depth;
}
- if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)
- mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, &err);
- else {
- mem = cl_mem_allocate(CL_MEM_BUFFER1D_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, &err);
+ if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+ if (image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL)
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, buffer, &err);
+ else {
+ if (enableUserptr)
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, data, NULL, &err);
+ else
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err);
+ }
+ } else {
+ mem = cl_mem_allocate(CL_MEM_BUFFER1D_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err);
if (mem != NULL && err == CL_SUCCESS) {
struct _cl_mem_buffer1d_image *buffer1d_image = (struct _cl_mem_buffer1d_image *)mem;
buffer1d_image->size = origin_width;;
@@ -837,7 +929,11 @@ _cl_mem_new_image(cl_context ctx,
if (mem == NULL || err != CL_SUCCESS)
goto error;
- cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+ if(!(image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL)) {
+ //no need set tiling if image 2d created from buffer since share same bo.
+ cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+ }
+
if (image_type == CL_MEM_OBJECT_IMAGE1D ||
image_type == CL_MEM_OBJECT_IMAGE2D ||
image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
@@ -851,13 +947,15 @@ _cl_mem_new_image(cl_context ctx,
0, 0, 0);
/* Copy the data if required */
- if (flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) {
+ if (flags & CL_MEM_COPY_HOST_PTR && data)
cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
- if (flags & CL_MEM_USE_HOST_PTR) {
- mem->host_ptr = data;
- cl_mem_image(mem)->host_row_pitch = pitch;
- cl_mem_image(mem)->host_slice_pitch = slice_pitch;
- }
+
+ if (flags & CL_MEM_USE_HOST_PTR && data) {
+ mem->host_ptr = data;
+ cl_mem_image(mem)->host_row_pitch = pitch;
+ cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+ if (!enableUserptr)
+ cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
}
exit:
@@ -971,33 +1069,51 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS))
goto error;
- // Per bspec, a image should has a at least 2 line vertical alignment,
- // thus we can't simply attach a buffer to a 1d image surface which has the same size.
- // We have to create a new image, and copy the buffer data to this new image.
- // And replace all the buffer object's reference to this image.
- image = _cl_mem_new_image(ctx, flags, image_format, image_desc->image_type,
- mem_buffer->base.size / bpp, 0, 0, 0, 0, NULL, errcode_ret);
+ if(image_desc->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ image = _cl_mem_new_image(ctx, flags, image_format, image_desc->image_type,
+ image_desc->image_width, image_desc->image_height, image_desc->image_depth,
+ image_desc->image_row_pitch, image_desc->image_slice_pitch,
+ NULL, image_desc->buffer, errcode_ret);
+ } else if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+ // Per bspec, a image should has a at least 2 line vertical alignment,
+ // thus we can't simply attach a buffer to a 1d image surface which has the same size.
+ // We have to create a new image, and copy the buffer data to this new image.
+ // And replace all the buffer object's reference to this image.
+ image = _cl_mem_new_image(ctx, flags, image_format, image_desc->image_type,
+ mem_buffer->base.size / bpp, 0, 0, 0, 0, NULL, NULL, errcode_ret);
+ }
+ else
+ assert(0);
+
if (image == NULL)
return NULL;
- void *src = cl_mem_map(buffer, 0);
- void *dst = cl_mem_map(image, 1);
- //
- // FIXME, we could use copy buffer to image to do this on GPU latter.
- // currently the copy buffer to image function doesn't support 1D image.
- //
- // There is a potential risk that this buffer was mapped and the caller
- // still hold the pointer and want to access it again. This scenario is
- // not explicitly forbidden in the spec, although it should not be permitted.
- memcpy(dst, src, mem_buffer->base.size);
- cl_mem_unmap(buffer);
- cl_mem_unmap(image);
+
+ if(image_desc->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ //no need copy since the image 2d and buffer share same bo.
+ }
+ else if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+ // FIXME, we could use copy buffer to image to do this on GPU latter.
+ // currently the copy buffer to image function doesn't support 1D image.
+ //
+ // There is a potential risk that this buffer was mapped and the caller
+ // still hold the pointer and want to access it again. This scenario is
+ // not explicitly forbidden in the spec, although it should not be permitted.
+ void *src = cl_mem_map(buffer, 0);
+ void *dst = cl_mem_map(image, 1);
+ memcpy(dst, src, mem_buffer->base.size);
+ cl_mem_unmap(image);
+ cl_mem_unmap(buffer);
+ }
+ else
+ assert(0);
if (err != 0)
goto error;
// Now replace buffer's bo to this new bo, need to take care of sub buffer
// case.
- cl_mem_replace_buffer(buffer, image->bo);
+ if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+ cl_mem_replace_buffer(buffer, image->bo);
/* Now point to the right offset if buffer is a SUB_BUFFER. */
if (buffer->flags & CL_MEM_USE_HOST_PTR)
image->host_ptr = buffer->host_ptr + offset;
@@ -1025,18 +1141,26 @@ cl_mem_new_image(cl_context context,
{
switch (image_desc->image_type) {
case CL_MEM_OBJECT_IMAGE1D:
- case CL_MEM_OBJECT_IMAGE2D:
case CL_MEM_OBJECT_IMAGE3D:
return _cl_mem_new_image(context, flags, image_format, image_desc->image_type,
image_desc->image_width, image_desc->image_height, image_desc->image_depth,
image_desc->image_row_pitch, image_desc->image_slice_pitch,
- host_ptr, errcode_ret);
+ host_ptr, NULL, errcode_ret);
+ case CL_MEM_OBJECT_IMAGE2D:
+ if(image_desc->buffer)
+ return _cl_mem_new_image_from_buffer(context, flags, image_format,
+ image_desc, errcode_ret);
+ else
+ return _cl_mem_new_image(context, flags, image_format, image_desc->image_type,
+ image_desc->image_width, image_desc->image_height, image_desc->image_depth,
+ image_desc->image_row_pitch, image_desc->image_slice_pitch,
+ host_ptr, NULL, errcode_ret);
case CL_MEM_OBJECT_IMAGE1D_ARRAY:
case CL_MEM_OBJECT_IMAGE2D_ARRAY:
return _cl_mem_new_image(context, flags, image_format, image_desc->image_type,
image_desc->image_width, image_desc->image_height, image_desc->image_array_size,
image_desc->image_row_pitch, image_desc->image_slice_pitch,
- host_ptr, errcode_ret);
+ host_ptr, NULL, errcode_ret);
case CL_MEM_OBJECT_IMAGE1D_BUFFER:
return _cl_mem_new_image_from_buffer(context, flags, image_format,
image_desc, errcode_ret);
@@ -1062,12 +1186,22 @@ cl_mem_delete(cl_mem mem)
}
#endif
+#ifdef HAS_CMRT
+ if (mem->cmrt_mem != NULL)
+ cmrt_destroy_memory(mem);
+#endif
+
/* iff we are a image, delete the 1d buffer if has. */
if (IS_IMAGE(mem)) {
if (cl_mem_image(mem)->buffer_1d) {
- assert(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER);
- cl_mem_delete(cl_mem_image(mem)->buffer_1d);
- cl_mem_image(mem)->buffer_1d = NULL;
+ assert(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER ||
+ cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE2D);
+ cl_mem_delete(cl_mem_image(mem)->buffer_1d);
+ if(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE2D && cl_mem_image(mem)->is_image_from_buffer == 1)
+ {
+ cl_mem_image(mem)->buffer_1d = NULL;
+ mem->bo = NULL;
+ }
}
}
@@ -1292,6 +1426,9 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
cl_internal_copy_buf_unalign_src_offset_str,
(size_t)cl_internal_copy_buf_unalign_src_offset_str_size, NULL);
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
@@ -2008,7 +2145,7 @@ LOCAL cl_mem cl_mem_new_libva_buffer(cl_context ctx,
cl_int err = CL_SUCCESS;
cl_mem mem = NULL;
- mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, NULL, &err);
+ mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, NULL, NULL, &err);
if (mem == NULL || err != CL_SUCCESS)
goto error;
@@ -2053,20 +2190,22 @@ LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx,
goto error;
}
- mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, NULL, &err);
- if (mem == NULL || err != CL_SUCCESS) {
- err = CL_OUT_OF_HOST_MEMORY;
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, NULL, NULL, &err);
+ if (mem == NULL || err != CL_SUCCESS)
goto error;
- }
image = cl_mem_image(mem);
mem->bo = cl_buffer_get_image_from_libva(ctx, bo_name, image);
+ if (mem->bo == NULL) {
+ err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+ goto error;
+ }
image->w = width;
image->h = height;
image->image_type = CL_MEM_OBJECT_IMAGE2D;
- image->depth = 2;
+ image->depth = 1;
image->fmt = fmt;
image->intel_fmt = intel_fmt;
image->bpp = bpp;
@@ -2097,3 +2236,94 @@ cl_mem_get_fd(cl_mem mem,
err = CL_INVALID_OPERATION;
return err;
}
+
+LOCAL cl_mem cl_mem_new_buffer_from_fd(cl_context ctx,
+ int fd,
+ int buffer_sz,
+ cl_int* errcode)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+
+ mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, NULL, NULL, &err);
+ if (mem == NULL || err != CL_SUCCESS)
+ goto error;
+
+ mem->bo = cl_buffer_get_buffer_from_fd(ctx, fd, buffer_sz);
+ if (mem->bo == NULL) {
+ err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+ goto error;
+ }
+ mem->size = buffer_sz;
+
+exit:
+ if (errcode)
+ *errcode = err;
+ return mem;
+
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+}
+
+LOCAL cl_mem cl_mem_new_image_from_fd(cl_context ctx,
+ int fd, int image_sz,
+ size_t offset,
+ size_t width, size_t height,
+ cl_image_format fmt,
+ size_t row_pitch,
+ cl_int *errcode)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+ struct _cl_mem_image *image = NULL;
+ uint32_t intel_fmt, bpp;
+
+ /* Get the size of each pixel */
+ if (UNLIKELY((err = cl_image_byte_per_pixel(&fmt, &bpp)) != CL_SUCCESS))
+ goto error;
+
+ intel_fmt = cl_image_get_intel_format(&fmt);
+ if (intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
+ err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
+ goto error;
+ }
+
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, NULL, NULL, &err);
+ if (mem == NULL || err != CL_SUCCESS)
+ goto error;
+
+ image = cl_mem_image(mem);
+
+ mem->bo = cl_buffer_get_image_from_fd(ctx, fd, image_sz, image);
+ if (mem->bo == NULL) {
+ err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+ goto error;
+ }
+ mem->size = image_sz;
+
+ image->w = width;
+ image->h = height;
+ image->image_type = CL_MEM_OBJECT_IMAGE2D;
+ image->depth = 1;
+ image->fmt = fmt;
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->row_pitch = row_pitch;
+ image->slice_pitch = 0;
+ // NOTE: tiling of image is set in cl_buffer_get_image_from_fd().
+ image->tile_x = 0;
+ image->tile_y = 0;
+ image->offset = offset;
+
+exit:
+ if (errcode)
+ *errcode = err;
+ return mem;
+
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+}
diff --git a/src/cl_mem.h b/src/cl_mem.h
index e027f15..c8f256d 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -95,6 +95,9 @@ typedef struct _cl_mem {
cl_mem_dstr_cb *dstr_cb; /* The destroy callback. */
uint8_t is_userptr; /* CL_MEM_USE_HOST_PTR is enabled*/
size_t offset; /* offset of host_ptr to the page beginning, only for CL_MEM_USE_HOST_PTR*/
+
+ uint8_t cmrt_mem_type; /* CmBuffer, CmSurface2D, ... */
+ void* cmrt_mem;
} _cl_mem;
struct _cl_mem_image {
@@ -110,6 +113,7 @@ struct _cl_mem_image {
size_t tile_x, tile_y; /* tile offset, used for mipmap images. */
size_t offset; /* offset for dri_bo, used when it's reloc. */
cl_mem buffer_1d; /* if the image is created from buffer, it point to the buffer.*/
+ uint8_t is_image_from_buffer; /* IMAGE from Buffer*/
};
struct _cl_mem_gl_image {
@@ -271,6 +275,7 @@ cl_mem_allocate(enum cl_mem_type type,
size_t sz,
cl_int is_tiled,
void *host_ptr,
+ cl_mem buffer,
cl_int *errcode);
void
@@ -293,8 +298,21 @@ extern cl_mem cl_mem_new_libva_image(cl_context ctx,
cl_image_format fmt,
size_t row_pitch,
cl_int *errcode);
+
extern cl_int cl_mem_get_fd(cl_mem mem, int* fd);
+extern cl_mem cl_mem_new_buffer_from_fd(cl_context ctx,
+ int fd,
+ int buffer_sz,
+ cl_int* errcode);
+
+extern cl_mem cl_mem_new_image_from_fd(cl_context ctx,
+ int fd, int image_sz,
+ size_t offset,
+ size_t width, size_t height,
+ cl_image_format fmt,
+ size_t row_pitch,
+ cl_int *errcode);
#endif /* __CL_MEM_H__ */
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index be9eedf..b0b2c1b 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -63,7 +63,7 @@ cl_mem_new_gl_texture(cl_context ctx,
goto error;
}
- mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, NULL, &err);
+ mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, NULL, NULL, &err);
if (mem == NULL || err != CL_SUCCESS)
goto error;
diff --git a/src/cl_program.c b/src/cl_program.c
index 82dd3e3..17f64ca 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -25,8 +25,10 @@
#include "cl_utils.h"
#include "cl_khr_icd.h"
#include "cl_gbe_loader.h"
+#include "cl_cmrt.h"
#include "CL/cl.h"
#include "CL/cl_intel.h"
+#include "CL/cl_ext.h"
#include <stdio.h>
#include <stdlib.h>
@@ -92,10 +94,17 @@ cl_program_delete(cl_program p)
p->ctx->programs = p->next;
pthread_mutex_unlock(&p->ctx->program_lock);
- cl_free(p->bin); /* Free the blob */
- for (i = 0; i < p->ker_n; ++i) /* Free the kernels */
- cl_kernel_delete(p->ker[i]);
- cl_free(p->ker);
+#ifdef HAS_CMRT
+ if (p->cmrt_program != NULL)
+ cmrt_destroy_program(p);
+ else
+#endif
+ {
+ cl_free(p->bin); /* Free the blob */
+ for (i = 0; i < p->ker_n; ++i) /* Free the kernels */
+ cl_kernel_delete(p->ker[i]);
+ cl_free(p->ker);
+ }
/* Program belongs to their parent context */
cl_context_delete(p->ctx);
@@ -103,14 +112,19 @@ cl_program_delete(cl_program p)
/* Free the program as allocated by the compiler */
if (p->opaque) {
if (CompilerSupported())
- compiler_program_clean_llvm_resource(p->opaque);
- interp_program_delete(p->opaque);
+ //When releasing static variables, gbeLoader may already have been released,
+ //so compiler_program_clean_llvm_resource and interp_program_delete may be NULL.
+ if(compiler_program_clean_llvm_resource)
+ compiler_program_clean_llvm_resource(p->opaque);
+ if(interp_program_delete)
+ interp_program_delete(p->opaque);
}
p->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
cl_free(p);
}
+#define BUILD_LOG_MAX_SIZE (1024*1024U)
LOCAL cl_program
cl_program_new(cl_context ctx)
{
@@ -123,9 +137,10 @@ cl_program_new(cl_context ctx)
p->ref_n = 1;
p->magic = CL_MAGIC_PROGRAM_HEADER;
p->ctx = ctx;
- p->build_log = calloc(1000, sizeof(char));
+ p->cmrt_program = NULL;
+ p->build_log = calloc(BUILD_LOG_MAX_SIZE, sizeof(char));
if (p->build_log)
- p->build_log_max_sz = 1000;
+ p->build_log_max_sz = BUILD_LOG_MAX_SIZE;
/* The queue also belongs to its context */
cl_context_add_ref(ctx);
@@ -166,29 +181,41 @@ error:
return err;
}
-inline cl_bool isBitcodeWrapper(const unsigned char *BufPtr, const unsigned char *BufEnd)
-{
- // See if you can find the hidden message in the magic bytes :-).
- // (Hint: it's a little-endian encoding.)
- return BufPtr != BufEnd &&
- BufPtr[0] == 0xDE &&
- BufPtr[1] == 0xC0 &&
- BufPtr[2] == 0x17 &&
- BufPtr[3] == 0x0B;
-}
+#define BINARY_HEADER_LENGTH 5
+
+static const unsigned char binary_type_header[BHI_MAX][BINARY_HEADER_LENGTH]= \
+ {{'B','C', 0xC0, 0xDE},
+ {1, 'B', 'C', 0xC0, 0xDE},
+ {2, 'B', 'C', 0xC0, 0xDE},
+ {1, 'G','E', 'N', 'C'},
+ {'C','I', 'S', 'A'},
+ };
-inline cl_bool isRawBitcode(const unsigned char *BufPtr, const unsigned char *BufEnd)
+LOCAL cl_bool headerCompare(const unsigned char *BufPtr, BINARY_HEADER_INDEX index)
{
- // These bytes sort of have a hidden message, but it's not in
- // little-endian this time, and it's a little redundant.
- return BufPtr != BufEnd &&
- BufPtr[0] == 'B' &&
- BufPtr[1] == 'C' &&
- BufPtr[2] == 0xc0 &&
- BufPtr[3] == 0xde;
+ bool matched = true;
+ int length = (index == BHI_SPIR || index == BHI_CMRT) ? BINARY_HEADER_LENGTH -1 :BINARY_HEADER_LENGTH;
+ int i = 0;
+ if(index == BHI_GEN_BINARY)
+ i = 1;
+ for (; i < length; ++i)
+ {
+ matched = matched && (BufPtr[i] == binary_type_header[index][i]);
+ }
+ if(index == BHI_GEN_BINARY && matched) {
+ if(BufPtr[0] != binary_type_header[index][0]) {
+ DEBUGP(DL_WARNING, "Beignet binary format have been changed, please generate binary again.\n");
+ matched = false;
+ }
+ }
+ return matched;
}
-#define isBitcode(BufPtr,BufEnd) (isBitcodeWrapper(BufPtr, BufEnd) || isRawBitcode(BufPtr, BufEnd))
+#define isSPIR(BufPtr) headerCompare(BufPtr, BHI_SPIR)
+#define isLLVM_C_O(BufPtr) headerCompare(BufPtr, BHI_COMPIRED_OBJECT)
+#define isLLVM_LIB(BufPtr) headerCompare(BufPtr, BHI_LIBRARY)
+#define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY)
+#define isCMRT(BufPtr) headerCompare(BufPtr, BHI_CMRT)
LOCAL cl_program
cl_program_create_from_binary(cl_context ctx,
@@ -216,7 +243,8 @@ cl_program_create_from_binary(cl_context ctx,
goto error;
}
- if (lengths[0] == 0) {
+ //need at least 4 bytes to check the binary type.
+ if (lengths[0] == 0 || lengths[0] < 4) {
err = CL_INVALID_VALUE;
if (binary_status)
binary_status[0] = CL_INVALID_VALUE;
@@ -229,14 +257,14 @@ cl_program_create_from_binary(cl_context ctx,
goto error;
}
- // TODO: Need to check the binary format here to return CL_INVALID_BINARY.
TRY_ALLOC(program->binary, cl_calloc(lengths[0], sizeof(char)));
memcpy(program->binary, binaries[0], lengths[0]);
program->binary_sz = lengths[0];
program->source_type = FROM_BINARY;
- if(isBitcode((unsigned char*)program->binary, (unsigned char*)program->binary+program->binary_sz)) {
-
+ if (isCMRT((unsigned char*)program->binary)) {
+ program->source_type = FROM_CMRT;
+ }else if(isSPIR((unsigned char*)program->binary)) {
char* typed_binary;
TRY_ALLOC(typed_binary, cl_calloc(lengths[0]+1, sizeof(char)));
memcpy(typed_binary+1, binaries[0], lengths[0]);
@@ -249,10 +277,11 @@ cl_program_create_from_binary(cl_context ctx,
}
program->source_type = FROM_LLVM_SPIR;
- }else if(isBitcode((unsigned char*)program->binary+1, (unsigned char*)program->binary+program->binary_sz)) {
- if(*program->binary == 1){
+ program->binary_type = CL_PROGRAM_BINARY_TYPE_INTERMEDIATE;
+ }else if(isLLVM_C_O((unsigned char*)program->binary) || isLLVM_LIB((unsigned char*)program->binary)) {
+ if(*program->binary == BHI_COMPIRED_OBJECT){
program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
- }else if(*program->binary == 2){
+ }else if(*program->binary == BHI_LIBRARY){
program->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
}else{
err= CL_INVALID_BINARY;
@@ -266,7 +295,7 @@ cl_program_create_from_binary(cl_context ctx,
}
program->source_type = FROM_LLVM;
}
- else if (*program->binary == 0) {
+ else if (isGenBinary((unsigned char*)program->binary)) {
program->opaque = interp_program_new_from_binary(program->ctx->device->device_id, program->binary, program->binary_sz);
if (UNLIKELY(program->opaque == NULL)) {
err = CL_INVALID_PROGRAM;
@@ -277,6 +306,10 @@ cl_program_create_from_binary(cl_context ctx,
TRY (cl_program_load_gen_program, program);
program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
}
+ else {
+ err= CL_INVALID_BINARY;
+ goto error;
+ }
if (binary_status)
binary_status[0] = CL_SUCCESS;
@@ -388,7 +421,7 @@ cl_program_create_from_llvm(cl_context ctx,
goto error;
}
- program->opaque = compiler_program_new_from_llvm(ctx->device->device_id, file_name, NULL, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1);
+ program->opaque = compiler_program_new_from_llvm(ctx->device->device_id, file_name, NULL, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1, NULL);
if (UNLIKELY(program->opaque == NULL)) {
err = CL_INVALID_PROGRAM;
goto error;
@@ -513,6 +546,20 @@ cl_program_build(cl_program p, const char *options)
goto error;
}
+#if HAS_CMRT
+ if (p->source_type == FROM_CMRT) {
+ //only here do we begin to invoke CMRT
+ //this breaks the spec by returning other errors such as CL_DEVICE_NOT_FOUND
+ err = cmrt_build_program(p, options);
+ if (err == CL_SUCCESS) {
+ p->build_status = CL_BUILD_SUCCESS;
+ p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+ return CL_SUCCESS;
+ } else
+ goto error;
+ }
+#endif
+
if (!check_cl_version_option(p, options)) {
err = CL_BUILD_PROGRAM_FAILURE;
goto error;
@@ -525,17 +572,10 @@ cl_program_build(cl_program p, const char *options)
}
TRY_ALLOC (p->build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
memcpy(p->build_opts, options, strlen(options));
-
- p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
- if (strstr(options, "-x spir")) {
- p->source_type = FROM_LLVM_SPIR;
- }
}
}
if (options == NULL && p->build_opts) {
- p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
-
cl_free(p->build_opts);
p->build_opts = NULL;
}
@@ -573,7 +613,7 @@ cl_program_build(cl_program p, const char *options)
}
/* Create all the kernels */
TRY (cl_program_load_gen_program, p);
- } else if (p->source_type == FROM_BINARY) {
+ } else if (p->source_type == FROM_BINARY && p->binary_type != CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
p->opaque = interp_program_new_from_binary(p->ctx->device->device_id, p->binary, p->binary_sz);
if (UNLIKELY(p->opaque == NULL)) {
err = CL_BUILD_PROGRAM_FAILURE;
@@ -620,19 +660,23 @@ cl_program_link(cl_context context,
int copyed = 0;
cl_bool ret = 0;
int avialable_program = 0;
-
//Although we don't use the options, we still need to check them
if(!compiler_program_check_opt(options)) {
err = CL_INVALID_LINKER_OPTIONS;
goto error;
}
-
+ const char kernel_arg_option[] = "-cl-kernel-arg-info";
+ cl_bool option_exist = CL_TRUE;
for(i = 0; i < num_input_programs; i++) {
//num_input_programs > 0 and input_programs MUST not be NULL, so compare with input_programs[0] directly.
if(input_programs[i]->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY ||
- input_programs[i]->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+ input_programs[i]->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT ||
+ input_programs[i]->binary_type == CL_PROGRAM_BINARY_TYPE_INTERMEDIATE) {
avialable_program++;
}
+ if(input_programs[i]->build_opts == NULL || strstr(input_programs[i]->build_opts, kernel_arg_option) == NULL ) {
+ option_exist = CL_FALSE;
+ }
}
//None of the programs contains a compiled binary or library.
@@ -652,12 +696,17 @@ cl_program_link(cl_context context,
goto error;
}
+ if(option_exist) {
+ TRY_ALLOC (p->build_opts, cl_calloc(strlen(kernel_arg_option) + 1, sizeof(char)));
+ memcpy(p->build_opts, kernel_arg_option, strlen(kernel_arg_option));
+ }
+
if (!check_cl_version_option(p, options)) {
err = CL_BUILD_PROGRAM_FAILURE;
goto error;
}
- p->opaque = compiler_program_new_gen_program(context->device->device_id, NULL, NULL);
+ p->opaque = compiler_program_new_gen_program(context->device->device_id, NULL, NULL, NULL);
for(i = 0; i < num_input_programs; i++) {
// if the program was created from an LLVM binary, it must be deserialized first to get the module.
if(input_programs[i])
@@ -708,6 +757,7 @@ error:
return p;
}
+#define FILE_PATH_LENGTH 1024
LOCAL cl_int
cl_program_compile(cl_program p,
cl_uint num_input_headers,
@@ -736,19 +786,20 @@ cl_program_compile(cl_program p,
}
TRY_ALLOC (p->build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
memcpy(p->build_opts, options, strlen(options));
-
- p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
}
}
if (options == NULL && p->build_opts) {
- p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
-
cl_free(p->build_opts);
p->build_opts = NULL;
}
+#if defined(__ANDROID__)
+ char temp_header_template[]= "/data/local/tmp/beignet.XXXXXX";
+#else
char temp_header_template[]= "/tmp/beignet.XXXXXX";
+#endif
+
char* temp_header_path = mkdtemp(temp_header_template);
if (p->source_type == FROM_SOURCE) {
@@ -762,11 +813,15 @@ cl_program_compile(cl_program p,
for (i = 0; i < num_input_headers; i++) {
if(header_include_names[i] == NULL || input_headers[i] == NULL)
continue;
-
- char temp_path[255]="";
- strncpy(temp_path, temp_header_path, strlen(temp_header_path));
+ char temp_path[FILE_PATH_LENGTH]="";
+ strncat(temp_path, temp_header_path, strlen(temp_header_path));
strncat(temp_path, "/", 1);
strncat(temp_path, header_include_names[i], strlen(header_include_names[i]));
+ if(strlen(temp_path) >= FILE_PATH_LENGTH - 1 ) {
+ err = CL_COMPILE_PROGRAM_FAILURE;
+ goto error;
+ }
+ temp_path[strlen(temp_path)+1] = '\0';
char* dirc = strdup(temp_path);
char* dir = dirname(dirc);
mkdir(dir, 0755);
@@ -807,7 +862,6 @@ cl_program_compile(cl_program p,
}
/* Create all the kernels */
- p->source_type = FROM_LLVM;
p->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
}else if(p->source_type == FROM_BINARY){
err = CL_INVALID_OPERATION;
@@ -830,6 +884,20 @@ cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
cl_int err = CL_SUCCESS;
uint32_t i = 0;
+#ifdef HAS_CMRT
+ if (p->cmrt_program != NULL) {
+ void* cmrt_kernel = cmrt_create_kernel(p, name);
+ if (cmrt_kernel != NULL) {
+ to = cl_kernel_new(p);
+ to->cmrt_kernel = cmrt_kernel;
+ goto exit;
+ } else {
+ err = CL_INVALID_KERNEL_NAME;
+ goto error;
+ }
+ }
+#endif
+
/* Find the program first */
for (i = 0; i < p->ker_n; ++i) {
assert(p->ker[i]);
@@ -897,6 +965,7 @@ cl_program_get_kernel_names(cl_program p, size_t size, char *names, size_t *size
len = strlen(ker_name);
if(names) {
strncpy(names, cl_kernel_get_name(p->ker[0]), size - 1);
+ names[size - 1] = '\0';
if(size < len - 1) {
if(size_ret) *size_ret = size;
return;
diff --git a/src/cl_program.h b/src/cl_program.h
index 7af0206..b69e00c 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -34,9 +34,19 @@ enum {
FROM_SOURCE = 0,
FROM_LLVM = 1,
FROM_BINARY = 2,
- FROM_LLVM_SPIR = 3
+ FROM_LLVM_SPIR = 3,
+ FROM_CMRT = 4,
};
+typedef enum _BINARY_HEADER_INDEX {
+ BHI_SPIR = 0,
+ BHI_COMPIRED_OBJECT = 1,
+ BHI_LIBRARY = 2,
+ BHI_GEN_BINARY = 3,
+ BHI_CMRT = 4,
+ BHI_MAX,
+}BINARY_HEADER_INDEX;
+
/* This maps an OCL file containing some kernels */
struct _cl_program {
DEFINE_ICD(dispatch)
@@ -52,14 +62,17 @@ struct _cl_program {
char *binary; /* Program binary. */
size_t binary_sz; /* The binary size. */
uint32_t binary_type; /* binary type: COMPILED_OBJECT(LLVM IR), LIBRARY(LLVM IR with option "-create-library"), or EXECUTABLE(GEN binary). */
+ /* ext binary type: BINARY_TYPE_INTERMEDIATE. */
uint32_t ker_n; /* Number of declared kernels */
- uint32_t source_type:2; /* Built from binary, source or LLVM */
+ uint32_t source_type:3; /* Built from binary, source, CMRT or LLVM*/
uint32_t is_built:1; /* Did we call clBuildProgram on it? */
int32_t build_status; /* build status. */
char *build_opts; /* The build options for this program */
size_t build_log_max_sz; /*build log maximum size in byte.*/
char *build_log; /* The build log for this program. */
size_t build_log_sz; /* The actual build log size.*/
+
+ void* cmrt_program; /* real type: CmProgram* */
};
/* Create a empty program */
diff --git a/src/cl_thread.c b/src/cl_thread.c
index 5e5a351..0780513 100644
--- a/src/cl_thread.c
+++ b/src/cl_thread.c
@@ -38,9 +38,6 @@ static int *thread_slot_map = NULL;
static int thread_magic_num = 1;
static pthread_mutex_t thread_queue_map_lock = PTHREAD_MUTEX_INITIALIZER;
-static __thread int thread_id = -1;
-static __thread int thread_magic = -1;
-
typedef struct _thread_spec_data {
cl_gpgpu gpgpu ;
int valid;
@@ -56,18 +53,42 @@ typedef struct _queue_thread_private {
pthread_mutex_t thread_data_lock;
} queue_thread_private;
+static pthread_once_t key_once = PTHREAD_ONCE_INIT;
+static pthread_key_t thread_id_key;
+static pthread_key_t thread_magic_key;
+
+static void create_thread_key()
+{
+ pthread_key_create(&thread_id_key, NULL);
+ pthread_key_create(&thread_magic_key, NULL);
+}
+
static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int create)
{
queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
thread_spec_data* spec = NULL;
int i = 0;
+ int *id = NULL, *magic = NULL;
+
+ pthread_once(&key_once, create_thread_key);
+ id = pthread_getspecific(thread_id_key);
+ if(id == NULL) {
+ id = (int *)malloc(sizeof(int));
+ *id = -1;
+ pthread_setspecific(thread_id_key, id);
+ }
+ magic = pthread_getspecific(thread_magic_key);
+ if(magic == NULL) {
+ magic = (int *)malloc(sizeof(int));
+ *magic = -1;
+ pthread_setspecific(thread_magic_key, magic);
+ }
- if (thread_id == -1) {
-
+ if (*id == -1) {
pthread_mutex_lock(&thread_queue_map_lock);
for (i = 0; i < thread_array_num; i++) {
if (thread_slot_map[i] == 0) {
- thread_id = i;
+ *id = i;
break;
}
}
@@ -75,13 +96,19 @@ static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int
if (i == thread_array_num) {
thread_array_num *= 2;
thread_slot_map = realloc(thread_slot_map, sizeof(int) * thread_array_num);
+
+ if(thread_slot_map == NULL) {
+ pthread_mutex_unlock(&thread_queue_map_lock);
+ return NULL;
+ }
+
memset(thread_slot_map + thread_array_num/2, 0, sizeof(int) * (thread_array_num/2));
- thread_id = thread_array_num/2;
+ *id = thread_array_num/2;
}
- thread_slot_map[thread_id] = 1;
+ thread_slot_map[*id] = 1;
- thread_magic = thread_magic_num++;
+ *magic = thread_magic_num++;
pthread_mutex_unlock(&thread_queue_map_lock);
}
@@ -91,16 +118,22 @@ static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int
thread_private->threads_data_num = thread_array_num;
thread_private->threads_data = realloc(thread_private->threads_data,
thread_private->threads_data_num * sizeof(void *));
+
+ if(thread_private->threads_data == NULL) {
+ pthread_mutex_unlock(&thread_private->thread_data_lock);
+ return NULL;
+ }
+
memset(thread_private->threads_data + old_num, 0,
sizeof(void*) * (thread_private->threads_data_num - old_num));
}
- assert(thread_id != -1 && thread_id < thread_array_num);
- spec = thread_private->threads_data[thread_id];
+ assert(*id != -1 && *id < thread_array_num);
+ spec = thread_private->threads_data[*id];
if (!spec && create) {
spec = CALLOC(thread_spec_data);
- spec->thread_magic = thread_magic;
- thread_private->threads_data[thread_id] = spec;
+ spec->thread_magic = *magic;
+ thread_private->threads_data[*id] = spec;
}
pthread_mutex_unlock(&thread_private->thread_data_lock);
@@ -111,28 +144,32 @@ static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int
cl_event get_current_event(cl_command_queue queue)
{
thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- assert(spec && spec->thread_magic == thread_magic);
+ int *magic = pthread_getspecific(thread_magic_key);
+ assert(spec && magic && spec->thread_magic == *magic);
return spec->current_event;
}
cl_event get_last_event(cl_command_queue queue)
{
thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- assert(spec && spec->thread_magic == thread_magic);
+ int *magic = pthread_getspecific(thread_magic_key);
+ assert(spec && magic && spec->thread_magic == *magic);
return spec->last_event;
}
void set_current_event(cl_command_queue queue, cl_event e)
{
thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- assert(spec && spec->thread_magic == thread_magic);
+ int *magic = pthread_getspecific(thread_magic_key);
+ assert(spec && magic && spec->thread_magic == *magic);
spec->current_event = e;
}
void set_last_event(cl_command_queue queue, cl_event e)
{
thread_spec_data* spec = __create_thread_spec_data(queue, 1);
- assert(spec && spec->thread_magic == thread_magic);
+ int *magic = pthread_getspecific(thread_magic_key);
+ assert(spec && magic && spec->thread_magic == *magic);
spec->last_event = e;
}
@@ -164,8 +201,12 @@ void* cl_thread_data_create(void)
cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
{
thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+ if(!spec)
+ return NULL;
+ int *magic = pthread_getspecific(thread_magic_key);
+ assert(magic);
- if (!spec->thread_magic && spec->thread_magic != thread_magic) {
+ if (!spec->thread_magic && spec->thread_magic != *magic) {
//We may get the slot from last thread. So free the resource.
spec->valid = 0;
}
@@ -190,8 +231,9 @@ cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
void cl_set_thread_batch_buf(cl_command_queue queue, void* buf)
{
thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+ int *magic = pthread_getspecific(thread_magic_key);
- assert(spec && spec->thread_magic == thread_magic);
+ assert(spec && magic && spec->thread_magic == *magic);
if (spec->thread_batch_buf) {
cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
@@ -201,19 +243,22 @@ void cl_set_thread_batch_buf(cl_command_queue queue, void* buf)
void* cl_get_thread_batch_buf(cl_command_queue queue) {
thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+ int *magic = pthread_getspecific(thread_magic_key);
- assert(spec && spec->thread_magic == thread_magic);
+ assert(spec && magic && spec->thread_magic == *magic);
return spec->thread_batch_buf;
}
void cl_invalid_thread_gpgpu(cl_command_queue queue)
{
+ int *id = pthread_getspecific(thread_id_key);
queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
thread_spec_data* spec = NULL;
pthread_mutex_lock(&thread_private->thread_data_lock);
- spec = thread_private->threads_data[thread_id];
+ assert(id);
+ spec = thread_private->threads_data[*id];
assert(spec);
pthread_mutex_unlock(&thread_private->thread_data_lock);
@@ -229,11 +274,13 @@ void cl_invalid_thread_gpgpu(cl_command_queue queue)
cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue)
{
+ int *id = pthread_getspecific(thread_id_key);
queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
thread_spec_data* spec = NULL;
pthread_mutex_lock(&thread_private->thread_data_lock);
- spec = thread_private->threads_data[thread_id];
+ assert(id);
+ spec = thread_private->threads_data[*id];
assert(spec);
pthread_mutex_unlock(&thread_private->thread_data_lock);
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 28fdef6..2926611 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -31,6 +31,22 @@
#define JOIN(X, Y) _DO_JOIN(X, Y)
#define _DO_JOIN(X, Y) _DO_JOIN2(X, Y)
#define _DO_JOIN2(X, Y) X##Y
+enum DEBUGP_LEVEL
+{
+ DL_INFO,
+ DL_WARNING,
+ DL_ERROR
+};
+#ifdef NDEBUG
+ #define DEBUGP(...)
+#else
//TODO: decide whether to print based on the level value from the environment
+ #define DEBUGP(level, fmt, ...) \
+ do { \
+ fprintf(stderr, "Beignet: "#fmt, ##__VA_ARGS__); \
+ fprintf(stderr, "\n"); \
+ } while (0)
+#endif
/* Check compile time errors */
#define STATIC_ASSERT(value) \
@@ -153,6 +169,11 @@ IMAGE = cl_mem_image(MEM); \
const size_t *REGION; \
size_t REGION ##_REC[3]; \
do { \
+ if (PREGION == NULL) \
+ { \
+ err = CL_INVALID_VALUE; \
+ goto error; \
+ } \
if (IMAGE->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { \
REGION ##_REC[0] = PREGION[0]; \
REGION ##_REC[1] = 1; \
@@ -161,12 +182,22 @@ do { \
} else { \
REGION = PREGION; \
} \
+ if((REGION[0] == 0)||(REGION[1] == 0)||(REGION[2] == 0)) \
+ { \
+ err = CL_INVALID_VALUE; \
+ goto error; \
+ } \
} while(0)
#define FIXUP_IMAGE_ORIGIN(IMAGE, PREGION, REGION) \
const size_t *REGION; \
size_t REGION ##_REC[3]; \
do { \
+ if (PREGION == NULL) \
+ { \
+ err = CL_INVALID_VALUE; \
+ goto error; \
+ } \
if (IMAGE->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { \
REGION ##_REC[0] = PREGION[0]; \
REGION ##_REC[1] = 0; \
@@ -202,6 +233,18 @@ do { \
} \
} while (0)
+#define CHECK_ACCELERATOR_INTEL(ACCELERATOR_INTEL) \
+do { \
+ if (UNLIKELY(ACCELERATOR_INTEL == NULL)) { \
+ err = CL_INVALID_ACCELERATOR_INTEL; \
+ goto error; \
+ } \
+ if (UNLIKELY(ACCELERATOR_INTEL->magic != CL_MAGIC_ACCELERATOR_INTEL_HEADER)) {\
+ err = CL_INVALID_ACCELERATOR_INTEL; \
+ goto error; \
+ } \
+} while (0)
+
#define CHECK_KERNEL(KERNEL) \
do { \
if (UNLIKELY(KERNEL == NULL)) { \
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 035a103..e561725 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -371,7 +371,7 @@ intel_driver_unlock_hardware(intel_driver_t *driver)
}
LOCAL dri_bo*
-intel_driver_share_buffer(intel_driver_t *driver, const char *sname, uint32_t name)
+intel_driver_share_buffer_from_name(intel_driver_t *driver, const char *sname, uint32_t name)
{
dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
sname,
@@ -383,6 +383,19 @@ intel_driver_share_buffer(intel_driver_t *driver, const char *sname, uint32_t na
return bo;
}
+LOCAL dri_bo*
+intel_driver_share_buffer_from_fd(intel_driver_t *driver, int fd, int size)
+{
+ dri_bo *bo = drm_intel_bo_gem_create_from_prime(driver->bufmgr,
+ fd,
+ size);
+ if (bo == NULL) {
+ fprintf(stderr, "drm_intel_bo_gem_create_from_prime create bo(size %d) from fd %d failed: %s\n", size, fd, strerror(errno));
+ return NULL;
+ }
+ return bo;
+}
+
LOCAL uint32_t
intel_driver_shared_name(intel_driver_t *driver, dri_bo *bo)
{
@@ -697,7 +710,7 @@ cl_buffer intel_share_buffer_from_libva(cl_context ctx,
{
drm_intel_bo *intel_bo;
- intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+ intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
if (intel_bo == NULL)
return NULL;
@@ -715,7 +728,43 @@ cl_buffer intel_share_image_from_libva(cl_context ctx,
drm_intel_bo *intel_bo;
uint32_t intel_tiling, intel_swizzle_mode;
- intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+ intel_bo = intel_driver_share_buffer_from_name((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+
+ if (intel_bo == NULL)
+ return NULL;
+
+ drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
+ image->tiling = get_cl_tiling(intel_tiling);
+
+ return (cl_buffer)intel_bo;
+}
+
+cl_buffer intel_share_buffer_from_fd(cl_context ctx,
+ int fd,
+ int buffer_size)
+{
+ drm_intel_bo *intel_bo;
+
+ intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, buffer_size);
+
+ if (intel_bo == NULL)
+ return NULL;
+
+ return (cl_buffer)intel_bo;
+}
+
+cl_buffer intel_share_image_from_fd(cl_context ctx,
+ int fd,
+ int image_size,
+ struct _cl_mem_image *image)
+{
+ drm_intel_bo *intel_bo;
+ uint32_t intel_tiling, intel_swizzle_mode;
+
+ intel_bo = intel_driver_share_buffer_from_fd((intel_driver_t *)ctx->drv, fd, image_size);
+
+ if (intel_bo == NULL)
+ return NULL;
drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
image->tiling = get_cl_tiling(intel_tiling);
@@ -813,9 +862,14 @@ intel_update_device_info(cl_device_id device)
else if (IS_CHERRYVIEW(device->device_id))
printf(CHV_CONFIG_WARNING);
#else
- if (IS_CHERRYVIEW(device->device_id))
+ if (IS_CHERRYVIEW(device->device_id)) {
+#if defined(__ANDROID__)
+ device->max_compute_unit = 12;
+#else
printf(CHV_CONFIG_WARNING);
#endif
+ }
+#endif
#ifdef HAS_SUBSLICE_TOTAL
unsigned int subslice_total;
@@ -826,9 +880,34 @@ intel_update_device_info(cl_device_id device)
else if (IS_CHERRYVIEW(device->device_id))
printf(CHV_CONFIG_WARNING);
#else
- if (IS_CHERRYVIEW(device->device_id))
+ if (IS_CHERRYVIEW(device->device_id)) {
+#if defined(__ANDROID__)
+ device->sub_slice_count = 2;
+#else
printf(CHV_CONFIG_WARNING);
#endif
+ }
+#endif
+
+#ifdef HAS_POOLED_EU
+ /* BXT pooled EU: 3x6 reconfigured to 2x9, so the effective sub slice count is 2 */
+ unsigned int has_pooled_eu = 0;
+ if(!drm_intel_get_pooled_eu(driver->fd, &has_pooled_eu) && has_pooled_eu)
+ device->sub_slice_count = 2;
+
+#ifdef HAS_MIN_EU_IN_POOL
+ unsigned int min_eu;
+ /* fused-down 2x6 devices are not supported by beignet. */
+ if (has_pooled_eu && !drm_intel_get_min_eu_in_pool(driver->fd, &min_eu)) {
+ assert(min_eu == 9); //don't support fuse down device.
+ }
+#endif //HAS_MIN_EU_IN_POOL
+#endif //HAS_POOLED_EU
+ //We should get the device memory size dynamically, but the
+ //mappable mem size usage is unknown. Just ignore it.
+ size_t total_mem,map_mem;
+ if(drm_intel_get_aperture_sizes(driver->fd,&map_mem,&total_mem) == 0)
+ device->global_mem_size = (cl_ulong)total_mem;
intel_driver_context_destroy(driver);
intel_driver_close(driver);
@@ -872,5 +951,7 @@ intel_setup_callbacks(void)
cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
cl_buffer_get_fd = (cl_buffer_get_fd_cb *) drm_intel_bo_gem_export_to_prime;
cl_buffer_get_tiling_align = (cl_buffer_get_tiling_align_cb *)intel_buffer_get_tiling_align;
+ cl_buffer_get_buffer_from_fd = (cl_buffer_get_buffer_from_fd_cb *) intel_share_buffer_from_fd;
+ cl_buffer_get_image_from_fd = (cl_buffer_get_image_from_fd_cb *) intel_share_image_from_fd;
intel_set_gpgpu_callbacks(intel_get_device_id());
}
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 1a76c99..3314ab4 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -39,6 +39,7 @@
#include "cl_alloc.h"
#include "cl_utils.h"
#include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
#ifndef CL_VERSION_1_2
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
@@ -141,8 +142,6 @@ intel_gpgpu_delete_finished(intel_gpgpu_t *gpgpu)
drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
if(gpgpu->printf_b.bo)
drm_intel_bo_unreference(gpgpu->printf_b.bo);
- if(gpgpu->printf_b.ibo)
- drm_intel_bo_unreference(gpgpu->printf_b.ibo);
if (gpgpu->aux_buf.bo)
drm_intel_bo_unreference(gpgpu->aux_buf.bo);
if (gpgpu->perf_b.bo)
@@ -151,6 +150,8 @@ intel_gpgpu_delete_finished(intel_gpgpu_t *gpgpu)
drm_intel_bo_unreference(gpgpu->stack_b.bo);
if (gpgpu->scratch_b.bo)
drm_intel_bo_unreference(gpgpu->scratch_b.bo);
+ if (gpgpu->profiling_b.bo)
+ drm_intel_bo_unreference(gpgpu->profiling_b.bo);
if(gpgpu->constant_b.bo)
drm_intel_bo_unreference(gpgpu->constant_b.bo);
@@ -179,6 +180,9 @@ void intel_gpgpu_delete_all(intel_driver_t *drv)
static void
intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
{
+ if (gpgpu == NULL)
+ return;
+
intel_driver_t *drv = gpgpu->drv;
struct intel_gpgpu_node *p, *node;
@@ -204,7 +208,6 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
drv->gpgpu_list = drv->gpgpu_list->next;
intel_gpgpu_delete_finished(node->gpgpu);
cl_free(node);
- node = p->next;
}
}
if (gpgpu == NULL)
@@ -911,13 +914,14 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
gpgpu->curb.size_cs_entry = size_cs_entry;
gpgpu->max_threads = max_threads;
- if (gpgpu->printf_b.ibo)
- dri_bo_unreference(gpgpu->printf_b.ibo);
- gpgpu->printf_b.ibo = NULL;
if (gpgpu->printf_b.bo)
dri_bo_unreference(gpgpu->printf_b.bo);
gpgpu->printf_b.bo = NULL;
+ if (gpgpu->profiling_b.bo)
+ dri_bo_unreference(gpgpu->profiling_b.bo);
+ gpgpu->profiling_b.bo = NULL;
+
/* Set the profile buffer*/
if(gpgpu->time_stamp_b.bo)
dri_bo_unreference(gpgpu->time_stamp_b.bo);
@@ -955,10 +959,12 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
gpgpu->aux_offset.idrt_offset = size_aux;
size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
- //sampler state must be 32 bytes aligned
+ //must be 32 bytes aligned
+ //sampler state and vme state share the same buffer,
size_aux = ALIGN(size_aux, 32);
gpgpu->aux_offset.sampler_state_offset = size_aux;
- size_aux += GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t);
+ size_aux += MAX(GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
+ GEN_MAX_VME_STATES * sizeof(gen7_vme_state_t));
//sampler border color state must be 32 bytes aligned
size_aux = ALIGN(size_aux, 32);
@@ -999,6 +1005,22 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
obj_bo);
}
+static void
+intel_gpgpu_set_buf_reloc_for_vme_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
+{
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+ index * sizeof(gen7_surface_state_t);
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ obj_bo_offset,
+ gpgpu->aux_offset.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen7_media_surface_state_t, ss0),
+ obj_bo);
+}
+
static dri_bo*
intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
{
@@ -1117,6 +1139,43 @@ intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t int
buf);
}
+static void
+intel_gpgpu_setup_bti_gen9(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+ size_t size, unsigned char index, uint32_t format)
+{
+ assert(size <= (4ul<<30));
+ size_t s = size - 1;
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ gen8_surface_state_t *ss0 = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
+ memset(ss0, 0, sizeof(gen8_surface_state_t));
+ ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss0->ss0.surface_format = format;
+ if(format != I965_SURFACEFORMAT_RAW) {
+ ss0->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
+ ss0->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
+ ss0->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
+ ss0->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
+ }
+ ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ // Per bspec, for I965_SURFACE_BUFFER with RAW format, size must be a multiple of 4 bytes.
+ if(format == I965_SURFACEFORMAT_RAW)
+ assert((ss0->ss2.width & 0x03) == 3);
+ ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss0->ss3.depth = (s >> 21) & 0x7ff; /* bits 31:21 of sz, from bespec only gen 9 support that*/
+ ss0->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl();
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen8_surface_state_t);
+ ss0->ss8.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff;
+ ss0->ss9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff;
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ internal_offset,
+ gpgpu->aux_offset.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen8_surface_state_t, ss8),
+ buf);
+}
+
static int
intel_is_surface_array(cl_mem_object_type type)
{
@@ -1219,6 +1278,55 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
}
static void
+intel_gpgpu_bind_image_for_vme_gen7(intel_gpgpu_t *gpgpu,
+ uint32_t index,
+ dri_bo* obj_bo,
+ uint32_t obj_bo_offset,
+ uint32_t format,
+ cl_mem_object_type type,
+ uint32_t bpp,
+ int32_t w,
+ int32_t h,
+ int32_t depth,
+ int32_t pitch,
+ int32_t slice_pitch,
+ int32_t tiling)
+{
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ gen7_media_surface_state_t *ss = (gen7_media_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
+
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.base_addr = obj_bo->offset + obj_bo_offset;
+ ss->ss1.uv_offset_v_direction = 0;
+ ss->ss1.pic_struct = 0;
+ ss->ss1.width = w - 1;
+ ss->ss1.height = h - 1;
+ if (tiling == GPGPU_NO_TILE) {
+ ss->ss2.tile_mode = 0;
+ }
+ else if (tiling == GPGPU_TILE_X){
+ ss->ss2.tile_mode = 2;
+ }
+ else if (tiling == GPGPU_TILE_Y){
+ ss->ss2.tile_mode = 3;
+ }
+ ss->ss2.half_pitch_for_chroma = 0;
+ ss->ss2.surface_pitch = pitch - 1;
+ ss->ss2.surface_object_control_state = cl_gpgpu_get_cache_ctrl();
+ ss->ss2.interleave_chroma = 0;
+ ss->ss2.surface_format = 12; //Y8_UNORM
+ ss->ss3.y_offset_for_u = 0;
+ ss->ss3.x_offset_for_u = 0;
+ ss->ss4.y_offset_for_v = 0;
+ ss->ss4.x_offset_for_v = 0;
+
+ intel_gpgpu_set_buf_reloc_for_vme_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+ assert(index < GEN_MAX_SURFACES);
+}
+
+
+static void
intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
uint32_t index,
dri_bo* obj_bo,
@@ -1430,7 +1538,7 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
drm_intel_bo* old = gpgpu->scratch_b.bo;
uint32_t total = per_thread_size * gpgpu->max_threads;
/* Per Bspec, scratch should 2X the desired size, otherwise luxmark may hang */
- if (IS_HASWELL(gpgpu->drv->device_id))
+ if (IS_HASWELL(gpgpu->drv->device_id) || IS_CHERRYVIEW(gpgpu->drv->device_id))
total *= 2;
gpgpu->per_thread_scratch = per_thread_size;
@@ -1654,6 +1762,149 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
}
}
+static void intel_gpgpu_insert_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel, uint32_t index)
+{
+ gen7_vme_state_t* vme = (gen7_vme_state_t*)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index;
+ memset(vme, 0, sizeof(*vme));
+ gen7_vme_search_path_state_t* sp = vme->sp;
+
+ if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL){
+ sp[0].dw0.SPD_0_X = 0;
+ sp[0].dw0.SPD_0_Y = 0;
+ sp[0].dw0.SPD_1_X = 0;
+ sp[0].dw0.SPD_1_Y = 0;
+ sp[0].dw0.SPD_2_X = 0;
+ sp[0].dw0.SPD_2_Y = 0;
+ sp[0].dw0.SPD_3_X = 0;
+ sp[0].dw0.SPD_3_Y = 0;
+ }
+ else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL){
+ sp[0].dw0.SPD_0_X = 1;
+ sp[0].dw0.SPD_0_Y = 0;
+ sp[0].dw0.SPD_1_X = 0;
+ sp[0].dw0.SPD_1_Y = 1;
+ sp[0].dw0.SPD_2_X = -1;
+ sp[0].dw0.SPD_2_Y = 0;
+ sp[0].dw0.SPD_3_X = 0;
+ sp[0].dw0.SPD_3_Y = 0;
+ }
+ else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL){
+ sp[0].dw0.SPD_0_X = 1;
+ sp[0].dw0.SPD_0_Y = 0;
+ sp[0].dw0.SPD_1_X = 1;
+ sp[0].dw0.SPD_1_Y = 0;
+ sp[0].dw0.SPD_2_X = 1;
+ sp[0].dw0.SPD_2_Y = 0;
+ sp[0].dw0.SPD_3_X = 1;
+ sp[0].dw0.SPD_3_Y = 0;
+
+ sp[1].dw0.SPD_0_X = 1;
+ sp[1].dw0.SPD_0_Y = 0;
+ sp[1].dw0.SPD_1_X = 1;
+ sp[1].dw0.SPD_1_Y = 0;
+ sp[1].dw0.SPD_2_X = 1;
+ sp[1].dw0.SPD_2_Y = 0;
+ sp[1].dw0.SPD_3_X = 0;
+ sp[1].dw0.SPD_3_Y = 1;
+
+ sp[2].dw0.SPD_0_X = -1;
+ sp[2].dw0.SPD_0_Y = 0;
+ sp[2].dw0.SPD_1_X = -1;
+ sp[2].dw0.SPD_1_Y = 0;
+ sp[2].dw0.SPD_2_X = -1;
+ sp[2].dw0.SPD_2_Y = 0;
+ sp[2].dw0.SPD_3_X = -1;
+ sp[2].dw0.SPD_3_Y = 0;
+
+ sp[3].dw0.SPD_0_X = -1;
+ sp[3].dw0.SPD_0_Y = 0;
+ sp[3].dw0.SPD_1_X = -1;
+ sp[3].dw0.SPD_1_Y = 0;
+ sp[3].dw0.SPD_2_X = -1;
+ sp[3].dw0.SPD_2_Y = 0;
+ sp[3].dw0.SPD_3_X = 0;
+ sp[3].dw0.SPD_3_Y = 1;
+
+ sp[4].dw0.SPD_0_X = 1;
+ sp[4].dw0.SPD_0_Y = 0;
+ sp[4].dw0.SPD_1_X = 1;
+ sp[4].dw0.SPD_1_Y = 0;
+ sp[4].dw0.SPD_2_X = 1;
+ sp[4].dw0.SPD_2_Y = 0;
+ sp[4].dw0.SPD_3_X = 1;
+ sp[4].dw0.SPD_3_Y = 0;
+
+ sp[5].dw0.SPD_0_X = 1;
+ sp[5].dw0.SPD_0_Y = 0;
+ sp[5].dw0.SPD_1_X = 1;
+ sp[5].dw0.SPD_1_Y = 0;
+ sp[5].dw0.SPD_2_X = 1;
+ sp[5].dw0.SPD_2_Y = 0;
+ sp[5].dw0.SPD_3_X = 0;
+ sp[5].dw0.SPD_3_Y = 1;
+
+ sp[6].dw0.SPD_0_X = -1;
+ sp[6].dw0.SPD_0_Y = 0;
+ sp[6].dw0.SPD_1_X = -1;
+ sp[6].dw0.SPD_1_Y = 0;
+ sp[6].dw0.SPD_2_X = -1;
+ sp[6].dw0.SPD_2_Y = 0;
+ sp[6].dw0.SPD_3_X = -1;
+ sp[6].dw0.SPD_3_Y = 0;
+
+ sp[7].dw0.SPD_0_X = -1;
+ sp[7].dw0.SPD_0_Y = 0;
+ sp[7].dw0.SPD_1_X = -1;
+ sp[7].dw0.SPD_1_Y = 0;
+ sp[7].dw0.SPD_2_X = -1;
+ sp[7].dw0.SPD_2_Y = 0;
+ sp[7].dw0.SPD_3_X = 0;
+ sp[7].dw0.SPD_3_Y = 1;
+
+ sp[8].dw0.SPD_0_X = 1;
+ sp[8].dw0.SPD_0_Y = 0;
+ sp[8].dw0.SPD_1_X = 1;
+ sp[8].dw0.SPD_1_Y = 0;
+ sp[8].dw0.SPD_2_X = 1;
+ sp[8].dw0.SPD_2_Y = 0;
+ sp[8].dw0.SPD_3_X = 1;
+ sp[8].dw0.SPD_3_Y = 0;
+
+ sp[9].dw0.SPD_0_X = 1;
+ sp[9].dw0.SPD_0_Y = 0;
+ sp[9].dw0.SPD_1_X = 1;
+ sp[9].dw0.SPD_1_Y = 0;
+ sp[9].dw0.SPD_2_X = 1;
+ sp[9].dw0.SPD_2_Y = 0;
+ sp[9].dw0.SPD_3_X = 0;
+ sp[9].dw0.SPD_3_Y = 1;
+
+ sp[10].dw0.SPD_0_X = -1;
+ sp[10].dw0.SPD_0_Y = 0;
+ sp[10].dw0.SPD_1_X = -1;
+ sp[10].dw0.SPD_1_Y = 0;
+ sp[10].dw0.SPD_2_X = -1;
+ sp[10].dw0.SPD_2_Y = 0;
+ sp[10].dw0.SPD_3_X = -1;
+ sp[10].dw0.SPD_3_Y = 0;
+
+ sp[11].dw0.SPD_0_X = -1;
+ sp[11].dw0.SPD_0_Y = 0;
+ sp[11].dw0.SPD_1_X = -1;
+ sp[11].dw0.SPD_1_Y = 0;
+ sp[11].dw0.SPD_2_X = -1;
+ sp[11].dw0.SPD_2_Y = 0;
+ sp[11].dw0.SPD_3_X = 0;
+ sp[11].dw0.SPD_3_Y = 0;
+ }
+}
+
+static void
+intel_gpgpu_bind_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel)
+{
+ intel_gpgpu_insert_vme_state_gen7(gpgpu, accel, 0);
+}
+
static void
intel_gpgpu_insert_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
{
@@ -1992,11 +2243,15 @@ intel_gpgpu_read_ts_reg_gen7(drm_intel_bufmgr *bufmgr)
i386 system. It seems the kernel readq bug. So shift 32 bit in x86_64, and only remain
32 bits data in i386.
*/
-#ifdef __i386__
- return result & 0x0ffffffff;
-#else
- return result >> 32;
-#endif /* __i386__ */
+ struct utsname buf;
+ uname(&buf);
+ /* In some systems, the user space is 32 bit, but kernel is 64 bit, so can't use the
+ * compiler's flag to determine the kernel'a architecture, use uname to get it. */
+ /* x86_64 in linux, amd64 in bsd */
+ if(strcmp(buf.machine, "x86_64") == 0 || strcmp(buf.machine, "amd64") == 0)
+ return result >> 32;
+ else
+ return result & 0x0ffffffff;
}
/* baytrail's result should clear high 4 bits */
@@ -2048,26 +2303,13 @@ intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
}
static int
-intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint32_t offset, uint8_t bti)
+intel_gpgpu_set_profiling_buf(intel_gpgpu_t *gpgpu, uint32_t size, uint32_t offset, uint8_t bti)
{
drm_intel_bo *bo = NULL;
- if (i == 0) { // the index buffer.
- if (gpgpu->printf_b.ibo)
- dri_bo_unreference(gpgpu->printf_b.ibo);
- gpgpu->printf_b.ibo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf index buffer", size, 4096);
- bo = gpgpu->printf_b.ibo;
- } else if (i == 1) {
- if (gpgpu->printf_b.bo)
- dri_bo_unreference(gpgpu->printf_b.bo);
- gpgpu->printf_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf output buffer", size, 4096);
- bo = gpgpu->printf_b.bo;
- } else
- assert(0);
+ gpgpu->profiling_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "Profiling buffer", size, 64);
+ bo = gpgpu->profiling_b.bo;
if (!bo || (drm_intel_bo_map(bo, 1) != 0)) {
- if (gpgpu->printf_b.bo)
- drm_intel_bo_unreference(gpgpu->printf_b.bo);
- gpgpu->printf_b.bo = NULL;
fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
return -1;
}
@@ -2077,66 +2319,89 @@ intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint
return 0;
}
+static void
+intel_gpgpu_set_profiling_info(intel_gpgpu_t *gpgpu, void* profiling_info)
+{
+ gpgpu->profiling_info = profiling_info;
+}
+
static void*
-intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
+intel_gpgpu_get_profiling_info(intel_gpgpu_t *gpgpu)
{
- drm_intel_bo *bo = NULL;
- if (i == 0) {
- bo = gpgpu->printf_b.ibo;
- } else if (i == 1) {
- bo = gpgpu->printf_b.bo;
- } else
- assert(0);
+ return gpgpu->profiling_info;
+}
+
+static int
+intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
+{
+ if (gpgpu->printf_b.bo)
+ dri_bo_unreference(gpgpu->printf_b.bo);
+ gpgpu->printf_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf buffer", size, 4096);
+ if (!gpgpu->printf_b.bo || (drm_intel_bo_map(gpgpu->printf_b.bo, 1) != 0)) {
+ fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+ return -1;
+ }
+
+ memset(gpgpu->printf_b.bo->virtual, 0, size);
+ *(uint32_t *)(gpgpu->printf_b.bo->virtual) = 4; // first four is for the length.
+ drm_intel_bo_unmap(gpgpu->printf_b.bo);
+ /* No need to bind, we do not need to emit reloc. */
+ intel_gpgpu_setup_bti(gpgpu, gpgpu->printf_b.bo, 0, size, bti, I965_SURFACEFORMAT_RAW);
+ return 0;
+}
+
+static void*
+intel_gpgpu_map_profiling_buf(intel_gpgpu_t *gpgpu)
+{
+ drm_intel_bo *bo = NULL;
+ bo = gpgpu->profiling_b.bo;
drm_intel_bo_map(bo, 1);
return bo->virtual;
}
static void
-intel_gpgpu_unmap_printf_buf_addr(intel_gpgpu_t *gpgpu, uint32_t i)
+intel_gpgpu_unmap_profiling_buf_addr(intel_gpgpu_t *gpgpu)
{
drm_intel_bo *bo = NULL;
- if (i == 0) {
- bo = gpgpu->printf_b.ibo;
- } else if (i == 1) {
- bo = gpgpu->printf_b.bo;
- } else
- assert(0);
+ bo = gpgpu->profiling_b.bo;
+ drm_intel_bo_unmap(bo);
+}
+
+static void*
+intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu)
+{
+ drm_intel_bo *bo = NULL;
+ bo = gpgpu->printf_b.bo;
+ drm_intel_bo_map(bo, 1);
+ return bo->virtual;
+}
+
+static void
+intel_gpgpu_unmap_printf_buf_addr(intel_gpgpu_t *gpgpu)
+{
+ drm_intel_bo *bo = NULL;
+ bo = gpgpu->printf_b.bo;
drm_intel_bo_unmap(bo);
}
static void
-intel_gpgpu_release_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
+intel_gpgpu_release_printf_buf(intel_gpgpu_t *gpgpu)
{
- if (i == 0) {
- drm_intel_bo_unreference(gpgpu->printf_b.ibo);
- gpgpu->printf_b.ibo = NULL;
- } else if (i == 1) {
- drm_intel_bo_unreference(gpgpu->printf_b.bo);
- gpgpu->printf_b.bo = NULL;
- } else
- assert(0);
+ drm_intel_bo_unreference(gpgpu->printf_b.bo);
+ gpgpu->printf_b.bo = NULL;
}
static void
-intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info, size_t * global_sz)
+intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info)
{
gpgpu->printf_info = printf_info;
- gpgpu->global_wk_sz[0] = global_sz[0];
- gpgpu->global_wk_sz[1] = global_sz[1];
- gpgpu->global_wk_sz[2] = global_sz[2];
}
static void*
-intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu, size_t * global_sz, size_t *outbuf_sz)
+intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu)
{
- global_sz[0] = gpgpu->global_wk_sz[0];
- global_sz[1] = gpgpu->global_wk_sz[1];
- global_sz[2] = gpgpu->global_wk_sz[2];
-
- if (gpgpu->printf_b.bo)
- *outbuf_sz = gpgpu->printf_b.bo->size;
return gpgpu->printf_info;
}
@@ -2159,6 +2424,7 @@ intel_set_gpgpu_callbacks(int device_id)
cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen7;
+ cl_gpgpu_bind_vme_state = (cl_gpgpu_bind_vme_state_cb *) intel_gpgpu_bind_vme_state_gen7;
cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
@@ -2168,6 +2434,11 @@ intel_set_gpgpu_callbacks(int device_id)
cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
+ cl_gpgpu_set_profiling_buffer = (cl_gpgpu_set_profiling_buffer_cb *)intel_gpgpu_set_profiling_buf;
+ cl_gpgpu_set_profiling_info = (cl_gpgpu_set_profiling_info_cb *)intel_gpgpu_set_profiling_info;
+ cl_gpgpu_get_profiling_info = (cl_gpgpu_get_profiling_info_cb *)intel_gpgpu_get_profiling_info;
+ cl_gpgpu_map_profiling_buffer = (cl_gpgpu_map_profiling_buffer_cb *)intel_gpgpu_map_profiling_buf;
+ cl_gpgpu_unmap_profiling_buffer = (cl_gpgpu_unmap_profiling_buffer_cb *)intel_gpgpu_unmap_profiling_buf_addr;
cl_gpgpu_set_printf_buffer = (cl_gpgpu_set_printf_buffer_cb *)intel_gpgpu_set_printf_buf;
cl_gpgpu_map_printf_buffer = (cl_gpgpu_map_printf_buffer_cb *)intel_gpgpu_map_printf_buf;
cl_gpgpu_unmap_printf_buffer = (cl_gpgpu_unmap_printf_buffer_cb *)intel_gpgpu_unmap_printf_buf_addr;
@@ -2196,15 +2467,15 @@ intel_set_gpgpu_callbacks(int device_id)
intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
return;
}
- if (IS_SKYLAKE(device_id) || IS_BROXTON(device_id)) {
+ if (IS_GEN9(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen9;
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen9;
intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
- intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //BDW need not restore SLM, same as gen7
+ intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //SKL need not restore SLM, same as gen7
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen9;
- intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen8;
+ intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen9;
intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8;
cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen8;
intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen9;
@@ -2236,6 +2507,7 @@ intel_set_gpgpu_callbacks(int device_id)
}
else if (IS_IVYBRIDGE(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
+ cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen7;
if (IS_BAYTRAIL_T(device_id)) {
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h
index ad7290e..904f9e0 100644
--- a/src/intel/intel_gpgpu.h
+++ b/src/intel/intel_gpgpu.h
@@ -44,8 +44,8 @@ struct intel_batchbuffer;
struct intel_gpgpu
{
void* ker_opaque;
- size_t global_wk_sz[3];
void* printf_info;
+ void* profiling_info;
struct intel_driver *drv;
struct intel_batchbuffer *batch;
cl_gpgpu_kernel *ker;
@@ -64,9 +64,8 @@ struct intel_gpgpu
struct { drm_intel_bo *bo; } scratch_b;
struct { drm_intel_bo *bo; } constant_b;
struct { drm_intel_bo *bo; } time_stamp_b; /* time stamp buffer */
- struct { drm_intel_bo *bo;
- drm_intel_bo *ibo;} printf_b; /* the printf buf and index buf*/
-
+ struct { drm_intel_bo *bo; } printf_b; /* the printf buf and index buf*/
+ struct { drm_intel_bo *bo; } profiling_b; /* the buf for profiling*/
struct { drm_intel_bo *bo; } aux_buf;
struct {
uint32_t surface_heap_offset;
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index fd6a82b..c112a16 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -381,6 +381,57 @@ typedef struct gen8_surface_state
} ss15;
} gen8_surface_state_t;
+typedef struct gen7_media_surface_state
+{
+ struct {
+ uint32_t base_addr;
+ } ss0;
+
+ struct {
+ uint32_t uv_offset_v_direction:2;
+ uint32_t pic_struct:2;
+ uint32_t width:14;
+ uint32_t height:14;
+ } ss1;
+
+ struct {
+ uint32_t tile_mode:2;
+ uint32_t half_pitch_for_chroma:1;
+ uint32_t surface_pitch:18;
+ uint32_t pad1:1;
+ uint32_t surface_object_control_state:4;
+ uint32_t pad0:1;
+ uint32_t interleave_chroma:1;
+ uint32_t surface_format:4;
+ } ss2;
+
+ struct {
+ uint32_t y_offset_for_u:14;
+ uint32_t pad1:2;
+ uint32_t x_offset_for_u:14;
+ uint32_t pad0:2;
+ } ss3;
+
+ struct {
+ uint32_t y_offset_for_v:15;
+ uint32_t pad1:1;
+ uint32_t x_offset_for_v:14;
+ uint32_t pad0:2;
+ } ss4;
+
+ struct {
+ uint32_t pad0;
+ } ss5;
+
+ struct {
+ uint32_t pad0;
+ } ss6;
+
+ struct {
+ uint32_t pad0;
+ } ss7;
+} gen7_media_surface_state_t;
+
typedef union gen_surface_state
{
gen7_surface_state_t gen7_surface_state;
@@ -555,6 +606,75 @@ typedef struct gen8_pipe_control
} dw5;
} gen8_pipe_control_t;
+#define GEN7_NUM_VME_SEARCH_PATH_STATES 14
+#define GEN7_NUM_VME_RD_LUT_SETS 4
+
+typedef struct gen7_vme_search_path_state
+{
+ struct {
+ uint32_t SPD_0_X : BITFIELD_RANGE(0, 3); //search path distance
+ uint32_t SPD_0_Y : BITFIELD_RANGE(4, 7);
+ uint32_t SPD_1_X : BITFIELD_RANGE(8, 11);
+ uint32_t SPD_1_Y : BITFIELD_RANGE(12, 15);
+ uint32_t SPD_2_X : BITFIELD_RANGE(16, 19);
+ uint32_t SPD_2_Y : BITFIELD_RANGE(20, 23);
+ uint32_t SPD_3_X : BITFIELD_RANGE(24, 27);
+ uint32_t SPD_3_Y : BITFIELD_RANGE(28, 31);
+ }dw0;
+}gen7_vme_search_path_state_t;
+
+typedef struct gen7_vme_rd_lut_set
+{
+ struct {
+ uint32_t LUT_MbMode_0 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MbMode_1 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MbMode_2 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MbMode_3 : BITFIELD_RANGE(24, 31);
+ }dw0;
+
+ struct {
+ uint32_t LUT_MbMode_4 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MbMode_5 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MbMode_6 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MbMode_7 : BITFIELD_RANGE(24, 31);
+ }dw1;
+
+ struct {
+ uint32_t LUT_MV_0 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MV_1 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MV_2 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MV_3 : BITFIELD_RANGE(24, 31);
+ }dw2;
+
+ struct {
+ uint32_t LUT_MV_4 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MV_5 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MV_6 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MV_7 : BITFIELD_RANGE(24, 31);
+ }dw3;
+}gen7_vme_rd_lut_set_t;
+
+typedef struct gen7_vme_state
+{
+ gen7_vme_search_path_state_t sp[GEN7_NUM_VME_SEARCH_PATH_STATES];
+
+ struct {
+ uint32_t LUT_MbMode_8_0 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MbMode_9_0 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MbMode_8_1 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MbMode_9_1 : BITFIELD_RANGE(24, 31);
+ }dw14;
+
+ struct {
+ uint32_t LUT_MbMode_8_2 : BITFIELD_RANGE(0, 7);
+ uint32_t LUT_MbMode_9_2 : BITFIELD_RANGE(8, 15);
+ uint32_t LUT_MbMode_8_3 : BITFIELD_RANGE(16, 23);
+ uint32_t LUT_MbMode_9_3 : BITFIELD_RANGE(24, 31);
+ }dw15;
+
+ gen7_vme_rd_lut_set_t lut[GEN7_NUM_VME_RD_LUT_SETS];
+}gen7_vme_state_t;
+
typedef struct gen6_sampler_state
{
struct {
diff --git a/src/kernels/cl_internal_block_motion_estimate_intel.cl b/src/kernels/cl_internal_block_motion_estimate_intel.cl
new file mode 100644
index 0000000..e56520a
--- /dev/null
+++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl
@@ -0,0 +1,369 @@
+typedef struct _motion_estimation_desc_intel {
+ uint mb_block_type;
+ uint subpixel_mode;
+ uint sad_adjust_mode;
+ uint search_path_type;
+} accelerator_intel_t;
+
+__kernel __attribute__((reqd_work_group_size(16,1,1)))
+void block_motion_estimate_intel(accelerator_intel_t accel,
+ __read_only image2d_t src_image,
+ __read_only image2d_t ref_image,
+ __global short2 * prediction_motion_vector_buffer,
+ __global short2 * motion_vector_buffer,
+ __global ushort * residuals){
+
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+
+ uint8 vme_result = (0, 0, 0, 0, 0, 0, 0, 0);
+
+ int lgid_x = get_group_id(0);
+ int lgid_y = get_group_id(1);
+
+ int num_groups_x = get_num_groups(0);
+ int index = lgid_y * num_groups_x + lgid_x;
+
+ uint2 srcCoord = 0;
+ short2 predict_mv = 0;
+ if(prediction_motion_vector_buffer != NULL){
+ predict_mv = prediction_motion_vector_buffer[index];
+ predict_mv.x = predict_mv.x / 4;
+ predict_mv.y = predict_mv.y / 4;
+ }
+
+ srcCoord.x = lgid_x * 16;
+ srcCoord.y = lgid_y * 16;
+
+ //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
+ if(accel.search_path_type == 0x0){
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+ src_grf0_dw5 = (20 << 24) | (20 << 16) | (0 << 8) | (0);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ | (0 << 16) | (2 << 8) | (2);
+ }
+ //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
+ else if(accel.search_path_type == 0x1){
+ src_grf0_dw5 = (24 << 24) | (24 << 16) | (0 << 8) | (0);
+ src_grf0_dw1 = ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & 0x0000ffff);
+ src_grf0_dw0 = ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & 0x0000ffff);
+ src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+ | (0 << 16) | (48 << 8) | (48);
+ }
+ //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
+ else if(accel.search_path_type == 0x5){
+ src_grf0_dw5 = (40 << 24) | (48 << 16) | (0 << 8) | (0);
+ src_grf0_dw1 = ((-12 + predict_mv.y) << 16 ) | ((-16 + predict_mv.x) & 0x0000ffff);
+ src_grf0_dw0 = ((-12 + predict_mv.y) << 16 ) | ((-16 + + predict_mv.x) & 0x0000ffff);
+ src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+ | (0 << 16) | (48 << 8) | (48);
+ }
+
+ /*Deal with mb_block_type & sad_adjust_mode & subpixel_mode*/
+ uchar sub_mb_part_mask = 0;
+ //CL_ME_MB_TYPE_16x16_INTEL
+ if(accel.mb_block_type == 0x0)
+ sub_mb_part_mask = 0x7e;
+ //CL_ME_MB_TYPE_8x8_INTEL
+ else if(accel.mb_block_type == 0x1)
+ sub_mb_part_mask = 0x77;
+ //CL_ME_MB_TYPE_4x4_INTEL
+ else if(accel.mb_block_type == 0x2)
+ sub_mb_part_mask = 0x3f;
+
+ uchar inter_sad = 0;
+ //CL_ME_SAD_ADJUST_MODE_NONE_INTEL
+ if(accel.sad_adjust_mode == 0x0)
+ inter_sad = 0;
+ //CL_ME_SAD_ADJUST_MODE_HAAR_INTEL
+ else if(accel.sad_adjust_mode == 0x1)
+ inter_sad = 2;
+
+ uchar sub_pel_mode = 0;
+ //CL_ME_SUBPIXEL_MODE_INTEGER_INTEL
+ if(accel.subpixel_mode == 0x0)
+ sub_pel_mode = 0;
+ //CL_ME_SUBPIXEL_MODE_HPEL_INTEL
+ else if(accel.subpixel_mode == 0x1)
+ sub_pel_mode = 1;
+ //CL_ME_SUBPIXEL_MODE_QPEL_INTEL
+ else if(accel.subpixel_mode == 0x2)
+ sub_pel_mode = 3;
+
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (sub_mb_part_mask << 24) | (0 << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (inter_sad << 20) | (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (sub_pel_mode << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (srcCoord.y << 16) | (srcCoord.x);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ /*src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ | (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);*/
+ src_grf1_dw7 = 0;
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = (Cost_Center1Y << 16) | (Cost_Center1X);
+ src_grf1_dw4 = (Cost_Center0Y << 16) | (Cost_Center0X);
+ src_grf1_dw3 = (Ime_Too_Good << 24 ) | (Ime_Too_Bad << 16) | (Part_Tolerance_Thrhd << 8) | (FBPrunThrhd);*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ src_grf1_dw3 = 0;
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ //src_grf1_dw1 = (0 << 24) | (2);
+ src_grf1_dw1 = (0 << 24) | (16);
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 0;
+ /*src_grf2_dw7 = Ref1_SkipCenter_3_Delta_XY;
+ src_grf2_dw6 = Ref0_SkipCenter_3_Delta_XY;
+ src_grf2_dw5 = Ref1_SkipCenter_2_Delta_XY;
+ src_grf2_dw4 = Ref0_SkipCenter_3_Delta_XY;
+ src_grf2_dw3 = Ref1_SkipCenter_1_Delta_XY;
+ src_grf2_dw2 = Ref0_SkipCenter_1_Delta_XY;
+ src_grf2_dw1 = Ref1_SkipCenter_0_Delta_XY;
+ src_grf2_dw0 = (Ref0_Skip_Center_0_Delta_Y << 16) | (Ref0_Skip_Center_0_Delta_X);
+ src_grf3_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+ src_grf3_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+ src_grf3_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+ src_grf3_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+ src_grf3_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+ src_grf3_dw2 = (Neighbor pixel Luma value [3, -1] << 24) | (Neighbor pixel Luma value [2, -1] << 16)
+ | (Neighbor pixel Luma value [1, -1] << 8) | (Neighbor pixel Luma value [0, -1]);
+ //src_grf3_dw1 = (?) | (Reserved) | ((Intra_16x16_Mode_Mask);
+ src_grf3_dw0 = (Reserved<<25) | (Intra_16x16_Mode_Mask << 16) | (Reserved) | (Intra_16x16_Mode_Mask);
+ src_grf4_dw7 = Reserved;
+ src_grf4_dw6 = Reserved;
+ src_grf4_dw5 = Reserved;
+ src_grf4_dw4 = (Intra_MxM_Pred_Mode_B15 << 28) | (Intra_MxM_Pred_Mode_B14 << 24) | (Intra_MxM_Pred_Mode_B11 << 20)
+ | (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12) | (Intra_MxM_Pred_Mode_A13 << 8)
+ | (Intra_MxM_Pred_Mode_A7 << 4) | (Intra_MxM_Pred_Mode_A5);
+ //src_grf4_dw3 = (?) | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+ src_grf4_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+ src_grf4_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+ src_grf4_dw0 = (Neighbor pixel Luma value [-1, 3] << 24) | (Neighbor pixel Luma value [-1, 2] << 16)
+ | (Neighbor pixel Luma value [-1, 1] << 8) | (Neighbor pixel Luma value [-1, 0]);*/
+ src_grf2_dw7 = 0;
+ src_grf2_dw6 = 0;
+ src_grf2_dw5 = 0;
+ src_grf2_dw4 = 0;
+ src_grf2_dw3 = 0;
+ src_grf2_dw2 = 0;
+ src_grf2_dw1 = 0;
+ src_grf2_dw0 = 0;
+ src_grf3_dw7 = 0;
+ src_grf3_dw6 = 0;
+ src_grf3_dw5 = 0;
+ src_grf3_dw4 = 0;
+ src_grf3_dw3 = 0;
+ src_grf3_dw2 = 0;
+ src_grf3_dw1 = 0;
+ src_grf3_dw0 = 0;
+ src_grf4_dw7 = 0;
+ src_grf4_dw6 = 0;
+ src_grf4_dw5 = 0;
+ src_grf4_dw4 = 0;
+ src_grf4_dw3 = 0;
+ src_grf4_dw2 = 0;
+ src_grf4_dw1 = 0;
+ src_grf4_dw0 = 0;
+
+ int lid_x = get_local_id(0);
+
+ vme_result = __gen_ocl_vme(src_image, ref_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ //msg_type, vme_search_path_lut, lut_sub,
+ 1, 0, 0);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ short2 mv[16];
+ ushort res[16];
+
+ uint write_back_dwx;
+ uint simd_width = get_max_sub_group_size();
+
+ /* In simd 8 mode, one kernel variable 'uint' map to 8 dword.
+ * In simd 16 mode, one kernel variable 'uint' map to 16 dword.
+ * That's why we should treat simd8 and simd16 differently when
+ * use __gen_ocl_region.
+ * */
+ if(simd_width == 8){
+ write_back_dwx = __gen_ocl_region(0, vme_result.s1);
+ mv[0] = as_short2( write_back_dwx );
+
+ if(accel.mb_block_type > 0x0){
+ for(int i = 2, j = 1; j < 4; i += 2, j++){
+ write_back_dwx = __gen_ocl_region(i, vme_result.s1);
+ mv[j] = as_short2( write_back_dwx );
+ }
+ if(accel.mb_block_type > 0x1){
+ for(int i = 0, j = 4; j < 8; i += 2, j++){
+ write_back_dwx = __gen_ocl_region(i, vme_result.s2);
+ mv[j] = as_short2( write_back_dwx );
+ }
+ for(int i = 0, j = 8; j < 12; i += 2, j++){
+ write_back_dwx = __gen_ocl_region(i, vme_result.s3);
+ mv[j] = as_short2( write_back_dwx );
+ }
+ for(int i = 0, j = 12; j < 16; i += 2, j++){
+ write_back_dwx = __gen_ocl_region(i, vme_result.s4);
+ mv[j] = as_short2( write_back_dwx );
+ }
+ }
+ }
+ ushort2 temp_res;
+ for(int i = 0; i < 8; i++){
+ write_back_dwx = __gen_ocl_region(i, vme_result.s5);
+ temp_res = as_ushort2(write_back_dwx);
+ res[i*2] = temp_res.s0;
+ res[i*2+1] = temp_res.s1;
+ }
+ }
+ else if(simd_width == 16){
+ write_back_dwx = __gen_ocl_region(0 + 8, vme_result.s0);
+ mv[0] = as_short2( write_back_dwx );
+
+ if(accel.mb_block_type > 0x0){
+ for(int i = 2, j = 1; j < 4; i += 2, j++){
+ write_back_dwx = __gen_ocl_region(i + 8, vme_result.s0);
+ mv[j] = as_short2( write_back_dwx );
+ }
+ if(accel.mb_block_type > 0x1){
+ for(int i = 0, j = 4; j < 8; i += 2, j++){
+ write_back_dwx = __gen_ocl_region(i, vme_result.s1);
+ mv[j] = as_short2( write_back_dwx );
+ }
+ for(int i = 0, j = 8; j < 12; i += 2, j++){
+ write_back_dwx = __gen_ocl_region(i + 8, vme_result.s1);
+ mv[j] = as_short2( write_back_dwx );
+ }
+ for(int i = 0, j = 12; j < 16; i += 2, j++){
+ write_back_dwx = __gen_ocl_region(i, vme_result.s2);
+ mv[j] = as_short2( write_back_dwx );
+ }
+ }
+ }
+ ushort2 temp_res;
+ for(int i = 0; i < 8; i++){
+ write_back_dwx = __gen_ocl_region(i + 8, vme_result.s2);
+ temp_res = as_ushort2(write_back_dwx);
+ res[i*2] = temp_res.s0;
+ res[i*2+1] = temp_res.s1;
+ }
+ }
+
+ int mv_index;
+
+ //CL_ME_MB_TYPE_16x16_INTEL
+ if(accel.mb_block_type == 0x0){
+ mv_index = index * 1;
+ if( lid_x == 0 ){
+ motion_vector_buffer[mv_index] = mv[lid_x];
+ if(residuals)
+ residuals[mv_index] = 2 * res[lid_x];
+ }
+ }
+ //CL_ME_MB_TYPE_8x8_INTEL
+ else if(accel.mb_block_type == 0x1){
+ if(lid_x < 4){
+ mv_index = lgid_y * num_groups_x * 4 + lgid_x * 2;
+ mv_index = mv_index + num_groups_x * 2 * (lid_x / 2) + (lid_x % 2);
+ motion_vector_buffer[mv_index] = mv[lid_x];
+ if(residuals)
+ residuals[mv_index] = 2 * res[lid_x];
+ }
+ }
+ //CL_ME_MB_TYPE_4x4_INTEL
+ else if(accel.mb_block_type == 0x2){
+ if(lid_x < 16){
+ mv_index = lgid_y * num_groups_x * 16 + lgid_x * 4;
+ mv_index = mv_index + num_groups_x * 4 * (lid_x / 4) + (lid_x % 4);
+ motion_vector_buffer[mv_index] = mv[lid_x];
+ if(residuals)
+ residuals[mv_index] = 2 * res[lid_x];
+ }
+ }
+
+}
diff --git a/src/performance.c b/src/performance.c
index 85cd481..28bd6c6 100644
--- a/src/performance.c
+++ b/src/performance.c
@@ -280,6 +280,9 @@ static void insert(cl_context context, const char *kernel_name, const char *buil
}
context_storage_node *p_context = find_context(context);
kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, build_opt);
+ if(!p_kernel)
+ return;
+
prev_context_pointer = p_context;
prev_kernel_pointer = p_kernel;
p_kernel->kernel_times[p_kernel->current_count++] = time;
diff --git a/src/x11/dricommon.c b/src/x11/dricommon.c
index 16f50e4..98eb713 100644
--- a/src/x11/dricommon.c
+++ b/src/x11/dricommon.c
@@ -68,6 +68,9 @@ dri_state_do_drawable_hash(dri_state_t *state, XID drawable)
}
dri_drawable = dri_state_create_drawable(state, drawable);
+ if(dri_drawable == NULL)
+ return NULL;
+
dri_drawable->x_drawable = drawable;
dri_drawable->next = state->drawable_hash[index];
state->drawable_hash[index] = dri_drawable;
@@ -283,7 +286,8 @@ getDRI2State(Display* dpy, int screen, char **driver_name)
&internal_driver_name, &device_name))
goto err_out;
- fd = open(device_name, O_RDWR);
+ if(device_name != NULL )
+ fd = open(device_name, O_RDWR);
if (fd < 0)
goto err_out;
diff --git a/utests/Android.mk b/utests/Android.mk
new file mode 100644
index 0000000..63dba2a
--- /dev/null
+++ b/utests/Android.mk
@@ -0,0 +1,248 @@
+MY_LOCAL_PATH := $(call my-dir)
+LOCAL_PATH:= $(MY_LOCAL_PATH)
+include $(CLEAR_VARS)
+
+include $(LOCAL_PATH)/../Android.common.mk
+
+SUBDIR_C_INCLUDES := $(TOP_C_INCLUDE) $(LOCAL_PATH)/../include
+SUBDIR_CPPFLAGS := $(TOP_CPPFLAGS)
+SUBDIR_CPPFLAGS += -fexceptions -std=c++11
+SUBDIR_LOCAL_CFLAGS := $(TOP_CFLAGS)
+LOCAL_LDFLAGS := -Wl,-Bsymbolic
+
+LOCAL_SRC_FILES:= \
+ utest_error.c \
+ utest_assert.cpp \
+ utest.cpp \
+ utest_file_map.cpp \
+ utest_helper.cpp \
+ compiler_basic_arithmetic.cpp \
+ compiler_displacement_map_element.cpp \
+ compiler_mandelbrot.cpp \
+ compiler_mandelbrot_alternate.cpp \
+ compiler_box_blur_float.cpp \
+ compiler_box_blur_image.cpp \
+ compiler_box_blur.cpp \
+ compiler_insert_to_constant.cpp \
+ compiler_argument_structure.cpp \
+ compiler_argument_structure_indirect.cpp \
+ compiler_argument_structure_select.cpp \
+ compiler_arith_shift_right.cpp \
+ compiler_mixed_pointer.cpp \
+ compiler_array0.cpp \
+ compiler_array.cpp \
+ compiler_array1.cpp \
+ compiler_array2.cpp \
+ compiler_array3.cpp \
+ compiler_array4.cpp \
+ compiler_byte_scatter.cpp \
+ compiler_ceil.cpp \
+ compiler_popcount.cpp \
+ compiler_convert_uchar_sat.cpp \
+ compiler_copy_buffer.cpp \
+ compiler_copy_image.cpp \
+ compiler_copy_image_1d.cpp \
+ compiler_copy_image_3d.cpp \
+ compiler_copy_buffer_row.cpp \
+ compiler_degrees.cpp \
+ compiler_step.cpp \
+ compiler_fabs.cpp \
+ compiler_abs.cpp \
+ compiler_abs_diff.cpp \
+ compiler_fill_image.cpp \
+ compiler_fill_image0.cpp \
+ compiler_fill_image_1d.cpp \
+ compiler_fill_image_3d.cpp \
+ compiler_fill_image_3d_2.cpp \
+ compiler_function_argument0.cpp \
+ compiler_function_argument1.cpp \
+ compiler_function_argument2.cpp \
+ compiler_function_argument.cpp \
+ compiler_function_constant0.cpp \
+ compiler_function_constant1.cpp \
+ compiler_function_constant.cpp \
+ compiler_global_constant.cpp \
+ compiler_global_constant_2.cpp \
+ compiler_group_size.cpp \
+ compiler_hadd.cpp \
+ compiler_if_else.cpp \
+ compiler_integer_division.cpp \
+ compiler_integer_remainder.cpp \
+ compiler_insert_vector.cpp \
+ compiler_lower_return0.cpp \
+ compiler_lower_return1.cpp \
+ compiler_lower_return2.cpp \
+ compiler_mad_hi.cpp \
+ compiler_mul_hi.cpp \
+ compiler_mad24.cpp \
+ compiler_mul24.cpp \
+ compiler_multiple_kernels.cpp \
+ compiler_radians.cpp \
+ compiler_rhadd.cpp \
+ compiler_rotate.cpp \
+ compiler_saturate.cpp \
+ compiler_saturate_sub.cpp \
+ compiler_shift_right.cpp \
+ compiler_short_scatter.cpp \
+ compiler_smoothstep.cpp \
+ compiler_uint2_copy.cpp \
+ compiler_uint3_copy.cpp \
+ compiler_uint8_copy.cpp \
+ compiler_uint16_copy.cpp \
+ compiler_uint3_unaligned_copy.cpp \
+ compiler_upsample_int.cpp \
+ compiler_upsample_long.cpp \
+ compiler_unstructured_branch0.cpp \
+ compiler_unstructured_branch1.cpp \
+ compiler_unstructured_branch2.cpp \
+ compiler_unstructured_branch3.cpp \
+ compiler_write_only_bytes.cpp \
+ compiler_write_only.cpp \
+ compiler_write_only_shorts.cpp \
+ compiler_switch.cpp \
+ compiler_bswap.cpp \
+ compiler_clz.cpp \
+ compiler_math.cpp \
+ compiler_atomic_functions.cpp \
+ compiler_async_copy.cpp \
+ compiler_async_stride_copy.cpp \
+ compiler_insn_selection_min.cpp \
+ compiler_insn_selection_max.cpp \
+ compiler_insn_selection_masked_min_max.cpp \
+ compiler_load_bool_imm.cpp \
+ compiler_global_memory_barrier.cpp \
+ compiler_local_memory_two_ptr.cpp \
+ compiler_local_memory_barrier.cpp \
+ compiler_local_memory_barrier_wg64.cpp \
+ compiler_local_memory_barrier_2.cpp \
+ compiler_local_slm.cpp \
+ compiler_movforphi_undef.cpp \
+ compiler_volatile.cpp \
+ compiler_copy_image1.cpp \
+ compiler_get_image_info.cpp \
+ compiler_get_image_info_array.cpp \
+ compiler_vect_compare.cpp \
+ compiler_vector_load_store.cpp \
+ compiler_vector_inc.cpp \
+ compiler_cl_finish.cpp \
+ get_cl_info.cpp \
+ builtin_atan2.cpp \
+ builtin_bitselect.cpp \
+ builtin_frexp.cpp \
+ builtin_mad_sat.cpp \
+ builtin_modf.cpp \
+ builtin_nextafter.cpp \
+ builtin_remquo.cpp \
+ builtin_shuffle.cpp \
+ builtin_shuffle2.cpp \
+ builtin_sign.cpp \
+ builtin_lgamma.cpp \
+ builtin_lgamma_r.cpp \
+ builtin_tgamma.cpp \
+ buildin_work_dim.cpp \
+ builtin_global_size.cpp \
+ builtin_local_size.cpp \
+ builtin_global_id.cpp \
+ builtin_num_groups.cpp \
+ builtin_local_id.cpp \
+ builtin_acos_asin.cpp \
+ builtin_pow.cpp \
+ builtin_convert_sat.cpp \
+ sub_buffer.cpp \
+ runtime_createcontext.cpp \
+ runtime_set_kernel_arg.cpp \
+ runtime_null_kernel_arg.cpp \
+ runtime_event.cpp \
+ runtime_barrier_list.cpp \
+ runtime_marker_list.cpp \
+ runtime_compile_link.cpp \
+ compiler_long.cpp \
+ compiler_long_2.cpp \
+ compiler_long_not.cpp \
+ compiler_long_hi_sat.cpp \
+ compiler_long_div.cpp \
+ compiler_long_convert.cpp \
+ compiler_long_shl.cpp \
+ compiler_long_shr.cpp \
+ compiler_long_asr.cpp \
+ compiler_long_mult.cpp \
+ compiler_long_cmp.cpp \
+ compiler_long_bitcast.cpp \
+ compiler_half.cpp \
+ compiler_function_argument3.cpp \
+ compiler_function_qualifiers.cpp \
+ compiler_bool_cross_basic_block.cpp \
+ compiler_private_const.cpp \
+ compiler_private_data_overflow.cpp \
+ compiler_getelementptr_bitcast.cpp \
+ compiler_time_stamp.cpp \
+ compiler_double_precision.cpp \
+ load_program_from_gen_bin.cpp \
+ load_program_from_spir.cpp \
+ get_arg_info.cpp \
+ profiling_exec.cpp \
+ enqueue_copy_buf.cpp \
+ enqueue_copy_buf_unaligned.cpp \
+ test_printf.cpp \
+ enqueue_fill_buf.cpp \
+ builtin_kernel_max_global_size.cpp \
+ image_1D_buffer.cpp \
+ image_from_buffer.cpp \
+ compare_image_2d_and_1d_array.cpp \
+ compiler_fill_image_1d_array.cpp \
+ compiler_fill_image_2d_array.cpp \
+ compiler_constant_expr.cpp \
+ compiler_assignment_operation_in_if.cpp \
+ vload_bench.cpp \
+ runtime_use_host_ptr_buffer.cpp \
+ runtime_alloc_host_ptr_buffer.cpp \
+ runtime_use_host_ptr_image.cpp \
+ compiler_get_max_sub_group_size.cpp \
+ compiler_get_sub_group_local_id.cpp \
+ compiler_sub_group_shuffle.cpp
+
+ifeq ($(EGL_FOUND),true)
+LOCAL_SRC_FILES += \
+ compiler_fill_gl_image.cpp
+SUBDIR_CPPFLAGS += -DHAS_EGL
+SUBDIR_CFLAGS += -DHAS_EGL
+endif
+
+LOCAL_SHARED_LIBRARIES := \
+libcl \
+libm \
+libdl
+
+LOCAL_C_INCLUDES := $(SUBDIR_C_INCLUDES)
+LOCAL_CPPFLAGS := $(SUBDIR_CPPFLAGS)
+LOCAL_CFLAGS := $(SUBDIR_CFLAGS)
+LOCAL_MODULE := libutests
+
+#LOCAL_CLANG := true
+include external/libcxx/libcxx.mk
+include $(BUILD_SHARED_LIBRARY)
+
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := utest_run
+LOCAL_SRC_FILES:= utest_run.cpp
+
+LOCAL_SHARED_LIBRARIES := \
+libutests \
+libm \
+libdl
+
+LOCAL_C_INCLUDES := $(SUBDIR_C_INCLUDES)
+LOCAL_CPPFLAGS := $(SUBDIR_CPPFLAGS)
+LOCAL_CFLAGS := $(SUBDIR_CFLAGS)
+
+LOCAL_MULTILIB := both
+LOCAL_MODULE_STEM_32 := utest_run-x86
+LOCAL_MODULE_STEM_64 := utest_run-x86_64
+
+
+#LOCAL_CLANG := true
+include external/libcxx/libcxx.mk
+include $(BUILD_EXECUTABLE)
+
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index e7a9e26..0c3cb00 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -1,3 +1,44 @@
+###################################################################################
+# these configurations are copied from beignet root directory cmake for stand alone build.
+# do NOT set NOT_BUILD_STAND_ALONE_UTEST if building the utest standalone.
+if (NOT NOT_BUILD_STAND_ALONE_UTEST)
+ message(STATUS "Building Stand Alone Utest")
+
+ CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
+
+ INCLUDE (FindPkgConfig)
+ Find_Package(PythonInterp)
+
+ # OpenCL
+ pkg_check_modules(OPENCL REQUIRED OpenCL)
+ IF(OPENCL_FOUND)
+ INCLUDE_DIRECTORIES(${OPENCL_INCLUDE_DIRS})
+ ENDIF(OPENCL_FOUND)
+
+ # Force Release with debug info
+ if (NOT CMAKE_BUILD_TYPE)
+ set (CMAKE_BUILD_TYPE RelWithDebInfo)
+ endif (NOT CMAKE_BUILD_TYPE)
+ message(STATUS "Building mode: " ${CMAKE_BUILD_TYPE})
+
+ set (CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "assure config" FORCE)
+
+ # Threads
+ Find_Package(Threads)
+
+ set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -std=c++0x -Wno-invalid-offsetof")
+ set (CMAKE_C_FLAGS "${CMAKE_C_CXX_FLAGS}")
+ set (CMAKE_CXX_FLAGS_DEBUG "-O0 -g -DGBE_DEBUG=1")
+ set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+ set (CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_CXX_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_C_FLAGS_DEBUG "-O0 -g -DGBE_DEBUG=1")
+ set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+ set (CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_C_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
+endif (NOT NOT_BUILD_STAND_ALONE_UTEST)
+###################################################################################
+
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/../include)
@@ -27,7 +68,9 @@ set (utests_basic_sources
utest_helper.cpp)
# the test case with binary kernel
-set (utests_binary_kernel_sources load_program_from_bin_file.cpp enqueue_built_in_kernels.cpp)
+if (NOT_BUILD_STAND_ALONE_UTEST)
+ set (utests_binary_kernel_sources load_program_from_bin_file.cpp enqueue_built_in_kernels.cpp)
+endif (NOT_BUILD_STAND_ALONE_UTEST)
set (utests_sources
compiler_basic_arithmetic.cpp
@@ -119,6 +162,18 @@ set (utests_sources
compiler_math.cpp
compiler_atomic_functions.cpp
compiler_async_copy.cpp
+ compiler_workgroup_broadcast.cpp
+ compiler_workgroup_reduce.cpp
+ compiler_workgroup_scan_exclusive.cpp
+ compiler_workgroup_scan_inclusive.cpp
+ compiler_subgroup_broadcast.cpp
+ compiler_subgroup_reduce.cpp
+ compiler_subgroup_scan_exclusive.cpp
+ compiler_subgroup_scan_inclusive.cpp
+ compiler_subgroup_buffer_block_read.cpp
+ compiler_subgroup_buffer_block_write.cpp
+ compiler_subgroup_image_block_read.cpp
+ compiler_subgroup_image_block_write.cpp
compiler_async_stride_copy.cpp
compiler_insn_selection_min.cpp
compiler_insn_selection_max.cpp
@@ -159,6 +214,10 @@ set (utests_sources
builtin_global_id.cpp
builtin_num_groups.cpp
builtin_local_id.cpp
+ builtin_sub_group_size.cpp
+ builtin_max_sub_group_size.cpp
+ builtin_num_sub_groups.cpp
+ builtin_sub_group_id.cpp
builtin_acos_asin.cpp
builtin_pow.cpp
builtin_exp.cpp
@@ -190,10 +249,11 @@ set (utests_sources
compiler_private_const.cpp
compiler_private_data_overflow.cpp
compiler_getelementptr_bitcast.cpp
- compiler_sub_group_any.cpp
- compiler_sub_group_all.cpp
compiler_time_stamp.cpp
compiler_double_precision.cpp
+ compiler_double.cpp
+ compiler_double_div.cpp
+ compiler_double_convert.cpp
load_program_from_gen_bin.cpp
load_program_from_spir.cpp
get_arg_info.cpp
@@ -204,6 +264,7 @@ set (utests_sources
enqueue_fill_buf.cpp
builtin_kernel_max_global_size.cpp
image_1D_buffer.cpp
+ image_from_buffer.cpp
compare_image_2d_and_1d_array.cpp
compiler_fill_image_1d_array.cpp
compiler_fill_image_2d_array.cpp
@@ -212,9 +273,18 @@ set (utests_sources
vload_bench.cpp
runtime_use_host_ptr_buffer.cpp
runtime_alloc_host_ptr_buffer.cpp
- compiler_get_sub_group_size.cpp
- compiler_get_sub_group_id.cpp
- compiler_sub_group_shuffle.cpp)
+ runtime_use_host_ptr_image.cpp
+ compiler_get_max_sub_group_size.cpp
+ compiler_get_sub_group_local_id.cpp
+ compiler_sub_group_shuffle.cpp
+ compiler_sub_group_shuffle_down.cpp
+ compiler_sub_group_shuffle_up.cpp
+ compiler_sub_group_shuffle_xor.cpp
+ builtin_global_linear_id.cpp
+ builtin_local_linear_id.cpp
+ compiler_mix.cpp
+ compiler_math_3op.cpp
+ compiler_bsort.cpp)
if (LLVM_VERSION_NODOT VERSION_GREATER 34)
SET(utests_sources
@@ -222,14 +292,22 @@ if (LLVM_VERSION_NODOT VERSION_GREATER 34)
compiler_overflow.cpp)
endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
-if (X11_FOUND)
- SET(utests_sources
+if (NOT_BUILD_STAND_ALONE_UTEST)
+ if (X11_FOUND)
+ SET(utests_sources
${utests_sources}
runtime_climage_from_boname.cpp)
- SET(UTESTS_REQUIRED_X11_LIB ${X11_LIBRARIES} ${XEXT_LIBRARIES})
-else()
- SET(UTESTS_REQUIRED_X11_LIB "")
-endif (X11_FOUND)
+ SET(UTESTS_REQUIRED_X11_LIB ${X11_LIBRARIES} ${XEXT_LIBRARIES})
+ else()
+ SET(UTESTS_REQUIRED_X11_LIB "")
+ endif (X11_FOUND)
+endif (NOT_BUILD_STAND_ALONE_UTEST)
+
+if (CMRT_FOUND)
+ SET(utests_sources
+ ${utests_sources}
+ runtime_cmrt.cpp)
+endif (CMRT_FOUND)
SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
@@ -246,8 +324,10 @@ else(GEN_PCI_ID)
DEPENDS ${GBE_BIN_FILE} ${kernel_bin}.cl)
endif(GEN_PCI_ID)
-ADD_CUSTOM_TARGET(kernel_bin.bin
- DEPENDS ${kernel_bin}.bin)
+if (NOT_BUILD_STAND_ALONE_UTEST)
+ SET(utests_sources ${utests_sources} builtin_kernel_block_motion_estimate_intel.cpp)
+ ADD_CUSTOM_TARGET(kernel_bin.bin DEPENDS ${kernel_bin}.bin)
+endif (NOT_BUILD_STAND_ALONE_UTEST)
add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/generated
COMMAND mkdir ${CMAKE_CURRENT_SOURCE_DIR}/generated -p
@@ -281,12 +361,21 @@ endif ()
ADD_LIBRARY(utests SHARED ${utests_sources})
-TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT} ${UTESTS_REQUIRED_X11_LIB})
+if (NOT_BUILD_STAND_ALONE_UTEST)
+ TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT} ${UTESTS_REQUIRED_X11_LIB})
+else()
+ TARGET_LINK_LIBRARIES(utests ${OPENCL_LIBRARIES} m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT} ${UTESTS_REQUIRED_X11_LIB})
+endif()
ADD_EXECUTABLE(utest_run utest_run.cpp)
TARGET_LINK_LIBRARIES(utest_run utests)
-ADD_DEPENDENCIES (utest_run kernel_bin.bin)
+
+if (NOT_BUILD_STAND_ALONE_UTEST)
+ ADD_DEPENDENCIES (utest_run kernel_bin.bin)
+endif (NOT_BUILD_STAND_ALONE_UTEST)
+
ADD_DEPENDENCIES (utests utest_generator)
ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp)
TARGET_LINK_LIBRARIES(flat_address_space utests)
+ADD_CUSTOM_TARGET(utest DEPENDS utest_run utests flat_address_space)
diff --git a/utests/buildin_work_dim.cpp b/utests/buildin_work_dim.cpp
index d678c0f..4740c80 100644
--- a/utests/buildin_work_dim.cpp
+++ b/utests/buildin_work_dim.cpp
@@ -3,8 +3,6 @@
static void buildin_work_dim(void)
{
// Setup kernel and buffers
-
- int result, err;
OCL_CREATE_KERNEL("buildin_work_dim");
OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
@@ -23,14 +21,9 @@ static void buildin_work_dim(void)
// Run the kernel
OCL_NDRANGE(i);
- err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &result, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read output array! %d\n", err);
- exit(1);
- }
-
- OCL_ASSERT( result == i);
+ OCL_MAP_BUFFER(0);
+ OCL_ASSERT( ((int*)buf_data[0])[0]== i);
+ OCL_UNMAP_BUFFER(0);
}
}
diff --git a/utests/builtin_acos_asin.cpp b/utests/builtin_acos_asin.cpp
index 0187226..21fe461 100644
--- a/utests/builtin_acos_asin.cpp
+++ b/utests/builtin_acos_asin.cpp
@@ -10,7 +10,9 @@
printf("\033[0m");\
}
-const float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 1, 30};
+namespace {
+
+float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 1, 30};
const int count_input = sizeof(input_data) / sizeof(input_data[0]);
const int max_function = 5;
@@ -44,7 +46,8 @@ static void builtin_acos_asin(void)
locals[0] = 1;
clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data, 0, NULL, NULL);
- clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &max_function , 0, NULL, NULL);
+ int maxfunc = max_function;
+ clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &maxfunc, 0, NULL, NULL);
// Run the kernel
OCL_NDRANGE( 1 );
@@ -59,10 +62,10 @@ static void builtin_acos_asin(void)
{
index_cur = k * max_function + i;
#if udebug
- if (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])){
+ if (std::isinf(cpu_data[index_cur]) && !std::isinf(gpu_data[index_cur])){
printf_c("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
}
- else if (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])){
+ else if (std::isnan(cpu_data[index_cur]) && !std::isnan(gpu_data[index_cur])){
printf_c("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
}
else if(fabs(gpu_data[index_cur] - cpu_data[index_cur]) > 1e-3f){
@@ -71,10 +74,10 @@ static void builtin_acos_asin(void)
else
printf("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
#else
- if (isinf(cpu_data[index_cur]))
- OCL_ASSERT(isinf(gpu_data[index_cur]));
- else if (isnan(cpu_data[index_cur]))
- OCL_ASSERT(isnan(gpu_data[index_cur]));
+ if (std::isinf(cpu_data[index_cur]))
+ OCL_ASSERT(std::isinf(gpu_data[index_cur]));
+ else if (std::isnan(cpu_data[index_cur]))
+ OCL_ASSERT(std::isnan(gpu_data[index_cur]));
else
{
OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < 1e-3f);
@@ -85,3 +88,4 @@ static void builtin_acos_asin(void)
}
MAKE_UTEST_FROM_FUNCTION(builtin_acos_asin)
+}
diff --git a/utests/builtin_exp.cpp b/utests/builtin_exp.cpp
index d5288c8..2c214bd 100644
--- a/utests/builtin_exp.cpp
+++ b/utests/builtin_exp.cpp
@@ -5,7 +5,7 @@
#define udebug 0
#define FLT_MAX 0x1.fffffep127f
-#define FLT_MIN 0x1.0p-126f
+#define FLT_MIN ldexpf(1.0,-126)
#define FLT_ULP (1.0e-6f)
#define printf_c(...) \
@@ -15,7 +15,9 @@
printf("\033[0m");\
}
-const float input_data[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, 80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0 };
+namespace{
+
+float input_data[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, 80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0 };
const int count_input = sizeof(input_data) / sizeof(input_data[0]);
const int max_function = 5;
@@ -51,7 +53,8 @@ static void builtin_exp(void)
locals[0] = 1;
clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data, 0, NULL, NULL);
- clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &max_function , 0, NULL, NULL);
+ int maxfunc = max_function;
+ clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &maxfunc, 0, NULL, NULL);
// Run the kernel
OCL_NDRANGE( 1 );
@@ -71,10 +74,10 @@ static void builtin_exp(void)
diff/gpu_data[index_cur], 3 * FLT_ULP);
#if udebug
- if (isinf(cpu_data[index_cur]) && isinf(gpu_data[index_cur])){
+ if (std::isinf(cpu_data[index_cur]) && std::isinf(gpu_data[index_cur])){
printf(log);
}
- else if (isnan(cpu_data[index_cur]) && isnan(gpu_data[index_cur])){
+ else if (std::isnan(cpu_data[index_cur]) && std::isnan(gpu_data[index_cur])){
printf(log);
}
else if( diff / cpu_data[index_cur] < 3 * FLT_ULP \
@@ -86,10 +89,10 @@ static void builtin_exp(void)
else
printf_c(log);
#else
- if (isinf(cpu_data[index_cur]))
- OCL_ASSERTM(isinf(gpu_data[index_cur]), log);
- else if (isnan(cpu_data[index_cur]))
- OCL_ASSERTM(isnan(gpu_data[index_cur]), log);
+ if (std::isinf(cpu_data[index_cur]))
+ OCL_ASSERTM(std::isinf(gpu_data[index_cur]), log);
+ else if (std::isnan(cpu_data[index_cur]))
+ OCL_ASSERTM(std::isnan(gpu_data[index_cur]), log);
else if ( gpu_data[index_cur] > FLT_ULP || cpu_data[index_cur] > FLT_ULP)
OCL_ASSERTM(fabs( diff / cpu_data[index_cur]) < 3 * FLT_ULP, log);
else
@@ -100,3 +103,4 @@ static void builtin_exp(void)
}
MAKE_UTEST_FROM_FUNCTION(builtin_exp)
+}
diff --git a/utests/builtin_global_id.cpp b/utests/builtin_global_id.cpp
index 9601cab..1fa9f0d 100644
--- a/utests/builtin_global_id.cpp
+++ b/utests/builtin_global_id.cpp
@@ -28,7 +28,7 @@ static void builtin_global_id(void)
{
// Setup kernel and buffers
- int dim, global_id[80], err, i, buf_len=1;
+ int dim, i, buf_len=1;
OCL_CREATE_KERNEL("builtin_global_id");
OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*80, NULL);
@@ -53,24 +53,18 @@ static void builtin_global_id(void)
OCL_NDRANGE( dim );
clFinish(queue);
- err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &global_id, 0, NULL, NULL);
-
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read output array! %d\n", err);
- exit(1);
- }
-
+ OCL_MAP_BUFFER(0);
#if udebug
for(i = 0; i < buf_len; i++)
{
- printf("%2d ", global_id[i]);
+ printf("%2d ", ((int*)buf_data[0])[i]);
if ((i + 1) % 3 == 0) printf("\n");
}
#endif
for( i = 0; i < buf_len; i++)
- OCL_ASSERT( global_id[i] == i);
+ OCL_ASSERT( ((int*)buf_data[0])[i] == i);
+ OCL_UNMAP_BUFFER(0);
}
}
diff --git a/utests/builtin_global_id.cpp b/utests/builtin_global_linear_id.cpp
similarity index 60%
copy from utests/builtin_global_id.cpp
copy to utests/builtin_global_linear_id.cpp
index 9601cab..cda7e84 100644
--- a/utests/builtin_global_id.cpp
+++ b/utests/builtin_global_linear_id.cpp
@@ -1,12 +1,15 @@
/*
-According to the OpenCL v1.1 & v1.2 chapter 6.11.
+According to the OpenCL v2.0 chapter 6.13.1
Now define global size as following:
globals[0] = 3;
globals[1] = 4;
globals[2] = 5;
+ offsets[0] = 1;
+ offsets[1] = 2;
+ offsets[2] = 3;
Kernel:
-id = get_global_id(0) + get_global_id(1)*3 + get_global_id(2)*3*4
+id = get_global_linear_id(0)
dimension:1
0 1 2
@@ -24,12 +27,15 @@ dimension:3
#define udebug 0
#include "utest_helper.hpp"
-static void builtin_global_id(void)
+static void builtin_global_linear_id(void)
{
+ if (!cl_check_ocl20())
+ return;
// Setup kernel and buffers
- int dim, global_id[80], err, i, buf_len=1;
- OCL_CREATE_KERNEL("builtin_global_id");
+ int dim, err, i, buf_len=1;
+ size_t offsets[3] = {0,0,0};
+ OCL_CREATE_KERNEL("builtin_global_linear_id");
OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*80, NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -41,37 +47,39 @@ static void builtin_global_id(void)
{
globals[i - 1] = 2 + i;
locals[i - 1] = 2 + i;
+ offsets[i - 1] = i;
buf_len *= 2 + i;
}
for(i=dim+1; i <= 3; i++)
{
globals[i - 1] = 0;
locals[i - 1] = 0;
+ offsets[i - 1] = 0;
}
// Run the kernel
- OCL_NDRANGE( dim );
- clFinish(queue);
-
- err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &global_id, 0, NULL, NULL);
-
+ err = clEnqueueNDRangeKernel(queue, kernel, dim, offsets, globals, locals, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
- printf("Error: Failed to read output array! %d\n", err);
+ printf("Error: Failed to execute kernel! %d\n", err);
exit(1);
}
+ clFinish(queue);
+
+ OCL_MAP_BUFFER(0);
#if udebug
for(i = 0; i < buf_len; i++)
{
- printf("%2d ", global_id[i]);
+ printf("%2d ", ((int*)buf_data[0])[i]);
if ((i + 1) % 3 == 0) printf("\n");
}
#endif
for( i = 0; i < buf_len; i++)
- OCL_ASSERT( global_id[i] == i);
+ OCL_ASSERT( ((int*)buf_data[0])[i] == i);
+ OCL_UNMAP_BUFFER(0);
}
}
-MAKE_UTEST_FROM_FUNCTION(builtin_global_id);
+MAKE_UTEST_FROM_FUNCTION(builtin_global_linear_id);
diff --git a/utests/builtin_global_size.cpp b/utests/builtin_global_size.cpp
index 094e019..a2ec24a 100644
--- a/utests/builtin_global_size.cpp
+++ b/utests/builtin_global_size.cpp
@@ -80,12 +80,8 @@ static void builtin_global_size(void)
// Run the kernel
OCL_NDRANGE( dim );
- err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &global_size, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read output array! %d\n", err);
- exit(1);
- }
+ OCL_MAP_BUFFER(0);
+ global_size = ((int*)buf_data[0])[0];
//printf("get_global_size(%d) = %d (dimension:%d)\n", dim_arg_global, global_size, dim);
@@ -101,6 +97,7 @@ static void builtin_global_size(void)
OCL_ASSERT( global_size == 1);
#endif
}
+ OCL_UNMAP_BUFFER(0);
}
}
}
diff --git a/utests/builtin_kernel_block_motion_estimate_intel.cpp b/utests/builtin_kernel_block_motion_estimate_intel.cpp
new file mode 100644
index 0000000..5a48753
--- /dev/null
+++ b/utests/builtin_kernel_block_motion_estimate_intel.cpp
@@ -0,0 +1,135 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+typedef cl_accelerator_intel (OCLCREATEACCELERATORINTEL)(cl_context, cl_accelerator_type_intel accel_type, size_t desc_sz, const void* desc, cl_int* errcode_ret);
+OCLCREATEACCELERATORINTEL * oclCreateAcceleratorIntel = NULL;
+typedef cl_int (OCLRELEASEACCELERATORINTEL)(cl_accelerator_intel accel_type);
+OCLRELEASEACCELERATORINTEL * oclReleaseAcceleratorIntel = NULL;
+
+void builtin_kernel_block_motion_estimate_intel(void)
+{
+ char* built_in_kernel_names;
+ size_t built_in_kernels_size;
+ cl_int err = CL_SUCCESS;
+ size_t ret_sz;
+
+ OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+ built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
+ OCL_ASSERT(ret_sz == built_in_kernels_size);
+
+ if (strstr(built_in_kernel_names, "block_motion_estimate_intel") == NULL)
+ {
+ free(built_in_kernel_names);
+ return;
+ }
+
+ cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
+ OCL_ASSERT(built_in_prog != NULL);
+ kernel = clCreateKernel(built_in_prog, "block_motion_estimate_intel", &err);
+ OCL_ASSERT(kernel != NULL);
+
+ cl_motion_estimation_desc_intel vmedesc = {CL_ME_MB_TYPE_16x16_INTEL, //0x0
+ CL_ME_SUBPIXEL_MODE_INTEGER_INTEL, //0x0
+ CL_ME_SAD_ADJUST_MODE_NONE_INTEL, //0x0
+ CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL //0x5
+ };
+#ifdef CL_VERSION_1_2
+ oclCreateAcceleratorIntel = (OCLCREATEACCELERATORINTEL*)clGetExtensionFunctionAddressForPlatform(platform, "clCreateAcceleratorINTEL");
+#else
+ oclCreateAcceleratorIntel = (OCLCREATEACCELERATORINTEL*)clGetExtensionFunctionAddress("clCreateAcceleratorINTEL");
+#endif
+ if(!oclCreateAcceleratorIntel){
+ fprintf(stderr, "Failed to get extension clCreateAcceleratorINTEL\n");
+ exit(1);
+ }
+ cl_accelerator_intel accel = oclCreateAcceleratorIntel(ctx, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL,sizeof(cl_motion_estimation_desc_intel), &vmedesc, &err);
+ OCL_ASSERT(accel != NULL);
+
+ const size_t w = 71; //80
+ const size_t h = 41; //48
+
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ uint8_t* image_data1 = (uint8_t *)malloc(w * h); //src
+ uint8_t* image_data2 = (uint8_t *)malloc(w * h); //ref
+ for (size_t j = 0; j < h; j++) {
+ for (size_t i = 0; i < w; i++) {
+ if (i >= 32 && i <= 47 && j >= 16 && j <= 31)
+ image_data2[w * j + i] = image_data1[w * j + i] = 100;
+ else
+ image_data2[w * j + i] = image_data1[w * j + i] = 17;
+ }
+ }
+
+ format.image_channel_order = CL_R;
+ format.image_channel_data_type = CL_UNORM_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data1); //src
+ OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data2); //ref
+
+ const size_t mv_w = (w + 15) / 16;
+ const size_t mv_h = (h + 15) / 16;
+ OCL_CREATE_BUFFER(buf[2], 0, mv_w * mv_h * sizeof(short) * 2, NULL);
+
+ OCL_SET_ARG(0, sizeof(cl_accelerator_intel), &accel);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(3, sizeof(cl_mem), NULL);
+ OCL_SET_ARG(4, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(5, sizeof(cl_mem), NULL);
+
+ globals[0] = w;
+ globals[1] = h;
+ OCL_CALL(clEnqueueNDRangeKernel, queue, kernel, 2, NULL, globals, NULL, 0, NULL, NULL);
+
+ OCL_MAP_BUFFER(2);
+ short expected[] = {-64, -48, //S13.2 fixed point value
+ -64, -48,
+ -64, -48,
+ -64, -48,
+ -64, -48,
+ -64, -48,
+ -64, -48,
+ 0, 0,
+ 0, -48,
+ -64, -48,
+ -64, -48,
+ -64, -48,
+ -64, -48,
+ 0, -48,
+ -64, -48};
+ short* res = (short*)buf_data[2];
+ for (uint32_t j = 0; j < mv_h - 1; ++j) {
+ for (uint32_t i = 0; i < mv_w - 1; ++i) {
+ uint32_t index = j * mv_w * 2 + i * 2;
+ OCL_ASSERT(res[index + 0] == expected[index + 0]);
+ OCL_ASSERT(res[index + 1] == expected[index + 1]);
+ }
+ }
+ OCL_UNMAP_BUFFER(2);
+
+#ifdef CL_VERSION_1_2
+ oclReleaseAcceleratorIntel = (OCLRELEASEACCELERATORINTEL*)clGetExtensionFunctionAddressForPlatform(platform, "clReleaseAcceleratorINTEL");
+#else
+ oclReleaseAcceleratorIntel = (OCLRELEASEACCELERATORINTEL*)clGetExtensionFunctionAddress("clReleaseAcceleratorINTEL");
+#endif
+ if(!oclReleaseAcceleratorIntel){
+ fprintf(stderr, "Failed to get extension clReleaseAcceleratorINTEL\n");
+ exit(1);
+ }
+ oclReleaseAcceleratorIntel(accel);
+ clReleaseProgram(built_in_prog);
+ free(built_in_kernel_names);
+ free(image_data1);
+ free(image_data2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_kernel_block_motion_estimate_intel);
diff --git a/utests/builtin_kernel_max_global_size.cpp b/utests/builtin_kernel_max_global_size.cpp
index e6910cd..d3e8373 100644
--- a/utests/builtin_kernel_max_global_size.cpp
+++ b/utests/builtin_kernel_max_global_size.cpp
@@ -1,4 +1,5 @@
#include "utest_helper.hpp"
+#include <string.h>
void builtin_kernel_max_global_size(void)
{
@@ -9,12 +10,17 @@ void builtin_kernel_max_global_size(void)
OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+ if(built_in_kernels_size == 0)
+ return;
+
built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
OCL_ASSERT(ret_sz == built_in_kernels_size);
cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
OCL_ASSERT(built_in_prog != NULL);
- cl_kernel builtin_kernel_1d = clCreateKernel(built_in_prog, "__cl_copy_region_unalign_src_offset", &err);
+ char* first_kernel = strtok(built_in_kernel_names, ";");
+ OCL_ASSERT(first_kernel);
+ cl_kernel builtin_kernel_1d = clCreateKernel(built_in_prog, first_kernel, &err);
OCL_ASSERT(builtin_kernel_1d != NULL);
size_t param_value_size;
void* param_value;
diff --git a/utests/builtin_lgamma.cpp b/utests/builtin_lgamma.cpp
index 876699a..57945de 100644
--- a/utests/builtin_lgamma.cpp
+++ b/utests/builtin_lgamma.cpp
@@ -29,7 +29,7 @@ void builtin_lgamma(void) {
float cpu = lgamma(src[i]);
float gpu = dst[i];
if (fabsf(cpu - gpu) >= 1e-3) {
- printf("%f %f %f\n", src[i], cpu, gpu);
+ printf("%f %f %f", src[i], cpu, gpu);
OCL_ASSERT(0);
}
}
diff --git a/utests/builtin_lgamma_r.cpp b/utests/builtin_lgamma_r.cpp
index b6e5d0e..0258767 100644
--- a/utests/builtin_lgamma_r.cpp
+++ b/utests/builtin_lgamma_r.cpp
@@ -34,7 +34,7 @@ void builtin_lgamma_r(void) {
int gpu_signp = ((int*)buf_data[2])[i];
float gpu = dst[i];
if (cpu_signp != gpu_signp || fabsf(cpu - gpu) >= 1e-3) {
- printf("%f %f %f\n", src[i], cpu, gpu);
+ printf("%f %f %f", src[i], cpu, gpu);
OCL_ASSERT(0);
}
}
diff --git a/utests/builtin_local_id.cpp b/utests/builtin_local_id.cpp
index 1f07615..9f0adee 100644
--- a/utests/builtin_local_id.cpp
+++ b/utests/builtin_local_id.cpp
@@ -32,7 +32,7 @@ static void builtin_local_id(void)
{
// Setup kernel and buffers
- int dim, local_id[576], err, i, buf_len=1;
+ int dim, i, buf_len=1;
OCL_CREATE_KERNEL("builtin_local_id");
OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*576, NULL);
@@ -57,24 +57,18 @@ static void builtin_local_id(void)
OCL_NDRANGE( dim );
clFinish(queue);
- err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &local_id, 0, NULL, NULL);
-
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read output array! %d\n", err);
- exit(1);
- }
-
+ OCL_MAP_BUFFER(0);
#if udebug
for(i = 0; i < buf_len; i++)
{
- printf("%2d ", local_id[i]);
+ printf("%2d ", ((int*)buf_data[0])[i]);
if ((i + 1) % 4 == 0) printf("\n");
}
#endif
for( i = 0; i < buf_len; i++)
- OCL_ASSERT( local_id[i] == i);
+ OCL_ASSERT( ((int*)buf_data[0])[i] == i);
+ OCL_UNMAP_BUFFER(0);
}
}
diff --git a/utests/builtin_local_id.cpp b/utests/builtin_local_linear_id.cpp
similarity index 62%
copy from utests/builtin_local_id.cpp
copy to utests/builtin_local_linear_id.cpp
index 1f07615..88cb357 100644
--- a/utests/builtin_local_id.cpp
+++ b/utests/builtin_local_linear_id.cpp
@@ -1,5 +1,5 @@
/*
-According to the OpenCL v1.1 & v1.2 chapter 6.11.
+According to the OpenCL v2.0 chapter 6.13.1
Now define local and global size as following:
globals[0] = 4;
globals[1] = 9;
@@ -9,9 +9,9 @@ Now define local and global size as following:
locals[2] = 4;
Kernel:
-int id = get_local_id(0) + get_group_id(0)*2 + \
- get_local_id(1) * 4 + get_group_id(1)*12 +\
- get_local_id(2) *36 + get_group_id(2)*144;
+ int id = get_local_linear_id() + (get_group_id(0) + \
+ get_group_id(1) * 2 + get_group_id(2) * 2 * 3) * \
+ get_local_size(0) * get_local_size(1) * get_local_size(2);
dimension:1
0 1 2 3
@@ -28,12 +28,14 @@ dimension:3
#define udebug 0
#include "utest_helper.hpp"
-static void builtin_local_id(void)
+static void builtin_local_linear_id(void)
{
+ if (!cl_check_ocl20())
+ return;
// Setup kernel and buffers
- int dim, local_id[576], err, i, buf_len=1;
- OCL_CREATE_KERNEL("builtin_local_id");
+ int dim, i, buf_len=1;
+ OCL_CREATE_KERNEL("builtin_local_linear_id");
OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*576, NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -57,25 +59,19 @@ static void builtin_local_id(void)
OCL_NDRANGE( dim );
clFinish(queue);
- err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &local_id, 0, NULL, NULL);
-
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read output array! %d\n", err);
- exit(1);
- }
-
+ OCL_MAP_BUFFER(0);
#if udebug
for(i = 0; i < buf_len; i++)
{
- printf("%2d ", local_id[i]);
+ printf("%2d ", ((int*)buf_data[0])[i]);
if ((i + 1) % 4 == 0) printf("\n");
}
#endif
for( i = 0; i < buf_len; i++)
- OCL_ASSERT( local_id[i] == i);
+ OCL_ASSERT( ((int*)buf_data[0])[i] == i);
+ OCL_UNMAP_BUFFER(0);
}
}
-MAKE_UTEST_FROM_FUNCTION(builtin_local_id);
+MAKE_UTEST_FROM_FUNCTION(builtin_local_linear_id);
diff --git a/utests/builtin_local_size.cpp b/utests/builtin_local_size.cpp
index a9dac2e..491175d 100644
--- a/utests/builtin_local_size.cpp
+++ b/utests/builtin_local_size.cpp
@@ -65,13 +65,8 @@ static void builtin_local_size(void)
// Run the kernel
OCL_NDRANGE( dim );
- err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &local_size, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read output array! %d\n", err);
- exit(1);
- }
-
+ OCL_MAP_BUFFER(0);
+ local_size = ((int*)buf_data[0])[0];
#if udebug
printf("get_local_size(%d) = %d (dimension:%d)\n", dim_arg_global, local_size, dim);
#endif
@@ -81,6 +76,7 @@ static void builtin_local_size(void)
{
OCL_ASSERT( local_size == 1);
}
+ OCL_UNMAP_BUFFER(0);
}
}
}
diff --git a/utests/builtin_max_sub_group_size.cpp b/utests/builtin_max_sub_group_size.cpp
new file mode 100644
index 0000000..310d880
--- /dev/null
+++ b/utests/builtin_max_sub_group_size.cpp
@@ -0,0 +1,62 @@
+/*
+According to the OpenCL cl_intel_subgroups.
+Now define local and global size as following:
+ globals[0] = 4;
+ globals[1] = 9;
+ globals[2] = 16;
+ locals[0] = 2;
+ locals[1] = 3;
+ locals[2] = 4;
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_max_sub_group_size(void)
+{
+ if(!cl_check_subgroups())
+ return;
+
+ // Setup kernel and buffers
+ size_t dim, i,local_sz = 1,buf_len = 1;
+ OCL_CREATE_KERNEL("builtin_max_sub_group_size");
+ size_t sub_sz;
+
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*576, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ for( dim=1; dim <= 3; dim++ )
+ {
+ buf_len = 1;
+ local_sz = 1;
+ for(i=1; i <= dim; i++)
+ {
+ locals[i - 1] = i + 1;
+ globals[i - 1] = (i + 1) * (i + 1);
+ buf_len *= ((i + 1) * (i + 1));
+ local_sz *= i + 1;
+ }
+ for(i = dim+1; i <= 3; i++)
+ {
+ globals[i - 1] = 0;
+ locals[i - 1] = 0;
+ }
+
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*dim,locals,sizeof(size_t),&sub_sz,NULL);
+ // Run the kernel
+ OCL_NDRANGE( dim );
+ clFinish(queue);
+
+ OCL_MAP_BUFFER(0);
+
+ for( i = 0; i < buf_len; i++) {
+#if udebug
+ printf("got %d expect %d\n", ((uint32_t*)buf_data[0])[i], sub_sz);
+#endif
+ OCL_ASSERT( ((uint32_t*)buf_data[0])[i] == sub_sz);
+ }
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_max_sub_group_size);
diff --git a/utests/builtin_num_groups.cpp b/utests/builtin_num_groups.cpp
index bbff435..832766e 100644
--- a/utests/builtin_num_groups.cpp
+++ b/utests/builtin_num_groups.cpp
@@ -62,13 +62,8 @@ static void builtin_num_groups(void)
// Run the kernel
OCL_NDRANGE( dim );
- err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &num_groups, 0, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- printf("Error: Failed to read output array! %d\n", err);
- exit(1);
- }
-
+ OCL_MAP_BUFFER(0);
+ num_groups = ((int*)buf_data[0])[0];
#if udebug
printf("get_num_groups(%d) = %d (dimension:%d)\n", dim_arg_global, num_groups, dim);
#endif
@@ -78,6 +73,7 @@ static void builtin_num_groups(void)
{
OCL_ASSERT( num_groups == 1);
}
+ OCL_UNMAP_BUFFER(0);
}
}
}
diff --git a/utests/builtin_num_sub_groups.cpp b/utests/builtin_num_sub_groups.cpp
new file mode 100644
index 0000000..dcd691a
--- /dev/null
+++ b/utests/builtin_num_sub_groups.cpp
@@ -0,0 +1,62 @@
+/*
+According to the OpenCL cl_intel_subgroups.
+Now define local and global size as following:
+ globals[0] = 4;
+ globals[1] = 9;
+ globals[2] = 16;
+ locals[0] = 2;
+ locals[1] = 3;
+ locals[2] = 4;
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_num_sub_groups(void)
+{
+ if(!cl_check_subgroups())
+ return;
+
+ // Setup kernel and buffers
+ size_t dim, i,local_sz = 1,buf_len = 1;
+ OCL_CREATE_KERNEL("builtin_num_sub_groups");
+ size_t num_sub;
+
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*576, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ for( dim=1; dim <= 3; dim++ )
+ {
+ buf_len = 1;
+ local_sz = 1;
+ for(i=1; i <= dim; i++)
+ {
+ locals[i - 1] = i + 1;
+ globals[i - 1] = (i + 1) * (i + 1);
+ buf_len *= ((i + 1) * (i + 1));
+ local_sz *= i + 1;
+ }
+ for(i = dim+1; i <= 3; i++)
+ {
+ globals[i - 1] = 0;
+ locals[i - 1] = 0;
+ }
+
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR ,sizeof(size_t)*dim,locals,sizeof(size_t),&num_sub,NULL);
+ // Run the kernel
+ OCL_NDRANGE( dim );
+ clFinish(queue);
+
+ OCL_MAP_BUFFER(0);
+
+ for( i = 0; i < buf_len; i++) {
+#if udebug
+ printf("%zu get %d, expect %zu\n",i, ((uint32_t*)buf_data[0])[i], num_sub);
+#endif
+ OCL_ASSERT( ((uint32_t*)buf_data[0])[i] == num_sub);
+ }
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_num_sub_groups);
diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
index f586448..1f6af0e 100644
--- a/utests/builtin_pow.cpp
+++ b/utests/builtin_pow.cpp
@@ -10,6 +10,9 @@
printf( __VA_ARGS__ );\
printf("\033[0m");\
}
+
+namespace {
+
const float ori_data[] = {-20.5, -1, -0.9, -0.01, 0, 0.01, 0.9, 1.0, 20.5};
const int count_input_ori = sizeof(ori_data) / sizeof(ori_data[0]);
const int count_input = count_input_ori * count_input_ori;
@@ -59,7 +62,8 @@ static void builtin_pow(void)
clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data1, 0, NULL, NULL);
clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, count_input * sizeof(float), input_data2, 0, NULL, NULL);
- clEnqueueWriteBuffer( queue, buf[3], CL_TRUE, 0, sizeof(int), &max_function, 0, NULL, NULL);
+ int maxfunc = max_function;
+ clEnqueueWriteBuffer( queue, buf[3], CL_TRUE, 0, sizeof(int), &maxfunc, 0, NULL, NULL);
// Run the kernel
OCL_NDRANGE( 1 );
@@ -74,8 +78,8 @@ static void builtin_pow(void)
{
index_cur = k * max_function + i;
#if udebug
- if ( (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])) ||
- (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])) ||
+ if ( (std::isinf(cpu_data[index_cur]) && !std::isinf(gpu_data[index_cur])) ||
+ (std::isnan(cpu_data[index_cur]) && !std::isnan(gpu_data[index_cur])) ||
(fabs(gpu_data[index_cur] - cpu_data[index_cur]) > cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR
&& (denormals_supported || gpu_data[index_cur]!=0 || std::fpclassify(cpu_data[index_cur])!=FP_SUBNORMAL) ) )
@@ -85,10 +89,10 @@ static void builtin_pow(void)
else
printf("%d/%d: x:%f, y:%f -> gpu:%f cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
#else
- if (isinf(cpu_data[index_cur]))
- OCL_ASSERT(isinf(gpu_data[index_cur]));
- else if (isnan(cpu_data[index_cur]))
- OCL_ASSERT(isnan(gpu_data[index_cur]));
+ if (std::isinf(cpu_data[index_cur]))
+ OCL_ASSERT(std::isinf(gpu_data[index_cur]));
+ else if (std::isnan(cpu_data[index_cur]))
+ OCL_ASSERT(std::isnan(gpu_data[index_cur]));
else
{
OCL_ASSERT((fabs(gpu_data[index_cur] - cpu_data[index_cur]) < cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR) ||
@@ -100,3 +104,4 @@ static void builtin_pow(void)
}
MAKE_UTEST_FROM_FUNCTION(builtin_pow)
+}
diff --git a/utests/builtin_sub_group_id.cpp b/utests/builtin_sub_group_id.cpp
new file mode 100644
index 0000000..89064bd
--- /dev/null
+++ b/utests/builtin_sub_group_id.cpp
@@ -0,0 +1,63 @@
+/*
+According to the OpenCL cl_intel_subgroups.
+Now define local and global size as following:
+ globals[0] = 4;
+ globals[1] = 9;
+ globals[2] = 16;
+ locals[0] = 2;
+ locals[1] = 3;
+ locals[2] = 4;
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_sub_group_id(void)
+{
+ if(!cl_check_subgroups())
+ return;
+
+ // Setup kernel and buffers
+ size_t dim, i,local_sz = 1,buf_len = 1;
+ OCL_CREATE_KERNEL("builtin_sub_group_id");
+ size_t max_sub_sz;
+
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*576, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ for( dim=1; dim <= 3; dim++ )
+ {
+ buf_len = 1;
+ local_sz = 1;
+ for(i=1; i <= dim; i++)
+ {
+ locals[i - 1] = i + 1;
+ globals[i - 1] = (i + 1) * (i + 1);
+ buf_len *= ((i + 1) * (i + 1));
+ local_sz *= i + 1;
+ }
+ for(i = dim+1; i <= 3; i++)
+ {
+ globals[i - 1] = 0;
+ locals[i - 1] = 0;
+ }
+
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*dim,locals,sizeof(size_t),&max_sub_sz,NULL);
+ // Run the kernel
+ OCL_NDRANGE( dim );
+ clFinish(queue);
+
+ OCL_MAP_BUFFER(0);
+
+ for( i = 0; i < buf_len; i++) {
+ size_t expect_id = (i % local_sz) / max_sub_sz;
+#if udebug
+ printf("%zu get %d, expect %zu\n",i, ((uint32_t*)buf_data[0])[i], expect_id);
+#endif
+ OCL_ASSERT( ((uint32_t*)buf_data[0])[i] == expect_id);
+ }
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_sub_group_id);
diff --git a/utests/builtin_sub_group_size.cpp b/utests/builtin_sub_group_size.cpp
new file mode 100644
index 0000000..7f7c3e4
--- /dev/null
+++ b/utests/builtin_sub_group_size.cpp
@@ -0,0 +1,63 @@
+/*
+According to the OpenCL cl_intel_subgroups.
+Now define local and global size as following:
+ globals[0] = 4;
+ globals[1] = 9;
+ globals[2] = 16;
+ locals[0] = 2;
+ locals[1] = 3;
+ locals[2] = 4;
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_sub_group_size(void)
+{
+ if(!cl_check_subgroups())
+ return;
+
+ // Setup kernel and buffers
+ size_t dim, i,local_sz = 1,buf_len = 1;
+ OCL_CREATE_KERNEL("builtin_sub_group_size");
+ size_t max_sub_sz;
+
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*576, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ for( dim=1; dim <= 3; dim++ )
+ {
+ buf_len = 1;
+ local_sz = 1;
+ for(i=1; i <= dim; i++)
+ {
+ locals[i - 1] = i + 1;
+ globals[i - 1] = (i + 1) * (i + 1);
+ buf_len *= ((i + 1) * (i + 1));
+ local_sz *= i + 1;
+ }
+ for(i = dim+1; i <= 3; i++)
+ {
+ globals[i - 1] = 0;
+ locals[i - 1] = 0;
+ }
+
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*dim,locals,sizeof(size_t),&max_sub_sz,NULL);
+ // Run the kernel
+ OCL_NDRANGE( dim );
+ clFinish(queue);
+
+ OCL_MAP_BUFFER(0);
+
+ for( i = 0; i < buf_len; i++) {
+ size_t expect_sz = (i % local_sz) < (local_sz / max_sub_sz * max_sub_sz) ? max_sub_sz : (local_sz % max_sub_sz);
+#if udebug
+ printf("%zu get %d, expect %zu\n",i, ((uint32_t*)buf_data[0])[i], expect_sz);
+#endif
+ OCL_ASSERT( ((uint32_t*)buf_data[0])[i] == expect_sz);
+ }
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_sub_group_size);
diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
index db9ab3c..eb6bdd7 100644
--- a/utests/builtin_tgamma.cpp
+++ b/utests/builtin_tgamma.cpp
@@ -43,10 +43,10 @@ void builtin_tgamma(void)
max_ulp = fabsf(cpu - dst[i]) / cl_FLT_ULP(cpu);
max_ulp_at = src[i];
}
- if (isinf(cpu)) {
- OCL_ASSERT(isinf(dst[i]));
+ if (std::isinf(cpu)) {
+ OCL_ASSERT(std::isinf(dst[i]));
} else if (fabsf(cpu - dst[i]) >= cl_FLT_ULP(cpu) * ULPSIZE_FACTOR) {
- printf("%f %f %f\n", src[i], cpu, dst[i]);
+ printf("%f %f %f", src[i], cpu, dst[i]);
OCL_ASSERT(0);
}
}
diff --git a/utests/compare_image_2d_and_1d_array.cpp b/utests/compare_image_2d_and_1d_array.cpp
index dfa4273..f2db680 100644
--- a/utests/compare_image_2d_and_1d_array.cpp
+++ b/utests/compare_image_2d_and_1d_array.cpp
@@ -84,6 +84,8 @@ static void compare_image_2d_and_1d_array(void)
free(dst0);
free(dst1);
+ free(image_data1);
+ free(image_data2);
OCL_CALL(clReleaseSampler, sampler);
}
diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
index 3f477a8..49b381d 100644
--- a/utests/compiler_abs.cpp
+++ b/utests/compiler_abs.cpp
@@ -119,6 +119,18 @@ template <typename T, typename U> static void dump_data (T* src, U* dst, int n)
}
}
+template <typename T>
+static void check_result(T* actual, T* expected)
+{
+ OCL_ASSERT(*actual == *expected);
+}
+
+template <typename T, int N>
+static void check_result(cl_vec<T, N>* actual, cl_vec<T, N>* expected)
+{
+ OCL_ASSERT(!memcmp(actual, expected, sizeof(T)*N));
+}
+
template <typename T, typename U> static void compiler_abs_with_type(void)
{
const size_t n = 16;
@@ -160,7 +172,11 @@ template <typename T, typename U> static void compiler_abs_with_type(void)
// dump_data(cpu_src, cpu_dst, n);
- OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+ U* actual = (U*)buf_data[1];
+ U* expected = cpu_dst;
+ for (size_t i = 0; i < n; ++i)
+ check_result(&actual[i], &expected[i]);
+
OCL_UNMAP_BUFFER(1);
OCL_UNMAP_BUFFER(0);
}
diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
index 15a1f90..1df7d47 100644
--- a/utests/compiler_abs_diff.cpp
+++ b/utests/compiler_abs_diff.cpp
@@ -127,6 +127,18 @@ template <typename T, typename U> static void dump_data (T* x, T* y, U* diff, in
}
}
+template <typename T>
+static void check_result(T* actual, T* expected)
+{
+ OCL_ASSERT(*actual == *expected);
+}
+
+template <typename T, int N>
+static void check_result(cl_vec<T, N>* actual, cl_vec<T, N>* expected)
+{
+ OCL_ASSERT(!memcmp(actual, expected, sizeof(T)*N));
+}
+
template <typename T, typename U> static void compiler_abs_diff_with_type(void)
{
const size_t n = 16;
@@ -174,7 +186,10 @@ template <typename T, typename U> static void compiler_abs_diff_with_type(void)
// dump_data(cpu_x, cpu_y, cpu_diff, n);
- OCL_ASSERT(!memcmp(buf_data[2], cpu_diff, sizeof(T) * n));
+ U* actual = (U*)buf_data[2];
+ U* expected = cpu_diff;
+ for (size_t i = 0; i < n; ++i)
+ check_result(&actual[i], &expected[i]);
OCL_UNMAP_BUFFER(0);
OCL_UNMAP_BUFFER(1);
diff --git a/utests/compiler_array1.cpp b/utests/compiler_array1.cpp
index fe1ecec..70ff049 100644
--- a/utests/compiler_array1.cpp
+++ b/utests/compiler_array1.cpp
@@ -3,7 +3,7 @@
static void cpu(int global_id, int *src, int *dst) {
int final[16];
for (int i = 0; i < 16; ++i) {
- int array[16];
+ int array[16] = {0};
for (int j = 0; j < src[0]; ++j)
array[j] = 1+src[0];
for (int j = src[0]; j < 16; ++j)
diff --git a/utests/compiler_assignment_operation_in_if.cpp b/utests/compiler_assignment_operation_in_if.cpp
index 676c222..65f8853 100644
--- a/utests/compiler_assignment_operation_in_if.cpp
+++ b/utests/compiler_assignment_operation_in_if.cpp
@@ -18,7 +18,7 @@ static void cpu(int gidx, int *dst) {
void compiler_assignment_operation_in_if(void){
const size_t n = 16;
- int cpu_dst[16];
+ int cpu_dst[16] = {0};
// Setup kernel and buffers
OCL_CREATE_KERNEL("compiler_assignment_operation_in_if");
diff --git a/utests/compiler_box_blur_float.cpp b/utests/compiler_box_blur_float.cpp
index 8a75a25..8d500fc 100644
--- a/utests/compiler_box_blur_float.cpp
+++ b/utests/compiler_box_blur_float.cpp
@@ -14,6 +14,9 @@ static void compiler_box_blur_float()
/* Load the picture */
tmp = cl_read_bmp("sample.bmp", &w, &h);
+ if(tmp == NULL)
+ return;
+
sz = w * h * sizeof(float[4]);
src = (float4*)malloc(sz);
diff --git a/utests/compiler_bsort.cpp b/utests/compiler_bsort.cpp
new file mode 100644
index 0000000..31607aa
--- /dev/null
+++ b/utests/compiler_bsort.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+/*
+ * This test is for non-constant extractelement scalarize
+ * this bitonic sort test will use this path in
+ *
+ * comp = input < shuffle(input, mask1) ^ dir; \
+ * input = shuffle(input, as_uint4(comp + add1)); \
+ *
+ * The origin buff is
+ * {3.0 5.0 4.0 6.0 0.0 7.0 2.0 1.0}
+ * and the expected result is
+ * {0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0}
+ */
+void compiler_bsort(void)
+{
+ const int n = 8;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_bsort");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ globals[0] = 1;
+ locals[0] = 1;
+
+ OCL_MAP_BUFFER(0);
+ ((float *)(buf_data[0]))[0] = 3.0f;
+ ((float *)(buf_data[0]))[1] = 5.0f;
+ ((float *)(buf_data[0]))[2] = 4.0f;
+ ((float *)(buf_data[0]))[3] = 6.0f;
+ ((float *)(buf_data[0]))[4] = 0.0f;
+ ((float *)(buf_data[0]))[5] = 7.0f;
+ ((float *)(buf_data[0]))[6] = 2.0f;
+ ((float *)(buf_data[0]))[7] = 1.0f;
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; i ++) {
+ OCL_ASSERT(((float *)(buf_data[0]))[i] == (float)i);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_bsort);
diff --git a/utests/compiler_bswap.cpp b/utests/compiler_bswap.cpp
index 3af9ef5..ed22750 100644
--- a/utests/compiler_bswap.cpp
+++ b/utests/compiler_bswap.cpp
@@ -7,6 +7,14 @@
(((uint32_t)(A) & 0x00ff0000) >> 8) | \
(((uint32_t)(A) & 0x0000ff00) << 8) | \
(((uint32_t)(A) & 0x000000ff) << 24))
+#define cpu_htonll(A) ((((uint64_t)(A) & 0xff00000000000000) >> 56) | \
+ (((uint64_t)(A) & 0x00ff000000000000) >> 40) | \
+ (((uint64_t)(A) & 0x0000ff0000000000) >> 24) | \
+ (((uint64_t)(A) & 0x000000ff00000000) >> 8) | \
+ (((uint64_t)(A) & 0x00000000ff000000) << 8) | \
+ (((uint64_t)(A) & 0x0000000000ff0000) << 24) | \
+ (((uint64_t)(A) & 0x000000000000ff00) << 40) | \
+ (((uint64_t)(A) & 0x00000000000000ff) << 56) )
template <typename T> static void gen_rand_val(T & val)
@@ -22,6 +30,8 @@ template <typename T> static void cpu(int global_id, T *src, T *dst)
g = cpu_htons(f);
else if (sizeof(T) == sizeof(int32_t))
g = cpu_htonl(f);
+ else if (sizeof(T) == sizeof(int64_t))
+ g = cpu_htonll(f);
dst[global_id] = g;
}
@@ -33,15 +43,19 @@ template <typename T> static void cpu(int global_id, T src, T *dst)
g = cpu_htons(f);
else if (sizeof(T) == sizeof(int32_t))
g = cpu_htonl(f);
+ else if (sizeof(T) == sizeof(int64_t))
+ g = cpu_htonll(f);
dst[global_id] = g;
}
template <typename T> inline static void print_data(T& val)
{
if(sizeof(T) == sizeof(uint16_t))
- printf(" 0x%hx", val);
- else
- printf(" 0x%x", val);
+ printf(" 0x%hx", (uint16_t)val);
+ else if(sizeof(T) == sizeof(uint32_t))
+ printf(" 0x%x", (uint32_t)val);
+ else if(sizeof(T) == sizeof(uint64_t))
+ printf(" 0x%lx", (uint64_t)val);
}
template <typename T> static void dump_data(T* raw, T* cpu, T* gpu, int n)
@@ -78,7 +92,7 @@ template <typename T> static void dump_data(T raw, T* cpu, T* gpu, int n)
void compiler_bswap(void)
{
- const size_t n = 32;
+ const size_t n = 16;
uint32_t src0[n];
uint16_t src1[n];
uint32_t dst0[n];
@@ -87,6 +101,10 @@ void compiler_bswap(void)
int32_t dst2[n];
int16_t src3 = static_cast<int16_t>(rand());
int16_t dst3[n];
+ uint64_t src4[n];
+ uint64_t dst4[n];
+ int64_t src5 = static_cast<int64_t>(rand()) << 32| static_cast<int64_t>(rand());
+ int64_t dst5[n];
// Setup kernel and buffers
OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", "compiler_bswap");
@@ -108,6 +126,15 @@ void compiler_bswap(void)
OCL_CREATE_BUFFER(buf[5], 0, sizeof(dst3), NULL);
OCL_SET_ARG(7, sizeof(cl_mem), &buf[5]);
+ OCL_CREATE_BUFFER(buf[6], 0, sizeof(src4), NULL);
+ OCL_SET_ARG(8, sizeof(cl_mem), &buf[6]);
+ OCL_CREATE_BUFFER(buf[7], 0, sizeof(dst4), NULL);
+ OCL_SET_ARG(9, sizeof(cl_mem), &buf[7]);
+
+ OCL_SET_ARG(10, sizeof(int64_t), &src5);
+ OCL_CREATE_BUFFER(buf[8], 0, sizeof(dst5), NULL);
+ OCL_SET_ARG(11, sizeof(cl_mem), &buf[8]);
+
OCL_MAP_BUFFER(0);
for (int32_t i = 0; i < (int32_t) n; ++i) {
gen_rand_val(src0[i]);
@@ -142,6 +169,16 @@ void compiler_bswap(void)
memset(buf_data[5], 0, sizeof(dst3));
OCL_UNMAP_BUFFER(5);
+ OCL_MAP_BUFFER(6);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ uint64_t x, y;
+ gen_rand_val(x);
+ gen_rand_val(y);
+ src4[i] = (x << 32)| y;
+ }
+ memcpy(buf_data[6], src4, sizeof(src4));
+ OCL_UNMAP_BUFFER(6);
+
globals[0] = n;
locals[0] = 16;
OCL_NDRANGE(1);
@@ -173,6 +210,14 @@ void compiler_bswap(void)
for (int32_t i = 0; i < (int32_t) n; ++i)
cpu(i, src3, dst3);
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, src4, dst4);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, src5, dst5);
+
OCL_MAP_BUFFER(1);
//dump_data(src0, dst0, (uint32_t *)buf_data[1], n);
OCL_ASSERT(!memcmp(buf_data[1], dst0, sizeof(dst0)));
@@ -192,6 +237,16 @@ void compiler_bswap(void)
//dump_data(src3, dst3, (int16_t *)buf_data[5], n);
OCL_ASSERT(!memcmp(buf_data[5], dst3, sizeof(dst3)));
OCL_UNMAP_BUFFER(5);
+
+ OCL_MAP_BUFFER(7);
+ //dump_data(src4, dst4, (uint64_t *)buf_data[7], n);
+ OCL_ASSERT(!memcmp(buf_data[7], dst4, sizeof(dst4)));
+ OCL_UNMAP_BUFFER(7);
+
+ OCL_MAP_BUFFER(8);
+ //dump_data(src5, dst5, (int64_t *)buf_data[8], n);
+ OCL_ASSERT(!memcmp(buf_data[8], dst5, sizeof(dst5)));
+ OCL_UNMAP_BUFFER(8);
}
MAKE_UTEST_FROM_FUNCTION(compiler_bswap);
diff --git a/utests/compiler_cl_finish.cpp b/utests/compiler_cl_finish.cpp
index c637ecc..1bd2304 100644
--- a/utests/compiler_cl_finish.cpp
+++ b/utests/compiler_cl_finish.cpp
@@ -9,7 +9,7 @@ static void compiler_cl_finish(void)
{
const size_t n = 16*1024*1024;
struct timeval t1, t2;
- float t_map_w_fin,t_map_wo_fin;
+ float t_fin, t_map_w_fin,t_map_wo_fin;
// Setup kernel and buffers
OCL_CREATE_KERNEL("test_cl_finish");
@@ -26,11 +26,15 @@ static void compiler_cl_finish(void)
// 1st time map after clFinish
OCL_NDRANGE(1);
+ T_GET(t1);
OCL_FINISH();
+ T_GET(t2);
+ t_fin = T_LAPSE(t1, t2);
T_GET(t1);
OCL_MAP_BUFFER(0);
T_GET(t2);
+ OCL_UNMAP_BUFFER(0);
t_map_w_fin = T_LAPSE(t1, t2);
// 2nd time map without clFinish
@@ -40,7 +44,7 @@ static void compiler_cl_finish(void)
T_GET(t2);
t_map_wo_fin = T_LAPSE(t1, t2);
- OCL_ASSERT(t_map_wo_fin > t_map_w_fin);
+ OCL_ASSERT(t_fin > t_map_w_fin && t_map_wo_fin > t_map_w_fin);
OCL_UNMAP_BUFFER(0);
}
diff --git a/utests/compiler_clz.cpp b/utests/compiler_clz.cpp
index 9116608..53a418f 100644
--- a/utests/compiler_clz.cpp
+++ b/utests/compiler_clz.cpp
@@ -81,13 +81,13 @@ void test(const char *kernel_name, int s_type)
{
for (uint32_t i = 0; i < n; ++i) {
if(sizeof(U) == 1 && i < 8 )
- OCL_ASSERT(((U*)buf_data[1])[i] == i );
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
else if(sizeof(U) == 2 && i < 16 )
- OCL_ASSERT(((U*)buf_data[1])[i] == i );
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
else if(sizeof(U) == 4 && i < 32 )
- OCL_ASSERT(((U*)buf_data[1])[i] == i );
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
else if(sizeof(U) == 8 && i < 64 )
- OCL_ASSERT(((U*)buf_data[1])[i] == i );
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i );
}
}
else // signed type
@@ -96,28 +96,28 @@ void test(const char *kernel_name, int s_type)
if(sizeof(U) == 1)
{
if( i < 8 )
- OCL_ASSERT(((U*)buf_data[1])[i] == i+1 );
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i+1 );
else if( i == 8 )
OCL_ASSERT(((U*)buf_data[1])[i] == 0 );
}
else if(sizeof(U) == 2)
{
if( i < 16 )
- OCL_ASSERT(((U*)buf_data[1])[i] == i+1 );
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i+1 );
else if( i == 16 )
OCL_ASSERT(((U*)buf_data[1])[i] == 0 );
}
else if(sizeof(U) == 4)
{
if( i < 32 )
- OCL_ASSERT(((U*)buf_data[1])[i] == i+1 );
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i+1 );
else if( i == 32 )
OCL_ASSERT(((U*)buf_data[1])[i] == 0 );
}
else if(sizeof(U) == 8)
{
if( i < 63 )
- OCL_ASSERT(((U*)buf_data[1])[i] == i+1 );
+ OCL_ASSERT(((U*)buf_data[1])[i] == (U)i+1 );
}
}
}
diff --git a/utests/compiler_copy_image.cpp b/utests/compiler_copy_image.cpp
index 150fd8a..ca15cc6 100644
--- a/utests/compiler_copy_image.cpp
+++ b/utests/compiler_copy_image.cpp
@@ -44,13 +44,13 @@ static void compiler_copy_image(void)
OCL_NDRANGE(2);
// Check result
- OCL_MAP_BUFFER(0);
- OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER_GTT(0);
+ OCL_MAP_BUFFER_GTT(1);
for (uint32_t j = 0; j < h; ++j)
for (uint32_t i = 0; i < w; i++)
OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1])[j * w + i]);
- OCL_UNMAP_BUFFER(0);
- OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER_GTT(0);
+ OCL_UNMAP_BUFFER_GTT(1);
OCL_CALL(clReleaseSampler, sampler);
}
diff --git a/utests/compiler_copy_image1.cpp b/utests/compiler_copy_image1.cpp
index 659dddc..cc9ef85 100644
--- a/utests/compiler_copy_image1.cpp
+++ b/utests/compiler_copy_image1.cpp
@@ -57,12 +57,12 @@ static void compiler_copy_image1(void)
OCL_NDRANGE(2);
// Check result
- OCL_MAP_BUFFER(0);
- OCL_MAP_BUFFER(1);
- OCL_MAP_BUFFER(2);
- OCL_MAP_BUFFER(3);
- OCL_MAP_BUFFER(4);
- OCL_MAP_BUFFER(5);
+ OCL_MAP_BUFFER_GTT(0);
+ OCL_MAP_BUFFER_GTT(1);
+ OCL_MAP_BUFFER_GTT(2);
+ OCL_MAP_BUFFER_GTT(3);
+ OCL_MAP_BUFFER_GTT(4);
+ OCL_MAP_BUFFER_GTT(5);
for(uint32_t k = 0; k < 5; k++)
{
@@ -70,12 +70,12 @@ static void compiler_copy_image1(void)
for (uint32_t i = 0; i < w; i++)
OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1 + k])[j * w + i]);
}
- OCL_UNMAP_BUFFER(0);
- OCL_UNMAP_BUFFER(1);
- OCL_UNMAP_BUFFER(2);
- OCL_UNMAP_BUFFER(3);
- OCL_UNMAP_BUFFER(4);
- OCL_UNMAP_BUFFER(5);
+ OCL_UNMAP_BUFFER_GTT(0);
+ OCL_UNMAP_BUFFER_GTT(1);
+ OCL_UNMAP_BUFFER_GTT(2);
+ OCL_UNMAP_BUFFER_GTT(3);
+ OCL_UNMAP_BUFFER_GTT(4);
+ OCL_UNMAP_BUFFER_GTT(5);
OCL_CALL(clReleaseSampler, sampler);
}
diff --git a/utests/compiler_copy_image_1d.cpp b/utests/compiler_copy_image_1d.cpp
index 5af6a77..6599d30 100644
--- a/utests/compiler_copy_image_1d.cpp
+++ b/utests/compiler_copy_image_1d.cpp
@@ -39,14 +39,14 @@ static void compiler_copy_image_1d(void)
OCL_NDRANGE(1);
// Check result
- OCL_MAP_BUFFER(0);
- OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER_GTT(0);
+ OCL_MAP_BUFFER_GTT(1);
for (uint32_t i = 0; i < w; i++) {
//printf (" %x", ((uint32_t*)buf_data[1])[i]);
OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
}
- OCL_UNMAP_BUFFER(0);
- OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER_GTT(0);
+ OCL_UNMAP_BUFFER_GTT(1);
}
MAKE_UTEST_FROM_FUNCTION(compiler_copy_image_1d);
diff --git a/utests/compiler_double.cpp b/utests/compiler_double.cpp
index 7c54ddf..fc89a0f 100644
--- a/utests/compiler_double.cpp
+++ b/utests/compiler_double.cpp
@@ -12,6 +12,9 @@ void compiler_double(void)
const size_t n = 16;
double cpu_dst[n], cpu_src[n];
+ if (!cl_check_double())
+ return;
+
// Setup kernel and buffers
OCL_CREATE_KERNEL("compiler_double");
OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
@@ -38,7 +41,7 @@ void compiler_double(void)
// Compare
OCL_MAP_BUFFER(1);
for (int32_t i = 0; i < (int32_t) n; ++i)
- OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+ OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-32);
OCL_UNMAP_BUFFER(1);
}
}
diff --git a/utests/compiler_double_2.cpp b/utests/compiler_double_2.cpp
deleted file mode 100644
index 7e3ae4b..0000000
--- a/utests/compiler_double_2.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <cmath>
-#include "utest_helper.hpp"
-
-static void cpu(int global_id, float *src, double *dst) {
- float f = src[global_id];
- float d = 1.234567890123456789;
- dst[global_id] = global_id < 14 ? d * (d + f) : 14;
-}
-
-void compiler_double_2(void)
-{
- const size_t n = 16;
- float cpu_src[n];
- double cpu_dst[n];
-
- // Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_double_2");
- OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
- OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
- OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
- OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
- globals[0] = n;
- locals[0] = 16;
-
- // Run random tests
- for (uint32_t pass = 0; pass < 1; ++pass) {
- OCL_MAP_BUFFER(0);
- for (int32_t i = 0; i < (int32_t) n; ++i)
- cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
- OCL_UNMAP_BUFFER(0);
-
- // Run the kernel on GPU
- OCL_NDRANGE(1);
-
- // Run on CPU
- for (int32_t i = 0; i < (int32_t) n; ++i)
- cpu(i, cpu_src, cpu_dst);
-
- // Compare
- OCL_MAP_BUFFER(1);
- for (int32_t i = 0; i < (int32_t) n; ++i)
- OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
- OCL_UNMAP_BUFFER(1);
- }
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_double_2);
diff --git a/utests/compiler_double_4.cpp b/utests/compiler_double_4.cpp
deleted file mode 100644
index cb25bd4..0000000
--- a/utests/compiler_double_4.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-#include <cmath>
-#include "utest_helper.hpp"
-
-void compiler_double_4(void)
-{
- const size_t n = 16;
- double cpu_src1[n], cpu_src2[n];
-
- // Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_double_4");
- OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
- OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
- OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
- OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
- OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
- OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
- globals[0] = n;
- locals[0] = 16;
-
- // Run random tests
- OCL_MAP_BUFFER(0);
- OCL_MAP_BUFFER(1);
- for (int32_t i = 0; i < (int32_t) n; ++i) {
- cpu_src1[i] = ((double*)buf_data[0])[i] = rand() * 1e-2;
- cpu_src2[i] = ((double*)buf_data[1])[i] = rand() * 1e-2;
- }
- OCL_UNMAP_BUFFER(0);
- OCL_UNMAP_BUFFER(1);
-
- // Run the kernel on GPU
- OCL_NDRANGE(1);
-
- // Compare
- OCL_MAP_BUFFER(2);
- for (int32_t i = 0; i < (int32_t) n; ++i)
- OCL_ASSERT(fabs(((double*)buf_data[2])[i] - cpu_src1[i] - cpu_src2[i]) < 1e-4);
- OCL_UNMAP_BUFFER(2);
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_double_4);
diff --git a/utests/compiler_double_convert.cpp b/utests/compiler_double_convert.cpp
new file mode 100644
index 0000000..510118a
--- /dev/null
+++ b/utests/compiler_double_convert.cpp
@@ -0,0 +1,621 @@
+#include <cmath>
+#include <string.h>
+#include "utest_helper.hpp"
+
+void compiler_double_convert_int(void)
+{
+ const size_t n = 16;
+ double src[n];
+ int32_t cpu_dst0[n];
+ uint32_t cpu_dst1[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst0, 0, sizeof(cpu_dst0));
+ memset(cpu_dst1, 0, sizeof(cpu_dst1));
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_double_convert_int");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int32_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = ((double*)buf_data[0])[i] = 32.1 * (rand() & 1324135) + 1434342.73209855531;
+ ((int32_t*)buf_data[1])[i] = 0;
+ ((uint32_t*)buf_data[2])[i] = 0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ if (i%3 == 0) continue;
+ cpu_dst0[i] = (int32_t)src[i];
+ cpu_dst1[i] = (uint32_t)src[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("Return Int is %d, ref is %d,\t Uint is %u, ref is %u,\t double is %f\n",
+ // ((int*)buf_data[1])[i], cpu_dst0[i], ((uint32_t*)buf_data[2])[i], cpu_dst1[i], src[i]);
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst0[i]);
+ OCL_ASSERT(((uint32_t*)buf_data[2])[i] == cpu_dst1[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_convert_int);
+
+void compiler_double_convert_float(void)
+{
+ const size_t n = 16;
+ double src[n];
+ float cpu_dst[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst, 0, sizeof(cpu_dst));
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_double_convert_float");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = ((double*)buf_data[0])[i] = 1332.1 * (rand() & 1324135) - 1434342.73209855531 * (rand() & 135);
+ ((float*)buf_data[1])[i] = 0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2); // NOTE(review): buf[2] is never created or mapped in this test — looks like a stray copy-paste; confirm and remove
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_dst[i] = (float)src[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("Return float is %f,\t ref is %f,\t double is %f\n", ((float*)buf_data[1])[i], cpu_dst[i], src[i]);
+ OCL_ASSERT(((float*)buf_data[1])[i] == cpu_dst[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_convert_float);
+
+void compiler_double_convert_short(void)
+{
+ const size_t n = 16;
+ double src[n];
+ int16_t cpu_dst0[n];
+ uint16_t cpu_dst1[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst0, 0, sizeof(cpu_dst0));
+ memset(cpu_dst1, 0, sizeof(cpu_dst1));
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_double_convert_short");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int16_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint16_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = ((double*)buf_data[0])[i] = 10.3443 * (rand() & 15) + 14.8924323;
+ ((int16_t*)buf_data[1])[i] = 0;
+ ((uint16_t*)buf_data[2])[i] = 0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ if (i%3 == 0) continue;
+ cpu_dst0[i] = (int16_t)src[i];
+ cpu_dst1[i] = (uint16_t)src[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("Return Int is %d, ref is %d,\t Uint is %u, ref is %u,\t double is %f\n",
+ // ((int16_t*)buf_data[1])[i], cpu_dst0[i], ((uint16_t*)buf_data[2])[i], cpu_dst1[i], src[i]);
+ OCL_ASSERT(((int16_t*)buf_data[1])[i] == cpu_dst0[i]);
+ OCL_ASSERT(((uint16_t*)buf_data[2])[i] == cpu_dst1[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_convert_short);
+
+void compiler_double_convert_char(void)
+{
+ const size_t n = 16;
+ double src[n];
+ int8_t cpu_dst0[n];
+ uint8_t cpu_dst1[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst0, 0, sizeof(cpu_dst0));
+ memset(cpu_dst1, 0, sizeof(cpu_dst1));
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_double_convert_char");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int8_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint8_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = ((double*)buf_data[0])[i] = 10.3443 * (rand() & 7) + 2.8924323;
+ ((int8_t*)buf_data[1])[i] = 0;
+ ((uint8_t*)buf_data[2])[i] = 0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ if (i%3 == 0) continue;
+ cpu_dst0[i] = (int8_t)src[i];
+ cpu_dst1[i] = (uint8_t)src[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+// printf("Return Int is %d, ref is %d,\t Uint is %u, ref is %u,\t double is %f\n",
+// ((int8_t*)buf_data[1])[i], cpu_dst0[i], ((uint8_t*)buf_data[2])[i], cpu_dst1[i], src[i]);
+ OCL_ASSERT(((int8_t*)buf_data[1])[i] == cpu_dst0[i]);
+ OCL_ASSERT(((uint8_t*)buf_data[2])[i] == cpu_dst1[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_convert_char);
+
+void compiler_double_convert_long(void)
+{
+ const size_t n = 16;
+ double src[n];
+ int64_t cpu_dst0[n];
+ uint64_t cpu_dst1[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst0, 0, sizeof(cpu_dst0));
+ memset(cpu_dst1, 0, sizeof(cpu_dst1));
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_double_convert_long");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = ((double*)buf_data[0])[i] = 10.3443 * (rand() & 7) + 2.8924323;
+ ((int64_t*)buf_data[1])[i] = 0;
+ ((uint64_t*)buf_data[2])[i] = 0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ if (i%3 == 0) continue;
+ cpu_dst0[i] = (int64_t)src[i];
+ cpu_dst1[i] = (uint64_t)src[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+// printf("Return Long is %ld, ref is %ld,\t Ulong is %lu, ref is %lu,\t double is %f\n",
+// ((int64_t*)buf_data[1])[i], cpu_dst0[i], ((uint64_t*)buf_data[2])[i], cpu_dst1[i], src[i]);
+ OCL_ASSERT(((int64_t*)buf_data[1])[i] == cpu_dst0[i]);
+ OCL_ASSERT(((uint64_t*)buf_data[2])[i] == cpu_dst1[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_convert_long);
+
+void compiler_long_convert_double(void)
+{
+ const size_t n = 16;
+ int64_t src0[n];
+ uint64_t src1[n];
+ double cpu_dst0[n];
+ double cpu_dst1[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst0, 0, sizeof(cpu_dst0));
+ memset(cpu_dst1, 0, sizeof(cpu_dst1));
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_long_convert_double");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src0[i] = ((int64_t*)buf_data[0])[i] = 0xABC8ABACDA00C * (rand() & 7);
+ src1[i] = ((uint64_t*)buf_data[1])[i] = 0xCABC8ABACDA00C * (rand() & 15);
+ ((double*)buf_data[2])[i] = 0.0;
+ ((double*)buf_data[3])[i] = 0.0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_dst0[i] = (double)src0[i];
+ cpu_dst1[i] = (double)src1[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+// printf("long is %ld, ref is %f, double is %f \t"
+// "ulong is %lu, ref is %f, double is %f\n",
+// src0[i], cpu_dst0[i], ((double*)buf_data[2])[i],
+// src1[i], cpu_dst1[i], ((double*)buf_data[3])[i]);
+ OCL_ASSERT(((double*)buf_data[2])[i] == cpu_dst0[i]);
+ OCL_ASSERT(((double*)buf_data[3])[i] == cpu_dst1[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_convert_double);
+
+void compiler_int_convert_double(void)
+{
+ const size_t n = 16;
+ int32_t src0[n];
+ uint32_t src1[n];
+ double cpu_dst0[n];
+ double cpu_dst1[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst0, 0, sizeof(cpu_dst0));
+ memset(cpu_dst1, 0, sizeof(cpu_dst1));
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_int_convert_double");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src0[i] = ((int32_t*)buf_data[0])[i] = 0xCABC8A0C * (rand() & 7);
+ src1[i] = ((uint32_t*)buf_data[1])[i] = 0xCACDA00C * (rand() & 15);
+ ((double*)buf_data[2])[i] = 0.0;
+ ((double*)buf_data[3])[i] = 0.0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_dst0[i] = (double)src0[i];
+ cpu_dst1[i] = (double)src1[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+// printf("int is %d, ref is %f, double is %f \t"
+// "uint is %u, ref is %f, double is %f\n",
+// src0[i], cpu_dst0[i], ((double*)buf_data[2])[i],
+// src1[i], cpu_dst1[i], ((double*)buf_data[3])[i]);
+ OCL_ASSERT(((double*)buf_data[2])[i] == cpu_dst0[i]);
+ OCL_ASSERT(((double*)buf_data[3])[i] == cpu_dst1[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_int_convert_double);
+
+void compiler_short_convert_double(void)
+{
+ const size_t n = 16;
+ int16_t src0[n];
+ uint16_t src1[n];
+ double cpu_dst0[n];
+ double cpu_dst1[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst0, 0, sizeof(cpu_dst0));
+ memset(cpu_dst1, 0, sizeof(cpu_dst1));
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_short_convert_double");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int16_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src0[i] = ((int16_t*)buf_data[0])[i] = 0x8A0C * (rand() & 7);
+ src1[i] = ((uint16_t*)buf_data[1])[i] = 0xC00C * (rand() & 15);
+ ((double*)buf_data[2])[i] = 0.0;
+ ((double*)buf_data[3])[i] = 0.0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_dst0[i] = (double)src0[i];
+ cpu_dst1[i] = (double)src1[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+// printf("short is %d, ref is %f, double is %f \t"
+// "ushort is %u, ref is %f, double is %f\n",
+// src0[i], cpu_dst0[i], ((double*)buf_data[2])[i],
+// src1[i], cpu_dst1[i], ((double*)buf_data[3])[i]);
+ OCL_ASSERT(((double*)buf_data[2])[i] == cpu_dst0[i]);
+ OCL_ASSERT(((double*)buf_data[3])[i] == cpu_dst1[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_short_convert_double);
+
+void compiler_char_convert_double(void)
+{
+ const size_t n = 16;
+ int8_t src0[n];
+ uint8_t src1[n];
+ double cpu_dst0[n];
+ double cpu_dst1[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst0, 0, sizeof(cpu_dst0));
+ memset(cpu_dst1, 0, sizeof(cpu_dst1));
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_char_convert_double");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int8_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint8_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src0[i] = ((int8_t*)buf_data[0])[i] = 0x8C * (rand() & 7);
+ src1[i] = ((uint8_t*)buf_data[1])[i] = 0xC0 * (rand() & 15);
+ ((double*)buf_data[2])[i] = 0.0;
+ ((double*)buf_data[3])[i] = 0.0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_dst0[i] = (double)src0[i];
+ cpu_dst1[i] = (double)src1[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+// printf("char is %d, ref is %f, double is %f \t"
+// "uchar is %u, ref is %f, double is %f\n",
+// src0[i], cpu_dst0[i], ((double*)buf_data[2])[i],
+// src1[i], cpu_dst1[i], ((double*)buf_data[3])[i]);
+ OCL_ASSERT(((double*)buf_data[2])[i] == cpu_dst0[i]);
+ OCL_ASSERT(((double*)buf_data[3])[i] == cpu_dst1[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_char_convert_double);
+
+void compiler_float_convert_double(void)
+{
+ const size_t n = 16;
+ float src[n];
+ double cpu_dst[n];
+
+ if (!cl_check_double())
+ return;
+
+ memset(cpu_dst, 0, sizeof(cpu_dst));
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_convert", "compiler_float_convert_double");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = ((float*)buf_data[0])[i] = (float)(0x8C * (rand() & 7)) * 1342.42f;
+ ((double*)buf_data[1])[i] = 0.0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_dst[i] = (double)src[i];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%f, \t%f\n", ((double*)buf_data[1])[i], cpu_dst[i]);
+ OCL_ASSERT(((double*)buf_data[1])[i] == cpu_dst[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_float_convert_double);
diff --git a/utests/compiler_double_div.cpp b/utests/compiler_double_div.cpp
new file mode 100644
index 0000000..f78e238
--- /dev/null
+++ b/utests/compiler_double_div.cpp
@@ -0,0 +1,83 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void compiler_double_div(void)
+{
+ const size_t n = 16;
+ double cpu_src0[n], cpu_src1[n];
+
+ if (!cl_check_double())
+ return;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_double_div");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_src0[i] = ((double*)buf_data[0])[i] = ((double)(((i - 5)*1334) * 11105));
+ cpu_src1[i] = ((double*)buf_data[1])[i] = 499.13542123*(i + 132.43 + 142.32*i);
+ ((double*)buf_data[2])[i] = 0.0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ if (i % 3 != 0)
+ OCL_ASSERT(fabs(((double*)buf_data[2])[i] - cpu_src0[i]/cpu_src1[i]) < 1e-32);
+ else
+ OCL_ASSERT(((double*)buf_data[2])[i] == 0.0);
+
+ //printf("%d : %f ref value: %f\n", i, ((double*)buf_data[2])[i], cpu_src0[i]/cpu_src1[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_div);
+
+void compiler_double_div_uniform(void)
+{
+ double src0 = 13234.1438786319;
+ double src1 = 0.000134123;
+ double tmp = 25.128;
+
+ if (!cl_check_double())
+ return;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_double_div", "compiler_double_div_uniform");
+ OCL_CREATE_BUFFER(buf[0], 0, sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(double), &src0);
+ OCL_SET_ARG(1, sizeof(double), &src1);
+ OCL_SET_ARG(2, sizeof(double), &tmp);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[0]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(0);
+ OCL_ASSERT(fabs(((double*)buf_data[0])[0] - src0/src1) < 1e-32);
+ //printf("%f ref value: %f\n", ((double*)buf_data[0])[0], src0/src1);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_div_uniform);
diff --git a/utests/compiler_double_precision.cpp b/utests/compiler_double_precision.cpp
index 217fd18..f77a059 100644
--- a/utests/compiler_double_precision.cpp
+++ b/utests/compiler_double_precision.cpp
@@ -9,6 +9,9 @@ static void double_precision_check(void)
double d1 = 0.12355678922345678;
float cpu_result = d1 - d0;
+ if (!cl_check_double())
+ return;
+
// Setup kernel and buffers
OCL_CREATE_KERNEL("double_precision_check");
//OCL_CREATE_KERNEL("compiler_array");
diff --git a/utests/compiler_fill_image.cpp b/utests/compiler_fill_image.cpp
index 5a38b8c..0fb53df 100644
--- a/utests/compiler_fill_image.cpp
+++ b/utests/compiler_fill_image.cpp
@@ -34,11 +34,11 @@ static void compiler_fill_image(void)
OCL_NDRANGE(2);
// Check result
- OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER_GTT(0);
for (uint32_t j = 0; j < h; ++j)
for (uint32_t i = 0; i < w; i++)
OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == 0x78563412);
- OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER_GTT(0);
}
MAKE_UTEST_FROM_FUNCTION(compiler_fill_image);
diff --git a/utests/compiler_fill_image_1d_array.cpp b/utests/compiler_fill_image_1d_array.cpp
index 67f9643..23a8425 100644
--- a/utests/compiler_fill_image_1d_array.cpp
+++ b/utests/compiler_fill_image_1d_array.cpp
@@ -69,6 +69,7 @@ static void compiler_fill_image_1d_array(void)
for (uint32_t i = 0; i < w; i++) {
OCL_ASSERT(dst[(array - 1)*w + i] == 0x0);
}
+ free(src);
free(dst);
}
diff --git a/utests/compiler_fill_image_2d_array.cpp b/utests/compiler_fill_image_2d_array.cpp
index fc09362..ab7470e 100644
--- a/utests/compiler_fill_image_2d_array.cpp
+++ b/utests/compiler_fill_image_2d_array.cpp
@@ -11,6 +11,7 @@ static void compiler_fill_image_2d_array(void)
size_t origin[3] = { };
size_t region[3];
uint32_t* dst;
+ uint32_t* src;
memset(&desc, 0x0, sizeof(cl_image_desc));
memset(&format, 0x0, sizeof(cl_image_format));
@@ -28,9 +29,16 @@ static void compiler_fill_image_2d_array(void)
OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
- OCL_MAP_BUFFER_GTT(0);
- memset(buf_data[0], 0, sizeof(uint32_t) * w * h * array);
- OCL_UNMAP_BUFFER_GTT(0);
+ region[0] = w;
+ region[1] = h;
+ region[2] = array;
+
+ // As we don't know the pitch right now, we cannot
+ // use map to setup the image. It is safer to use
+ // write image
+ src = (uint32_t*)malloc(sizeof(uint32_t) * w * h * array);
+ memset(src, 0, sizeof(uint32_t) * w * h * array);
+ OCL_WRITE_IMAGE(buf[0], origin, region, src);
// Run the kernel
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -43,9 +51,6 @@ static void compiler_fill_image_2d_array(void)
OCL_NDRANGE(3);
// Check result
- region[0] = w;
- region[1] = h;
- region[2] = array;
dst = (uint32_t*)malloc(w*h*array*sizeof(uint32_t));
OCL_READ_IMAGE(buf[0], origin, region, dst);
@@ -79,6 +84,7 @@ static void compiler_fill_image_2d_array(void)
}
}
free(dst);
+ free(src);
}
MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_2d_array);
diff --git a/utests/compiler_fill_image_3d.cpp b/utests/compiler_fill_image_3d.cpp
index ec96e80..fd84e76 100644
--- a/utests/compiler_fill_image_3d.cpp
+++ b/utests/compiler_fill_image_3d.cpp
@@ -39,12 +39,12 @@ static void compiler_fill_image_3d(void)
OCL_NDRANGE(3);
// Check result
- OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER_GTT(0);
for (uint32_t k = 0; k < depth; k++)
for (uint32_t j = 0; j < h; ++j)
for (uint32_t i = 0; i < w; i++)
OCL_ASSERT(((uint32_t*)buf_data[0])[k*w*h + j*w + i] == 0x78563412);
- OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER_GTT(0);
}
MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_3d);
diff --git a/utests/compiler_function_qualifiers.cpp b/utests/compiler_function_qualifiers.cpp
index 622313c..4599c95 100644
--- a/utests/compiler_function_qualifiers.cpp
+++ b/utests/compiler_function_qualifiers.cpp
@@ -13,6 +13,7 @@ void compiler_function_qualifiers(void)
param_value = malloc(param_value_size);
err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, param_value_size, param_value, NULL);
OCL_ASSERT(err == CL_SUCCESS);
+ free(param_value);
}
MAKE_UTEST_FROM_FUNCTION(compiler_function_qualifiers);
diff --git a/utests/compiler_get_sub_group_size.cpp b/utests/compiler_get_max_sub_group_size.cpp
similarity index 67%
rename from utests/compiler_get_sub_group_size.cpp
rename to utests/compiler_get_max_sub_group_size.cpp
index 20339d7..6f56b33 100644
--- a/utests/compiler_get_sub_group_size.cpp
+++ b/utests/compiler_get_max_sub_group_size.cpp
@@ -1,11 +1,13 @@
#include "utest_helper.hpp"
-void compiler_get_sub_group_size(void)
+void compiler_get_max_sub_group_size(void)
{
+ if(!cl_check_subgroups())
+ return;
const size_t n = 256;
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_get_sub_group_size");
+ OCL_CREATE_KERNEL("compiler_get_max_sub_group_size");
OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -24,9 +26,9 @@ void compiler_get_sub_group_size(void)
OCL_MAP_BUFFER(0);
int* dst = (int *)buf_data[0];
for (int32_t i = 0; i < (int32_t) n; ++i){
- OCL_ASSERT(8 == dst[i] || 16 == dst[i]);
+ OCL_ASSERT(8 == dst[i] || 16 == dst[i] || 32 == dst[i]);
}
OCL_UNMAP_BUFFER(0);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_get_sub_group_size);
+MAKE_UTEST_FROM_FUNCTION(compiler_get_max_sub_group_size);
diff --git a/utests/compiler_get_sub_group_id.cpp b/utests/compiler_get_sub_group_local_id.cpp
similarity index 75%
rename from utests/compiler_get_sub_group_id.cpp
rename to utests/compiler_get_sub_group_local_id.cpp
index 0d88d29..84fbce0 100644
--- a/utests/compiler_get_sub_group_id.cpp
+++ b/utests/compiler_get_sub_group_local_id.cpp
@@ -1,11 +1,13 @@
#include "utest_helper.hpp"
-void compiler_get_sub_group_id(void)
+void compiler_get_sub_group_local_id(void)
{
+ if(!cl_check_subgroups())
+ return;
const size_t n = 256;
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_get_sub_group_id");
+ OCL_CREATE_KERNEL("compiler_get_sub_group_local_id");
OCL_CREATE_BUFFER(buf[0], 0, (n+1) * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -30,4 +32,4 @@ void compiler_get_sub_group_id(void)
OCL_UNMAP_BUFFER(0);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_get_sub_group_id);
+MAKE_UTEST_FROM_FUNCTION(compiler_get_sub_group_local_id);
diff --git a/utests/compiler_half.cpp b/utests/compiler_half.cpp
index e8ed286..2b4ea2e 100644
--- a/utests/compiler_half.cpp
+++ b/utests/compiler_half.cpp
@@ -2,127 +2,8 @@
#include <cstring>
#include <iostream>
#include <cmath>
-#include <algorithm>
#include "utest_helper.hpp"
-static uint32_t __half_to_float(uint16_t h, bool* isInf = NULL, bool* infSign = NULL)
-{
- struct __FP32 {
- uint32_t mantissa:23;
- uint32_t exponent:8;
- uint32_t sign:1;
- };
- struct __FP16 {
- uint32_t mantissa:10;
- uint32_t exponent:5;
- uint32_t sign:1;
- };
- uint32_t f;
- __FP32 o;
- memset(&o, 0, sizeof(o));
- __FP16 i;
- memcpy(&i, &h, sizeof(uint16_t));
-
- if (isInf)
- *isInf = false;
- if (infSign)
- *infSign = false;
-
- if (i.exponent == 0 && i.mantissa == 0) // (Signed) zero
- o.sign = i.sign;
- else {
- if (i.exponent == 0) { // Denormal (converts to normalized)
- // Adjust mantissa so it's normalized (and keep
- // track of exponent adjustment)
- int e = -1;
- uint m = i.mantissa;
- do {
- e++;
- m <<= 1;
- } while ((m & 0x400) == 0);
-
- o.mantissa = (m & 0x3ff) << 13;
- o.exponent = 127 - 15 - e;
- o.sign = i.sign;
- } else if (i.exponent == 0x1f) { // Inf/NaN
- // NOTE: Both can be handled with same code path
- // since we just pass through mantissa bits.
- o.mantissa = i.mantissa << 13;
- o.exponent = 255;
- o.sign = i.sign;
-
- if (isInf) {
- *isInf = (i.mantissa == 0);
- if (infSign)
- *infSign = !i.sign;
- }
- } else { // Normalized number
- o.mantissa = i.mantissa << 13;
- o.exponent = 127 - 15 + i.exponent;
- o.sign = i.sign;
- }
- }
-
- memcpy(&f, &o, sizeof(uint32_t));
- return f;
-}
-
-
-static uint16_t __float_to_half(uint32_t x)
-{
- uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
- uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
- unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
-
- /* If zero, or denormal, or exponent underflows too much for a denormal
- * half, return signed zero. */
- if (e < 103)
- return bits;
-
- /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
- if (e > 142) {
- bits |= 0x7c00u;
- /* If exponent was 0xff and one mantissa bit was set, it means NaN,
- * not Inf, so make sure we set one mantissa bit too. */
- bits |= e == 255 && (x & 0x007fffffu);
- return bits;
- }
-
- /* If exponent underflows but not too much, return a denormal */
- if (e < 113) {
- m |= 0x0800u;
- /* Extra rounding may overflow and set mantissa to 0 and exponent
- * to 1, which is OK. */
- bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
- return bits;
- }
-
- bits |= ((e - 112) << 10) | (m >> 1);
- /* Extra rounding. An overflow will set mantissa to 0 and increment
- * the exponent, which is OK. */
- bits += m & 1;
- return bits;
-}
-
-static int check_half_device(void)
-{
- std::string extStr;
- size_t param_value_size;
- OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, ¶m_value_size);
- std::vector<char> param_value(param_value_size);
- OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
- param_value.empty() ? NULL : ¶m_value.front(), ¶m_value_size);
- if (!param_value.empty())
- extStr = std::string(¶m_value.front(), param_value_size-1);
-
- if (std::strstr(extStr.c_str(), "cl_khr_fp16") == NULL) {
- printf("No cl_khr_fp16, Skip!");
- return 0;
- }
-
- return 1;
-}
-
void compiler_half_basic(void)
{
const size_t n = 16;
@@ -131,7 +12,7 @@ void compiler_half_basic(void)
float f = 2.5;
uint32_t tmp_f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
memcpy(&tmp_f, &f, sizeof(float));
@@ -172,7 +53,7 @@ void compiler_half_basic(void)
for (int32_t i = 0; i < (int32_t) n; ++i) {
tmp_f = __half_to_float(((uint16_t *)buf_data[1])[i]);
memcpy(&f, &tmp_f, sizeof(float));
- printf("%f %f\n", f, fdst[i]);
+ //printf("%f %f\n", f, fdst[i]);
OCL_ASSERT(fabs(f - fdst[i]) <= 0.01 * fabs(fdst[i]) || (fdst[i] == 0.0 && f == 0.0));
}
OCL_UNMAP_BUFFER(1);
@@ -180,17 +61,24 @@ void compiler_half_basic(void)
MAKE_UTEST_FROM_FUNCTION(compiler_half_basic);
+static const int half_n = 16;
+static float half_test_src[half_n] = {
+ -0.23455f, 1.23413f, 2.3412, 8.234f,
+ -122.31f, -14.233f, 0.0023f, 99.322f,
+ 0.0f, 0.332f, 123.12f, -0.003f,
+ 16.0f, 19.22f, 128.006f, 25.032f
+};
-#define HALF_MATH_TEST_1ARG(NAME, CPPNAME, RANGE_L, RANGE_H) \
+#define HALF_MATH_TEST_1ARG(NAME, CPPNAME) \
void compiler_half_math_##NAME(void) \
{ \
- const size_t n = 16; \
+ const size_t n = half_n; \
uint16_t hsrc[n]; \
float fsrc[n], fdst[n]; \
uint32_t tmp_f; \
float f; \
\
- if (!check_half_device()) \
+ if (!cl_check_half()) \
return; \
\
OCL_CREATE_KERNEL_FROM_FILE("compiler_half_math", "compiler_half_math_" #NAME); \
@@ -202,7 +90,7 @@ MAKE_UTEST_FROM_FUNCTION(compiler_half_basic);
locals[0] = 16; \
\
for (int32_t i = 0; i < (int32_t) n; ++i) { \
- fsrc[i] = RANGE_L + ((rand()%1000) / 1000.0f ) * ((RANGE_H) - (RANGE_L)); \
+ fsrc[i] = half_test_src[i]; \
memcpy(&tmp_f, &fsrc[i], sizeof(float)); \
hsrc[i] = __float_to_half(tmp_f); \
} \
@@ -225,27 +113,27 @@ MAKE_UTEST_FROM_FUNCTION(compiler_half_basic);
bool isInf, infSign; \
tmp_f = __half_to_float(((uint16_t *)buf_data[1])[i], &isInf, &infSign); \
memcpy(&f, &tmp_f, sizeof(float)); \
- /*printf("%.15f %.15f, diff is %%%f\n", f, fdst[i], (fabs(f - fdst[i])/fabs(fdst[i]))); */ \
+ /* printf("%.15f %.15f, diff is %f\n", f, fdst[i], (fabs(f - fdst[i])/fabs(fdst[i]))); */ \
OCL_ASSERT(((fabs(fdst[i]) < 6e-8f) && (fabs(f) < 6e-8f)) || \
(fabs(f - fdst[i]) <= 0.03 * fabs(fdst[i])) || \
(isInf && ((infSign && fdst[i] > 65504.0f) || (!infSign && fdst[i] < -65504.0f))) || \
- (isnan(f) && isnan(fdst[i]))); \
+ (std::isnan(f) && std::isnan(fdst[i]))); \
} \
OCL_UNMAP_BUFFER(1); \
} \
MAKE_UTEST_FROM_FUNCTION(compiler_half_math_##NAME);
-HALF_MATH_TEST_1ARG(sin, sinf, -10, 10);
-HALF_MATH_TEST_1ARG(cos, cosf, -10, 10);
-HALF_MATH_TEST_1ARG(sinh, sinh, -10, 10);
-HALF_MATH_TEST_1ARG(cosh, cosh, -10, 10);
-HALF_MATH_TEST_1ARG(tan, tanf, -3.14/2, 3.14/2);
-HALF_MATH_TEST_1ARG(log10, log10f, 0.1, 100);
-HALF_MATH_TEST_1ARG(log, logf, 0.01, 1000);
-HALF_MATH_TEST_1ARG(trunc, truncf, -1000, 1000);
-HALF_MATH_TEST_1ARG(exp, expf, -19.0, 20.0);
-HALF_MATH_TEST_1ARG(sqrt, sqrtf, -19.0, 10.0);
-HALF_MATH_TEST_1ARG(ceil, ceilf, -19.0, 20.0);
+HALF_MATH_TEST_1ARG(sin, sinf);
+HALF_MATH_TEST_1ARG(cos, cosf);
+HALF_MATH_TEST_1ARG(sinh, sinh);
+HALF_MATH_TEST_1ARG(cosh, cosh);
+HALF_MATH_TEST_1ARG(tan, tanf);
+HALF_MATH_TEST_1ARG(log10, log10f);
+HALF_MATH_TEST_1ARG(log, logf);
+HALF_MATH_TEST_1ARG(trunc, truncf);
+HALF_MATH_TEST_1ARG(exp, expf);
+HALF_MATH_TEST_1ARG(sqrt, sqrtf);
+HALF_MATH_TEST_1ARG(ceil, ceilf);
#define HALF_MATH_TEST_2ARG(NAME, CPPNAME, RANGE_L, RANGE_H) \
void compiler_half_math_##NAME(void) \
@@ -256,7 +144,7 @@ HALF_MATH_TEST_1ARG(ceil, ceilf, -19.0, 20.0);
uint32_t tmp_f; \
float f; \
\
- if (!check_half_device()) \
+ if (!cl_check_half()) \
return; \
\
OCL_CREATE_KERNEL_FROM_FILE("compiler_half_math", "compiler_half_math_" #NAME); \
@@ -273,7 +161,7 @@ HALF_MATH_TEST_1ARG(ceil, ceilf, -19.0, 20.0);
fsrc0[i] = RANGE_L + (((RANGE_H) - (RANGE_L))/n) * i; \
memcpy(&tmp_f, &fsrc0[i], sizeof(float)); \
hsrc0[i] = __float_to_half(tmp_f); \
- fsrc1[i] = RANGE_L + ((rand()%1000) / 1000.0f ) * ((RANGE_H) - (RANGE_L)); \
+ fsrc1[i] = RANGE_L + (half_test_src[i/4] + 63) * ((RANGE_H) - (RANGE_L)); \
memcpy(&tmp_f, &fsrc1[i], sizeof(float)); \
hsrc1[i] = __float_to_half(tmp_f); \
} \
@@ -303,7 +191,7 @@ HALF_MATH_TEST_1ARG(ceil, ceilf, -19.0, 20.0);
OCL_ASSERT(((fabs(fdst[i]) < 6e-8f) && (fabs(f) < 6e-8f)) || \
(fabs(f - fdst[i]) <= 0.03 * fabs(fdst[i])) || \
(isInf && ((infSign && fdst[i] > 65504.0f) || (!infSign && fdst[i] < -65504.0f))) || \
- (isnan(f) && isnan(fdst[i]))); \
+ (std::isnan(f) && std::isnan(fdst[i]))); \
} \
OCL_UNMAP_BUFFER(2); \
} \
@@ -318,7 +206,7 @@ void compiler_half_isnan(void)
const size_t n = 16*2;
uint16_t hsrc[n];
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -359,7 +247,7 @@ void compiler_half_isinf(void)
const size_t n = 16;
uint16_t hsrc[n];
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -406,7 +294,7 @@ void compiler_half_to_float(void)
float fdst[n];
uint32_t tmp_f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -451,7 +339,7 @@ void compiler_half_as_char2(void)
uint16_t hsrc[n];
uint8_t* csrc = (uint8_t*)hsrc;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -493,7 +381,7 @@ void compiler_half2_as_int(void)
uint16_t hsrc[n];
int* isrc = (int*)hsrc;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -537,7 +425,7 @@ void compiler_half_to_char_sat(void)
char dst[n];
uint32_t tmp_f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -590,7 +478,7 @@ void compiler_half_to_ushort_sat(void)
uint16_t dst[n];
uint32_t tmp_f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -641,7 +529,7 @@ void compiler_half_to_uint_sat(void)
uint32_t dst[n];
uint32_t tmp_f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -691,7 +579,7 @@ void compiler_uchar_to_half(void)
float fdst[n];
uint32_t tmp_f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -738,7 +626,7 @@ void compiler_int_to_half(void)
float fdst[n];
uint32_t tmp_f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -786,7 +674,7 @@ void compiler_half_to_long(void)
uint32_t tmp_f;
float f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -833,7 +721,7 @@ void compiler_ulong_to_half(void)
uint32_t tmp_f;
float f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -880,7 +768,7 @@ void compiler_half_to_long_sat(void)
uint32_t tmp_f;
float f;
- if (!check_half_device())
+ if (!cl_check_half())
return;
// Setup kernel and buffers
@@ -922,3 +810,105 @@ void compiler_half_to_long_sat(void)
OCL_UNMAP_BUFFER(1);
}
MAKE_UTEST_FROM_FUNCTION(compiler_half_to_long_sat);
+
+void compiler_half_to_double(void)
+{
+ const size_t n = 16;
+ uint16_t hsrc[n];
+ double ddst[n];
+ uint32_t tmp_f;
+ float f;
+
+// if (!cl_check_half())
+// return;
+ if (!cl_check_double())
+ return;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_half_to_double");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ f = -100.1f + 10.3f * i;
+ memcpy(&tmp_f, &f, sizeof(float));
+ hsrc[i] = __float_to_half(tmp_f);
+ ddst[i] = (double)f;
+ }
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], hsrc, sizeof(hsrc));
+ memset(buf_data[1], 0, n*sizeof(double));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ double dd = ((double *)(buf_data[1]))[i];
+// printf("%f %f, diff is %%%f\n", dd, ddst[i], fabs(dd - ddst[i])/fabs(ddst[i]));
+ OCL_ASSERT(fabs(dd - ddst[i]) < 0.001f * fabs(ddst[i]));
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_to_double);
+
+void compiler_double_to_half(void)
+{
+ const size_t n = 16;
+ uint16_t hdst[n];
+ double src[n];
+ uint32_t tmp_f;
+ float f;
+
+// if (!cl_check_half())
+// return;
+ if (!cl_check_double())
+ return;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_double_to_half");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ f = -100.1f + 10.3f * i;
+ src[i] = (double)f;
+ memcpy(&tmp_f, &f, sizeof(float));
+ hdst[i] = __float_to_half(tmp_f);
+ }
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src, sizeof(src));
+ memset(buf_data[1], 0, n*sizeof(uint16_t));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ uint16_t hf = ((uint16_t *)(buf_data[1]))[i];
+ //tmp_f = __half_to_float(hf);
+ //memcpy(&f, &tmp_f, sizeof(float));
+ //printf("%f, %x, %x\n", f, hf, hdst[i]);
+ OCL_ASSERT(hf == hdst[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_double_to_half);
diff --git a/utests/compiler_mad24.cpp b/utests/compiler_mad24.cpp
index a3890a1..ba2dcf6 100644
--- a/utests/compiler_mad24.cpp
+++ b/utests/compiler_mad24.cpp
@@ -34,7 +34,7 @@ void compiler_mad24(void)
OCL_MAP_BUFFER(3);
for (int i = 0; i < n; ++i)
- OCL_ASSERT(((int*)buf_data[3])[i] == ((src1[i] << 8) >> 8) * ((src2[i] << 8) >> 8) + src3[i]);
+ OCL_ASSERT(((int*)buf_data[3])[i] == (src1[i]) * (src2[i]) + src3[i]);
OCL_UNMAP_BUFFER(3);
}
diff --git a/utests/compiler_math.cpp b/utests/compiler_math.cpp
index e0c4487..0c238c9 100644
--- a/utests/compiler_math.cpp
+++ b/utests/compiler_math.cpp
@@ -72,10 +72,10 @@ static void compiler_math(void)
for (int i = 0; i < 16; ++i) {
const float cpu = cpu_dst[i];
const float gpu = ((float*)buf_data[0])[i];
- if (isinf(cpu))
- OCL_ASSERT(isinf(gpu));
- else if (isnan(cpu))
- OCL_ASSERT(isnan(gpu));
+ if (std::isinf(cpu))
+ OCL_ASSERT(std::isinf(gpu));
+ else if (std::isnan(cpu))
+ OCL_ASSERT(std::isnan(gpu));
else
OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
}
diff --git a/utests/compiler_math_2op.cpp b/utests/compiler_math_2op.cpp
index 454967d..d771dba 100644
--- a/utests/compiler_math_2op.cpp
+++ b/utests/compiler_math_2op.cpp
@@ -65,10 +65,10 @@ static void compiler_math_2op(void)
for (int i = 0; i < 16; ++i) {
const float cpu = cpu_dst[i];
const float gpu = ((float*)buf_data[0])[i];
- if (isinf(cpu))
- OCL_ASSERT(isinf(gpu));
- else if (isnan(cpu))
- OCL_ASSERT(isnan(gpu));
+ if (std::isinf(cpu))
+ OCL_ASSERT(std::isinf(gpu));
+ else if (std::isnan(cpu))
+ OCL_ASSERT(std::isnan(gpu));
else {
OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
}
diff --git a/utests/compiler_math_3op.cpp b/utests/compiler_math_3op.cpp
index a382b0a..e8713a1 100644
--- a/utests/compiler_math_3op.cpp
+++ b/utests/compiler_math_3op.cpp
@@ -5,20 +5,22 @@
static void cpu_compiler_math(float *dst, float *src1, float *src2, float *src3, int i)
{
const float x = src1[i], y = src2[i], z = src3[i];
- switch (i) {
+ switch (i%2) {
case 0: dst[i] = x * y + z; break;
case 1: dst[i] = x * y + z; break;
default: dst[i] = 1.f; break;
};
+ dst[0] = (src1[0]*src2[0]+src3[0]);
}
-static void compiler_math_3op(void)
+static void compiler_math_3op_float(void)
{
const size_t n = 32;
float cpu_dst[32], cpu_src1[32], cpu_src2[32], cpu_src3[32];
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_math_3op");
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_math_3op",
+ "compiler_math_3op_float");
OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
@@ -35,9 +37,9 @@ static void compiler_math_3op(void)
OCL_MAP_BUFFER(2);
OCL_MAP_BUFFER(3);
for (uint32_t i = 0; i < 32; ++i) {
- cpu_src1[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
- cpu_src2[i] = ((float*)buf_data[2])[i] = .1f * (rand() & 15);
- cpu_src3[i] = ((float*)buf_data[3])[i] = .1f * (rand() & 15);
+ cpu_src1[i] = ((float*)buf_data[1])[i] = .001f * (rand() & 15);
+ cpu_src2[i] = ((float*)buf_data[2])[i] = .002f * (rand() & 15);
+ cpu_src3[i] = ((float*)buf_data[3])[i] = .003f * (rand() & 15);
}
OCL_UNMAP_BUFFER(1);
OCL_UNMAP_BUFFER(2);
@@ -50,15 +52,67 @@ static void compiler_math_3op(void)
for (int i = 0; i < 16; ++i) {
const float cpu = cpu_dst[i];
const float gpu = ((float*)buf_data[0])[i];
- if (isinf(cpu))
- OCL_ASSERT(isinf(gpu));
- else if (isnan(cpu))
- OCL_ASSERT(isnan(gpu));
+ //printf("cpu:%f, gpu:%f\n", cpu, gpu);
+ if (std::isinf(cpu))
+ OCL_ASSERT(std::isinf(gpu));
+ else if (std::isnan(cpu))
+ OCL_ASSERT(std::isnan(gpu));
else
OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
}
OCL_UNMAP_BUFFER(0);
}
}
+MAKE_UTEST_FROM_FUNCTION(compiler_math_3op_float)
+static void compiler_math_3op_half(void)
+{
+ if (!cl_check_half())
+ return;
+ const size_t n = 32;
+ float cpu_dst[32], cpu_src1[32], cpu_src2[32], cpu_src3[32];
+
+ // Setup kernel and buffers
+ OCL_CALL(cl_kernel_init, "compiler_math_3op.cl",
+ "compiler_math_3op_half",
+ SOURCE, "-DHALF");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(cl_half), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(cl_half), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(cl_half), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(cl_half), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = 16;
+ locals[0] = 16;
-MAKE_UTEST_FROM_FUNCTION(compiler_math_3op)
+ for (int j = 0; j < 1000; j ++) {
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (uint32_t i = 0; i < 32; ++i) {
+ ((cl_half*)buf_data[1])[i] = __float_to_half(as_uint(cpu_src1[i] = 0.1f*(rand() & 63)));
+ ((cl_half*)buf_data[2])[i] = __float_to_half(as_uint(cpu_src2[i] = 0.02f*(rand() & 63)));
+ ((cl_half*)buf_data[3])[i] = __float_to_half(as_uint(cpu_src3[i] = 0.02f*(rand() & 63)));
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+ OCL_NDRANGE(1);
+
+ for (int i = 0; i < 16; ++i)
+ cpu_compiler_math(cpu_dst, cpu_src1, cpu_src2, cpu_src3, i);
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < 16; ++i) {
+ const float cpu = cpu_dst[i];
+ bool isInf, infSign;
+ const float gpu = as_float(__half_to_float(((uint16_t*)buf_data[0])[i], &isInf, &infSign));
+ //printf("cpu:(%f*%f+%f) = %f, gpu:%f\n", cpu_src1[i], cpu_src2[i], cpu_src3[i],cpu,gpu);
+ OCL_ASSERT(((fabs(cpu) < 6e-8f) && (gpu < 6e-8f)) || (fabs(cpu - gpu) <= 0.3 * fabs(cpu)) ||
+ (isInf && ((infSign && cpu > 65504.0f) || (!infSign && cpu < -65504.0f))) ||
+ (std::isnan(gpu) && std::isnan(cpu)));
+ }
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_math_3op_half)
diff --git a/utests/compiler_mix.cpp b/utests/compiler_mix.cpp
new file mode 100644
index 0000000..f1ddde0
--- /dev/null
+++ b/utests/compiler_mix.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+#include <cmath>
+void compiler_mix(void)
+{
+ const float MAXERR = 1e-3f;
+ const int n = 1024;
+ float src1[n], src2[n], src3[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_mix");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((float*)buf_data[0])[i] = (float)rand();
+ src2[i] = ((float*)buf_data[1])[i] = (float)rand();
+ src3[i] = ((float*)buf_data[2])[i] = (float)rand()/(float)RAND_MAX;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(3);
+ float res, err;
+ float max_err = 0.0f;
+ for (int i = 0; i < n; ++i)
+ {
+ res = src1[i] + ((src2[i] - src1[i]) * src3[i]);
+ err = fabsf((((float*)buf_data[3])[i] - res)/ res);
+ max_err = err > max_err? err: max_err;
+ }
+ OCL_UNMAP_BUFFER(3);
+ printf("\tmix max err is %g\n",max_err);
+ OCL_ASSERT(max_err < MAXERR);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mix);
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_movforphi_undef.cpp
index 8f1e66e..01eae06 100644
--- a/utests/compiler_movforphi_undef.cpp
+++ b/utests/compiler_movforphi_undef.cpp
@@ -42,8 +42,8 @@ static void compiler_movforphi_undef(void)
OCL_NDRANGE(2);
// Check result
- OCL_MAP_BUFFER(0);
- OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER_GTT(0);
+ OCL_MAP_BUFFER_GTT(1);
// Just compare the initial 2 data is enough for this case, as the initial 2 data must in the first
// tile box and we can just get the correct coords.
for (uint32_t j = 0; j < 1; ++j)
@@ -52,8 +52,8 @@ static void compiler_movforphi_undef(void)
if (i == 0)
OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i + 1] == ((uint32_t*)buf_data[1])[j * w + i]);
}
- OCL_UNMAP_BUFFER(0);
- OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER_GTT(0);
+ OCL_UNMAP_BUFFER_GTT(1);
OCL_CALL(clReleaseSampler, sampler);
}
diff --git a/utests/compiler_mul24.cpp b/utests/compiler_mul24.cpp
index 8a36947..f1a9a40 100644
--- a/utests/compiler_mul24.cpp
+++ b/utests/compiler_mul24.cpp
@@ -29,7 +29,7 @@ void compiler_mul24(void)
OCL_MAP_BUFFER(2);
for (int i = 0; i < n; ++i)
- OCL_ASSERT(((int*)buf_data[2])[i] == ((src1[i] << 8) >> 8) * ((src2[i] << 8) >> 8));
+ OCL_ASSERT(((int*)buf_data[2])[i] == (src1[i]) * (src2[i]));
OCL_UNMAP_BUFFER(2);
}
diff --git a/utests/compiler_popcount.cpp b/utests/compiler_popcount.cpp
index c960ae6..c149690 100644
--- a/utests/compiler_popcount.cpp
+++ b/utests/compiler_popcount.cpp
@@ -51,7 +51,7 @@ void test(const char *kernel_name, int s_type)
OCL_MAP_BUFFER(1);
OCL_ASSERT(((T*)buf_data[1])[0] == 0);
for (int i = 1; i < n; ++i){
- OCL_ASSERT(((T*)buf_data[1])[i] == n-i-s_type);
+ OCL_ASSERT(((T*)buf_data[1])[i] == (T)n-i-s_type);
}
OCL_UNMAP_BUFFER(1);
}
diff --git a/utests/compiler_sub_group_all.cpp b/utests/compiler_sub_group_all.cpp
deleted file mode 100644
index d8e4130..0000000
--- a/utests/compiler_sub_group_all.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-#include "utest_helper.hpp"
-
-void compiler_sub_group_all(void)
-{
- const size_t n = 40;
-
- // Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_sub_group_all");
- OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
- OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
- OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
- OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-
- globals[0] = n;
- locals[0] = 10;
-
- OCL_MAP_BUFFER(0);
- for (int32_t i = 0; i < (int32_t) n; ++i)
- ((int*)buf_data[0])[i] = i;
- OCL_UNMAP_BUFFER(0);
-
- // Run the kernel on GPU
- OCL_NDRANGE(1);
-
- // Run on CPU
-
- // Compare
- OCL_MAP_BUFFER(1);
- for (int32_t i = 0; i < (int32_t) n; ++i) {
- //printf("%d %d\n", i, ((int *)buf_data[1])[i]);
- if (i % 2 == 1) {
- if (i < (int32_t)locals[0])
- OCL_ASSERT(((int *)buf_data[1])[i] == 1);
- else
- OCL_ASSERT(((int *)buf_data[1])[i] == 2);
- }
- else
- OCL_ASSERT(((int *)buf_data[1])[i] == 3);
- }
- OCL_UNMAP_BUFFER(1);
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_all);
diff --git a/utests/compiler_sub_group_any.cpp b/utests/compiler_sub_group_any.cpp
deleted file mode 100644
index 98b1bdd..0000000
--- a/utests/compiler_sub_group_any.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-#include "utest_helper.hpp"
-
-void compiler_sub_group_any(void)
-{
- const size_t n = 40;
-
- // Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_sub_group_any");
- OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
- OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
- OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
- OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-
- globals[0] = n;
- locals[0] = 10;
-
- OCL_MAP_BUFFER(0);
- for (int32_t i = 0; i < (int32_t) n; ++i)
- ((int*)buf_data[0])[i] = i;
- OCL_UNMAP_BUFFER(0);
-
- // Run the kernel on GPU
- OCL_NDRANGE(1);
-
- // Run on CPU
-
- // Compare
- OCL_MAP_BUFFER(1);
- for (int32_t i = 0; i < (int32_t) n; ++i){
- //printf("%d %d\n", i, ((int *)buf_data[1])[i]);
- if (i % 2 == 1) {
- if (i < (int32_t)locals[0])
- OCL_ASSERT(((int *)buf_data[1])[i] == 1);
- else
- OCL_ASSERT(((int *)buf_data[1])[i] == 2);
- }
- else
- OCL_ASSERT(((int *)buf_data[1])[i] == 3);
- }
- OCL_UNMAP_BUFFER(1);
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_any);
diff --git a/utests/compiler_sub_group_shuffle.cpp b/utests/compiler_sub_group_shuffle.cpp
index 4ba8b99..f33e9de 100644
--- a/utests/compiler_sub_group_shuffle.cpp
+++ b/utests/compiler_sub_group_shuffle.cpp
@@ -2,6 +2,8 @@
void compiler_sub_group_shuffle(void)
{
+ if(!cl_check_subgroups())
+ return;
const size_t n = 32;
const int32_t buf_size = 4 * n + 1;
diff --git a/utests/compiler_sub_group_shuffle.cpp b/utests/compiler_sub_group_shuffle_down.cpp
similarity index 51%
copy from utests/compiler_sub_group_shuffle.cpp
copy to utests/compiler_sub_group_shuffle_down.cpp
index 4ba8b99..8b23234 100644
--- a/utests/compiler_sub_group_shuffle.cpp
+++ b/utests/compiler_sub_group_shuffle_down.cpp
@@ -1,16 +1,18 @@
#include "utest_helper.hpp"
-void compiler_sub_group_shuffle(void)
+void compiler_sub_group_shuffle_down(void)
{
+ if(!cl_check_subgroups())
+ return;
const size_t n = 32;
const int32_t buf_size = 4 * n + 1;
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_sub_group_shuffle");
+ OCL_CREATE_KERNEL("compiler_sub_group_shuffle_down");
OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
- int c = 3;
+ int c = 13;
OCL_SET_ARG(1, sizeof(int), &c);
globals[0] = n;
@@ -34,12 +36,13 @@ void compiler_sub_group_shuffle(void)
for (int32_t i = 0; i < (int32_t) n; ++i){
int round = i / suggroupsize;
int index = i % suggroupsize;
- OCL_ASSERT(index == dst[4*i]);
- OCL_ASSERT((round * suggroupsize + c) == dst[4*i+1]);
- OCL_ASSERT((round * suggroupsize + 5) == dst[4*i+2]);
- OCL_ASSERT((round * suggroupsize + (suggroupsize - index - 1)) == dst[4*i+3]);
+ //printf("%d %d %d %d\n",dst[4*i], dst[4*i+1], dst[4*i+2], dst[4*i+3]);
+ OCL_ASSERT( (index + c >= suggroupsize ? 456 : 123) == dst[4*i]);
+ OCL_ASSERT( (index + c >= suggroupsize ? (round * suggroupsize + (i + c) % suggroupsize): 123) == dst[4*i+1]);
+ OCL_ASSERT( (index + index + 1 >= suggroupsize ? -(round * suggroupsize + (i + index + 1) % suggroupsize) : (round * suggroupsize + (i + index + 1) % suggroupsize)) == dst[4*i+2]);
+ OCL_ASSERT((round * suggroupsize + (suggroupsize - 1)) == dst[4*i+3]);
}
OCL_UNMAP_BUFFER(0);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle);
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down);
diff --git a/utests/compiler_sub_group_shuffle.cpp b/utests/compiler_sub_group_shuffle_up.cpp
similarity index 55%
copy from utests/compiler_sub_group_shuffle.cpp
copy to utests/compiler_sub_group_shuffle_up.cpp
index 4ba8b99..6c32ca4 100644
--- a/utests/compiler_sub_group_shuffle.cpp
+++ b/utests/compiler_sub_group_shuffle_up.cpp
@@ -1,16 +1,18 @@
#include "utest_helper.hpp"
-void compiler_sub_group_shuffle(void)
+void compiler_sub_group_shuffle_up(void)
{
+ if(!cl_check_subgroups())
+ return;
const size_t n = 32;
const int32_t buf_size = 4 * n + 1;
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_sub_group_shuffle");
+ OCL_CREATE_KERNEL("compiler_sub_group_shuffle_up");
OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
- int c = 3;
+ int c = 13;
OCL_SET_ARG(1, sizeof(int), &c);
globals[0] = n;
@@ -34,12 +36,13 @@ void compiler_sub_group_shuffle(void)
for (int32_t i = 0; i < (int32_t) n; ++i){
int round = i / suggroupsize;
int index = i % suggroupsize;
- OCL_ASSERT(index == dst[4*i]);
- OCL_ASSERT((round * suggroupsize + c) == dst[4*i+1]);
- OCL_ASSERT((round * suggroupsize + 5) == dst[4*i+2]);
- OCL_ASSERT((round * suggroupsize + (suggroupsize - index - 1)) == dst[4*i+3]);
+ //printf("%d %d %d %d\n",dst[4*i], dst[4*i+1], dst[4*i+2], dst[4*i+3]);
+ OCL_ASSERT( ((c - index) > 0 ? 123 : 456) == dst[4*i]);
+ OCL_ASSERT( ((c - index) > 0 ? 123 : (i - c)) == dst[4*i+1]);
+ OCL_ASSERT( ((suggroupsize - index - 1 - index) > 0 ? (i + index + 1) : -(i + index + 1 - suggroupsize)) == dst[4*i+2]);
+ OCL_ASSERT((round * suggroupsize + (suggroupsize - 1)) == dst[4*i+3]);
}
OCL_UNMAP_BUFFER(0);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle);
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up);
diff --git a/utests/compiler_sub_group_shuffle.cpp b/utests/compiler_sub_group_shuffle_xor.cpp
similarity index 61%
copy from utests/compiler_sub_group_shuffle.cpp
copy to utests/compiler_sub_group_shuffle_xor.cpp
index 4ba8b99..967ec3e 100644
--- a/utests/compiler_sub_group_shuffle.cpp
+++ b/utests/compiler_sub_group_shuffle_xor.cpp
@@ -1,12 +1,14 @@
#include "utest_helper.hpp"
-void compiler_sub_group_shuffle(void)
+void compiler_sub_group_shuffle_xor(void)
{
+ if(!cl_check_subgroups())
+ return;
const size_t n = 32;
const int32_t buf_size = 4 * n + 1;
// Setup kernel and buffers
- OCL_CREATE_KERNEL("compiler_sub_group_shuffle");
+ OCL_CREATE_KERNEL("compiler_sub_group_shuffle_xor");
OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -35,11 +37,12 @@ void compiler_sub_group_shuffle(void)
int round = i / suggroupsize;
int index = i % suggroupsize;
OCL_ASSERT(index == dst[4*i]);
- OCL_ASSERT((round * suggroupsize + c) == dst[4*i+1]);
- OCL_ASSERT((round * suggroupsize + 5) == dst[4*i+2]);
- OCL_ASSERT((round * suggroupsize + (suggroupsize - index - 1)) == dst[4*i+3]);
+ //printf("%d %d %d %d\n", i, dst[4*i+1], dst[4*i+2], dst[4*i+3]);
+ OCL_ASSERT((round * suggroupsize + (c ^ index)) == dst[4*i+1]);
+ OCL_ASSERT((round * suggroupsize + (index ^ (suggroupsize - index -1))) == dst[4*i+2]);
+ OCL_ASSERT((round * suggroupsize + (index ^ (index + 1) % suggroupsize)) == dst[4*i+3]);
}
OCL_UNMAP_BUFFER(0);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle);
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor);
diff --git a/utests/compiler_subgroup_broadcast.cpp b/utests/compiler_subgroup_broadcast.cpp
new file mode 100644
index 0000000..2835161
--- /dev/null
+++ b/utests/compiler_subgroup_broadcast.cpp
@@ -0,0 +1,187 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 30
+#define WG_LOCAL_SIZE 30
+/*
+ * Generic compute-expected function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(T* input,
+ T* expected,
+ size_t SIMD_ID,
+ size_t SIMD_SIZE)
+{
+ for(uint32_t i = 0; i < SIMD_SIZE; i++)
+ expected[i] = input[SIMD_ID];
+}
+
+/*
+ * Generic input-expected generate function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void generate_data(T* &input,
+ T* &expected,
+ size_t SIMD_ID,
+ size_t SIMD_SIZE)
+{
+ /* allocate input and expected arrays */
+ input = new T[WG_GLOBAL_SIZE];
+ expected = new T[WG_GLOBAL_SIZE];
+
+ /* base value for all data types */
+ T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+ {
+#if DEBUG_STDOUT
+ cout << endl << "IN: " << endl;
+#endif
+ SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+ /* input values */
+ for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+ {
+ /* initially 0, augment after */
+ input[gid + lid] = 0;
+
+ /* check all data types, test ideal for QWORD types */
+ input[gid + lid] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[gid + lid] += (rand() % 112);
+
+#if DEBUG_STDOUT
+ /* output generated input */
+ cout << setw(4) << input[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+#endif
+ }
+
+ /* expected values */
+ compute_expected(input + gid, expected + gid, SIMD_ID, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+ /* output expected input */
+ cout << endl << "EXP: " << endl;
+ for(uint32_t lid = 0; lid < SIMD_SIZE; lid++){
+ cout << setw(4) << expected[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+ }
+ cout << endl;
+#endif
+
+ }
+}
+
+/*
+ * Generic subgroup utest function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void subgroup_generic(T* input,
+ T* expected)
+{
+ /* get simd size */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ size_t SIMD_SIZE = 0;
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+ cl_uint SIMD_ID = 10;
+ /* input and expected data */
+ generate_data(input, expected, SIMD_ID, SIMD_SIZE);
+
+ /* prepare input for datatype */
+ OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_uint), &SIMD_ID);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, WG_GLOBAL_SIZE* sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch, increment */
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Workgroup broadcast 1D functions
+ */
+void compiler_subgroup_broadcast_imm_int(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+ "compiler_subgroup_broadcast_imm_int");
+ subgroup_generic(input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_imm_int);
+void compiler_subgroup_broadcast_int(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+ "compiler_subgroup_broadcast_int");
+ subgroup_generic(input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_int);
+void compiler_subgroup_broadcast_long(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+ "compiler_subgroup_broadcast_long");
+ subgroup_generic(input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_broadcast_long);
diff --git a/utests/compiler_subgroup_buffer_block_read.cpp b/utests/compiler_subgroup_buffer_block_read.cpp
new file mode 100644
index 0000000..9707f19
--- /dev/null
+++ b/utests/compiler_subgroup_buffer_block_read.cpp
@@ -0,0 +1,202 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 32
+#define WG_LOCAL_SIZE 32
+/*
+ * Generic compute-expected function for buffer block read
+ */
+template<class T>
+static void compute_expected(T* input,
+                             T* expected,
+                             size_t VEC_SIZE,
+                             size_t SIMD_SIZE)
+{
+  /* A block read hands lane i the elements input[i], input[i + SIMD_SIZE],
+   * ... (one per vector slot), i.e. the transpose of the flat layout. */
+  for(uint32_t i = 0; i < SIMD_SIZE; i++)
+    for(uint32_t j = 0; j < VEC_SIZE; j++)
+      expected[i * VEC_SIZE + j] = input[SIMD_SIZE * j + i];
+}
+
+/*
+ * Generic input-expected generate function for block read
+ */
+/* Fills 'input' with randomized values and 'expected' with the result of
+ * compute_expected(), one SIMD-sized chunk at a time.  Both arrays are
+ * allocated here with new[] and returned by reference; the caller owns
+ * them and must release them with delete[]. */
+template<class T>
+static void generate_data(T* &input,
+                          T* &expected,
+                          size_t VEC_SIZE,
+                          size_t SIMD_SIZE)
+{
+  /* allocate input and expected arrays */
+  input = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+  expected = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+
+  /* base value for all data types, scaled by the element size */
+  T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    /* clamp the final chunk to the remaining work-items */
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+    /* input values */
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+    {
+      for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+      {
+        /* initially 0, augment after */
+        input[(gid + lid)*VEC_SIZE + vsz] = 0;
+
+        /* rand() % 2 - 1 evaluates to 0 or -1, so base_val is either
+         * subtracted or skipped; exercises wide (QWORD) value paths */
+        input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val);
+        /* add trailing random bits, tests GENERAL cases */
+        input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112);
+
+#if DEBUG_STDOUT
+        /* output generated input */
+        cout << setw(4) << input[(gid + lid)*VEC_SIZE + vsz] << ", " ;
+        if((lid + 1) % 8 == 0)
+          cout << endl;
+#endif
+      }
+    }
+
+    /* expected values for this chunk only */
+    compute_expected(input + gid * VEC_SIZE, expected + gid * VEC_SIZE, VEC_SIZE, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+    /* output expected values */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE ; lid++){
+      for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+        cout << setw(4) << expected[(gid + lid)*VEC_SIZE + vsz] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+
+  }
+}
+
+/*
+ * Generic subgroup utest function for buffer block read
+ */
+/* Runs the buffer block-read kernel and compares GPU output against the
+ * host-computed 'expected' array, then releases the arrays that
+ * generate_data() allocated with new[]. */
+template<class T>
+static void subgroup_generic(T* input,
+                             T* expected,
+                             size_t VEC_SIZE)
+{
+  /* get simd size */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0;
+  OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  size_t buf_sz = VEC_SIZE * WG_GLOBAL_SIZE;
+  /* input and expected data */
+  generate_data(input, expected, VEC_SIZE, SIMD_SIZE);
+
+  /* prepare input for datatype */
+  OCL_CREATE_BUFFER(buf[0], 0, buf_sz * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, buf_sz * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, buf_sz * sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  uint32_t mismatches = 0;
+
+  for (uint32_t i = 0; i < buf_sz; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i))
+    {
+      /* found mismatch, increment */
+      mismatches++;
+
+#if DEBUG_STDOUT
+      /* output mismatch */
+      cout << "Err at " << i << ", " <<
+        ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+    }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_ASSERT(mismatches == 0);
+  /* the arrays come from new[] in generate_data(); free() on them is
+   * undefined behaviour, so release with delete[] */
+  delete[] input;
+  delete[] expected;
+}
+
+/*
+ * subgroup buffer block read
+ */
+/* One wrapper per vector width (1/2/4/8); each builds the matching kernel
+ * and delegates to subgroup_generic(input, expected, VEC_SIZE). */
+void compiler_subgroup_buffer_block_read1(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read",
+                              "compiler_subgroup_buffer_block_read1");
+  subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read1);
+void compiler_subgroup_buffer_block_read2(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read",
+                              "compiler_subgroup_buffer_block_read2");
+  subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read2);
+void compiler_subgroup_buffer_block_read4(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read",
+                              "compiler_subgroup_buffer_block_read4");
+  subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read4);
+void compiler_subgroup_buffer_block_read8(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read",
+                              "compiler_subgroup_buffer_block_read8");
+  subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read8);
diff --git a/utests/compiler_subgroup_buffer_block_write.cpp b/utests/compiler_subgroup_buffer_block_write.cpp
new file mode 100644
index 0000000..6b257c5
--- /dev/null
+++ b/utests/compiler_subgroup_buffer_block_write.cpp
@@ -0,0 +1,202 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 32
+#define WG_LOCAL_SIZE 32
+/*
+ * Generic compute-expected function for buffer block write
+ */
+template<class T>
+static void compute_expected(T* input,
+                             T* expected,
+                             size_t VEC_SIZE,
+                             size_t SIMD_SIZE)
+{
+  /* Block write is the inverse layout of block read: lane i stores its
+   * j-th vector element at expected[SIMD_SIZE * j + i]. */
+  for(uint32_t i = 0; i < SIMD_SIZE; i++)
+    for(uint32_t j = 0; j < VEC_SIZE; j++)
+      expected[SIMD_SIZE * j + i] = input[i * VEC_SIZE + j];
+}
+
+/*
+ * Generic input-expected generate function for buffer block write
+ */
+/* Fills 'input' with randomized values and 'expected' via compute_expected(),
+ * chunked by SIMD size.  Both arrays are allocated here with new[] and
+ * returned by reference; the caller owns them and must delete[] them. */
+template<class T>
+static void generate_data(T* &input,
+                          T* &expected,
+                          size_t VEC_SIZE,
+                          size_t SIMD_SIZE)
+{
+  /* allocate input and expected arrays */
+  input = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+  expected = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+
+  /* base value for all data types, scaled by the element size */
+  T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    /* clamp the final chunk to the remaining work-items */
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+    /* input values */
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+    {
+      for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+      {
+        /* initially 0, augment after */
+        input[(gid + lid)*VEC_SIZE + vsz] = 0;
+
+        /* rand() % 2 - 1 evaluates to 0 or -1, so base_val is either
+         * subtracted or skipped; exercises wide (QWORD) value paths */
+        input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val);
+        /* add trailing random bits, tests GENERAL cases */
+        input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112);
+
+#if DEBUG_STDOUT
+        /* output generated input */
+        cout << setw(4) << input[(gid + lid)*VEC_SIZE + vsz] << ", " ;
+        if((lid + 1) % 8 == 0)
+          cout << endl;
+#endif
+      }
+    }
+
+    /* expected values for this chunk only */
+    compute_expected(input + gid * VEC_SIZE, expected + gid * VEC_SIZE, VEC_SIZE, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+    /* output expected values */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE ; lid++){
+      for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+        cout << setw(4) << expected[(gid + lid)*VEC_SIZE + vsz] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+
+  }
+}
+
+/*
+ * Generic subgroup utest function for buffer block write
+ */
+template<class T>
+static void subgroup_generic(T* input,
+ T* expected,
+ size_t VEC_SIZE)
+{
+ /* get simd size */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ size_t SIMD_SIZE = 0;
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+ size_t buf_sz = VEC_SIZE * WG_GLOBAL_SIZE;
+ /* input and expected data */
+ generate_data(input, expected, VEC_SIZE, SIMD_SIZE);
+
+ /* prepare input for datatype */
+ OCL_CREATE_BUFFER(buf[0], 0, buf_sz * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, buf_sz * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, buf_sz* sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < buf_sz; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch, increment */
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+ free(input);
+ free(expected);
+}
+
+/*
+ * subgroup buffer block write
+ */
+/* One wrapper per vector width (1/2/4/8); each builds the matching kernel
+ * and delegates to subgroup_generic(input, expected, VEC_SIZE). */
+void compiler_subgroup_buffer_block_write1(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write",
+                              "compiler_subgroup_buffer_block_write1");
+  subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write1);
+void compiler_subgroup_buffer_block_write2(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write",
+                              "compiler_subgroup_buffer_block_write2");
+  subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write2);
+void compiler_subgroup_buffer_block_write4(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write",
+                              "compiler_subgroup_buffer_block_write4");
+  subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write4);
+void compiler_subgroup_buffer_block_write8(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write",
+                              "compiler_subgroup_buffer_block_write8");
+  subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write8);
diff --git a/utests/compiler_subgroup_image_block_read.cpp b/utests/compiler_subgroup_image_block_read.cpp
new file mode 100644
index 0000000..02c8f07
--- /dev/null
+++ b/utests/compiler_subgroup_image_block_read.cpp
@@ -0,0 +1,197 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 32
+#define WG_LOCAL_SIZE 32
+/*
+ * Generic compute-expected function for media block read
+ */
+template<class T>
+static void compute_expected(T* input,
+                             T* expected,
+                             size_t VEC_SIZE)
+{
+  /* Lane i reads image column i: row j supplies its j-th vector element,
+   * i.e. the transpose of the row-major image data. */
+  for(uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    for(uint32_t j = 0; j < VEC_SIZE; j++)
+      expected[i * VEC_SIZE + j] = input[WG_GLOBAL_SIZE * j + i];
+}
+
+/*
+ * Generic input-expected generate function for media block read
+ */
+/* Fills 'input' with randomized values and 'expected' via compute_expected().
+ * Both arrays are allocated here with new[] and returned by reference;
+ * the caller owns them and must delete[] them. */
+template<class T>
+static void generate_data(T* &input,
+                          T* &expected,
+                          size_t VEC_SIZE)
+{
+  /* allocate input and expected arrays */
+  input = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+  expected = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+
+  /* base value for all data types, scaled by the element size */
+  T base_val = (int)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+#if DEBUG_STDOUT
+  cout << endl << "IN: " << endl;
+#endif
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE * VEC_SIZE; gid++)
+  {
+    /* rand() % 2 - 1 evaluates to 0 or -1, so base_val is either
+     * subtracted or skipped; the second term adds small trailing bits */
+    input[gid] = ((rand() % 2 - 1) * base_val) + (rand() % 112);
+
+#if DEBUG_STDOUT
+    /* output generated input */
+    cout << setw(4) << input[gid] << ", " ;
+    if((gid + 1) % 8 == 0)
+      cout << endl;
+#endif
+
+  }
+  /* expected values */
+  compute_expected(input, expected, VEC_SIZE);
+
+#if DEBUG_STDOUT
+  /* output expected values */
+  cout << endl << "EXP: " << endl;
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid++)
+  {
+    cout << "(";
+    for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+      cout << setw(4) << expected[gid* VEC_SIZE + vsz] << ", " ;
+    cout << ")";
+    if((gid + 1) % 8 == 0)
+      cout << endl;
+    cout << endl;
+  }
+#endif
+}
+
+/*
+ * Generic subgroup utest function for media block read
+ */
+template<class T>
+static void subgroup_generic(T* input,
+ T* expected,
+ size_t VEC_SIZE)
+{
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ /* get simd size */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ size_t SIMD_SIZE = 0;
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+ size_t buf_sz = VEC_SIZE * WG_GLOBAL_SIZE;
+ /* input and expected data */
+ generate_data(input, expected, VEC_SIZE);
+
+ /* prepare input for datatype */
+ format.image_channel_order = CL_R;
+ format.image_channel_data_type = CL_UNSIGNED_INT32;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = WG_GLOBAL_SIZE;
+ desc.image_height = VEC_SIZE;
+ desc.image_row_pitch = WG_GLOBAL_SIZE * sizeof(uint32_t);
+
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, input);
+ OCL_CREATE_BUFFER(buf[1], 0, buf_sz * sizeof(T), NULL);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* run the kernel on GPU */
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < buf_sz; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch, increment */
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+ free(input);
+ free(expected);
+}
+
+/*
+ * sub_group image block read functions
+ */
+/* One wrapper per vector width (1/2/4/8); each builds the matching kernel
+ * and delegates to subgroup_generic(input, expected, VEC_SIZE). */
+void compiler_subgroup_image_block_read1(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_read",
+                              "compiler_subgroup_image_block_read1");
+  subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read1);
+void compiler_subgroup_image_block_read2(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_read",
+                              "compiler_subgroup_image_block_read2");
+  subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read2);
+void compiler_subgroup_image_block_read4(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_read",
+                              "compiler_subgroup_image_block_read4");
+  subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read4);
+void compiler_subgroup_image_block_read8(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_read",
+                              "compiler_subgroup_image_block_read8");
+  subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_read8);
diff --git a/utests/compiler_subgroup_image_block_write.cpp b/utests/compiler_subgroup_image_block_write.cpp
new file mode 100644
index 0000000..2b85167
--- /dev/null
+++ b/utests/compiler_subgroup_image_block_write.cpp
@@ -0,0 +1,201 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 32
+#define WG_LOCAL_SIZE 32
+/*
+ * Generic compute-expected function for media block write
+ */
+template<class T>
+static void compute_expected(T* input,
+                             T* expected,
+                             size_t VEC_SIZE)
+{
+  /* Block write is the inverse of block read: lane i stores its j-th
+   * vector element into image row j at column i. */
+  for(uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    for(uint32_t j = 0; j < VEC_SIZE; j++)
+      expected[WG_GLOBAL_SIZE * j + i] = input[i * VEC_SIZE + j];
+}
+
+/*
+ * Generic input-expected generate function for media block write
+ */
+/* Fills 'input' with randomized values and 'expected' via compute_expected().
+ * Both arrays are allocated here with new[] and returned by reference;
+ * the caller owns them and must delete[] them. */
+template<class T>
+static void generate_data(T* &input,
+                          T* &expected,
+                          size_t VEC_SIZE)
+{
+  /* allocate input and expected arrays */
+  input = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+  expected = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+
+  /* base value for all data types, scaled by the element size */
+  T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+#if DEBUG_STDOUT
+  cout << endl << "IN: " << endl;
+#endif
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE * VEC_SIZE; gid++)
+  {
+    /* rand() % 2 - 1 evaluates to 0 or -1, so base_val is either
+     * subtracted or skipped; the second term adds small trailing bits */
+    input[gid] = ((rand() % 2 - 1) * base_val) + (rand() % 112);
+
+#if DEBUG_STDOUT
+    /* output generated input */
+    cout << setw(4) << input[gid] << ", " ;
+    if((gid + 1) % 8 == 0)
+      cout << endl;
+#endif
+
+  }
+  /* expected values */
+  compute_expected(input, expected, VEC_SIZE);
+
+#if DEBUG_STDOUT
+  /* output expected values */
+  cout << endl << "EXP: " << endl;
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid++)
+  {
+    cout << "(";
+    for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+      cout << setw(4) << expected[gid* VEC_SIZE + vsz] << ", " ;
+    cout << ")";
+    if((gid + 1) % 8 == 0)
+      cout << endl;
+    cout << endl;
+  }
+#endif
+}
+
+/*
+ * Generic subgroup utest function for media block write
+ */
+template<class T>
+static void subgroup_generic(T* input,
+ T* expected,
+ size_t VEC_SIZE)
+{
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ /* get simd size */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ size_t SIMD_SIZE = 0;
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+ size_t buf_sz = VEC_SIZE * WG_GLOBAL_SIZE;
+ /* input and expected data */
+ generate_data(input, expected, VEC_SIZE);
+
+ /* prepare input for datatype */
+ format.image_channel_order = CL_R;
+ format.image_channel_data_type = CL_UNSIGNED_INT32;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = WG_GLOBAL_SIZE;
+ desc.image_height = VEC_SIZE;
+ desc.image_row_pitch = 0;
+
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, buf_sz * sizeof(T), NULL);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[1], input, buf_sz* sizeof(T));
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* run the kernel on GPU */
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER_GTT(0);
+ uint32_t mismatches = 0;
+ size_t image_row_pitch = 0;
+ OCL_CALL(clGetImageInfo, buf[0], CL_IMAGE_ROW_PITCH, sizeof(image_row_pitch), &image_row_pitch, NULL);
+ image_row_pitch /= sizeof(T);
+ T *out = (T *)buf_data[0];
+
+ for (uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+ for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+ if (out[vsz * image_row_pitch + i] != expected[WG_GLOBAL_SIZE * vsz + i]) {
+ /* found mismatch, increment */
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << WG_GLOBAL_SIZE * vsz + i << ", " << out[vsz * image_row_pitch + i]
+ << " != " << expected[WG_GLOBAL_SIZE * vsz + i] << endl;
+#endif
+ }
+
+ OCL_UNMAP_BUFFER_GTT(0);
+
+ OCL_ASSERT(mismatches == 0);
+ free(input);
+ free(expected);
+}
+
+/*
+ * sub_group image block write functions
+ */
+/* One wrapper per vector width (1/2/4/8); each builds the matching kernel
+ * and delegates to subgroup_generic(input, expected, VEC_SIZE). */
+void compiler_subgroup_image_block_write1(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_write",
+                              "compiler_subgroup_image_block_write1");
+  subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write1);
+void compiler_subgroup_image_block_write2(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_write",
+                              "compiler_subgroup_image_block_write2");
+  subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write2);
+void compiler_subgroup_image_block_write4(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_write",
+                              "compiler_subgroup_image_block_write4");
+  subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write4);
+void compiler_subgroup_image_block_write8(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_image_block_write",
+                              "compiler_subgroup_image_block_write8");
+  subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_image_block_write8);
diff --git a/utests/compiler_subgroup_reduce.cpp b/utests/compiler_subgroup_reduce.cpp
new file mode 100644
index 0000000..3c3df06
--- /dev/null
+++ b/utests/compiler_subgroup_reduce.cpp
@@ -0,0 +1,425 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+#include <cmath>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 30
+#define WG_LOCAL_SIZE 30
+/* Sub-group operation exercised by a given test kernel. */
+enum WG_FUNCTION
+{
+  WG_ANY,          /* logical OR of all lane predicates */
+  WG_ALL,          /* logical AND of all lane predicates */
+  WG_REDUCE_ADD,   /* sum across the sub-group */
+  WG_REDUCE_MIN,   /* minimum across the sub-group */
+  WG_REDUCE_MAX    /* maximum across the sub-group */
+};
+
+/*
+ * Generic compute-expected function for op REDUCE/ANY/ALL
+ * and any variable type
+ */
+/* Computes the reference result for one SIMD-sized chunk: every element of
+ * 'expected' receives the reduction (or predicate) over all SIMD_SIZE
+ * input values, mirroring the uniform sub-group result. */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+                             T* input,
+                             T* expected,
+                             size_t SIMD_SIZE)
+{
+  if(wg_func == WG_ANY)
+  {
+    /* OR of all predicates, replicated to every lane */
+    T wg_predicate = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_predicate = (int)wg_predicate || (int)input[i];
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_predicate;
+  }
+  else if(wg_func == WG_ALL)
+  {
+    /* AND of all predicates, replicated to every lane */
+    T wg_predicate = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_predicate = (int)wg_predicate && (int)input[i];
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_predicate;
+  }
+  else if(wg_func == WG_REDUCE_ADD)
+  {
+    T wg_sum = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_sum += input[i];
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_sum;
+  }
+  else if(wg_func == WG_REDUCE_MAX)
+  {
+    T wg_max = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_max = max(input[i], wg_max);
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_max;
+  }
+  else if(wg_func == WG_REDUCE_MIN)
+  {
+    T wg_min = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_min = min(input[i], wg_min);
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_min;
+  }
+}
+
+/*
+ * Generic input-expected generate function for op REDUCE/ANY/ALL
+ * and any variable type
+ */
+/* Fills 'input' with randomized values (integer or float depending on T)
+ * and 'expected' via compute_expected(), chunked by SIMD size.  Both arrays
+ * are allocated here with new[] and returned by reference. */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+                          T* &input,
+                          T* &expected,
+                          size_t SIMD_SIZE)
+{
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value for all data types, scaled by the element size */
+  T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    /* clamp the final chunk to the remaining work-items */
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+    /* input values */
+    for (uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      /* initially 0, augment after */
+      input[gid + lid] = 0;
+
+      if (numeric_limits<T>::is_integer) {
+        /* rand() % 2 - 1 is 0 or -1: base_val subtracted or skipped */
+        input[gid + lid] += ((rand() % 2 - 1) * base_val);
+        /* add trailing random bits, tests GENERAL cases */
+        input[gid + lid] += (rand() % 112);
+        /* NOTE(review): an earlier comment claimed the last bit is forced
+         * to 1 for ANY/ALL, but no code here does that -- zero inputs are
+         * possible; confirm the ANY/ALL kernels tolerate this */
+      } else {
+        /* float path: integer part plus a fractional part in [0,1) */
+        input[gid + lid] += rand();
+        input[gid + lid] += rand() / ((float)RAND_MAX + 1);
+      }
+
+#if DEBUG_STDOUT
+      /* output generated input */
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+#endif
+    }
+
+    /* expected values for this chunk only */
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+    /* output expected values */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+
+  }
+}
+
+/*
+ * Generic subgroup utest function for op REDUCE/ANY/ALL
+ * and any variable type
+ */
+/* Runs a reduce/any/all kernel and compares GPU output against the
+ * host-computed 'expected' array.  Integer results must match exactly;
+ * float results are accepted within 1% relative error.  Releases the
+ * arrays that generate_data() allocated with new[]. */
+template<class T>
+static void subgroup_generic(WG_FUNCTION wg_func,
+                             T* input,
+                             T* expected)
+{
+  /* get simd size */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0;
+  OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  /* input and expected data */
+  generate_data(wg_func, input, expected, SIMD_SIZE);
+
+  /* prepare input for data type */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  uint32_t mismatches = 0;
+
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i))
+    {
+      /* found mismatch on integer, increment */
+      if (numeric_limits<T>::is_integer) {
+        mismatches++;
+
+#if DEBUG_STDOUT
+        /* output mismatch */
+        cout << "Err at " << i << ", " << ((T *)buf_data[1])[i]
+             << " != " << *(expected + i) << endl;
+#endif
+      }
+      /* float error is tolerable though */
+      else {
+        float num_computed = ((T *)buf_data[1])[i];
+        float num_expected = *(expected + i);
+        float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+        if (num_diff > 0.01f) {
+          mismatches++;
+
+#if DEBUG_STDOUT
+          /* output mismatch */
+          cout << "Err at " << i << ", " << ((T *)buf_data[1])[i]
+               << " != " << *(expected + i) << endl;
+#endif
+        }
+      }
+    }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_ASSERT(mismatches == 0);
+  /* release the arrays new[]-ed by generate_data(); they were previously
+   * leaked here (the sibling subgroup utest files release theirs) */
+  delete[] input;
+  delete[] expected;
+}
+
+/*
+ * Subgroup any/all utest functions
+ */
+/* sub_group_any: true on every lane if any lane's predicate is non-zero */
+void compiler_subgroup_any(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_any");
+  subgroup_generic(WG_ANY, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_any);
+/* sub_group_all: true on every lane only if all lane predicates are non-zero */
+void compiler_subgroup_all(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_all");
+  subgroup_generic(WG_ALL, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_all);
+/*
+ * Subgroup reduce add utest functions
+ */
+/* One wrapper per element type; long/ulong variants are registered via
+ * MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE -- presumably a known failure on
+ * 64-bit reductions at this point in time. */
+void compiler_subgroup_reduce_add_int(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_int");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_int);
+void compiler_subgroup_reduce_add_uint(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_uint");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_uint);
+void compiler_subgroup_reduce_add_long(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_long");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_add_long);
+void compiler_subgroup_reduce_add_ulong(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_ulong *input = NULL;
+  cl_ulong *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_ulong");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_add_ulong);
+void compiler_subgroup_reduce_add_float(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_float *input = NULL;
+  cl_float *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_float");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_float);
+
+/*
+ * Subgroup reduce max utest functions
+ */
+/* One wrapper per element type; long/ulong variants are registered via
+ * MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE -- presumably a known failure on
+ * 64-bit reductions at this point in time. */
+void compiler_subgroup_reduce_max_int(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_int");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_int);
+void compiler_subgroup_reduce_max_uint(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_uint");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_uint);
+void compiler_subgroup_reduce_max_long(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_long");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_max_long);
+void compiler_subgroup_reduce_max_ulong(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_ulong *input = NULL;
+  cl_ulong *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_ulong");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_max_ulong);
+void compiler_subgroup_reduce_max_float(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_float *input = NULL;
+  cl_float *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_float");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_float);
+
+/*
+ * Subgroup reduce min utest functions
+ */
+/* One wrapper per element type; long/ulong variants are registered via
+ * MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE -- presumably a known failure on
+ * 64-bit reductions at this point in time. */
+void compiler_subgroup_reduce_min_int(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_int");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_int);
+void compiler_subgroup_reduce_min_uint(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_uint");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_uint);
+void compiler_subgroup_reduce_min_long(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_long");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_min_long);
+void compiler_subgroup_reduce_min_ulong(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_ulong *input = NULL;
+  cl_ulong *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_ulong");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_min_ulong);
+void compiler_subgroup_reduce_min_float(void)
+{
+  if(!cl_check_subgroups())
+    return;
+  cl_float *input = NULL;
+  cl_float *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_float");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_float);
diff --git a/utests/compiler_subgroup_scan_exclusive.cpp b/utests/compiler_subgroup_scan_exclusive.cpp
new file mode 100644
index 0000000..1a21b59
--- /dev/null
+++ b/utests/compiler_subgroup_scan_exclusive.cpp
@@ -0,0 +1,381 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+#include <cmath>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 30
+#define WG_LOCAL_SIZE 30
+
+enum WG_FUNCTION
+{
+ WG_SCAN_EXCLUSIVE_ADD,
+ WG_SCAN_EXCLUSIVE_MAX,
+ WG_SCAN_EXCLUSIVE_MIN
+};
+
+/*
+ * Generic compute-expected function for op SCAN EXCLUSIVE type
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+ T* input,
+ T* expected,
+ size_t SIMD_SIZE)
+{
+ if(wg_func == WG_SCAN_EXCLUSIVE_ADD)
+ {
+ expected[0] = 0;
+ expected[1] = input[0];
+ for(uint32_t i = 2; i < SIMD_SIZE; i++)
+ expected[i] = input[i - 1] + expected[i - 1];
+ }
+ else if(wg_func == WG_SCAN_EXCLUSIVE_MAX)
+ {
+ if(numeric_limits<T>::is_integer)
+ expected[0] = numeric_limits<T>::min();
+ else
+ expected[0] = - numeric_limits<T>::infinity();
+
+ expected[1] = input[0];
+ for(uint32_t i = 2; i < SIMD_SIZE; i++)
+ expected[i] = max(input[i - 1], expected[i - 1]);
+ }
+ else if(wg_func == WG_SCAN_EXCLUSIVE_MIN)
+ {
+ if(numeric_limits<T>::is_integer)
+ expected[0] = numeric_limits<T>::max();
+ else
+ expected[0] = numeric_limits<T>::infinity();
+
+ expected[1] = input[0];
+ for(uint32_t i = 2; i < SIMD_SIZE; i++)
+ expected[i] = min(input[i - 1], expected[i - 1]);
+ }
+}
+
+/*
+ * Generic subgroup utest function for op SCAN EXCLUSIVE type
+ * and any variable type
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+ T* &input,
+ T* &expected,
+ size_t SIMD_SIZE)
+{
+ input = new T[WG_GLOBAL_SIZE];
+ expected = new T[WG_GLOBAL_SIZE];
+
+ /* base value for all data types */
+ T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+ {
+#if DEBUG_STDOUT
+ cout << endl << "IN: " << endl;
+#endif
+ SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+ /* input values */
+ for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+ {
+ /* initially 0, augment after */
+ input[gid + lid] = 0;
+ /* check all data types, test ideal for QWORD types */
+ input[gid + lid] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[gid + lid] += (rand() % 112);
+
+#if DEBUG_STDOUT
+ /* output generated input */
+ cout << setw(4) << input[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+#endif
+ }
+
+ /* expected values */
+ compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+ /* output expected input */
+ cout << endl << "EXP: " << endl;
+ for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+ cout << setw(4) << expected[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+ }
+ cout << endl;
+#endif
+
+ }
+}
+
+/*
+ * Generic subgroup utest function for op SCAN EXCLUSIVE type
+ * and any variable type
+ */
+template<class T>
+static void subgroup_generic(WG_FUNCTION wg_func,
+ T* input,
+ T* expected)
+{
+ /* get simd size */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ size_t SIMD_SIZE = 0;
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+ /* input and expected data */
+ generate_data(wg_func, input, expected, SIMD_SIZE);
+
+ /* prepare input for data type */
+ OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch on integer, increment */
+ if(numeric_limits<T>::is_integer){
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+ /* float error is tolerable though */
+ else {
+ float num_computed = ((T *)buf_data[1])[i];
+ float num_expected = *(expected + i);
+ float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+ if(num_diff > 0.01f){
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+ }
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Workgroup scan_exclusive add utest functions
+ */
+void compiler_subgroup_scan_exclusive_add_int(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_add_int");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_int);
+void compiler_subgroup_scan_exclusive_add_uint(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_add_uint");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_uint);
+void compiler_subgroup_scan_exclusive_add_long(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_add_long");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_add_long);
+void compiler_subgroup_scan_exclusive_add_ulong(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_add_ulong");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_add_ulong);
+void compiler_subgroup_scan_exclusive_add_float(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_add_float");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_float);
+
+/*
+ * Workgroup scan_exclusive max utest functions
+ */
+void compiler_subgroup_scan_exclusive_max_int(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_max_int");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_int);
+void compiler_subgroup_scan_exclusive_max_uint(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_max_uint");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_uint);
+void compiler_subgroup_scan_exclusive_max_long(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_max_long");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_max_long);
+void compiler_subgroup_scan_exclusive_max_ulong(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_max_ulong");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_max_ulong);
+void compiler_subgroup_scan_exclusive_max_float(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_max_float");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_float);
+
+/*
+ * Workgroup scan_exclusive min utest functions
+ */
+void compiler_subgroup_scan_exclusive_min_int(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_min_int");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_int);
+void compiler_subgroup_scan_exclusive_min_uint(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_min_uint");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_uint);
+void compiler_subgroup_scan_exclusive_min_long(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_min_long");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_min_long);
+void compiler_subgroup_scan_exclusive_min_ulong(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_min_ulong");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_min_ulong);
+void compiler_subgroup_scan_exclusive_min_float(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+ "compiler_subgroup_scan_exclusive_min_float");
+ subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_float);
diff --git a/utests/compiler_subgroup_scan_inclusive.cpp b/utests/compiler_subgroup_scan_inclusive.cpp
new file mode 100644
index 0000000..fa32855
--- /dev/null
+++ b/utests/compiler_subgroup_scan_inclusive.cpp
@@ -0,0 +1,372 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+#include <cmath>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 30
+#define WG_LOCAL_SIZE 30
+
+enum WG_FUNCTION
+{
+ WG_SCAN_INCLUSIVE_ADD,
+ WG_SCAN_INCLUSIVE_MAX,
+ WG_SCAN_INCLUSIVE_MIN
+};
+
+/*
+ * Generic compute-expected function for op SCAN INCLUSIVE type
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+ T* input,
+ T* expected,
+ size_t SIMD_SIZE)
+{
+ if(wg_func == WG_SCAN_INCLUSIVE_ADD)
+ {
+ expected[0] = input[0];
+ for(uint32_t i = 1; i < SIMD_SIZE; i++)
+ expected[i] = input[i] + expected[i - 1];
+ }
+ else if(wg_func == WG_SCAN_INCLUSIVE_MAX)
+ {
+ expected[0] = input[0];
+ for(uint32_t i = 1; i < SIMD_SIZE; i++)
+ expected[i] = max(input[i], expected[i - 1]);
+ }
+ else if(wg_func == WG_SCAN_INCLUSIVE_MIN)
+ {
+ expected[0] = input[0];
+ for(uint32_t i = 1; i < SIMD_SIZE; i++)
+ expected[i] = min(input[i], expected[i - 1]);
+ }
+}
+
+/*
+ * Generic input-expected generate function for op SCAN INCLUSIVE type
+ * and any variable type
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+ T* &input,
+ T* &expected,
+ size_t SIMD_SIZE)
+{
+ input = new T[WG_GLOBAL_SIZE];
+ expected = new T[WG_GLOBAL_SIZE];
+
+ /* base value for all data types */
+ T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+ {
+#if DEBUG_STDOUT
+ cout << endl << "IN: " << endl;
+#endif
+ SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+ /* input values */
+ for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+ {
+ /* initially 0, augment after */
+ input[gid + lid] = 0;
+
+ /* check all data types, test ideal for QWORD types */
+ input[gid + lid] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[gid + lid] += (rand() % 112);
+
+#if DEBUG_STDOUT
+ /* output generated input */
+ cout << setw(4) << input[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+#endif
+ }
+
+ /* expected values */
+ compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+ /* output expected input */
+ cout << endl << "EXP: " << endl;
+ for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+ cout << setw(4) << expected[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+ }
+ cout << endl;
+#endif
+
+ }
+}
+
+/*
+ * Generic subgroup utest function for op SCAN INCLUSIVE type
+ * and any variable type
+ */
+template<class T>
+static void subgroup_generic(WG_FUNCTION wg_func,
+ T* input,
+ T* expected)
+{
+ /* get simd size */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ size_t SIMD_SIZE = 0;
+ OCL_CALL(utestclGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+ /* input and expected data */
+ generate_data(wg_func, input, expected, SIMD_SIZE);
+
+ /* prepare input for data type */
+ OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch on integer, increment */
+ if(numeric_limits<T>::is_integer){
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+ /* float error is tolerable though */
+ else {
+ float num_computed = ((T *)buf_data[1])[i];
+ float num_expected = *(expected + i);
+ float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+ if(num_diff > 0.01f){
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+ }
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Workgroup scan_inclusive add utest functions
+ */
+void compiler_subgroup_scan_inclusive_add_int(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_add_int");
+ subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_int);
+void compiler_subgroup_scan_inclusive_add_uint(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_add_uint");
+ subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_uint);
+void compiler_subgroup_scan_inclusive_add_long(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_add_long");
+ subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_add_long);
+void compiler_subgroup_scan_inclusive_add_ulong(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_add_ulong");
+ subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_add_ulong);
+void compiler_subgroup_scan_inclusive_add_float(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_add_float");
+ subgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_float);
+
+/*
+ * Workgroup scan_inclusive max utest functions
+ */
+void compiler_subgroup_scan_inclusive_max_int(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_max_int");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_int);
+void compiler_subgroup_scan_inclusive_max_uint(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_max_uint");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_uint);
+void compiler_subgroup_scan_inclusive_max_long(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_max_long");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_max_long);
+void compiler_subgroup_scan_inclusive_max_ulong(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_max_ulong");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_max_ulong);
+void compiler_subgroup_scan_inclusive_max_float(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_max_float");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_float);
+
+/*
+ * Workgroup scan_inclusive min utest functions
+ */
+void compiler_subgroup_scan_inclusive_min_int(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_min_int");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_int);
+void compiler_subgroup_scan_inclusive_min_uint(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_min_uint");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_uint);
+void compiler_subgroup_scan_inclusive_min_long(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_min_long");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_min_long);
+void compiler_subgroup_scan_inclusive_min_ulong(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_min_ulong");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_min_ulong);
+void compiler_subgroup_scan_inclusive_min_float(void)
+{
+ if(!cl_check_subgroups())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+ "compiler_subgroup_scan_inclusive_min_float");
+ subgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_float);
+
diff --git a/utests/compiler_time_stamp.cpp b/utests/compiler_time_stamp.cpp
index 4da5752..43165c1 100644
--- a/utests/compiler_time_stamp.cpp
+++ b/utests/compiler_time_stamp.cpp
@@ -16,6 +16,11 @@ static void cpu(int global_id, int *src, int *dst) {
void compiler_time_stamp(void)
{
+ if (!cl_check_beignet()) {
+ printf("Not beignet device , Skip!");
+ return;
+ }
+
const size_t n = 16;
int cpu_dst[16], cpu_src[16];
diff --git a/utests/compiler_unstructured_branch3.cpp b/utests/compiler_unstructured_branch3.cpp
index 0c6992a..1782df5 100644
--- a/utests/compiler_unstructured_branch3.cpp
+++ b/utests/compiler_unstructured_branch3.cpp
@@ -37,6 +37,8 @@ static void compiler_unstructured_branch3(void)
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < n; ++i)
OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
// Third control flow
OCL_MAP_BUFFER(0);
@@ -52,6 +54,8 @@ static void compiler_unstructured_branch3(void)
OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
for (uint32_t i = 8; i < n; ++i)
OCL_ASSERT(((int32_t*)buf_data[1])[i] == 3);
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
}
MAKE_UTEST_FROM_FUNCTION(compiler_unstructured_branch3);
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
index 5a1a8d1..80e72a9 100644
--- a/utests/compiler_vector_load_store.cpp
+++ b/utests/compiler_vector_load_store.cpp
@@ -1,15 +1,27 @@
#include "utest_helper.hpp"
#include <string.h>
+#include <math.h>
template<typename T>
static void compiler_vector_load_store(int elemNum, const char *kernelName)
{
const size_t n = elemNum * 256;
+ if (strstr(kernelName, "half") != NULL)
+ if (!cl_check_half())
+ return;
// Setup kernel and buffers
- OCL_CREATE_KERNEL_FROM_FILE("compiler_vector_load_store", kernelName);
+ if (strstr(kernelName, "half") != NULL)
+ OCL_CALL(cl_kernel_init, "compiler_vector_load_store.cl", kernelName,
+ SOURCE, "-DHALF");
+ else
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_vector_load_store", kernelName);
buf_data[0] = (T*) malloc(sizeof(T) * n);
- for (uint32_t i = 0; i < n; ++i)
- ((T*)buf_data[0])[i] = i;
+ for (uint32_t i = 0; i < n; ++i) {
+ if (strstr(kernelName, "half") != NULL)
+ ((T*)buf_data[0])[i] = __float_to_half(as_uint((float)i/(float)n));
+ else
+ ((T*)buf_data[0])[i] = i;
+ }
OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
free(buf_data[0]);
@@ -28,10 +40,17 @@ static void compiler_vector_load_store(int elemNum, const char *kernelName)
for (uint32_t i = 0; i < n; ++i)
{
int shift = ((i % elemNum) + 1);
- if (strstr(kernelName, "double") == NULL)
- OCL_ASSERT(((T*)buf_data[1])[i] == (T)(((T*)buf_data[0])[i] + shift));
- else
+ if (strstr(kernelName, "double") != NULL)
OCL_ASSERT((((T*)buf_data[1])[i] - ((T)((T*)buf_data[0])[i] + shift)) < 1e-5);
+ else if (strstr(kernelName, "half") != NULL) {
+ float fdst = as_float(__half_to_float(((T*)buf_data[1])[i]));
+ float fsrc = as_float(__half_to_float((T)(((T*)buf_data[0])[i])));
+ fsrc += shift;
+ //printf("%d (%f, %f)\n",i, fdst, fsrc);
+ OCL_ASSERT((fabs(fsrc - fdst) <= 0.03 * fabs(fdst)));
+ }
+ else
+ OCL_ASSERT(((T*)buf_data[1])[i] == (T)(((T*)buf_data[0])[i] + shift));
}
OCL_UNMAP_BUFFER(0);
OCL_UNMAP_BUFFER(1);
@@ -61,3 +80,4 @@ test_all_vector(float, float, true)
//test_all_vector(double, double, true)
test_all_vector(int64_t, long, true)
test_all_vector(uint64_t, ulong, false)
+test_all_vector(uint16_t, half, false)
diff --git a/utests/compiler_workgroup_broadcast.cpp b/utests/compiler_workgroup_broadcast.cpp
new file mode 100644
index 0000000..a323fb6
--- /dev/null
+++ b/utests/compiler_workgroup_broadcast.cpp
@@ -0,0 +1,320 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE_X 16
+#define WG_GLOBAL_SIZE_Y 4
+#define WG_GLOBAL_SIZE_Z 4
+
+#define WG_LOCAL_SIZE_X 16
+#define WG_LOCAL_SIZE_Y 2
+#define WG_LOCAL_SIZE_Z 2
+
+/* TODO debug below case, lid2 always stays 0, instead of 0 and 1
+ *
+ * #define WG_GLOBAL_SIZE_X 16
+ * #define WG_GLOBAL_SIZE_Y 1
+ * #define WG_GLOBAL_SIZE_Z 4
+ *
+ * #define WG_LOCAL_SIZE_X 16
+ * #define WG_LOCAL_SIZE_Y 1
+ * #define WG_LOCAL_SIZE_Z 2
+ */
+
+#define WG_LOCAL_X 5
+#define WG_LOCAL_Y 0
+#define WG_LOCAL_Z 0
+
+enum WG_BROADCAST
+{
+ WG_BROADCAST_1D,
+ WG_BROADCAST_2D,
+ WG_BROADCAST_3D
+};
+
+/*
+ * Generic compute-expected function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(WG_BROADCAST wg_broadcast, /* fills expected[] with the value broadcast from the fixed local id (WG_LOCAL_X/Y/Z) */
+ T* input,
+ T* expected,
+ uint32_t wg_global_size, /* unused here; kept for symmetry with generate_data */
+ uint32_t wg_local_size)
+{
+ if(wg_broadcast == WG_BROADCAST_1D)
+ {
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] = input[WG_LOCAL_X]; /* every work-item receives the item at local id WG_LOCAL_X */
+ }
+ else if(wg_broadcast == WG_BROADCAST_2D)
+ {
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] =
+ input[WG_LOCAL_X +
+ WG_LOCAL_Y * WG_LOCAL_SIZE_X]; /* row-major linearised 2D local id */
+ }
+ else if(wg_broadcast == WG_BROADCAST_3D)
+ {
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] =
+ input[WG_LOCAL_X +
+ WG_LOCAL_Y * WG_LOCAL_SIZE_X +
+ WG_LOCAL_Z * WG_LOCAL_SIZE_X * WG_LOCAL_SIZE_Y]; /* row-major linearised 3D local id */
+ }
+}
+
+/*
+ * Generic input-expected generate function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void generate_data(WG_BROADCAST wg_broadcast,
+ T* &input,
+ T* &expected,
+ uint32_t &wg_global_size,
+ uint32_t &wg_local_size)
+{
+ if(wg_broadcast == WG_BROADCAST_1D)
+ {
+ wg_global_size = WG_GLOBAL_SIZE_X;
+ wg_local_size = WG_LOCAL_SIZE_X;
+ }
+ else if(wg_broadcast == WG_BROADCAST_2D)
+ {
+ wg_global_size = WG_GLOBAL_SIZE_X * WG_GLOBAL_SIZE_Y;
+ wg_local_size = WG_LOCAL_SIZE_X * WG_LOCAL_SIZE_Y;
+ }
+ else if(wg_broadcast == WG_BROADCAST_3D)
+ {
+ wg_global_size = WG_GLOBAL_SIZE_X * WG_GLOBAL_SIZE_Y * WG_GLOBAL_SIZE_Z;
+ wg_local_size = WG_LOCAL_SIZE_X * WG_LOCAL_SIZE_Y * WG_LOCAL_SIZE_Z;
+ }
+
+ /* allocate input and expected arrays */
+ input = new T[wg_global_size];
+ expected = new T[wg_global_size];
+
+ /* base value for all data types */
+ T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < wg_global_size; gid += wg_local_size)
+ {
+#if DEBUG_STDOUT
+ cout << endl << "IN: " << endl;
+#endif
+
+ /* input values */
+ for(uint32_t lid = 0; lid < wg_local_size; lid++)
+ {
+ /* initially 0, augment after */
+ input[gid + lid] = 0;
+
+ /* check all data types, test ideal for QWORD types */
+ input[gid + lid] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[gid + lid] += (rand() % 112);
+
+#if DEBUG_STDOUT
+ /* output generated input */
+ cout << setw(4) << input[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+#endif
+ }
+
+ /* expected values */
+ compute_expected(wg_broadcast, input + gid, expected + gid, wg_global_size, wg_local_size);
+
+#if DEBUG_STDOUT
+ /* output expected input */
+ cout << endl << "EXP: " << endl;
+ for(uint32_t lid = 0; lid < wg_local_size; lid++){
+ cout << setw(4) << expected[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+ }
+#endif
+
+ }
+}
+
+/*
+ * Generic workgroup utest function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void workgroup_generic(WG_BROADCAST wg_broadcast, /* runs one broadcast kernel and diffs GPU output against expected[] */
+ T* input,
+ T* expected)
+{
+ uint32_t wg_global_size = 0;
+ uint32_t wg_local_size = 0;
+
+ cl_uint wg_local_x = WG_LOCAL_X;
+ cl_uint wg_local_y = WG_LOCAL_Y;
+ cl_uint wg_local_z = WG_LOCAL_Z;
+
+ /* input and expected data */
+ generate_data(wg_broadcast, input, expected, wg_global_size, wg_local_size);
+
+ /* prepare input for datatype */
+ OCL_CREATE_BUFFER(buf[0], 0, wg_global_size * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, wg_global_size * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_uint), &wg_local_x);
+ OCL_SET_ARG(3, sizeof(cl_uint), &wg_local_y);
+ OCL_SET_ARG(4, sizeof(cl_uint), &wg_local_z);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, wg_global_size * sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ if(wg_broadcast == WG_BROADCAST_1D)
+ {
+ globals[0] = WG_GLOBAL_SIZE_X;
+ locals[0] = WG_LOCAL_SIZE_X;
+ OCL_NDRANGE(1);
+ }
+ else if(wg_broadcast == WG_BROADCAST_2D)
+ {
+ globals[0] = WG_GLOBAL_SIZE_X;
+ locals[0] = WG_LOCAL_SIZE_X;
+ globals[1] = WG_GLOBAL_SIZE_Y;
+ locals[1] = WG_LOCAL_SIZE_Y;
+ OCL_NDRANGE(2);
+ }
+ else if(wg_broadcast == WG_BROADCAST_3D)
+ {
+ globals[0] = WG_GLOBAL_SIZE_X;
+ locals[0] = WG_LOCAL_SIZE_X;
+ globals[1] = WG_GLOBAL_SIZE_Y;
+ locals[1] = WG_LOCAL_SIZE_Y;
+ globals[2] = WG_GLOBAL_SIZE_Z;
+ locals[2] = WG_LOCAL_SIZE_Z; /* fix: was WG_LOCAL_SIZE_Y (copy-paste); only passed by luck because Y==Z==2 */
+ OCL_NDRANGE(3);
+ }
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < wg_global_size; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch, increment */
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Workgroup broadcast 1D functions
+ */
+void compiler_workgroup_broadcast_1D_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
+ "compiler_workgroup_broadcast_1D_int");
+ workgroup_generic(WG_BROADCAST_1D, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_broadcast_1D_int);
+
+void compiler_workgroup_broadcast_1D_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
+ "compiler_workgroup_broadcast_1D_long");
+ workgroup_generic(WG_BROADCAST_1D, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_broadcast_1D_long);
+
+/*
+ * Workgroup broadcast 2D functions
+ */
+void compiler_workgroup_broadcast_2D_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
+ "compiler_workgroup_broadcast_2D_int");
+ workgroup_generic(WG_BROADCAST_2D, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_broadcast_2D_int);
+
+void compiler_workgroup_broadcast_2D_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
+ "compiler_workgroup_broadcast_2D_long");
+ workgroup_generic(WG_BROADCAST_2D, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_broadcast_2D_long);
+
+
+/*
+ * Workgroup broadcast 3D functions
+ */
+void compiler_workgroup_broadcast_3D_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
+ "compiler_workgroup_broadcast_3D_int");
+ workgroup_generic(WG_BROADCAST_3D, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_broadcast_3D_int);
+
+void compiler_workgroup_broadcast_3D_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_broadcast",
+ "compiler_workgroup_broadcast_3D_long");
+ workgroup_generic(WG_BROADCAST_3D, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_broadcast_3D_long);
diff --git a/utests/compiler_workgroup_reduce.cpp b/utests/compiler_workgroup_reduce.cpp
new file mode 100644
index 0000000..1cf4b08
--- /dev/null
+++ b/utests/compiler_workgroup_reduce.cpp
@@ -0,0 +1,417 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+#include <cmath>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 60
+#define WG_LOCAL_SIZE 30
+
+/* Selector for which workgroup op compute_expected() emulates */
+enum WG_FUNCTION
+{
+ WG_ANY,
+ WG_ALL,
+ WG_REDUCE_ADD,
+ WG_REDUCE_MIN,
+ WG_REDUCE_MAX
+};
+
+/*
+ * Generic compute-expected function for op REDUCE/ANY/ALL
+ * and any variable type.
+ * input/expected each point at one workgroup's span of
+ * WG_LOCAL_SIZE elements; the single op result is broadcast into
+ * every expected[] slot so the per-lid GPU output can be compared
+ * directly.
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+ T* input,
+ T* expected)
+{
+ /* ANY: logical OR across the workgroup */
+ if(wg_func == WG_ANY)
+ {
+ T wg_predicate = input[0];
+ for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ /* NOTE(review): the (int) casts truncate wider T; fine for the
+ cl_int callers below, but would miss e.g. 1L<<32 if reused */
+ wg_predicate = (int)wg_predicate || (int)input[i];
+ for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+ expected[i] = wg_predicate;
+ }
+ /* ALL: logical AND across the workgroup */
+ else if(wg_func == WG_ALL)
+ {
+ T wg_predicate = input[0];
+ for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ wg_predicate = (int)wg_predicate && (int)input[i];
+ for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+ expected[i] = wg_predicate;
+ }
+ /* REDUCE_ADD: sum of the workgroup's inputs */
+ else if(wg_func == WG_REDUCE_ADD)
+ {
+ T wg_sum = input[0];
+ for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ wg_sum += input[i];
+ for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+ expected[i] = wg_sum;
+ }
+ /* REDUCE_MAX: maximum of the workgroup's inputs */
+ else if(wg_func == WG_REDUCE_MAX)
+ {
+ T wg_max = input[0];
+ for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ wg_max = max(input[i], wg_max);
+ for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+ expected[i] = wg_max;
+ }
+ /* REDUCE_MIN: minimum of the workgroup's inputs */
+ else if(wg_func == WG_REDUCE_MIN)
+ {
+ T wg_min = input[0];
+ for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ wg_min = min(input[i], wg_min);
+ for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+ expected[i] = wg_min;
+ }
+}
+
+/*
+ * Generic input-expected generate function for op REDUCE/ANY/ALL
+ * and any variable type.
+ * Allocates input/expected with new[] and hands them back through
+ * the reference parameters. NOTE(review): no matching delete[]
+ * exists in this file — each test leaks both arrays; presumably
+ * acceptable for short-lived utests, confirm.
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+ T* &input,
+ T* &expected)
+{
+ input = new T[WG_GLOBAL_SIZE];
+ expected = new T[WG_GLOBAL_SIZE];
+
+ /* base value for all data types */
+ /* NOTE(review): 7 << 17 for 4-byte T, 7 << 37 for 8-byte T, so
+ high bits of QWORD types get exercised — confirm intended scale */
+ T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += WG_LOCAL_SIZE)
+ {
+#if DEBUG_STDOUT
+ cout << endl << "IN: " << endl;
+#endif
+
+ /* input values */
+ for (uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++) {
+ /* initially 0, augment after */
+ input[gid + lid] = 0;
+
+ if (numeric_limits<T>::is_integer) {
+ /* check all data types, test ideal for QWORD types */
+ /* NOTE(review): rand() % 2 - 1 is 0 or -1, so base_val is
+ subtracted or skipped, never added — confirm intent */
+ input[gid + lid] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[gid + lid] += (rand() % 112);
+ /* NOTE(review): original comment claimed the last bit is
+ always 1, but nothing forces it — rand() % 112 can be even */
+ } else {
+ /* float path: random integer part plus a fraction in [0,1) */
+ input[gid + lid] += rand();
+ input[gid + lid] += rand() / ((float)RAND_MAX + 1);
+ }
+
+#if DEBUG_STDOUT
+ /* output generated input */
+ cout << setw(4) << input[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+#endif
+ }
+
+ /* expected values */
+ compute_expected(wg_func, input + gid, expected + gid);
+
+#if DEBUG_STDOUT
+ /* output expected input */
+ cout << endl << "EXP: " << endl;
+ for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++) {
+ cout << setw(4) << expected[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+ }
+#endif
+
+ }
+}
+
+/*
+ * Generic workgroup utest function for op REDUCE/ANY/ALL
+ * and any variable type.
+ * Drives one kernel run: fills buf[0] with generated input,
+ * launches a 1D NDRange (WG_GLOBAL_SIZE items, WG_LOCAL_SIZE per
+ * group) and compares buf[1] against the host-computed expected
+ * values. Integer results must match exactly; float results
+ * tolerate 1% relative error. input/expected are received by
+ * value, so generate_data() rebinds only the local copies — the
+ * caller's NULLs are untouched and the arrays are never freed.
+ */
+template<class T>
+static void workgroup_generic(WG_FUNCTION wg_func,
+ T* input,
+ T* expected)
+{
+ /* input and expected data */
+ generate_data(wg_func, input, expected);
+
+ /* prepare input for data type */
+ OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch on integer, increment */
+ if (numeric_limits<T>::is_integer) {
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " << ((T *)buf_data[1])[i]
+ << " != " << *(expected + i) << endl;
+#endif
+ }
+ /* float error is tolerable though */
+ else {
+ float num_computed = ((T *)buf_data[1])[i];
+ float num_expected = *(expected + i);
+ /* relative error; NOTE(review): divides by num_expected —
+ an exact-zero expected value would yield inf/nan here */
+ float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+ if (num_diff > 0.01f) {
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " << ((T *)buf_data[1])[i]
+ << " != " << *(expected + i) << endl;
+#endif
+ }
+ }
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Workgroup any/all utest functions.
+ * Each wrapper skips when cl_check_ocl20() fails (OpenCL 2.0
+ * support check, per the helper's name), loads the named kernel and
+ * delegates to workgroup_generic.
+ */
+void compiler_workgroup_any(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_any");
+ workgroup_generic(WG_ANY, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_any);
+void compiler_workgroup_all(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_all");
+ workgroup_generic(WG_ALL, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_all);
+/*
+ * Workgroup reduce add utest functions.
+ * long/ulong variants are registered with
+ * MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE — presumably marking known
+ * failures; confirm macro semantics in utest_helper.hpp.
+ */
+void compiler_workgroup_reduce_add_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_add_int");
+ workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_int);
+void compiler_workgroup_reduce_add_uint(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_add_uint");
+ workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_uint);
+void compiler_workgroup_reduce_add_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_add_long");
+ workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_reduce_add_long);
+void compiler_workgroup_reduce_add_ulong(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_add_ulong");
+ workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_reduce_add_ulong);
+void compiler_workgroup_reduce_add_float(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_add_float");
+ workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_float);
+
+/*
+ * Workgroup reduce max utest functions.
+ * (WITH_ISSUE variants are presumably known failures — confirm.)
+ */
+void compiler_workgroup_reduce_max_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_max_int");
+ workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_int);
+void compiler_workgroup_reduce_max_uint(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_max_uint");
+ workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_uint);
+void compiler_workgroup_reduce_max_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_max_long");
+ workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_reduce_max_long);
+void compiler_workgroup_reduce_max_ulong(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_max_ulong");
+ workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_reduce_max_ulong);
+void compiler_workgroup_reduce_max_float(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_max_float");
+ workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_float);
+
+/*
+ * Workgroup reduce min utest functions.
+ * (WITH_ISSUE variants are presumably known failures — confirm.)
+ */
+void compiler_workgroup_reduce_min_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_min_int");
+ workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_int);
+void compiler_workgroup_reduce_min_uint(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_min_uint");
+ workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_uint);
+void compiler_workgroup_reduce_min_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_min_long");
+ workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_reduce_min_long);
+void compiler_workgroup_reduce_min_ulong(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_min_ulong");
+ workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_reduce_min_ulong);
+void compiler_workgroup_reduce_min_float(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+ "compiler_workgroup_reduce_min_float");
+ workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_float);
diff --git a/utests/compiler_workgroup_scan_exclusive.cpp b/utests/compiler_workgroup_scan_exclusive.cpp
new file mode 100644
index 0000000..50afdf3
--- /dev/null
+++ b/utests/compiler_workgroup_scan_exclusive.cpp
@@ -0,0 +1,373 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+#include <cmath>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 64
+#define WG_LOCAL_SIZE 32
+
+/* Selector for which exclusive-scan op compute_expected() emulates */
+enum WG_FUNCTION
+{
+ WG_SCAN_EXCLUSIVE_ADD,
+ WG_SCAN_EXCLUSIVE_MAX,
+ WG_SCAN_EXCLUSIVE_MIN
+};
+
+/*
+ * Generic compute-expected function for op SCAN EXCLUSIVE type
+ * and any variable type.
+ * Computes the exclusive scan over one workgroup span of
+ * WG_LOCAL_SIZE elements: expected[0] is the op's identity, and
+ * expected[i] excludes input[i]. Assumes WG_LOCAL_SIZE >= 2 since
+ * expected[1] is written unconditionally.
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+ T* input,
+ T* expected)
+{
+ if(wg_func == WG_SCAN_EXCLUSIVE_ADD)
+ {
+ /* identity for add is 0 */
+ expected[0] = 0;
+ expected[1] = input[0];
+ for(uint32_t i = 2; i < WG_LOCAL_SIZE; i++)
+ expected[i] = input[i - 1] + expected[i - 1];
+ }
+ else if(wg_func == WG_SCAN_EXCLUSIVE_MAX)
+ {
+ /* identity for max: lowest representable value / -infinity */
+ if(numeric_limits<T>::is_integer)
+ expected[0] = numeric_limits<T>::min();
+ else
+ expected[0] = - numeric_limits<T>::infinity();
+
+ expected[1] = input[0];
+ for(uint32_t i = 2; i < WG_LOCAL_SIZE; i++)
+ expected[i] = max(input[i - 1], expected[i - 1]);
+ }
+ else if(wg_func == WG_SCAN_EXCLUSIVE_MIN)
+ {
+ /* identity for min: highest representable value / +infinity */
+ if(numeric_limits<T>::is_integer)
+ expected[0] = numeric_limits<T>::max();
+ else
+ expected[0] = numeric_limits<T>::infinity();
+
+ expected[1] = input[0];
+ for(uint32_t i = 2; i < WG_LOCAL_SIZE; i++)
+ expected[i] = min(input[i - 1], expected[i - 1]);
+ }
+}
+
+/*
+ * Generic input-expected generate function for op SCAN EXCLUSIVE
+ * type and any variable type (header previously said "workgroup
+ * utest function" — copy/paste slip).
+ * Allocates input/expected with new[]; NOTE(review): never freed in
+ * this file — per-test leak, presumably acceptable for utests.
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+ T* &input,
+ T* &expected)
+{
+ input = new T[WG_GLOBAL_SIZE];
+ expected = new T[WG_GLOBAL_SIZE];
+
+ /* base value for all data types */
+ /* NOTE(review): unlike the reduce utest there is no float branch
+ below, so this integer-style pattern is applied to float T too —
+ confirm intended */
+ T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += WG_LOCAL_SIZE)
+ {
+#if DEBUG_STDOUT
+ cout << endl << "IN: " << endl;
+#endif
+
+ /* input values */
+ for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++)
+ {
+ /* initially 0, augment after */
+ input[gid + lid] = 0;
+ /* check all data types, test ideal for QWORD types */
+ /* NOTE(review): rand() % 2 - 1 is 0 or -1, so base_val is
+ subtracted or skipped, never added — confirm intent */
+ input[gid + lid] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[gid + lid] += (rand() % 112);
+
+#if DEBUG_STDOUT
+ /* output generated input */
+ cout << setw(4) << input[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+#endif
+ }
+
+ /* expected values */
+ compute_expected(wg_func, input + gid, expected + gid);
+
+#if DEBUG_STDOUT
+ /* output expected input */
+ cout << endl << "EXP: " << endl;
+ for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++) {
+ cout << setw(4) << expected[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+ }
+#endif
+
+ }
+}
+
+/*
+ * Generic workgroup utest function for op SCAN EXCLUSIVE type
+ * and any variable type.
+ * Fills buf[0] with generated input, launches a 1D NDRange
+ * (WG_GLOBAL_SIZE items, WG_LOCAL_SIZE per group) and compares
+ * buf[1] against the host-computed scan. Integer results must match
+ * exactly; float results tolerate 1% relative error. input/expected
+ * are received by value: generate_data() rebinds the local copies
+ * to new[] arrays that are never freed.
+ */
+template<class T>
+static void workgroup_generic(WG_FUNCTION wg_func,
+ T* input,
+ T* expected)
+{
+ /* input and expected data */
+ generate_data(wg_func, input, expected);
+
+ /* prepare input for data type */
+ OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch on integer, increment */
+ if(numeric_limits<T>::is_integer){
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+ /* float error is tolerable though */
+ else {
+ float num_computed = ((T *)buf_data[1])[i];
+ float num_expected = *(expected + i);
+ /* relative error; NOTE(review): zero expected value would
+ divide by zero here */
+ float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+ if(num_diff > 0.01f){
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+ }
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Workgroup scan_exclusive add utest functions.
+ * Each wrapper gates on cl_check_ocl20() (OpenCL 2.0 support, per
+ * the helper's name); WITH_ISSUE variants are presumably known
+ * failures — confirm.
+ */
+void compiler_workgroup_scan_exclusive_add_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_add_int");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_exclusive_add_int);
+void compiler_workgroup_scan_exclusive_add_uint(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_add_uint");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_exclusive_add_uint);
+void compiler_workgroup_scan_exclusive_add_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_add_long");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_exclusive_add_long);
+void compiler_workgroup_scan_exclusive_add_ulong(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_add_ulong");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_exclusive_add_ulong);
+void compiler_workgroup_scan_exclusive_add_float(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_add_float");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_exclusive_add_float);
+
+/*
+ * Workgroup scan_exclusive max utest functions.
+ * (WITH_ISSUE variants are presumably known failures — confirm.)
+ */
+void compiler_workgroup_scan_exclusive_max_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_max_int");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_exclusive_max_int);
+void compiler_workgroup_scan_exclusive_max_uint(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_max_uint");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_exclusive_max_uint);
+void compiler_workgroup_scan_exclusive_max_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_max_long");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_exclusive_max_long);
+void compiler_workgroup_scan_exclusive_max_ulong(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_max_ulong");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_exclusive_max_ulong);
+void compiler_workgroup_scan_exclusive_max_float(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_max_float");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_exclusive_max_float);
+
+/*
+ * Workgroup scan_exclusive min utest functions.
+ * (WITH_ISSUE variants are presumably known failures — confirm.)
+ */
+void compiler_workgroup_scan_exclusive_min_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_min_int");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_exclusive_min_int);
+void compiler_workgroup_scan_exclusive_min_uint(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_min_uint");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_exclusive_min_uint);
+void compiler_workgroup_scan_exclusive_min_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_min_long");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_exclusive_min_long);
+void compiler_workgroup_scan_exclusive_min_ulong(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_min_ulong");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_exclusive_min_ulong);
+void compiler_workgroup_scan_exclusive_min_float(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_exclusive",
+ "compiler_workgroup_scan_exclusive_min_float");
+ workgroup_generic(WG_SCAN_EXCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_exclusive_min_float);
diff --git a/utests/compiler_workgroup_scan_inclusive.cpp b/utests/compiler_workgroup_scan_inclusive.cpp
new file mode 100644
index 0000000..e203ba2
--- /dev/null
+++ b/utests/compiler_workgroup_scan_inclusive.cpp
@@ -0,0 +1,364 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+#include <cmath>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 64
+#define WG_LOCAL_SIZE 32
+
+/* Selector for which inclusive-scan op compute_expected() emulates */
+enum WG_FUNCTION
+{
+ WG_SCAN_INCLUSIVE_ADD,
+ WG_SCAN_INCLUSIVE_MAX,
+ WG_SCAN_INCLUSIVE_MIN
+};
+
+/*
+ * Generic compute-expected function for op SCAN INCLUSIVE type
+ * and any variable type.
+ * expected[i] = op(input[0..i]) over one workgroup span of
+ * WG_LOCAL_SIZE elements (input[i] itself is included, hence no
+ * identity slot as in the exclusive variant).
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+ T* input,
+ T* expected)
+{
+ if(wg_func == WG_SCAN_INCLUSIVE_ADD)
+ {
+ expected[0] = input[0];
+ for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ expected[i] = input[i] + expected[i - 1];
+ }
+ else if(wg_func == WG_SCAN_INCLUSIVE_MAX)
+ {
+ expected[0] = input[0];
+ for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ expected[i] = max(input[i], expected[i - 1]);
+ }
+ else if(wg_func == WG_SCAN_INCLUSIVE_MIN)
+ {
+ expected[0] = input[0];
+ for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ expected[i] = min(input[i], expected[i - 1]);
+ }
+}
+
+/*
+ * Generic input-expected generate function for op SCAN INCLUSIVE
+ * type and any variable type.
+ * Allocates input/expected with new[]; NOTE(review): never freed in
+ * this file — per-test leak, presumably acceptable for utests.
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+ T* &input,
+ T* &expected)
+{
+ input = new T[WG_GLOBAL_SIZE];
+ expected = new T[WG_GLOBAL_SIZE];
+
+ /* base value for all data types */
+ /* NOTE(review): no float branch below, so this integer-style
+ pattern is applied to float T as well — confirm intended */
+ T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += WG_LOCAL_SIZE)
+ {
+#if DEBUG_STDOUT
+ cout << endl << "IN: " << endl;
+#endif
+
+ /* input values */
+ for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++)
+ {
+ /* initially 0, augment after */
+ input[gid + lid] = 0;
+
+ /* check all data types, test ideal for QWORD types */
+ /* NOTE(review): rand() % 2 - 1 is 0 or -1, so base_val is
+ subtracted or skipped, never added — confirm intent */
+ input[gid + lid] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[gid + lid] += (rand() % 112);
+
+#if DEBUG_STDOUT
+ /* output generated input */
+ cout << setw(4) << input[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+#endif
+ }
+
+ /* expected values */
+ compute_expected(wg_func, input + gid, expected + gid);
+
+#if DEBUG_STDOUT
+ /* output expected input */
+ cout << endl << "EXP: " << endl;
+ for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++) {
+ cout << setw(4) << expected[gid + lid] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+ }
+#endif
+
+ }
+}
+
+/*
+ * Generic workgroup utest function for op SCAN INCLUSIVE type
+ * and any variable type.
+ * Fills buf[0] with generated input, launches a 1D NDRange
+ * (WG_GLOBAL_SIZE items, WG_LOCAL_SIZE per group) and compares
+ * buf[1] against the host-computed scan. Integer results must match
+ * exactly; float results tolerate 1% relative error. input/expected
+ * are received by value: generate_data() rebinds the local copies
+ * to new[] arrays that are never freed.
+ */
+template<class T>
+static void workgroup_generic(WG_FUNCTION wg_func,
+ T* input,
+ T* expected)
+{
+ /* input and expected data */
+ generate_data(wg_func, input, expected);
+
+ /* prepare input for data type */
+ OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch on integer, increment */
+ if(numeric_limits<T>::is_integer){
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+ /* float error is tolerable though */
+ else {
+ float num_computed = ((T *)buf_data[1])[i];
+ float num_expected = *(expected + i);
+ /* relative error; NOTE(review): zero expected value would
+ divide by zero here */
+ float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+ if(num_diff > 0.01f){
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+ }
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Workgroup scan_inclusive add utest functions.
+ * Each wrapper gates on cl_check_ocl20() (OpenCL 2.0 support, per
+ * the helper's name); WITH_ISSUE variants are presumably known
+ * failures — confirm.
+ */
+void compiler_workgroup_scan_inclusive_add_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_add_int");
+ workgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_inclusive_add_int);
+void compiler_workgroup_scan_inclusive_add_uint(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_add_uint");
+ workgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_inclusive_add_uint);
+void compiler_workgroup_scan_inclusive_add_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_add_long");
+ workgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_inclusive_add_long);
+void compiler_workgroup_scan_inclusive_add_ulong(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_add_ulong");
+ workgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_inclusive_add_ulong);
+void compiler_workgroup_scan_inclusive_add_float(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_add_float");
+ workgroup_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_inclusive_add_float);
+
+/*
+ * Workgroup scan_inclusive max utest functions.
+ * (WITH_ISSUE variants are presumably known failures — confirm.)
+ */
+void compiler_workgroup_scan_inclusive_max_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_max_int");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_inclusive_max_int);
+void compiler_workgroup_scan_inclusive_max_uint(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_max_uint");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_inclusive_max_uint);
+void compiler_workgroup_scan_inclusive_max_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_max_long");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_inclusive_max_long);
+void compiler_workgroup_scan_inclusive_max_ulong(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_max_ulong");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_inclusive_max_ulong);
+void compiler_workgroup_scan_inclusive_max_float(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_max_float");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_inclusive_max_float);
+
+/*
+ * Workgroup scan_inclusive min utest functions.
+ * (WITH_ISSUE variants are presumably known failures — confirm.)
+ */
+void compiler_workgroup_scan_inclusive_min_int(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_min_int");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_inclusive_min_int);
+void compiler_workgroup_scan_inclusive_min_uint(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_min_uint");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_inclusive_min_uint);
+void compiler_workgroup_scan_inclusive_min_long(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_min_long");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_inclusive_min_long);
+void compiler_workgroup_scan_inclusive_min_ulong(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_ulong *input = NULL;
+ cl_ulong *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_min_ulong");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_workgroup_scan_inclusive_min_ulong);
+void compiler_workgroup_scan_inclusive_min_float(void)
+{
+ if (!cl_check_ocl20())
+ return;
+ cl_float *input = NULL;
+ cl_float *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_scan_inclusive",
+ "compiler_workgroup_scan_inclusive_min_float");
+ workgroup_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_scan_inclusive_min_float);
diff --git a/utests/enqueue_copy_buf_unaligned.cpp b/utests/enqueue_copy_buf_unaligned.cpp
index e1bd0aa..f501d29 100644
--- a/utests/enqueue_copy_buf_unaligned.cpp
+++ b/utests/enqueue_copy_buf_unaligned.cpp
@@ -77,7 +77,7 @@ void enqueue_copy_buf_unaligned(void)
size_t i;
size_t j;
const size_t sz = 1024;
- int offset = 0;
+ unsigned int offset = 0;
OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
index 7c03d95..42edf1a 100644
--- a/utests/get_cl_info.cpp
+++ b/utests/get_cl_info.cpp
@@ -78,9 +78,11 @@ struct Info_Result<char **> {
int *elt_size;
int size;
typedef char** type_value;
+ int array_size;
Info_Result(char **other, int *sz, int elt_num) {
- size = elt_num;
+ array_size = elt_num;
+ size = elt_num * sizeof(char**);
ret = (char **)malloc(elt_num * sizeof(char *));
memset(ret, 0, (elt_num * sizeof(char *)));
@@ -106,7 +108,7 @@ struct Info_Result<char **> {
~Info_Result(void) {
int i = 0;
- for (; i < size; i++) {
+ for (; i < array_size; i++) {
if (refer[i])
free(refer[i]);
free(ret[i]);
@@ -122,7 +124,7 @@ struct Info_Result<char **> {
bool check_result (void) {
int i = 0;
- for (; i < size; i++) {
+ for (; i < array_size; i++) {
if (refer[i] && ::memcmp(ret[i], refer[i], elt_size[i]))
return false;
}
@@ -181,9 +183,15 @@ void get_program_info(void)
int sz;
char *ker_path = (char *)malloc(4096 * sizeof(char));
const char *kiss_path = getenv("OCL_KERNEL_PATH");
+ if(!kiss_path)
+ return;
+
string line;
string source_code;
+ if(strlen(kiss_path) > 4000)
+ return;
+
sprintf(ker_path, "%s/%s", kiss_path, "compiler_if_else.cl");
ifstream in(ker_path);
@@ -216,7 +224,7 @@ void get_program_info(void)
expect_value = NO_STANDARD_REF;
maps.insert(make_pair(CL_PROGRAM_BINARY_SIZES,
(void *)(new Info_Result<size_t>((size_t)expect_value))));
- sz = 4096; //big enough?
+ sz = 8192; //big enough?
expect_source = NULL;
maps.insert(make_pair(CL_PROGRAM_BINARIES,
(void *)(new Info_Result<char **>(&expect_source, &sz, 1))));
@@ -405,19 +413,124 @@ void get_build_llvm_info(void)
}
}
- //Test is successful if the backend created the file
- if( (fp = fopen(llvm_file, "r")) == NULL) {
- std::cout << "LLVM file creation.. FAILED";
+ if (cl_check_beignet()) {
+ //Test is successful if the backend created the file
+ if( (fp = fopen(llvm_file, "r")) == NULL) {
+ std::cout << "LLVM file creation.. FAILED";
+ OCL_ASSERT(0);
+ } else {
+ fclose(fp);
+ std::cout << "LLVM file created.. SUCCESS";
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_build_llvm_info);
+
+
+// This method uses clGetProgramBuildInfo to check the dump-spir-binary options
+// and verifies that the spir dump file is actually generated in the backend.
+void compile_spir_binary(void)
+{
+ map<cl_program_info, void *> maps;
+ cl_build_status expect_status;
+ char spir_file[] = "test_spir_dump.txt";
+ char compile_opt[] = "-dump-spir-binary=test_spir_dump.txt";
+ FILE *fp = NULL;
+ int sz;
+
+ //Remove any pre-existing file
+ if( (fp = fopen(spir_file, "r")) != NULL) {
+ fclose(fp);
+ std::remove(spir_file);
+ }
+
+ OCL_CALL (cl_kernel_compile, "compiler_ceil.cl", "compiler_ceil", compile_opt);
+
+ /* Do our test.*/
+ expect_status = CL_BUILD_SUCCESS;
+ maps.insert(make_pair(CL_PROGRAM_BUILD_STATUS,
+ (void *)(new Info_Result<cl_build_status>(expect_status))));
+ sz = strlen(compile_opt) + 1;
+ maps.insert(make_pair(CL_PROGRAM_BUILD_OPTIONS,
+ (void *)(new Info_Result<char *>(compile_opt, sz))));
+
+ for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+ switch (x->first) {
+ case CL_PROGRAM_BUILD_STATUS:
+ CALL_PROG_BUILD_INFO_AND_RET(cl_build_status);
+ break;
+ case CL_PROGRAM_BUILD_OPTIONS:
+ CALL_PROG_BUILD_INFO_AND_RET(char *);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (cl_check_beignet()) {
+ //Test is successful if the backend created the file
+ if( (fp = fopen(spir_file, "r")) == NULL) {
+ std::cout << "SPIR file creation.. FAILED";
OCL_ASSERT(0);
- } else {
+ } else {
fclose(fp);
- std::cout << "LLVM file created.. SUCCESS";
+ std::cout << "SPIR file created.. SUCCESS";
+ }
}
}
+MAKE_UTEST_FROM_FUNCTION(compile_spir_binary);
-MAKE_UTEST_FROM_FUNCTION(get_build_llvm_info);
+void build_spir_binary(void)
+{
+ map<cl_program_info, void *> maps;
+ cl_build_status expect_status;
+ char spir_file[] = "test_spir_dump.txt";
+ char build_opt[] = "-dump-spir-binary=test_spir_dump.txt";
+ FILE *fp = NULL;
+ int sz;
+
+ //Remove any pre-existing file
+ if( (fp = fopen(spir_file, "r")) != NULL) {
+ fclose(fp);
+ std::remove(spir_file);
+ }
+
+ OCL_CALL (cl_kernel_init, "compiler_ceil.cl", "compiler_ceil", SOURCE, build_opt);
+
+ /* Do our test.*/
+ expect_status = CL_BUILD_SUCCESS;
+ maps.insert(make_pair(CL_PROGRAM_BUILD_STATUS,
+ (void *)(new Info_Result<cl_build_status>(expect_status))));
+ sz = strlen(build_opt) + 1;
+ maps.insert(make_pair(CL_PROGRAM_BUILD_OPTIONS,
+ (void *)(new Info_Result<char *>(build_opt, sz))));
+ for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+ switch (x->first) {
+ case CL_PROGRAM_BUILD_STATUS:
+ CALL_PROG_BUILD_INFO_AND_RET(cl_build_status);
+ break;
+ case CL_PROGRAM_BUILD_OPTIONS:
+ CALL_PROG_BUILD_INFO_AND_RET(char *);
+ break;
+ default:
+ break;
+ }
+ }
+ if (cl_check_beignet()) {
+ //Test is successful if the backend created the file
+ if( (fp = fopen(spir_file, "r")) == NULL) {
+ std::cout << "SPIR file creation.. FAILED";
+ OCL_ASSERT(0);
+ } else {
+ fclose(fp);
+ std::cout << "SPIR file created.. SUCCESS";
+ }
+ }
+}
+MAKE_UTEST_FROM_FUNCTION(build_spir_binary);
// This method uses clGetProgramBuildInfo to check the asm dump build options sent
// And verifies that the asm dump file is actually generated in the backend.
void get_build_asm_info(void)
@@ -458,18 +571,118 @@ void get_build_asm_info(void)
}
}
- //Test is successful if the backend created the file
- if( (fp = fopen(asm_file, "r")) == NULL) {
- std::cout << "ASM file creation.. FAILED";
- OCL_ASSERT(0);
- } else {
- fclose(fp);
- std::cout << "ASM file created.. SUCCESS";
+ if (cl_check_beignet()) {
+ //Test is successful if the backend created the file
+ if( (fp = fopen(asm_file, "r")) == NULL) {
+ std::cout << "ASM file creation.. FAILED";
+ OCL_ASSERT(0);
+ } else {
+ fclose(fp);
+ std::cout << "ASM file created.. SUCCESS";
+ }
}
}
MAKE_UTEST_FROM_FUNCTION(get_build_asm_info);
+void get_compile_llvm_info(void)
+{
+ map<cl_program_info, void *> maps;
+ cl_build_status expect_status;
+ char llvm_file[] = "test_llvm_dump.txt";
+ char compile_opt[] = "-dump-opt-llvm=test_llvm_dump.txt";
+ FILE *fp = NULL;
+
+ //Remove any pre-existing file
+ if( (fp = fopen(llvm_file, "r")) != NULL) {
+ fclose(fp);
+ std::remove(llvm_file);
+ }
+
+ OCL_CALL (cl_kernel_compile, "compiler_if_else.cl", "compiler_if_else", compile_opt);
+
+ /* Do our test.*/
+ expect_status = CL_BUILD_SUCCESS;
+ maps.insert(make_pair(CL_PROGRAM_BUILD_STATUS,
+ (void *)(new Info_Result<cl_build_status>(expect_status))));
+
+
+ for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+ switch (x->first) {
+ case CL_PROGRAM_BUILD_STATUS:
+ CALL_PROG_BUILD_INFO_AND_RET(cl_build_status);
+ break;
+ case CL_PROGRAM_BUILD_OPTIONS:
+ CALL_PROG_BUILD_INFO_AND_RET(char *);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (cl_check_beignet()) {
+ //Test is successful if the backend created the file
+ if( (fp = fopen(llvm_file, "r")) == NULL) {
+ std::cout << "LLVM file creation.. FAILED";
+ OCL_ASSERT(0);
+ } else {
+ fclose(fp);
+ std::cout << "LLVM file created.. SUCCESS";
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_compile_llvm_info);
+
+void get_link_asm_info(void)
+{
+ map<cl_program_info, void *> maps;
+ cl_build_status expect_status;
+ char asm_file[] = "test_asm_dump.txt";
+ char link_opt[] = "-dump-opt-asm=test_asm_dump.txt";
+ FILE *fp = NULL;
+
+ //Remove any pre-existing file
+ if( (fp = fopen(asm_file, "r")) != NULL) {
+ fclose(fp);
+ std::remove(asm_file);
+ }
+
+ OCL_CALL (cl_kernel_link, "compiler_if_else.cl", "compiler_if_else", link_opt);
+
+ /* Do our test.*/
+ expect_status = CL_BUILD_SUCCESS;
+ maps.insert(make_pair(CL_PROGRAM_BUILD_STATUS,
+ (void *)(new Info_Result<cl_build_status>(expect_status))));
+
+
+ for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+ switch (x->first) {
+ case CL_PROGRAM_BUILD_STATUS:
+ CALL_PROG_BUILD_INFO_AND_RET(cl_build_status);
+ break;
+ case CL_PROGRAM_BUILD_OPTIONS:
+ CALL_PROG_BUILD_INFO_AND_RET(char *);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (cl_check_beignet()) {
+ //Test is successful if the backend created the file
+ if( (fp = fopen(asm_file, "r")) == NULL) {
+ std::cout << "ASM file creation.. FAILED";
+ OCL_ASSERT(0);
+ } else {
+ fclose(fp);
+ std::cout << "ASM file created.. SUCCESS";
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_link_asm_info);
+
/* ***************************************************** *
* clGetContextInfo *
diff --git a/utests/image_1D_buffer.cpp b/utests/image_1D_buffer.cpp
index e2cfcde..66eb6e7 100644
--- a/utests/image_1D_buffer.cpp
+++ b/utests/image_1D_buffer.cpp
@@ -55,11 +55,12 @@ void image_1D_buffer(void)
OCL_MAP_BUFFER(1);
for (uint32_t i = 0; i < buffer_sz; i++) {
if (((uint32_t*)buf_data[1])[i] != ((uint32_t*)buf_data[0])[i])
- printf("i %d expected %x got %x \n", i, ((uint32_t*)buf_data[0])[i], ((uint32_t*)buf_data[1])[i]);
+ printf("i %d expected %x got %x", i, ((uint32_t*)buf_data[0])[i], ((uint32_t*)buf_data[1])[i]);
OCL_ASSERT(((uint32_t*)buf_data[1])[i] == ((uint32_t*)buf_data[0])[i]);
}
OCL_UNMAP_BUFFER(0);
OCL_UNMAP_BUFFER(1);
+ free(buf_content);
}
MAKE_UTEST_FROM_FUNCTION(image_1D_buffer);
diff --git a/utests/image_from_buffer.cpp b/utests/image_from_buffer.cpp
new file mode 100644
index 0000000..0084f50
--- /dev/null
+++ b/utests/image_from_buffer.cpp
@@ -0,0 +1,109 @@
+#include <string.h>
+#include "utest_helper.hpp"
+#include <malloc.h>
+#include <cstring>
+
+static void image_from_buffer(void)
+{
+ size_t param_value_size;
+ std::string extensionStr;
+ OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_EXTENSIONS, 0, 0, ¶m_value_size);
+ std::vector<char> param_value(param_value_size);
+ OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_EXTENSIONS, param_value_size, param_value.empty() ? NULL : ¶m_value.front(), ¶m_value_size);
+ if (!param_value.empty())
+ extensionStr = std::string(¶m_value.front(), param_value_size-1);
+
+ if (!std::strstr(extensionStr.c_str(), "cl_khr_image2d_from_buffer")) {
+ return;
+ }
+
+ size_t base_address_alignment = 0;
+ OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, sizeof(base_address_alignment), &base_address_alignment, NULL);
+ const size_t w = 512;
+ const size_t h = 512;
+ cl_image_format format;
+ cl_image_desc desc;
+ int error;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ OCL_CREATE_KERNEL("image_from_buffer");
+
+ // Setup kernel and images
+ size_t buffer_sz = sizeof(uint32_t) * w * h;
+ uint32_t* src_data;
+ src_data = (uint32_t*)memalign(base_address_alignment, buffer_sz);
+ if(!src_data) {
+ fprintf(stderr, "run out of memory\n");
+ return;
+ }
+
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ src_data[j * w + i] = j * w + i;
+
+ cl_mem buff = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, buffer_sz, src_data, &error);
+
+ OCL_ASSERT(error == CL_SUCCESS);
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = w * sizeof(uint32_t);
+
+ desc.buffer = 0;
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, src_data);
+
+ desc.buffer = buff;
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+
+ desc.buffer = 0;
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[2], CL_MEM_WRITE_ONLY, &format, &desc, NULL);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[2]);
+
+ globals[0] = w;
+ globals[1] = h;
+ locals[0] = 16;
+ locals[1] = 4;
+
+ OCL_NDRANGE(2);
+
+ // Check result
+ OCL_MAP_BUFFER_GTT(0);
+ OCL_MAP_BUFFER_GTT(1);
+ OCL_MAP_BUFFER_GTT(2);
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ {
+ //printf("%d,%d\n", ((uint32_t*)buf_data[0])[j * w + i], ((uint32_t*)buf_data[1])[j * w + i]);
+ //printf("%d,%d,%d,%d\n", j, i, ((uint32_t*)buf_data[0])[j * w + i], ((uint32_t*)buf_data[2])[j * w + i]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1])[j * w + i]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[2])[j * w + i]);
+ }
+ OCL_UNMAP_BUFFER_GTT(0);
+ OCL_UNMAP_BUFFER_GTT(1);
+ OCL_UNMAP_BUFFER_GTT(2);
+
+ free(src_data);
+
+ //The spec does not define the release order for a buffer and an image created from it, so releasing either one first is ok here.
+ //We follow the rule of destroying the bo at the last release, so accessing the buffer after releasing the image is legal,
+ //and vice versa.
+#if 1
+ clReleaseMemObject(buf[1]);
+ clReleaseMemObject(buff);
+#else
+ clReleaseMemObject(buff);
+ clReleaseMemObject(buf[1]);
+#endif
+ clReleaseMemObject(buf[2]);
+ buf[1] = NULL;
+ buf[2] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(image_from_buffer);
diff --git a/utests/load_program_from_bin_file.cpp b/utests/load_program_from_bin_file.cpp
index feefacc..117e15a 100644
--- a/utests/load_program_from_bin_file.cpp
+++ b/utests/load_program_from_bin_file.cpp
@@ -18,6 +18,10 @@ static void test_load_program_from_bin_file(void)
char *ker_path = NULL;
cl_file_map_t *fm = cl_file_map_new();
+ if(!fm) {
+ fprintf(stderr, "run out of memory\n");
+ return;
+ }
ker_path = cl_do_kiss_path("compiler_ceil.bin", device);
OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
diff --git a/utests/load_program_from_gen_bin.cpp b/utests/load_program_from_gen_bin.cpp
index 3db13b2..5a9c901 100644
--- a/utests/load_program_from_gen_bin.cpp
+++ b/utests/load_program_from_gen_bin.cpp
@@ -18,6 +18,10 @@ static void test_load_program_from_gen_bin(void)
char *ker_path = NULL;
cl_file_map_t *fm = cl_file_map_new();
+ if(!fm) {
+ fprintf(stderr, "run out of memory\n");
+ return;
+ }
ker_path = cl_do_kiss_path("compiler_ceil.cl", device);
OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
@@ -88,6 +92,10 @@ static void test_load_program_from_gen_bin(void)
OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
OCL_UNMAP_BUFFER(1);
}
+ cl_file_map_delete(fm);
+ clReleaseProgram(bin_program);
+ free(binary);
+ free(ker_path);
}
MAKE_UTEST_FROM_FUNCTION(test_load_program_from_gen_bin);
diff --git a/utests/load_program_from_spir.cpp b/utests/load_program_from_spir.cpp
index 8ea1cd4..bb53947 100644
--- a/utests/load_program_from_spir.cpp
+++ b/utests/load_program_from_spir.cpp
@@ -31,6 +31,10 @@ static void test_load_program_from_spir(void)
char *ker_path = NULL;
cl_file_map_t *fm = cl_file_map_new();
+ if(!fm) {
+ fprintf(stderr, "run out of memory\n");
+ return;
+ }
ker_path = cl_do_kiss_path("compiler_ceil32.spir", device);
OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
@@ -85,6 +89,8 @@ static void test_load_program_from_spir(void)
OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
OCL_UNMAP_BUFFER(1);
}
+ free(ker_path);
+ cl_file_map_delete(fm);
}
MAKE_UTEST_FROM_FUNCTION(test_load_program_from_spir);
diff --git a/utests/profiling_exec.cpp b/utests/profiling_exec.cpp
index 4232772..437a628 100644
--- a/utests/profiling_exec.cpp
+++ b/utests/profiling_exec.cpp
@@ -45,7 +45,6 @@ static void profiling_exec(void)
const size_t n = 512;
cl_int status = CL_SUCCESS;
cl_command_queue profiling_queue = NULL;
- cl_command_queue tmp_queue = NULL;
float* cpu_src = (float *)malloc(n*sizeof(float));
float* cpu_dst = (float *)malloc(n*sizeof(float));
cl_event exec_event;
@@ -56,10 +55,6 @@ static void profiling_exec(void)
profiling_queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &status);
OCL_ASSERT(status == CL_SUCCESS);
- /* save the default queue. */
- tmp_queue = queue;
- queue = profiling_queue;
-
OCL_CREATE_KERNEL("compiler_fabs");
OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
@@ -77,7 +72,7 @@ static void profiling_exec(void)
cpu_exec(n, cpu_src, cpu_dst);
// Run the kernel on GPU
- OCL_CALL(clEnqueueNDRangeKernel, queue, kernel, 1, NULL, globals, locals, 0, NULL, &exec_event);
+ OCL_CALL(clEnqueueNDRangeKernel, profiling_queue, kernel, 1, NULL, globals, locals, 0, NULL, &exec_event);
OCL_CALL(clWaitForEvents, 1, &exec_event);
OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &time_queue, NULL);
@@ -94,7 +89,6 @@ static void profiling_exec(void)
OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
OCL_UNMAP_BUFFER(1);
- queue = tmp_queue;
clReleaseCommandQueue(profiling_queue);
free(cpu_dst);
free(cpu_src);
diff --git a/utests/runtime_alloc_host_ptr_buffer.cpp b/utests/runtime_alloc_host_ptr_buffer.cpp
index 793682b..a5a2dda 100644
--- a/utests/runtime_alloc_host_ptr_buffer.cpp
+++ b/utests/runtime_alloc_host_ptr_buffer.cpp
@@ -16,10 +16,10 @@ static void runtime_alloc_host_ptr_buffer(void)
OCL_NDRANGE(1);
// Check result
- uint32_t* mapptr = (uint32_t*)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_READ, 0, n*sizeof(uint32_t), 0, NULL, NULL, NULL);
+ OCL_MAP_BUFFER(0);
for (uint32_t i = 0; i < n; ++i)
- OCL_ASSERT(mapptr[i] == i / 2);
- clEnqueueUnmapMemObject(queue, buf[0], mapptr, 0, NULL, NULL);
+ OCL_ASSERT(((int*)buf_data[0])[i] == (int)i / 2);
+ OCL_UNMAP_BUFFER(0);
}
MAKE_UTEST_FROM_FUNCTION(runtime_alloc_host_ptr_buffer);
diff --git a/utests/runtime_barrier_list.cpp b/utests/runtime_barrier_list.cpp
index 135996f..3b8d3c3 100644
--- a/utests/runtime_barrier_list.cpp
+++ b/utests/runtime_barrier_list.cpp
@@ -65,7 +65,6 @@ void runtime_barrier_list(void)
for (uint32_t i = 0; i < n; ++i) {
OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
}
- clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
clReleaseEvent(ev[i]);
diff --git a/utests/runtime_climage_from_boname.cpp b/utests/runtime_climage_from_boname.cpp
index 4e7f06a..2160886 100644
--- a/utests/runtime_climage_from_boname.cpp
+++ b/utests/runtime_climage_from_boname.cpp
@@ -22,6 +22,9 @@ extern "C"
#include <X11/extensions/extutil.h>
}
+typedef cl_mem (OCLCREATEIMAGEFROMLIBVAINTEL)(cl_context, const cl_libva_image *, cl_int *);
+OCLCREATEIMAGEFROMLIBVAINTEL *oclCreateImageFromLibvaIntel = NULL;
+
// part of following code is copy from beignet/src/x11/
typedef struct {
CARD8 reqType;
@@ -151,7 +154,16 @@ void runtime_climage_from_boname(void)
imageParam.height = h - hStart;
imageParam.row_pitch = w;
- cl_mem dst = clCreateImageFromLibvaIntel(ctx, &imageParam, NULL);
+#ifdef CL_VERSION_1_2
+ oclCreateImageFromLibvaIntel = (OCLCREATEIMAGEFROMLIBVAINTEL *)clGetExtensionFunctionAddressForPlatform(platform, "clCreateImageFromLibvaIntel");
+#else
+ oclCreateImageFromLibvaIntel = (OCLCREATEIMAGEFROMLIBVAINTEL *)clGetExtensionFunctionAddress("clCreateImageFromLibvaIntel");
+#endif
+ if(!oclCreateImageFromLibvaIntel){
+ fprintf(stderr, "Failed to get extension clCreateImageFromLibvaIntel\n");
+ exit(1);
+ }
+ cl_mem dst = oclCreateImageFromLibvaIntel(ctx, &imageParam, NULL);
// Run the kernel
OCL_SET_ARG(0, sizeof(cl_mem), &dst);
diff --git a/utests/runtime_cmrt.cpp b/utests/runtime_cmrt.cpp
new file mode 100644
index 0000000..837f09a
--- /dev/null
+++ b/utests/runtime_cmrt.cpp
@@ -0,0 +1,274 @@
+/*
+this test case shows how to execute a CM kernel via OpenCL APIs.
+the CM kernel source code is already compiled into file "cmrt_utest_genx.isa" with offline compiler.
+
+I also copied the CM kernel source code and CM host source code here for your reference.
+
+CM kernel source code:
+#include <cm/cm.h>
+extern "C" _GENX_MAIN_ void
+simplemov(SurfaceIndex ibuf, SurfaceIndex obuf, uint d)
+{
+ matrix<uchar, 1, 4> in;
+ matrix<uchar, 1, 4> out;
+
+ uint h_pos = get_thread_origin_x();
+ uint v_pos = get_thread_origin_y();
+
+ read(ibuf, h_pos*4, v_pos, in);
+
+ out = in / d;
+ write(obuf, h_pos*4, v_pos, out);
+}
+
+CM host source code:
+#include "cm_rt.h"
+
+int main()
+{
+ FILE* pISA = fopen("cmrt_utest_genx.isa", "rb");
+ if (pISA == NULL) {
+ perror("cmrt_utest_genx.isa");
+ return -1;
+ }
+
+ fseek (pISA, 0, SEEK_END);
+ int codeSize = ftell (pISA);
+ rewind(pISA);
+
+ if(codeSize == 0)
+ {
+ perror("cmrt_utest_genx.isa");
+ return -1;
+ }
+
+ void *pCommonISACode = (BYTE*) malloc(codeSize);
+ if( !pCommonISACode )
+ {
+ return -1;
+ }
+
+ if (fread(pCommonISACode, 1, codeSize, pISA) != codeSize) {
+ perror("cmrt_utest_genx.isa");
+ return -1;
+ }
+ fclose(pISA);
+
+ unsigned int width = 256;
+ unsigned int height = 128;
+
+ unsigned char *src;
+ unsigned char *dst;
+ src = (unsigned char*) malloc(width*height*4);
+ dst = (unsigned char*) malloc(width*height*4);
+
+ for (unsigned int i = 0; i < width*height*4; i++) {
+ src[i] = i % 256;
+ dst[i] = 0;
+ }
+
+ CmDevice* pCmDev = NULL;;
+ UINT version = 0;
+
+ int result = CreateCmDevice( pCmDev, version );
+ if (result != CM_SUCCESS ) {
+ printf("CmDevice creation error");
+ return -1;
+ }
+ if( version < CM_1_0 ){
+ printf(" The runtime API version is later than runtime DLL version");
+ return -1;
+ }
+
+ CmProgram* program = NULL;
+ result = pCmDev->LoadProgram(pCommonISACode, codeSize, program);
+ if (result != CM_SUCCESS ) {
+ perror("CM LoadProgram error");
+ return -1;
+ }
+
+ CmKernel* kernel = NULL;
+ result = pCmDev->CreateKernel(program, CM_KERNEL_FUNCTION(simplemov) , kernel);
+ if (result != CM_SUCCESS ) {
+ perror("CM CreateKernel error");
+ return -1;
+ }
+
+ CmSurface2D* pInputSurf = NULL;
+ result = pCmDev->CreateSurface2D( width, height, CM_SURFACE_FORMAT_A8R8G8B8, pInputSurf );
+ if (result != CM_SUCCESS ) {
+ printf("CM CreateSurface2D error");
+ return -1;
+ }
+
+ CmSurface2D* pOutputSurf = NULL;
+ result = pCmDev->CreateSurface2D( width, height, CM_SURFACE_FORMAT_A8R8G8B8, pOutputSurf );
+ if (result != CM_SUCCESS ) {
+ printf("CM CreateSurface2D error");
+ return -1;
+ }
+
+ result = pInputSurf->WriteSurface( src, NULL );
+ if (result != CM_SUCCESS ) {
+ printf("CM WriteSurface error");
+ return -1;
+ }
+
+ kernel->SetThreadCount( width * height );
+
+ CmThreadSpace* pTS = NULL;
+ result = pCmDev->CreateThreadSpace(width, height, pTS);
+ if (result != CM_SUCCESS ) {
+ printf("CM WriteSurface error");
+ return -1;
+ }
+
+ SurfaceIndex * index0= NULL;
+ pInputSurf->GetIndex(index0);
+ kernel->SetKernelArg(0,sizeof(SurfaceIndex),index0);
+
+ SurfaceIndex * index1= NULL;
+ pOutputSurf->GetIndex(index1);
+ kernel->SetKernelArg(1,sizeof(SurfaceIndex),index1);
+
+ unsigned int d = 3;
+ kernel->SetKernelArg(2, sizeof(unsigned int), &d);
+
+ CmQueue* pCmQueue = NULL;
+ result = pCmDev->CreateQueue( pCmQueue );
+ if (result != CM_SUCCESS ) {
+ perror("CM CreateQueue error");
+ return -1;
+ }
+
+ CmTask *pKernelArray = NULL;
+
+ result = pCmDev->CreateTask(pKernelArray);
+ if (result != CM_SUCCESS ) {
+ printf("CmDevice CreateTask error");
+ return -1;
+ }
+
+ result = pKernelArray-> AddKernel (kernel);
+ if (result != CM_SUCCESS ) {
+ printf("CmDevice AddKernel error");
+ return -1;
+ }
+
+ CmEvent* e = NULL;
+ result = pCmQueue->Enqueue(pKernelArray, e, pTS);
+ if (result != CM_SUCCESS ) {
+ printf("CmDevice enqueue error");
+ return -1;
+ }
+
+ pCmDev->DestroyTask(pKernelArray);
+ result = pOutputSurf->ReadSurface( dst, e );
+ if (result != CM_SUCCESS ) {
+ printf("CM ReadSurface error");
+ return -1;
+ }
+
+ for (unsigned int i = 0; i < width*height*4; i++) {
+ if (src[i] / d != dst[i]) {
+ printf("test failed at %d, expected %d, got %d\n", i, src[i]/d, dst[i]);
+ return -1;
+ }
+ }
+
+ printf("test passed!\n");
+
+ result = DestroyCmDevice( pCmDev );
+
+ free(pCommonISACode);
+ free(src);
+ free(dst);
+
+ return 0;
+}
+
+*/
+
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+#include <string.h>
+
+void runtime_cmrt(void)
+{
+ uint32_t w = 256;
+ uint32_t h = 128;
+ cl_int status;
+ cl_int binary_status;
+ char *ker_path = NULL;
+
+ cl_file_map_t *fm = cl_file_map_new();
+ ker_path = cl_do_kiss_path("cmrt_utest_genx.isa", NULL);
+ OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
+
+ const unsigned char *kbin = (const unsigned char *)cl_file_map_begin(fm);
+ const size_t sz = cl_file_map_size(fm);
+
+ program = clCreateProgramWithBinary(ctx, 1,
+ &device, &sz, &kbin, &binary_status, &status);
+
+ OCL_ASSERT(program && status == CL_SUCCESS);
+
+ /* OCL requires to build the program even if it is created from a binary */
+ OCL_ASSERT(clBuildProgram(program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+ kernel = clCreateKernel(program, "simplemov", &status);
+ OCL_ASSERT(status == CL_SUCCESS);
+
+
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_BGRA;
+ format.image_channel_data_type = CL_UNORM_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = 0;
+
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ uint8_t* src = (uint8_t*)buf_data[0];
+ uint8_t* dst = (uint8_t*)buf_data[1];
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w*4; i++) {
+ src[j * w * 4 + i] = i;
+ dst[j * w * 4 + i] = 0;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ unsigned int d = 3;
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(unsigned int), &d);
+ globals[0] = w;
+ globals[1] = h;
+
+ //if kernel uses get_origin_thread_x/y, locals must be NULL to invoke pCmQueue->Enqueue
+ //if kernel uses cm_linear_global_id, locals must be not NULL to invoke pCmQueue->EnqueueWithGroup
+ OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, globals, NULL, 0, NULL, NULL);
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ src = (uint8_t*)buf_data[0];
+ dst = (uint8_t*)buf_data[1];
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w*4; i++) {
+ OCL_ASSERT(src[j * w * 4 + i] / d == dst[j * w * 4 + i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_cmrt);
diff --git a/utests/runtime_compile_link.cpp b/utests/runtime_compile_link.cpp
index 4a39b6a..74eab2f 100644
--- a/utests/runtime_compile_link.cpp
+++ b/utests/runtime_compile_link.cpp
@@ -12,6 +12,9 @@ int init_program(const char* name, cl_context ctx, cl_program *pg )
char* ker_path = cl_do_kiss_path(name, device);
cl_file_map_t *fm = cl_file_map_new();
+ if(!fm)
+ return CL_FALSE;
+
err = cl_file_map_open(fm, ker_path);
if(err != CL_FILE_MAP_SUCCESS)
OCL_ASSERT(0);
@@ -20,7 +23,7 @@ int init_program(const char* name, cl_context ctx, cl_program *pg )
*pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
free(ker_path);
cl_file_map_delete(fm);
- return 0;
+ return CL_SUCCESS;
}
@@ -31,15 +34,18 @@ void runtime_compile_link(void)
const char* header_file_name="runtime_compile_link.h";
cl_program foo_pg;
- init_program(header_file_name, ctx, &foo_pg);
+ err = init_program(header_file_name, ctx, &foo_pg);
+ OCL_ASSERT(err==CL_SUCCESS);
const char* myinc_file_name="include/runtime_compile_link_inc.h";
cl_program myinc_pg;
- init_program(myinc_file_name, ctx, &myinc_pg);
+ err = init_program(myinc_file_name, ctx, &myinc_pg);
+ OCL_ASSERT(err==CL_SUCCESS);
const char* file_name_A="runtime_compile_link_a.cl";
cl_program program_A;
- init_program(file_name_A, ctx, &program_A);
+ err = init_program(file_name_A, ctx, &program_A);
+ OCL_ASSERT(err==CL_SUCCESS);
cl_program input_headers[2] = { foo_pg, myinc_pg};
const char * input_header_names[2] = {header_file_name, myinc_file_name};
@@ -55,7 +61,8 @@ void runtime_compile_link(void)
OCL_ASSERT(err==CL_SUCCESS);
const char* file_name_B="runtime_compile_link_b.cl";
cl_program program_B;
- init_program(file_name_B, ctx, &program_B);
+ err = init_program(file_name_B, ctx, &program_B);
+ OCL_ASSERT(err==CL_SUCCESS);
err = clCompileProgram(program_B,
0, NULL, // num_devices & device_list
@@ -157,6 +164,14 @@ void runtime_compile_link(void)
}
OCL_UNMAP_BUFFER(2);
OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+ clReleaseProgram(foo_pg);
+ clReleaseProgram(myinc_pg);
+ clReleaseProgram(program_A);
+ clReleaseProgram(program_B);
+ clReleaseProgram(linked_program);
+ clReleaseProgram(new_linked_program);
+ clReleaseProgram(program_with_binary);
+ free(binary);
}
MAKE_UTEST_FROM_FUNCTION(runtime_compile_link);
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
index f8170a3..00e02f1 100644
--- a/utests/runtime_event.cpp
+++ b/utests/runtime_event.cpp
@@ -50,7 +50,6 @@ void runtime_event(void)
for (uint32_t i = 0; i < n; ++i) {
OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
}
- clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
clReleaseEvent(ev[i]);
diff --git a/utests/runtime_flat_address_space.cpp b/utests/runtime_flat_address_space.cpp
index 9b8bece..cf94cf5 100644
--- a/utests/runtime_flat_address_space.cpp
+++ b/utests/runtime_flat_address_space.cpp
@@ -53,21 +53,19 @@ main(int argc, char *argv[])
NULL);
// Be sure that everything run fine
- dst_buffer = (int *) clMapBufferIntel(dst[j], &status);
+ dst_buffer = (int *)clEnqueueMapBuffer(queue, dst[j], CL_TRUE, CL_MAP_READ, 0, sizeof(int)*n, 0, NULL, NULL, &status);
if (status != CL_SUCCESS)
goto error;
- for (uint32_t i = 0; i < n; ++i)
+ for (uint32_t i = 0; dst_buffer && i < n; ++i)
if (dst_buffer[i] != int(i)) {
fprintf(stderr, "run-time flat address space failed\n");
exit(-1);
}
- OCL_CALL (clUnmapBufferIntel, dst[j]);
+ clEnqueueUnmapMemObject(queue, dst[j], dst_buffer, 0, NULL, NULL);
}
for (uint32_t j = 0; j < 24; ++j) OCL_CALL (clReleaseMemObject, dst[j]);
cl_test_destroy();
- printf("%i memory leaks\n", clReportUnfreedIntel());
- assert(clReportUnfreedIntel() == 0);
error:
return status;
diff --git a/utests/runtime_marker_list.cpp b/utests/runtime_marker_list.cpp
index f64b1d1..751f4a0 100644
--- a/utests/runtime_marker_list.cpp
+++ b/utests/runtime_marker_list.cpp
@@ -65,7 +65,6 @@ void runtime_marker_list(void)
for (uint32_t i = 0; i < n; ++i) {
OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
}
- clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
clReleaseEvent(ev[i]);
diff --git a/utests/runtime_use_host_ptr_image.cpp b/utests/runtime_use_host_ptr_image.cpp
new file mode 100644
index 0000000..4a30e89
--- /dev/null
+++ b/utests/runtime_use_host_ptr_image.cpp
@@ -0,0 +1,76 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+static void runtime_use_host_ptr_image(void)
+{
+ const size_t w = 512;
+ const size_t h = 512;
+
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNORM_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+
+ size_t alignment = 4096; //page size
+ if (cl_check_beignet())
+ alignment = 64; //cacheline size, beignet has a loose limitation to enable userptr
+
+ //src image
+ int ret = posix_memalign(&buf_data[0], alignment, sizeof(uint32_t) * w * h);
+ OCL_ASSERT(ret == 0);
+ for (size_t i = 0; i < w*h; ++i)
+ ((uint32_t*)buf_data[0])[i] = i;
+
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_USE_HOST_PTR, &format, &desc, buf_data[0]);
+
+ //dst image
+ ret = posix_memalign(&buf_data[1], alignment, sizeof(uint32_t) * w * h);
+ OCL_ASSERT(ret == 0);
+ for (size_t i = 0; i < w*h; ++i)
+ ((uint32_t*)buf_data[1])[i] = 0;
+
+ OCL_CREATE_IMAGE(buf[1], CL_MEM_USE_HOST_PTR, &format, &desc, buf_data[1]);
+
+ OCL_CREATE_KERNEL("runtime_use_host_ptr_image");
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = w;
+ globals[1] = h;
+ locals[0] = 16;
+ locals[1] = 16;
+ OCL_NDRANGE(2);
+
+ // Check result
+ size_t origin[3];
+ origin[0] = 0;
+ origin[1] = 0;
+ origin[2] = 0;
+ size_t region[3];
+ region[0] = w;
+ region[1] = h;
+ region[2] = 1;
+ size_t pitch = 0;
+ void* mapptr = (int*)clEnqueueMapImage(queue, buf[1], CL_TRUE, CL_MAP_READ, origin, region, &pitch, NULL, 0, NULL, NULL, NULL);
+ OCL_ASSERT(mapptr == buf_data[1]);
+ for (uint32_t i = 0; i < w*h; ++i) {
+ //printf("%d: src: 0x%x, dst: 0x%x\n", i, ((uint32_t*)buf_data[0])[i], ((uint32_t*)buf_data[1])[i]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+ }
+ clEnqueueUnmapMemObject(queue, buf[1], mapptr, 0, NULL, NULL);
+
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+ free(buf_data[1]);
+ buf_data[1] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_use_host_ptr_image);
diff --git a/utests/sub_buffer.cpp b/utests/sub_buffer.cpp
index 6228034..04cfee7 100644
--- a/utests/sub_buffer.cpp
+++ b/utests/sub_buffer.cpp
@@ -116,7 +116,7 @@ void sub_buffer_check(void)
#endif
for (int i = 0; i < 32; i++) {
- if (((char *)mapped_ptr)[i] != sub_buf_content[i]) {
+ if (mapped_ptr && ((char *)mapped_ptr)[i] != sub_buf_content[i]) {
printf ("different index is %d\n", i);
OCL_ASSERT(0);
}
diff --git a/utests/test_printf.cpp b/utests/test_printf.cpp
index 3601574..84c3fae 100644
--- a/utests/test_printf.cpp
+++ b/utests/test_printf.cpp
@@ -16,3 +16,57 @@ void test_printf(void)
}
MAKE_UTEST_FROM_FUNCTION(test_printf);
+
+void test_printf_1(void)
+{
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("test_printf", "test_printf_1");
+ globals[0] = 1;
+ locals[0] = 1;
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_printf_1);
+
+void test_printf_2(void)
+{
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("test_printf", "test_printf_2");
+ globals[0] = 4;
+ locals[0] = 2;
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_printf_2);
+
+void test_printf_3(void)
+{
+ char c = '@';
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("test_printf", "test_printf_3");
+ globals[0] = 1;
+ locals[0] = 1;
+ OCL_SET_ARG(0, sizeof(char), &c);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_printf_3);
+
+void test_printf_4(void)
+{
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("test_printf", "test_printf_4");
+ globals[0] = 1;
+ locals[0] = 1;
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_printf_4);
diff --git a/utests/utest.cpp b/utests/utest.cpp
index 0a03d8b..336fe67 100644
--- a/utests/utest.cpp
+++ b/utests/utest.cpp
@@ -31,6 +31,12 @@
#include <cstring>
#include <stdlib.h>
#include <csignal>
+#include <algorithm>
+#include <random>
+#include <chrono>
+#include <iterator>
+#include <semaphore.h>
+#include <unistd.h>
struct signalMap
{
@@ -39,7 +45,9 @@ struct signalMap
};
using namespace std;
+sem_t tag;
vector<UTest> *UTest::utestList = NULL;
+vector<int> v;
// Initialize and declare statistics struct
RStatistics UTest::retStatistics;
@@ -61,7 +69,7 @@ void runSummaryAtExit(void) {
void signalHandler( int signum )
{
- const char* name = NULL;
+ const char* name = "";
signalMap arr[] = {
{"SIGILL", SIGILL},
@@ -105,6 +113,31 @@ void catch_signal(void){
perror("Could not set signal handler");
}
}
+void *multithread(void * arg)
+{
+ int SerialNumber;
+ //size_t PhtreadNumber = (size_t)arg;
+
+ while(! v.empty()){
+ sem_wait(&tag);
+
+ SerialNumber = v.back();
+ v.pop_back();
+
+ sem_post(&tag);
+
+ const UTest &utest = (*UTest::utestList)[SerialNumber];
+ if (utest.fn == NULL || utest.haveIssue || utest.isBenchMark) continue;
+ // printf("thread%lu %d, utests.name is %s\n",PhtreadNumber, SerialNumber,utest.name);
+
+ UTest::do_run(utest);
+ cl_kernel_destroy(true);
+ cl_buffer_destroy();
+ }
+
+ return 0;
+}
+
UTest::UTest(Function fn, const char *name, bool isBenchMark, bool haveIssue, bool needDestroyProgram)
: fn(fn), name(name), isBenchMark(isBenchMark), haveIssue(haveIssue), needDestroyProgram(needDestroyProgram) {
@@ -148,6 +181,34 @@ void UTest::run(const char *name) {
}
}
+void UTest::runMultiThread(const char *number) {
+ if (number == NULL) return;
+ if (utestList == NULL) return;
+
+ unsigned long i, num;
+ sem_init(&tag, 0, 1);
+
+ num = atoi(number);
+
+ unsigned long max_num = sysconf(_SC_NPROCESSORS_ONLN);
+
+ if(num < 1 || num > max_num){
+ printf("the value range of multi-thread is [1 - %lu]",max_num);
+ return;
+ }
+
+ for(i = 0; i < utestList->size(); ++i) v.push_back (i);
+ unsigned seed = chrono::system_clock::now ().time_since_epoch ().count ();
+ shuffle (v.begin (), v.end (), std::default_random_engine (seed));
+
+ pthread_t pthread_arry[num];
+
+ for(i=0; i<num;i++) pthread_create(&pthread_arry[i], NULL, multithread, (void *)i);
+ for(i=0; i<num;i++) pthread_join(pthread_arry[i], NULL);
+
+ sem_destroy(&tag);
+}
+
void UTest::runAll(void) {
if (utestList == NULL) return;
@@ -187,9 +248,30 @@ void UTest::runAllBenchMark(void) {
void UTest::listAllCases()
{
if (utestList == NULL) return;
- for (size_t i = 0; i < utestList->size(); ++i) {
- const UTest &utest = (*utestList)[i];
- if (utest.fn == NULL) continue;
+ for (size_t i = 0; i < utestList->size(); ++i) {
+ const UTest &utest = (*utestList)[i];
+ if (utest.fn == NULL)
+ continue;
+ std::cout << utest.name << std::endl;
+ }
+}
+void UTest::listCasesCanRun()
+{
+ if (utestList == NULL) return;
+ for (size_t i = 0; i < utestList->size(); ++i) {
+ const UTest &utest = (*utestList)[i];
+ if (utest.fn == NULL || utest.haveIssue || utest.isBenchMark)
+ continue;
std::cout << utest.name << std::endl;
- }
+ }
+}
+void UTest::listCasesWithIssue()
+{
+ if (utestList == NULL) return;
+ for (size_t i = 0; i < utestList->size(); ++i) {
+ const UTest &utest = (*utestList)[i];
+ if (utest.fn == NULL || !utest.haveIssue || utest.isBenchMark)
+ continue;
+ std::cout << utest.name << std::endl;
+ }
}
diff --git a/utests/utest.hpp b/utests/utest.hpp
index 7ae8b87..ca233aa 100644
--- a/utests/utest.hpp
+++ b/utests/utest.hpp
@@ -54,6 +54,8 @@ struct UTest
Function fn;
/*! Name of the test */
const char *name;
+ /*! numbers of the jobs */
+ const char *number;
/*! whether it is a bench mark. */
bool isBenchMark;
/*! Indicate whether current test cases has issue to be fixes */
@@ -64,6 +66,8 @@ struct UTest
static std::vector<UTest> *utestList;
/*! Run the test with the given name */
static void run(const char *name);
+ /*! Run the test with the given name */
+ static void runMultiThread(const char *number);
/*! Run all the tests without known issue*/
static void runAllNoIssue(void);
/*! Run all the benchmark. */
@@ -72,6 +76,10 @@ struct UTest
static void runAll(void);
/*! List all test cases */
static void listAllCases(void);
+ /*! List test cases that can run*/
+ static void listCasesCanRun(void);
+ /*! List test cases with issue*/
+ static void listCasesWithIssue(void);
/*! Statistics struct */
static RStatistics retStatistics;
/*! Do run a test case actually */
@@ -94,15 +102,15 @@ struct UTest
/*! Register a test case which has issue to be fixed */
#define MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(FN) \
static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
- static const UTest __##FN##__(__ANON__##FN##__, #FN, true);
+ static const UTest __##FN##__(__ANON__##FN##__, #FN, false ,true);
/*! Turn a function into a unit performance test */
-#define MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(FN, KEEP_PROGRAM) \
- static void __ANON__##FN##__(void) { BENCHMARK(FN()); } \
+#define MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(FN, KEEP_PROGRAM, ...) \
+ static void __ANON__##FN##__(void) { BENCHMARK(FN(), __VA_ARGS__); } \
static const UTest __##FN##__(__ANON__##FN##__, #FN, true, false, !(KEEP_PROGRAM));
-#define MAKE_BENCHMARK_FROM_FUNCTION(FN) \
- static void __ANON__##FN##__(void) { BENCHMARK(FN()); } \
+#define MAKE_BENCHMARK_FROM_FUNCTION(FN, ...) \
+ static void __ANON__##FN##__(void) { BENCHMARK(FN(), __VA_ARGS__); } \
static const UTest __##FN##__(__ANON__##FN##__, #FN, true);
@@ -134,12 +142,12 @@ struct UTest
} \
} while (0)
-#define BENCHMARK(EXPR) \
+#define BENCHMARK(EXPR, ...) \
do { \
double ret = 0;\
try { \
ret = EXPR; \
- std::cout << " [Result: " << std::fixed<< std::setprecision(3) << ret << " GB/S] [SUCCESS]" << std::endl; \
+ std::cout << " [Result: " << std::fixed<< std::setprecision(3) << ret << " " << __VA_ARGS__ << "] [SUCCESS]" << std::endl; \
UTest::retStatistics.passCount += 1; \
} \
catch (Exception e) { \
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index 91cc938..2c02ad6 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -1,11 +1,11 @@
#!/usr/bin/python
from __future__ import print_function
-import os,sys,re
+import os,sys,re,string
FLT_MAX_POSI='0x1.fffffep127f'
FLT_MIN_NEGA='-0x1.fffffep127f'
-FLT_MIN_POSI='0x1.0p-126f'
-FLT_MAX_NEGA='-0x1.0p-126f'
+FLT_MIN_POSI='ldexpf(1.0, -126)'
+FLT_MAX_NEGA='ldexpf(-1.0, -126)'
paraTypeList={'float':'%e','int':'%d','double':'%lf','uint':'%d','string':'%s'}
@@ -112,10 +112,10 @@ def udebug(ulpSize,returnType,function):
ULPSIZE_FACTOR = select_ulpsize(ULPSIZE_FAST_MATH,ULPSIZE_NO_FAST_MATH);
bool fast_math = ULPSIZE_FACTOR == ULPSIZE_FAST_MATH;
- if (isinf(cpu_data[index])){
+ if (std::isinf(cpu_data[index])){
INFORNAN="INF";
}
- else if (isnan(cpu_data[index])){
+ else if (std::isnan(cpu_data[index])){
INFORNAN="NAN";
}
else{
@@ -124,14 +124,14 @@ def udebug(ulpSize,returnType,function):
}
#if udebug
- if (isinf(cpu_data[index])){
- if (isinf(gpu_data[index]))
+ if (std::isinf(cpu_data[index])){
+ if (std::isinf(gpu_data[index]))
printf("%s expect:%s\\n", log, INFORNAN);
else
printf_c("%s expect:%s\\n", log, INFORNAN);
}
- else if (isnan(cpu_data[index])){
- if (isnan(gpu_data[index]))
+ else if (std::isnan(cpu_data[index])){
+ if (std::isnan(gpu_data[index]))
printf("%s expect:%s\\n", log, INFORNAN);
else
printf_c("%s expect:%s\\n", log, INFORNAN);
@@ -142,13 +142,13 @@ def udebug(ulpSize,returnType,function):
else
printf_c("%s expect:%s\\n", log, ULPSIZE);
#else
- if (isinf(cpu_data[index])){
+ if (std::isinf(cpu_data[index])){
sprintf(log, "%s expect:%s\\n", log, INFORNAN);
- OCL_ASSERTM(isinf(gpu_data[index]) || fast_math,log);
+ OCL_ASSERTM(std::isinf(gpu_data[index]) || fast_math,log);
}
- else if (isnan(cpu_data[index])){
+ else if (std::isnan(cpu_data[index])){
sprintf(log, "%s expect:%s\\n", log, INFORNAN);
- OCL_ASSERTM(isnan(gpu_data[index]) || fast_math,log);
+ OCL_ASSERTM(std::isnan(gpu_data[index]) || fast_math,log);
}
else{
sprintf(log, "%s expect:%s\\n", log, ULPSIZE);
@@ -247,7 +247,7 @@ which can print more values and information to assist debuging the issue.
def argvector(self,paraN,index):
vector=re.findall(r"[0-9]+",self.inputtype[paraN][index])
if vector:
- vector=vector[0]
+ vector=string.atoi(vector[0])
else:
vector=1
return vector
@@ -272,10 +272,17 @@ which can print more values and information to assist debuging the issue.
#####Cpu values analyse
def GenInputValues(self,index):
#namesuffix=self.inputtype[0][index]
+ vlen = self.argvector(self.inputtype.__len__()-1,index)
for i in range(0,self.values.__len__()):
- self.cpplines += [ "const %s input_data%d[] = {%s};" %(self.argtype(i,index),i+1,str(self.values[i]).strip('[]').replace('\'','')) ]
+ vals = []
+ for j in range(0, vlen):
+ if (len(vals) >= 128): #avoid too many data
+ vals = vals[0:128]
+ break
+ vals += self.values[i]
+ self.cpplines += [ "%s input_data%d[] = {%s};" %(self.argtype(i,index),i+1,str(vals).strip('[]').replace('\'','')) ]
self.cpplines += [ "const int count_input = sizeof(input_data1) / sizeof(input_data1[0]);" ]
- self.cpplines += [ "const int vector = %s;\n"%(self.argvector(self.inputtype.__len__()-1,index)) ]
+ self.cpplines += [ "int vector = %s;\n"%(vlen) ]
#####Cpu Function
def GenCpuCompilerMath(self,index):
@@ -340,7 +347,7 @@ static void %s_%s(void)
OCL_CREATE_KERNEL(\"%s_%s\");
OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * sizeof(%s), NULL);
- globals[0] = count_input;
+ globals[0] = count_input / vector;
locals[0] = 1;
'''%(self.fileName,namesuffix,\
self.retType(index),\
@@ -361,11 +368,15 @@ static void %s_%s(void)
funcrun='''
// Run the kernel:
+ //int errRead = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(%s) * count_input, gpu_data, 0, NULL, NULL);
OCL_NDRANGE( 1 );
- clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(%s) * count_input, gpu_data, 0, NULL, NULL);
-'''%(self.inputtype.__len__()+1)
+ OCL_MAP_BUFFER(0);
+'''%(self.argtype(0,index))
funcline += [ funcrun ]
+ text = ''' memcpy(gpu_data, buf_data[0], sizeof(gpu_data)); '''
+ funcline += [ text ]
+
funcsprintfa=' sprintf(log, \"'
funcsprintfb=''
if (self.returnVector(index) == 1 and self.argvector(0,index) != 1):
@@ -418,8 +429,8 @@ static void %s_%s(void)
clhead += ' __global %s *src%d,'%(self.argtype(i,index),i+1)
clvalueDef += ' %s x%d = (%s) ('%(self.inputtype[i][index],i+1,self.inputtype[i][index])
tmp = 'src%d[i * (*vector) + '%(i+1)
- for j in range(0,int(self.argvector(i,index))):
- clvalueDef += tmp + ((int(self.argvector(i-1,index)) == j+1 ) and '%d]);\n'%(j) or '%d],'%(j))
+ for j in range(0,self.argvector(i,index)):
+ clvalueDef += tmp + ((self.argvector(i-1,index) == j+1 ) and '%d]);\n'%(j) or '%d],'%(j))
clcomputer += (self.values.__len__() == i+1) and 'x%d);'%(i+1) or 'x%d,'%(i+1)
clhead += ' __global int *vector) {\n'
@@ -446,6 +457,8 @@ static void %s_%s(void)
#The head:
self.cpplines += [self.Head]
+ self.cpplines += ["namespace {\n"]
+
#Parameters:
self.GenInputValues(i)
@@ -458,6 +471,8 @@ static void %s_%s(void)
#utest function
self.utestFunc(i)
+ self.cpplines += ["}\n"]
+
#kernel cl
self.genCL(i)
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 8f772fd..70a69cc 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -28,6 +28,7 @@
#include <cstring>
#include <cassert>
#include <cmath>
+#include <algorithm>
#define FATAL(...) \
do { \
@@ -46,14 +47,15 @@ do { \
cl_platform_id platform = NULL;
cl_device_id device = NULL;
cl_context ctx = NULL;
-cl_program program = NULL;
-cl_kernel kernel = NULL;
+__thread cl_program program = NULL;
+__thread cl_kernel kernel = NULL;
cl_command_queue queue = NULL;
-cl_mem buf[MAX_BUFFER_N] = {};
-void *buf_data[MAX_BUFFER_N] = {};
-size_t globals[3] = {};
-size_t locals[3] = {};
+__thread cl_mem buf[MAX_BUFFER_N] = {};
+__thread void *buf_data[MAX_BUFFER_N] = {};
+__thread size_t globals[3] = {};
+__thread size_t locals[3] = {};
float ULPSIZE_FAST_MATH = 10000.;
+__attribute__ ((visibility ("internal"))) clGetKernelSubGroupInfoKHR_cb* utestclGetKernelSubGroupInfoKHR = NULL;
#ifdef HAS_EGL
Display *xDisplay;
@@ -209,19 +211,12 @@ clpanic(const char *msg, int rval)
char*
cl_do_kiss_path(const char *file, cl_device_id device)
{
- cl_int ver;
const char *sub_path = NULL;
char *ker_path = NULL;
const char *kiss_path = getenv("OCL_KERNEL_PATH");
size_t sz = strlen(file);
- if (device == NULL)
- sub_path = "";
- else {
- if (clGetGenVersionIntel(device, &ver) != CL_SUCCESS)
- clpanic("Unable to get Gen version", -1);
- sub_path = "";
- }
+ sub_path = "";
if (kiss_path == NULL)
clpanic("set OCL_KERNEL_PATH. This is where the kiss kernels are", -1);
@@ -244,10 +239,14 @@ cl_kernel_init(const char *file_name, const char *kernel_name, int format, const
if (!program || (program && (!prevFileName || strcmp(prevFileName, file_name)))) {
if (program) clReleaseProgram(program);
ker_path = cl_do_kiss_path(file_name, device);
- if (format == LLVM)
- program = clCreateProgramWithLLVMIntel(ctx, 1, &device, ker_path, &status);
- else if (format == SOURCE) {
+ if (format == LLVM) {
+ assert(0);
+ } else if (format == SOURCE) {
cl_file_map_t *fm = cl_file_map_new();
+ if(!fm) {
+ fprintf(stderr, "run out of memory\n");
+ goto error;
+ }
FATAL_IF (cl_file_map_open(fm, ker_path) != CL_FILE_MAP_SUCCESS,
"Failed to open file \"%s\" with kernel \"%s\". Did you properly set OCL_KERNEL_PATH variable?",
file_name, kernel_name);
@@ -286,6 +285,122 @@ error:
goto exit;
}
+int
+cl_kernel_compile(const char *file_name, const char *kernel_name, const char * compile_opt)
+{
+ cl_file_map_t *fm = NULL;
+ char *ker_path = NULL;
+ cl_int status = CL_SUCCESS;
+ static const char *prevFileName = NULL;
+
+ /* Load the program and build it */
+ if (!program || (program && (!prevFileName || strcmp(prevFileName, file_name)))) {
+ if (program) clReleaseProgram(program);
+ ker_path = cl_do_kiss_path(file_name, device);
+ cl_file_map_t *fm = cl_file_map_new();
+ if(!fm) {
+ fprintf(stderr, "run out of memory\n");
+ goto error;
+ }
+ FATAL_IF (cl_file_map_open(fm, ker_path) != CL_FILE_MAP_SUCCESS,
+ "Failed to open file \"%s\" with kernel \"%s\". Did you properly set OCL_KERNEL_PATH variable?",
+ file_name, kernel_name);
+ const char *src = cl_file_map_begin(fm);
+ const size_t sz = cl_file_map_size(fm);
+ program = clCreateProgramWithSource(ctx, 1, &src, &sz, &status);
+ cl_file_map_delete(fm);
+
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateProgramWithSource\n");
+ goto error;
+ }
+ prevFileName = file_name;
+
+ OCL_CALL (clCompileProgram, program,
+ 1, &device, // num_devices & device_list
+ compile_opt, // compile_options
+ 0, // num_input_headers
+ NULL,
+ NULL,
+ NULL, NULL);
+ OCL_ASSERT(status == CL_SUCCESS);
+
+ }
+
+exit:
+ free(ker_path);
+ cl_file_map_delete(fm);
+ return status;
+error:
+ prevFileName = NULL;
+ goto exit;
+}
+
+int
+cl_kernel_link(const char *file_name, const char *kernel_name, const char * link_opt)
+{
+ cl_file_map_t *fm = NULL;
+ char *ker_path = NULL;
+ cl_int status = CL_SUCCESS;
+ static const char *prevFileName = NULL;
+
+ /* Load the program and build it */
+ if (!program || (program && (!prevFileName || strcmp(prevFileName, file_name)))) {
+ if (program) clReleaseProgram(program);
+ ker_path = cl_do_kiss_path(file_name, device);
+ cl_file_map_t *fm = cl_file_map_new();
+ if(!fm) {
+ fprintf(stderr, "run out of memory\n");
+ goto error;
+ }
+ FATAL_IF (cl_file_map_open(fm, ker_path) != CL_FILE_MAP_SUCCESS,
+ "Failed to open file \"%s\" with kernel \"%s\". Did you properly set OCL_KERNEL_PATH variable?",
+ file_name, kernel_name);
+ const char *src = cl_file_map_begin(fm);
+ const size_t sz = cl_file_map_size(fm);
+ program = clCreateProgramWithSource(ctx, 1, &src, &sz, &status);
+ cl_file_map_delete(fm);
+
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateProgramWithSource\n");
+ goto error;
+ }
+ prevFileName = file_name;
+
+ OCL_CALL (clCompileProgram, program,
+ 1, &device, // num_devices & device_list
+ NULL, // compile_options
+ 0, // num_input_headers
+ NULL,
+ NULL,
+ NULL, NULL);
+ OCL_ASSERT(status==CL_SUCCESS);
+ cl_program input_programs[1] = {program};
+ program = clLinkProgram(ctx, 1, &device, link_opt, 1, input_programs, NULL, NULL, &status);
+ OCL_ASSERT(program != NULL);
+ OCL_ASSERT(status == CL_SUCCESS);
+ clReleaseProgram(input_programs[0]);
+ }
+
+ /* Create a kernel from the program */
+ if (kernel)
+ clReleaseKernel(kernel);
+ kernel = clCreateKernel(program, kernel_name, &status);
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateKernel\n");
+ goto error;
+ }
+
+exit:
+ free(ker_path);
+ cl_file_map_delete(fm);
+ return status;
+error:
+ prevFileName = NULL;
+ goto exit;
+}
+
+
#define GET_PLATFORM_STR_INFO(LOWER_NAME, NAME) \
{ \
size_t param_value_size; \
@@ -454,8 +569,6 @@ cl_test_destroy(void)
{
cl_kernel_destroy();
cl_ocl_destroy();
- printf("%i memory leaks\n", clReportUnfreedIntel());
- assert(clReportUnfreedIntel() == 0);
}
void
@@ -464,7 +577,7 @@ cl_buffer_destroy(void)
int i;
for (i = 0; i < MAX_BUFFER_N; ++i) {
if (buf_data[i] != NULL) {
- clUnmapBufferIntel(buf[i]);
+ clEnqueueUnmapMemObject(queue, buf[i], buf_data[i], 0, NULL, NULL);
buf_data[i] = NULL;
}
if (buf[i] != NULL) {
@@ -482,7 +595,7 @@ cl_report_perf_counters(cl_mem perf)
uint32_t i;
if (perf == NULL)
return;
- start = (uint32_t*) clMapBufferIntel(perf, &status);
+ start = (uint32_t*)clEnqueueMapBuffer(queue, perf, CL_TRUE, CL_MAP_READ, 0, 128 * sizeof(uint32_t)/*size*/, 0, NULL, NULL, &status);
assert(status == CL_SUCCESS && start != NULL);
end = start + 128;
@@ -507,7 +620,7 @@ cl_report_perf_counters(cl_mem perf)
}
printf("\n\n");
- clUnmapBufferIntel(perf);
+ clEnqueueUnmapMemObject(queue, perf, start, 0, NULL, NULL);
}
struct bmphdr {
@@ -586,7 +699,15 @@ void cl_write_bmp(const int *data, int width, int height, const char *filename)
{
int x, y;
- FILE *fp = fopen(filename, "wb");
+ FILE *fp = NULL;
+#if defined(__ANDROID__)
+ char dst_img[256];
+ snprintf(dst_img, sizeof(dst_img), "/sdcard/ocl/%s", filename);
+ fp = fopen(dst_img, "wb");
+ if(fp == NULL) return;
+#else
+ fp = fopen(filename, "wb");
+#endif
assert(fp);
char *raw = (char *) malloc(width * height * sizeof(int)); // at most
@@ -714,3 +835,224 @@ float select_ulpsize(float ULPSIZE_FAST_MATH, float ULPSIZE_NO_FAST_MATH)
return ULPSIZE_FACTOR;
}
+
+int cl_check_double(void)
+{
+ std::string extStr;
+ size_t param_value_size;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, ¶m_value_size);
+ std::vector<char> param_value(param_value_size);
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+ param_value.empty() ? NULL : ¶m_value.front(), ¶m_value_size);
+ if (!param_value.empty())
+ extStr = std::string(¶m_value.front(), param_value_size-1);
+
+ if (std::strstr(extStr.c_str(), "cl_khr_fp64") == NULL) {
+ printf("No cl_khr_fp64, Skip!");
+ return 0;
+ }
+
+ return 1;
+}
+
+int cl_check_beignet(void)
+{
+ size_t param_value_size;
+ size_t ret_sz;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_VERSION, 0, 0, ¶m_value_size);
+ if(param_value_size == 0) {
+ return 0;
+ }
+ char* device_version_str = (char* )malloc(param_value_size * sizeof(char) );
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_VERSION, param_value_size, (void*)device_version_str, &ret_sz);
+ OCL_ASSERT(ret_sz == param_value_size);
+
+ if(!strstr(device_version_str, "beignet")) {
+ free(device_version_str);
+ return 0;
+ }
+ free(device_version_str);
+ return 1;
+}
+
+int cl_check_subgroups(void)
+{
+ std::string extStr;
+ size_t param_value_size;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, ¶m_value_size);
+ std::vector<char> param_value(param_value_size);
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+ param_value.empty() ? NULL : ¶m_value.front(), ¶m_value_size);
+ if (!param_value.empty())
+ extStr = std::string(¶m_value.front(), param_value_size-1);
+
+ if (std::strstr(extStr.c_str(), "cl_intel_subgroups") == NULL) {
+ printf("No cl_intel_subgroups, Skip!");
+ return 0;
+ }
+ if(utestclGetKernelSubGroupInfoKHR == NULL)
+ utestclGetKernelSubGroupInfoKHR = (clGetKernelSubGroupInfoKHR_cb*) clGetExtensionFunctionAddress("clGetKernelSubGroupInfoKHR");
+ return 1;
+}
+
+int cl_check_ocl20(void)
+{
+ size_t param_value_size;
+ size_t ret_sz;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_OPENCL_C_VERSION, 0, 0, ¶m_value_size);
+ if(param_value_size == 0) {
+ printf("Not OpenCL 2.0 device, ");
+ if(cl_check_beignet()) {
+ printf("Beignet extension test!");
+ return 1;
+ } else {
+ printf("Not beignet device , Skip!");
+ return 0;
+ }
+ }
+ char* device_version_str = (char* )malloc(param_value_size * sizeof(char) );
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_OPENCL_C_VERSION, param_value_size, (void*)device_version_str, &ret_sz);
+ OCL_ASSERT(ret_sz == param_value_size);
+
+ if(!strstr(device_version_str, "2.0")) {
+ free(device_version_str);
+ printf("Not OpenCL 2.0 device, ");
+ if(cl_check_beignet()) {
+ printf("Beignet extension test!");
+ return 1;
+ } else {
+ printf("Not beignet device , Skip!");
+ return 0;
+ }
+ }
+ free(device_version_str);
+ return 1;
+}
+
+int cl_check_half(void)
+{
+ std::string extStr;
+ size_t param_value_size;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, ¶m_value_size);
+ std::vector<char> param_value(param_value_size);
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+ param_value.empty() ? NULL : ¶m_value.front(), ¶m_value_size);
+ if (!param_value.empty())
+ extStr = std::string(¶m_value.front(), param_value_size-1);
+
+ if (std::strstr(extStr.c_str(), "cl_khr_fp16") == NULL) {
+ printf("No cl_khr_fp16, Skip!");
+ return 0;
+ }
+
+ return 1;
+}
+
+uint32_t __half_to_float(uint16_t h, bool* isInf, bool* infSign)
+{
+ struct __FP32 {
+ uint32_t mantissa:23;
+ uint32_t exponent:8;
+ uint32_t sign:1;
+ };
+ struct __FP16 {
+ uint32_t mantissa:10;
+ uint32_t exponent:5;
+ uint32_t sign:1;
+ };
+ uint32_t f;
+ __FP32 o;
+ memset(&o, 0, sizeof(o));
+ __FP16 i;
+ memcpy(&i, &h, sizeof(uint16_t));
+
+ if (isInf)
+ *isInf = false;
+ if (infSign)
+ *infSign = false;
+
+ if (i.exponent == 0 && i.mantissa == 0) // (Signed) zero
+ o.sign = i.sign;
+ else {
+ if (i.exponent == 0) { // Denormal (converts to normalized)
+ // Adjust mantissa so it's normalized (and keep
+ // track of exponent adjustment)
+ int e = -1;
+ uint m = i.mantissa;
+ do {
+ e++;
+ m <<= 1;
+ } while ((m & 0x400) == 0);
+
+ o.mantissa = (m & 0x3ff) << 13;
+ o.exponent = 127 - 15 - e;
+ o.sign = i.sign;
+ } else if (i.exponent == 0x1f) { // Inf/NaN
+ // NOTE: Both can be handled with same code path
+ // since we just pass through mantissa bits.
+ o.mantissa = i.mantissa << 13;
+ o.exponent = 255;
+ o.sign = i.sign;
+
+ if (isInf) {
+ *isInf = (i.mantissa == 0);
+ if (infSign)
+ *infSign = !i.sign;
+ }
+ } else { // Normalized number
+ o.mantissa = i.mantissa << 13;
+ o.exponent = 127 - 15 + i.exponent;
+ o.sign = i.sign;
+ }
+ }
+
+ memcpy(&f, &o, sizeof(uint32_t));
+ return f;
+}
+
+
+uint16_t __float_to_half(uint32_t x)
+{
+ uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
+ uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
+ unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
+
+ /* If zero, or denormal, or exponent underflows too much for a denormal
+ * half, return signed zero. */
+ if (e < 103)
+ return bits;
+
+ /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
+ if (e > 142) {
+ bits |= 0x7c00u;
+ /* If exponent was 0xff and one mantissa bit was set, it means NaN,
+ * not Inf, so make sure we set one mantissa bit too. */
+ bits |= e == 255 && (x & 0x007fffffu);
+ return bits;
+ }
+
+ /* If exponent underflows but not too much, return a denormal */
+ if (e < 113) {
+ m |= 0x0800u;
+ /* Extra rounding may overflow and set mantissa to 0 and exponent
+ * to 1, which is OK. */
+ bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
+ return bits;
+ }
+
+ bits |= ((e - 112) << 10) | (m >> 1);
+ /* Extra rounding. An overflow will set mantissa to 0 and increment
+ * the exponent, which is OK. */
+ bits += m & 1;
+ return bits;
+}
+uint32_t as_uint(float f) {
+ union uint32_cast _tmp;
+ _tmp._float = f;
+ return _tmp._uint;
+}
+float as_float(uint32_t i) {
+ union uint32_cast _tmp;
+ _tmp._uint = i;
+ return _tmp._float;
+}
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 3b17606..5f2fea6 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -26,6 +26,7 @@
#define __UTEST_HELPER_HPP__
#include "CL/cl.h"
+#include "CL/cl_ext.h"
#include "CL/cl_intel.h"
#include "utest.hpp"
#include "utest_assert.hpp"
@@ -34,6 +35,10 @@
#include <cstdio>
#include <cstdlib>
+#if defined(__ANDROID__)
+#define __thread
+#endif
+
#ifdef HAS_EGL
#define EGL_WINDOW_WIDTH 256
#define EGL_WINDOW_HEIGHT 256
@@ -47,6 +52,11 @@ extern EGLContext eglContext;
extern EGLSurface eglSurface;
#endif
+union uint32_cast {
+ uint32_t _uint;
+ float _float;
+};
+
#define OCL_THROW_ERROR(FN, STATUS) \
do { \
char msg[2048]; \
@@ -122,24 +132,60 @@ extern EGLSurface eglSurface;
#define OCL_CREATE_SAMPLER(SAMPLER, ADDRESS_MODE, FILTER_MODE) \
OCL_CALL2(clCreateSampler, SAMPLER, ctx, 0, ADDRESS_MODE, FILTER_MODE)
+#define OCL_CALL_MAP(FN, ID, RET, ...) \
+ do { \
+ cl_int status; \
+ size_t size = 0; \
+ status = clGetMemObjectInfo(buf[ID], CL_MEM_SIZE, sizeof(size), &size, NULL);\
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ RET = FN(__VA_ARGS__, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, size, 0, NULL, NULL, &status);\
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ } while (0)
+
#define OCL_MAP_BUFFER(ID) \
- OCL_CALL2(clMapBufferIntel, buf_data[ID], buf[ID])
+ OCL_CALL_MAP(clEnqueueMapBuffer, ID, buf_data[ID], queue, buf[ID])
#define OCL_UNMAP_BUFFER(ID) \
do { \
if (buf[ID] != NULL) { \
- OCL_CALL (clUnmapBufferIntel, buf[ID]); \
+ OCL_CALL (clEnqueueUnmapMemObject, queue, buf[ID], buf_data[ID], 0, NULL, NULL); \
buf_data[ID] = NULL; \
} \
} while (0)
+#define OCL_CALL_MAP_GTT(FN, ID, RET, ...) \
+ do { \
+ cl_int status; \
+ size_t image_row_pitch = 0; \
+ status = clGetImageInfo(buf[ID], CL_IMAGE_ROW_PITCH, sizeof(image_row_pitch), &image_row_pitch, NULL);\
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ size_t image_slice_pitch = 0; \
+ status = clGetImageInfo(buf[ID], CL_IMAGE_ROW_PITCH, sizeof(image_slice_pitch), &image_slice_pitch, NULL);\
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ size_t image_width = 0; \
+ status = clGetImageInfo(buf[ID], CL_IMAGE_WIDTH, sizeof(image_width), &image_width, NULL);\
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ size_t image_height = 0; \
+ status = clGetImageInfo(buf[ID], CL_IMAGE_HEIGHT, sizeof(image_height), &image_height, NULL);\
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ size_t image_depth= 0; \
+ status = clGetImageInfo(buf[ID], CL_IMAGE_DEPTH, sizeof(image_depth), &image_depth, NULL);\
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ if(image_depth == 0) image_depth = 1; \
+ if(image_height == 0) image_height = 1; \
+ size_t origin[3] = {0, 0, 0}; \
+ size_t region[3] = {image_width, image_height, image_depth}; \
+ RET = FN(__VA_ARGS__, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, origin, region, &image_row_pitch, &image_slice_pitch, 0, NULL, NULL, &status);\
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ } while (0)
+
#define OCL_MAP_BUFFER_GTT(ID) \
- OCL_CALL2(clMapBufferGTTIntel, buf_data[ID], buf[ID])
+ OCL_CALL_MAP_GTT(clEnqueueMapImage, ID, buf_data[ID], queue, buf[ID])
#define OCL_UNMAP_BUFFER_GTT(ID) \
do { \
if (buf[ID] != NULL) { \
- OCL_CALL (clUnmapBufferGTTIntel, buf[ID]); \
+ OCL_CALL (clEnqueueUnmapMemObject, queue, buf[ID], buf_data[ID], 0, NULL, NULL); \
buf_data[ID] = NULL; \
} \
} while (0)
@@ -158,13 +204,13 @@ enum { MAX_BUFFER_N = 16 };
extern cl_platform_id platform;
extern cl_device_id device;
extern cl_context ctx;
-extern cl_program program;
-extern cl_kernel kernel;
+extern __thread cl_program program;
+extern __thread cl_kernel kernel;
extern cl_command_queue queue;
-extern cl_mem buf[MAX_BUFFER_N];
-extern void* buf_data[MAX_BUFFER_N];
-extern size_t globals[3];
-extern size_t locals[3];
+extern __thread cl_mem buf[MAX_BUFFER_N];
+extern __thread void* buf_data[MAX_BUFFER_N];
+extern __thread size_t globals[3];
+extern __thread size_t locals[3];
extern float ULPSIZE_FAST_MATH;
enum {
@@ -194,6 +240,10 @@ extern int cl_ocl_init(void);
/* Init program and kernel for the test */
extern int cl_kernel_init(const char *file_name,
const char *kernel_name, int format, const char * build_opt);
+extern int cl_kernel_compile(const char *file_name, const char *kernel_name,
+ const char * compile_opt);
+extern int cl_kernel_link(const char *file_name, const char *kernel_name,
+ const char * link_opt);
/* Get the file path */
extern char* cl_do_kiss_path(const char *file, cl_device_id device);
@@ -237,5 +287,30 @@ double time_subtract(struct timeval *y, struct timeval *x, struct timeval *resul
/* check ulpsize */
float select_ulpsize(float ULPSIZE_FAST_MATH, float ULPSIZE_NO_FAST_MATH);
-#endif /* __UTEST_HELPER_HPP__ */
+/* Check is FP64 enabled. */
+extern int cl_check_double(void);
+/* Check is beignet device. */
+extern int cl_check_beignet(void);
+
+/* Check is intel subgroups enabled. */
+extern int cl_check_subgroups(void);
+
+typedef cl_int(clGetKernelSubGroupInfoKHR_cb)(cl_kernel, cl_device_id,
+ cl_kernel_sub_group_info, size_t,
+ const void *, size_t, void *,
+ size_t *);
+extern clGetKernelSubGroupInfoKHR_cb* utestclGetKernelSubGroupInfoKHR;
+
+/* Check is cl version 2.0. */
+extern int cl_check_ocl20(void);
+
+/* Check is FP16 enabled. */
+extern int cl_check_half(void);
+
+/* Helper function for half type numbers */
+extern uint32_t __half_to_float(uint16_t h, bool* isInf = NULL, bool* infSign = NULL);
+extern uint16_t __float_to_half(uint32_t x);
+extern float as_float(uint32_t i);
+extern uint32_t as_uint(float f);
+#endif /* __UTEST_HELPER_HPP__ */
diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
index 83edcc3..a4bfd51 100755
--- a/utests/utest_math_gen.py
+++ b/utests/utest_math_gen.py
@@ -349,16 +349,18 @@ static float atan2pi(float y, float x){
ldexp_input_type2 = ['int','int2','int4','int8','int16']
ldexp_output_type = ['float','float2','float4','float8','float16']
ldexp_cpu_func='''
+namespace utest {
static float ldexp(float x, int y){
return x * exp2(y);
+}
} '''
- ldexpUtests = func('ldexp','ldexp',[ldexp_input_type1,ldexp_input_type2],ldexp_output_type,[ldexp_input_values1,ldexp_input_values2],'0 * FLT_ULP', ldexp_cpu_func)
+ ldexpUtests = func('ldexp','utest::ldexp',[ldexp_input_type1,ldexp_input_type2],ldexp_output_type,[ldexp_input_values1,ldexp_input_values2],'0 * FLT_ULP', ldexp_cpu_func)
##### gentype lgamma(gentype x)
lgamma_input_values = base_input_values
lgamma_input_type = ['float','float2','float4','float8','float16']
lgamma_output_type = ['float','float2','float4','float8','float16']
- lgammaUtests = func('lgamma','lgamma',[lgamma_input_type],lgamma_output_type,[lgamma_input_values],'4 * FLT_ULP')
+ lgammaUtests = func('lgamma','lgamma',[lgamma_input_type],lgamma_output_type,[lgamma_input_values],'16 * FLT_ULP')
##### gentype log(gentype)
log_input_values = base_input_values
@@ -467,14 +469,30 @@ static float pown(float x, int y){
pownUtests = func('pown','pown',[pown_input_type1,pown_input_type2],pown_output_type,[pown_input_values1,pown_input_values2],'16 * FLT_ULP', pown_cpu_func)
##### gentype powr(gentype x, gentype y)
- powr_input_values1 = [80, -80, 3.14, -3.14, 0.5, 1, -1, 0.0,6,1500.24,-1500.24]
- powr_input_values2 = [5,6,7,8,10,11,12,13,14,0,12]
+ powr_input_values1 = [80, -80, 3.14, 1, 1.257, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +1, +1, -80, +0.0, -0.0, +0.0, -0.0, 'INFINITY','INFINITY', +1, +1, +0.0, 2.5,' NAN', 'NAN', 'NAN']
+ powr_input_values2 = [5.5, 6,7, +0.0, -0.0, -1, -15.67, '-INFINITY', '-INFINITY', 1, -2.7, 10.5, 3.1415, 3.5, -0.0, -0.0, +0.0, +0.0, +0.0, -0.0, 'INFINITY', '-INFINITY', 'NAN', 'NAN', -1.5, +0.0, 1.5]
powr_input_type1 = ['float','float2','float4','float8','float16']
powr_input_type2 = ['float','float2','float4','float8','float16']
powr_output_type = ['float','float2','float4','float8','float16']
powr_cpu_func='''
-static float powr(float x, int y){
- if (x<0)
+static float powr(float x, float y){
+ if (((x > 0) && (x != +INFINITY)) && (y == 0.0f))
+ return 1;
+ else if ((x == 0.0f) && ((y < 0 ) || (y == -INFINITY)))
+ return +INFINITY;
+ else if ((x == 0.0f) && (y > 0))
+ return +0;
+ else if ((x == 0.0f) && (y == 0.0f))
+ return NAN;
+ else if ((x == +1) && ((y == +INFINITY) || (y == -INFINITY)))
+ return NAN;
+ else if ((x == +1) && ((y != +INFINITY) && (y != -INFINITY)))
+ return 1;
+ else if ((x == +INFINITY) && (y == 0.0f))
+ return NAN;
+ else if (std::isnan(x) || (x < 0))
+ return NAN;
+ else if ((x >= 0) && (std::isnan(y)))
return NAN;
else
return powf(x,y);
diff --git a/utests/utest_run.cpp b/utests/utest_run.cpp
index 3cc1b6c..1866d9a 100644
--- a/utests/utest_run.cpp
+++ b/utests/utest_run.cpp
@@ -26,12 +26,14 @@
#include "utest_helper.hpp"
#include "utest_exception.hpp"
#include <iostream>
+#include <string.h>
#include <getopt.h>
-static const char *shortopts = "c:lanh";
+static const char *shortopts = "c:j:l::anh";
struct option longopts[] = {
{"casename", required_argument, NULL, 'c'},
-{"list", no_argument, NULL, 'l'},
+{"jobs", required_argument, NULL, 'j'},
+{"list", optional_argument, NULL, 'l'},
{"all", no_argument, NULL, 'a'},
{"allnoissue", no_argument, NULL, 'n'},
{"help", no_argument, NULL, 'h'},
@@ -46,7 +48,8 @@ Usage:\n\
\n\
option:\n\
-c <casename>: run sub-case named 'casename'\n\
- -l : list all the available case name\n\
+ -j <number> : specifies the 'number' of jobs (multi-thread)\n\
+ -l <a/i> : list case name that can run(a for all case, i for case with issue)\n\
-a : run all test cases\n\
-n : run all test cases without known issue (default option)\n\
-h : display this usage\n\
@@ -85,8 +88,32 @@ int main(int argc, char *argv[])
break;
+ case 'j':
+ try {
+#if defined(__ANDROID__)
+ std::cout << "Do not support multithread in android, use single thread instead." << std::endl;
+ UTest::run(optarg);
+#else
+ UTest::runMultiThread(optarg);
+#endif
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
case 'l':
- UTest::listAllCases();
+ if (optarg == NULL)
+ UTest::listCasesCanRun();
+ else if (strcmp(optarg,"a") == 0)
+ UTest::listAllCases();
+ else if (strcmp(optarg,"i") == 0)
+ UTest::listCasesWithIssue();
+ else {
+ usage();
+ exit(1);
+ }
break;
case 'a':
@@ -129,5 +156,6 @@ int main(int argc, char *argv[])
clean:
cl_ocl_destroy();
+ return 0;
}
diff --git a/utests/vload_bench.cpp b/utests/vload_bench.cpp
index ddfaaee..44c1dba 100644
--- a/utests/vload_bench.cpp
+++ b/utests/vload_bench.cpp
@@ -89,7 +89,7 @@ static double vload_bench_ ##kT(void) \
} \
return totBandwidth/j;\
}\
-MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(vload_bench_ ##kT, true)
+MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(vload_bench_ ##kT, true, "GB/S")
#ifdef BUILD_BENCHMARK
VLOAD_BENCH(uint8_t, uchar)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git
More information about the Pkg-opencl-commits
mailing list