[beignet] 01/07: Imported Upstream version 1.1.0

Rebecca Palmer rnpalmer-guest at moszumanska.debian.org
Sun Aug 9 15:34:56 UTC 2015


This is an automated email from the git hooks/post-receive script.

rnpalmer-guest pushed a commit to branch master
in repository beignet.

commit dd62b9e07dfe3339639ecc4a0d6a9903916d5b1c
Author: Rebecca N. Palmer <rebecca_palmer at zoho.com>
Date:   Sun Aug 9 12:48:41 2015 +0100

    Imported Upstream version 1.1.0
---
 CMake/FindLLVM.cmake                               |   16 +-
 CMakeLists.txt                                     |   49 +-
 backend/src/CMakeLists.txt                         |   11 +-
 backend/src/backend/context.cpp                    |    4 +-
 backend/src/backend/gen/gen_mesa_disasm.c          |  253 ++-
 backend/src/backend/gen75_context.cpp              |   17 +-
 backend/src/backend/gen75_encoder.cpp              |   80 +-
 backend/src/backend/gen75_encoder.hpp              |    9 +-
 backend/src/backend/gen7_encoder.cpp               |   15 +-
 backend/src/backend/gen8_context.cpp               | 1002 ++++++++++++
 backend/src/backend/gen8_context.hpp               |   52 +
 backend/src/backend/gen8_encoder.cpp               |  118 +-
 backend/src/backend/gen8_encoder.hpp               |   12 +-
 backend/src/backend/gen8_instruction.hpp           |    2 +
 backend/src/backend/gen9_context.cpp               |   57 +
 .../backend/{gen8_context.cpp => gen9_context.hpp} |   47 +-
 backend/src/backend/gen9_encoder.cpp               |   68 +
 backend/src/backend/gen9_encoder.hpp               |   53 +
 backend/src/backend/gen_context.cpp                |  437 +++++-
 backend/src/backend/gen_context.hpp                |   47 +-
 backend/src/backend/gen_defs.hpp                   |    2 +
 backend/src/backend/gen_encoder.cpp                |  314 +++-
 backend/src/backend/gen_encoder.hpp                |   39 +-
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |    3 +
 backend/src/backend/gen_insn_scheduling.cpp        |   45 +-
 backend/src/backend/gen_insn_selection.cpp         | 1626 +++++++++++++++-----
 backend/src/backend/gen_insn_selection.hpp         |   39 +-
 backend/src/backend/gen_insn_selection.hxx         |    4 +
 backend/src/backend/gen_program.cpp                |   22 +-
 backend/src/backend/gen_reg_allocation.cpp         |   29 +-
 backend/src/backend/gen_register.hpp               |  167 +-
 backend/src/backend/program.cpp                    |   11 +-
 backend/src/backend/program.h                      |    2 +
 backend/src/gbe_bin_generater.cpp                  |    8 +
 backend/src/ir/context.hpp                         |   14 +-
 backend/src/ir/function.cpp                        |   23 +-
 backend/src/ir/function.hpp                        |    2 +
 backend/src/ir/half.cpp                            |  220 +++
 backend/src/ir/half.hpp                            |   64 +
 backend/src/ir/immediate.cpp                       |    5 +-
 backend/src/ir/immediate.hpp                       |   58 +-
 backend/src/ir/instruction.cpp                     |  278 +++-
 backend/src/ir/instruction.hpp                     |   79 +-
 backend/src/ir/instruction.hxx                     |    6 +
 backend/src/ir/liveness.cpp                        |    5 +
 backend/src/ir/lowering.cpp                        |  153 +-
 backend/src/ir/profile.cpp                         |    6 +-
 backend/src/ir/profile.hpp                         |    6 +-
 backend/src/ir/structurizer.cpp                    |  996 ++++++++++++
 backend/src/ir/structurizer.hpp                    |  247 +++
 backend/src/libocl/CMakeLists.txt                  |    4 +-
 backend/src/libocl/include/ocl.h                   |    3 +-
 backend/src/libocl/include/ocl_async.h             |    2 +-
 backend/src/libocl/include/ocl_misc.h              |    8 -
 backend/src/libocl/include/ocl_printf.h            |    3 +
 backend/src/libocl/include/ocl_sync.h              |    2 +-
 backend/src/libocl/include/ocl_types.h             |    4 +-
 backend/src/libocl/include/ocl_workitem.h          |   16 +-
 backend/src/libocl/script/gen_vector.py            |   39 +-
 backend/src/libocl/script/ocl_as.sh                |    2 +-
 backend/src/libocl/script/ocl_common.def           |    6 +
 backend/src/libocl/script/ocl_convert.sh           |   57 +-
 backend/src/libocl/script/ocl_math.def             |   31 +-
 backend/src/libocl/script/ocl_relational.def       |   16 +-
 backend/src/libocl/script/ocl_simd.def             |    4 +
 backend/src/libocl/src/ocl_async.cl                |    2 +-
 backend/src/libocl/src/ocl_barrier.ll              |    2 +-
 backend/src/libocl/src/ocl_clz.ll                  |   62 +
 backend/src/libocl/src/ocl_geometric.cl            |   31 +-
 backend/src/libocl/src/ocl_workitem.cl             |    6 +-
 backend/src/libocl/tmpl/ocl_common.tmpl.cl         |   72 +-
 backend/src/libocl/tmpl/ocl_common.tmpl.h          |   13 +
 backend/src/libocl/tmpl/ocl_defines.tmpl.h         |    2 +
 backend/src/libocl/tmpl/ocl_integer.tmpl.cl        |  110 +-
 backend/src/libocl/tmpl/ocl_integer.tmpl.h         |    9 +
 backend/src/libocl/tmpl/ocl_math.tmpl.cl           |  507 +++++-
 backend/src/libocl/tmpl/ocl_math.tmpl.h            |  102 +-
 backend/src/libocl/tmpl/ocl_relational.tmpl.cl     |   66 +-
 backend/src/libocl/tmpl/ocl_relational.tmpl.h      |   23 +-
 .../ocl_printf.h => tmpl/ocl_simd.tmpl.cl}         |   17 +-
 .../{include/ocl_sync.h => tmpl/ocl_simd.tmpl.h}   |   23 +-
 backend/src/llvm/llvm_bitcode_link.cpp             |    7 +-
 backend/src/llvm/llvm_gen_backend.cpp              | 1242 +++++++++++----
 backend/src/llvm/llvm_gen_backend.hpp              |    3 -
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   26 +-
 backend/src/llvm/llvm_legalize.cpp                 |  704 ---------
 backend/src/llvm/llvm_printf_parser.cpp            |   15 +-
 backend/src/llvm/llvm_scalarize.cpp                |   33 +-
 backend/src/llvm/llvm_to_gen.cpp                   |   15 +-
 benchmark/CMakeLists.txt                           |    6 +-
 benchmark/benchmark_copy_buf.cpp                   |   51 +
 benchmark/benchmark_copy_buffer_to_image.cpp       |   66 +
 benchmark/benchmark_copy_image_to_buffer.cpp       |   64 +
 benchmark/benchmark_read_buffer.cpp                |    6 +-
 benchmark/benchmark_read_image.cpp                 |    6 +-
 benchmark/benchmark_use_host_ptr_buffer.cpp        |    6 +-
 benchmark/enqueue_copy_buf.cpp                     |   47 -
 docs/Beignet.mdwn                                  |   57 +-
 docs/Beignet/Backend.mdwn                          |   11 +-
 docs/NEWS.mdwn                                     |   11 +-
 docs/howto/cross-compiler-howto.mdwn               |   75 +-
 docs/howto/v4l2-buffer-sharing-howto.mdwn          |   64 +
 examples/CMakeLists.txt                            |   35 +-
 .../v4l2_buffer_sharing/v4l2_buffer_sharing.cpp    |  590 +++++++
 kernels/compiler_argument_structure_indirect.cl    |    4 +-
 kernels/compiler_argument_structure_select.cl      |   18 +
 kernels/compiler_async_copy.cl                     |    4 +-
 kernels/compiler_async_stride_copy.cl              |    4 +-
 kernels/compiler_bswap.cl                          |   24 +-
 kernels/compiler_ceil32.spir                       |  Bin 0 -> 1732 bytes
 kernels/compiler_clz.cl                            |   16 +
 kernels/compiler_clz_int.cl                        |    5 -
 kernels/compiler_clz_short.cl                      |    5 -
 kernels/compiler_get_sub_group_id.cl               |    8 +
 kernels/compiler_get_sub_group_size.cl             |    5 +
 kernels/compiler_half.cl                           |   11 +
 kernels/compiler_half_convert.cl                   |   56 +
 kernels/compiler_half_math.cl                      |   28 +
 kernels/compiler_half_relation.cl                  |   10 +
 kernels/compiler_long_div.cl                       |   12 +
 kernels/compiler_long_hi_sat.cl                    |   19 +
 kernels/compiler_long_not.cl                       |    6 +
 kernels/compiler_simd_all.cl                       |   12 -
 kernels/compiler_simd_any.cl                       |   15 -
 kernels/compiler_sub_group_all.cl                  |   12 +
 kernels/compiler_sub_group_any.cl                  |   15 +
 kernels/compiler_sub_group_shuffle.cl              |   18 +
 kernels/runtime_yuy2_processing.cl                 |   15 +
 src/CMakeLists.txt                                 |   19 +-
 src/cl_api.c                                       |   10 +-
 src/cl_command_queue.c                             |   27 +-
 src/cl_command_queue.h                             |    2 -
 src/cl_command_queue_gen7.c                        |    8 +
 src/cl_context.c                                   |   96 +-
 src/cl_context.h                                   |    6 +-
 src/cl_device_data.h                               |   58 +-
 src/cl_device_id.c                                 |  311 +++-
 src/cl_device_id.h                                 |    4 +-
 src/cl_driver.h                                    |   15 +
 src/cl_driver_defs.c                               |    2 +
 src/cl_event.c                                     |   20 +-
 src/cl_extensions.c                                |   70 +-
 src/cl_extensions.h                                |    4 +
 src/cl_gt_device.h                                 |    1 +
 src/cl_image.c                                     |    6 +
 src/cl_mem.c                                       |   62 +-
 src/cl_platform_id.c                               |   30 +-
 src/cl_platform_id.h                               |    4 +-
 src/cl_program.c                                   |   21 +-
 src/cl_program.h                                   |    3 +-
 src/cl_thread.c                                    |   30 +
 src/cl_thread.h                                    |    5 +
 src/intel/intel_defines.h                          |    2 +-
 src/intel/intel_driver.c                           |   94 +-
 src/intel/intel_driver.h                           |    1 +
 src/intel/intel_gpgpu.c                            |  273 +++-
 src/intel/intel_structs.h                          |   61 +-
 .../cl_internal_copy_buffer_to_image_2d_align16.cl |   18 +
 utests/CMakeLists.txt                              |   27 +-
 utests/builtin_pow.cpp                             |   16 +-
 utests/builtin_tgamma.cpp                          |   25 +-
 utests/compiler_argument_structure_indirect.cpp    |    7 +-
 ....cpp => compiler_argument_structure_select.cpp} |   20 +-
 utests/compiler_bswap.cpp                          |  203 ++-
 utests/compiler_clz.cpp                            |  144 ++
 utests/compiler_clz_int.cpp                        |   31 -
 utests/compiler_clz_short.cpp                      |   31 -
 utests/compiler_get_sub_group_id.cpp               |   33 +
 utests/compiler_get_sub_group_size.cpp             |   32 +
 utests/compiler_half.cpp                           |  924 +++++++++++
 utests/compiler_long_div.cpp                       |   88 ++
 utests/compiler_long_hi_sat.cpp                    |  187 +++
 utests/compiler_long_not.cpp                       |   52 +
 ...ler_simd_all.cpp => compiler_sub_group_all.cpp} |    6 +-
 ...ler_simd_any.cpp => compiler_sub_group_any.cpp} |    6 +-
 utests/compiler_sub_group_shuffle.cpp              |   45 +
 utests/load_program_from_spir.cpp                  |   90 ++
 utests/setenv.sh.in                                |    2 +
 utests/utest.hpp                                   |    8 +-
 utests/utest_generator.py                          |   15 +-
 utests/utest_helper.cpp                            |   18 +-
 utests/utest_helper.hpp                            |    6 +-
 utests/utest_math_gen.py                           |   38 +-
 utests/vload_bench.cpp                             |    6 +-
 184 files changed, 12118 insertions(+), 2733 deletions(-)

diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index 2340392..a148321 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -30,22 +30,16 @@ execute_process(
 string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1\\2" LLVM_VERSION_NODOT ${LLVM_VERSION})
 string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1.\\2" LLVM_VERSION_NOPATCH ${LLVM_VERSION})
 
-SET(LLVM_STABLE_VERSION_MAJOR "3")
-SET(LLVM_STABLE_VERSION_MINOR "5")
-SET(LLVM_STABLE_VERSION_NODOT "${LLVM_STABLE_VERSION_MAJOR}${LLVM_STABLE_VERSION_MINOR}")
-SET(LLVM_STABLE_VERSION "${LLVM_STABLE_VERSION_MAJOR}.${LLVM_STABLE_VERSION_MINOR}")
-
 if (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
   SET(LLVM_FIND_VERSION_NODOT "${LLVM_FIND_VERSION_MAJOR}${LLVM_FIND_VERSION_MINOR}")
   if (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
     message(FATAL_ERROR "imcompatible LLVM version ${LLVM_VERSION} required ${LLVM_FIND_VERSION}")
   else (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
-    if (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_STABLE_VERSION_NODOT)
-      message(STATUS "Found stable LLVM version ${LLVM_VERSION}")
-    else (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_STABLE_VERSION_NODOT)
-      message(STATUS "\tWarning: found unstable LLVM version ${LLVM_VERSION}")
-      message(STATUS "\tWarning: Stable version is ${LLVM_STABLE_VERSION}")
-    endif (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_STABLE_VERSION_NODOT)
+    if (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
+      message(STATUS "find stable LLVM version ${LLVM_VERSION}")
+    else (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
+      message(STATUS "find unstable LLVM version ${LLVM_VERSION}")
+    endif (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
     add_definitions("-DLLVM_${LLVM_VERSION_NODOT}")
   endif (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
 endif (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 437f489..9713a32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,8 +17,8 @@ endif ()
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
 set (LIBCL_DRIVER_VERSION_MAJOR 1)
-set (LIBCL_DRIVER_VERSION_MINOR 0)
-set (LIBCL_DRIVER_VERSION_PATCH 3)
+set (LIBCL_DRIVER_VERSION_MINOR 1)
+set (LIBCL_DRIVER_VERSION_PATCH 0)
 set (LIBCL_C_VERSION_MAJOR 1)
 set (LIBCL_C_VERSION_MINOR 2)
 if( ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
@@ -101,6 +101,7 @@ IF (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
   Find_Package(StandaloneGbeCompiler)
 ELSE (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
   # Front end stuff we need
+  #INCLUDE(CMake/FindLLVM.cmake)
   Find_Package(LLVM 3.3)
   SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
 ENDIF (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
@@ -137,6 +138,15 @@ IF(DRM_INTEL_FOUND)
   ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
     MESSAGE(STATUS "Disable userptr support")
   ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+  IF(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
+    MESSAGE(STATUS "Enable EU total query support")
+    SET(DRM_INTEL_EU_TOTAL "enable")
+    MESSAGE(STATUS "Enable subslice total query support")
+    SET(DRM_INTEL_SUBSLICE_TOTAL "enable")
+  ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
+    MESSAGE(STATUS "Disable EU total query support")
+    MESSAGE(STATUS "Disable subslice total query support")
+  ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
 ELSE(DRM_INTEL_FOUND)
   MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
 ENDIF(DRM_INTEL_FOUND)
@@ -207,23 +217,30 @@ IF(BUILD_EXAMPLES)
 IF(NOT X11_FOUND)
   MESSAGE(FATAL_ERROR "XLib is necessary for examples - not found")
 ENDIF(NOT X11_FOUND)
-# libva
-pkg_check_modules(LIBVA REQUIRED libva>=0.36.0)
-IF(LIBVA_FOUND)
+# libva & libva-x11
+#pkg_check_modules(LIBVA REQUIRED libva>=0.36.0)
+pkg_check_modules(LIBVA REQUIRED libva)
+pkg_check_modules(LIBVA-X11 REQUIRED libva-x11)
+set(LIBVA_BUF_SH_DEP false)
+set(V4L2_BUF_SH_DEP false)
+IF(LIBVA_FOUND AND LIBVA-X11_FOUND)
   MESSAGE(STATUS "Looking for LIBVA - found at ${LIBVA_PREFIX} ${LIBVA_VERSION}")
-  INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS})
-ELSE(LIBVA_FOUND)
-  MESSAGE(STATUS "Looking for LIBVA (>= 0.36.0) - not found")
-ENDIF(LIBVA_FOUND)
-
-# libva-x11
-pkg_check_modules(LIBVA-X11 REQUIRED libva-x11>=0.36.0)
-IF(LIBVA-X11_FOUND)
   MESSAGE(STATUS "Looking for LIBVA-X11 - found at ${LIBVA-X11_PREFIX} ${LIBVA-X11_VERSION}")
+  INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS})
   INCLUDE_DIRECTORIES(${LIBVA-X11_INCLUDE_DIRS})
-ELSE(LIBVA-X11_FOUND)
-  MESSAGE(STATUS "Looking for LIBVA-X11 (>= 0.36.0) - not found")
-ENDIF(LIBVA-X11_FOUND)
+  set(V4L2_BUF_SH_DEP true)
+  IF(LIBVA_VERSION VERSION_LESS "0.36.0" OR LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+    IF(LIBVA_VERSION VERSION_LESS "0.36.0")
+      MESSAGE(STATUS "Looking for LIBVA (>= 0.36.0) - not found")
+    ENDIF(LIBVA_VERSION VERSION_LESS "0.36.0")
+    IF(LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+      MESSAGE(STATUS "Looking for LIBVA-X11 (>= 0.36.0) - not found")
+    ENDIF(LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+    MESSAGE(STATUS "Example libva_buffer_sharing will not be built")
+  ELSE(LIBVA_VERSION VERSION_LESS "0.36.0" OR LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+    set(LIBVA_BUF_SH_DEP true)
+  ENDIF(LIBVA_VERSION VERSION_LESS "0.36.0" OR LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+ENDIF(LIBVA_FOUND AND LIBVA-X11_FOUND)
 ENDIF(BUILD_EXAMPLES)
 
 ADD_SUBDIRECTORY(include)
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index a6736ec..c0d0c23 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -53,6 +53,8 @@ set (GBE_SRC
     ir/sampler.hpp
     ir/image.cpp
     ir/image.hpp
+    ir/half.cpp
+    ir/half.hpp
     ir/instruction.cpp
     ir/instruction.hpp
     ir/liveness.cpp
@@ -66,10 +68,10 @@ set (GBE_SRC
     ir/lowering.hpp
     ir/printf.cpp
     ir/printf.hpp
-    ir/structural_analysis.cpp
-    ir/structural_analysis.hpp
     ir/immediate.hpp
     ir/immediate.cpp
+    ir/structurizer.hpp
+    ir/structurizer.cpp
     backend/context.cpp
     backend/context.hpp
     backend/program.cpp
@@ -80,7 +82,6 @@ set (GBE_SRC
     llvm/llvm_gen_backend.cpp
     llvm/llvm_passes.cpp
     llvm/llvm_scalarize.cpp
-    llvm/llvm_legalize.cpp
     llvm/llvm_intrinsic_lowering.cpp
     llvm/llvm_barrier_nodup.cpp
     llvm/llvm_printf_parser.cpp
@@ -108,6 +109,8 @@ set (GBE_SRC
     backend/gen75_context.cpp
     backend/gen8_context.hpp
     backend/gen8_context.cpp
+    backend/gen9_context.hpp
+    backend/gen9_context.cpp
     backend/gen_program.cpp
     backend/gen_program.hpp
     backend/gen_program.h
@@ -123,6 +126,8 @@ set (GBE_SRC
     backend/gen75_encoder.cpp
     backend/gen8_encoder.hpp
     backend/gen8_encoder.cpp
+    backend/gen9_encoder.hpp
+    backend/gen9_encoder.cpp
     )
 
 set (GBE_LINK_LIBRARIES
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 0dc60b7..b8dfa8c 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -400,9 +400,9 @@ namespace gbe
       return;
     // Be sure that the stack pointer is set
     // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
-    uint32_t stackSize = 1*KB;
+    uint32_t stackSize = 128;
     while (stackSize < fn.getStackSize()) {
-      stackSize <<= 1;
+      stackSize *= 3;
       GBE_ASSERT(stackSize <= 64*KB);
     }
     this->kernel->stackSize = stackSize;
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 4822de3..5220233 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -99,8 +99,8 @@ static const struct {
   [GEN_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
 
-  [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
-  [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -143,7 +143,7 @@ static const char *_abs[2] = {
   [1] = "(abs)",
 };
 
-static const char *vert_stride_gen7[16] = {
+static const char *vert_stride[16] = {
   [0] = "0",
   [1] = "1",
   [2] = "2",
@@ -153,15 +153,6 @@ static const char *vert_stride_gen7[16] = {
   [6] = "32",
   [15] = "VxH",
 };
-static const char *vert_stride_gen8[16] = {
-  [0] = "0",
-  [1] = "1",
-  [2] = "2",
-  [3] = "4",
-  [4] = "8",
-  [5] = "16",
-  [6] = "32",
-};
 
 static const char *width[8] = {
   [0] = "1",
@@ -266,7 +257,7 @@ static const char *access_mode[2] = {
   [1] = "align16",
 };
 
-static const char *reg_encoding[8] = {
+static const char *reg_encoding[11] = {
   [0] = ":UD",
   [1] = ":D",
   [2] = ":UW",
@@ -274,10 +265,13 @@ static const char *reg_encoding[8] = {
   [4] = ":UB",
   [5] = ":B",
   [6] = ":DF",
-  [7] = ":F"
+  [7] = ":F",
+  [8] = ":UQ",
+  [9] = ":Q",
+  [10] = ":HF"
 };
 
-int reg_type_size[8] = {
+int reg_type_size[11] = {
   [0] = 4,
   [1] = 4,
   [2] = 2,
@@ -285,7 +279,10 @@ int reg_type_size[8] = {
   [4] = 1,
   [5] = 1,
   [6] = 8,
-  [7] = 4
+  [7] = 4,
+  [8] = 8,
+  [9] = 8,
+  [10] = 2,
 };
 
 static const char *reg_file[4] = {
@@ -468,6 +465,17 @@ static int gen_version;
     bits;                                                       \
   })
 
+#define GEN_BITS_FIELD_WITH_TYPE(inst, gen, TYPE)               \
+  ({                                                            \
+    TYPE bits;                                                  \
+    if (gen_version < 80)                                       \
+      bits = ((const union Gen7NativeInstruction *)inst)->gen;	\
+    else                                                        \
+      bits = ((const union Gen8NativeInstruction *)inst)->gen;	\
+    bits;                                                       \
+  })
+
+
 #define GEN_BITS_FIELD2(inst, gen7, gen8)                       \
   ({                                                            \
     int bits;                                                   \
@@ -713,11 +721,7 @@ static int src_align1_region(FILE *file,
 {
   int err = 0;
   string(file, "<");
-  if (gen_version < 80) {
-    err |= control(file, "vert stride", vert_stride_gen7, _vert_stride, NULL);
-  } else {
-    err |= control(file, "vert stride", vert_stride_gen8, _vert_stride, NULL);
-  }
+  err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
   string(file, ",");
   err |= control(file, "width", width, _width, NULL);
   string(file, ",");
@@ -797,11 +801,7 @@ static int src_da16(FILE *file,
     format(file, ".%d", 16 / reg_type_size[_reg_type]);
   string(file, "<");
 
-  if (gen_version < 80) {
-    err |= control(file, "vert stride", vert_stride_gen7, _vert_stride, NULL);
-  } else {
-    err |= control(file, "vert stride", vert_stride_gen8, _vert_stride, NULL);
-  }
+  err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
   string(file, ",4,1>");
   /*
    * Three kinds of swizzle display:
@@ -844,7 +844,10 @@ static int src0_3src(FILE *file, const void* inst)
     return 0;
   if (GEN_BITS_FIELD(inst, bits2.da3src.src0_subreg_nr))
     format(file, ".%d", GEN_BITS_FIELD(inst, bits2.da3src.src0_subreg_nr));
-  string(file, "<4,1,1>");
+  if (GEN_BITS_FIELD(inst, bits2.da3src.src0_rep_ctrl))
+    string(file, "<0,1,0>");
+  else
+    string(file, "<8,8,1>");
   err |= control(file, "src da16 reg type", reg_encoding,
                  GEN_TYPE_F, NULL);
   /*
@@ -889,7 +892,10 @@ static int src1_3src(FILE *file, const void* inst)
     return 0;
   if (src1_subreg_nr)
     format(file, ".%d", src1_subreg_nr);
-  string(file, "<4,1,1>");
+  if (GEN_BITS_FIELD(inst, bits2.da3src.src1_rep_ctrl))
+    string(file, "<0,1,0>");
+  else
+    string(file, "<8,8,1>");
   err |= control(file, "src da16 reg type", reg_encoding,
                  GEN_TYPE_F, NULL);
   /*
@@ -931,7 +937,10 @@ static int src2_3src(FILE *file, const void* inst)
     return 0;
   if (GEN_BITS_FIELD(inst, bits3.da3src.src2_subreg_nr))
     format(file, ".%d", GEN_BITS_FIELD(inst, bits3.da3src.src2_subreg_nr));
-  string(file, "<4,1,1>");
+  if (GEN_BITS_FIELD(inst, bits3.da3src.src2_rep_ctrl))
+    string(file, "<0,1,0>");
+  else
+    string(file, "<8,8,1>");
   err |= control(file, "src da16 reg type", reg_encoding,
                  GEN_TYPE_F, NULL);
   /*
@@ -958,6 +967,57 @@ static int src2_3src(FILE *file, const void* inst)
   return err;
 }
 
+static uint32_t __conv_half_to_float(uint16_t h)
+{
+  struct __FP32 {
+    uint32_t mantissa:23;
+    uint32_t exponent:8;
+    uint32_t sign:1;
+  };
+  struct __FP16 {
+    uint32_t mantissa:10;
+    uint32_t exponent:5;
+    uint32_t sign:1;
+  };
+  uint32_t f;
+  struct __FP32 o;
+  memset(&o, 0, sizeof(o));
+  struct __FP16 i;
+  memcpy(&i, &h, sizeof(uint16_t));
+
+  if (i.exponent == 0 && i.mantissa == 0) // (Signed) zero
+    o.sign = i.sign;
+  else {
+    if (i.exponent == 0) { // Denormal (converts to normalized)
+      // Adjust mantissa so it's normalized (and keep
+      // track of exponent adjustment)
+      int e = -1;
+      uint m = i.mantissa;
+      do {
+        e++;
+        m <<= 1;
+      } while ((m & 0x400) == 0);
+
+      o.mantissa = (m & 0x3ff) << 13;
+      o.exponent = 127 - 15 - e;
+      o.sign = i.sign;
+    } else if (i.exponent == 0x1f) { // Inf/NaN
+      // NOTE: Both can be handled with same code path
+      // since we just pass through mantissa bits.
+      o.mantissa = i.mantissa << 13;
+      o.exponent = 255;
+      o.sign = i.sign;
+    } else { // Normalized number
+      o.mantissa = i.mantissa << 13;
+      o.exponent = 127 - 15 + i.exponent;
+      o.sign = i.sign;
+    }
+  }
+
+  memcpy(&f, &o, sizeof(uint32_t));
+  return f;
+}
+
 static int imm(FILE *file, uint32_t type, const void* inst)
 {
   switch (type) {
@@ -983,7 +1043,29 @@ static int imm(FILE *file, uint32_t type, const void* inst)
       format(file, "0x%xV", GEN_BITS_FIELD(inst, bits3.ud));
       break;
     case GEN_TYPE_F:
-      format(file, "%-gF", GEN_BITS_FIELD(inst, bits3.f));
+      format(file, "%-gF", GEN_BITS_FIELD_WITH_TYPE(inst, bits3.f, float));
+      break;
+    case GEN_TYPE_UL:
+      assert(!(gen_version < 80));
+      format(file, "0x%.8x %.8xUQ", (((const union Gen8NativeInstruction *)inst)->bits3).ud,
+                                (((const union Gen8NativeInstruction *)inst)->bits2).ud);
+      break;
+    case GEN_TYPE_L:
+    {
+      assert(!(gen_version < 80));
+      uint64_t val = (((const union Gen8NativeInstruction *)inst)->bits3).ud;
+      val = (val << 32) + ((((const union Gen8NativeInstruction *)inst)->bits2).ud);
+      format(file, "0x%lldQ", val);
+    }
+    case GEN_TYPE_HF_IMM:
+    {
+      uint16_t h = GEN_BITS_FIELD_WITH_TYPE(inst, bits3.d, uint16_t);
+      uint32_t uf = __conv_half_to_float(h);
+      float f;
+      memcpy(&f, &uf, sizeof(float));
+      format(file, "%-gHF", f);
+      break;
+    }
   }
   return 0;
 }
@@ -1005,10 +1087,15 @@ static int src0(FILE *file, const void* inst)
                      GEN_BITS_FIELD(inst, bits2.da1.src0_abs),
                      GEN_BITS_FIELD(inst, bits2.da1.src0_negate));
     } else {
+      int32_t imm_off = GEN_BITS_FIELD(inst, bits2.ia1.src0_indirect_offset);
+      if (gen_version >= 80) {
+        imm_off = imm_off +
+          ((((const union Gen8NativeInstruction *)inst)->bits2.ia1.src0_indirect_offset_9) << 9);
+      }
       return src_ia1(file,
                      GEN_BITS_FIELD(inst, bits1.ia1.src0_reg_type),
                      GEN_BITS_FIELD(inst, bits1.ia1.src0_reg_file),
-                     GEN_BITS_FIELD(inst, bits2.ia1.src0_indirect_offset),
+                     imm_off,
                      GEN_BITS_FIELD(inst, bits2.ia1.src0_subreg_nr),
                      GEN_BITS_FIELD(inst, bits2.ia1.src0_negate),
                      GEN_BITS_FIELD(inst, bits2.ia1.src0_abs),
@@ -1131,12 +1218,14 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
 {
   int err = 0;
   int space = 0;
-  if (IS_IVYBRIDGE(deviceID)) {
+  if (IS_GEN7(deviceID)) {
     gen_version = 70;
-  } else if (IS_HASWELL(deviceID)) {
+  } else if (IS_GEN75(deviceID)) {
     gen_version = 75;
-  } else if (IS_BROADWELL(deviceID)) {
+  } else if (IS_GEN8(deviceID)) {
     gen_version = 80;
+  } else if (IS_GEN9(deviceID)) {
+    gen_version = 90;
   }
 
   if (PRED_CTRL(inst)) {
@@ -1242,59 +1331,61 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
                      target, &space);
     }
 
-    switch (target) {
-      case GEN_SFID_SAMPLER:
-        format(file, " (%d, %d, %d, %d)",
-               SAMPLE_BTI(inst),
-               SAMPLER(inst),
-               SAMPLER_MSG_TYPE(inst),
-               SAMPLER_SIMD_MODE(inst));
-        break;
-      case GEN_SFID_DATAPORT_DATA:
-        if(UNTYPED_RW_CATEGORY(inst) == 0) {
+    if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+      switch (target) {
+        case GEN_SFID_SAMPLER:
+          format(file, " (%d, %d, %d, %d)",
+                 SAMPLE_BTI(inst),
+                 SAMPLER(inst),
+                 SAMPLER_MSG_TYPE(inst),
+                 SAMPLER_SIMD_MODE(inst));
+          break;
+        case GEN_SFID_DATAPORT_DATA:
+          if(UNTYPED_RW_CATEGORY(inst) == 0) {
+            format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
+                   UNTYPED_RW_BTI(inst),
+                   UNTYPED_RW_RGBA(inst),
+                   data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+                   data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                   data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+          } else {
+            format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
+                   SCRATCH_RW_OFFSET(inst),
+                   data_port_scratch_block_size[SCRATCH_RW_BLOCK_SIZE(inst)],
+                   data_port_scratch_invalidate[SCRATCH_RW_INVALIDATE_AFTER_READ(inst)],
+                   data_port_scratch_channel_mode[SCRATCH_RW_CHANNEL_MODE(inst)],
+                   data_port_scratch_msg_type[SCRATCH_RW_MSG_TYPE(inst)]);
+          }
+          break;
+        case GEN_SFID_DATAPORT1_DATA:
           format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
                  UNTYPED_RW_BTI(inst),
                  UNTYPED_RW_RGBA(inst),
                  data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
                  data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
-                 data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
-        } else {
-          format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
-                 SCRATCH_RW_OFFSET(inst),
-                 data_port_scratch_block_size[SCRATCH_RW_BLOCK_SIZE(inst)],
-                 data_port_scratch_invalidate[SCRATCH_RW_INVALIDATE_AFTER_READ(inst)],
-                 data_port_scratch_channel_mode[SCRATCH_RW_CHANNEL_MODE(inst)],
-                 data_port_scratch_msg_type[SCRATCH_RW_MSG_TYPE(inst)]);
-        }
-        break;
-      case GEN_SFID_DATAPORT1_DATA:
-        format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
-               UNTYPED_RW_BTI(inst),
-               UNTYPED_RW_RGBA(inst),
-               data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
-               data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
-               data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
-        break;
-      case GEN_SFID_DATAPORT_CONSTANT:
-        format(file, " (bti: %d, %s)",
-               DWORD_RW_BTI(inst),
-               data_port_data_cache_msg_type[DWORD_RW_MSG_TYPE(inst)]);
-        break;
-      case GEN_SFID_MESSAGE_GATEWAY:
-        format(file, " (subfunc: %s, notify: %d, ackreq: %d)",
-               gateway_sub_function[MSG_GW_SUBFUNC(inst)],
-               MSG_GW_NOTIFY(inst),
-               MSG_GW_ACKREQ(inst));
-        break;
-
-      default:
-        format(file, "unsupported target %d", target);
-        break;
+                 data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+          break;
+        case GEN_SFID_DATAPORT_CONSTANT:
+          format(file, " (bti: %d, %s)",
+                 DWORD_RW_BTI(inst),
+                 data_port_data_cache_msg_type[DWORD_RW_MSG_TYPE(inst)]);
+          break;
+        case GEN_SFID_MESSAGE_GATEWAY:
+          format(file, " (subfunc: %s, notify: %d, ackreq: %d)",
+                 gateway_sub_function[MSG_GW_SUBFUNC(inst)],
+                 MSG_GW_NOTIFY(inst),
+                 MSG_GW_ACKREQ(inst));
+          break;
+
+        default:
+          format(file, "unsupported target %d", target);
+          break;
+      }
+      if (space)
+        string(file, " ");
+      format(file, "mlen %d", GENERIC_MSG_LENGTH(inst));
+      format(file, " rlen %d", GENERIC_RESPONSE_LENGTH(inst));
     }
-    if (space)
-      string(file, " ");
-    format(file, "mlen %d", GENERIC_MSG_LENGTH(inst));
-    format(file, " rlen %d", GENERIC_RESPONSE_LENGTH(inst));
   }
   pad(file, 64);
   if (OPCODE(inst) != GEN_OPCODE_NOP) {
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index a830260..b9dfb18 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -74,20 +74,14 @@ namespace gbe
     const uint32_t perLaneSize = kernel->getStackSize();
     const uint32_t perThreadSize = perLaneSize * this->simdWidth;
     GBE_ASSERT(perLaneSize > 0);
-    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
-    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
 
-    // Use shifts rather than muls which are limited to 32x16 bit sources
-    const uint32_t perLaneShift = logi2(perLaneSize);
-    const uint32_t perThreadShift = logi2(perThreadSize);
     const GenRegister selStatckPtr = this->simdWidth == 8 ?
       GenRegister::ud8grf(ir::ocl::stackptr) :
       GenRegister::ud16grf(ir::ocl::stackptr);
     const GenRegister stackptr = ra->genReg(selStatckPtr);
-    const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
-    const GenRegister bufferptr = ra->genReg(selStackBuffer);
 
     // We compute the per-lane stack pointer here
+    // private address start from zero
     p->push();
       p->curr.execWidth = 1;
       p->curr.predicate = GEN_PREDICATE_NONE;
@@ -96,13 +90,16 @@ namespace gbe
       p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
       p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
       p->curr.execWidth = this->simdWidth;
-      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+      p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize));  //perLaneSize < 64K
       p->curr.execWidth = 1;
       p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
       p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
-      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+      if(perThreadSize > 0xffff) {
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize < 64K
+      } else
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
       p->curr.execWidth = this->simdWidth;
-      p->ADD(stackptr, stackptr, bufferptr);
       p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
     p->pop();
   }
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
index c77ce4d..135be02 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -96,8 +96,7 @@ namespace gbe
     gen7_insn->bits3.gen7_typed_rw.slot = 1;
   }
 
-  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
-    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+  unsigned Gen75Encoder::setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
     Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
@@ -111,11 +110,6 @@ namespace gbe
     } else
       NOT_IMPLEMENTED;
 
-    this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
-
     const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
     gen7_insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
@@ -129,11 +123,26 @@ namespace gbe
       gen7_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
     else
       NOT_SUPPORTED;
+    return gen7_insn->bits3.ud;
   }
 
-  void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    assert(elemNum >= 1 || elemNum <= 4);
+
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+  unsigned Gen75Encoder::setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     if (this->curr.execWidth == 8) {
@@ -144,44 +153,75 @@ namespace gbe
       response_length = 2 * elemNum;
     } else
       NOT_IMPLEMENTED;
-
-    this->setHeader(insn);
-    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN75_P1_UNTYPED_READ,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
-  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+  void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+  unsigned Gen75Encoder::setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
-    this->setHeader(insn);
     if (this->curr.execWidth == 8) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
       msg_length = 1 + elemNum;
     } else if (this->curr.execWidth == 16) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
       msg_length = 2 * (1 + elemNum);
     }
     else
       NOT_IMPLEMENTED;
-    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN75_P1_UNTYPED_SURFACE_WRITE,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 || elemNum <= 4);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+
   void Gen75Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
     union { double d; unsigned u[2]; } u;
     u.d = value;
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
index 9545157..5d80bbd 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -48,15 +48,18 @@ namespace gbe
     virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
     virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
     virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
-    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
-    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
-    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
     virtual void setHeader(GenNativeInstruction *insn);
     virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
                    uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
     virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
                                       unsigned char msg_type, uint32_t msg_length,
                                       bool header_present);
+    virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+    virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
   };
 }
 #endif /* __GBE_GEN75_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index ecf5b39..fc358be 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -118,21 +118,20 @@ namespace gbe
     } else {
       gen7_insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
       gen7_insn->bits1.ia1.src0_reg_type = reg.type;
-      gen7_insn->bits2.ia1.src0_subreg_nr = 0;
-      gen7_insn->bits2.ia1.src0_indirect_offset = 0;
-      gen7_insn->bits2.ia1.src0_abs = 0;
-      gen7_insn->bits2.ia1.src0_negate = 0;
+      gen7_insn->bits2.ia1.src0_subreg_nr = reg.a0_subnr;
+      gen7_insn->bits2.ia1.src0_indirect_offset = reg.addr_imm;
+      gen7_insn->bits2.ia1.src0_abs = reg.absolute;
+      gen7_insn->bits2.ia1.src0_negate = reg.negation;
       gen7_insn->bits2.ia1.src0_address_mode = reg.address_mode;
-      gen7_insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-      gen7_insn->bits2.ia1.src0_width = GEN_WIDTH_1;
-      gen7_insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      gen7_insn->bits2.ia1.src0_horiz_stride = reg.hstride;
+      gen7_insn->bits2.ia1.src0_width = reg.width;
+      gen7_insn->bits2.ia1.src0_vert_stride = reg.vstride;
     }
   }
 
   void Gen7Encoder::setSrc1(GenNativeInstruction *insn, GenRegister reg) {
     Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
     assert(reg.nr < 128);
-    assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
 
     gen7_insn->bits1.da1.src1_reg_file = reg.file;
     gen7_insn->bits1.da1.src1_reg_type = reg.type;
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 776c92b..b497ee5 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -50,4 +50,1006 @@ namespace gbe
   void Gen8Context::newSelection(void) {
     this->sel = GBE_NEW(Selection8, *this);
   }
+
+  bool Gen8Context::patchBranches(void) {
+    using namespace ir;
+    for (auto pair : branchPos2) {
+      const LabelIndex label = pair.first;
+      const int32_t insnID = pair.second;
+      const int32_t targetID = labelPos.find(label)->second;
+      p->patchJMPI(insnID, (targetID - insnID), 0);
+    }
+    for (auto pair : branchPos3) {
+      const LabelPair labelPair = pair.first;
+      const int32_t insnID = pair.second;
+      const int32_t jip = labelPos.find(labelPair.l0)->second;
+      const int32_t uip = labelPos.find(labelPair.l1)->second;
+      p->patchJMPI(insnID, jip - insnID, uip - insnID);
+    }
+    return true;
+  }
+
+  void Gen8Context::emitUnaryInstruction(const SelectionInstruction &insn)
+  {
+    switch (insn.opcode) {
+      case SEL_OP_CONVI64_TO_I:
+        /* Should never come to here, just use the common OPCODE. */
+        GBE_ASSERT(0);
+        break;
+      default:
+        GenContext::emitUnaryInstruction(insn);
+    }
+  }
+
+  void Gen8Context::emitUnaryWithTempInstruction(const SelectionInstruction &insn)
+  {
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister src = ra->genReg(insn.src(0));
+    GenRegister tmp = ra->genReg(insn.dst(1));
+    switch (insn.opcode) {
+      case SEL_OP_CONVI_TO_I64:
+        /* Should never come to here, just use the common OPCODE. */
+        GBE_ASSERT(0);
+        break;
+      case SEL_OP_BSWAP:
+        {
+          uint32_t simd = p->curr.execWidth;
+          GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
+          uint16_t new_a0[16];
+          memset(new_a0, 0, sizeof(new_a0));
+
+          GBE_ASSERT(src.type == dst.type);
+          uint32_t start_addr = src.nr*32 + src.subnr;
+
+          if (simd == 1) {
+            GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
+                && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+            if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+              GBE_ASSERT(start_addr >= 0);
+              new_a0[0] = start_addr + 3;
+              new_a0[1] = start_addr + 2;
+              new_a0[2] = start_addr + 1;
+              new_a0[3] = start_addr;
+              this->setA0Content(new_a0, 0, 4);
+
+              p->push();
+              p->curr.execWidth = 4;
+              p->curr.predicate = GEN_PREDICATE_NONE;
+              p->curr.noMask = 1;
+              GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
+              GenRegister dst_ = dst;
+              dst_.type = GEN_TYPE_UB;
+              dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
+              dst_.width = GEN_WIDTH_4;
+              dst_.vstride = GEN_VERTICAL_STRIDE_4;
+              p->MOV(dst_, ind_src);
+              p->pop();
+            } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+              p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
+                  GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
+              p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
+                  GenRegister::retype(src, GEN_TYPE_UB));
+            } else {
+              GBE_ASSERT(0);
+            }
+          } else {
+            if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+              bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+              GBE_ASSERT(uniform_src || src.subnr == 0);
+              GBE_ASSERT(dst.subnr == 0);
+              GBE_ASSERT(tmp.subnr == 0);
+              GBE_ASSERT(start_addr >= 0);
+              new_a0[0] = start_addr + 3;
+              new_a0[1] = start_addr + 2;
+              new_a0[2] = start_addr + 1;
+              new_a0[3] = start_addr;
+              if (!uniform_src) {
+                new_a0[4] = start_addr + 7;
+                new_a0[5] = start_addr + 6;
+                new_a0[6] = start_addr + 5;
+                new_a0[7] = start_addr + 4;
+                new_a0[8] = start_addr + 11;
+                new_a0[9] = start_addr + 10;
+                new_a0[10] = start_addr + 9;
+                new_a0[11] = start_addr + 8;
+                new_a0[12] = start_addr + 15;
+                new_a0[13] = start_addr + 14;
+                new_a0[14] = start_addr + 13;
+                new_a0[15] = start_addr + 12;
+              } else {
+                new_a0[4] = start_addr + 3;
+                new_a0[5] = start_addr + 2;
+                new_a0[6] = start_addr + 1;
+                new_a0[7] = start_addr;
+                new_a0[8] = start_addr + 3;
+                new_a0[9] = start_addr + 2;
+                new_a0[10] = start_addr + 1;
+                new_a0[11] = start_addr;
+                new_a0[12] = start_addr + 3;
+                new_a0[13] = start_addr + 2;
+                new_a0[14] = start_addr + 1;
+                new_a0[15] = start_addr;
+              }
+              this->setA0Content(new_a0, 48);
+
+              p->push();
+              p->curr.execWidth = 16;
+              p->curr.predicate = GEN_PREDICATE_NONE;
+              p->curr.noMask = 1;
+              GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
+              p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+              ind_src.addr_imm += 16;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 16), ind_src);
+              if (simd == 16) {
+                for (int i = 0; i < 2; i++) {
+                  ind_src.addr_imm += 16;
+                  p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 16*i), ind_src);
+                }
+              }
+              p->pop();
+
+              p->MOV(dst, tmp);
+            } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+              bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+              GBE_ASSERT(uniform_src || src.subnr == 0 || src.subnr == 16);
+              GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
+              GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
+              GBE_ASSERT(start_addr >= 0);
+              new_a0[0] = start_addr + 1;
+              new_a0[1] = start_addr;
+              if (!uniform_src) {
+                new_a0[2] = start_addr + 3;
+                new_a0[3] = start_addr + 2;
+                new_a0[4] = start_addr + 5;
+                new_a0[5] = start_addr + 4;
+                new_a0[6] = start_addr + 7;
+                new_a0[7] = start_addr + 6;
+                new_a0[8] = start_addr + 9;
+                new_a0[9] = start_addr + 8;
+                new_a0[10] = start_addr + 11;
+                new_a0[11] = start_addr + 10;
+                new_a0[12] = start_addr + 13;
+                new_a0[13] = start_addr + 12;
+                new_a0[14] = start_addr + 15;
+                new_a0[15] = start_addr + 14;
+              } else {
+                new_a0[2] = start_addr + 1;
+                new_a0[3] = start_addr;
+                new_a0[4] = start_addr + 1;
+                new_a0[5] = start_addr;
+                new_a0[6] = start_addr + 1;
+                new_a0[7] = start_addr;
+                new_a0[8] = start_addr + 1;
+                new_a0[9] = start_addr;
+                new_a0[10] = start_addr + 1;
+                new_a0[11] = start_addr;
+                new_a0[12] = start_addr + 1;
+                new_a0[13] = start_addr;
+                new_a0[14] = start_addr + 1;
+                new_a0[15] = start_addr;
+              }
+              this->setA0Content(new_a0, 48);
+
+              p->push();
+              p->curr.execWidth = 16;
+              p->curr.predicate = GEN_PREDICATE_NONE;
+              p->curr.noMask = 1;
+              GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
+              p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+              if (simd == 16) {
+                ind_src.addr_imm += 16;
+                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 16), ind_src);
+              }
+              p->pop();
+
+              p->MOV(dst, tmp);
+            } else {
+              GBE_ASSERT(0);
+            }
+          }
+        }
+        break;
+      default:
+        GenContext::emitUnaryWithTempInstruction(insn);
+    }
+  }
+
+  void Gen8Context::emitSimdShuffleInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
+    assert(insn.opcode == SEL_OP_SIMD_SHUFFLE);
+
+    uint32_t simd = p->curr.execWidth;
+    if (src1.file == GEN_IMMEDIATE_VALUE) {
+      uint32_t offset = src1.value.ud % simd;
+      GenRegister reg = GenRegister::suboffset(src0, offset);
+      p->MOV(dst, GenRegister::retype(GenRegister::ud1grf(reg.nr, reg.subnr / typeSize(reg.type)), reg.type));
+    } else {
+      uint32_t base = src0.nr * 32 + src0.subnr * 4;
+      GenRegister baseReg = GenRegister::immuw(base);
+      const GenRegister a0 = GenRegister::addr8(0);
+      p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+      GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+      p->MOV(dst, indirect);
+    }
+  }
+
+  void Gen8Context::emitBinaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
+    switch (insn.opcode) {
+      case SEL_OP_SEL_INT64:
+      case SEL_OP_I64AND:
+      case SEL_OP_I64OR:
+      case SEL_OP_I64XOR:
+        /* Should never come to here, just use the common OPCODE. */
+        GBE_ASSERT(0);
+        break;
+      case SEL_OP_UPSAMPLE_LONG:
+      {
+        p->MOV(dst, src0);
+        p->SHL(dst, dst, GenRegister::immud(32));
+        p->ADD(dst, dst, src1);
+        break;
+      }
+      default:
+        GenContext::emitBinaryInstruction(insn);
+    }
+  }
+
+  void Gen8Context::emitBinaryWithTempInstruction(const SelectionInstruction &insn)
+  {
+    switch (insn.opcode) {
+      case SEL_OP_I64ADD:
+      case SEL_OP_I64SUB:
+        /* Should never come to here, just use the common OPCODE. */
+        GBE_ASSERT(0);
+        break;
+      default:
+        GenContext::emitBinaryWithTempInstruction(insn);
+    }
+  }
+
+  void Gen8Context::emitI64ShiftInstruction(const SelectionInstruction &insn)
+  {
+    switch (insn.opcode) {
+      case SEL_OP_I64SHL:
+      case SEL_OP_I64SHR:
+      case SEL_OP_I64ASR:
+        /* Should never come to here, just use the common OPCODE. */
+        GBE_ASSERT(0);
+        break;
+      default:
+        GenContext::emitI64ShiftInstruction(insn);
+    }
+  }
+
+  void Gen8Context::emitI64CompareInstruction(const SelectionInstruction &insn)
+  {
+    /* Should never come to here, just use the common OPCODE. */
+    GBE_ASSERT(0);
+  }
+
+  void Gen8Context::emitI64SATADDInstruction(const SelectionInstruction &insn)
+  {
+    /* Should never come to here, just use the common OPCODE. */
+    GBE_ASSERT(0);
+  }
+
+  void Gen8Context::emitI64SATSUBInstruction(const SelectionInstruction &insn)
+  {
+    /* Should never come to here, just use the common OPCODE. */
+    GBE_ASSERT(0);
+  }
+
+  void Gen8Context::emitI64ToFloatInstruction(const SelectionInstruction &insn)
+  {
+    /* Should never come to here, just use the common OPCODE. */
+    GBE_ASSERT(0);
+  }
+
+  void Gen8Context::emitFloatToI64Instruction(const SelectionInstruction &insn)
+  {
+    /* Should never come to here, just use the common OPCODE. */
+    GBE_ASSERT(0);
+  }
+
+  static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0)
+  {
+    if(reg.hstride == GEN_HORIZONTAL_STRIDE_0) {
+      if(offset == 0)
+        return GenRegister::retype(reg, GEN_TYPE_UD);
+      else
+        return GenRegister::retype(GenRegister::offset(reg, 0, typeSize(GEN_TYPE_UD)*offset), GEN_TYPE_UD);
+    } else
+      return GenRegister::unpacked_ud(reg.nr, reg.subnr + offset);
+  }
+
+  void Gen8Context::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                  GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
+  {
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst_h.type = dst_l.type = GEN_TYPE_UL;
+    s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;
+
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    GenRegister s0h = GenRegister::offset(s0l, 0, 4);
+    GenRegister s1h = GenRegister::offset(s1l, 0, 4);
+
+    /* Low 32 bits X low 32 bits. */
+    p->MUL(dst_l, s0l, s1l);
+    /* High 32 bits X High 32 bits. */
+    p->MUL(dst_h, s0h, s1h);
+    /* Low 32 bits X high 32 bits. */
+    p->MUL(s0l_s1h, s0l, s1h);
+    /* High 32 bits X low 32 bits. */
+    p->MUL(s0h_s1l, s0h, s1l);
+
+    /*  Because the max product of s0l*s1h is (2^N - 1) * (2^N - 1) = 2^2N + 1 - 2^(N+1), here N = 32
+        The max of addding 2 32bits integer to it is
+        2^2N + 1 - 2^(N+1) + 2*(2^N - 1) = 2^2N - 1
+        which means the product s0h_s1l adds dst_l's high 32 bits and then adds s0l_s1h's low 32 bits will not
+        overflow and have no carry.
+        By this manner, we can avoid using acc register, which has a lot of restrictions. */
+
+    GenRegister dst_l_h = unpacked_ud(dst_l, 1);
+    p->ADD(s0h_s1l, s0h_s1l, dst_l_h);
+    GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h);
+    p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l);
+    GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h, 1);
+    p->ADD(dst_h, dst_h, s0l_s1h_h);
+
+    // No longer need s0l_s1h
+    GenRegister tmp = s0l_s1h;
+
+    p->SHL(tmp, s0h_s1l, GenRegister::immud(32));
+    GenRegister tmp_unpacked = unpacked_ud(tmp, 1);
+    p->MOV(dst_l_h, tmp_unpacked);
+
+    p->SHR(tmp, s0h_s1l, GenRegister::immud(32));
+    p->ADD(dst_h, dst_h, tmp);
+  }
+
+  void Gen8Context::calculateFullS64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                  GenRegister dst_l, GenRegister s0_abs, GenRegister s1_abs, 
+                                  GenRegister tmp0, GenRegister tmp1, GenRegister sign, GenRegister flagReg)
+  {
+    tmp0.type = tmp1.type = GEN_TYPE_UL;
+    sign.type = GEN_TYPE_UL;
+    src0.type = src1.type = GEN_TYPE_UL;
+    /* First, need to get the sign. */
+    p->SHR(tmp0, src0, GenRegister::immud(63));
+    p->SHR(tmp1, src1, GenRegister::immud(63));
+    p->XOR(sign, tmp0, tmp1);
+
+    src0.type = src1.type = GEN_TYPE_L;
+
+    tmp0.type = tmp1.type = GEN_TYPE_UL;
+    s0_abs.type = s1_abs.type = GEN_TYPE_L;
+    p->MOV(s0_abs, GenRegister::abs(src0));
+    p->MOV(s1_abs, GenRegister::abs(src1));
+    calculateFullU64MUL(s0_abs, s1_abs, dst_h, dst_l, tmp0, tmp1);
+
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+    p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0), tmp0);
+    p->curr.noMask = 0;
+    p->curr.predicate = GEN_PREDICATE_NORMAL;
+
+    /* Calculate the neg for the whole 128 bits. */
+    dst_l.type = GEN_TYPE_UL;
+    dst_h.type = GEN_TYPE_L;
+    p->NOT(dst_l, dst_l);
+    p->NOT(dst_h, dst_h);
+    p->ADD(dst_l, dst_l, GenRegister::immud(0x01));
+    p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+    p->CMP(GEN_CONDITIONAL_Z, dst_l, GenRegister::immud(0), tmp0);
+    p->ADD(dst_h, dst_h, GenRegister::immud(0x01));
+    p->pop();
+  }
+
+  void Gen8Context::emitI64MULHIInstruction(const SelectionInstruction &insn)
+  {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister dst_h = ra->genReg(insn.dst(0));
+    GenRegister dst_l = ra->genReg(insn.dst(1));
+    GenRegister s0_abs = ra->genReg(insn.dst(2));
+    GenRegister s1_abs = ra->genReg(insn.dst(3));
+    GenRegister tmp0 = ra->genReg(insn.dst(4));
+    GenRegister tmp1 = ra->genReg(insn.dst(5));
+    GenRegister sign = ra->genReg(insn.dst(6));
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+
+    if(src0.type == GEN_TYPE_UL) {
+      GBE_ASSERT(src1.type == GEN_TYPE_UL);
+      calculateFullU64MUL(src0, src1, dst_h, dst_l, tmp0, tmp1);
+    } else {
+      GBE_ASSERT(src0.type == GEN_TYPE_L);
+      GBE_ASSERT(src1.type == GEN_TYPE_L);
+      calculateFullS64MUL(src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
+                          tmp1, sign, flagReg);
+    }
+  }
+
+  void Gen8Context::emitI64MADSATInstruction(const SelectionInstruction &insn)
+  {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister src2 = ra->genReg(insn.src(2));
+    GenRegister dst_l = ra->genReg(insn.dst(0));
+    GenRegister dst_h = ra->genReg(insn.dst(1));
+    GenRegister s0_abs = ra->genReg(insn.dst(2));
+    GenRegister s1_abs = ra->genReg(insn.dst(3));
+    GenRegister tmp0 = ra->genReg(insn.dst(4));
+    GenRegister tmp1 = ra->genReg(insn.dst(5));
+    GenRegister sign = ra->genReg(insn.dst(6));
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+
+    if (src0.type == GEN_TYPE_UL) {
+      /* Always should be the same long type. */
+      GBE_ASSERT(src1.type == GEN_TYPE_UL);
+      GBE_ASSERT(src2.type == GEN_TYPE_UL);
+      dst_l.type = dst_h.type = GEN_TYPE_UL;
+      tmp0.type = tmp1.type = GEN_TYPE_UL;
+      calculateFullU64MUL(src0, src1, dst_h, dst_l, tmp0, tmp1);
+
+      /* Inplement the logic:
+      dst_l += src2;
+      if (dst_h)
+        dst_l = 0xFFFFFFFFFFFFFFFFULL;
+      if (dst_l < src2)  // carry if overflow
+        dst_l = 0xFFFFFFFFFFFFFFFFULL;
+      */
+      p->ADD(dst_l, dst_l, src2);
+
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_NZ, dst_h, GenRegister::immud(0), tmp0);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.noMask = 0;
+      p->MOV(dst_l, GenRegister::immuint64(0xFFFFFFFFFFFFFFFF));
+      p->pop();
+
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_L, dst_l, src2, tmp0);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.noMask = 0;
+      p->MOV(dst_l, GenRegister::immuint64(0xFFFFFFFFFFFFFFFF));
+      p->pop();
+    } else {
+      GBE_ASSERT(src0.type == GEN_TYPE_L);
+      GBE_ASSERT(src1.type == GEN_TYPE_L);
+      GBE_ASSERT(src2.type == GEN_TYPE_L);
+
+      calculateFullS64MUL(src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
+                          tmp1, sign, flagReg);
+
+      GenRegister sum = sign;
+      sum.type = GEN_TYPE_UL;
+      src2.type = GEN_TYPE_L;
+      dst_l.type = GEN_TYPE_UL;
+      p->ADD(sum, src2, dst_l);
+
+      /* Implement this logic:
+      if(src2 >= 0) {
+        if(dst_l > sum) {
+          dst_h++;
+          if(CL_LONG_MIN == dst_h) {
+            dst_h = CL_LONG_MAX;
+            sum = CL_ULONG_MAX;
+          }
+        }
+      } */
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_GE, src2, GenRegister::immud(0), tmp1);
+      p->curr.noMask = 0;
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_G, dst_l, sum, tmp1);
+      p->ADD(dst_h, dst_h, GenRegister::immud(1));
+      p->MOV(tmp0, GenRegister::immint64(-0x7FFFFFFFFFFFFFFFLL - 1LL));
+      p->CMP(GEN_CONDITIONAL_EQ, dst_h, tmp0, tmp1);
+      p->MOV(dst_h, GenRegister::immint64(0x7FFFFFFFFFFFFFFFLL));
+      p->MOV(sum, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFULL));
+      p->pop();
+
+      /* Implement this logic:
+      else {
+        if(dst_l < sum) {
+          dst_h--;
+          if(CL_LONG_MAX == dst_h) {
+            dst_h = CL_LONG_MIN;
+            sum = 0;
+          }
+        }
+      } */
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_L, src2, GenRegister::immud(0), tmp1);
+      p->curr.noMask = 0;
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_L, dst_l, sum, tmp1);
+      p->ADD(dst_h, dst_h, GenRegister::immd(-1));
+      p->MOV(tmp0, GenRegister::immint64(0x7FFFFFFFFFFFFFFFLL));
+      p->CMP(GEN_CONDITIONAL_EQ, dst_h, tmp0, tmp1);
+      p->MOV(dst_h, GenRegister::immint64(-0x7FFFFFFFFFFFFFFFLL - 1LL));
+      p->MOV(sum, GenRegister::immud(0));
+      p->pop();
+
+      /* saturate logic:
+      if (dst_h > 0)
+        sum = CL_LONG_MAX;
+      else if (dst_h == 0 && sum > 0x7FFFFFFFFFFFFFFFULL)
+        sum = CL_LONG_MAX;
+      else if (dst_h == -1 && sum < 0x8000000000000000ULL)
+        sum = CL_LONG_MIN;
+      else if (dst_h < -1)
+        sum = CL_LONG_MIN;
+      cl_long result = (cl_long) sum; */
+      p->MOV(dst_l, sum);
+      tmp0.type = GEN_TYPE_UL;
+
+      dst_h.type = GEN_TYPE_L;
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_G, dst_h, GenRegister::immud(0), tmp1);
+      p->curr.noMask = 0;
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(dst_l, GenRegister::immint64(0x7FFFFFFFFFFFFFFFLL));
+      p->pop();
+
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_EQ, dst_h, GenRegister::immd(0x0L), tmp1);
+      p->curr.noMask = 0;
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(tmp0, GenRegister::immuint64(0x7FFFFFFFFFFFFFFFUL));
+      p->CMP(GEN_CONDITIONAL_G, dst_l, tmp0, tmp1);
+      p->MOV(dst_l, GenRegister::immint64(0x7FFFFFFFFFFFFFFFLL));
+      p->pop();
+
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      /* FIXME: possible HW bug? Comparing dst_h directly against the 64-bit immediate -1 mis-compares, so test dst_h + 1 == 0 instead. */
+      p->ADD(tmp0, dst_h, GenRegister::immud(1));
+      p->CMP(GEN_CONDITIONAL_EQ, tmp0, GenRegister::immud(0), tmp1);
+      p->curr.noMask = 0;
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(tmp0, GenRegister::immuint64(0x8000000000000000UL));
+      p->CMP(GEN_CONDITIONAL_L, dst_l, tmp0, tmp1);
+      p->MOV(dst_l, GenRegister::immint64(-0x7FFFFFFFFFFFFFFFLL - 1LL));
+      p->pop();
+
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_L, dst_h, GenRegister::immd(-1), tmp1);
+      p->curr.noMask = 0;
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(dst_l, GenRegister::immint64(-0x7FFFFFFFFFFFFFFFLL - 1LL));
+      p->pop();
+    }
+  }
+
+  void Gen8Context::emitI64MULInstruction(const SelectionInstruction &insn)
+  {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister res = ra->genReg(insn.dst(1));
+
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst.type = GEN_TYPE_UL;
+    res.type = GEN_TYPE_UL;
+
+    /* Low 32 bits X low 32 bits. */
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    p->MUL(dst, s0l, s1l);
+
+    /* Low 32 bits X high 32 bits. */
+    GenRegister s1h = GenRegister::offset(s1l, 0, 4);
+    p->MUL(res, s0l, s1h);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+
+    /* High 32 bits X low 32 bits. */
+    GenRegister s0h = GenRegister::offset(s0l, 0, 4);
+    p->MUL(res, s0h, s1l);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+  }
+
+  void Gen8Context::emitI64HADDInstruction(const SelectionInstruction &insn)
+  {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister tmp0 = ra->genReg(insn.dst(1));
+    GenRegister tmp1 = ra->genReg(insn.dst(2));
+
+    /* Src0 and Src1 are always unsigned long type.*/
+    GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type == GEN_TYPE_UL);
+    dst.type = src0.type;
+    tmp0.type = tmp1.type = GEN_TYPE_UL;
+
+    //hadd = (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1))
+    p->AND(tmp0, src0, GenRegister::immud(1));
+    p->AND(dst, src1, tmp0);
+    p->SHR(tmp0, src0, GenRegister::immud(1));
+    p->SHR(tmp1, src1, GenRegister::immud(1));
+    p->ADD(dst, dst, tmp0);
+    p->ADD(dst, dst, tmp1);
+  }
+
+  void Gen8Context::emitI64RHADDInstruction(const SelectionInstruction &insn)
+  {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister tmp0 = ra->genReg(insn.dst(1));
+    GenRegister tmp1 = ra->genReg(insn.dst(2));
+
+    /* Src0 and Src1 are always unsigned long type.*/
+    GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type == GEN_TYPE_UL);
+    dst.type = src0.type;
+    tmp0.type = tmp1.type = GEN_TYPE_UL;
+
+    //rhadd = (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1))
+    p->AND(tmp0, src0, GenRegister::immud(1));
+    p->AND(tmp1, src1, GenRegister::immud(1));
+    p->OR(dst, tmp0, tmp1);
+    p->SHR(tmp0, src0, GenRegister::immud(1));
+    p->SHR(tmp1, src1, GenRegister::immud(1));
+    p->ADD(dst, dst, tmp0);
+    p->ADD(dst, dst, tmp1);
+  }
+
+  void Gen8Context::emitI64DIVREMInstruction(const SelectionInstruction &cnst_insn)
+  {
+    SelectionInstruction* insn = const_cast<SelectionInstruction*>(&cnst_insn);
+    GenRegister packed_src0 = ra->genReg(insn->src(0));
+    GenRegister packed_src1 = ra->genReg(insn->src(1));
+    GenRegister dst = ra->genReg(insn->dst(0));
+    int tmp_reg_n = 14;
+
+    if (packed_src0.hstride != GEN_HORIZONTAL_STRIDE_0) {
+      GenRegister unpacked_src0 = ra->genReg(insn->dst(tmp_reg_n));
+      unpackLongVec(packed_src0, unpacked_src0, p->curr.execWidth);
+      tmp_reg_n++;
+      insn->src(0) = unpacked_src0;
+    }
+    if (packed_src1.hstride != GEN_HORIZONTAL_STRIDE_0) {
+      GenRegister unpacked_src1 = ra->genReg(insn->dst(tmp_reg_n));
+      unpackLongVec(packed_src1, unpacked_src1, p->curr.execWidth);
+      tmp_reg_n++;
+      insn->src(1) = unpacked_src1;
+    }
+    GBE_ASSERT(tmp_reg_n <= insn->dstNum);
+
+    GenContext::emitI64DIVREMInstruction(*insn);
+
+    if (dst.hstride != GEN_HORIZONTAL_STRIDE_0) {
+      GenRegister dst_packed = ra->genReg(insn->dst(14));
+      packLongVec(dst, dst_packed, p->curr.execWidth);
+      p->MOV(dst, dst_packed);
+    }
+  }
+
+  void Gen8Context::packLongVec(GenRegister unpacked, GenRegister packed, uint32_t simd)
+  {
+    bool isScalar = false;
+    if (unpacked.hstride == GEN_HORIZONTAL_STRIDE_0)
+      isScalar = true;
+
+    GBE_ASSERT(packed.subnr == 0);
+    GBE_ASSERT(packed.hstride != GEN_HORIZONTAL_STRIDE_0);
+    GBE_ASSERT(unpacked.subnr == 0 || isScalar);
+
+    unpacked = GenRegister::retype(unpacked, GEN_TYPE_UD);
+    packed = GenRegister::retype(packed, GEN_TYPE_UD);
+
+    if (isScalar) {
+      p->MOV(packed, unpacked);
+    } else {
+      if (simd == 16) {
+        p->push();
+        p->curr.execWidth = 8;
+        p->MOV(GenRegister::h2(packed), unpacked);
+        p->MOV(GenRegister::h2(GenRegister::offset(packed, 0, typeSize(GEN_TYPE_UD))),
+               GenRegister::offset(unpacked, 2));
+        p->curr.quarterControl = 1;
+        p->MOV(GenRegister::h2(GenRegister::offset(packed, 2, 0)), GenRegister::offset(unpacked, 1));
+        p->MOV(GenRegister::h2(GenRegister::offset(packed, 2, typeSize(GEN_TYPE_UD))),
+               GenRegister::offset(unpacked, 3));
+        p->pop();
+      } else {
+        GBE_ASSERT(simd == 8);
+        p->MOV(GenRegister::h2(packed), unpacked);
+        p->MOV(GenRegister::h2(GenRegister::offset(packed, 0, typeSize(GEN_TYPE_UD))),
+               GenRegister::offset(unpacked, 1));
+      }
+    }
+  }
+
+  void Gen8Context::unpackLongVec(GenRegister packed, GenRegister unpacked, uint32_t simd)
+  {
+    bool isScalar = false;
+    if (packed.hstride == GEN_HORIZONTAL_STRIDE_0)
+      isScalar = true;
+
+    GBE_ASSERT(packed.subnr == 0 || isScalar);
+    GBE_ASSERT(unpacked.hstride != GEN_HORIZONTAL_STRIDE_0);
+    GBE_ASSERT(unpacked.subnr == 0);
+
+    unpacked = GenRegister::retype(unpacked, GEN_TYPE_UD);
+    packed = GenRegister::retype(packed, GEN_TYPE_UD);
+
+    if (isScalar) {
+      p->MOV(unpacked, packed);
+
+      if (simd == 16) {
+        p->MOV(GenRegister::offset(unpacked, 2), GenRegister::offset(packed, 0, typeSize(GEN_TYPE_UD)));
+      } else {
+        p->MOV(GenRegister::offset(unpacked, 1), GenRegister::offset(packed, 0, typeSize(GEN_TYPE_UD)));
+      }
+    } else {
+      packed.vstride = GEN_VERTICAL_STRIDE_8;
+      packed.width = GEN_WIDTH_4;
+
+      p->push();
+      p->curr.execWidth = 8;
+      if (simd == 16) {
+        p->MOV(unpacked, GenRegister::h2(packed));
+        p->MOV(GenRegister::offset(unpacked, 2),
+               GenRegister::h2(GenRegister::offset(packed, 0, typeSize(GEN_TYPE_UD))));
+
+        p->curr.quarterControl = 1;
+        p->MOV(GenRegister::offset(unpacked, 1), GenRegister::h2(GenRegister::offset(packed, 2)));
+        p->MOV(GenRegister::offset(unpacked, 3),
+               GenRegister::h2(GenRegister::offset(packed, 2, typeSize(GEN_TYPE_UD))));
+      } else {
+        GBE_ASSERT(simd == 8);
+        p->MOV(unpacked, GenRegister::h2(packed));
+        p->MOV(GenRegister::offset(unpacked, 1),
+               GenRegister::h2(GenRegister::offset(packed, 0, typeSize(GEN_TYPE_UD))));
+      }
+      p->pop();
+    }
+  }
+  void Gen8Context::emitRead64Instruction(const SelectionInstruction &insn)
+  {
+    const uint32_t elemNum = insn.extra.elem;
+    GBE_ASSERT(elemNum == 1);
+
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister bti = ra->genReg(insn.src(1));
+
+    /* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
+       which we cannot accept, we simply fall back to a 2-DW untyped read here. */
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_READ(dst, src, bti, 2*elemNum);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(2*elemNum));
+      unsigned desc = p->generateUntypedReadMessageDesc(0, 2*elemNum);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+      //predicated load
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), 2*elemNum);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
+
+    for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+      GenRegister long_tmp = ra->genReg(insn.dst(elemID));
+      GenRegister the_long = ra->genReg(insn.dst(elemID + elemNum));
+      this->packLongVec(long_tmp, the_long, p->curr.execWidth);
+    }
+  }
+
+  void Gen8Context::emitWrite64Instruction(const SelectionInstruction &insn)
+  {
+    const uint32_t elemNum = insn.extra.elem;
+    GBE_ASSERT(elemNum == 1);
+    const GenRegister addr = ra->genReg(insn.src(elemNum));
+    const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
+
+    /* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
+       which we cannot accept, we simply fall back to a 2-DW untyped write here. */
+    for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+      GenRegister the_long = ra->genReg(insn.src(elemID));
+      GenRegister long_tmp = ra->genReg(insn.src(elemNum + 1 + elemID));
+      this->unpackLongVec(the_long, long_tmp, p->curr.execWidth);
+    }
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_WRITE(addr, bti, elemNum*2);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(elemNum));
+      unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+      //predicated write
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
+  }
+  void Gen8Context::emitPackLongInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister dst = ra->genReg(insn.dst(0));
+
+    /* Scalar (stride-0) registers need no packing; both operands must be vectors here. */
+    GBE_ASSERT(dst.hstride != GEN_HORIZONTAL_STRIDE_0 && src.hstride != GEN_HORIZONTAL_STRIDE_0);
+    this->packLongVec(src, dst, p->curr.execWidth);
+  }
+
+  void Gen8Context::emitUnpackLongInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister dst = ra->genReg(insn.dst(0));
+
+    /* Scalar (stride-0) registers need no unpacking; both operands must be vectors here. */
+    GBE_ASSERT(dst.hstride != GEN_HORIZONTAL_STRIDE_0 && src.hstride != GEN_HORIZONTAL_STRIDE_0);
+    this->unpackLongVec(src, dst, p->curr.execWidth);
+  }
+
+  void Gen8Context::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    if (sz == 0)
+      sz = 16;
+    GBE_ASSERT(sz%4 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+
+    p->push();
+    p->curr.execWidth = 1;
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    for (int i = 0; i < sz/4; i++) {
+      uint64_t addr = (new_a0[i*4 + 3] << 16) | (new_a0[i*4 + 2]);
+      addr = addr << 32;
+      addr = addr | (new_a0[i*4 + 1] << 16) | (new_a0[i*4]);
+      p->MOV(GenRegister::retype(GenRegister::addr1(i*4), GEN_TYPE_UL), GenRegister::immuint64(addr));
+    }
+    p->pop();
+  }
+
+  void ChvContext::newSelection(void) {
+    this->sel = GBE_NEW(SelectionChv, *this);
+  }
+
+  void ChvContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                             GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
+  {
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst_h.type = dst_l.type = GEN_TYPE_UL;
+    s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;
+
+    //GenRegister tmp;
+
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    GenRegister s0h = unpacked_ud(s0l_s1h); //s0h only used before s0l_s1h, reuse s0l_s1h
+    GenRegister s1h = unpacked_ud(dst_l); //s1h only used before dst_l, reuse dst_l
+
+    p->MOV(s0h, GenRegister::offset(s0l, 0, 4));
+    p->MOV(s1h, GenRegister::offset(s1l, 0, 4));
+
+    /* High 32 bits X High 32 bits. */
+    p->MUL(dst_h, s0h, s1h);
+    /* High 32 bits X low 32 bits. */
+    p->MUL(s0h_s1l, s0h, s1l);
+    /* Low 32 bits X high 32 bits. */
+    p->MUL(s0l_s1h, s0l, s1h);
+    /* Low 32 bits X low 32 bits. */
+    p->MUL(dst_l, s0l, s1l);
+
+    /*  Because the max product s0l*s1h is (2^N - 1) * (2^N - 1) = 2^2N - 2^(N+1) + 1, with N = 32,
+        the max result of adding two 32-bit integers to it is
+        2^2N - 2^(N+1) + 1 + 2*(2^N - 1) = 2^2N - 1,
+        which means that adding dst_l's high 32 bits and then s0l_s1h's low 32 bits to the product
+        s0h_s1l cannot overflow and produces no carry.
+        In this manner, we can avoid using the acc register, which has a lot of restrictions. */
+
+    GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h);
+    p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l);
+
+    p->SHR(s0l_s1h, s0l_s1h, GenRegister::immud(32));
+    GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h);
+    p->ADD(dst_h, dst_h, s0l_s1h_h);
+
+    GenRegister dst_l_h = unpacked_ud(s0l_s1h);
+    p->MOV(dst_l_h, unpacked_ud(dst_l, 1));
+    p->ADD(s0h_s1l, s0h_s1l, dst_l_h);
+
+    // No longer need s0l_s1h
+    GenRegister tmp = s0l_s1h;
+
+    p->SHL(tmp, s0h_s1l, GenRegister::immud(32));
+    GenRegister tmp_unpacked = unpacked_ud(tmp, 1);
+    p->MOV(unpacked_ud(dst_l, 1), tmp_unpacked);
+
+    p->SHR(tmp, s0h_s1l, GenRegister::immud(32));
+    p->ADD(dst_h, dst_h, tmp);
+  }
+
+  void ChvContext::emitI64MULInstruction(const SelectionInstruction &insn)
+  {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister res = ra->genReg(insn.dst(1));
+
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst.type = GEN_TYPE_UL;
+    res.type = GEN_TYPE_UL;
+
+    /* Low 32 bits X low 32 bits. */
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    p->MUL(dst, s0l, s1l);
+
+    /* Low 32 bits X high 32 bits. */
+    GenRegister s1h = unpacked_ud(res);
+    p->MOV(s1h, unpacked_ud(src1, 1));
+
+    p->MUL(res, s0l, s1h);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+
+    /* High 32 bits X low 32 bits. */
+    GenRegister s0h = unpacked_ud(res);
+    p->MOV(s0h, unpacked_ud(src0, 1));
+
+    p->MUL(res, s0h, s1l);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+  }
+
+  void ChvContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    if (sz == 0)
+      sz = 16;
+    GBE_ASSERT(sz%4 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+
+    p->push();
+    p->curr.execWidth = 1;
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    for (int i = 0; i < sz/2; i++) {
+      p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+             GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+    }
+    p->pop();
+  }
+
 }
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index 49193f5..84508e9 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -47,8 +47,35 @@ namespace gbe
     }
     /*! Get the pointer argument size for curbe alloc */
     virtual uint32_t getPointerSize(void) { return 8; }
+    /*! Set the correct target values for the branches */
+    virtual bool patchBranches(void);
+
+    virtual void emitUnaryInstruction(const SelectionInstruction &insn);
+    virtual void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
+    virtual void emitSimdShuffleInstruction(const SelectionInstruction &insn);
+    virtual void emitBinaryInstruction(const SelectionInstruction &insn);
+    virtual void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
+    virtual void emitI64MULHIInstruction(const SelectionInstruction &insn);
+    virtual void emitI64RHADDInstruction(const SelectionInstruction &insn);
+    virtual void emitI64HADDInstruction(const SelectionInstruction &insn);
+    virtual void emitI64ShiftInstruction(const SelectionInstruction &insn);
+    virtual void emitI64CompareInstruction(const SelectionInstruction &insn);
+    virtual void emitI64SATADDInstruction(const SelectionInstruction &insn);
+    virtual void emitI64SATSUBInstruction(const SelectionInstruction &insn);
+    virtual void emitI64ToFloatInstruction(const SelectionInstruction &insn);
+    virtual void emitFloatToI64Instruction(const SelectionInstruction &insn);
+    virtual void emitI64MADSATInstruction(const SelectionInstruction &insn);
+
+    virtual void emitWrite64Instruction(const SelectionInstruction &insn);
+    virtual void emitRead64Instruction(const SelectionInstruction &insn);
+    virtual void emitI64MULInstruction(const SelectionInstruction &insn);
+    virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
+
+    virtual void emitPackLongInstruction(const SelectionInstruction &insn);
+    virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
 
   protected:
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
     virtual GenEncoder* generateEncoder(void) {
       return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
     }
@@ -56,6 +83,31 @@ namespace gbe
   private:
     virtual void emitSLMOffset(void);
     virtual void newSelection(void);
+    void packLongVec(GenRegister unpacked, GenRegister packed, uint32_t simd);
+    void unpackLongVec(GenRegister packed, GenRegister unpacked, uint32_t simd);
+    void calculateFullS64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                             GenRegister dst_l, GenRegister s0_abs, GenRegister s1_abs,
+                             GenRegister tmp0, GenRegister tmp1, GenRegister sign, GenRegister flagReg);
+    virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                           GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+  };
+
+  class ChvContext : public Gen8Context
+  {
+  public:
+    virtual ~ChvContext(void) { }
+    ChvContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+            : Gen8Context(unit, name, deviceID, relaxMath) {
+    };
+    virtual void emitI64MULInstruction(const SelectionInstruction &insn);
+
+  protected:
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+
+  private:
+    virtual void newSelection(void);
+    virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                           GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
   };
 }
 #endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index ae2d4eb..69eabb2 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -103,9 +103,7 @@ namespace gbe
   void Gen8Encoder::F32TO16(GenRegister dest, GenRegister src0) {
     MOV(GenRegister::retype(dest, GEN_TYPE_HF), GenRegister::retype(src0, GEN_TYPE_F));
   }
-
-  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
-    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+  unsigned Gen8Encoder::setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
     Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
@@ -119,11 +117,6 @@ namespace gbe
     } else
       NOT_IMPLEMENTED;
 
-    this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
-
     const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
     gen8_insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
@@ -137,11 +130,26 @@ namespace gbe
       gen8_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
     else
       NOT_SUPPORTED;
+    return gen8_insn->bits3.ud;
   }
 
-  void Gen8Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    assert(elemNum >= 1 || elemNum <= 4);
+
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+  unsigned Gen8Encoder::setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     if (this->curr.execWidth == 8) {
@@ -152,44 +160,73 @@ namespace gbe
       response_length = 2 * elemNum;
     } else
       NOT_IMPLEMENTED;
-
-    this->setHeader(insn);
-    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN75_P1_UNTYPED_READ,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
-  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+  void Gen8Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+  unsigned Gen8Encoder::setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
-    this->setHeader(insn);
     if (this->curr.execWidth == 8) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
       msg_length = 1 + elemNum;
     } else if (this->curr.execWidth == 16) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
       msg_length = 2 * (1 + elemNum);
     }
     else
       NOT_IMPLEMENTED;
-    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN75_P1_UNTYPED_SURFACE_WRITE,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
+  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 || elemNum <= 4);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
   void Gen8Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
     union { double d; unsigned u[2]; } u;
     u.d = value;
@@ -219,6 +256,10 @@ namespace gbe
     pop();
   }
 
+  void Gen8Encoder::LOAD_INT64_IMM(GenRegister dest, GenRegister value) {
+    MOV(dest, value);
+  }
+
   void Gen8Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp) {
     GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
     GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F);
@@ -331,11 +372,15 @@ namespace gbe
       gen8_insn->bits2.da1.src0_negate = reg.negation;
       gen8_insn->bits2.da1.src0_address_mode = reg.address_mode;
       if (reg.file == GEN_IMMEDIATE_VALUE) {
-        gen8_insn->bits3.ud = reg.value.ud;
-
-        /* Required to set some fields in src1 as well: */
-        gen8_insn->bits2.da1.src1_reg_file = 0; /* arf */
-        gen8_insn->bits2.da1.src1_reg_type = reg.type;
+        if (reg.type == GEN_TYPE_L || reg.type == GEN_TYPE_UL) {
+          gen8_insn->bits3.ud = (uint32_t)(reg.value.i64 >> 32);
+          gen8_insn->bits2.ud = (uint32_t)(reg.value.i64);
+        } else {
+          gen8_insn->bits3.ud = reg.value.ud;
+          /* Required to set some fields in src1 as well: */
+          gen8_insn->bits2.da1.src1_reg_file = 0; /* arf */
+          gen8_insn->bits2.da1.src1_reg_type = reg.type;
+        }
       }
       else {
         if (gen8_insn->header.access_mode == GEN_ALIGN_1) {
@@ -361,21 +406,21 @@ namespace gbe
     } else {
       gen8_insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
       gen8_insn->bits1.ia1.src0_reg_type = reg.type;
-      gen8_insn->bits2.ia1.src0_subreg_nr = 0;
-      gen8_insn->bits2.ia1.src0_indirect_offset = 0;
-      gen8_insn->bits2.ia1.src0_abs = 0;
-      gen8_insn->bits2.ia1.src0_negate = 0;
+      gen8_insn->bits2.ia1.src0_subreg_nr = reg.a0_subnr;
+      gen8_insn->bits2.ia1.src0_indirect_offset = (reg.addr_imm & 0x1ff);
+      gen8_insn->bits2.ia1.src0_abs = reg.absolute;
+      gen8_insn->bits2.ia1.src0_negate = reg.negation;
       gen8_insn->bits2.ia1.src0_address_mode = reg.address_mode;
-      gen8_insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-      gen8_insn->bits2.ia1.src0_width = GEN_WIDTH_1;
-      gen8_insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      gen8_insn->bits2.ia1.src0_horiz_stride = reg.hstride;
+      gen8_insn->bits2.ia1.src0_width = reg.width;
+      gen8_insn->bits2.ia1.src0_vert_stride = reg.vstride;
+      gen8_insn->bits2.ia1.src0_indirect_offset_9 = (reg.addr_imm & 0x02) >> 9;
     }
   }
 
   void Gen8Encoder::setSrc1(GenNativeInstruction *insn, GenRegister reg) {
     Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
     assert(reg.nr < 128);
-    assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
 
     gen8_insn->bits2.da1.src1_reg_file = reg.file;
     gen8_insn->bits2.da1.src1_reg_type = reg.type;
@@ -409,6 +454,11 @@ namespace gbe
     }
   }
 
+  bool Gen8Encoder::canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1)
+  {
+    return false;
+  }
+
 #define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
 
   void Gen8Encoder::alu3(uint32_t opcode,
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index e0d934f..504e13d 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -48,9 +48,10 @@ namespace gbe
     virtual void F32TO16(GenRegister dest, GenRegister src0);
     virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
     virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
-    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
-    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
-    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
     virtual void setHeader(GenNativeInstruction *insn);
     virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
                    uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
@@ -63,6 +64,11 @@ namespace gbe
     virtual bool disableCompact() { return true; }
     virtual void alu3(uint32_t opcode, GenRegister dst,
                        GenRegister src0, GenRegister src1, GenRegister src2);
+    virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
+                            GenRegister src1 = GenRegister::null());
+    virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+    virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
   };
 }
 #endif /* __GBE_GEN8_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index bfa86c7..5cf1032 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -232,6 +232,8 @@ union Gen8NativeInstruction
       struct {
         uint32_t uip:32;
       } gen8_branch;
+
+      uint32_t ud;
     } bits2;
 
     union {
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
new file mode 100644
index 0000000..326f5a1
--- /dev/null
+++ b/backend/src/backend/gen9_context.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen9_context.cpp
+ */
+
+#include "backend/gen9_context.hpp"
+#include "backend/gen_insn_selection.hpp"
+
+namespace gbe
+{
+  void Gen9Context::newSelection(void) {
+    this->sel = GBE_NEW(Selection9, *this);
+  }
+
+  void Gen9Context::emitBarrierInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister fenceDst = ra->genReg(insn.dst(0));
+    uint32_t barrierType = insn.extra.barrierType;
+    const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+
+    if (barrierType == ir::syncGlobalBarrier) {
+      p->FENCE(fenceDst);
+      p->MOV(fenceDst, fenceDst);
+    }
+    p->push();
+      // As only the payload.2 is used and all the other regions are ignored
+      // SIMD8 mode here is safe.
+      p->curr.execWidth = 8;
+      p->curr.physicalFlag = 0;
+      p->curr.noMask = 1;
+      // Copy barrier id from r0.
+      p->AND(src, barrierId, GenRegister::immud(0x8f000000));
+      // A barrier is OK to start the thread synchronization *and* SLM fence
+      p->BARRIER(src);
+      p->curr.execWidth = 1;
+      // Now we wait for the other threads
+      p->WAIT();
+    p->pop();
+  }
+}
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen9_context.hpp
similarity index 50%
copy from backend/src/backend/gen8_context.cpp
copy to backend/src/backend/gen9_context.hpp
index 776c92b..8acad8c 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -17,37 +17,34 @@
  */
 
 /**
- * \file gen8_context.cpp
+ * \file gen9_context.hpp
  */
+#ifndef __GBE_gen9_CONTEXT_HPP__
+#define __GBE_gen9_CONTEXT_HPP__
 
 #include "backend/gen8_context.hpp"
-#include "backend/gen8_encoder.hpp"
-#include "backend/gen_program.hpp"
-#include "backend/gen_defs.hpp"
-#include "backend/gen_encoder.hpp"
-#include "backend/gen_insn_selection.hpp"
-#include "backend/gen_insn_scheduling.hpp"
-#include "backend/gen_reg_allocation.hpp"
-#include "sys/cvar.hpp"
-#include "ir/function.hpp"
-#include "ir/value.hpp"
-#include <cstring>
+#include "backend/gen9_encoder.hpp"
 
 namespace gbe
 {
-  void Gen8Context::emitSLMOffset(void) {
-    return;
-  }
+  /* This class is used to implement the HSW
+     specific logic for context. */
+  class Gen9Context : public Gen8Context
+  {
+  public:
+    virtual ~Gen9Context(void) { };
+    Gen9Context(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+            : Gen8Context(unit, name, deviceID, relaxMath) {
+    };
+    virtual void emitBarrierInstruction(const SelectionInstruction &insn);
 
-  uint32_t Gen8Context::alignScratchSize(uint32_t size){
-    if(size == 0)
-      return 0;
-    uint32_t i = 1024;
-    while(i < size) i *= 2;
-    return i;
-  }
+  protected:
+    virtual GenEncoder* generateEncoder(void) {
+      return GBE_NEW(Gen9Encoder, this->simdWidth, 9, deviceID);
+    }
 
-  void Gen8Context::newSelection(void) {
-    this->sel = GBE_NEW(Selection8, *this);
-  }
+  private:
+    virtual void newSelection(void);
+  };
 }
+#endif /* __GBE_GEN9_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen9_encoder.cpp b/backend/src/backend/gen9_encoder.cpp
new file mode 100644
index 0000000..80df50d
--- /dev/null
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -0,0 +1,68 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+
+#include "backend/gen9_encoder.hpp"
+
+namespace gbe
+{
+  void Gen9Encoder::SAMPLE(GenRegister dest,
+                          GenRegister msg,
+                          unsigned int msg_len,
+                          bool header_present,
+                          unsigned char bti,
+                          unsigned char sampler,
+                          uint32_t simdWidth,
+                          uint32_t writemask,
+                          uint32_t return_format,
+                          bool isLD,
+                          bool isUniform)
+  {
+     if (writemask == 0) return;
+     uint32_t msg_type = isLD ? GEN_SAMPLER_MESSAGE_SIMD8_LD :
+                                GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+     uint32_t response_length = (4 * (simdWidth / 8));
+     uint32_t msg_length = (msg_len * (simdWidth / 8));
+     if (header_present)
+       msg_length++;
+     uint32_t simd_mode = (simdWidth == 16) ?
+                            GEN_SAMPLER_SIMD_MODE_SIMD16 : GEN_SAMPLER_SIMD_MODE_SIMD8;
+    if(isUniform) {
+      response_length = 1;
+      msg_type = GEN_SAMPLER_MESSAGE_SIMD4X2_LD;
+      msg_length = 1;
+      simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD8;
+    }
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, dest);
+     this->setSrc0(insn, msg);
+     setSamplerMessage(insn, bti, sampler, msg_type,
+                       response_length, msg_length,
+                       header_present,
+                       simd_mode, return_format);
+  }
+} /* End of the name space. */
diff --git a/backend/src/backend/gen9_encoder.hpp b/backend/src/backend/gen9_encoder.hpp
new file mode 100644
index 0000000..319e871
--- /dev/null
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen9_context.hpp
+ */
+#ifndef __GBE_GEN9_ENCODER_HPP__
+#define __GBE_GEN9_ENCODER_HPP__
+
+#include "backend/gen8_encoder.hpp"
+
+namespace gbe
+{
+  /* This class is used to implement the SKL
+     specific logic for encoder. */
+  class Gen9Encoder : public Gen8Encoder
+  {
+  public:
+    virtual ~Gen9Encoder(void) { }
+
+    Gen9Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
+         : Gen8Encoder(simdWidth, gen, deviceID) { }
+    /*! Send instruction for the sampler */
+    virtual void SAMPLE(GenRegister dest,
+                GenRegister msg,
+                unsigned int msg_len,
+                bool header_present,
+                unsigned char bti,
+                unsigned char sampler,
+                unsigned int simdWidth,
+                uint32_t writemask,
+                uint32_t return_format,
+                bool isLD,
+                bool isUniform);
+
+  };
+}
+#endif /* __GBE_GEN9_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 13c7664..e16b0a9 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -182,30 +182,28 @@ namespace gbe
     const uint32_t perLaneSize = kernel->getStackSize();
     const uint32_t perThreadSize = perLaneSize * this->simdWidth;
     GBE_ASSERT(perLaneSize > 0);
-    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
-    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
 
-    // Use shifts rather than muls which are limited to 32x16 bit sources
-    const uint32_t perLaneShift = logi2(perLaneSize);
-    const uint32_t perThreadShift = logi2(perThreadSize);
     const GenRegister selStatckPtr = this->simdWidth == 8 ?
       GenRegister::ud8grf(ir::ocl::stackptr) :
       GenRegister::ud16grf(ir::ocl::stackptr);
     const GenRegister stackptr = ra->genReg(selStatckPtr);
-    const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
-    const GenRegister bufferptr = ra->genReg(selStackBuffer);
 
     // We compute the per-lane stack pointer here
+    // threadId * perThreadSize + laneId*perLaneSize
+    // let private address start from zero
     p->push();
       p->curr.execWidth = 1;
       p->curr.predicate = GEN_PREDICATE_NONE;
       p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
       p->curr.execWidth = this->simdWidth;
-      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+      p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize));  //perLaneSize < 64K
       p->curr.execWidth = 1;
-      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+      if(perThreadSize > 0xffff) {
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize < 64K
+      } else
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
       p->curr.execWidth = this->simdWidth;
-      p->ADD(stackptr, stackptr, bufferptr);
       p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
     p->pop();
   }
@@ -224,6 +222,7 @@ namespace gbe
       case SEL_OP_FBH: p->FBH(dst, src); break;
       case SEL_OP_FBL: p->FBL(dst, src); break;
       case SEL_OP_CBIT: p->CBIT(dst, src); break;
+      case SEL_OP_LZD: p->LZD(dst, src); break;
       case SEL_OP_NOT: p->NOT(dst, src); break;
       case SEL_OP_RNDD: p->RNDD(dst, src); break;
       case SEL_OP_RNDU: p->RNDU(dst, src); break;
@@ -231,7 +230,7 @@ namespace gbe
       case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
       case SEL_OP_F16TO32: p->F16TO32(dst, src); break;
       case SEL_OP_F32TO16: p->F32TO16(dst, src); break;
-      case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
+      case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src); break;
       case SEL_OP_CONVI64_TO_I:
        {
         p->MOV(dst, src.bottom_half());
@@ -314,6 +313,133 @@ namespace gbe
           p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
         break;
       }
+      case SEL_OP_BSWAP: {
+        uint32_t simd = p->curr.execWidth;
+        GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
+        uint16_t new_a0[16];
+        memset(new_a0, 0, sizeof(new_a0));
+
+        GBE_ASSERT(src.type == dst.type);
+        uint32_t start_addr = src.nr*32 + src.subnr;
+
+        if (simd == 1) {
+          GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
+              && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 3;
+            new_a0[1] = start_addr + 2;
+            new_a0[2] = start_addr + 1;
+            new_a0[3] = start_addr;
+            this->setA0Content(new_a0, 0, 4);
+
+            p->push();
+            p->curr.execWidth = 4;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
+            GenRegister dst_ = dst;
+            dst_.type = GEN_TYPE_UB;
+            dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
+            dst_.width = GEN_WIDTH_4;
+            dst_.vstride = GEN_VERTICAL_STRIDE_4;
+            p->MOV(dst_, ind_src);
+            p->pop();
+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+            p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
+                GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
+            p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
+                GenRegister::retype(src, GEN_TYPE_UB));
+          } else {
+            GBE_ASSERT(0);
+          }
+        } else {
+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+            bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+            GBE_ASSERT(uniform_src || src.subnr == 0);
+            GBE_ASSERT(dst.subnr == 0);
+            GBE_ASSERT(tmp.subnr == 0);
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 3;
+            new_a0[1] = start_addr + 2;
+            new_a0[2] = start_addr + 1;
+            new_a0[3] = start_addr;
+            if (!uniform_src) {
+              new_a0[4] = start_addr + 7;
+              new_a0[5] = start_addr + 6;
+              new_a0[6] = start_addr + 5;
+              new_a0[7] = start_addr + 4;
+            } else {
+              new_a0[4] = start_addr + 3;
+              new_a0[5] = start_addr + 2;
+              new_a0[6] = start_addr + 1;
+              new_a0[7] = start_addr;
+            }
+            this->setA0Content(new_a0, 56);
+
+            p->push();
+            p->curr.execWidth = 8;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+            for (int i = 1; i < 4; i++) {
+              ind_src.addr_imm += 8;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
+            }
+            if (simd == 16) {
+              for (int i = 0; i < 4; i++) {
+                ind_src.addr_imm += 8;
+                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src);
+              }
+            }
+            p->pop();
+
+            p->MOV(dst, tmp);
+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+            bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+            GBE_ASSERT(uniform_src || src.subnr == 0 || src.subnr == 16);
+            GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
+            GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 1;
+            new_a0[1] = start_addr;
+            if (!uniform_src) {
+              new_a0[2] = start_addr + 3;
+              new_a0[3] = start_addr + 2;
+              new_a0[4] = start_addr + 5;
+              new_a0[5] = start_addr + 4;
+              new_a0[6] = start_addr + 7;
+              new_a0[7] = start_addr + 6;
+            } else {
+              new_a0[2] = start_addr + 1;
+              new_a0[3] = start_addr;
+              new_a0[4] = start_addr + 1;
+              new_a0[5] = start_addr;
+              new_a0[6] = start_addr + 1;
+              new_a0[7] = start_addr;
+            }
+            this->setA0Content(new_a0, 56);
+
+            p->push();
+            p->curr.execWidth = 8;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB), new_a0[0], 0);
+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+            for (int i = 1; i < (simd == 8 ? 2 : 4); i++) {
+              ind_src.addr_imm += 8;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
+            }
+            p->pop();
+
+            p->MOV(dst, tmp);
+          } else {
+            GBE_ASSERT(0);
+          }
+        }
+      }
+      break;
       default:
         NOT_IMPLEMENTED;
     }
@@ -419,6 +545,42 @@ namespace gbe
     }
   }
 
+  void GenContext::emitSimdShuffleInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
+    assert(insn.opcode == SEL_OP_SIMD_SHUFFLE);
+
+    uint32_t simd = p->curr.execWidth;
+    if (src1.file == GEN_IMMEDIATE_VALUE) {
+      uint32_t offset = src1.value.ud % simd;
+      GenRegister reg = GenRegister::suboffset(src0, offset);
+      p->MOV(dst, GenRegister::retype(GenRegister::ud1grf(reg.nr, reg.subnr / typeSize(reg.type)), reg.type));
+    } else {
+      uint32_t base = src0.nr * 32 + src0.subnr * 4;
+      GenRegister baseReg = GenRegister::immuw(base);
+      const GenRegister a0 = GenRegister::addr8(0);
+
+      p->push();
+        if (simd == 8) {
+          p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+          GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+          p->MOV(dst, indirect);
+        } else if (simd == 16) {
+          p->curr.execWidth = 8;
+          p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+          GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+          p->MOV(dst, indirect);
+
+          p->curr.quarterControl = 1;
+          p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+          p->MOV(GenRegister::offset(dst, 1, 0), indirect);
+        } else
+          NOT_IMPLEMENTED;
+      p->pop();
+    }
+  }
+
   void GenContext::emitBinaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src0 = ra->genReg(insn.src(0));
@@ -1561,27 +1723,55 @@ namespace gbe
     const GenRegister src = ra->genReg(insn.src(0));
     const GenRegister dst = ra->genReg(insn.dst(0));
     const uint32_t function = insn.extra.function;
-    const uint32_t bti = insn.getbti();
+    unsigned srcNum = insn.extra.elem;
+
+    const GenRegister bti = ra->genReg(insn.src(srcNum));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->ATOMIC(dst, function, src, bti, srcNum);
+    } else {
+      GenRegister flagTemp = ra->genReg(insn.dst(1));
+
+      unsigned desc = p->generateAtomicMessageDesc(function, 0, srcNum);
 
-    p->ATOMIC(dst, function, src, bti, insn.srcNum);
+      unsigned jip0 = beforeMessage(insn, bti, flagTemp, desc);
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->ATOMIC(dst, function, src, GenRegister::addr1(0), srcNum);
+      p->pop();
+      afterMessage(insn, bti, flagTemp, jip0);
+    }
   }
 
   void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
-    GenRegister src = ra->genReg(insn.src(0));
-    if(sel->isScalarReg(src.reg()))
-      src = GenRegister::retype(src, GEN_TYPE_UW);
-    else
-      src = GenRegister::unpacked_uw(src.nr, src.subnr / typeSize(GEN_TYPE_UW));
+    GenRegister baseReg = ra->genReg(insn.src(0));
+    GenRegister offset = ra->genReg(insn.src(1));
+    uint32_t immoffset = insn.extra.indirect_offset;
 
     const GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister tmp = ra->genReg(insn.dst(1));
     const GenRegister a0 = GenRegister::addr8(0);
     uint32_t simdWidth = p->curr.execWidth;
+    GenRegister indirect_src;
+
+    if(sel->isScalarReg(offset.reg()))
+      offset = GenRegister::retype(offset, GEN_TYPE_UW);
+    else
+      offset = GenRegister::unpacked_uw(offset.nr, offset.subnr / typeSize(GEN_TYPE_UW));
+    uint32_t baseRegOffset = GenRegister::grfOffset(baseReg);
+    //There is a restrict that: lower 5 bits indirect reg SubRegNum and
+    //the lower 5 bits of indirect imm SubRegNum cannot exceed 5 bits.
+    //So can't use AddrImm field, need a add.
+    p->ADD(tmp, offset, GenRegister::immuw(baseRegOffset + immoffset));
+    indirect_src = GenRegister::indirect(dst.type, 0, GEN_WIDTH_1,
+                                         GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL, GEN_HORIZONTAL_STRIDE_0);
 
     p->push();
       p->curr.execWidth = 8;
       p->curr.quarterControl = GEN_COMPRESSION_Q1;
-      p->MOV(a0, src);
-      p->MOV(dst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+      p->MOV(a0, tmp);
+      p->MOV(dst, indirect_src);
     p->pop();
 
     if (simdWidth == 16) {
@@ -1590,9 +1780,9 @@ namespace gbe
         p->curr.quarterControl = GEN_COMPRESSION_Q2;
 
         const GenRegister nextDst = GenRegister::Qn(dst, 1);
-        const GenRegister nextSrc = GenRegister::Qn(src, 1);
-        p->MOV(a0, nextSrc);
-        p->MOV(nextDst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+        const GenRegister nextOffset = GenRegister::Qn(tmp, 1);
+        p->MOV(a0, nextOffset);
+        p->MOV(nextDst, indirect_src);
       p->pop();
     }
   }
@@ -1683,48 +1873,188 @@ namespace gbe
   }
 
   void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
-    const uint32_t elemNum = insn.extra.elem;
+    const uint32_t elemNum = insn.extra.elem * 2;
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
-    p->UNTYPED_READ(dst, src, bti, elemNum*2);
+    const GenRegister bti = ra->genReg(insn.src(1));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_READ(dst, src, bti, elemNum);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(elemNum));
+      unsigned desc = p->generateUntypedReadMessageDesc(0, elemNum);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+      //predicated load
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
+  }
+  unsigned GenContext::beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned desc) {
+      const GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+      setFlag(flagReg, GenRegister::immuw(0));
+      p->CMP(GEN_CONDITIONAL_NZ, flagReg, GenRegister::immuw(1));
+
+      GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil));
+      GenRegister btiUW = ra->genReg(GenRegister::uw1grf(ir::ocl::btiUtil));
+      GenRegister btiUB = ra->genReg(GenRegister::ub1grf(ir::ocl::btiUtil));
+      unsigned jip0 = p->n_instruction();
+      p->push();
+        p->curr.execWidth = 1;
+        p->curr.noMask = 1;
+        p->AND(btiUD, flagReg, GenRegister::immud(0xffffffff));
+        p->LZD(btiUD, btiUD);
+        p->ADD(btiUW, GenRegister::negate(btiUW), GenRegister::immuw(0x1f));
+        p->MUL(btiUW, btiUW, GenRegister::immuw(0x4));
+        p->ADD(GenRegister::addr1(0), btiUW, GenRegister::immud(bti.nr*32));
+        p->MOV(btiUD, GenRegister::indirect(GEN_TYPE_UD, 0, GEN_WIDTH_1, GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL, GEN_HORIZONTAL_STRIDE_0));
+        //save flag
+        p->MOV(tmp, flagReg);
+      p->pop();
+
+      p->CMP(GEN_CONDITIONAL_Z, bti, btiUD);
+      p->push();
+        p->curr.execWidth = 1;
+        p->curr.noMask = 1;
+        p->OR(GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), btiUB, GenRegister::immud(desc));
+      p->pop();
+      return jip0;
+  }
+  void GenContext::afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned jip0) {
+    const GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil));
+      //restore flag
+      setFlag(GenRegister::flag(insn.state.flag, insn.state.subFlag), tmp);
+      // get active channel
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->CMP(GEN_CONDITIONAL_NZ, bti, btiUD);
+        unsigned jip1 = p->n_instruction();
+        p->WHILE(GenRegister::immud(0));
+      p->pop();
+      p->patchJMPI(jip1, jip0 - jip1, 0);
   }
 
   void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
+    const GenRegister bti = ra->genReg(insn.src(1));
+
     const uint32_t elemNum = insn.extra.elem;
-    p->UNTYPED_READ(dst, src, bti, elemNum);
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_READ(dst, src, bti, elemNum);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(elemNum));
+      unsigned desc = p->generateUntypedReadMessageDesc(0, elemNum);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+      //predicated load
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
   }
 
   void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.dst(0));
     const uint32_t elemNum = insn.extra.elem;
-    const uint32_t bti = insn.getbti();
-    p->UNTYPED_WRITE(src, bti, elemNum*2);
+    const GenRegister bti = ra->genReg(insn.src(elemNum+1));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_WRITE(src, bti, elemNum*2);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(0));
+      unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+      //predicated load
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum*2);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
   }
 
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
     const uint32_t elemNum = insn.extra.elem;
-    p->UNTYPED_WRITE(src, bti, elemNum);
+    const GenRegister bti = ra->genReg(insn.src(elemNum+1));
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->UNTYPED_WRITE(src, bti, elemNum);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(0));
+      unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+      //predicated load
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
   }
 
   void GenContext::emitByteGatherInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
+    const GenRegister bti = ra->genReg(insn.src(1));
     const uint32_t elemSize = insn.extra.elem;
-    p->BYTE_GATHER(dst, src, bti, elemSize);
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->BYTE_GATHER(dst, src, bti, elemSize);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(1));
+      unsigned desc = p->generateByteGatherMessageDesc(0, elemSize);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+      //predicated load
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->BYTE_GATHER(dst, src, GenRegister::addr1(0), elemSize);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
   }
 
   void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.getbti();
     const uint32_t elemSize = insn.extra.elem;
-    p->BYTE_SCATTER(src, bti, elemSize);
+    const GenRegister bti = ra->genReg(insn.src(2));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      p->BYTE_SCATTER(src, bti, elemSize);
+    } else {
+      const GenRegister tmp = ra->genReg(insn.dst(0));
+      unsigned desc = p->generateByteScatterMessageDesc(0, elemSize);
+
+      unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+      //predicated load
+      p->push();
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+        p->BYTE_SCATTER(src, GenRegister::addr1(0), elemSize);
+      p->pop();
+      afterMessage(insn, bti, tmp, jip0);
+    }
+
   }
 
   void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
@@ -1756,6 +2086,14 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::emitUnpackLongInstruction(const SelectionInstruction &insn) {
+    GBE_ASSERT(0);
+  }
+
+  void GenContext::emitPackLongInstruction(const SelectionInstruction &insn) {
+    GBE_ASSERT(0);
+  }
+
   void GenContext::emitDWordGatherInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
@@ -1810,6 +2148,23 @@ namespace gbe
     p->TYPED_WRITE(header, true, bti);
   }
 
+  void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    if (sz == 0)
+      sz = 8;
+    GBE_ASSERT(sz%4 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+
+    p->push();
+    p->curr.execWidth = 1;
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    for (int i = 0; i < sz/2; i++) {
+      p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+             GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+    }
+    p->pop();
+  }
+
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
   BVAR(OCL_OUTPUT_ASM, false);
 
@@ -1835,6 +2190,7 @@ namespace gbe
     allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
     allocCurbeReg(zero, GBE_CURBE_ZERO);
     allocCurbeReg(one, GBE_CURBE_ONE);
+    allocCurbeReg(btiUtil, GBE_CURBE_BTI_UTIL);
     if (stackUse.size() != 0)
       allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
     // Go over the arguments and find the related patch locations
@@ -1860,9 +2216,14 @@ namespace gbe
       if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
       allocCurbeReg(reg, GBE_CURBE_##PATCH); \
     } else
-  
+
+    bool needLaneID = false;
     fn.foreachInstruction([&](ir::Instruction &insn) {
       const uint32_t srcNum = insn.getSrcNum();
+      if (insn.getOpcode() == ir::OP_SIMD_ID) {
+        GBE_ASSERT(srcNum == 0);
+        needLaneID = true;
+      }
       for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
         const ir::Register reg = insn.getSrc(srcID);
         if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
@@ -1901,6 +2262,8 @@ namespace gbe
     });
 #undef INSERT_REG
 
+    if (needLaneID)
+      allocCurbeReg(laneid, GBE_CURBE_LANE_ID);
 
     // After this point the vector is immutable. Sorting it will make
     // research faster
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 45347b9..69fe513 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -90,7 +90,7 @@ namespace gbe
     /*! Emit the instructions */
     void emitInstructionStream(void);
     /*! Set the correct target values for the branches */
-    bool patchBranches(void);
+    virtual bool patchBranches(void);
     /*! Forward ir::Function isSpecialReg method */
     INLINE bool isSpecialReg(ir::Register reg) const {
       return fn.isSpecialReg(reg);
@@ -124,32 +124,33 @@ namespace gbe
 
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
-    void emitUnaryInstruction(const SelectionInstruction &insn);
-    void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
-    void emitBinaryInstruction(const SelectionInstruction &insn);
-    void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
+    virtual void emitUnaryInstruction(const SelectionInstruction &insn);
+    virtual void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
+    virtual void emitBinaryInstruction(const SelectionInstruction &insn);
+    virtual void emitSimdShuffleInstruction(const SelectionInstruction &insn);
+    virtual void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
     void emitTernaryInstruction(const SelectionInstruction &insn);
-    void emitI64MULHIInstruction(const SelectionInstruction &insn);
-    void emitI64MADSATInstruction(const SelectionInstruction &insn);
-    void emitI64HADDInstruction(const SelectionInstruction &insn);
-    void emitI64RHADDInstruction(const SelectionInstruction &insn);
-    void emitI64ShiftInstruction(const SelectionInstruction &insn);
-    void emitI64CompareInstruction(const SelectionInstruction &insn);
-    void emitI64SATADDInstruction(const SelectionInstruction &insn);
-    void emitI64SATSUBInstruction(const SelectionInstruction &insn);
-    void emitI64ToFloatInstruction(const SelectionInstruction &insn);
-    void emitFloatToI64Instruction(const SelectionInstruction &insn);
+    virtual void emitI64MULHIInstruction(const SelectionInstruction &insn);
+    virtual void emitI64MADSATInstruction(const SelectionInstruction &insn);
+    virtual void emitI64HADDInstruction(const SelectionInstruction &insn);
+    virtual void emitI64RHADDInstruction(const SelectionInstruction &insn);
+    virtual void emitI64ShiftInstruction(const SelectionInstruction &insn);
+    virtual void emitI64CompareInstruction(const SelectionInstruction &insn);
+    virtual void emitI64SATADDInstruction(const SelectionInstruction &insn);
+    virtual void emitI64SATSUBInstruction(const SelectionInstruction &insn);
+    virtual void emitI64ToFloatInstruction(const SelectionInstruction &insn);
+    virtual void emitFloatToI64Instruction(const SelectionInstruction &insn);
     void emitCompareInstruction(const SelectionInstruction &insn);
     void emitJumpInstruction(const SelectionInstruction &insn);
     void emitIndirectMoveInstruction(const SelectionInstruction &insn);
     void emitEotInstruction(const SelectionInstruction &insn);
     void emitNoOpInstruction(const SelectionInstruction &insn);
     void emitWaitInstruction(const SelectionInstruction &insn);
-    void emitBarrierInstruction(const SelectionInstruction &insn);
+    virtual void emitBarrierInstruction(const SelectionInstruction &insn);
     void emitFenceInstruction(const SelectionInstruction &insn);
     void emitMathInstruction(const SelectionInstruction &insn);
-    void emitRead64Instruction(const SelectionInstruction &insn);
-    void emitWrite64Instruction(const SelectionInstruction &insn);
+    virtual void emitRead64Instruction(const SelectionInstruction &insn);
+    virtual void emitWrite64Instruction(const SelectionInstruction &insn);
     void emitUntypedReadInstruction(const SelectionInstruction &insn);
     void emitUntypedWriteInstruction(const SelectionInstruction &insn);
     void emitAtomicInstruction(const SelectionInstruction &insn);
@@ -157,16 +158,20 @@ namespace gbe
     void emitByteScatterInstruction(const SelectionInstruction &insn);
     void emitPackByteInstruction(const SelectionInstruction &insn);
     void emitUnpackByteInstruction(const SelectionInstruction &insn);
+    virtual void emitPackLongInstruction(const SelectionInstruction &insn);
+    virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
     void emitDWordGatherInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
     void emitSpillRegInstruction(const SelectionInstruction &insn);
     void emitUnSpillRegInstruction(const SelectionInstruction &insn);
     void emitGetImageInfoInstruction(const SelectionInstruction &insn);
-    void emitI64MULInstruction(const SelectionInstruction &insn);
-    void emitI64DIVREMInstruction(const SelectionInstruction &insn);
+    virtual void emitI64MULInstruction(const SelectionInstruction &insn);
+    virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
     void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
     void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+    unsigned beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned desc);
+    void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned jip0);
 
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
@@ -206,6 +211,8 @@ namespace gbe
     /*! allocate a new curbe register and insert to curbe pool. */
     void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
 
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+
   private:
     CompileErrorCode errCode;
     bool ifEndifFix;
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index cd6b7c8..1ca148c 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -252,6 +252,8 @@ enum GenMessageTarget {
 #define GEN_TYPE_UL  8
 #define GEN_TYPE_L   9
 #define GEN_TYPE_HF  10
+#define GEN_TYPE_DF_IMM  10 /* For the double float in imm. */
+#define GEN_TYPE_HF_IMM  11 /* For the half float in imm. */
 
 #define GEN_ARF_NULL                  0x00
 #define GEN_ARF_ADDRESS               0x10
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index bbf1472..cac29e8 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -67,6 +67,26 @@ namespace gbe
       return false;
   }
 
+  INLINE bool isVectorOfLongs(GenRegister reg) {
+    if (reg.hstride != GEN_HORIZONTAL_STRIDE_0 &&
+        (reg.type == GEN_TYPE_UL || reg.type == GEN_TYPE_L))
+      return true;
+    else
+      return false;
+  }
+
+  INLINE bool isCrossMoreThan2(GenRegister reg) {
+    if (reg.hstride == GEN_HORIZONTAL_STRIDE_0)
+      return false;
+
+    const uint32_t typeSz = typeSize(reg.type);
+    const uint32_t horizontal = stride(reg.hstride);
+    if (horizontal * typeSz * 16 > GEN_REG_SIZE * 2) {
+      return true;
+    }
+    return false;
+  }
+
   INLINE bool isSrcDstDiffSpan(GenRegister dst, GenRegister src) {
     if (src.hstride == GEN_HORIZONTAL_STRIDE_0) return false;
 
@@ -103,36 +123,75 @@ namespace gbe
   }
 
   INLINE bool needToSplitAlu1(GenEncoder *p, GenRegister dst, GenRegister src) {
-    if (p->curr.execWidth != 16 || src.hstride == GEN_HORIZONTAL_STRIDE_0) return false;
+    if (p->curr.execWidth != 16) return false;
+    if (isVectorOfLongs(dst) == true) return true;
+    if (isCrossMoreThan2(dst) == true) return true;
+
+    if (src.hstride == GEN_HORIZONTAL_STRIDE_0) return false;
+
+    if (isCrossMoreThan2(src) == true) return true;
+    if (isVectorOfLongs(src) == true) return true;
+
+    if (isSrcDstDiffSpan(dst, src) == true) return true;
+
+    if (isVectorOfBytes(dst) == true &&
+        ((isVectorOfBytes(src) == true && src.hstride == dst.hstride)
+          || src.hstride == GEN_HORIZONTAL_STRIDE_0))
+      return false;
     if (isVectorOfBytes(dst) == true) return true;
     if (isVectorOfBytes(src) == true) return true;
-    if (isSrcDstDiffSpan(dst, src)) return true;
     return false;
   }
 
   INLINE bool needToSplitAlu2(GenEncoder *p, GenRegister dst, GenRegister src0, GenRegister src1) {
-    if (p->curr.execWidth != 16 ||
-         (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
-          src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+    if (p->curr.execWidth != 16) return false;
+    if (isVectorOfLongs(dst) == true) return true;
+    if (isCrossMoreThan2(dst) == true) return true;
+
+    if (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+		src1.hstride == GEN_HORIZONTAL_STRIDE_0)
       return false;
 
-    if (isSrcDstDiffSpan(dst, src0)) return true;
-    if (isSrcDstDiffSpan(dst, src1)) return true;
-    if (isVectorOfBytes(dst) == true) return true;
+    if (isVectorOfLongs(src0) == true) return true;
+    if (isVectorOfLongs(src1) == true) return true;
+    if (isCrossMoreThan2(src0) == true) return true;
+    if (isCrossMoreThan2(src1) == true) return true;
+
+    if (isSrcDstDiffSpan(dst, src0) == true) return true;
+    if (isSrcDstDiffSpan(dst, src1) == true) return true;
+
+    if (isVectorOfBytes(dst) == true &&
+        ((isVectorOfBytes(src0) == true && src0.hstride == dst.hstride) ||
+         src0.hstride == GEN_HORIZONTAL_STRIDE_0) &&
+        ((isVectorOfBytes(src1) == true && src1.hstride == dst.hstride) ||
+         src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+      return false;
+    if (isVectorOfBytes(dst) == true ) return true;
     if (isVectorOfBytes(src0) == true) return true;
     if (isVectorOfBytes(src1) == true) return true;
     return false;
   }
 
   INLINE bool needToSplitCmp(GenEncoder *p, GenRegister src0, GenRegister src1, GenRegister dst) {
-    if (p->curr.execWidth != 16 ||
-         (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
-          src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+    if (p->curr.execWidth != 16) return false;
+    if (isVectorOfLongs(dst) == true) return true;
+    if (isCrossMoreThan2(dst) == true) return true;
+
+    if (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+            src1.hstride == GEN_HORIZONTAL_STRIDE_0)
       return false;
-    if (isSrcDstDiffSpan(dst, src0)) return true;
-    if (isSrcDstDiffSpan(dst, src1)) return true;
+
     if (isVectorOfBytes(src0) == true) return true;
     if (isVectorOfBytes(src1) == true) return true;
+
+    if (isVectorOfLongs(src0) == true) return true;
+    if (isVectorOfLongs(src1) == true) return true;
+    if (isCrossMoreThan2(src0) == true) return true;
+    if (isCrossMoreThan2(src1) == true) return true;
+
+    if (isSrcDstDiffSpan(dst, src0) == true) return true;
+    if (isSrcDstDiffSpan(dst, src1) == true) return true;
+
     if (src0.type == GEN_TYPE_D || src0.type == GEN_TYPE_UD || src0.type == GEN_TYPE_F)
       return true;
     if (src1.type == GEN_TYPE_D || src1.type == GEN_TYPE_UD || src1.type == GEN_TYPE_F)
@@ -218,25 +277,6 @@ namespace gbe
   }
 #endif
 
-  static void setSamplerMessage(GenEncoder *p,
-                                GenNativeInstruction *insn,
-                                unsigned char bti,
-                                unsigned char sampler,
-                                uint32_t msg_type,
-                                uint32_t response_length,
-                                uint32_t msg_length,
-                                bool header_present,
-                                uint32_t simd_mode,
-                                uint32_t return_format)
-  {
-     const GenMessageTarget sfid = GEN_SFID_SAMPLER;
-     p->setMessageDescriptor(insn, sfid, msg_length, response_length);
-     insn->bits3.sampler_gen7.bti = bti;
-     insn->bits3.sampler_gen7.sampler = sampler;
-     insn->bits3.sampler_gen7.msg_type = msg_type;
-     insn->bits3.sampler_gen7.simd_mode = simd_mode;
-  }
-
   static void setDWordScatterMessgae(GenEncoder *p,
                                      GenNativeInstruction *insn,
                                      uint32_t bti,
@@ -289,10 +329,13 @@ namespace gbe
     GEN_UNTYPED_ALPHA,
     0
   };
+  unsigned GenEncoder::generateUntypedReadMessageDesc(unsigned bti, unsigned elemNum) {
+    GenNativeInstruction insn;
+    memset(&insn, 0, sizeof(GenNativeInstruction));
+    return setUntypedReadMessageDesc(&insn, bti, elemNum);
+  }
 
-  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
-    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    assert(elemNum >= 1 || elemNum <= 4);
+  unsigned GenEncoder::setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     if (this->curr.execWidth == 8) {
@@ -300,49 +343,88 @@ namespace gbe
       response_length = elemNum;
     } else if (this->curr.execWidth == 16) {
       msg_length = 2;
-      response_length = 2*elemNum;
+      response_length = 2 * elemNum;
     } else
       NOT_IMPLEMENTED;
-
-    this->setHeader(insn);
-    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN7_UNTYPED_READ,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
-  void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+  unsigned GenEncoder::generateUntypedWriteMessageDesc(unsigned bti, unsigned elemNum) {
+    GenNativeInstruction insn;
+    memset(&insn, 0, sizeof(GenNativeInstruction));
+    return setUntypedWriteMessageDesc(&insn, bti, elemNum);
+  }
+
+  unsigned GenEncoder::setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
-    this->setHeader(insn);
     if (this->curr.execWidth == 8) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
-      msg_length = 1+elemNum;
+      msg_length = 1 + elemNum;
     } else if (this->curr.execWidth == 16) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
-      msg_length = 2*(1+elemNum);
+      msg_length = 2 * (1 + elemNum);
     }
     else
       NOT_IMPLEMENTED;
-    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
                    GEN7_UNTYPED_WRITE,
                    msg_length,
                    response_length);
+    return insn->bits3.ud;
   }
 
-  void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize) {
+  void GenEncoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 || elemNum <= 4);
+    this->setHeader(insn);
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+  unsigned GenEncoder::generateByteGatherMessageDesc(unsigned bti, unsigned elemSize) {
+    GenNativeInstruction insn;
+    memset(&insn, 0, sizeof(GenNativeInstruction));
+    return setByteGatherMessageDesc(&insn, bti, elemSize);
+  }
+
+  unsigned GenEncoder::setByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     if (this->curr.execWidth == 8) {
@@ -353,11 +435,6 @@ namespace gbe
       response_length = 2;
     } else
       NOT_IMPLEMENTED;
-
-    this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
     setDPByteScatterGather(this,
                            insn,
                            bti,
@@ -365,23 +442,42 @@ namespace gbe
                            GEN7_BYTE_GATHER,
                            msg_length,
                            response_length);
+    return insn->bits3.ud;
+
   }
 
-  void GenEncoder::BYTE_SCATTER(GenRegister msg, uint32_t bti, uint32_t elemSize) {
+  void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setByteGatherMessageDesc(insn, bti.value.ud, elemSize);
+    } else {
+      this->setSrc1(insn, bti);
+    }
+  }
+
+  unsigned GenEncoder::generateByteScatterMessageDesc(unsigned bti, unsigned elemSize) {
+    GenNativeInstruction insn;
+    memset(&insn, 0, sizeof(GenNativeInstruction));
+    return setByteScatterMessageDesc(&insn, bti, elemSize);
+  }
+
+  unsigned GenEncoder::setByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
-    this->setHeader(insn);
     if (this->curr.execWidth == 8) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
       msg_length = 2;
     } else if (this->curr.execWidth == 16) {
-      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
       msg_length = 4;
     } else
       NOT_IMPLEMENTED;
-    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
+
     setDPByteScatterGather(this,
                            insn,
                            bti,
@@ -389,6 +485,30 @@ namespace gbe
                            GEN7_BYTE_SCATTER,
                            msg_length,
                            response_length);
+    return insn->bits3.ud;
+  }
+
+  void GenEncoder::BYTE_SCATTER(GenRegister msg, GenRegister bti, uint32_t elemSize) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setByteScatterMessageDesc(insn, bti.value.ud, elemSize);
+    } else {
+      this->setSrc1(insn, bti);
+    }
   }
 
   void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
@@ -421,8 +541,13 @@ namespace gbe
 
   }
 
-  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
-    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+  unsigned GenEncoder::generateAtomicMessageDesc(unsigned function, unsigned bti, unsigned srcNum) {
+    GenNativeInstruction insn;
+    memset(&insn, 0, sizeof(GenNativeInstruction));
+    return setAtomicMessageDesc(&insn, function, bti, srcNum);
+  }
+
+  unsigned GenEncoder::setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
 
@@ -430,16 +555,11 @@ namespace gbe
       msg_length = srcNum;
       response_length = 1;
     } else if (this->curr.execWidth == 16) {
-      msg_length = 2*srcNum;
+      msg_length = 2 * srcNum;
       response_length = 2;
     } else
       NOT_IMPLEMENTED;
 
-    this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    this->setSrc1(insn, GenRegister::immud(0));
-
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_atomic_op.msg_type = GEN7_UNTYPED_ATOMIC_READ;
@@ -453,7 +573,23 @@ namespace gbe
       insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
     else
       NOT_SUPPORTED;
+    return insn->bits3.ud;
+  }
+
+  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
 
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    if (bti.file == GEN_IMMEDIATE_VALUE) {
+      this->setSrc1(insn, GenRegister::immud(0));
+      setAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
+    } else {
+      this->setSrc1(insn, bti);
+    }
   }
   GenCompactInstruction *GenEncoder::nextCompact(uint32_t opcode) {
     GenCompactInstruction insn;
@@ -472,6 +608,14 @@ namespace gbe
      return (GenNativeInstruction *)(&this->store.back()-1);
   }
 
+  bool GenEncoder::canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1)
+  {
+	/* By now, just alu1 insn will come to here. So just MOV */
+    this->MOV(dst.bottom_half(), src0.bottom_half());
+    this->MOV(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth));
+    return true;
+  }
+
   INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
                             GenRegister src0, GenRegister src1 = GenRegister::null()) {
        int w = p->curr.execWidth;
@@ -521,9 +665,9 @@ namespace gbe
             GenRegister src, uint32_t condition) {
      if (dst.isdf() && src.isdf()) {
        handleDouble(p, opcode, dst, src);
-     } else if (dst.isint64() && src.isint64()) { // handle int64
-       p->MOV(dst.bottom_half(), src.bottom_half());
-       p->MOV(dst.top_half(p->simdWidth), src.top_half(p->simdWidth));
+     } else if (dst.isint64() && src.isint64()
+                && p->canHandleLong(opcode, dst, src)) { // handle int64
+       return;
      } else if (needToSplitAlu1(p, dst, src) == false) {
       if(compactAlu1(p, opcode, dst, src, condition, false))
         return;
@@ -653,8 +797,8 @@ namespace gbe
     pop();
   }
 
-  void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
-    GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
+  void GenEncoder::LOAD_INT64_IMM(GenRegister dest, GenRegister value) {
+    GenRegister u0 = GenRegister::immd((int)value.value.i64), u1 = GenRegister::immd(value.value.i64 >> 32);
     MOV(dest.bottom_half(), u0);
     MOV(dest.top_half(this->simdWidth), u1);
   }
@@ -845,6 +989,8 @@ namespace gbe
   ALU2_BRA(BRD)
   ALU2_BRA(BRC)
 
+  // jip is the distance between jump instruction and jump-target. we have handled
+  // pre/post-increment in patchJMPI() function body
   void GenEncoder::patchJMPI(uint32_t insnID, int32_t jip, int32_t uip) {
     GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
     GBE_ASSERT(insnID < this->store.size());
@@ -1030,6 +1176,24 @@ namespace gbe
      this->setSrc0(insn, src);
   }
 
+  void GenEncoder::setSamplerMessage(GenNativeInstruction *insn,
+                                unsigned char bti,
+                                unsigned char sampler,
+                                uint32_t msg_type,
+                                uint32_t response_length,
+                                uint32_t msg_length,
+                                bool header_present,
+                                uint32_t simd_mode,
+                                uint32_t return_format)
+  {
+     const GenMessageTarget sfid = GEN_SFID_SAMPLER;
+     setMessageDescriptor(insn, sfid, msg_length, response_length);
+     insn->bits3.sampler_gen7.bti = bti;
+     insn->bits3.sampler_gen7.sampler = sampler;
+     insn->bits3.sampler_gen7.msg_type = msg_type;
+     insn->bits3.sampler_gen7.simd_mode = simd_mode;
+  }
+
   void GenEncoder::SAMPLE(GenRegister dest,
                           GenRegister msg,
                           unsigned int msg_len,
@@ -1061,7 +1225,7 @@ namespace gbe
      this->setHeader(insn);
      this->setDst(insn, dest);
      this->setSrc0(insn, msg);
-     setSamplerMessage(this, insn, bti, sampler, msg_type,
+     setSamplerMessage(insn, bti, sampler, msg_type,
                        response_length, msg_length,
                        header_present,
                        simd_mode, return_format);
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 9343581..79e7b6e 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -139,7 +139,7 @@ namespace gbe
     virtual int getDoubleExecWidth(void) = 0;
     virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
     virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
-    void LOAD_INT64_IMM(GenRegister dest, int64_t value);
+    virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
     /*! Barrier message (to synchronize threads of a workgroup) */
     void BARRIER(GenRegister src);
     /*! Memory fence message (to order loads and stores between threads) */
@@ -169,15 +169,15 @@ namespace gbe
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
     /*! Atomic instructions */
-    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
     /*! Untyped read (upto 4 channels) */
-    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
-    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
     /*! Byte gather (for unaligned bytes, shorts and ints) */
-    void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
+    void BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
-    void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+    void BYTE_SCATTER(GenRegister src, GenRegister bti, uint32_t elemSize);
     /*! DWord gather (for constant cache read) */
     void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
     /*! for scratch memory read */
@@ -185,7 +185,7 @@ namespace gbe
     /*! for scratch memory write */
     void SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode);
     /*! Send instruction for the sampler */
-    void SAMPLE(GenRegister dest,
+    virtual void SAMPLE(GenRegister dest,
                 GenRegister msg,
                 unsigned int msg_len,
                 bool header_present,
@@ -196,6 +196,15 @@ namespace gbe
                 uint32_t return_format,
                 bool isLD,
                 bool isUniform);
+    void setSamplerMessage(GenNativeInstruction *insn,
+                           unsigned char bti,
+                           unsigned char sampler,
+                           uint32_t msg_type,
+                           uint32_t response_length,
+                           uint32_t msg_length,
+                           bool header_present,
+                           uint32_t simd_mode,
+                           uint32_t return_format);
 
     /*! TypedWrite instruction for texture */
     virtual void TYPED_WRITE(GenRegister header,
@@ -221,6 +230,18 @@ namespace gbe
     void setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
                               unsigned msg_length, unsigned response_length,
                               bool header_present = false, bool end_of_thread = false);
+    virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+    virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+    unsigned setByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+    unsigned setByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+
+    unsigned generateAtomicMessageDesc(unsigned function, unsigned bti, unsigned srcNum);
+    unsigned generateUntypedReadMessageDesc(unsigned bti, unsigned elemNum);
+    unsigned generateUntypedWriteMessageDesc(unsigned bti, unsigned elemNum);
+    unsigned generateByteGatherMessageDesc(unsigned bti, unsigned elemSize);
+    unsigned generateByteScatterMessageDesc(unsigned bti, unsigned elemSize);
+
     virtual void setHeader(GenNativeInstruction *insn) = 0;
     virtual void setDst(GenNativeInstruction *insn, GenRegister dest) = 0;
     virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg) = 0;
@@ -229,8 +250,10 @@ namespace gbe
     virtual bool disableCompact() { return false; }
     GenNativeInstruction *next(uint32_t opcode);
     uint32_t n_instruction(void) const { return store.size(); }
-    GBE_CLASS(GenEncoder); //!< Use custom allocators
+    virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
+                            GenRegister src1 = GenRegister::null());
 
+    GBE_CLASS(GenEncoder); //!< Use custom allocators
     virtual void alu3(uint32_t opcode, GenRegister dst,
                        GenRegister src0, GenRegister src1, GenRegister src2) = 0;
   };
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 8535b4a..d073770 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -3,6 +3,7 @@ DECL_GEN7_SCHEDULE(Label,           0,         0,        0)
 DECL_GEN7_SCHEDULE(Unary,           20,        4,        2)
 DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        40,      20)
 DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
+DECL_GEN7_SCHEDULE(SimdShuffle,     20,        4,        2)
 DECL_GEN7_SCHEDULE(BinaryWithTemp,  20,        40,      20)
 DECL_GEN7_SCHEDULE(Ternary,         20,        4,        2)
 DECL_GEN7_SCHEDULE(I64Shift,        20,        40,      20)
@@ -32,6 +33,8 @@ DECL_GEN7_SCHEDULE(ByteScatter,     160,       1,        1)
 DECL_GEN7_SCHEDULE(DWordGather,     160,       1,        1)
 DECL_GEN7_SCHEDULE(PackByte,        40,        1,        1)
 DECL_GEN7_SCHEDULE(UnpackByte,      40,        1,        1)
+DECL_GEN7_SCHEDULE(PackLong,        40,        1,        1)
+DECL_GEN7_SCHEDULE(UnpackLong,      40,        1,        1)
 DECL_GEN7_SCHEDULE(Sample,          160,       1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
 DECL_GEN7_SCHEDULE(SpillReg,        20,        1,        1)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index f849c52..b3b7042 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -159,7 +159,7 @@ namespace gbe
     /*! Get an index in the node array for the given register */
     uint32_t getIndex(GenRegister reg) const;
     /*! Get an index in the node array for the given memory system */
-    uint32_t getIndex(uint32_t bti) const;
+    uint32_t getMemoryIndex() const;
     /*! Add a new dependency "node0 depends on node1" */
     void addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1, DepMode m);
     /*! Add a new dependency "node0 depends on node located at index" */
@@ -256,7 +256,7 @@ namespace gbe
     if (this->ignoreDependency(reg) == false) {
       const uint32_t index = this->getIndex(reg);
       this->addDependency(node0, index, m);
-      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64() || reg.is_unpacked_long()))
         this->addDependency(node0, index + 1, m);
     }
   }
@@ -265,7 +265,7 @@ namespace gbe
     if (this->ignoreDependency(reg) == false) {
       const uint32_t index = this->getIndex(reg);
       this->addDependency(index, node0, m);
-      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64() || reg.is_unpacked_long()))
         this->addDependency(index + 1, node0, m);
     }
   }
@@ -353,9 +353,9 @@ namespace gbe
       return reg.value.reg;
   }
 
-  uint32_t DependencyTracker::getIndex(uint32_t bti) const {
+  uint32_t DependencyTracker::getMemoryIndex() const {
     const uint32_t memDelta = grfNum + MAX_ARF_REGISTER;
-    return bti == 0xfe ? memDelta + LOCAL_MEMORY : (bti == 0xff ? memDelta + SCRATCH_MEMORY : memDelta + GLOBAL_MEMORY);
+    return memDelta;
   }
 
   void DependencyTracker::updateWrites(ScheduleDAGNode *node) {
@@ -367,7 +367,7 @@ namespace gbe
       if (this->ignoreDependency(dst) == false) {
         const uint32_t index = this->getIndex(dst);
         this->nodes[index] = node;
-        if (scheduler.policy == POST_ALLOC && (dst.isdf() || dst.isint64()))
+        if (scheduler.policy == POST_ALLOC && (dst.isdf() || dst.isint64() || dst.is_unpacked_long()))
           this->nodes[index + 1] = node;
       }
     }
@@ -386,22 +386,21 @@ namespace gbe
 
     // Track writes in memory
     if (insn.isWrite()) {
-      const uint32_t index = this->getIndex(insn.getbti());
+      const uint32_t index = this->getMemoryIndex();
       this->nodes[index] = node;
     }
 
     // Track writes in scratch memory
     if(insn.opcode == SEL_OP_SPILL_REG) {
-      const uint32_t index = this->getIndex(0xff);
+      const uint32_t index = this->getMemoryIndex();
       this->nodes[index] = node;
     }
     // Consider barriers and wait write to memory
     if (insn.opcode == SEL_OP_BARRIER ||
         insn.opcode == SEL_OP_FENCE ||
         insn.opcode == SEL_OP_WAIT) {
-      const uint32_t local = this->getIndex(0xfe);
-      const uint32_t global = this->getIndex(0x00);
-      this->nodes[local] = this->nodes[global] = node;
+      const uint32_t memIndex = this->getMemoryIndex();
+      this->nodes[memIndex] = node;
     }
   }
 
@@ -489,12 +488,12 @@ namespace gbe
 
       // read-after-write in memory
       if (insn.isRead()) {
-        const uint32_t index = tracker.getIndex(insn.getbti());
+        const uint32_t index = tracker.getMemoryIndex();
         tracker.addDependency(node, index, READ_AFTER_WRITE);
       }
       //read-after-write of scratch memory
       if (insn.opcode == SEL_OP_UNSPILL_REG) {
-        const uint32_t index = tracker.getIndex(0xff);
+        const uint32_t index = tracker.getMemoryIndex();
         tracker.addDependency(node, index, READ_AFTER_WRITE);
       }
 
@@ -502,10 +501,8 @@ namespace gbe
     if (insn.opcode == SEL_OP_BARRIER ||
         insn.opcode == SEL_OP_FENCE ||
         insn.opcode == SEL_OP_WAIT) {
-        const uint32_t local = tracker.getIndex(0xfe);
-        const uint32_t global = tracker.getIndex(0x00);
-        tracker.addDependency(node, local, READ_AFTER_WRITE);
-        tracker.addDependency(node, global, READ_AFTER_WRITE);
+        const uint32_t memIndex = tracker.getMemoryIndex();
+        tracker.addDependency(node, memIndex, READ_AFTER_WRITE);
       }
 
       // write-after-write in registers
@@ -522,13 +519,13 @@ namespace gbe
 
       // write-after-write in memory
       if (insn.isWrite()) {
-        const uint32_t index = tracker.getIndex(insn.getbti());
+        const uint32_t index = tracker.getMemoryIndex();
         tracker.addDependency(node, index, WRITE_AFTER_WRITE);
       }
 
       // write-after-write in scratch memory
       if (insn.opcode == SEL_OP_SPILL_REG) {
-        const uint32_t index = tracker.getIndex(0xff);
+        const uint32_t index = tracker.getMemoryIndex();
         tracker.addDependency(node, index, WRITE_AFTER_WRITE);
       }
 
@@ -552,13 +549,13 @@ namespace gbe
 
       // write-after-read in memory
       if (insn.isRead()) {
-        const uint32_t index = tracker.getIndex(insn.getbti());
+        const uint32_t index = tracker.getMemoryIndex();
         tracker.addDependency(index, node, WRITE_AFTER_READ);
       }
 
       // write-after-read in scratch memory
       if (insn.opcode == SEL_OP_UNSPILL_REG) {
-        const uint32_t index = tracker.getIndex(0xff);
+        const uint32_t index = tracker.getMemoryIndex();
         tracker.addDependency(index, node, WRITE_AFTER_READ);
       }
 
@@ -566,10 +563,8 @@ namespace gbe
       if (insn.opcode == SEL_OP_BARRIER ||
           insn.opcode == SEL_OP_FENCE ||
           insn.opcode == SEL_OP_WAIT) {
-        const uint32_t local = tracker.getIndex(0xfe);
-        const uint32_t global = tracker.getIndex(0x00);
-        tracker.addDependency(local, node, WRITE_AFTER_READ);
-        tracker.addDependency(global, node, WRITE_AFTER_READ);
+        const uint32_t memIndex = tracker.getMemoryIndex();
+        tracker.addDependency(memIndex, node, WRITE_AFTER_READ);
       }
 
       // Track all writes done by the instruction
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7d7a8c3..b0ba9e3 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -126,6 +126,7 @@ namespace gbe
       case TYPE_U64: return GEN_TYPE_UL;
       case TYPE_FLOAT: return GEN_TYPE_F;
       case TYPE_DOUBLE: return GEN_TYPE_DF;
+      case TYPE_HALF: return GEN_TYPE_HF;
       default: NOT_SUPPORTED; return GEN_TYPE_F;
     }
   }
@@ -249,6 +250,9 @@ namespace gbe
     this->vectorList.push_back(vec);
   }
 
+#define LD_MSG_ORDER_IVB 7
+#define LD_MSG_ORDER_SKL 9
+
   ///////////////////////////////////////////////////////////////////////////
   // Maximal munch selection on DAG
   ///////////////////////////////////////////////////////////////////////////
@@ -356,19 +360,47 @@ namespace gbe
     INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
     bool has32X32Mul() const { return bHas32X32Mul; }
     void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
+    bool hasLongType() const { return bHasLongType; }
+    bool hasHalfType() const { return bHasHalfType; }
+    void setHasLongType(bool b) { bHasLongType = b; }
+    void setHasHalfType(bool b) { bHasHalfType = b; }
+    bool hasLongRegRestrict() { return bLongRegRestrict; }
+    void setLongRegRestrict(bool b) { bLongRegRestrict = b; }
+    void setLdMsgOrder(uint32_t type)  { ldMsgOrder = type; }
+    uint32_t getLdMsgOrder()  const { return ldMsgOrder; }
+    void setSlowByteGather(bool b) { slowByteGather = b; }
+    bool getSlowByteGather() { return slowByteGather; }
     /*! indicate whether a register is a scalar/uniform register. */
+    INLINE bool isPartialWrite(const ir::Register &reg) const {
+      return partialWriteRegs.find(reg.value()) != partialWriteRegs.end();
+    }
     INLINE bool isScalarReg(const ir::Register &reg) const {
       const ir::RegisterData &regData = getRegisterData(reg);
       return regData.isUniform();
     }
+    INLINE bool isLongReg(const ir::Register &reg) const {
+      const ir::RegisterData &regData = getRegisterData(reg);
+      return regData.family == ir::FAMILY_QWORD;
+    }
+
+    INLINE GenRegister unpacked_ud(const ir::Register &reg) const {
+      return GenRegister::unpacked_ud(reg, isScalarReg(reg));
+    }
 
     INLINE GenRegister unpacked_uw(const ir::Register &reg) const {
-      return GenRegister::unpacked_uw(reg, isScalarReg(reg));
+      return GenRegister::unpacked_uw(reg, isScalarReg(reg), isLongReg(reg));
     }
 
     INLINE GenRegister unpacked_ub(const ir::Register &reg) const {
       return GenRegister::unpacked_ub(reg, isScalarReg(reg));
     }
+
+    INLINE GenRegister getOffsetReg(GenRegister reg, int nr, int subnr, bool isDst = true) {
+      if (isDst)
+        partialWriteRegs.insert(reg.value.reg);
+      return GenRegister::offset(reg, nr, subnr);
+    }
+
     /*! Implement public class */
     INLINE uint32_t getRegNum(void) const { return file.regNum(); }
     /*! Implements public interface */
@@ -462,6 +494,11 @@ namespace gbe
     /*! To make function prototypes more readable */
     typedef const GenRegister &Reg;
 
+    /*! Track destination registers. The major purpose is to find
+        out partially updated dst registers; such registers will
+        be unspillable. */
+    set<uint32_t> partialWriteRegs;
+
 #define ALU1(OP) \
   INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }
 #define ALU1WithTemp(OP) \
@@ -483,6 +520,7 @@ namespace gbe
     ALU1(RNDE)
     ALU1(F16TO32)
     ALU1(F32TO16)
+    ALU1WithTemp(BSWAP)
     ALU2(SEL)
     ALU2(SEL_INT64)
     ALU1(NOT)
@@ -526,18 +564,20 @@ namespace gbe
 #undef ALU2WithTemp
 #undef ALU3
 #undef I64Shift
+    /*! simd shuffle */
+    void SIMD_SHUFFLE(Reg dst, Reg src0, Reg src1);
     /*! Convert 64-bit integer to 32-bit float */
     void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[6]);
     /*! Convert 64-bit integer to 32-bit float */
     void CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[2]);
     /*! Saturated 64bit x*y + z */
-    void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[9]);
+    void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister* tmp, int tmp_num);
     /*! High 64bit of x*y */
-    void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[9]);
+    void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_num);
     /*! (x+y)>>1 without mod. overflow */
-    void I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
+    void I64HADD(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_num);
     /*! (x+y+1)>>1 without mod. overflow */
-    void I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
+    void I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_num);
     /*! Shift a 64-bit integer */
     void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]);
     /*! Compare 64-bit integer */
@@ -571,7 +611,7 @@ namespace gbe
     /*! Select instruction with embedded comparison */
     void SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1);
     /* Constant buffer move instruction */
-    void INDIRECT_MOVE(Reg dst, Reg src);
+    void INDIRECT_MOVE(Reg dst, Reg tmp, Reg base, Reg regOffset, uint32_t immOffset);
     /*! EOT is used to finish GPGPU threads */
     void EOT(void);
     /*! No-op */
@@ -579,25 +619,29 @@ namespace gbe
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
     /*! Atomic instruction */
-    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
+    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp);
     /*! Read 64 bits float/int array */
-    void READ64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, const GenRegister bti, bool native_long, GenRegister *flagTemp);
     /*! Write 64 bits float/int array */
-    void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, uint32_t bti);
+    void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, GenRegister bti, bool native_long, GenRegister *flagTemp);
     /*! Untyped read (up to 4 elements) */
-    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
     /*! Untyped write (up to 4 elements) */
-    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
     /*! Byte gather (for unaligned bytes, shorts and ints) */
-    void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
+    void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
-    void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
+    void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
     /*! DWord scatter (for constant cache read) */
     void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
     /*! Unpack the uint to charN */
     void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum);
     /*! pack the charN to uint */
     void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum);
+    /*! Unpack a 64-bit long into its dword components */
+    void UNPACK_LONG(const GenRegister dst, const GenRegister src);
+    /*! Pack dword components back into a 64-bit long */
+    void PACK_LONG(const GenRegister dst, const GenRegister src);
     /*! Extended math function (2 arguments) */
     void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
     /*! Extended math function (1 argument) */
@@ -619,11 +663,11 @@ namespace gbe
     /*! Get image information */
     void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
     /*! Multiply 64-bit integers */
-    void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+    void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool native_long);
     /*! 64-bit integer division */
-    void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
+    void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_int);
     /*! 64-bit integer remainder of division */
-    void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
+    void I64REM(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_int);
     /* common functions for both binary instruction and sel_cmp and compare instruction.
        It will handle the IMM or normal register assignment, and will try to avoid LOADI
        as much as possible. */
@@ -699,6 +743,11 @@ namespace gbe
     /*! Auxiliary label for if/endif. */ 
     uint32_t currAuxLabel;
     bool bHas32X32Mul;
+    bool bHasLongType;
+    bool bHasHalfType;
+    bool bLongRegRestrict;
+    uint32_t ldMsgOrder;
+    bool slowByteGather;
     INLINE ir::LabelIndex newAuxLabel()
     {
       currAuxLabel++;
@@ -738,7 +787,8 @@ namespace gbe
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
     stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
-    bHas32X32Mul(false)
+    bHas32X32Mul(false), bHasLongType(false), bHasHalfType(false), bLongRegRestrict(false),
+    ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false)
   {
     const ir::Function &fn = ctx.getFunction();
     this->regNum = fn.regNum();
@@ -1058,7 +1108,13 @@ namespace gbe
       case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
       case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
       case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
-      case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
+      case FAMILY_QWORD:
+        if (!this->hasLongType()) {
+          SEL_REG(df16grf, df8grf, df1grf);
+        } else {
+          SEL_REG(ul16grf, ul8grf, ul1grf);
+        }
+        break;
       default: NOT_SUPPORTED;
     }
     GBE_ASSERT(false);
@@ -1161,25 +1217,39 @@ namespace gbe
     insn->src(1) = src1;
     insn->extra.function = conditional;
   }
-  void Selection::Opaque::INDIRECT_MOVE(Reg dst, Reg src) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_INDIRECT_MOVE, 1, 1);
+  void Selection::Opaque::INDIRECT_MOVE(Reg dst, Reg tmp, Reg base, Reg regOffset, uint32_t immOffset) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_INDIRECT_MOVE, 2, 2);
     insn->dst(0) = dst;
-    insn->src(0) = src;
+    insn->dst(1) = tmp;
+    insn->src(0) = base;
+    insn->src(1) = regOffset;
+    insn->extra.indirect_offset = immOffset;
   }
 
   void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
                                      uint32_t srcNum, Reg src0,
-                                     Reg src1, Reg src2, uint32_t bti) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, 1, srcNum);
+                                     Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp) {
+    unsigned dstNum = flagTemp == NULL ? 1 : 2;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum, srcNum + 1);
+
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
+
     insn->dst(0) = dst;
+    if(flagTemp) insn->dst(1) = *flagTemp;
+
     insn->src(0) = src0;
     if(srcNum > 1) insn->src(1) = src1;
     if(srcNum > 2) insn->src(2) = src2;
+    insn->src(srcNum) = bti;
     insn->extra.function = function;
-    insn->setbti(bti);
-    SelectionVector *vector = this->appendVector();
+    insn->extra.elem = srcNum;
 
+    SelectionVector *vector = this->appendVector();
     vector->regNum = srcNum;
+    vector->offsetID = 0;
     vector->reg = &insn->src(0);
     vector->isSrc = 1;
   }
@@ -1190,25 +1260,62 @@ namespace gbe
 
   void Selection::Opaque::READ64(Reg addr,
                                  const GenRegister *dst,
+                                 const GenRegister *tmp,
                                  uint32_t elemNum,
-                                 uint32_t bti)
+                                 const GenRegister bti,
+                                 bool native_long,
+                                 GenRegister *flagTemp)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
-    SelectionVector *srcVector = this->appendVector();
-    SelectionVector *dstVector = this->appendVector();
+    SelectionInstruction *insn = NULL;
+    SelectionVector *srcVector = NULL;
+    SelectionVector *dstVector = NULL;
+
+    if (!native_long) {
+      unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
+      insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
+      srcVector = this->appendVector();
+      dstVector = this->appendVector();
+      // Regular instruction to encode
+      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+        insn->dst(elemID) = dst[elemID];
+
+      // flagTemp doesn't need to be put in the SelectionVector
+      if (flagTemp)
+        insn->dst(elemNum) = *flagTemp;
+    } else {
+      unsigned dstNum = flagTemp == NULL ? elemNum*2 : elemNum*2+1;
+      insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
+      srcVector = this->appendVector();
+      dstVector = this->appendVector();
+
+      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+        insn->dst(elemID) = tmp[elemID];
+
+      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+        insn->dst(elemID + elemNum) = dst[elemID];
+
+      // flagTemp doesn't need to be put in the SelectionVector
+      if (flagTemp)
+        insn->dst(2*elemNum) = *flagTemp;
+    }
+
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
 
-    // Regular instruction to encode
-    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-      insn->dst(elemID) = dst[elemID];
     insn->src(0) = addr;
-    insn->setbti(bti);
+    insn->src(1) = bti;
+
     insn->extra.elem = elemNum;
 
     dstVector->regNum = elemNum;
     dstVector->isSrc = 0;
+    dstVector->offsetID = 0;
     dstVector->reg = &insn->dst(0);
 
     srcVector->regNum = 1;
+    srcVector->offsetID = 0;
     srcVector->isSrc = 1;
     srcVector->reg = &insn->src(0);
   }
@@ -1216,9 +1323,11 @@ namespace gbe
   void Selection::Opaque::UNTYPED_READ(Reg addr,
                                        const GenRegister *dst,
                                        uint32_t elemNum,
-                                       uint32_t bti)
+                                       GenRegister bti,
+                                       GenRegister *flagTemp)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, elemNum, 1);
+    unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, dstNum, 2);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
     if (this->isScalarReg(dst[0].reg()))
@@ -1226,98 +1335,186 @@ namespace gbe
     // Regular instruction to encode
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->dst(elemID) = dst[elemID];
+    if (flagTemp)
+      insn->dst(elemNum) = *flagTemp;
+
     insn->src(0) = addr;
-    insn->setbti(bti);
+    insn->src(1) = bti;
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
+
     insn->extra.elem = elemNum;
 
     // Sends require contiguous allocation
     dstVector->regNum = elemNum;
     dstVector->isSrc = 0;
+    dstVector->offsetID = 0;
     dstVector->reg = &insn->dst(0);
 
     srcVector->regNum = 1;
     srcVector->isSrc = 1;
+    srcVector->offsetID = 0;
     srcVector->reg = &insn->src(0);
   }
 
   void Selection::Opaque::WRITE64(Reg addr,
                                   const GenRegister *src,
+                                  const GenRegister *tmp,
                                   uint32_t srcNum,
-                                  uint32_t bti)
+                                  GenRegister bti,
+                                  bool native_long,
+                                  GenRegister *flagTemp)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
-    SelectionVector *vector = this->appendVector();
-
-    // Regular instruction to encode
-    insn->src(0) = addr;
-    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
-      insn->src(elemID + 1) = src[elemID];
-
-    insn->setbti(bti);
-    insn->extra.elem = srcNum;
+    SelectionVector *vector = NULL;
+    SelectionInstruction *insn = NULL;
+
+    if (!native_long) {
+      unsigned dstNum = flagTemp == NULL ? 0 : 1;
+      insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 2);
+      vector = this->appendVector();
+      // Register layout:
+      // dst: (flagTemp)
+      // src: addr, srcNum, bti
+      insn->src(0) = addr;
+      for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+        insn->src(elemID + 1) = src[elemID];
+
+      insn->src(srcNum+1) = bti;
+      if (flagTemp)
+        insn->dst(0) = *flagTemp;
+      insn->extra.elem = srcNum;
+
+      vector->regNum = srcNum + 1;
+      vector->offsetID = 0;
+      vector->reg = &insn->src(0);
+      vector->isSrc = 1;
+    } else { // handle the native long case
+      unsigned dstNum = flagTemp == NULL ? srcNum : srcNum+1;
+      // Register layout:
+      // dst: srcNum, (flagTemp)
+      // src: srcNum, addr, srcNum, bti.
+      insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
+      vector = this->appendVector();
+
+      for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+        insn->src(elemID) = src[elemID];
+
+      insn->src(srcNum) = addr;
+      for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+        insn->src(srcNum + 1 + elemID) = tmp[0];
+
+      insn->src(srcNum*2+1) = bti;
+      /* We also need to add the tmp register to dst, in order
+         to avoid the post-scheduling error. */
+      for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+        insn->dst(elemID) = tmp[0];
+
+      if (flagTemp)
+        insn->dst(srcNum) = *flagTemp;
+      insn->extra.elem = srcNum;
+
+      vector->regNum = srcNum + 1;
+      vector->offsetID = srcNum;
+      vector->reg = &insn->src(srcNum);
+      vector->isSrc = 1;
+    }
 
-    vector->regNum = srcNum + 1;
-    vector->reg = &insn->src(0);
-    vector->isSrc = 1;
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
   }
 
   void Selection::Opaque::UNTYPED_WRITE(Reg addr,
                                         const GenRegister *src,
                                         uint32_t elemNum,
-                                        uint32_t bti)
+                                        GenRegister bti,
+                                        GenRegister *flagTemp)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, 0, elemNum+1);
+    unsigned dstNum = flagTemp == NULL ? 0 : 1;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, dstNum, elemNum+2);
     SelectionVector *vector = this->appendVector();
 
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
+
+    if (flagTemp) insn->dst(0) = *flagTemp;
     // Regular instruction to encode
     insn->src(0) = addr;
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->src(elemID+1) = src[elemID];
-    insn->setbti(bti);
+    insn->src(elemNum+1) = bti;
+    if (flagTemp)
+      insn->src(elemNum+2) = *flagTemp;
     insn->extra.elem = elemNum;
 
     // Sends require contiguous allocation for the sources
     vector->regNum = elemNum+1;
     vector->reg = &insn->src(0);
+    vector->offsetID = 0;
     vector->isSrc = 1;
   }
 
-  void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, 1, 1);
+  void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
+    unsigned dstNum = flagTemp == NULL ? 1 : 2;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, dstNum, 2);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
 
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
+
     if (this->isScalarReg(dst.reg()))
       insn->state.noMask = 1;
     // Instruction to encode
     insn->src(0) = addr;
+    insn->src(1) = bti;
     insn->dst(0) = dst;
-    insn->setbti(bti);
+    if (flagTemp)
+      insn->dst(1) = *flagTemp;
+
     insn->extra.elem = elemSize;
 
     // byte gather requires vector in the sense that scalar are not allowed
     // (yet)
     dstVector->regNum = 1;
     dstVector->isSrc = 0;
+    dstVector->offsetID = 0;
     dstVector->reg = &insn->dst(0);
     srcVector->regNum = 1;
     srcVector->isSrc = 1;
+    srcVector->offsetID = 0;
     srcVector->reg = &insn->src(0);
   }
 
-  void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, 0, 2);
+  void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
+    unsigned dstNum = flagTemp == NULL ? 0 : 1;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, dstNum, 3);
     SelectionVector *vector = this->appendVector();
 
+    if (bti.file != GEN_IMMEDIATE_VALUE) {
+      insn->state.flag = 0;
+      insn->state.subFlag = 1;
+    }
+
+    if (flagTemp)
+      insn->dst(0) = *flagTemp;
     // Instruction to encode
     insn->src(0) = addr;
     insn->src(1) = src;
-    insn->setbti(bti);
+    insn->src(2) = bti;
     insn->extra.elem = elemSize;
 
     // value and address are contiguous in the send
     vector->regNum = 2;
     vector->isSrc = 1;
+    vector->offsetID = 0;
     vector->reg = &insn->src(0);
   }
 
@@ -1333,9 +1530,11 @@ namespace gbe
     insn->setbti(bti);
     vector->regNum = 1;
     vector->isSrc = 0;
+    vector->offsetID = 0;
     vector->reg = &insn->dst(0);
     srcVector->regNum = 1;
     srcVector->isSrc = 1;
+    srcVector->offsetID = 0;
     srcVector->reg = &insn->src(0);
   }
 
@@ -1354,6 +1553,18 @@ namespace gbe
     insn->dst(0) = dst;
   }
 
+  void Selection::Opaque::UNPACK_LONG(const GenRegister dst, const GenRegister src) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_LONG, 1, 1);
+    insn->src(0) = src;
+    insn->dst(0) = dst;
+  }
+
+  void Selection::Opaque::PACK_LONG(const GenRegister dst, const GenRegister src) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_LONG, 1, 1);
+    insn->src(0) = src;
+    insn->dst(0) = dst;
+  }
+
   void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 2);
     insn->dst(0) = dst;
@@ -1369,30 +1580,40 @@ namespace gbe
     insn->extra.function = function;
   }
 
-  void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MUL, 7, 2);
+  void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool native_long) {
+    SelectionInstruction *insn = NULL;
+    if (native_long)
+      insn = this->appendInsn(SEL_OP_I64MUL, 2, 2);
+    else
+      insn = this->appendInsn(SEL_OP_I64MUL, 7, 2);
+
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 6; i++)
-      insn->dst(i + 1) = tmp[i];
+
+    if (native_long) {
+      insn->dst(1) = tmp[0];
+    } else {
+      for (int i = 0; i < 6; i++)
+        insn->dst(i + 1) = tmp[i];
+    }
   }
 
-  void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64DIV, 14, 2);
+  void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmp_num) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64DIV, tmp_num + 1, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 13; i++)
+    for(int i = 0; i < tmp_num; i++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64REM, 14, 2);
+  void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister* tmp, int tmp_num) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64REM, tmp_num + 1, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 13; i++)
+    for(int i = 0; i < tmp_num; i++)
       insn->dst(i + 1) = tmp[i];
   }
 
@@ -1432,6 +1653,14 @@ namespace gbe
     insn->src(2) = src2;
   }
 
+  void Selection::Opaque::SIMD_SHUFFLE(Reg dst, Reg src0, Reg src1)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_SIMD_SHUFFLE, 1, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+  }
+
   void Selection::Opaque::I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_I64CMP, 3, 2);
     insn->src(0) = src0;
@@ -1475,40 +1704,40 @@ namespace gbe
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[9]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MADSAT, 10, 3);
+  void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister *tmp, int tmp_num) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MADSAT, tmp_num + 1, 3);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
     insn->src(2) = src2;
-    for(int i = 0; i < 9; i ++)
+    for(int i = 0; i < tmp_num; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[9]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64_MUL_HI, 10, 2);
+  void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_num) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64_MUL_HI, tmp_num + 1, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 9; i ++)
+    for(int i = 0; i < tmp_num; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64HADD, 5, 2);
+  void Selection::Opaque::I64HADD(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_num) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64HADD, tmp_num + 1, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 4; i ++)
+    for(int i = 0; i < tmp_num; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64RHADD, 5, 2);
+  void Selection::Opaque::I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister *tmp, int tmp_num) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64RHADD, tmp_num + 1, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 4; i ++)
+    for(int i = 0; i < tmp_num; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
@@ -1790,11 +2019,13 @@ namespace gbe
     // Sends require contiguous allocation
     dstVector->regNum = dstNum;
     dstVector->isSrc = 0;
+    dstVector->offsetID = 0;
     dstVector->reg = &insn->dst(0);
 
     // Only the messages require contiguous registers.
     msgVector->regNum = msgNum;
     msgVector->isSrc = 1;
+    msgVector->offsetID = 0;
     msgVector->reg = &insn->src(0);
 
     insn->setbti(bti);
@@ -1811,13 +2042,34 @@ namespace gbe
   Selection::Selection(GenContext &ctx) {
     this->blockList = NULL;
     this->opaque = GBE_NEW(Selection::Opaque, ctx);
+    this->opaque->setSlowByteGather(true);
   }
 
   Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setSlowByteGather(false);
   }
 
   Selection8::Selection8(GenContext &ctx) : Selection(ctx) {
     this->opaque->setHas32X32Mul(true);
+    this->opaque->setHasLongType(true);
+    this->opaque->setSlowByteGather(true);
+    this->opaque->setHasHalfType(true);
+  }
+
+  SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setHas32X32Mul(true);
+    this->opaque->setHasLongType(true);
+    this->opaque->setLongRegRestrict(true);
+    this->opaque->setSlowByteGather(true);
+    this->opaque->setHasHalfType(true);
+  }
+
+  Selection9::Selection9(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setHas32X32Mul(true);
+    this->opaque->setHasLongType(true);
+    this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
+    this->opaque->setSlowByteGather(true);
+    this->opaque->setHasHalfType(true);
   }
 
   void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
@@ -1836,6 +2088,7 @@ namespace gbe
     // Sends require contiguous allocation
     msgVector->regNum = msgNum;
     msgVector->isSrc = 1;
+    msgVector->offsetID = 0;
     msgVector->reg = &insn->src(0);
   }
 
@@ -1881,6 +2134,10 @@ namespace gbe
     return this->opaque->isScalarReg(reg);
   }
 
+  bool Selection::isPartialWrite(const ir::Register &reg) const {
+    return this->opaque->isPartialWrite(reg);
+  }
+
   SelectionInstruction *Selection::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum) {
     return this->opaque->create(opcode, dstNum, srcNum);
   }
@@ -1912,6 +2169,13 @@ namespace gbe
       case TYPE_S8:  return GenRegister::immw((int8_t)imm.getIntegerValue() * sign);
       case TYPE_DOUBLE: return GenRegister::immdf(imm.getDoubleValue() * sign);
       case TYPE_BOOL: return GenRegister::immw((imm.getIntegerValue() == 0) ? 0 : -1);  //return 0xffff when true
+      case TYPE_HALF: {
+        ir::half hf = imm.getHalfValue();
+        int16_t _sign = negate ? -1 : 1;
+        ir::half hfSign = ir::half::convToHalf(_sign);
+        hf = hf * hfSign;
+        return GenRegister::immh(hf.getVal());
+      }
       default: NOT_SUPPORTED; return GenRegister::immuw(0);
     }
   }
@@ -2008,18 +2272,56 @@ namespace gbe
 #define DECL_CTOR(FAMILY, INSN_NUM, COST) \
   FAMILY##Pattern(void) : OneToManyPattern<FAMILY##Pattern, ir::FAMILY>(INSN_NUM, COST) {}
 
+  /*! Nullary instruction patterns */
+  class NullaryInstructionPattern : public SelectionPattern
+  {
+  public:
+    NullaryInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::NullaryInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      using namespace ir;
+      const ir::NullaryInstruction &insn = cast<NullaryInstruction>(dag.insn);
+      const Opcode opcode = insn.getOpcode();
+      const Type type = insn.getType();
+      GenRegister dst = sel.selReg(insn.getDst(0), type);
+
+      sel.push();
+      switch (opcode) {
+        case ir::OP_SIMD_SIZE:
+          {
+            const GenRegister src = GenRegister::immud(sel.curr.execWidth);
+            sel.MOV(dst, src);
+          }
+          break;
+        case ir::OP_SIMD_ID:
+          {
+            const GenRegister selLaneID = sel.selReg(ir::ocl::laneid, ir::TYPE_U32);
+            sel.MOV(dst, selLaneID);
+          }
+          break;
+        default: NOT_SUPPORTED;
+      }
+      sel.pop();
+      return true;
+    }
+  };
+
   /*! Unary instruction patterns */
   DECL_PATTERN(UnaryInstruction)
   {
     static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType, bool isSrc = false) {
-
       if (opcode == ir::OP_CBIT)
         return isSrc ? insnType : ir::TYPE_U32;
-
       if (insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 || insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8)
         return insnType;
-      if (opcode == ir::OP_FBH || opcode == ir::OP_FBL)
+      if (opcode == ir::OP_FBH || opcode == ir::OP_FBL || opcode == ir::OP_LZD)
         return ir::TYPE_U32;
+      if (opcode == ir::OP_SIMD_ANY || opcode == ir::OP_SIMD_ALL)
+        return ir::TYPE_S32;
       if (insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16)
         return insnType;
       if (insnType == ir::TYPE_BOOL)
@@ -2040,13 +2342,10 @@ namespace gbe
         }
         switch (opcode) {
           case ir::OP_ABS:
-            if (insn.getType() == ir::TYPE_S32) {
-              const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D);
-              const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D);
+            {
+              const GenRegister src_ = GenRegister::retype(src, getGenType(insnType));
+              const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType));
               sel.MOV(dst_, GenRegister::abs(src_));
-            } else {
-              GBE_ASSERT(insn.getType() == ir::TYPE_FLOAT);
-              sel.MOV(dst, GenRegister::abs(src));
             }
             break;
           case ir::OP_MOV:
@@ -2073,6 +2372,7 @@ namespace gbe
           case ir::OP_FBH: sel.FBH(dst, src); break;
           case ir::OP_FBL: sel.FBL(dst, src); break;
           case ir::OP_CBIT: sel.CBIT(dst, src); break;
+          case ir::OP_LZD: sel.LZD(dst, src); break;
           case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
           case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
           case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
@@ -2080,6 +2380,14 @@ namespace gbe
           case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
           case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
           case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
+          case ir::OP_BSWAP:
+            {
+              ir::Register tmp = sel.reg(getFamily(insnType));
+              const GenRegister src_ = GenRegister::retype(src, getGenType(insnType));
+              const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType));
+              sel.BSWAP(dst_, src_, sel.selReg(tmp, insnType));
+              break;
+            }
           case ir::OP_SIMD_ANY:
             {
               const GenRegister constZero = GenRegister::immuw(0);;
@@ -2175,7 +2483,7 @@ namespace gbe
                           GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
 
       //bytes and shorts must be converted to int for DIV and REM per GEN restriction
-      if((family == FAMILY_WORD || family == FAMILY_BYTE)) {
+      if((family == FAMILY_WORD || family == FAMILY_BYTE) && (type != TYPE_HALF)) {
         GenRegister tmp0, tmp1;
         ir::Register reg = sel.reg(FAMILY_DWORD, isUniform);
         tmp0 = sel.selReg(reg, ir::TYPE_S32);
@@ -2191,24 +2499,56 @@ namespace gbe
         }
         unpacked = GenRegister::retype(unpacked, getGenType(type));
         sel.MOV(dst, unpacked);
+      } else if (type == TYPE_HALF) {
+        ir::Register reg = sel.reg(FAMILY_DWORD, isUniform);
+        GenRegister tmp0 = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_FLOAT);
+        GenRegister tmp1 = sel.selReg(reg, ir::TYPE_FLOAT);
+        sel.MOV(tmp0, src0);
+        sel.MOV(tmp1, src1);
+        GBE_ASSERT(op != OP_REM);
+        sel.MATH(tmp0, GEN_MATH_FUNCTION_FDIV, tmp0, tmp1);
+        GenRegister unpacked = GenRegister::retype(sel.unpacked_uw(reg), GEN_TYPE_HF);
+        sel.MOV(unpacked, tmp0);
+        sel.MOV(dst, unpacked);
       } else if (type == TYPE_S32 || type == TYPE_U32 ) {
         sel.MATH(dst, function, src0, src1);
       } else if(type == TYPE_FLOAT) {
         GBE_ASSERT(op != OP_REM);
         sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
       } else if (type == TYPE_S64 || type == TYPE_U64) {
-        GenRegister tmp[13];
+        GenRegister tmp[15];
+        int tmp_num = 13;
         for(int i=0; i < 13; i++) {
           tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
           tmp[i].type = GEN_TYPE_UD;
         }
+
+        if (sel.hasLongType()) {
+          if (!sel.isScalarReg(insn.getSrc(0))) {
+            tmp[tmp_num] = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), src0.type);
+            tmp_num++;
+          }
+
+          if (!sel.isScalarReg(insn.getSrc(1))) {
+            tmp[tmp_num] = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), src1.type);
+            tmp_num++;
+          }
+
+          /* We need at least one tmp register to convert if dst is not scalar. */
+          if (!sel.isScalarReg(insn.getDst(0)) && sel.isScalarReg(insn.getSrc(0))
+              && sel.isScalarReg(insn.getSrc(1))) {
+            GBE_ASSERT(tmp_num == 13);
+            tmp[tmp_num] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+            tmp_num++;
+          }
+        }
         sel.push();
           sel.curr.flag = 0;
           sel.curr.subFlag = 1;
           if(op == OP_DIV)
-            sel.I64DIV(dst, src0, src1, tmp);
+            sel.I64DIV(dst, src0, src1, tmp, tmp_num);
           else
-            sel.I64REM(dst, src0, src1, tmp);
+            sel.I64REM(dst, src0, src1, tmp, tmp_num);
         sel.pop();
       }
       markAllChildren(dag);
@@ -2269,14 +2609,14 @@ namespace gbe
 
       switch (opcode) {
         case OP_ADD:
-          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) {
             GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
             sel.I64ADD(dst, src0, src1, t);
           } else
             sel.ADD(dst, src0, src1);
           break;
         case OP_ADDSAT:
-          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) {
             GenRegister tmp[5];
             for(int i=0; i<5; i++) {
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
@@ -2295,32 +2635,32 @@ namespace gbe
           sel.pop();
           break;
         case OP_XOR:
-          if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType())
             sel.I64XOR(dst, src0, src1);
           else
             sel.XOR(dst, src0, src1);
           break;
         case OP_OR:
-          if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType())
             sel.I64OR(dst, src0, src1);
           else
             sel.OR(dst, src0, src1);
           break;
         case OP_AND:
-          if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType())
             sel.I64AND(dst, src0, src1);
           else
             sel.AND(dst, src0, src1);
           break;
         case OP_SUB:
-          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) {
             GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
             sel.I64SUB(dst, src0, src1, t);
           } else
             sel.ADD(dst, src0, GenRegister::negate(src1));
           break;
         case OP_SUBSAT:
-          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) {
             GenRegister tmp[5];
             for(int i=0; i<5; i++) {
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
@@ -2339,7 +2679,7 @@ namespace gbe
           sel.pop();
           break;
         case OP_SHL:
-          if (type == TYPE_S64 || type == TYPE_U64) {
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) {
             GenRegister tmp[6];
             for(int i = 0; i < 6; i ++)
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
@@ -2352,7 +2692,7 @@ namespace gbe
             sel.SHL(dst, src0, src1);
           break;
         case OP_SHR:
-          if (type == TYPE_S64 || type == TYPE_U64) {
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) {
             GenRegister tmp[6];
             for(int i = 0; i < 6; i ++)
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
@@ -2365,7 +2705,7 @@ namespace gbe
             sel.SHR(dst, src0, src1);
           break;
         case OP_ASR:
-          if (type == TYPE_S64 || type == TYPE_U64) {
+          if ((type == Type::TYPE_U64 || type == Type::TYPE_S64) && !sel.hasLongType()) {
             GenRegister tmp[6];
             for(int i = 0; i < 6; i ++)
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
@@ -2384,27 +2724,42 @@ namespace gbe
           }
         case OP_I64_MUL_HI:
          {
-          GenRegister temp[9];
-          for(int i=0; i<9; i++) {
-            temp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-            temp[i].type = GEN_TYPE_UD;
-          }
-          sel.push();
-            sel.curr.flag = 0;
-            sel.curr.subFlag = 1;
-            sel.I64_MUL_HI(dst, src0, src1, temp);
-          sel.pop();
-          break;
+           int tmp_num;
+           GenRegister temp[9];
+           if (sel.hasLongType()) {
+             for(int i=0; i<9; i++) {
+               temp[i] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+             }
+             tmp_num = 6;
+           } else {
+             for(int i=0; i<9; i++) {
+               temp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+               temp[i].type = GEN_TYPE_UD;
+             }
+             tmp_num = 9;
+           }
+           sel.push();
+           sel.curr.flag = 0;
+           sel.curr.subFlag = 1;
+           sel.I64_MUL_HI(dst, src0, src1, temp, tmp_num);
+           sel.pop();
+           break;
          }
         case OP_MUL:
           if (type == TYPE_U32 || type == TYPE_S32) {
             sel.pop();
             return false;
           } else if (type == TYPE_S64 || type == TYPE_U64) {
-            GenRegister tmp[6];
-            for(int i = 0; i < 6; i++)
-              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-            sel.I64MUL(dst, src0, src1, tmp);
+            if (sel.hasLongType()) {
+              GenRegister tmp;
+              tmp = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              sel.I64MUL(dst, src0, src1, &tmp, true);
+            } else {
+              GenRegister tmp[6];
+              for(int i = 0; i < 6; i++)
+                tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+              sel.I64MUL(dst, src0, src1, tmp, false);
+            }
           } else
             sel.MUL(dst, src0, src1);
           break;
@@ -2419,29 +2774,40 @@ namespace gbe
             break;
           }
         case OP_I64HADD:
-         {
-          GenRegister tmp[4];
-          for(int i=0; i<4; i++)
-            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-          sel.I64HADD(dst, src0, src1, tmp);
-          break;
-         }
+          {
+            GenRegister tmp[4];
+            if (!sel.hasLongType()) {
+              for(int i=0; i<4; i++)
+                tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+              sel.I64HADD(dst, src0, src1, tmp, 4);
+            } else {
+              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              sel.I64HADD(dst, src0, src1, tmp, 2);
+            }
+            break;
+          }
         case OP_I64RHADD:
-         {
-          GenRegister tmp[4];
-          for(int i=0; i<4; i++)
-            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-          sel.I64RHADD(dst, src0, src1, tmp);
-          break;
-         }
+          {
+            GenRegister tmp[4];
+            if (!sel.hasLongType()) {
+              for(int i=0; i<4; i++)
+                tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+              sel.I64RHADD(dst, src0, src1, tmp, 4);
+            } else {
+              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              sel.I64RHADD(dst, src0, src1, tmp, 2);
+            }
+            break;
+          }
         case OP_UPSAMPLE_SHORT:
         {
           dst = GenRegister::retype(sel.unpacked_uw(dst.reg()), GEN_TYPE_B);
           src0 = GenRegister::retype(sel.unpacked_uw(src0.reg()), GEN_TYPE_B);
           src1 = GenRegister::retype(sel.unpacked_uw(src1.reg()), GEN_TYPE_B);
           sel.MOV(dst, src1);
-          dst.subphysical = 1;
-          dst = dst.offset(dst, 0, typeSize(GEN_TYPE_B));
+          dst = sel.getOffsetReg(dst, 0, typeSize(GEN_TYPE_B));
           sel.MOV(dst, src0);
           break;
         }
@@ -2451,8 +2817,7 @@ namespace gbe
           src0 = sel.unpacked_uw(src0.reg());
           src1 = sel.unpacked_uw(src1.reg());
           sel.MOV(dst, src1);
-          dst.subphysical = 1;
-          dst = dst.offset(dst, 0, typeSize(GEN_TYPE_W));
+          dst = sel.getOffsetReg(dst, 0, typeSize(GEN_TYPE_W));
           sel.MOV(dst, src0);
           break;
         }
@@ -2846,13 +3211,18 @@ namespace gbe
           sel.MOV(GenRegister::retype(dst, GEN_TYPE_F),
                   GenRegister::immf(imm.asFloatValue()));
         break;
+        case TYPE_HALF: {
+          ir::half hf = imm.getHalfValue();
+          sel.MOV(GenRegister::retype(dst, GEN_TYPE_HF), GenRegister::immh(hf.getVal()));
+          break;
+        }
         case TYPE_U16: sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
         case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
         case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
         case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
         case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.getDoubleValue()), sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U64)); break;
         case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
-        case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
+        case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immuint64(imm.getIntegerValue())); break;
         default: NOT_SUPPORTED;
       }
       sel.pop();
@@ -2879,7 +3249,7 @@ namespace gbe
     DECL_CTOR(SyncInstruction, 1,1);
   };
 
-  INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
+  INLINE uint32_t getByteScatterGatherSize(Selection::Opaque &sel, ir::Type type) {
     using namespace ir;
     switch (type) {
       case TYPE_DOUBLE:
@@ -2897,39 +3267,32 @@ namespace gbe
       case TYPE_U8:
       case TYPE_S8:
         return GEN_BYTE_SCATTER_BYTE;
+      case TYPE_HALF:
+        if (sel.hasHalfType())
+          return GEN_BYTE_SCATTER_WORD;
       default: NOT_SUPPORTED;
         return GEN_BYTE_SCATTER_BYTE;
     }
   }
 
-  /*! Load instruction pattern */
-  DECL_PATTERN(LoadInstruction)
+  class LoadInstructionPattern : public SelectionPattern
   {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    LoadInstructionPattern(void) : SelectionPattern(1, 1) {
+       this->opcodes.push_back(ir::OP_LOAD);
+    }
     void readDWord(Selection::Opaque &sel,
                    vector<GenRegister> &dst,
-                   vector<GenRegister> &dst2,
                    GenRegister addr,
                    uint32_t valueNum,
                    ir::BTI bti) const
     {
-      for (uint32_t x = 0; x < bti.count; x++) {
-        if(x > 0)
-          for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
-            dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
-
-        GenRegister temp = getRelativeAddress(sel, addr, bti.bti[x]);
-        sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
-        if(x > 0) {
-          sel.push();
-            if(sel.isScalarReg(dst[0].reg())) {
-              sel.curr.noMask = 1;
-              sel.curr.execWidth = 1;
-            }
-            for (uint32_t y = 0; y < valueNum; y++)
-              sel.ADD(dst[y], dst[y], dst2[y]);
-          sel.pop();
-        }
-      }
+        //GenRegister temp = getRelativeAddress(sel, addr, sel.selReg(bti.base, ir::TYPE_U32));
+
+        GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+        GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
+        sel.UNTYPED_READ(addr, dst.data(), valueNum, b, bti.isConst ? NULL : &tmp);
     }
 
     void emitUntypedRead(Selection::Opaque &sel,
@@ -2940,10 +3303,9 @@ namespace gbe
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> dst(valueNum);
-      vector<GenRegister> dst2(valueNum);
       for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
-        dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
-      readDWord(sel, dst, dst2, addr, valueNum, bti);
+        dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
+      readDWord(sel, dst, addr, valueNum, bti);
     }
 
     void emitDWordGather(Selection::Opaque &sel,
@@ -2952,15 +3314,15 @@ namespace gbe
                          ir::BTI bti) const
     {
       using namespace ir;
-      GBE_ASSERT(bti.count == 1);
-      const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
+      GBE_ASSERT(bti.isConst == 1);
       GBE_ASSERT(insn.getValueNum() == 1);
+      const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
 
       if(isUniform) {
         GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
         sel.push();
           sel.curr.noMask = 1;
-          sel.SAMPLE(&dst, 1, &addr, 1, bti.bti[0], 0, true, true);
+          sel.SAMPLE(&dst, 1, &addr, 1, bti.imm, 0, true, true);
         sel.pop();
         return;
       }
@@ -2976,7 +3338,7 @@ namespace gbe
         sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
       sel.pop();
 
-      sel.DWORD_GATHER(dst, addrDW, bti.bti[0]);
+      sel.DWORD_GATHER(dst, addrDW, bti.imm);
     }
 
     void emitRead64(Selection::Opaque &sel,
@@ -2988,12 +3350,23 @@ namespace gbe
       const uint32_t valueNum = insn.getValueNum();
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      GBE_ASSERT(bti.count == 1);
+      GBE_ASSERT(bti.isConst == 1);
       vector<GenRegister> dst(valueNum);
-      GenRegister tmpAddr = getRelativeAddress(sel, addr, bti.bti[0]);
+      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+      GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
       for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
-      sel.READ64(tmpAddr, dst.data(), valueNum, bti.bti[0]);
+
+      if (sel.hasLongType()) {
+        vector<GenRegister> tmp(valueNum);
+        for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+          tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+        }
+
+        sel.READ64(addr, dst.data(), tmp.data(), valueNum, b, true, bti.isConst ? NULL : &tmpFlag);
+      } else {
+        sel.READ64(addr, dst.data(), NULL, valueNum, b, false, bti.isConst ? NULL : &tmpFlag);
+      }
     }
 
     void readByteAsDWord(Selection::Opaque &sel,
@@ -3001,12 +3374,16 @@ namespace gbe
                         GenRegister address,
                         GenRegister dst,
                         bool isUniform,
-                        uint8_t bti) const
+                        ir::BTI bti) const
     {
       using namespace ir;
         Register tmpReg = sel.reg(FAMILY_DWORD, isUniform);
         GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
         GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32);
+
+        GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+        GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
+
         // Get dword aligned addr
         sel.push();
           if (isUniform) {
@@ -3018,7 +3395,7 @@ namespace gbe
         sel.push();
           if (isUniform)
             sel.curr.noMask = 1;
-          sel.UNTYPED_READ(tmpAddr, &tmpData, 1, bti);
+          sel.UNTYPED_READ(tmpAddr, &tmpData, 1, b, bti.isConst ? NULL : &tmpFlag);
 
           if (isUniform)
             sel.curr.execWidth = 1;
@@ -3054,14 +3431,11 @@ namespace gbe
 
       uint32_t tmpRegNum = (typeSize*valueNum + 3) / 4;
       vector<GenRegister> tmp(tmpRegNum);
-      vector<GenRegister> tmp2(tmpRegNum);
-      vector<Register> tmpReg(tmpRegNum);
       for(uint32_t i = 0; i < tmpRegNum; i++) {
-        tmpReg[i] = sel.reg(FAMILY_DWORD, isUniform);
-        tmp2[i] = tmp[i] = sel.selReg(tmpReg[i], ir::TYPE_U32);
+        tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
       }
 
-      readDWord(sel, tmp, tmp2, address, tmpRegNum, bti);
+      readDWord(sel, tmp, address, tmpRegNum, bti);
 
       for(uint32_t i = 0; i < tmpRegNum; i++) {
         unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
@@ -3166,7 +3540,7 @@ namespace gbe
               sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
             sel.pop();
           }
-          readDWord(sel, t1, t2, alignedAddr, width, bti);
+          readDWord(sel, t1, alignedAddr, width, bti);
           remainedReg -= width;
           pos += width;
         } while(remainedReg);
@@ -3185,63 +3559,62 @@ namespace gbe
         GBE_ASSERT(insn.getValueNum() == 1);
         const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
         GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
-        GenRegister tmp = value;
+        if(sel.getSlowByteGather())
+          readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
+        else {
+          GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+          GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
 
-        for (int x = 0; x < bti.count; x++) {
-          if (x > 0)
-            tmp = sel.selReg(sel.reg(family, isUniform), insn.getValueType());
+          // We need a temporary register if we read bytes or words
+          Register dst = sel.reg(FAMILY_DWORD, isUniform);
+          sel.push();
+            if (isUniform)
+              sel.curr.noMask = 1;
+            sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, elemSize, b, bti.isConst ? NULL : & tmpFlag);
+          sel.pop();
 
-          GenRegister addr = getRelativeAddress(sel, address, bti.bti[x]);
-          readByteAsDWord(sel, elemSize, addr, tmp, isUniform, bti.bti[x]);
-          if (x > 0) {
-            sel.push();
-              if (isUniform) {
-                sel.curr.noMask = 1;
-                sel.curr.execWidth = 1;
-              }
-              sel.ADD(value, value, tmp);
-            sel.pop();
-          }
+          sel.push();
+            if (isUniform) {
+              sel.curr.noMask = 1;
+              sel.curr.execWidth = 1;
+            }
+            if (elemSize == GEN_BYTE_SCATTER_WORD)
+              sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst));
+            else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+              sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
+          sel.pop();
         }
       }
     }
 
-    void emitIndirectMove(Selection::Opaque &sel,
-                         const ir::LoadInstruction &insn,
-                         GenRegister address) const
-    {
-      using namespace ir;
-      GBE_ASSERT(insn.getValueNum() == 1);   //todo: handle vec later
-
-      const GenRegister dst = sel.selReg(insn.getValue(0), insn.getValueType());
-      const GenRegister src = address;
-      sel.INDIRECT_MOVE(dst, src);
-    }
-
-    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti) const {
-      if (bti == 0xfe || bti == BTI_CONSTANT)
-        return address;
-
-      sel.push();
-        sel.curr.noMask = 1;
-        if (GenRegister::hstride_size(address) == 0)
-          sel.curr.execWidth = 1;
-        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD, sel.curr.execWidth == 1), ir::TYPE_U32);
-        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
-      sel.pop();
-      return temp;
-    }
     // check whether all binded table index point to constant memory
     INLINE bool isAllConstant(const ir::BTI &bti) const {
-      for (int x = 0; x < bti.count; x++) {
-         if (bti.bti[x] != BTI_CONSTANT)
-           return false;
+      if (bti.isConst && bti.imm == BTI_CONSTANT)
+        return true;
+      return false;
+    }
+
+    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::LoadInstruction &insn) const {
+      using namespace ir;
+      SelectionDAG *child0 = dag.child[0];
+      ir::BTI b;
+      if (insn.isFixedBTI()) {
+        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
+        const auto imm = immInsn.getImmediate();
+        b.isConst = 1;
+        b.imm = imm.getIntegerValue();
+      } else {
+        b.isConst = 0;
+        b.reg = insn.getBTI();
       }
-      return true;
+      return b;
     }
 
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
+    /*! Implements base class */
+    virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
+    {
       using namespace ir;
+      const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
       GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
                  insn.getAddressSpace() == MEM_CONSTANT ||
@@ -3249,9 +3622,11 @@ namespace gbe
                  insn.getAddressSpace() == MEM_LOCAL ||
                  insn.getAddressSpace() == MEM_MIXED);
       //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
+
+      BTI bti = getBTI(dag, insn);
+
       const Type type = insn.getValueType();
-      const uint32_t elemSize = getByteScatterGatherSize(type);
-      const BTI &bti = insn.getBTI();
+      const uint32_t elemSize = getByteScatterGatherSize(sel, type);
       bool allConstant = isAllConstant(bti);
 
       if (allConstant) {
@@ -3276,56 +3651,79 @@ namespace gbe
         else
           this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
       }
+
+
+      // for fixed bti, don't generate the useless loadi
+      if (insn.isFixedBTI())
+        dag.child[0] = NULL;
+      markAllChildren(dag);
+
       return true;
     }
-    DECL_CTOR(LoadInstruction, 1, 1);
   };
-
-  /*! Store instruction pattern */
-  DECL_PATTERN(StoreInstruction)
+  class StoreInstructionPattern : public SelectionPattern
   {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    StoreInstructionPattern(void) : SelectionPattern(1, 1) {
+       this->opcodes.push_back(ir::OP_STORE);
+    }
     void emitUntypedWrite(Selection::Opaque &sel,
                           const ir::StoreInstruction &insn,
-                          GenRegister addr,
-                          uint32_t bti) const
+                          GenRegister address,
+                          ir::BTI &bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> value(valueNum);
+      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
 
-      addr = GenRegister::retype(addr, GEN_TYPE_F);
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
-        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
-      sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
+        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);
+      GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
+      sel.UNTYPED_WRITE(address, value.data(), valueNum, b, bti.isConst? NULL : &tmp);
     }
 
     void emitWrite64(Selection::Opaque &sel,
                      const ir::StoreInstruction &insn,
-                     GenRegister addr,
-                     uint32_t bti) const
+                     GenRegister address,
+                     ir::BTI &bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      addr = GenRegister::retype(addr, GEN_TYPE_UD);
+      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
       vector<GenRegister> src(valueNum);
 
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
         src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
-      sel.WRITE64(addr, src.data(), valueNum, bti);
+
+      GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
+
+      if (sel.hasLongType()) {
+        vector<GenRegister> tmp(valueNum);
+        for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+          tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+        }
+        sel.WRITE64(address, src.data(), tmp.data(), valueNum, b, true, bti.isConst? NULL : &tmpFlag);
+      } else {
+        sel.WRITE64(address, src.data(), NULL, valueNum, b, false, bti.isConst? NULL : &tmpFlag);
+      }
     }
 
     void emitByteScatter(Selection::Opaque &sel,
                          const ir::StoreInstruction &insn,
                          const uint32_t elemSize,
-                         GenRegister addr,
-                         uint32_t bti,
+                         GenRegister address,
+                         ir::BTI &bti,
                          bool isUniform) const
     {
       using namespace ir;
       uint32_t valueNum = insn.getValueNum();
 
+      GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+      GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
       if(valueNum > 1) {
         const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
         vector<GenRegister> value(valueNum);
@@ -3345,11 +3743,12 @@ namespace gbe
           sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
         }
 
-        sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
+        sel.UNTYPED_WRITE(address, tmp.data(), tmpRegNum, b, bti.isConst ? NULL : &tmpFlag);
       } else {
         const GenRegister value = sel.selReg(insn.getValue(0));
         GBE_ASSERT(insn.getValueNum() == 1);
         const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+
         sel.push();
           if (isUniform) {
             sel.curr.noMask = 1;
@@ -3361,47 +3760,52 @@ namespace gbe
           else if (elemSize == GEN_BYTE_SCATTER_BYTE)
             sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
         sel.pop();
-        sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
+        sel.BYTE_SCATTER(address, tmp, elemSize, b, bti.isConst ? NULL : &tmpFlag);
       }
     }
 
-    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti, bool isUniform) const {
-      if(bti == 0xfe)
-        return address;
 
-      sel.push();
-        sel.curr.noMask = 1;
-        if (isUniform)
-          sel.curr.execWidth = 1;
-        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD, isUniform), ir::TYPE_U32);
-        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
-      sel.pop();
-      return temp;
+    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::StoreInstruction &insn) const {
+      using namespace ir;
+      SelectionDAG *child0 = dag.child[0];
+      ir::BTI b;
+      if (insn.isFixedBTI()) {
+        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
+        const auto imm = immInsn.getImmediate();
+        b.isConst = 1;
+        b.imm = imm.getIntegerValue();
+      } else {
+        b.isConst = 0;
+        b.reg = insn.getBTI();
+      }
+      return b;
     }
-
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn, bool &markChildren) const
+    virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
     {
       using namespace ir;
-      const Type type = insn.getValueType();
-      const uint32_t elemSize = getByteScatterGatherSize(type);
+      const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
       GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+      const Type type = insn.getValueType();
+      const uint32_t elemSize = getByteScatterGatherSize(sel, type);
 
       const bool isUniform = sel.isScalarReg(insn.getAddress()) && sel.isScalarReg(insn.getValue(0));
+      BTI bti = getBTI(dag, insn);
 
-      BTI bti = insn.getBTI();
-      for (int x = 0; x < bti.count; x++) {
-        GenRegister temp = getRelativeAddress(sel, address, bti.bti[x], isUniform);
-        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-          this->emitWrite64(sel, insn, temp, bti.bti[x]);
-        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-          this->emitUntypedWrite(sel, insn, temp,  bti.bti[x]);
-        else {
-          this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x], isUniform);
-        }
+      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+        this->emitWrite64(sel, insn, address, bti);
+      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+        this->emitUntypedWrite(sel, insn, address,  bti);
+      else {
+        this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
       }
-      return true;
+
+      // for fixed bti, don't generate the useless loadi
+      if (insn.isFixedBTI())
+        dag.child[0] = NULL;
+      markAllChildren(dag);
+
+      return true;
     }
-    DECL_CTOR(StoreInstruction, 1, 1);
   };
 
   /*! Compare instruction pattern */
@@ -3448,7 +3852,7 @@ namespace gbe
         sel.curr.modFlag = 1;
         sel.curr.flagIndex = dst.value();
         sel.curr.grfFlag = needStoreBool; // indicate whether we need to allocate grf to store this boolean.
-        if (type == TYPE_S64 || type == TYPE_U64) {
+        if ((type == TYPE_S64 || type == TYPE_U64) && !sel.hasLongType()) {
           GenRegister tmp[3];
           for(int i=0; i<3; i++)
             tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
@@ -3492,20 +3896,25 @@ namespace gbe
       const Type srcType = insn.getSrcType();
       const uint32_t dstNum = insn.getDstNum();
       const uint32_t srcNum = insn.getSrcNum();
-      int index = 0, multiple, narrowNum;
+      int index = 0, multiple, narrowNum, wideNum;
       bool narrowDst;
       Type narrowType;
+      bool wideScalar = false;
 
       if(dstNum > srcNum) {
         multiple = dstNum / srcNum;
         narrowType = dstType;
         narrowNum = dstNum;
+        wideNum = srcNum;
         narrowDst = 1;
+        wideScalar = sel.isScalarReg(insn.getSrc(0));
       } else {
         multiple = srcNum / dstNum;
         narrowType = srcType;
         narrowNum = srcNum;
+        wideNum = dstNum;
         narrowDst = 0;
+        wideScalar = sel.isScalarReg(insn.getDst(0));
       }
 
       sel.push();
@@ -3521,31 +3930,71 @@ namespace gbe
       const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || dstType == TYPE_S64 || dstType == TYPE_U64);
       const int simdWidth = sel.curr.execWidth;
 
+      /* because we do not have hstride = 8, here, we need to separate
+         the long into top half and bottom half. */
+      vector<GenRegister> tmp(wideNum);
+      if (multiple == 8 && sel.hasLongType() && !wideScalar) {
+        GBE_ASSERT(isInt64); // Must relate to long and char conversion.
+        if (narrowDst) {
+          for (int i = 0; i < wideNum; i++) {
+            tmp[i] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+            sel.UNPACK_LONG(tmp[i], sel.selReg(insn.getSrc(i), srcType));
+          }
+        } else {
+          for (int i = 0; i < wideNum; i++) {
+            tmp[i] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+          }
+        }
+      }
+
       for(int i = 0; i < narrowNum; i++, index++) {
         GenRegister narrowReg, wideReg;
-        if(narrowDst) {
-          narrowReg = sel.selReg(insn.getDst(i), narrowType);
-          wideReg = sel.selReg(insn.getSrc(index/multiple), narrowType);  //retype to narrow type
+        if (multiple == 8 && sel.hasLongType() && !wideScalar) {
+          if(narrowDst) {
+            narrowReg = sel.selReg(insn.getDst(i), narrowType);
+            wideReg = GenRegister::retype(tmp[index/multiple], narrowType);  //retype to narrow type
+          } else {
+            wideReg = GenRegister::retype(tmp[index/multiple], narrowType);
+            narrowReg = sel.selReg(insn.getSrc(i), narrowType);  //retype to narrow type
+          }
         } else {
-          wideReg = sel.selReg(insn.getDst(index/multiple), narrowType);
-          narrowReg = sel.selReg(insn.getSrc(i), narrowType);  //retype to narrow type
+          if(narrowDst) {
+            narrowReg = sel.selReg(insn.getDst(i), narrowType);
+            wideReg = sel.selReg(insn.getSrc(index/multiple), narrowType);  //retype to narrow type
+          } else {
+            wideReg = sel.selReg(insn.getDst(index/multiple), narrowType);
+            narrowReg = sel.selReg(insn.getSrc(i), narrowType);  //retype to narrow type
+          }
         }
 
         // set correct horizontal stride
         if(wideReg.hstride != GEN_HORIZONTAL_STRIDE_0) {
           if(multiple == 2) {
-            wideReg = sel.unpacked_uw(wideReg.reg());
-            wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
-            if(isInt64) {
-              wideReg.hstride = GEN_HORIZONTAL_STRIDE_1;
-              wideReg.vstride = GEN_VERTICAL_STRIDE_8;
+            if (sel.hasLongType() && isInt64) {
+              // long to int or int to long
+              wideReg = sel.unpacked_ud(wideReg.reg());
+              wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+            } else {
+              wideReg = sel.unpacked_uw(wideReg.reg());
+              wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+              if(isInt64) {
+                wideReg.width = GEN_WIDTH_8;
+                wideReg.hstride = GEN_HORIZONTAL_STRIDE_1;
+                wideReg.vstride = GEN_VERTICAL_STRIDE_8;
+              }
             }
           } else if(multiple == 4) {
-            wideReg = sel.unpacked_ub(wideReg.reg());
-            wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
-            if(isInt64) {
-              wideReg.hstride = GEN_HORIZONTAL_STRIDE_2;
-              wideReg.vstride = GEN_VERTICAL_STRIDE_16;
+            if (sel.hasLongType() && isInt64) {
+              // long to short or short to long
+              wideReg = sel.unpacked_uw(wideReg.reg());
+              wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+            } else {
+              wideReg = sel.unpacked_ub(wideReg.reg());
+              wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+              if(isInt64) {
+                wideReg.hstride = GEN_HORIZONTAL_STRIDE_2;
+                wideReg.vstride = GEN_VERTICAL_STRIDE_16;
+              }
             }
           } else if(multiple == 8) {
             // we currently store high/low 32bit separately in register,
@@ -3557,18 +4006,16 @@ namespace gbe
           }
         }
 
-        if(!isInt64 && index % multiple) {
-          wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
-          wideReg.subphysical = 1;
+        if((!isInt64 || (sel.hasLongType() && multiple != 8)) && index % multiple) {
+          wideReg = sel.getOffsetReg(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
         }
-        if(isInt64) {
-          wideReg.subphysical = 1;
+        if(isInt64 && (multiple == 8 || !sel.hasLongType())) {
           // Offset to next half
           if((i % multiple) >= multiple/2)
-            wideReg = GenRegister::offset(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4);
+            wideReg = sel.getOffsetReg(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4);
           // Offset to desired narrow element in wideReg
           if(index % (multiple/2))
-            wideReg = GenRegister::offset(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type));
+            wideReg = sel.getOffsetReg(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type));
         }
 
         GenRegister xdst = narrowDst ? narrowReg : wideReg;
@@ -3579,18 +4026,23 @@ namespace gbe
         } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) {
           sel.push();
             sel.curr.execWidth = 8;
-            xdst.subphysical = 1;
-            xsrc.subphysical = 1;
             for(int i = 0; i < simdWidth/4; i ++) {
               sel.curr.chooseNib(i);
               sel.MOV(xdst, xsrc);
-              xdst = GenRegister::offset(xdst, 0, 4 * typeSize(getGenType(dstType)));
-              xsrc = GenRegister::offset(xsrc, 0, 4 * typeSize(getGenType(srcType)));
+              xdst = sel.getOffsetReg(xdst, 0, 4 * typeSize(getGenType(dstType)));
+              xsrc = sel.getOffsetReg(xsrc, 0, 4 * typeSize(getGenType(srcType)));
             }
           sel.pop();
         } else
           sel.MOV(xdst, xsrc);
       }
+
+      if (multiple == 8 && sel.hasLongType() && !wideScalar && !narrowDst) {
+        for (int i = 0; i < wideNum; i++) {
+          sel.PACK_LONG(sel.selReg(insn.getDst(i), dstType), tmp[i]);
+        }
+      }
+
       sel.pop();
 
       return true;
@@ -3676,10 +4128,17 @@ namespace gbe
           sel.F32TO16(unpacked, src);
         sel.pop();
         sel.MOV(dst, unpacked);
-      } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
+      } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {//convert i32 to small int and half
         GenRegister unpacked;
         if (dstFamily == FAMILY_WORD) {
-          const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+          uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+
+          /* Special case: when dst is half, float->word->half would lose accuracy. */
+	  if (dstType == TYPE_HALF) {
+            GBE_ASSERT(sel.hasHalfType());
+            type = GEN_TYPE_HF;
+          }
+
           if (!sel.isScalarReg(dst.reg())) {
             unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
             unpacked = GenRegister::retype(unpacked, type);
@@ -3693,12 +4152,131 @@ namespace gbe
           } else
             unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
         }
-        if(srcFamily == FAMILY_QWORD) {
+
+        sel.push();
+          if (sel.isScalarReg(insn.getSrc(0))) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+          sel.MOV(unpacked, src);
+        sel.pop();
+
+        if (unpacked.reg() != dst.reg())
+          sel.MOV(dst, unpacked);
+      } else if (dstFamily == FAMILY_WORD && srcFamily == FAMILY_QWORD) { //convert i64 to i16 and half.
+        if (dstType == TYPE_HALF) {
+          /* There is no MOV for Long <---> Half. So Long-->Float-->half. */
+          GBE_ASSERT(sel.hasLongType());
+          GBE_ASSERT(sel.hasHalfType());
+          sel.push();
+          if (sel.isScalarReg(insn.getSrc(0))) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+
+          GenRegister funpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+          funpacked = GenRegister::retype(funpacked, GEN_TYPE_F);
+          sel.MOV(funpacked, src);
+          GenRegister ftmp = sel.selReg(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+          ftmp = GenRegister::retype(ftmp, GEN_TYPE_F);
+          sel.MOV(ftmp, funpacked);
+          GenRegister unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+          unpacked = GenRegister::retype(unpacked, GEN_TYPE_HF);
+          sel.MOV(unpacked, ftmp);
+          sel.pop();
+          sel.MOV(dst, unpacked);
+        } else {
+          uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+
+          GenRegister unpacked;
+          if (!sel.isScalarReg(dst.reg())) {
+            if (sel.hasLongType()) {
+              unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            } else {
+              unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+            }
+            unpacked = GenRegister::retype(unpacked, type);
+          } else {
+            unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
+          }
+
+          if(!sel.hasLongType()) {
+           GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
+            tmp.type = GEN_TYPE_D;
+            sel.CONVI64_TO_I(tmp, src);
+            sel.MOV(unpacked, tmp);
+          } else {
+            sel.push();
+              if (sel.isScalarReg(insn.getSrc(0))) {
+                sel.curr.execWidth = 1;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.noMask = 1;
+              }
+              sel.MOV(unpacked, src);
+            sel.pop();
+          }
+
+          if (unpacked.reg() != dst.reg()) {
+            sel.MOV(dst, unpacked);
+          }
+        }
+      } else if (dstFamily == FAMILY_BYTE && srcFamily == FAMILY_QWORD) { //convert i64 to i8
+        GenRegister unpacked;
+        const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
+
+        if (sel.hasLongType()) { // handle the native long logic.
+          if (!sel.isScalarReg(dst.reg())) {
+            /* When converting i64 to i8, the hstride should be 8, but hstride does
+               not support more than 4, so we need to split it into 2 steps. */
+            unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, dstType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
+          } else {
+            unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
+          }
+
+          sel.push();
+          if (sel.isScalarReg(insn.getSrc(0))) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+          sel.MOV(unpacked, src);
+          sel.pop();
+
+          if (unpacked.reg() != dst.reg()) {
+            sel.MOV(dst, unpacked);
+          }
+        } else { // Do not have native long
+          if (!sel.isScalarReg(dst.reg())) {
+            unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, type);
+          } else {
+            unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
+          }
+
           GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
           tmp.type = GEN_TYPE_D;
           sel.CONVI64_TO_I(tmp, src);
           sel.MOV(unpacked, tmp);
-        } else {
+
+          if (unpacked.reg() != dst.reg()) {
+            sel.MOV(dst, unpacked);
+          }
+        }
+      } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) &&
+                 (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {// Convert i64 to i32
+        if (sel.hasLongType()) {
+          GenRegister unpacked;
+          const uint32_t type = dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D;
+          if (!sel.isScalarReg(dst.reg())) {
+            unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, dstType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D);
+          } else {
+            unpacked = GenRegister::retype(sel.unpacked_ud(dst.reg()), type);
+          }
+
           sel.push();
             if (sel.isScalarReg(insn.getSrc(0))) {
               sel.curr.execWidth = 1;
@@ -3707,13 +4285,14 @@ namespace gbe
             }
             sel.MOV(unpacked, src);
           sel.pop();
+
+          if (unpacked.reg() != dst.reg()) {
+            sel.MOV(dst, unpacked);
+          }
+        } else {
+          sel.CONVI64_TO_I(dst, src);
         }
-        if (unpacked.reg() != dst.reg())
-          sel.MOV(dst, unpacked);
-      } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) &&
-                 (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64))
-        sel.CONVI64_TO_I(dst, src);
-      else if (dstType == ir::TYPE_FLOAT && (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
+      } else if (dstType == ir::TYPE_FLOAT && (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) { //i64 to float
         auto dag = sel.regDAG[src.reg()];
         // FIXME, in the future, we need to do a common I64 lower to I32 analysis
         // at llvm IR layer which could cover more cases then just this one.
@@ -3747,38 +4326,156 @@ namespace gbe
             }
           }
         }
-        GenRegister tmp[6];
-        for(int i=0; i<6; i++) {
-          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+
+        if (!sel.hasLongType()) {
+          GenRegister tmp[6];
+          for(int i=0; i<6; i++) {
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+          }
+          sel.push();
+            sel.curr.flag = 0;
+            sel.curr.subFlag = 1;
+            sel.CONVI64_TO_F(dst, src, tmp);
+          sel.pop();
+        } else {
+          GenRegister unpacked;
+          const uint32_t type = GEN_TYPE_F;
+          unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+          unpacked = GenRegister::retype(unpacked, type);
+
+          sel.push();
+            if (sel.isScalarReg(insn.getSrc(0))) {
+              sel.curr.execWidth = 1;
+              sel.curr.predicate = GEN_PREDICATE_NONE;
+              sel.curr.noMask = 1;
+            }
+            sel.MOV(unpacked, src);
+          sel.pop();
+
+          if (unpacked.reg() != dst.reg()) {
+            sel.MOV(dst, unpacked);
+          }
         }
+      }   else if (sel.hasLongType() && sel.hasLongRegRestrict() && dstFamily == FAMILY_QWORD && srcFamily != FAMILY_QWORD) {
+        // Convert i32/i16/i8/float to i64/double if hasLongRegRestrict(src and dst hstride must be aligned to the same qword).
+        GenRegister unpacked;
+        GenRegister unpacked_src = src;
+
         sel.push();
-          sel.curr.flag = 0;
-          sel.curr.subFlag = 1;
-          sel.CONVI64_TO_F(dst, src, tmp);
+          if (sel.isScalarReg(insn.getSrc(0))) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+
+          if (srcType == ir::TYPE_FLOAT) {
+            unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, GEN_TYPE_F);
+          } else if(srcFamily == FAMILY_DWORD) {
+            unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, srcType == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_D);
+          } else if(srcFamily == FAMILY_WORD) {
+            unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, srcType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W);
+          } else if(srcFamily == FAMILY_BYTE) {
+            GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, sel.isScalarReg(insn.getSrc(0))));
+            tmp = GenRegister::retype(tmp, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
+            unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, srcType == TYPE_U8 ? GEN_TYPE_UW : GEN_TYPE_W);
+            sel.MOV(tmp, src);
+            unpacked_src = tmp;
+          } else
+            GBE_ASSERT(0);
+
+          sel.MOV(unpacked, unpacked_src);
         sel.pop();
-      } else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
-                 (src.isdf() && dstType == ir::TYPE_FLOAT)) {
+        sel.MOV(dst, unpacked);
+      }else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
+                 (src.isdf() && dstType == ir::TYPE_FLOAT)) { // float and double conversion
         ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
         sel.MOV_DF(dst, src, sel.selReg(r, TYPE_U64));
-      } else if (dst.isint64()) {
+      } else if (dst.isint64()) { // promote to i64
         switch(src.type) {
           case GEN_TYPE_F:
-          {
-            GenRegister tmp[2];
-            tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-            tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_FLOAT);
-            sel.push();
-              sel.curr.flag = 0;
-              sel.curr.subFlag = 1;
-              sel.CONVF_TO_I64(dst, src, tmp);
-            sel.pop();
-            break;
-          }
+            {
+              if (!sel.hasLongType()) {
+                GenRegister tmp[2];
+                tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+                tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_FLOAT);
+                sel.push();
+                  sel.curr.flag = 0;
+                  sel.curr.subFlag = 1;
+                  sel.CONVF_TO_I64(dst, src, tmp);
+                sel.pop();
+              } else {
+                sel.MOV(dst, src);
+              }
+              break;
+            }
+          case GEN_TYPE_HF:
+            {
+              GBE_ASSERT(sel.hasLongType());
+              GBE_ASSERT(sel.hasHalfType());
+              uint32_t type = dstType == TYPE_U64 ? GEN_TYPE_UD : GEN_TYPE_D;
+              GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))), TYPE_U32), type);
+              sel.push();
+              if (sel.isScalarReg(insn.getSrc(0))) {
+                sel.curr.execWidth = 1;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.noMask = 1;
+              }
+              sel.MOV(tmp, src);
+              sel.pop();
+              sel.MOV(dst, tmp);
+              break;
+            }
           case GEN_TYPE_DF:
             NOT_IMPLEMENTED;
           default:
-            sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
+            if (sel.hasLongType()) {
+              sel.MOV(dst, src);
+            } else {
+              sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
+            }
         }
+      } else if (srcType == ir::TYPE_HALF && (dstFamily == FAMILY_BYTE || dstFamily == FAMILY_WORD)) {
+      // Special case, half -> char/short.
+      /* [DevBDW+]:  Format conversion to or from HF (Half Float) must be DWord-aligned and
+         strided by a DWord on the destination. */
+        GBE_ASSERT(sel.hasHalfType());
+        GenRegister tmp;
+        sel.push();
+        if (sel.isScalarReg(insn.getSrc(0))) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+        if (dstFamily == FAMILY_BYTE) {
+          const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
+          tmp = GenRegister::retype(sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), type);
+          sel.MOV(tmp, src);
+        } else {
+          const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+          tmp = GenRegister::retype(sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), type);
+          sel.MOV(tmp, src);
+        }
+        sel.pop();
+        sel.MOV(dst, tmp);
+      } else if (dstType == ir::TYPE_HALF && (srcFamily == FAMILY_BYTE || srcFamily == FAMILY_WORD)) {
+        // Special case, char/uchar -> half
+        /* [DevBDW+]:  Format conversion to or from HF (Half Float) must be DWord-aligned and
+           strided by a DWord on the destination. */
+        GBE_ASSERT(sel.hasHalfType());
+        GenRegister tmp = GenRegister::retype(sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0)))), GEN_TYPE_HF);
+        sel.push();
+        if (sel.isScalarReg(insn.getSrc(0))) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+        sel.MOV(tmp, src);
+        sel.pop();
+        sel.MOV(dst, tmp);
       } else
         sel.MOV(dst, src);
 
@@ -3789,38 +4486,61 @@ namespace gbe
     DECL_CTOR(ConvertInstruction, 1, 1);
   };
 
-  /*! Convert instruction pattern */
-  DECL_PATTERN(AtomicInstruction)
+  /*! atomic instruction pattern */
+  class AtomicInstructionPattern : public SelectionPattern
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn, bool &markChildren) const
-    {
+  public:
+    AtomicInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::AtomicInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::AtomicInstruction &insn) const {
+      using namespace ir;
+      SelectionDAG *child0 = dag.child[0];
+      ir::BTI b;
+      if (insn.isFixedBTI()) {
+        const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
+        const auto imm = immInsn.getImmediate();
+        b.isConst = 1;
+        b.imm = imm.getIntegerValue();
+      } else {
+        b.isConst = 0;
+        b.reg = insn.getBTI();
+      }
+      return b;
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
       using namespace ir;
+      const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
+
+      ir::BTI b = getBTI(dag, insn);
       const AtomicOps atomicOp = insn.getAtomicOpcode();
-      const AddressSpace space = insn.getAddressSpace();
-      const uint32_t srcNum = insn.getSrcNum();
+      unsigned srcNum = insn.getSrcNum();
+      unsigned opNum = srcNum - 1;
 
-      GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
-      GenRegister src1 = src0, src2 = src0;
-      if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
-      if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
       GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
+      GenRegister bti =  b.isConst ? GenRegister::immud(b.imm) : sel.selReg(b.reg, ir::TYPE_U32);
+      GenRegister src0 = sel.selReg(insn.getSrc(1), TYPE_U32);   //address
+      GenRegister src1 = src0, src2 = src0;
+      if(srcNum > 2) src1 = sel.selReg(insn.getSrc(2), TYPE_U32);
+      if(srcNum > 3) src2 = sel.selReg(insn.getSrc(3), TYPE_U32);
+
+      GenRegister flagTemp = sel.selReg(sel.reg(FAMILY_WORD, true), TYPE_U16);
+
       GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
-      if(space == MEM_LOCAL) {
-        sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
-      } else {
-        ir::BTI b = insn.getBTI();
-        for (int x = 0; x < b.count; x++) {
-          sel.push();
-            sel.curr.noMask = 1;
-            GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
-            sel.ADD(temp, src0, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(b.bti[x]), ir::TYPE_U32)));
-          sel.pop();
-          sel.ATOMIC(dst, genAtomicOp, srcNum, temp, src1, src2, b.bti[x]);
-        }
-      }
+
+      sel.ATOMIC(dst, genAtomicOp, opNum, src0, src1, src2, bti, b.isConst ? NULL : &flagTemp);
+
+      // for fixed bti, don't generate the useless loadi
+      if (insn.isFixedBTI())
+        dag.child[0] = NULL;
+      markAllChildren(dag);
+
       return true;
     }
-    DECL_CTOR(AtomicInstruction, 1, 1);
   };
 
   /*! Select instruction pattern */
@@ -3873,7 +4593,7 @@ namespace gbe
         // just a hint. We need to fix it in the future.
         if (!dag0 || (sel.isScalarReg(dag0->insn.getDst(0))))
           sel.curr.externFlag = 1;
-        if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
+        if((type == ir::TYPE_S64 || type == ir::TYPE_U64) && !sel.hasLongType())
           sel.SEL_INT64(dst, src0, src1);
         else
           sel.SEL(dst, src0, src1);
@@ -3895,17 +4615,27 @@ namespace gbe
       switch(insn.getOpcode()) {
         case OP_I64MADSAT:
          {
-          GenRegister tmp[9];
-          for(int i=0; i<9; i++) {
-            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-            tmp[i].type = GEN_TYPE_UD;
-          }
-          sel.push();
-            sel.curr.flag = 0;
-            sel.curr.subFlag = 1;
-            sel.I64MADSAT(dst, src0, src1, src2, tmp);
-          sel.pop();
-          break;
+           GenRegister tmp[9];
+           int tmp_num;
+           if (!sel.hasLongType()) {
+             tmp_num = 9;
+             for(int i=0; i<9; i++) {
+               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+               tmp[i].type = GEN_TYPE_UD;
+             }
+           } else {
+             tmp_num = 6;
+             for(int i=0; i<6; i++) {
+               tmp[i] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+               tmp[i].type = GEN_TYPE_UL;
+             }
+           }
+           sel.push();
+           sel.curr.flag = 0;
+           sel.curr.subFlag = 1;
+           sel.I64MADSAT(dst, src0, src1, src2, tmp, tmp_num);
+           sel.pop();
+           break;
          }
         case OP_MAD:
          {
@@ -4024,6 +4754,44 @@ namespace gbe
 
   DECL_PATTERN(SampleInstruction)
   {
+    INLINE void emitLd_ivb(Selection::Opaque &sel, const ir::SampleInstruction &insn,
+                           GenRegister msgPayloads[4], uint32_t &msgLen) const
+    {
+      // pre SKL: U, lod, [V], [W]
+      using namespace ir;
+      GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
+      uint32_t srcNum = insn.getSrcNum();
+      msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
+      msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      sel.MOV(msgPayloads[1], GenRegister::immud(0));
+      if (srcNum > 1)
+        msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
+      if (srcNum > 2)
+        msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
+      // Clear the lod to zero.
+      msgLen = srcNum + 1;
+    }
+
+    INLINE void emitLd_skl(Selection::Opaque &sel, const ir::SampleInstruction &insn,
+                           GenRegister msgPayloads[4], uint32_t &msgLen) const
+    {
+      // SKL: U, [V], [lod], [W]
+      using namespace ir;
+      GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
+      uint32_t srcNum = msgLen = insn.getSrcNum();
+      msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
+      if (srcNum > 1)
+        msgPayloads[1] = sel.selReg(insn.getSrc(1), insn.getSrcType());
+      if (srcNum > 2) {
+        // Clear the lod to zero.
+        msgPayloads[2] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        sel.MOV(msgPayloads[2], GenRegister::immud(0));
+        msgLen += 1;
+
+        msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
+      }
+    }
+
     INLINE bool emitOne(Selection::Opaque &sel, const ir::SampleInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
@@ -4035,24 +4803,11 @@ namespace gbe
       for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
         dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
 
-      GBE_ASSERT(srcNum == 3);
-      if (insn.getSrc(1) == ir::ocl::invalid) //not 3D
-        srcNum = 1;
-      else if (insn.getSrc(2) == ir::ocl::invalid)
-        srcNum = 2;
-
       if (insn.getSamplerOffset() != 0) {
-        // U, lod, [V], [W]
-        GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
-        msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
-        msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        if (srcNum > 1)
-          msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
-        if (srcNum > 2)
-          msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
-        // Clear the lod to zero.
-        sel.MOV(msgPayloads[1], GenRegister::immud(0));
-        msgLen = srcNum + 1;
+        if(sel.getLdMsgOrder() < LD_MSG_ORDER_SKL)
+          this->emitLd_ivb(sel, insn, msgPayloads, msgLen);
+        else
+          this->emitLd_skl(sel, insn, msgPayloads, msgLen);
       } else {
         // U, V, [W]
         GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT);
@@ -4083,26 +4838,27 @@ namespace gbe
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
       const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
-      const uint32_t coordNum = 3;
+      const uint32_t dim = insn.getSrcNum() - 4;
 
       if (simdWidth == 16) {
         for(uint32_t i = 0; i < msgNum; i++)
           msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
       } else {
         uint32_t valueID = 0;
-        msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        for(uint32_t msgID = 1; msgID < 1 + coordNum; msgID++, valueID++)
+        uint32_t msgID = 0;
+        msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(; msgID < 1 + dim; msgID++, valueID++)
           msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
 
-        // fake u.
-        if (insn.getSrc(1) == ir::ocl::invalid)
-          msgs[2] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        // fake v.
+        if (dim < 2)
+          msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         // fake w.
-        if (insn.getSrc(2) == ir::ocl::invalid)
-          msgs[3] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        if (dim < 3)
+          msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         // LOD.
-        msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        for(uint32_t msgID = 5; valueID < insn.getSrcNum(); msgID++, valueID++)
+        msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(; valueID < insn.getSrcNum(); msgID++, valueID++)
           msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
       }
 
@@ -4112,8 +4868,7 @@ namespace gbe
       sel.MOV(msgs[0], GenRegister::immud(0));
       sel.curr.execWidth = 1;
 
-      GenRegister channelEn = GenRegister::offset(msgs[0], 0, 7*4);
-      channelEn.subphysical = 1;
+      GenRegister channelEn = sel.getOffsetReg(msgs[0], 0, 7*4);
       // Enable all channels.
       sel.MOV(channelEn, GenRegister::immud(0xffff));
       sel.curr.execWidth = 8;
@@ -4126,7 +4881,7 @@ namespace gbe
 
       uint32_t bti = insn.getImageIndex();
       if (simdWidth == 8)
-        sel.TYPED_WRITE(msgs, msgNum, bti, insn.getSrc(2) != ir::ocl::invalid);
+        sel.TYPED_WRITE(msgs, msgNum, bti, dim == 3);
       else {
         sel.push();
         sel.curr.execWidth = 8;
@@ -4142,16 +4897,16 @@ namespace gbe
           sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
           // Set U,V,W
           QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), insn.getCoordType()));
-          if (insn.getSrc(1) != ir::ocl::invalid) //not 2D
+          if (dim > 1)
             QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), insn.getCoordType()));
-          if (insn.getSrc(2) != ir::ocl::invalid) //not 3D
+          if (dim > 2)
             QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), insn.getCoordType()));
           // Set R, G, B, A
-          QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(3), insn.getSrcType()));
-          QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(4), insn.getSrcType()));
-          QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(5), insn.getSrcType()));
-          QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(6), insn.getSrcType()));
-          sel.TYPED_WRITE(msgs, msgNum, bti, insn.getSrc(2) != ir::ocl::invalid);
+          QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(dim), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(dim + 1), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(dim + 2), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(dim + 3), insn.getSrcType()));
+          sel.TYPED_WRITE(msgs, msgNum, bti, dim == 3);
           #undef QUARTER_MOV0
           #undef QUARTER_MOV1
         }
@@ -4216,6 +4971,46 @@ namespace gbe
     }
   };
 
+  class SimdShuffleInstructionPattern : public SelectionPattern
+  {
+  public:
+    SimdShuffleInstructionPattern(void) : SelectionPattern(1,1) {
+      this->opcodes.push_back(ir::OP_SIMD_SHUFFLE);
+    }
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      using namespace ir;
+      const ir::SimdShuffleInstruction &insn = cast<SimdShuffleInstruction>(dag.insn);
+      assert(insn.getOpcode() == OP_SIMD_SHUFFLE);
+      const Type type = insn.getType();
+      GenRegister dst  = sel.selReg(insn.getDst(0), type);
+      GenRegister src0  = sel.selReg(insn.getSrc(0), type);
+      GenRegister src1;
+
+      SelectionDAG *dag0 = dag.child[0];
+      SelectionDAG *dag1 = dag.child[1];
+      if (dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
+        const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
+        src1 = getRegisterFromImmediate(childInsn.getImmediate(), TYPE_U32);
+        if (dag0) dag0->isRoot = 1;
+      } else {
+        markAllChildren(dag);
+        src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
+      }
+
+      sel.push();
+      if (src1.file == GEN_IMMEDIATE_VALUE)
+        sel.SIMD_SHUFFLE(dst, src0, src1);
+      else {
+        GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        sel.SHL(shiftL, src1, GenRegister::immud(0x2));
+        sel.SIMD_SHUFFLE(dst, src0, shiftL);
+      }
+      sel.pop();
+      return true;
+    }
+
+  };
+
   /*! Get a region of a register */
   class RegionInstructionPattern : public SelectionPattern
   {
@@ -4229,8 +5024,7 @@ namespace gbe
       GenRegister dst, src;
       dst = sel.selReg(insn.getDst(0), ir::TYPE_U32);
       src = GenRegister::ud1grf(insn.getSrc(0));
-      src.subphysical = 1;
-      src = GenRegister::offset(src, 0, insn.getOffset()*4);
+      src = sel.getOffsetReg(src, 0, insn.getOffset()*4);
 
       sel.push();
         sel.curr.noMask = 1;
@@ -4242,6 +5036,29 @@ namespace gbe
     }
   };
 
+  /*! Get a region of a register */
+  class IndirectMovInstructionPattern : public SelectionPattern
+  {
+  public:
+    IndirectMovInstructionPattern(void) : SelectionPattern(1,1) {
+      this->opcodes.push_back(ir::OP_INDIRECT_MOV);
+    }
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      using namespace ir;
+      const ir::IndirectMovInstruction &insn = cast<ir::IndirectMovInstruction>(dag.insn);
+      GenRegister dst, src0, src1;
+      uint32_t offset = insn.getOffset();
+      dst = sel.selReg(insn.getDst(0), insn.getType());
+      src0 = sel.selReg(insn.getSrc(0), TYPE_U32);
+      src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
+      GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD), TYPE_U16);
+
+      sel.INDIRECT_MOVE(dst, tmp, src0, src1, offset);
+      markAllChildren(dag);
+      return true;
+    }
+  };
+
   /*! Branch instruction pattern */
   class BranchInstructionPattern : public SelectionPattern
   {
@@ -4468,6 +5285,9 @@ namespace gbe
     this->insert<GetImageInfoInstructionPattern>();
     this->insert<ReadARFInstructionPattern>();
     this->insert<RegionInstructionPattern>();
+    this->insert<SimdShuffleInstructionPattern>();
+    this->insert<IndirectMovInstructionPattern>();
+    this->insert<NullaryInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index d3f7363..ffc79e1 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -100,7 +100,7 @@ namespace gbe
       struct {
         /*! Store bti for loads/stores and function for math, atomic and compares */
         uint16_t function:8;
-        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
+        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, operand number for atomic */
         uint16_t elem:8;
       };
       struct {
@@ -131,6 +131,7 @@ namespace gbe
       };
       uint32_t barrierType;
       bool longjmp;
+      uint32_t indirect_offset;
     } extra;
     /*! Gen opcode */
     uint8_t opcode;
@@ -149,14 +150,7 @@ namespace gbe
     INLINE uint32_t getbti() const {
       GBE_ASSERT(isRead() || isWrite());
       switch (opcode) {
-        case SEL_OP_ATOMIC: return extra.elem;
-        case SEL_OP_BYTE_SCATTER:
-        case SEL_OP_WRITE64:
-        case SEL_OP_DWORD_GATHER:
-        case SEL_OP_UNTYPED_WRITE:
-        case SEL_OP_UNTYPED_READ:
-        case SEL_OP_BYTE_GATHER:
-        case SEL_OP_READ64: return extra.function;
+        case SEL_OP_DWORD_GATHER: return extra.function;
         case SEL_OP_SAMPLE: return extra.rdbti;
         case SEL_OP_TYPED_WRITE: return extra.bti;
         default:
@@ -168,14 +162,7 @@ namespace gbe
     INLINE void setbti(uint32_t bti) {
       GBE_ASSERT(isRead() || isWrite());
       switch (opcode) {
-        case SEL_OP_ATOMIC: extra.elem = bti; return;
-        case SEL_OP_BYTE_SCATTER:
-        case SEL_OP_WRITE64:
-        case SEL_OP_UNTYPED_WRITE:
-        case SEL_OP_DWORD_GATHER:
-        case SEL_OP_UNTYPED_READ:
-        case SEL_OP_BYTE_GATHER:
-        case SEL_OP_READ64: extra.function = bti; return;
+        case SEL_OP_DWORD_GATHER: extra.function = bti; return;
         case SEL_OP_SAMPLE: extra.rdbti = bti; return;
         case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
         default:
@@ -199,6 +186,8 @@ namespace gbe
     GenRegister *reg;
     /*! Number of registers in the vector */
     uint16_t regNum;
+    /*! offset in insn src() or dst() */
+    uint16_t offsetID;
     /*! Indicate if this a destination or a source vector */
     uint16_t isSrc;
   };
@@ -267,6 +256,8 @@ namespace gbe
     bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
     /*! Indicate if a register is scalar or not */
     bool isScalarReg(const ir::Register &reg) const;
+    /*! is this register a partially written register.*/
+    bool isPartialWrite(const ir::Register &reg) const;
     /*! Create a new selection instruction */
     SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
     /*! List of emitted blocks */
@@ -293,6 +284,20 @@ namespace gbe
       Selection8(GenContext &ctx);
   };
 
+  class SelectionChv: public Selection
+  {
+    public:
+      /*! Initialize internal structures used for the selection */
+      SelectionChv(GenContext &ctx);
+  };
+
+  class Selection9: public Selection
+  {
+    public:
+      /*! Initialize internal structures used for the selection */
+      Selection9(GenContext &ctx);
+  };
+
 } /* namespace gbe */
 
 #endif /*  __GEN_INSN_SELECTION_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index da8086e..adbb137 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,5 +1,6 @@
 DECL_SELECTION_IR(LABEL, LabelInstruction)
 DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(BSWAP, UnaryWithTempInstruction)
 DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
 DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
 DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
@@ -25,6 +26,7 @@ DECL_SELECTION_IR(SHL, BinaryInstruction)
 DECL_SELECTION_IR(RSR, BinaryInstruction)
 DECL_SELECTION_IR(RSL, BinaryInstruction)
 DECL_SELECTION_IR(ASR, BinaryInstruction)
+DECL_SELECTION_IR(SIMD_SHUFFLE, SimdShuffleInstruction)
 DECL_SELECTION_IR(I64SHR, I64ShiftInstruction)
 DECL_SELECTION_IR(I64SHL, I64ShiftInstruction)
 DECL_SELECTION_IR(I64ASR, I64ShiftInstruction)
@@ -60,6 +62,8 @@ DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
 DECL_SELECTION_IR(PACK_BYTE, PackByteInstruction)
 DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction)
+DECL_SELECTION_IR(PACK_LONG, PackLongInstruction)
+DECL_SELECTION_IR(UNPACK_LONG, UnpackLongInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
 DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 53d6d86..c761a2f 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -49,6 +49,7 @@
 #include "backend/gen_context.hpp"
 #include "backend/gen75_context.hpp"
 #include "backend/gen8_context.hpp"
+#include "backend/gen9_context.hpp"
 #include "backend/gen_defs.hpp"
 #include "backend/gen/gen_mesa_disasm.h"
 #include "backend/gen_reg_allocation.hpp"
@@ -165,6 +166,10 @@ namespace gbe {
       ctx = GBE_NEW(Gen75Context, unit, name, deviceID, relaxMath);
     } else if (IS_BROADWELL(deviceID)) {
       ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
+    } else if (IS_CHERRYVIEW(deviceID)) {
+      ctx = GBE_NEW(ChvContext, unit, name, deviceID, relaxMath);
+    } else if (IS_SKYLAKE(deviceID)) {
+      ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath);
     }
     GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
 
@@ -206,7 +211,9 @@ namespace gbe {
                                       (IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "BYT")) ||  \
                                       (IS_BAYTRAIL_T(typeA) && !strcmp(src_hw_info, "BYT")) ||  \
                                       (IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) ||  \
-                                      (IS_BROADWELL(typeA) && !strcmp(src_hw_info, "BDW")) )
+                                      (IS_BROADWELL(typeA) && !strcmp(src_hw_info, "BDW")) ||  \
+                                      (IS_CHERRYVIEW(typeA) && !strcmp(src_hw_info, "CHV")) ||  \
+                                      (IS_SKYLAKE(typeA) && !strcmp(src_hw_info, "SKL")) )
 
   static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
     using namespace gbe;
@@ -257,6 +264,11 @@ namespace gbe {
     acquireLLVMContextLock();
     llvm::Module* module = llvm::ParseIR(memory_buffer, Err, c);
 #endif
+    // if load 32 bit spir binary, the triple should be spir-unknown-unknown.
+    llvm::Triple triple(module->getTargetTriple());
+    if(triple.getArchName() == "spir" && triple.getVendorName() == "unknown" && triple.getOSName() == "unknown"){
+      module->setTargetTriple("spir");
+    }
     releaseLLVMContextLock();
     if(module == NULL){
       GBE_ASSERT(0);
@@ -307,6 +319,14 @@ namespace gbe {
         src_hw_info[0]='B';
         src_hw_info[1]='D';
         src_hw_info[2]='W';
+      }else if(IS_CHERRYVIEW(prog->deviceID)){
+        src_hw_info[0]='C';
+        src_hw_info[1]='H';
+        src_hw_info[2]='V';
+      }else if(IS_SKYLAKE(prog->deviceID)){
+        src_hw_info[0]='S';
+        src_hw_info[1]='K';
+        src_hw_info[2]='L';
       }
       FILL_DEVICE_ID(*binary, src_hw_info);
       memcpy(*binary+BINARY_HEADER_LENGTH, oss.str().c_str(), sz*sizeof(char));
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index a5d601a..4cb88e9 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -141,7 +141,7 @@ namespace gbe
     /*! Allocate the vectors detected in the instruction selection pass */
     void allocateVector(Selection &selection);
     /*! Allocate the given interval. Return true if success */
-    bool createGenReg(const GenRegInterval &interval);
+    bool createGenReg(const Selection &selection, const GenRegInterval &interval);
     /*! Indicate if the registers are already allocated in vectors */
     bool isAllocated(const SelectionVector *vector) const;
     /*! Reallocate registers if needed to make the registers in the vector
@@ -180,7 +180,7 @@ namespace gbe
     uint32_t reservedReg;
     /*! Current vector to expire */
     uint32_t expiringID;
-    INLINE void insertNewReg(ir::Register reg, uint32_t grfOffset, bool isVector = false);
+    INLINE void insertNewReg(const Selection &selection, ir::Register reg, uint32_t grfOffset, bool isVector = false);
     INLINE bool expireReg(ir::Register reg);
     INLINE bool spillAtInterval(GenRegInterval interval, int size, uint32_t alignment);
     INLINE uint32_t allocateReg(GenRegInterval interval, uint32_t size, uint32_t alignment);
@@ -250,7 +250,7 @@ namespace gbe
     }
   }
 
-  bool GenRegAllocator::Opaque::createGenReg(const GenRegInterval &interval) {
+  bool GenRegAllocator::Opaque::createGenReg(const Selection &selection, const GenRegInterval &interval) {
     using namespace ir;
     const ir::Register reg = interval.reg;
     if (RA.contains(reg) == true)
@@ -262,7 +262,7 @@ namespace gbe
     if (grfOffset == 0) {
       return false;
     }
-    insertNewReg(reg, grfOffset);
+    insertNewReg(selection, reg, grfOffset);
     return true;
   }
 
@@ -319,7 +319,7 @@ namespace gbe
       else {
         ir::Register tmp;
         ir::Type type = getIRType(vector->reg[regID].type);
-        tmp = this->replaceReg(selection, vector->insn, regID, vector->isSrc, type);
+        tmp = this->replaceReg(selection, vector->insn, regID + vector->offsetID, vector->isSrc, type);
         const VectorLocation location = std::make_pair(vector, regID);
         this->vectorMap.insert(std::make_pair(tmp, location));
       }
@@ -620,7 +620,10 @@ namespace gbe
             // set a temporary register to avoid switch in this block.
             bool isSrc = false;
             bool needMov = false;
-            this->replaceReg(selection, &insn, 0, isSrc, ir::TYPE_FLOAT, needMov);
+            ir::Type ir_type = ir::TYPE_FLOAT;
+            if (insn.src(0).isint64())
+              ir_type = ir::TYPE_U64;
+            this->replaceReg(selection, &insn, 0, isSrc, ir_type, needMov);
           }
           // If the instruction requires to generate (CMP for long/int/float..)
           // the flag value to the register, and it's not a pure flag boolean,
@@ -710,13 +713,13 @@ namespace gbe
           getRegAttrib(reg, alignment, NULL);
           // check all sub registers aligned correctly
           GBE_ASSERT((grfOffset + subOffset) % alignment == 0 || (grfOffset + subOffset) % GEN_REG_SIZE == 0);
-          insertNewReg(reg, grfOffset + subOffset, true);
+          insertNewReg(selection, reg, grfOffset + subOffset, true);
           ctx.splitBlock(grfOffset, subOffset);  //splitBlock will not split if regID == 0
           subOffset += alignment;
         }
       }
       // Case 2: This is a regular scalar register, allocate it alone
-      else if (this->createGenReg(interval) == false) {
+      else if (this->createGenReg(selection, interval) == false) {
         if (!spillReg(interval))
           return false;
       }
@@ -800,7 +803,10 @@ namespace gbe
 
   // insert a new register with allocated offset,
   // put it to the RA map and the spill map if it could be spilled.
-  INLINE void GenRegAllocator::Opaque::insertNewReg(ir::Register reg, uint32_t grfOffset, bool isVector)
+  INLINE void GenRegAllocator::Opaque::insertNewReg(const Selection &selection,
+                                                    ir::Register reg,
+                                                    uint32_t grfOffset,
+                                                    bool isVector)
   {
      RA.insert(std::make_pair(reg, grfOffset));
 
@@ -815,8 +821,9 @@ namespace gbe
        if (ctx.getSimdWidth() == 16 && reg.value() >= ctx.getFunction().getRegisterFile().regNum())
          return;
 
-       if ((regSize == ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_DWORD)
-          || (regSize == 2 * ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_QWORD)) {
+       if (((regSize == ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_DWORD)
+          || (regSize == 2 * ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_QWORD))
+          && !selection.isPartialWrite(reg)) {
          GBE_ASSERT(offsetReg.find(grfOffset) == offsetReg.end());
          offsetReg.insert(std::make_pair(grfOffset, reg));
          spillCandidate.insert(intervals[reg]);
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index e166af4..4f37e30 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -80,6 +80,7 @@ namespace gbe
       case GEN_TYPE_UW:
       case GEN_TYPE_W:
       case GEN_TYPE_HF:
+      case GEN_TYPE_HF_IMM:
         return 2;
       case GEN_TYPE_UB:
       case GEN_TYPE_B:
@@ -205,6 +206,8 @@ namespace gbe
       this->quarter = 0;
       this->nr = this->subnr = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! For specific physical registers only */
@@ -229,6 +232,8 @@ namespace gbe
       this->hstride = hstride;
       this->quarter = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! Return the IR virtual register */
@@ -242,6 +247,7 @@ namespace gbe
       uint32_t ud;
       uint32_t reg;
       int64_t i64;
+      uint64_t u64;
     } value;
 
     uint32_t nr:8;         //!< Just for some physical registers (acc, null)
@@ -257,14 +263,21 @@ namespace gbe
     uint32_t hstride:2;      //!< Horizontal stride
     uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
     uint32_t address_mode:1; //!< direct or indirect
+    uint32_t a0_subnr:4;     //!< In indirect mode, use a0.nr as the base.
+    int32_t addr_imm:10;     //!< In indirect mode, the imm as address offset from a0.
 
     static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
       GenRegister r = reg;
       r.nr += nr;
       r.subnr += subnr;
+      r.subphysical = 1;
       return r;
     }
 
+    static INLINE uint32_t grfOffset(GenRegister reg) {
+      return reg.nr * GEN_REG_SIZE + reg.subnr;
+    }
+
     // split a DWORD register into unpacked Byte or Short register
     static INLINE GenRegister splitReg(GenRegister reg, uint32_t count, uint32_t sub_part) {
       GenRegister r = reg;
@@ -296,6 +309,15 @@ namespace gbe
       return false;
     }
 
+    /* Besides long and double, there are other cases which can also stride
+       across several registers, e.g. unpacked ud for long<8,4:2> and unpacked
+       uw for long<16,4:4> */
+    INLINE bool is_unpacked_long(void) const {
+      if (file != GEN_GENERAL_REGISTER_FILE) return false;
+      if (width == GEN_WIDTH_4 && hstride > GEN_HORIZONTAL_STRIDE_1) return true;
+      return false;
+    }
+
     INLINE bool isimmdf(void) const {
       if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
         return true;
@@ -447,6 +469,20 @@ namespace gbe
       return retype(vec1(file, reg), GEN_TYPE_DF);
     }
 
+    /* Because we cannot cross a register row with the horizontal stride, long
+       types need to be set to <4,4:1>:UQ */
+    static INLINE GenRegister ul16(uint32_t file, ir::Register reg) {
+      return retype(vec4(file, reg), GEN_TYPE_UL);
+    }
+
+    static INLINE GenRegister ul8(uint32_t file, ir::Register reg) {
+      return retype(vec4(file, reg), GEN_TYPE_UL);
+    }
+
+    static INLINE GenRegister ul1(uint32_t file, ir::Register reg) {
+      return retype(vec1(file, reg), GEN_TYPE_UL);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
       return retype(vec16(file, reg), GEN_TYPE_UD);
     }
@@ -497,13 +533,47 @@ namespace gbe
       return retype(vec1(file, reg), GEN_TYPE_UB);
     }
 
-    static INLINE GenRegister unpacked_uw(ir::Register reg, bool uniform = false) {
-        return GenRegister(GEN_GENERAL_REGISTER_FILE,
-                           reg,
-                           GEN_TYPE_UW,
-                           uniform ? GEN_VERTICAL_STRIDE_0 : GEN_VERTICAL_STRIDE_16,
-                           uniform ? GEN_WIDTH_1 : GEN_WIDTH_8,
-                           uniform ? GEN_HORIZONTAL_STRIDE_0 : GEN_HORIZONTAL_STRIDE_2);
+    static INLINE GenRegister unpacked_ud(ir::Register reg, bool uniform = false) {
+      uint32_t width;
+      uint32_t vstride;
+      uint32_t hstride;
+
+      if (uniform) {
+        width = GEN_WIDTH_1;
+        vstride = GEN_VERTICAL_STRIDE_0;
+        hstride = GEN_HORIZONTAL_STRIDE_0;
+      } else {
+        width = GEN_WIDTH_4;
+        vstride = GEN_VERTICAL_STRIDE_8;
+        hstride = GEN_HORIZONTAL_STRIDE_2;
+      }
+
+      return GenRegister(GEN_GENERAL_REGISTER_FILE, reg,
+                         GEN_TYPE_UD, vstride, width, hstride);
+    }
+
+    static INLINE GenRegister unpacked_uw(ir::Register reg, bool uniform = false,
+                                          bool islong = false) {
+      uint32_t width;
+      uint32_t vstride;
+      uint32_t hstride;
+
+      if (uniform) {
+        width = GEN_WIDTH_1;
+        vstride = GEN_VERTICAL_STRIDE_0;
+        hstride = GEN_HORIZONTAL_STRIDE_0;
+      } else if (islong) {
+        width = GEN_WIDTH_4;
+        vstride = GEN_VERTICAL_STRIDE_16;
+        hstride = GEN_HORIZONTAL_STRIDE_4;
+      } else {
+        width = GEN_WIDTH_8;
+        vstride = GEN_VERTICAL_STRIDE_16;
+        hstride = GEN_HORIZONTAL_STRIDE_2;
+      }
+
+      return GenRegister(GEN_GENERAL_REGISTER_FILE, reg,
+                         GEN_TYPE_UW, vstride, width, hstride);
     }
 
     static INLINE GenRegister unpacked_ub(ir::Register reg, bool uniform = false) {
@@ -525,6 +595,12 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_0);
     }
 
+    static INLINE GenRegister immuint64(uint64_t i) {
+      GenRegister immediate = imm(GEN_TYPE_UL);
+      immediate.value.u64 = i;
+      return immediate;
+    }
+
     static INLINE GenRegister immint64(int64_t i) {
       GenRegister immediate = imm(GEN_TYPE_L);
       immediate.value.i64 = i;
@@ -567,6 +643,12 @@ namespace gbe
       return immediate;
     }
 
+    static INLINE GenRegister immh(uint16_t uw) {
+      GenRegister immediate = imm(GEN_TYPE_HF_IMM);
+      immediate.value.ud = uw;
+      return immediate;
+    }
+
     static INLINE GenRegister immv(uint32_t v) {
       GenRegister immediate = imm(GEN_TYPE_V);
       immediate.vstride = GEN_VERTICAL_STRIDE_0;
@@ -626,6 +708,18 @@ namespace gbe
       return df16(GEN_GENERAL_REGISTER_FILE, reg);
     }
 
+    static INLINE GenRegister ul16grf(ir::Register reg) {
+      return ul16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister ul8grf(ir::Register reg) {
+      return ul8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister ul1grf(ir::Register reg) {
+      return ul1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
     static INLINE GenRegister ud16grf(ir::Register reg) {
       return ud16(GEN_GENERAL_REGISTER_FILE, reg);
     }
@@ -744,21 +838,45 @@ namespace gbe
     }
 
     /*! Build an indirectly addressed source */
-    static INLINE GenRegister indirect(uint32_t type, uint32_t subnr, uint32_t width) {
+    static INLINE GenRegister indirect(uint32_t type, uint32_t subnr, uint32_t width,
+                                        uint32_t vstride, uint32_t hstride) {
       GenRegister reg;
       reg.type = type;
       reg.file = GEN_GENERAL_REGISTER_FILE;
       reg.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
       reg.width = width;
-      reg.subnr = subnr;
+      reg.a0_subnr = subnr;
       reg.nr = 0;
+      reg.addr_imm = 0;
       reg.negation = 0;
       reg.absolute = 0;
-      reg.vstride = 0;
-      reg.hstride = 0;
+      reg.vstride = vstride;
+      reg.hstride = hstride;
       return reg;
     }
 
+    /*! Convert a register to indirect addressing mode */
+    static INLINE GenRegister to_indirect1xN(GenRegister reg, uint32_t base_addr,
+                                          int32_t imm_off = 4096, int a0_subnr = 0) {
+      GenRegister r = reg;
+      int32_t offset;
+      if (imm_off > 4095) {
+        offset = (r.nr*32 + r.subnr) - base_addr;
+      } else {
+        offset = imm_off;
+      }
+
+      GBE_ASSERT(offset <= 511 && offset>=-512);
+      r.a0_subnr = a0_subnr;
+      r.addr_imm = offset;
+      r.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
+
+      r.width = GEN_WIDTH_1;
+      r.vstride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      r.hstride = GEN_HORIZONTAL_STRIDE_0;
+      return r;
+    }
+
     static INLINE GenRegister vec16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return GenRegister(file,
                          nr,
@@ -840,6 +958,18 @@ namespace gbe
       return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
     }
 
+    static INLINE GenRegister ul16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec4(file, nr, subnr), GEN_TYPE_UL);
+    }
+
+    static INLINE GenRegister ul8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec4(file, nr, subnr), GEN_TYPE_UL);
+    }
+
+    static INLINE GenRegister ul1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec1(file, nr, subnr), GEN_TYPE_UL);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
     }
@@ -865,7 +995,7 @@ namespace gbe
     }
 
     static INLINE GenRegister uw1(uint32_t file, uint32_t nr, uint32_t subnr) {
-      return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UW), subnr);
+      return offset(retype(vec1(file, nr, 0), GEN_TYPE_UW), 0, typeSize(GEN_TYPE_UW)*subnr);
     }
 
     static INLINE GenRegister ub16(uint32_t file, uint32_t nr, uint32_t subnr) {
@@ -924,6 +1054,18 @@ namespace gbe
       return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
 
+    static INLINE GenRegister ul16grf(uint32_t nr, uint32_t subnr) {
+      return ul16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister ul8grf(uint32_t nr, uint32_t subnr) {
+      return ul8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister ul1grf(uint32_t nr, uint32_t subnr) {
+      return ul1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
     static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
       return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
@@ -1049,6 +1191,7 @@ namespace gbe
         return SIMD1(values...); \
       } \
     }
+    // TODO: Should add native long type here.
     DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
     DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
     DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index eee7c3c..e4cdeaa 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -113,7 +113,7 @@ namespace gbe {
 
 #ifdef GBE_COMPILER_AVAILABLE
   BVAR(OCL_OUTPUT_GEN_IR, false);
-  BVAR(OCL_STRICT_CONFORMANCE, false);
+  BVAR(OCL_STRICT_CONFORMANCE, true);
 
   bool Program::buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel) {
     ir::Unit *unit = new ir::Unit();
@@ -634,6 +634,7 @@ namespace gbe {
 
   SVAR(OCL_PCH_PATH, OCL_PCH_OBJECT);
   SVAR(OCL_HEADER_FILE_DIR, OCL_HEADER_DIR);
+  BVAR(OCL_OUTPUT_KERNEL_SOURCE, false);
 
   static bool processSourceAndOption(const char *source,
                                      const char *options,
@@ -665,6 +666,14 @@ namespace gbe {
       }
     }
     assert(findOcl);
+    if (OCL_OUTPUT_KERNEL_SOURCE) {
+      if(options) {
+        std::cout << "Build options:" << std::endl;
+        std::cout << options << std::endl;
+      }
+      std::cout << "CL kernel source:" << std::endl;
+      std::cout << source;
+    }
     std::string includePath  = "-I" + headerFilePath;
     clOpt.push_back(includePath);
     bool useDefaultCLCVersion = true;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 554fb16..3637ebb 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -101,7 +101,9 @@ enum gbe_curbe_type {
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_ZERO,
   GBE_CURBE_ONE,
+  GBE_CURBE_LANE_ID,
   GBE_CURBE_SLM_OFFSET,
+  GBE_CURBE_BTI_UTIL,
 };
 
 /*! Extra arguments use the negative range of sub-values */
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index f4be488..86197e1 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -178,6 +178,14 @@ void program_build_instance::serialize_program(void) throw(int)
         src_hw_info[0]='B';
         src_hw_info[1]='D';
         src_hw_info[2]='W';
+    }else if(IS_CHERRYVIEW(gen_pci_id)){
+        src_hw_info[0]='C';
+        src_hw_info[1]='H';
+        src_hw_info[2]='V';
+    }else if(IS_SKYLAKE(gen_pci_id)){
+        src_hw_info[0]='S';
+        src_hw_info[1]='K';
+        src_hw_info[2]='L';
     }
 
     if (str_fmt_out) {
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index cf5109d..54265d0 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -176,6 +176,12 @@ namespace ir {
     DECL_THREE_SRC_INSN(MAD);
 #undef DECL_THREE_SRC_INSN
 
+    /*! For all nullary functions */
+    void ALU0(Opcode opcode, Type type, Register dst) {
+      const Instruction insn = gbe::ir::ALU0(opcode, type, dst);
+      this->append(insn);
+    }
+
     /*! For all unary functions */
     void ALU1(Opcode opcode, Type type, Register dst, Register src) {
       const Instruction insn = gbe::ir::ALU1(opcode, type, dst, src);
@@ -184,22 +190,22 @@ namespace ir {
 
     /*! LOAD with the destinations directly specified */
     template <typename... Args>
-    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
     {
       const Tuple index = this->tuple(values...);
       const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
       GBE_ASSERT(valueNum > 0);
-      this->LOAD(type, index, offset, space, valueNum, dwAligned, bti);
+      this->LOAD(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
     }
 
     /*! STORE with the sources directly specified */
     template <typename... Args>
-    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
     {
       const Tuple index = this->tuple(values...);
       const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
       GBE_ASSERT(valueNum > 0);
-      this->STORE(type, index, offset, space, valueNum, dwAligned, bti);
+      this->STORE(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
     }
     void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti, reg); }
 
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 79dc997..f87f23a 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -156,7 +156,7 @@ namespace ir {
       case TYPE_U32:
       case TYPE_S64: out << imm.getIntegerValue(); break;
       case TYPE_U64: out << (uint64_t)imm.getIntegerValue(); break;
-      case TYPE_HALF: out << "half(" << imm.getIntegerValue() << ")"; break;
+      case TYPE_HALF: out << "half(" << (float)imm.getHalfValue() << ")"; break;
       case TYPE_FLOAT: out << imm.getFloatValue(); break;
       case TYPE_DOUBLE: out << imm.getDoubleValue(); break;
       default:
@@ -281,6 +281,27 @@ namespace ir {
     });
   }
 
+  void Function::outputCFG(void) {
+    std::string fileName = getName() + std::string(".dot");
+    ::FILE *fp = fopen(fileName.c_str(), "w");
+    if (fp == NULL) return;
+
+    printf("writing Gen IR CFG to %s\n", fileName.c_str());
+    fprintf(fp, "digraph \"%s\" {\n", getName().c_str());
+    this->foreachBlock([this, fp](BasicBlock &bb) {
+      uint32_t lid = bb.getLabelIndex();
+      fprintf(fp, "Node%d [shape=record, label=\"{%d}\"];\n", lid, lid);
+      set<BasicBlock*> &succ = bb.successors;
+      for (auto x : succ) {
+        uint32_t next = x->getLabelIndex();
+        fprintf(fp, "Node%d -> Node%d\n", lid, next);
+      }
+    });
+    fprintf(fp, "}\n");
+    fclose(fp);
+  }
+
+
   std::ostream &operator<< (std::ostream &out, const Function &fn)
   {
     out << ".decl_function " << fn.getName() << std::endl;
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 2a3d067..5d00cca 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -471,6 +471,8 @@ namespace ir {
     /*! Get surface starting address register from bti */
     Register getSurfaceBaseReg(uint8_t bti) const;
     void appendSurface(uint8_t bti, Register reg);
+    /*! Output the control flow graph to .dot file */
+    void outputCFG();
   private:
     friend class Context;           //!< Can freely modify a function
     std::string name;               //!< Function name
diff --git a/backend/src/ir/half.cpp b/backend/src/ir/half.cpp
new file mode 100644
index 0000000..1c0d7eb
--- /dev/null
+++ b/backend/src/ir/half.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file half.cpp
+ *
+ */
+#include "llvm/ADT/APSInt.h"
+#include "half.hpp"
+
+namespace gbe {
+namespace ir {
+  static llvm::APFloat convU16ToAPFloat(const uint16_t v)
+  {
+    uint64_t v64 = static_cast<uint64_t>(v);
+    llvm::APInt apInt(16, v64, false);
+    return llvm::APFloat(llvm::APFloat::IEEEhalf, apInt);
+  }
+
+  static uint16_t convAPFloatToU16(const llvm::APFloat& apf)
+  {
+    llvm::APInt api = apf.bitcastToAPInt();
+    uint64_t v64 = api.getZExtValue();
+    return static_cast<uint16_t>(v64);
+  }
+
+  half::operator float(void) const {
+    bool loseInfo;
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    apf_self.convert(llvm::APFloat::IEEEsingle, llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+    return apf_self.convertToFloat();
+  }
+
+  half::operator double(void) const {
+    bool loseInfo;
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    apf_self.convert(llvm::APFloat::IEEEdouble, llvm::APFloat::rmNearestTiesToEven, &loseInfo);
+    return apf_self.convertToDouble();
+  }
+
+  half::operator uint16_t(void) const {
+    llvm::APSInt apsInt(16, false);
+    bool isExact;
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    apf_self.convertToInteger(apsInt, llvm::APFloat::rmNearestTiesToEven, &isExact);
+    return static_cast<uint16_t>(apsInt.getZExtValue());
+  }
+
+  half::operator int16_t(void) const {
+    llvm::APSInt apsInt(16, true);
+    bool isExact;
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    apf_self.convertToInteger(apsInt, llvm::APFloat::rmNearestTiesToEven, &isExact);
+    return static_cast<int16_t>(apsInt.getZExtValue());
+  }
+
+  half half::convToHalf(uint16_t u16) {
+    llvm::APFloat res(llvm::APFloat::IEEEhalf, llvm::APInt(16, 0, false));
+    uint64_t u64 = static_cast<uint64_t>(u16);
+    llvm::APInt apInt(16, u64, false);
+    res.convertFromAPInt(apInt, false, llvm::APFloat::rmNearestTiesToEven);
+    return half(convAPFloatToU16(res));
+  }
+
+  half half::convToHalf(int16_t v16) {
+    llvm::APFloat res(llvm::APFloat::IEEEhalf, llvm::APInt(16, 0, true));
+    uint64_t u64 = static_cast<uint64_t>(v16);
+    llvm::APInt apInt(16, u64, true);
+    res.convertFromAPInt(apInt, true, llvm::APFloat::rmNearestTiesToEven);
+    return half(convAPFloatToU16(res));
+  }
+
+  half half::operator +(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    apf_self.add(apf_other, llvm::APFloat::rmNearestTiesToEven);
+    uint16_t ret = convAPFloatToU16(apf_self);
+    return half(ret);
+  }
+
+  half half::operator -(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    apf_self.subtract(apf_other, llvm::APFloat::rmNearestTiesToEven);
+    uint16_t ret = convAPFloatToU16(apf_self);
+    return half(ret);
+  }
+
+  half half::operator *(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    apf_self.multiply(apf_other, llvm::APFloat::rmNearestTiesToEven);
+    uint16_t ret = convAPFloatToU16(apf_self);
+    return half(ret);
+  }
+
+  half half::operator /(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    apf_self.divide(apf_other, llvm::APFloat::rmNearestTiesToEven);
+    uint16_t ret = convAPFloatToU16(apf_self);
+    return half(ret);
+  }
+
+  half half::operator %(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    apf_self.remainder(apf_other);
+    uint16_t ret = convAPFloatToU16(apf_self);
+    return half(ret);
+  }
+
+  bool half::operator ==(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    llvm::APFloat::cmpResult res = apf_self.compare(apf_other);
+    if (res == llvm::APFloat::cmpEqual)
+      return true;
+
+    return false;
+  }
+
+  bool half::operator !=(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    llvm::APFloat::cmpResult res = apf_self.compare(apf_other);
+    if (res == llvm::APFloat::cmpEqual)
+      return false;
+
+    return true;
+  }
+
+  bool half::operator <(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    llvm::APFloat::cmpResult res = apf_self.compare(apf_other);
+    if (res == llvm::APFloat::cmpLessThan)
+      return true;
+
+    return false;
+  }
+
+  bool half::operator >(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    llvm::APFloat::cmpResult res = apf_self.compare(apf_other);
+    if (res == llvm::APFloat::cmpGreaterThan)
+      return true;
+
+    return false;
+  }
+
+  bool half::operator <=(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    llvm::APFloat::cmpResult res = apf_self.compare(apf_other);
+    if (res == llvm::APFloat::cmpLessThan || res == llvm::APFloat::cmpEqual)
+      return true;
+
+    return false;
+  }
+
+  bool half::operator >=(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    llvm::APFloat::cmpResult res = apf_self.compare(apf_other);
+    if (res == llvm::APFloat::cmpGreaterThan || res == llvm::APFloat::cmpEqual)
+      return true;
+
+    return false;
+  }
+
+  bool half::operator &&(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    if (apf_self.isZero() || apf_other.isZero())
+      return false;
+
+    return true;
+  }
+
+  bool half::operator ||(const half& other) const
+  {
+    llvm::APFloat apf_self = convU16ToAPFloat(this->val);
+    llvm::APFloat apf_other = convU16ToAPFloat(other.val);
+    if (apf_self.isZero() && apf_other.isZero())
+      return false;
+
+    return true;
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/half.hpp b/backend/src/ir/half.hpp
new file mode 100644
index 0000000..6d2e207
--- /dev/null
+++ b/backend/src/ir/half.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file half.hpp
+ *
+ */
+
+#ifndef __GBE_IR_HALF_HPP__
+#define __GBE_IR_HALF_HPP__
+
+#include "llvm/ADT/APFloat.h"
+
+namespace gbe {
+namespace ir {
+  /* Because GCC has no builtin half-float data type on the X86 platform, we
+     provide a half class that implements all the operations and conversions
+     for half floats using LLVM's APFloat ADT. */
+  class half
+  {
+    private:
+      uint16_t val;
+    public:
+      half(uint16_t v) : val(v) {};
+      static half convToHalf(uint16_t u16);
+      static half convToHalf(int16_t v16);
+      half(const half& other) { this->val = other.val; };
+      uint16_t getVal(void) { return val; };
+      operator float (void) const;
+      operator double (void) const;
+      operator uint16_t (void) const;
+      operator int16_t (void) const;
+      half operator+ (const half &) const;
+      half operator- (const half &) const;
+      half operator* (const half &) const;
+      half operator/ (const half &) const;
+      half operator% (const half &) const;
+      bool operator> (const half &) const;
+      bool operator< (const half &) const;
+      bool operator== (const half &) const;
+      bool operator!= (const half &) const;
+      bool operator>= (const half &) const;
+      bool operator<= (const half &) const;
+      bool operator&& (const half &) const;
+      bool operator|| (const half &) const;
+  };
+} /* namespace ir */
+} /* namespace gbe */
+#endif /* End of __GBE_IR_HALF_HPP__ */
diff --git a/backend/src/ir/immediate.cpp b/backend/src/ir/immediate.cpp
index 1aac9bc..35fc965 100644
--- a/backend/src/ir/immediate.cpp
+++ b/backend/src/ir/immediate.cpp
@@ -41,6 +41,7 @@ using namespace ir;
         case TYPE_S64:    return Immediate(*this->data.s64 OP *right.data.s64); \
         case TYPE_U64:    return Immediate(*this->data.u64 OP *right.data.u64); \
         case TYPE_FLOAT:  return Immediate(*this->data.f32 OP *right.data.f32); \
+        case TYPE_HALF:   return Immediate(*this->data.f16 OP *right.data.f16); \
         case TYPE_DOUBLE: return Immediate(*this->data.f64 OP *right.data.f64); \
       }\
       return *this;\
@@ -145,6 +146,7 @@ using namespace ir;
         case TYPE_S64:    return Immediate(*left.data.s64 < *right.data.s64);
         case TYPE_U64:    return Immediate(*left.data.u64 < *right.data.u64);
         case TYPE_FLOAT:  return Immediate(*left.data.f32 < *right.data.f32);
+        case TYPE_HALF:   return Immediate(*left.data.f16 < *right.data.f16);
         case TYPE_DOUBLE: return Immediate(*left.data.f64 < *right.data.f64);
       }
     }
@@ -168,6 +170,7 @@ using namespace ir;
           case TYPE_S64:    return Immediate(left.data.s64[index]);
           case TYPE_U64:    return Immediate(left.data.u64[index]);
           case TYPE_FLOAT:  return Immediate(left.data.f32[index]);
+          case TYPE_HALF:   return Immediate(left.data.f16[index]);
           case TYPE_DOUBLE: return Immediate(left.data.f64[index]);
         }
       } else
@@ -187,7 +190,7 @@ using namespace ir;
         case IMM_XOR: *this = left ^ right; break;
         case IMM_REM:
         {
-          if (left.getType() > TYPE_BOOL && left.getType() <= TYPE_U64)
+          if (left.getType() > TYPE_BOOL && left.getType() <= TYPE_HALF)
             *this = left % right;
           else if (left.getType() == TYPE_FLOAT && right.getType() == TYPE_FLOAT) {
             *this = Immediate(left);
diff --git a/backend/src/ir/immediate.hpp b/backend/src/ir/immediate.hpp
index 6b27e8b..3141643 100644
--- a/backend/src/ir/immediate.hpp
+++ b/backend/src/ir/immediate.hpp
@@ -27,6 +27,7 @@
 
 #include <string.h>
 #include "ir/type.hpp"
+#include "ir/half.hpp"
 #include "sys/platform.hpp"
 
 namespace gbe {
@@ -57,6 +58,10 @@ namespace ir {
     IMM_FPTOSI,
     IMM_SITOFP,
     IMM_UITOFP,
+    IMM_HFTOUS,
+    IMM_HFTOSS,
+    IMM_SSTOHF,
+    IMM_USTOHF,
     IMM_EXTRACT,
     IMM_SEXT,
     IMM_ZEXT,
@@ -74,6 +79,7 @@ namespace ir {
     IMM_TYPE_S64 = TYPE_S64,
     IMM_TYPE_U64 = TYPE_U64,
     IMM_TYPE_FLOAT = TYPE_FLOAT,
+    IMM_TYPE_HALF = TYPE_HALF,
     IMM_TYPE_DOUBLE = TYPE_DOUBLE,
     IMM_TYPE_COMP             // compond immediate which consist many immediates.
   } ImmType;
@@ -106,6 +112,7 @@ namespace ir {
         case TYPE_S8:
         case TYPE_U8:   return 1;
         case TYPE_S16:
+        case TYPE_HALF:
         case TYPE_U16:  return 2;
         case TYPE_FLOAT:
         case TYPE_S32:
@@ -130,12 +137,13 @@ namespace ir {
     DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8)
     DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8)
     DECL_CONSTRUCTOR(int16_t, s16, TYPE_S16)
-    DECL_CONSTRUCTOR(uint16_t, u16, TYPE_S16)
+    DECL_CONSTRUCTOR(uint16_t, u16, TYPE_U16)
     DECL_CONSTRUCTOR(int32_t, s32, TYPE_S32)
-    DECL_CONSTRUCTOR(uint32_t, u32, TYPE_S32)
+    DECL_CONSTRUCTOR(uint32_t, u32, TYPE_U32)
     DECL_CONSTRUCTOR(int64_t, s64, TYPE_S64)
-    DECL_CONSTRUCTOR(uint64_t, u64, TYPE_S64)
+    DECL_CONSTRUCTOR(uint64_t, u64, TYPE_U64)
     DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT)
+    DECL_CONSTRUCTOR(half, f16, TYPE_HALF)
     DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE)
 #undef DECL_CONSTRUCTOR
 
@@ -155,12 +163,13 @@ namespace ir {
     DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8, elemNum)
     DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8, elemNum)
     DECL_CONSTRUCTOR(int16_t, s16, TYPE_S16, elemNum)
-    DECL_CONSTRUCTOR(uint16_t, u16, TYPE_S16, elemNum)
+    DECL_CONSTRUCTOR(uint16_t, u16, TYPE_U16, elemNum)
     DECL_CONSTRUCTOR(int32_t, s32, TYPE_S32, elemNum)
-    DECL_CONSTRUCTOR(uint32_t, u32, TYPE_S32, elemNum)
+    DECL_CONSTRUCTOR(uint32_t, u32, TYPE_U32, elemNum)
     DECL_CONSTRUCTOR(int64_t, s64, TYPE_S64, elemNum)
-    DECL_CONSTRUCTOR(uint64_t, u64, TYPE_S64, elemNum)
+    DECL_CONSTRUCTOR(uint64_t, u64, TYPE_U64, elemNum)
     DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT, elemNum)
+    DECL_CONSTRUCTOR(half, f16, TYPE_HALF, elemNum)
     DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE, elemNum)
 #undef DECL_CONSTRUCTOR
 
@@ -209,6 +218,17 @@ namespace ir {
       return *data.f32;
     }
 
+    INLINE half getHalfValue(void) const {
+      GBE_ASSERT(type == IMM_TYPE_HALF);
+      return *data.f16;
+    }
+
+    INLINE half asHalfValue(void) const {
+      // we allow bitcast from u32/s32 immediate to float
+      GBE_ASSERT(type == IMM_TYPE_HALF || type == IMM_TYPE_U16 || type == IMM_TYPE_S16);
+      return *data.f16;
+    }
+
     INLINE int64_t asIntegerValue(void) const {
       GBE_ASSERT(elemNum == 1);
       return *data.s64;
@@ -245,6 +265,10 @@ namespace ir {
         case IMM_FPTOSI: *this = Immediate((int32_t)*other.data.f32); break;
         case IMM_UITOFP: *this = Immediate((float)*other.data.u32); break;
         case IMM_SITOFP: *this = Immediate((float)*other.data.s32); break;
+        case IMM_HFTOUS: *this = Immediate((uint16_t)*other.data.f16); break;
+        case IMM_HFTOSS: *this = Immediate((int16_t)*other.data.f16); break;
+        case IMM_USTOHF: *this = Immediate(half::convToHalf(*other.data.u16)); break;
+        case IMM_SSTOHF: *this = Immediate(half::convToHalf(*other.data.s16)); break;
         case IMM_SEXT:
         {
           int64_t value = other.getIntegerValue();
@@ -274,9 +298,20 @@ namespace ir {
         }
         case IMM_FPEXT:
         {
-          GBE_ASSERT(other.getType() == TYPE_FLOAT && dstType == TYPE_DOUBLE);
-          double value = other.getFloatValue();
-          *this = Immediate(value);
+          if (other.getType() == TYPE_FLOAT) {
+            GBE_ASSERT(dstType == TYPE_DOUBLE);
+            double value = other.getFloatValue();
+            *this = Immediate(value);
+          } else if (other.getType() == TYPE_HALF) {
+            GBE_ASSERT(dstType == TYPE_DOUBLE || dstType == TYPE_FLOAT);
+            if (dstType == TYPE_FLOAT) {
+              float value = other.getHalfValue();
+              *this = Immediate(value);
+            } else {
+              double value = other.getHalfValue();
+              *this = Immediate(value);
+            }
+          }
           break;
         }
       }
@@ -307,6 +342,7 @@ namespace ir {
       uint64_t *u64;
       float *f32;
       double *f64;
+      half *f16;
       const Immediate *immVec[];
       void *p;
     } data;     //!< Value to store
@@ -338,10 +374,12 @@ namespace ir {
   INLINE bool operator< (const Immediate &imm0, const Immediate &imm1) {
     if (imm0.getType() != imm1.getType())
       return uint32_t(imm0.getType()) < uint32_t(imm1.getType());
-    else if (imm0.getType() == TYPE_FLOAT || imm0.getType() == TYPE_DOUBLE)
+    else if (imm0.getType() == TYPE_FLOAT || imm0.getType() == TYPE_DOUBLE || imm0.getType() == TYPE_HALF)
       return imm0.asIntegerValue() < imm1.asIntegerValue();
     else
       return imm0.getIntegerValue() < imm1.getIntegerValue();
+
+    GBE_ASSERT(0);
   }
 
   /*! A value is stored in a per-function vector. This is the index to it */
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 12bc1bf..f93c528 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -131,6 +131,17 @@ namespace ir {
       Register src[srcNum]; //!< Indices of the sources
     };
 
+    /*! All 0-source arithmetic instructions */
+    class ALIGNED_INSTRUCTION NullaryInstruction : public NaryInstruction<0>
+    {
+    public:
+      NullaryInstruction(Opcode opcode, Type type, Register dst) {
+        this->opcode = opcode;
+        this->type = type;
+        this->dst[0] = dst;
+      }
+    };
+
     /*! All 1-source arithmetic instructions */
     class ALIGNED_INSTRUCTION UnaryInstruction : public NaryInstruction<1>
     {
@@ -307,14 +318,14 @@ namespace ir {
 
     class ALIGNED_INSTRUCTION AtomicInstruction :
       public BasePolicy,
-      public TupleSrcPolicy<AtomicInstruction>,
       public NDstPolicy<AtomicInstruction, 1>
     {
     public:
       AtomicInstruction(AtomicOps atomicOp,
                          Register dst,
                          AddressSpace addrSpace,
-                         BTI bti,
+                         Register bti,
+                         bool fixedBTI,
                          Tuple src)
       {
         this->opcode = OP_ATOMIC;
@@ -323,23 +334,43 @@ namespace ir {
         this->src = src;
         this->addrSpace = addrSpace;
         this->bti = bti;
+        this->fixedBTI = fixedBTI ? 1: 0;
         srcNum = 2;
         if((atomicOp == ATOMIC_OP_INC) ||
           (atomicOp == ATOMIC_OP_DEC))
           srcNum = 1;
         if(atomicOp == ATOMIC_OP_CMPXCHG)
           srcNum = 3;
+        srcNum++;
+      }
+      INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
+        if (ID == 0u)
+          return bti;
+        else
+          return fn.getRegister(src, ID -1);
       }
+      INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+        GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
+        if (ID == 0u)
+          bti = reg;
+        else
+          fn.setRegister(src, ID - 1, reg);
+      }
+      INLINE uint32_t getSrcNum(void) const { return srcNum; }
+
       INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
-      INLINE BTI getBTI(void) const { return bti; }
+      INLINE Register getBTI(void) const { return bti; }
+      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
       INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
       INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       Register dst[1];
       Tuple src;
       AddressSpace addrSpace; //!< Address space
-      BTI bti;               //!< bti
-      uint8_t srcNum:2;     //!<Source Number
+      Register bti;               //!< bti
+      uint8_t fixedBTI:1;      //!< fixed bti or not
+      uint8_t srcNum:3;     //!<Source Number
       AtomicOps atomicOp:6;     //!<Source Number
     };
 
@@ -399,7 +430,7 @@ namespace ir {
 
     class ALIGNED_INSTRUCTION LoadInstruction :
       public BasePolicy,
-      public NSrcPolicy<LoadInstruction, 1>
+      public NSrcPolicy<LoadInstruction, 2>
     {
     public:
       LoadInstruction(Type type,
@@ -408,7 +439,8 @@ namespace ir {
                       AddressSpace addrSpace,
                       uint32_t valueNum,
                       bool dwAligned,
-                      BTI bti)
+                      bool fixedBTI,
+                      Register bti)
       {
         GBE_ASSERT(valueNum < 128);
         this->opcode = OP_LOAD;
@@ -418,6 +450,7 @@ namespace ir {
         this->addrSpace = addrSpace;
         this->valueNum = valueNum;
         this->dwAligned = dwAligned ? 1 : 0;
+        this->fixedBTI = fixedBTI ? 1 : 0;
         this->bti = bti;
       }
       INLINE Register getDst(const Function &fn, uint32_t ID) const {
@@ -432,16 +465,18 @@ namespace ir {
       INLINE Type getValueType(void) const { return type; }
       INLINE uint32_t getValueNum(void) const { return valueNum; }
       INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
-      INLINE BTI getBTI(void) const { return bti; }
+      INLINE Register getBTI(void) const { return bti; }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       INLINE bool isAligned(void) const { return !!dwAligned; }
+      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
       Type type;              //!< Type to store
       Register src[0];        //!< Address where to load from
+      Register bti;
       Register offset;        //!< Alias to make it similar to store
       Tuple values;           //!< Values to load
       AddressSpace addrSpace; //!< Where to load
-      BTI bti;
+      uint8_t fixedBTI:1;
       uint8_t valueNum:7;     //!< Number of values to load
       uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
     };
@@ -456,7 +491,8 @@ namespace ir {
                        AddressSpace addrSpace,
                        uint32_t valueNum,
                        bool dwAligned,
-                       BTI bti)
+                       bool fixedBTI,
+                       Register bti)
       {
         GBE_ASSERT(valueNum < 255);
         this->opcode = OP_STORE;
@@ -466,35 +502,42 @@ namespace ir {
         this->addrSpace = addrSpace;
         this->valueNum = valueNum;
         this->dwAligned = dwAligned ? 1 : 0;
+        this->fixedBTI = fixedBTI ? 1 : 0;
         this->bti = bti;
       }
       INLINE Register getSrc(const Function &fn, uint32_t ID) const {
-        GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
         if (ID == 0u)
+          return bti;
+        else if (ID == 1u)
           return offset;
         else
-          return fn.getRegister(values, ID - 1);
+          return fn.getRegister(values, ID - 2);
       }
       INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
-        GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+        GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
         if (ID == 0u)
+          bti = reg;
+        else if (ID == 1u)
           offset = reg;
         else
-          fn.setRegister(values, ID - 1, reg);
+          fn.setRegister(values, ID - 2, reg);
       }
-      INLINE uint32_t getSrcNum(void) const { return valueNum + 1u; }
+      INLINE uint32_t getSrcNum(void) const { return valueNum + 2u; }
       INLINE uint32_t getValueNum(void) const { return valueNum; }
       INLINE Type getValueType(void) const { return type; }
       INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
-      INLINE BTI getBTI(void) const { return bti; }
+      INLINE Register getBTI(void) const { return bti; }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       INLINE bool isAligned(void) const { return !!dwAligned; }
+      INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
       Type type;              //!< Type to store
+      Register bti;
       Register offset;        //!< First source is the offset where to store
       Tuple values;           //!< Values to store
       AddressSpace addrSpace; //!< Where to store
-      BTI bti;                //!< Which btis need access
+      uint8_t fixedBTI:1;                //!< Which btis need access
       uint8_t valueNum:7;     //!< Number of values to store
       uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
       Register dst[0];        //!< No destination
@@ -506,10 +549,11 @@ namespace ir {
       public TupleDstPolicy<SampleInstruction>
     {
     public:
-      SampleInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
+      SampleInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple, uint8_t srcNum, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
         this->opcode = OP_SAMPLE;
         this->dst = dstTuple;
         this->src = srcTuple;
+        this->srcNum = srcNum;
         this->dstIsFloat = dstIsFloat;
         this->srcIsFloat = srcIsFloat;
         this->samplerIdx = sampler;
@@ -521,10 +565,13 @@ namespace ir {
         this->outOpcode(out);
         out << "." << this->getDstType()
             << "." << this->getSrcType()
-            << " surface id " << (int)this->getImageIndex()
-            << " coord u %" << this->getSrc(fn, 0)
-            << " coord v %" << this->getSrc(fn, 1)
-            << " coord w %" << this->getSrc(fn, 2)
+            << " surface id " << (int)this->getImageIndex();
+        out << " coord u %" << this->getSrc(fn, 0);
+        if (srcNum >= 2)
+          out << " coord v %" << this->getSrc(fn, 1);
+        if (srcNum >= 3)
+          out << " coord w %" << this->getSrc(fn, 2);
+        out
             << " %" << this->getDst(fn, 0)
             << " %" << this->getDst(fn, 1)
             << " %" << this->getDst(fn, 2)
@@ -544,7 +591,7 @@ namespace ir {
       uint8_t samplerIdx:4;
       uint8_t samplerOffset:2;
       uint8_t imageIdx;
-      static const uint32_t srcNum = 3;
+      uint8_t srcNum;
       static const uint32_t dstNum = 4;
     };
 
@@ -555,9 +602,10 @@ namespace ir {
     {
     public:
 
-      INLINE TypedWriteInstruction(uint8_t imageIdx, Tuple srcTuple, Type srcType, Type coordType) {
+      INLINE TypedWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, Type srcType, Type coordType) {
         this->opcode = OP_TYPED_WRITE;
         this->src = srcTuple;
+        this->srcNum = srcNum;
         this->coordType = coordType;
         this->srcType = srcType;
         this->imageIdx = imageIdx;
@@ -565,27 +613,30 @@ namespace ir {
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
         this->outOpcode(out);
+        uint32_t srcID = 0;
         out << "." << this->getSrcType()
             << " surface id " << (int)this->getImageIndex()
-            << " coord u %" << this->getSrc(fn, 0)
-            << " coord v %" << this->getSrc(fn, 1)
-            << " coord w %" << this->getSrc(fn, 2)
-            << " %" << this->getSrc(fn, 3)
-            << " %" << this->getSrc(fn, 4)
-            << " %" << this->getSrc(fn, 5)
-            << " %" << this->getSrc(fn, 6);
+            << " coord u %" << this->getSrc(fn, srcID++);
+        if (srcNum >= 6)
+          out << " coord v %" << this->getSrc(fn, srcID++);
+        if (srcNum >= 7)
+          out << " coord w %" << this->getSrc(fn, srcID++);
+        out   << " %" << this->getSrc(fn, srcID++);
+        out   << " %" << this->getSrc(fn, srcID++);
+        out   << " %" << this->getSrc(fn, srcID++);
+        out   << " %" << this->getSrc(fn, srcID++);
       }
 
       Tuple src;
       uint8_t srcType;
       uint8_t coordType;
       uint8_t imageIdx;
+      // bti, u, [v], [w], 4 data elements
+      uint8_t srcNum;
 
       INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE Type getSrcType(void) const { return (Type)this->srcType; }
       INLINE Type getCoordType(void) const { return (Type)this->coordType; }
-      // bti, u, v, w, 4 data elements
-      static const uint32_t srcNum = 7;
       Register dst[0];               //!< No dest register
     };
 
@@ -690,6 +741,22 @@ namespace ir {
       Register src[0];
     };
 
+    class ALIGNED_INSTRUCTION SimdShuffleInstruction : public NaryInstruction<2>
+    {
+    public:
+      SimdShuffleInstruction(Type type,
+                        Register dst,
+                        Register src0,
+                        Register src1) {
+        this->opcode = OP_SIMD_SHUFFLE;
+        this->type = type;
+        this->dst[0] = dst;
+        this->src[0] = src0;
+        this->src[1] = src1;
+      }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+    };
+
     class ALIGNED_INSTRUCTION RegionInstruction :
       public BasePolicy,
       public NSrcPolicy<RegionInstruction, 1>,
@@ -710,6 +777,30 @@ namespace ir {
       Register src[1];
     };
 
+    class ALIGNED_INSTRUCTION IndirectMovInstruction :
+      public BasePolicy,
+      public NSrcPolicy<IndirectMovInstruction, 2>,
+      public NDstPolicy<IndirectMovInstruction, 1>
+    {
+    public:
+      INLINE IndirectMovInstruction(Type type, Register dst, Register src0, Register src1, uint32_t offset) {
+        this->type = type;
+        this->offset = offset;
+        this->dst[0] = dst;
+        this->src[0] = src0;
+        this->src[1] = src1;
+        this->opcode = OP_INDIRECT_MOV;
+      }
+      INLINE Type getType(void) const { return this->type; }
+      INLINE uint32_t getOffset(void) const { return this->offset; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Type type;
+      uint32_t offset;
+      Register dst[1];
+      Register src[2];
+    };
+
     class ALIGNED_INSTRUCTION LabelInstruction :
       public BasePolicy,
       public NSrcPolicy<LabelInstruction, 0>,
@@ -796,7 +887,7 @@ namespace ir {
                                       TYPE_S16, TYPE_U16,
                                       TYPE_S32, TYPE_U32,
                                       TYPE_S64, TYPE_U64,
-                                      TYPE_FLOAT, TYPE_DOUBLE};
+                                      TYPE_HALF, TYPE_FLOAT, TYPE_DOUBLE};
     static const uint32_t allButBoolNum = ARRAY_ELEM_NUM(allButBool);
 
     // TODO add support for 64 bits values
@@ -942,10 +1033,12 @@ namespace ir {
         return false;
       if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
         return false;
-      for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
-        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID), fn, whyNot) == false))
+      for (uint32_t srcID = 0; srcID < srcNum-1u; ++srcID)
+        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
           return false;
 
+      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, bti, fn, whyNot) == false))
+        return false;
       return true;
     }
 
@@ -1077,6 +1170,19 @@ namespace ir {
       return true;
     }
 
+    INLINE bool SimdShuffleInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      if (UNLIKELY( this->type != TYPE_U32 && this->type != TYPE_S32 && this->type != TYPE_FLOAT)) {
+        whyNot = "Only support S32/U32/FLOAT type";
+        return false;
+      }
+
+      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, src[1], fn, whyNot) == false))
+        return false;
+
+      return true;
+    }
+
     INLINE bool RegionInstruction::wellFormed(const Function &fn, std::string &whyNot) const
     {
       if (UNLIKELY(checkRegisterData(FAMILY_DWORD, src[0], fn, whyNot) == false))
@@ -1087,6 +1193,16 @@ namespace ir {
       return true;
     }
 
+    INLINE bool IndirectMovInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      const RegisterFamily family = getFamily(this->type);
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+        return false;
+      return true;
+    }
+
     // Only a label index is required
     INLINE bool LabelInstruction::wellFormed(const Function &fn, std::string &whyNot) const
     {
@@ -1146,12 +1262,10 @@ namespace ir {
       this->outOpcode(out);
       out << "." << addrSpace;
       out << " %" << this->getDst(fn, 0);
-      out << " {" << "%" << this->getSrc(fn, 0) << "}";
-      for (uint32_t i = 1; i < srcNum; ++i)
+      out << " {" << "%" << this->getSrc(fn, 1) << "}";
+      for (uint32_t i = 2; i < srcNum; ++i)
         out << " %" << this->getSrc(fn, i);
-      out << " bti";
-      for (uint32_t i = 0; i < bti.count; ++i)
-        out << ": " << (int)bti.bti[i];
+      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
     }
 
 
@@ -1185,22 +1299,18 @@ namespace ir {
       for (uint32_t i = 0; i < valueNum; ++i)
         out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
       out << "}";
-      out << " %" << this->getSrc(fn, 0);
-      out << " bti";
-      for (uint32_t i = 0; i < bti.count; ++i)
-        out << ": " << (int)bti.bti[i];
+      out << " %" << this->getSrc(fn, 1);
+      out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
     }
 
     INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
       out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
-      out << " %" << this->getSrc(fn, 0) << " {";
+      out << " %" << this->getSrc(fn, 1) << " {";
       for (uint32_t i = 0; i < valueNum; ++i)
-        out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
+        out << "%" << this->getSrc(fn, i+2) << (i != (valueNum-1u) ? " " : "");
       out << "}";
-      out << " bti";
-      for (uint32_t i = 0; i < bti.count; ++i)
-        out << ": " << (int)bti.bti[i];
+      out <<  (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
     }
 
     INLINE void ReadARFInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1213,6 +1323,12 @@ namespace ir {
       out << " %" << this->getDst(fn, 0) << " %" << this->getSrc(fn, 0) << " offset: " << this->offset;
     }
 
+    INLINE void IndirectMovInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << type << " %" << this->getDst(fn, 0) << " %" << this->getSrc(fn, 0);
+      out << " %" << this->getSrc(fn, 1) << " offset: " << this->offset;
+    }
+
     INLINE void LabelInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
       out << " $" << labelIndex;
@@ -1298,6 +1414,10 @@ namespace ir {
     }; \
   }
 
+START_INTROSPECTION(NullaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(NullaryInstruction)
+
 START_INTROSPECTION(UnaryInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(UnaryInstruction)
@@ -1370,6 +1490,14 @@ START_INTROSPECTION(RegionInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(RegionInstruction)
 
+START_INTROSPECTION(SimdShuffleInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SimdShuffleInstruction)
+
+START_INTROSPECTION(IndirectMovInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(IndirectMovInstruction)
+
 START_INTROSPECTION(LabelInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(LabelInstruction)
@@ -1525,6 +1653,7 @@ END_FUNCTION(Instruction, Register)
     return reinterpret_cast<const internal::CLASS*>(this)->CALL; \
   }
 
+DECL_MEM_FN(NullaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(UnaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(BinaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(BinaryInstruction, bool, commutes(void), commutes())
@@ -1536,18 +1665,18 @@ DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(AtomicInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
+DECL_MEM_FN(AtomicInstruction, bool, isFixedBTI(void), isFixedBTI())
 DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
 DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(StoreInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(StoreInstruction, bool, isFixedBTI(void), isFixedBTI())
 DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
 DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(LoadInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(LoadInstruction, bool, isFixedBTI(void), isFixedBTI())
 DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
 DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
 DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
@@ -1556,7 +1685,10 @@ DECL_MEM_FN(BranchInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
 DECL_MEM_FN(SyncInstruction, uint32_t, getParameters(void), getParameters())
 DECL_MEM_FN(ReadARFInstruction, Type, getType(void), getType())
 DECL_MEM_FN(ReadARFInstruction, ARFRegister, getARFRegister(void), getARFRegister())
+DECL_MEM_FN(SimdShuffleInstruction, Type, getType(void), getType())
 DECL_MEM_FN(RegionInstruction, uint32_t, getOffset(void), getOffset())
+DECL_MEM_FN(IndirectMovInstruction, uint32_t, getOffset(void), getOffset())
+DECL_MEM_FN(IndirectMovInstruction, Type, getType(void), getType())
 DECL_MEM_FN(SampleInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(SampleInstruction, uint8_t, getSamplerIndex(void), getSamplerIndex())
@@ -1578,6 +1710,21 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   ///////////////////////////////////////////////////////////////////////////
   // Implements the emission functions
   ///////////////////////////////////////////////////////////////////////////
+  // For all nullary functions with given opcode
+  Instruction ALU0(Opcode opcode, Type type, Register dst) {
+    return internal::NullaryInstruction(opcode, type, dst).convert();
+  }
+
+  // All nullary functions
+#define DECL_EMIT_FUNCTION(NAME) \
+  Instruction NAME(Type type, Register dst) { \
+    return ALU0(OP_##NAME, type, dst);\
+  }
+
+  DECL_EMIT_FUNCTION(SIMD_SIZE)
+  DECL_EMIT_FUNCTION(SIMD_ID)
+
+#undef DECL_EMIT_FUNCTION
 
   // For all unary functions with given opcode
   Instruction ALU1(Opcode opcode, Type type, Register dst, Register src) {
@@ -1594,6 +1741,7 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   DECL_EMIT_FUNCTION(FBH)
   DECL_EMIT_FUNCTION(FBL)
   DECL_EMIT_FUNCTION(CBIT)
+  DECL_EMIT_FUNCTION(LZD)
   DECL_EMIT_FUNCTION(COS)
   DECL_EMIT_FUNCTION(SIN)
   DECL_EMIT_FUNCTION(LOG)
@@ -1695,8 +1843,8 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   }
 
   // For all unary functions with given opcode
-  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, BTI bti, Tuple src) {
-    return internal::AtomicInstruction(atomicOp, dst, space, bti, src).convert();
+  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src) {
+    return internal::AtomicInstruction(atomicOp, dst, space, bti, fixedBTI, src).convert();
   }
 
   // BRA
@@ -1744,9 +1892,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
                    AddressSpace space, \
                    uint32_t valueNum, \
                    bool dwAligned, \
-                   BTI bti) \
+                   bool fixedBTI, \
+                   Register bti) \
   { \
-    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,bti).convert(); \
+    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,fixedBTI,bti).convert(); \
   }
 
   DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
@@ -1765,6 +1914,13 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   Instruction REGION(Register dst, Register src, uint32_t offset) {
     return internal::RegionInstruction(dst, src, offset).convert();
   }
+  Instruction SIMD_SHUFFLE(Type type, Register dst, Register src0, Register src1) {
+    return internal::SimdShuffleInstruction(type, dst, src0, src1).convert();
+  }
+
+  Instruction INDIRECT_MOV(Type type, Register dst, Register src0, Register src1, uint32_t offset) {
+    return internal::IndirectMovInstruction(type, dst, src0, src1, offset).convert();
+  }
 
   // LABEL
   Instruction LABEL(LabelIndex labelIndex) {
@@ -1772,12 +1928,12 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   }
 
   // SAMPLE
-  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
-    return internal::SampleInstruction(imageIndex, dst, src, dstIsFloat, srcIsFloat, sampler, samplerOffset).convert();
+  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, uint8_t srcNum, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
+    return internal::SampleInstruction(imageIndex, dst, src, srcNum, dstIsFloat, srcIsFloat, sampler, samplerOffset).convert();
   }
 
-  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType) {
-    return internal::TypedWriteInstruction(imageIndex, src, srcType, coordType).convert();
+  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType) {
+    return internal::TypedWriteInstruction(imageIndex, src, srcNum, srcType, coordType).convert();
   }
 
   Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg) {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index f7024d4..cf8d839 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -36,10 +36,13 @@
 namespace gbe {
 namespace ir {
   struct BTI {
-    uint8_t bti[MAX_MIXED_POINTER];
-    uint8_t count;
-    BTI() : count(0) {
-      memset(bti, 0, MAX_MIXED_POINTER);
+    uint8_t isConst; // whether fixed bti
+    union {
+      Register reg;  // mixed reg
+      unsigned short imm;  // fixed bti
+    };
+
+    BTI() : isConst(0) {
     }
     ~BTI() {}
   };
@@ -199,6 +202,15 @@ namespace ir {
   /*! Output the instruction string in the given stream */
   std::ostream &operator<< (std::ostream &out, const Instruction &proxy);
 
+  /*! Nullary instruction instructions are typed. */
+  class NullaryInstruction : public Instruction {
+  public:
+    /*! Get the type manipulated by the instruction */
+    Type getType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Unary instructions are typed. dst and sources share the same type */
   class UnaryInstruction : public Instruction {
   public:
@@ -280,10 +292,12 @@ namespace ir {
   class AtomicInstruction : public Instruction {
   public:
     /*! Where the address register goes */
-    static const uint32_t addressIndex = 0;
+    static const uint32_t btiIndex = 0;
+    static const uint32_t addressIndex = 1;
     /*! Address space that is manipulated here */
     AddressSpace getAddressSpace(void) const;
-    BTI getBTI(void) const;
+    Register getBTI(void) const { return this->getSrc(btiIndex); }
+    bool isFixedBTI(void) const;
     /*! Return the atomic function code */
     AtomicOps getAtomicOpcode(void) const;
     /*! Return the register that contains the addresses */
@@ -298,12 +312,14 @@ namespace ir {
   class StoreInstruction : public Instruction {
   public:
     /*! Where the address register goes */
-    static const uint32_t addressIndex = 0;
+    static const uint32_t btiIndex = 0;
+    static const uint32_t addressIndex = 1;
     /*! Return the types of the values to store */
     Type getValueType(void) const;
     /*! Give the number of values the instruction is storing (srcNum-1) */
     uint32_t getValueNum(void) const;
-    BTI getBTI(void) const;
+    Register getBTI(void) const { return this->getSrc(btiIndex); }
+    bool isFixedBTI(void) const;
     /*! Address space that is manipulated here */
     AddressSpace getAddressSpace(void) const;
     /*! DWORD aligned means untyped read for Gen. That is what matters */
@@ -313,7 +329,7 @@ namespace ir {
     /*! Return the register that contain value valueID */
     INLINE Register getValue(uint32_t valueID) const {
       GBE_ASSERT(valueID < this->getValueNum());
-      return this->getSrc(valueID + 1u);
+      return this->getSrc(valueID + 2u);
     }
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
@@ -334,8 +350,9 @@ namespace ir {
     /*! DWORD aligned means untyped read for Gen. That is what matters */
     bool isAligned(void) const;
     /*! Return the register that contains the addresses */
-    INLINE Register getAddress(void) const { return this->getSrc(0u); }
-    BTI getBTI(void) const;
+    INLINE Register getAddress(void) const { return this->getSrc(1u); }
+    Register getBTI(void) const {return this->getSrc(0u);}
+    bool isFixedBTI(void) const;
     /*! Return the register that contain value valueID */
     INLINE Register getValue(uint32_t valueID) const {
       return this->getDst(valueID);
@@ -505,6 +522,14 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! simd shuffle */
+  class SimdShuffleInstruction : public Instruction {
+  public:
+    Type getType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! return a region of a register, make sure the offset does not exceed the register size */
   class RegionInstruction : public Instruction {
   public:
@@ -513,6 +538,15 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! Indirect Move instruction */
+  class IndirectMovInstruction : public Instruction {
+  public:
+    Type getType(void) const;
+    uint32_t getOffset(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Specialize the instruction. Also performs typechecking first based on the
    *  opcode. Crashes if it fails
    */
@@ -559,6 +593,12 @@ namespace ir {
   /// All emission functions
   ///////////////////////////////////////////////////////////////////////////
 
+  /*! alu0.type dst */
+  Instruction ALU0(Opcode opcode, Type type, Register dst);
+  /*! simd_size.type dst */
+  Instruction SIMD_SIZE(Type type, Register dst);
+  /*! simd_id.type dst */
+  Instruction SIMD_ID(Type type, Register dst);
   /*! alu1.type dst src */
   Instruction ALU1(Opcode opcode, Type type, Register dst, Register src);
   /*! mov.type dst src */
@@ -587,6 +627,8 @@ namespace ir {
   Instruction FBL(Type type, Register dst, Register src);
   /*! cbit.type dst src */
   Instruction CBIT(Type type, Register dst, Register src);
+  /*! lzd.type dst src */
+  Instruction LZD(Type type, Register dst, Register src);
   /*! hadd.type dst src */
   Instruction HADD(Type type, Register dst, Register src0, Register src1);
   /*! rhadd.type dst src */
@@ -619,6 +661,8 @@ namespace ir {
   Instruction RNDU(Type type, Register dst, Register src);
   /*! rndz.type dst src */
   Instruction RNDZ(Type type, Register dst, Register src);
+  /*! bswap.type dst src */
+  Instruction BSWAP(Type type, Register dst, Register src);
   /*! pow.type dst src0 src1 */
   Instruction POW(Type type, Register dst, Register src0, Register src1);
   /*! mul.type dst src0 src1 */
@@ -667,6 +711,8 @@ namespace ir {
   Instruction GT(Type type, Register dst, Register src0, Register src1);
   /*! ord.type dst src0 src1 */
   Instruction ORD(Type type, Register dst, Register src0, Register src1);
+  /*! sub_group_shuffle.type dst src0 src1 */
+  Instruction SIMD_SHUFFLE(Type type, Register dst, Register src0, Register src1);
   /*! BITCAST.{dstType <- srcType} dst src */
   Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum);
   /*! cvt.{dstType <- srcType} dst src */
@@ -678,7 +724,7 @@ namespace ir {
   /*! F32TO16.{dstType <- srcType} dst src */
   Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
   /*! atomic dst addr.space {src1 {src2}} */
-  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, BTI bti, Tuple src);
+  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src);
   /*! bra labelIndex */
   Instruction BRA(LabelIndex labelIndex);
   /*! (pred) bra labelIndex */
@@ -694,9 +740,9 @@ namespace ir {
   /*! ret */
   Instruction RET(void);
   /*! load.type.space {dst1,...,dst_valueNum} offset value */
-  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
   /*! store.type.space offset {src1,...,src_valueNum} value */
-  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
   /*! loadi.type dst value */
   Instruction LOADI(Type type, Register dst, ImmediateIndex value);
   /*! sync.params... (see Sync instruction) */
@@ -704,10 +750,11 @@ namespace ir {
 
   Instruction READ_ARF(Type type, Register dst, ARFRegister arf);
   Instruction REGION(Register dst, Register src, uint32_t offset);
+  Instruction INDIRECT_MOV(Type type, Register dst, Register src0, Register src1, uint32_t offset);
   /*! typed write */
-  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType);
+  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType);
   /*! sample textures */
-  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset);
+  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, uint8_t srcNum, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset);
   /*! get image information , such as width/height/depth/... */
   Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg);
   /*! label labelIndex */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 9a89069..81548c9 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -25,6 +25,8 @@
  * \file instruction.hxx
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
+DECL_INSN(SIMD_SIZE, NullaryInstruction)
+DECL_INSN(SIMD_ID, NullaryInstruction)
 DECL_INSN(MOV, UnaryInstruction)
 DECL_INSN(COS, UnaryInstruction)
 DECL_INSN(SIN, UnaryInstruction)
@@ -40,6 +42,7 @@ DECL_INSN(RNDU, UnaryInstruction)
 DECL_INSN(RNDZ, UnaryInstruction)
 DECL_INSN(SIMD_ANY, UnaryInstruction)
 DECL_INSN(SIMD_ALL, UnaryInstruction)
+DECL_INSN(BSWAP, UnaryInstruction)
 DECL_INSN(POW, BinaryInstruction)
 DECL_INSN(MUL, BinaryInstruction)
 DECL_INSN(ADD, BinaryInstruction)
@@ -56,6 +59,7 @@ DECL_INSN(BSB, BinaryInstruction)
 DECL_INSN(OR, BinaryInstruction)
 DECL_INSN(XOR, BinaryInstruction)
 DECL_INSN(AND, BinaryInstruction)
+DECL_INSN(SIMD_SHUFFLE, SimdShuffleInstruction)
 DECL_INSN(SEL, SelectInstruction)
 DECL_INSN(EQ, CompareInstruction)
 DECL_INSN(NE, CompareInstruction)
@@ -81,12 +85,14 @@ DECL_INSN(SYNC, SyncInstruction)
 DECL_INSN(LABEL, LabelInstruction)
 DECL_INSN(READ_ARF, ReadARFInstruction)
 DECL_INSN(REGION, RegionInstruction)
+DECL_INSN(INDIRECT_MOV, IndirectMovInstruction)
 DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
 DECL_INSN(MUL_HI, BinaryInstruction)
 DECL_INSN(I64_MUL_HI, BinaryInstruction)
 DECL_INSN(FBH, UnaryInstruction)
 DECL_INSN(FBL, UnaryInstruction)
 DECL_INSN(CBIT, UnaryInstruction)
+DECL_INSN(LZD, UnaryInstruction)
 DECL_INSN(HADD, BinaryInstruction)
 DECL_INSN(RHADD, BinaryInstruction)
 DECL_INSN(I64HADD, BinaryInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index 2b1ffdb..9fa7ac3 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -66,6 +66,11 @@ namespace ir {
         const uint32_t srcNum = insn.getSrcNum();
         const uint32_t dstNum = insn.getDstNum();
         bool uniform = true;
+
+        //do not change dst uniform for simd id
+        if (insn.getOpcode() == ir::OP_SIMD_ID)
+          uniform = false;
+
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const Register reg = insn.getSrc(srcID);
           if (!fn.isUniformRegister(reg))
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 73b1dd2..9fcdf74 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -87,9 +87,15 @@ namespace ir {
     uint64_t offset;      //!< Offset where to load in the structure
     uint32_t argID;       //!< Associated function argument
   };
+  struct IndirectLoad {
+    Instruction *load;           //!< Load from the argument
+    vector<Instruction *> adds;  //!< Can be NULL if we only have load(arg)
+    uint32_t argID;              //!< Associated function argument
+  };
 
   /*! List of direct loads */
   typedef vector<LoadAddImm> LoadAddImmSeq;
+  typedef vector<IndirectLoad> IndirectLoadSeq;
 
   /*! Helper class to lower function arguments if required */
   class FunctionArgumentLowerer : public Context
@@ -102,9 +108,13 @@ namespace ir {
     /*! Perform all function arguments substitution if needed */
     void lower(const std::string &name);
     /*! Lower the given function argument accesses */
-    void lower(uint32_t argID);
+    ArgUse lower(uint32_t argID);
     /*! Build the constant push for the function */
     void buildConstantPush(void);
+    /* Lower indirect Read to indirect Mov */
+    void lowerIndirectRead(uint32_t argID);
+    /* Convert indirectLoad to indirect Mov */
+    void ReplaceIndirectLoad(void);
     /*! Inspect the given function argument to see how it is used. If this is
      *  direct loads only, we also output the list of instructions used for each
      *  load
@@ -117,6 +127,7 @@ namespace ir {
     Liveness *liveness; //!< To compute the function graph
     FunctionDAG *dag;   //!< Contains complete dependency information
     LoadAddImmSeq seq;  //!< All the direct loads
+    IndirectLoadSeq indirectSeq;  //!< All the indirect loads
   };
 
   INLINE uint64_t getOffsetFromImm(const Immediate &imm) {
@@ -183,15 +194,21 @@ namespace ir {
     // Process all structure arguments and find all the direct loads we can
     // replace
     const uint32_t argNum = fn->argNum();
+    vector<uint32_t> indirctReadArgs;
     for (uint32_t argID = 0; argID < argNum; ++argID) {
       FunctionArgument &arg = fn->getArg(argID);
       if (arg.type != FunctionArgument::STRUCTURE) continue;
-      this->lower(argID);
+      if(this->lower(argID) == ARG_INDIRECT_READ)
+        indirctReadArgs.push_back(argID);
     }
 
     // Build the constant push description and remove the instruction that
     // therefore become useless
     this->buildConstantPush();
+    for (uint32_t i = 0; i < indirctReadArgs.size(); ++i){
+      lowerIndirectRead(indirctReadArgs[i]);
+    }
+    ReplaceIndirectLoad();
   }
 
 // Remove all the given instructions from the stream (if dead)
@@ -259,17 +276,128 @@ namespace ir {
         replaced = true;
       }
 
-      if (replaced)
+      if (replaced) {
         dead.insert(load);
+        load->remove();
+      }
     }
 
-    REMOVE_INSN(load)
     REMOVE_INSN(add)
     REMOVE_INSN(loadImm)
   }
 
 #undef REMOVE_INSN
 
+  void FunctionArgumentLowerer::lowerIndirectRead(uint32_t argID)
+  {
+    FunctionArgument &arg = fn->getArg(argID);
+
+    vector<Register> derivedRegs;
+    map<Register, vector<Instruction *>> addPtrInsns;
+    derivedRegs.push_back(arg.reg);
+
+    //Collect all load from this argument.
+    for(uint32_t i=0; i<derivedRegs.size(); i++) {
+      const UseSet *useSet = dag->getRegUse(derivedRegs[i]);
+      for (const auto &use : *useSet) {
+        Instruction *insn = const_cast<Instruction*>(use->getInstruction());
+        const Opcode opcode = insn->getOpcode();
+        const uint32_t dstNum = insn->getDstNum();
+        GBE_ASSERT(dstNum == 1 || opcode == OP_LOAD);
+        const Register dst = insn->getDst();
+        auto it = addPtrInsns.find(derivedRegs[i]);
+
+        if((opcode == OP_ADD) && (derivedRegs[i] == arg.reg)) {
+          GBE_ASSERT(it == addPtrInsns.end());
+
+          vector<Instruction *> addInsns;
+          addInsns.push_back(insn);
+          addPtrInsns.insert(std::make_pair(dst, addInsns));
+          derivedRegs.push_back(dst);
+        } else if(opcode == OP_LOAD) {
+          LoadInstruction *load = cast<LoadInstruction>(insn);
+          if (load->getAddressSpace() != MEM_PRIVATE)
+            continue;
+
+          IndirectLoad indirectLoad;
+          Register addr = load->getAddress();
+          indirectLoad.argID = argID;
+          indirectLoad.load = insn;
+
+          auto addrIt = addPtrInsns.find(addr);
+          GBE_ASSERT(addrIt != addPtrInsns.end());
+          indirectLoad.adds = addrIt->second;
+
+          indirectSeq.push_back(indirectLoad);
+        } else {
+          if(it == addPtrInsns.end()) continue;  //use arg as phi or selection, no add, skip it.
+          auto dstIt = addPtrInsns.find(dst);
+          if(dstIt == addPtrInsns.end())
+            addPtrInsns.insert(std::make_pair(dst, it->second));
+          else {
+            //Multiple srcs from both arguments, such as select or phi; merge the vector
+            dstIt->second.insert(dstIt->second.end(), it->second.begin(), it->second.end());
+          }
+          derivedRegs.push_back(dst);
+        }
+      }
+    }
+  }
+
+  void FunctionArgumentLowerer::ReplaceIndirectLoad(void)
+  {
+    if (indirectSeq.size() == 0)
+      return;
+
+    // Track instructions we remove to recursively kill them properly
+    set<const Instruction*> dead;
+
+    set<PushLocation> inserted;
+    for (const auto &indirectLoad : indirectSeq) {
+      const Register arg = fn->getArg(indirectLoad.argID).reg;
+      if(dead.contains(indirectLoad.load)) continue;  //repetitive load in the indirectSeq, skip.
+      LoadInstruction *load = cast<LoadInstruction>(indirectLoad.load);
+      const uint32_t valueNum = load->getValueNum();
+      bool replaced = false;
+      Instruction *ins_after = load; // the instruction to insert after.
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+        const Type type = load->getValueType();
+        const RegisterFamily family = getFamily(type);
+        const uint32_t size = getFamilySize(family);
+        const uint32_t offset = valueID * size;
+
+        const Register reg = load->getValue(valueID);
+
+        Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load->getAddress(), offset);
+        mov.insert(ins_after, &ins_after);
+        replaced = true;
+      }
+
+      if (replaced && !dead.contains(load)) {
+        dead.insert(load);
+        load->remove();
+      }
+
+      vector<Instruction *> adds = indirectLoad.adds;
+      for (uint32_t i=0; i<adds.size(); i++) {
+        BinaryInstruction *add = cast<BinaryInstruction>(adds[i]);
+        if (!dead.contains(add)) {
+          Register dst = add->getDst();
+          const Register src0 = add->getSrc(0);
+          const Register src1 = add->getSrc(1);
+
+          GBE_ASSERT(src0 == arg || src1 == arg);
+          Register src = (src0 == arg) ? src1 : src0;
+          Instruction mov = ir::MOV(add->getType(), dst, src);
+
+          //MOV instruction could optimize if the dst don't write later
+          mov.replace(add);
+          dead.insert(add);
+        }
+      }
+    }
+  }
+
   bool FunctionArgumentLowerer::useStore(const ValueDef &def, set<const Instruction*> &visited)
   {
     const UseSet &useSet = dag->getUse(def);
@@ -298,6 +426,7 @@ namespace ir {
   {
     const FunctionArgument &arg = fn->getArg(argID);
     LoadAddImmSeq tmpSeq;
+    bool match = true;
 
     // Inspect all uses of the function argument pointer
     const UseSet &useSet = dag->getUse(&arg);
@@ -345,7 +474,8 @@ namespace ir {
         if (matchLoad(insn, add, loadImm, offset, argID, loadAddImm)) {
           tmpSeq.push_back(loadAddImm);
           continue;
-        }
+        } else
+          match = false;
       }
     }
 
@@ -353,7 +483,7 @@ namespace ir {
     // direct load definitions we found
     for (const auto &loadImmSeq : tmpSeq)
       seq.push_back(loadImmSeq);
-    return true;
+    return match;
   }
 
   ArgUse FunctionArgumentLowerer::getArgUse(uint32_t argID)
@@ -373,17 +503,18 @@ namespace ir {
     return ARG_INDIRECT_READ;
   }
 
-  void FunctionArgumentLowerer::lower(uint32_t argID) {
-    IF_DEBUG(const ArgUse argUse = )this->getArgUse(argID);
+  ArgUse FunctionArgumentLowerer::lower(uint32_t argID) {
+    const ArgUse argUse = this->getArgUse(argID);
 #if GBE_DEBUG
     GBE_ASSERTM(argUse != ARG_WRITTEN,
                 "TODO A store to a structure argument "
                 "(i.e. not a char/short/int/float argument) has been found. "
                 "This is not supported yet");
-    GBE_ASSERTM(argUse != ARG_INDIRECT_READ,
-                "TODO Only direct loads of structure arguments are "
-                "supported now");
+    //GBE_ASSERTM(argUse != ARG_INDIRECT_READ,
+    //            "TODO Only direct loads of structure arguments are "
+    //            "supported now");
 #endif /* GBE_DEBUG */
+    return argUse;
   }
 
   void lowerFunctionArguments(Unit &unit, const std::string &functionName) {
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index ec7ab94..af9f698 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -44,7 +44,9 @@ namespace ir {
         "retVal", "slm_offset",
         "printf_buffer_pointer", "printf_index_buffer_pointer",
         "dwblockip",
-        "invalid"
+        "lane_id",
+        "invalid",
+        "bti_utility"
     };
 
 #if GBE_DEBUG
@@ -88,7 +90,9 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
       DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0);
+      DECL_NEW_REG(FAMILY_DWORD, laneid, 0);
       DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
+      DECL_NEW_REG(FAMILY_DWORD, btiUtil, 1);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 8f69320..9323824 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -72,8 +72,10 @@ namespace ir {
     static const Register printfbptr = Register(28); // printf buffer address .
     static const Register printfiptr = Register(29); // printf index buffer address.
     static const Register dwblockip = Register(30);  // blockip
-    static const Register invalid = Register(31);  // used for valid comparation.
-    static const uint32_t regNum = 32;             // number of special registers
+    static const Register laneid = Register(31);  // lane id.
+    static const Register invalid = Register(32);  // used for valid comparison.
+    static const Register btiUtil = Register(33);  // used for mixed pointer as bti utility.
+    static const uint32_t regNum = 34;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/ir/structurizer.cpp b/backend/src/ir/structurizer.cpp
new file mode 100644
index 0000000..6c4e455
--- /dev/null
+++ b/backend/src/ir/structurizer.cpp
@@ -0,0 +1,996 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "structurizer.hpp"
+#include "sys/cvar.hpp"
+
+using namespace llvm;
+namespace gbe {
+namespace ir {
+  CFGStructurizer::~CFGStructurizer()
+  {
+    BlockVector::iterator iter = blocks.begin();
+    BlockVector::iterator iter_end = blocks.end();
+    while(iter != iter_end)
+    {
+      delete *iter;
+      iter++;
+    }
+  }
+
+  void CFGStructurizer::handleSelfLoopBlock(Block *loopblock, LabelIndex& whileLabel)
+  {
+    //BlockList::iterator child_iter = (*it)->children.begin();
+    BasicBlock *pbb = loopblock->getExit();
+    GBE_ASSERT(pbb->isLoopExit);
+    BasicBlock::iterator it = pbb->end();
+    it--;
+    if (pbb->hasExtraBra)
+      it--;
+    BranchInstruction* pinsn = static_cast<BranchInstruction *>(&*it);
+
+    if(!pinsn->isPredicated()){
+      std::cout << "WARNING:" << "endless loop detected!" << std::endl;
+      return;
+    }
+    Register reg = pinsn->getPredicateIndex();
+    /* since this block is an while block, so we remove the BRA instruction at the bottom of the exit BB of 'block',
+     * and insert WHILE instead
+     */
+    whileLabel = pinsn->getLabelIndex();
+    Instruction insn = WHILE(whileLabel, reg);
+    Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+    pbb->insertAt(it, *p_new_insn);
+    pbb->whileLabel = whileLabel;
+    pbb->erase(it);
+  }
+
+  /* recursively mark the bbs' variable needIf */
+  void CFGStructurizer::markNeedIf(Block *block, bool status)
+  {
+    if(block->type() == SingleBlockType)
+    {
+      BasicBlock* bb = ((SimpleBlock*)block)->getBasicBlock();
+      bb->needIf = status;
+      return;
+    }
+    BlockList::iterator it = block->children.begin();
+    while(it != block->children.end())
+    {
+      markNeedIf(*it,status);
+      it++;
+    }
+  }
+
+  /* recursively mark the bbs' variable needEndif */
+  void CFGStructurizer::markNeedEndif(Block *block, bool status)
+  {
+    if(block->type() == SingleBlockType)
+    {
+      BasicBlock* bb = ((SimpleBlock*)block)->getBasicBlock();
+      bb->needEndif = status;
+      return;
+    }
+
+    BlockList::iterator it = block->children.begin();
+    while(it != block->children.end())
+    {
+      markNeedEndif(*it, status);
+      it++;
+    }
+  }
+
+  /* recursively mark the bbs' variable mark */
+  void CFGStructurizer::markStructuredBlocks(Block *block, bool status)
+  {
+    if(block->type() == SingleBlockType)
+    {
+      SimpleBlock* pbb = static_cast<SimpleBlock*>(block);
+      pbb->getBasicBlock()->belongToStructure = true;
+    }
+    block->mark = status;
+    BlockList::iterator it = block->children.begin();
+    while(it != block->children.end())
+    {
+      markStructuredBlocks(*it, status);
+      it++;
+    }
+  }
+
+  void CFGStructurizer::handleIfBlock(Block *block, LabelIndex& matchingEndifLabel, LabelIndex& matchingElseLabel)
+  {
+    BasicBlock *pbb = block->getExit();
+    BranchInstruction* pinsn = static_cast<BranchInstruction *>(pbb->getLastInstruction());
+    Register reg = pinsn->getPredicateIndex();
+    BasicBlock::iterator it = pbb->end();
+    it--;
+    /* since this block is an if block, so we remove the BRA instruction at the bottom of the exit BB of 'block',
+     * and insert IF instead
+     */
+    pbb->erase(it);
+    Instruction insn = IF(matchingElseLabel, reg, block->inversePredicate);
+    Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+    pbb->append(*p_new_insn);
+    pbb->matchingEndifLabel = matchingEndifLabel;
+    pbb->matchingElseLabel = matchingElseLabel;
+  }
+
+  void CFGStructurizer::handleThenBlock(Block * block, LabelIndex& endiflabel)
+  {
+    BasicBlock *pbb = block->getExit();
+    BasicBlock::iterator it = pbb->end();
+    it--;
+    Instruction *p_last_insn = pbb->getLastInstruction();
+
+    endiflabel = fn->newLabel();
+    //pbb->thisEndifLabel = endiflabel;
+
+    Instruction insn = ENDIF(endiflabel);
+    Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+    // we need to insert ENDIF before the BRA(if exists).
+    bool append_bra = false;
+    if((*it).getOpcode() == OP_BRA)
+    {
+      pbb->erase(it);
+      append_bra = true;
+    }
+    pbb->append(*p_new_insn);
+    if(append_bra)
+      pbb->append(*p_last_insn);
+  }
+
+  void CFGStructurizer::handleThenBlock2(Block *block, Block *elseblock, LabelIndex elseBBLabel)
+  {
+    BasicBlock *pbb = block->getExit();
+    BasicBlock::iterator it = pbb->end();
+    it--;
+    if((*it).getOpcode() == OP_BRA)
+      pbb->erase(it);
+
+    if(block->getExit()->getNextBlock() == elseblock->getEntry())
+      return;
+
+    // Add an unconditional jump to 'else' block
+    Instruction insn = BRA(elseBBLabel);
+    Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+    pbb->append(*p_new_insn);
+  }
+
+  void CFGStructurizer::handleElseBlock(Block * block, LabelIndex& elselabel, LabelIndex& endiflabel)
+  {
+    // to insert ENDIF properly
+    handleThenBlock(block, endiflabel);
+
+    BasicBlock *pbb = block->getEntry();
+    BasicBlock::iterator it = pbb->begin();
+    it++;
+
+    elselabel = fn->newLabel();
+    pbb->thisElseLabel = elselabel;
+
+    // insert ELSE properly
+    Instruction insn = ELSE(endiflabel);
+    Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+
+    pbb->insertAt(it, *p_new_insn);
+  }
+
+  void CFGStructurizer::handleStructuredBlocks()
+  {
+    BlockVector::iterator it;
+    BlockVector::iterator end = blocks.end();
+    BlockVector::iterator begin = blocks.begin();
+    it = end;
+    it--;
+    BlockVector::reverse_iterator rit = blocks.rbegin();
+    /* structured bbs only need if and endif insn to handle the execution
+     * in structure entry and exit BasicBlock, so we process the blocks backward, since
+     * the block at the back of blocks is always a 'not smaller' structure than
+     * the ones before it. we mark the blocks which are sub-blocks of the block
+     * we are dealing with, in order to ensure we are always handling the 'biggest'
+     * structures */
+    while(rit != blocks.rend())
+    {
+      if((*rit)->type() == IfThenType || (*rit)->type() == IfElseType|| (*rit)->type() == SelfLoopType)
+      {
+        if(false == (*rit)->mark && (*rit)->canBeHandled)
+        {
+          markStructuredBlocks(*rit, true);
+          /* only the entry bb of this structure needs 'if' at backend and
+           * only the exit bb of this structure needs 'endif' at backend
+           * see comment about needEndif and needIf at function.hpp for detail. */
+          markNeedEndif(*rit, false);
+          markNeedIf(*rit, false);
+          BasicBlock* entry = (*rit)->getEntry();
+          BasicBlock* eexit = (*rit)->getExit();
+          entry->needIf = true;
+          eexit->needEndif = true;
+          entry->endifLabel = fn->newLabel();
+          eexit->endifLabel = entry->endifLabel;
+          eexit->isStructureExit = true;
+          eexit->matchingStructureEntry = entry;
+        }
+      }
+      rit++;
+    }
+
+    rit = blocks.rbegin();
+    gbe::vector<BasicBlock *> &bblocks = fn->getBlocks();
+    std::vector<BasicBlock *> bbs;
+    bbs.resize(bblocks.size());
+
+    /* here insert the bras to the BBs, which would
+     * simplify the reorder of basic blocks */
+    for(size_t i = 0; i < bblocks.size(); ++i)
+    {
+      bbs[i] = bblocks[i];
+      if(i != bblocks.size() -1 &&
+         (bbs[i]->getLastInstruction()->getOpcode() != OP_BRA ||
+         (bbs[i]->isStructureExit && bbs[i]->isLoopExit)))
+      {
+        Instruction insn = BRA(bbs[i]->getNextBlock()->getLabelIndex());
+        Instruction* pNewInsn = bbs[i]->getParent().newInstruction(insn);
+        bbs[i]->append(*pNewInsn);
+        if (bbs[i]->isStructureExit && bbs[i]->isLoopExit)
+          bbs[i]->hasExtraBra = true;
+      }
+    }
+
+    /* now, reorder the basic blocks to reduce the unconditional jump we inserted whose
+     * targets are the 'else' blocks. the algorithm is quite simple, just put the unstructured
+     * BBs(maybe belong to another structure, but not this one) in front of the entry BB of
+     * this structure in front of all the others and put the other unstructured BBs at the
+     * back of the others. the sequence of structured blocks is obtained through getStructureSequence().
+     */
+    while(rit != blocks.rend())
+    {
+      if(((*rit)->type() == IfThenType || (*rit)->type() == IfElseType || (*rit)->type() == SerialBlockType ||(*rit)->type() == SelfLoopType) &&
+          (*rit)->canBeHandled && (*rit)->mark == true)
+      {
+        markStructuredBlocks(*rit, false);
+        std::set<int> ns = getStructureBasicBlocksIndex(*rit, bbs);
+        BasicBlock *entry = (*rit)->getEntry();
+
+        int entryIndex = *(ns.begin());
+        for(size_t i=0; i<bbs.size(); ++i)
+        {
+          if(bbs[i] == entry)
+            entryIndex = i;
+        }
+
+        std::set<int>::iterator iter = ns.begin();
+        int index = *iter;
+
+        std::vector<BasicBlock *> unstruSeqHead;
+        std::vector<BasicBlock *> unstruSeqTail;
+
+        iter = ns.begin();
+        while(iter != ns.end())
+        {
+          if(index != *iter)
+          {
+            if(index < entryIndex)
+              unstruSeqHead.push_back(bbs[index]);
+            else
+              unstruSeqTail.push_back(bbs[index]);
+            index++;
+          }
+          else
+          {
+            index++;
+            iter++;
+          }
+        }
+
+        std::vector<BasicBlock *> struSeq;
+        getStructureSequence(*rit, struSeq);
+
+        int firstindex = *(ns.begin());
+        for(size_t i = 0; i < unstruSeqHead.size(); ++i)
+          bbs[firstindex++] = unstruSeqHead[i];
+        for(size_t i = 0; i < struSeq.size(); ++i)
+          bbs[firstindex++] = struSeq[i];
+        for(size_t i = 0; i < unstruSeqTail.size(); ++i)
+          bbs[firstindex++] = unstruSeqTail[i];
+      }
+      rit++;
+    }
+
+   /* now, erase the BRAs inserted before whose targets are their fallthrough blocks */
+    for(size_t i=0; i<bbs.size(); ++i)
+    {
+      if(bbs[i]->getLastInstruction()->getOpcode() == OP_BRA &&
+         !((BranchInstruction*)(bbs[i]->getLastInstruction()))->isPredicated())
+      {
+        if(((BranchInstruction *)bbs[i]->getLastInstruction())->getLabelIndex() == bbs[i+1]->getLabelIndex())
+        {
+          BasicBlock::iterator it= bbs[i]->end();
+          it--;
+
+          bbs[i]->erase(it);
+
+          if (bbs[i]->hasExtraBra)
+            bbs[i]->hasExtraBra = false;
+        }
+      }
+    }
+    for(size_t i=0; i<bbs.size(); ++i)
+      bblocks[i] = bbs[i];
+
+    fn->sortLabels();
+    fn->computeCFG();
+
+    it = begin;
+    while(it != end)
+    {
+      if((*it)->canBeHandled)
+      {
+        switch((*it)->type())
+        {
+          case IfThenType:
+            {
+              BlockList::iterator child_iter = (*it)->children.end();
+              LabelIndex endiflabel;
+              child_iter--;
+              handleThenBlock(*child_iter, endiflabel); // this call would pass out the proper endiflabel for handleIfBlock's use.
+              child_iter--;
+              handleIfBlock(*child_iter, endiflabel, endiflabel);
+            }
+            break;
+
+          case IfElseType:
+            {
+              BlockList::iterator child_iter = (*it)->children.end();
+              LabelIndex endiflabel;
+              LabelIndex elselabel;
+              BlockList::iterator else_block;
+              child_iter--;
+              else_block= child_iter;
+              handleElseBlock(*child_iter, elselabel, endiflabel);
+              LabelIndex elseBBLabel = (*child_iter)->getEntry()->getLabelIndex();
+              child_iter--;
+              handleThenBlock2(*child_iter, *else_block, elseBBLabel);
+              child_iter--;
+              handleIfBlock(*child_iter, endiflabel, elselabel);
+            }
+            break;
+
+          case SelfLoopType:
+            {
+              LabelIndex whilelabel;
+              handleSelfLoopBlock(*it, whilelabel);
+            }
+            break;
+
+          default:
+            break;
+        }
+      }
+
+      it++;
+    }
+  }
+
+  void CFGStructurizer::getStructureSequence(Block *block, std::vector<BasicBlock*> &seq)
+  {
+    /* in the control tree, for if-then, if block is before then block; for if-else, the
+     * stored sequence is if-then-else, for block structure, the stored sequence is just
+     * their executed sequence. so we could just get the structure sequence by recursively
+     * calling getStructureSequence on all the elements in children one by one.
+     */
+    if(block->type() == SingleBlockType)
+    {
+      seq.push_back(((SimpleBlock*)block)->getBasicBlock());
+      return;
+    }
+
+    BlockList::iterator iter = block->children.begin();
+    while(iter != block->children.end())
+    {
+      getStructureSequence(*iter, seq);
+      iter++;
+    }
+  }
+
+  std::set<int> CFGStructurizer::getStructureBasicBlocksIndex(Block* block, std::vector<BasicBlock *> &bbs)
+  {
+    std::set<int> result;
+    if(block->type() == SingleBlockType)
+    {
+      for(size_t i=0; i<bbs.size(); i++)
+      {
+        if(bbs[i] == ((SimpleBlock*)block)->getBasicBlock())
+        {
+          result.insert(i);
+          break;
+        }
+      }
+      return result;
+    }
+    BlockList::iterator iter = (block->children).begin();
+    BlockList::iterator end = (block->children).end();
+    while(iter != end)
+    {
+      std::set<int> ret = getStructureBasicBlocksIndex(*iter, bbs);
+      result.insert(ret.begin(), ret.end());
+      iter++;
+    }
+    return result;
+  }
+
+  std::set<BasicBlock *> CFGStructurizer::getStructureBasicBlocks(Block *block)
+  {
+    std::set<BasicBlock *> result;
+    if(block->type() == SingleBlockType)
+    {
+      result.insert(((SimpleBlock*)block)->getBasicBlock());
+      return result;
+    }
+    BlockList::iterator iter = (block->children).begin();
+    BlockList::iterator end = (block->children).end();
+    while(iter != end)
+    {
+      std::set<BasicBlock *> ret = getStructureBasicBlocks(*iter);
+      result.insert(ret.begin(), ret.end());
+      iter++;
+    }
+    return result;
+  }
+
+  Block* CFGStructurizer::insertBlock(Block *p_block)
+  {
+    blocks.push_back(p_block);
+    return p_block;
+  }
+
+  bool CFGStructurizer::checkForBarrier(const BasicBlock* bb)
+  {
+    BasicBlock::const_iterator iter = bb->begin();
+    BasicBlock::const_iterator iter_end = bb->end();
+    while(iter != iter_end)
+    {
+      if((*iter).getOpcode() == OP_SYNC)
+        return true;
+      iter++;
+    }
+
+    return false;
+  }
+
+  void CFGStructurizer::getLiveIn(BasicBlock& bb, std::set<Register>& livein)
+  {
+    BasicBlock::iterator iter = bb.begin();
+    std::set<Register> varKill;
+    while(iter != bb.end())
+    {
+      Instruction& insn = *iter;
+      const uint32_t srcNum = insn.getSrcNum();
+      const uint32_t dstNum = insn.getDstNum();
+      for(uint32_t srcID = 0; srcID < srcNum; ++srcID)
+      {
+        const Register reg = insn.getSrc(srcID);
+        if(varKill.find(reg) == varKill.end())
+          livein.insert(reg);
+      }
+      for(uint32_t dstID = 0; dstID < dstNum; ++dstID)
+      {
+        const Register reg = insn.getDst(dstID);
+        varKill.insert(reg);
+      }
+
+      iter++;
+    }
+  }
+
+  void CFGStructurizer::calculateNecessaryLiveout()
+  {
+    BlockVector::iterator iter = blocks.begin();
+
+    while(iter != blocks.end())
+    {
+      switch((*iter)->type())
+      {
+        case IfElseType:
+        {
+          std::set<BasicBlock *> bbs;
+          BlockList::iterator thenIter = (*iter)->children.begin();
+          thenIter++;
+          bbs = getStructureBasicBlocks(*thenIter);
+
+          Block *elseblock = *((*iter)->children.rbegin());
+          std::set<Register> livein;
+          getLiveIn(*(elseblock->getEntry()), livein);
+
+          std::set<BasicBlock *>::iterator bbiter = bbs.begin();
+          while(bbiter != bbs.end())
+          {
+            (*bbiter)->liveout.insert(livein.begin(), livein.end());
+            bbiter++;
+          }
+        }
+
+        default:
+          break;
+      }
+      iter++;
+    }
+  }
+
+  void CFGStructurizer::initializeBlocks()
+  {
+    BasicBlock& tmp_bb = fn->getTopBlock();
+    BasicBlock* p_tmp_bb = &tmp_bb;
+    Block* p = NULL;
+
+    if(NULL != p_tmp_bb)
+    {
+      Block *p_tmp_block = new SimpleBlock(p_tmp_bb);
+      p_tmp_block->label = p_tmp_bb->getLabelIndex();
+
+      if(checkForBarrier(p_tmp_bb))
+        p_tmp_block->hasBarrier() = true;
+
+      blocks.push_back(p_tmp_block);
+      bbmap[p_tmp_bb] = p_tmp_block;
+      bTobbmap[p_tmp_block] = p_tmp_bb;
+      p_tmp_bb = p_tmp_bb->getNextBlock();
+      p = p_tmp_block;
+    }
+
+    while(p_tmp_bb != NULL)
+    {
+      Block *p_tmp_block = new SimpleBlock(p_tmp_bb);
+      p_tmp_block->label = p_tmp_bb->getLabelIndex();
+
+      if(checkForBarrier(p_tmp_bb))
+        p_tmp_block->hasBarrier() = true;
+
+      p->fallthrough() = p_tmp_block;
+      p = p_tmp_block;
+      blocks.push_back(p_tmp_block);
+      bbmap[p_tmp_bb] = p_tmp_block;
+      bTobbmap[p_tmp_block] = p_tmp_bb;
+      p_tmp_bb = p_tmp_bb->getNextBlock();
+    }
+
+    if(NULL != p)
+      p->fallthrough() = NULL;
+
+    p_tmp_bb = &tmp_bb;
+
+    this->blocks_entry = bbmap[p_tmp_bb];
+
+    while(p_tmp_bb != NULL)
+    {
+      BlockSet::const_iterator iter_begin = p_tmp_bb->getPredecessorSet().begin();
+      BlockSet::const_iterator iter_end = p_tmp_bb->getPredecessorSet().end();
+      while(iter_begin != iter_end)
+      {
+        bbmap[p_tmp_bb]->predecessors().insert(bbmap[*iter_begin]);
+        iter_begin++;
+      }
+
+      iter_begin = p_tmp_bb->getSuccessorSet().begin();
+      iter_end = p_tmp_bb->getSuccessorSet().end();
+      while(iter_begin != iter_end)
+      {
+        bbmap[p_tmp_bb]->successors().insert(bbmap[*iter_begin]);
+        iter_begin++;
+      }
+
+      p_tmp_bb = p_tmp_bb->getNextBlock();
+    }
+
+    //copy the sequenced blocks to orderedBlks.
+    loops = fn->getLoops();
+    fn->foreachBlock([&](ir::BasicBlock &bb){
+        orderedBlks.push_back(bbmap[&bb]);
+        });
+  }
+
+  void CFGStructurizer::outBlockTypes(BlockType type)
+  {
+    if(type == SerialBlockType)
+        std::cout << " T:["<< "Serial" <<"]"<< std::endl;
+    else if(type == IfThenType)
+        std::cout << " T:["<< "IfThen" <<"]"<< std::endl;
+    else if(type == IfElseType)
+        std::cout << " T:["<< "IfElse" <<"]"<< std::endl;
+    else if(type == SelfLoopType)
+        std::cout << " T:["<< "SelfLoop" <<"]"<< std::endl;
+    else
+        std::cout << " T:["<< "BasicBlock" <<"]"<< std::endl;
+  }
+
+  /* dump the block info for debug use, only SingleBlockType has label.*/
+  void CFGStructurizer::printOrderedBlocks()
+  {
+    size_t i = 0;
+    std::cout << "\n ordered Blocks ->  BasicBlocks -> Current BB: "<< *orderIter << std::endl;
+    for (auto iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
+      std::cout << "B:" << *iterBlk << " BB:" << bTobbmap[*iterBlk];
+      if((*iterBlk)->type() == SingleBlockType)
+        std::cout << " L:"<< bTobbmap[*iterBlk]->getLabelIndex() << std::endl;
+      else
+        outBlockTypes((*iterBlk)->type());
+    }
+  }
+
+  /* transfer the predecessors and successors from the matched blocks to new mergedBB.
+   * if the blocks contain a back edge, should add a successor to itself to make a self loop.*/
+  void CFGStructurizer::cfgUpdate(Block* mergedBB,  const BlockSets& blockBBs)
+  {
+    for(auto iter= blockBBs.begin(); iter != blockBBs.end(); iter++)
+    {
+      for(auto p = (*iter)->pred_begin(); p != (*iter)->pred_end(); p++)
+      {
+        if(blockBBs.find(*p) != blockBBs.end())
+          continue;
+
+        (*p)->successors().erase(*iter);
+        (*p)->successors().insert(mergedBB);
+        mergedBB->predecessors().insert(*p);
+
+        if((*p)->fallthrough() == *iter)
+          (*p)->fallthrough() = mergedBB;
+      }
+      for(auto s = (*iter)->succ_begin(); s != (*iter)->succ_end(); s++)
+      {
+        if(blockBBs.find(*s) != blockBBs.end())
+          continue;
+
+        (*s)->predecessors().erase(*iter);
+        (*s)->predecessors().insert(mergedBB);
+        mergedBB->successors().insert(*s);
+
+        if((*iter)->fallthrough() == *s)
+          mergedBB->fallthrough() = *s;
+      }
+    }
+
+    if(mergedBB->type() != SelfLoopType) {
+      for(auto iter= blockBBs.begin(); iter != blockBBs.end(); iter++)
+      {
+        for(auto s = (*iter)->succ_begin(); s != (*iter)->succ_end(); s++)
+        {
+          if(blockBBs.find(*s) == blockBBs.end())
+            continue;
+
+          LabelIndex l_iter = (*iter)->getEntry()->getLabelIndex();
+          LabelIndex l_succ = (*s)->getEntry()->getLabelIndex();
+          if(l_iter > l_succ)
+          {
+            mergedBB->predecessors().insert(mergedBB);
+            mergedBB->successors().insert(mergedBB);
+            return;
+          }
+        }
+      }
+    }
+  }
+
+  /* delete the matched blocks and replace it with mergedBB to reduce the CFG.
+   * the mergedBB should be inserted to the entry block position. */
+  void CFGStructurizer::replace(Block* mergedBB,  BlockSets blockBBs)
+  {
+    lIterator iter, iterRep;
+    bool flag = false;
+    for(iter = orderedBlks.begin(); iter!= orderedBlks.end() && !blockBBs.empty();)
+    {
+      if(!blockBBs.erase(*iter))
+      {
+        iter++;
+        continue;
+      }
+      if(flag == false)
+      {
+        iter = orderedBlks.erase(iter);
+        iterRep = iter;
+        orderIter = orderedBlks.insert(iterRep, mergedBB);
+        flag = true;
+      }else
+      {
+        iter = orderedBlks.erase(iter);
+      }
+    }
+  }
+
+  Block* CFGStructurizer::mergeSerialBlock(BlockList& serialBBs)
+  {
+      Block* p = new SerialBlock(serialBBs);
+      BlockList::iterator iter = serialBBs.begin();
+      while(iter != serialBBs.end())
+      {
+        if((*iter)->canBeHandled == false)
+        {
+          p->canBeHandled = false;
+          break;
+        }
+        iter++;
+      }
+      return insertBlock(p);
+  }
+
+  BVAR(OCL_OUTPUT_STRUCTURIZE, false);
+
+  /* if the block has only one successor, and its successor has only one predecessor
+   * and one successor. the block and the childBlk could be merged to a serial Block.*/
+  int CFGStructurizer::serialPatternMatch(Block *block) {
+    if (block->succ_size() != 1)
+      return 0;
+
+    if(block->hasBarrier())
+      return 0;
+
+    Block *childBlk = *block->succ_begin();
+    //FIXME, As our barrier implementation doesn't support structured barrier
+    //operation, exclude all the barrier blocks from serialPatternMatch.
+    if (childBlk->pred_size() != 1 || childBlk->hasBarrier() )
+      return 0;
+
+    BlockList serialBBs;//childBBs
+    BlockSets serialSets;
+    serialBBs.push_back(block);
+    serialBBs.push_back(childBlk);
+    serialSets.insert(block);
+    serialSets.insert(childBlk);
+
+    Block* mergedBB = mergeSerialBlock(serialBBs);
+    if(mergedBB == NULL)
+      return 0;
+
+    cfgUpdate(mergedBB, serialSets);
+    replace(mergedBB, serialSets);
+
+    if(OCL_OUTPUT_STRUCTURIZE)
+      printOrderedBlocks();
+    ++numSerialPatternMatch;
+    if(serialSets.find(blocks_entry) != serialSets.end())
+      blocks_entry = mergedBB;
+    return 1;
+  }
+
+  Block* CFGStructurizer::mergeLoopBlock(BlockList& loopSets)
+  {
+    if(loopSets.size() == 1)
+    {
+      Block* p = new SelfLoopBlock(*loopSets.begin());
+      p->canBeHandled = true;
+      (*loopSets.begin())->getExit()->isLoopExit = true;
+      return insertBlock(p);
+    }
+    return NULL;
+  }
+
+  /*match the selfLoop pattern with llvm info or check whether the compacted node has a back edge to itself.*/
+  int CFGStructurizer::loopPatternMatch(Block *block) {
+    Block* loop_header = NULL;
+    Block* b = block;
+    BlockSets loopSets;
+    BlockList loopBBs;
+
+    //if b is a basic block, query the llvm loop info to find the loop whose loop header is b;
+    if(block->type() == SingleBlockType){
+      for (auto l : loops) {
+        BasicBlock &a = fn->getBlock(l->bbs[0]);
+        loop_header = bbmap.find(&a)->second;
+
+        if(loop_header == b){
+          for (auto bb : l->bbs) {
+            BasicBlock &tmp = fn->getBlock(bb);
+            Block* block_ = bbmap.find(&tmp)->second;
+            loopBBs.push_front(block_);
+            loopSets.insert(block_);
+          }
+          break;
+        }
+      }
+    }else{
+      //b is compacted node, it would have a successor pointed to itself for self loop.
+      if(block->successors().find(b) != block->successors().end())
+      {
+        loopBBs.push_front(b);
+        loopSets.insert(b);
+      }
+    }
+
+    if(loopBBs.empty())
+      return 0;
+
+    if(loopSets.size() == 1) {
+    //self loop header should have a successor to itself, check this before merged.
+      Block* lblock = *loopSets.begin();
+      if(lblock->successors().find(lblock) == lblock->successors().end())
+        return 0;
+    }
+
+    Block* mergedBB = mergeLoopBlock(loopBBs);
+    if(mergedBB == NULL)
+      return 0;
+
+    cfgUpdate(mergedBB, loopSets);
+    replace(mergedBB, loopSets);
+
+    if(OCL_OUTPUT_STRUCTURIZE)
+      printOrderedBlocks();
+    ++numLoopPatternMatch;
+    if(loopSets.find(blocks_entry) != loopSets.end())
+      blocks_entry = mergedBB;
+    return 1;
+  }
+
+  /* match the if pattern(E: entry block; T: True block; F: False block; C: Converged block):
+  *  for if-else pattern:
+  **   E
+  **  / \
+  ** T   F
+  **  \ /
+  **   C
+  ** E has two edges T and F, T and F both have only one predecessor and one successor independently,
+  ** the successor of T and F must be the same. E's fallthrough need be treated as True edge.
+  *
+  *  for if-then pattern E-T-C:
+  **   E
+  **  / |
+  ** T  |
+  **  \ |
+  **   C
+  ** E has two edges T and C,  T should have only one predecessor and one successor, the successor
+  ** of T must be C. if E's fallthrough is C, need inverse the predicate.
+  *
+  *  for if-then pattern E-F-C:
+  **   E
+  **  | \
+  **  |  F
+  **  | /
+  **   C
+  ** E has two edges C and F,  F should have only one predecessor and one successor, the successor
+  ** of F must be C. if E's fallthrough is C, need inverse the predicate.
+  */
+  int CFGStructurizer::ifPatternMatch(Block *block)
+  {
+    //two edges
+    if (block->succ_size() != 2)
+      return 0;
+
+    if(block->hasBarrier())
+      return 0;
+
+    int NumMatch = 0;
+    Block *TrueBB = *block->succ_begin();
+    Block *FalseBB = *(++block->succ_begin());
+    Block *mergedBB = NULL;
+    BlockSets ifSets;
+
+    assert (!TrueBB->succ_empty() || !FalseBB->succ_empty());
+    if (TrueBB->succ_size() == 1 && FalseBB->succ_size() == 1
+        && TrueBB->pred_size() == 1 && FalseBB->pred_size() == 1
+        && *TrueBB->succ_begin() == *FalseBB->succ_begin()
+        && !TrueBB->hasBarrier() && !FalseBB->hasBarrier() ) {
+      // if-else pattern
+      ifSets.insert(block);
+      if(block->fallthrough() == TrueBB) {
+        ifSets.insert(TrueBB);
+        ifSets.insert(FalseBB);
+        mergedBB = new IfElseBlock(block, TrueBB, FalseBB);
+      }else if(block->fallthrough() == FalseBB) {
+        ifSets.insert(FalseBB);
+        ifSets.insert(TrueBB);
+        mergedBB = new IfElseBlock(block, FalseBB, TrueBB);
+      }else{
+        GBE_ASSERT(0);
+      }
+
+      if(block->canBeHandled == false || TrueBB->canBeHandled == false || FalseBB->canBeHandled == false)
+        block->canBeHandled = false;
+
+      insertBlock(mergedBB);
+    } else if (TrueBB->succ_size() == 1 && TrueBB->pred_size() == 1 &&
+        *TrueBB->succ_begin() == FalseBB && !TrueBB->hasBarrier() ) {
+      // if-then pattern, false is empty
+      ifSets.insert(block);
+      ifSets.insert(TrueBB);
+      mergedBB = new IfThenBlock(block, TrueBB);
+      if(block->fallthrough() == FalseBB)
+        block->inversePredicate = false;
+
+      if(block->canBeHandled == false || TrueBB->canBeHandled == false)
+        block->canBeHandled = false;
+
+      insertBlock(mergedBB);
+    } else if (FalseBB->succ_size() == 1 && FalseBB->pred_size() == 1 &&
+        *FalseBB->succ_begin() == TrueBB && !FalseBB->hasBarrier() ) {
+      // if-then pattern, true is empty
+      ifSets.insert(block);
+      ifSets.insert(FalseBB);
+      mergedBB = new IfThenBlock(block, FalseBB);
+      if(block->fallthrough() == TrueBB)
+        block->inversePredicate = false;
+
+      if(block->canBeHandled == false || FalseBB->canBeHandled == false)
+        block->canBeHandled = false;
+
+      insertBlock(mergedBB);
+    }
+    else{
+      return 0;
+    }
+
+    if(ifSets.empty())
+      return 0;
+
+    if(mergedBB == NULL)
+      return 0;
+
+    cfgUpdate(mergedBB, ifSets);
+    replace(mergedBB, ifSets);
+
+    if(OCL_OUTPUT_STRUCTURIZE)
+      printOrderedBlocks();
+    ++numIfPatternMatch;
+    if(ifSets.find(blocks_entry) != ifSets.end())
+      blocks_entry = mergedBB;
+    return NumMatch + 1;
+  }
+
+  /* match loop pattern, serial pattern, if pattern accordingly; update and replace blocks in the CFG internally once matched. */
+  int CFGStructurizer::patternMatch(Block *block) {
+    int NumMatch = 0;
+    NumMatch += loopPatternMatch(block);
+    NumMatch += serialPatternMatch(block);
+    NumMatch += ifPatternMatch(block);
+    return NumMatch;
+  }
+
+  void CFGStructurizer::blockPatternMatch()
+  {
+    int increased = 0;
+
+    do
+    {
+      increased = numSerialPatternMatch + numLoopPatternMatch + numIfPatternMatch;
+
+      orderIter = orderedBlks.begin();
+
+      while(orderedBlks.size() > 1 && orderIter != orderedBlks.end())
+      {
+        if(OCL_OUTPUT_STRUCTURIZE)
+          printOrderedBlocks();
+        patternMatch(*orderIter);
+        orderIter++;
+      }
+      if(OCL_OUTPUT_STRUCTURIZE)
+        printOrderedBlocks();
+
+      if(increased == numSerialPatternMatch + numLoopPatternMatch + numIfPatternMatch)
+        break;
+
+    } while(orderedBlks.size()>1);
+    if(OCL_OUTPUT_STRUCTURIZE)
+      std::cout << "Serial:" << numSerialPatternMatch << "Loop:" << numLoopPatternMatch << "If:" << numIfPatternMatch << std::endl;
+  }
+
+  void CFGStructurizer::StructurizeBlocks()
+  {
+    initializeBlocks();
+    blockPatternMatch();
+    handleStructuredBlocks();
+    calculateNecessaryLiveout();
+  }
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/structurizer.hpp b/backend/src/ir/structurizer.hpp
new file mode 100644
index 0000000..8207644
--- /dev/null
+++ b/backend/src/ir/structurizer.hpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __STRUCTURIZER_HPP__
+#define __STRUCTURIZER_HPP__
+#include "llvm/ADT/SmallVector.h"
+#include "ir/unit.hpp"
+#include "ir/function.hpp"
+#include "ir/instruction.hpp"
+
+#include <iostream>
+#include <set>
+#include <map>
+#include <vector>
+#include <list>
+#include <algorithm>
+namespace gbe {
+namespace ir {
+  using namespace llvm;
+
+  enum BlockType
+  {
+    SingleBlockType = 0,
+    SerialBlockType,
+    IfThenType,
+    IfElseType,
+    SelfLoopType
+  };
+
+  /* Block*/
+  class Block;
+
+  typedef std::set<Block *> BlockSets;
+  typedef std::list<Block *> BlockList;
+  typedef std::vector<Block *> BlockVector;
+  typedef std::set<Block *>::iterator sIterator;
+  typedef std::list<Block *>::iterator lIterator;
+
+  class Block
+  {
+  public:
+    Block(BlockType type, const BlockList& children): has_barrier(false), mark(false), canBeHandled(true), inversePredicate(true)
+    {
+      this->btype = type;
+      this->children = children;
+    }
+    virtual ~Block() {}
+    Block*& fallthrough() { return fall_through; }
+    BlockSets& successors() { return successor; }
+    size_t succ_size() { return successor.size(); }
+    sIterator succ_begin() { return successor.begin(); }
+    sIterator succ_end() { return successor.end(); }
+    bool succ_empty() { return successor.empty(); }
+    BlockSets& predecessors() { return predecessor; }
+    size_t pred_size() { return predecessor.size(); }
+    sIterator pred_begin() { return predecessor.begin(); }
+    sIterator pred_end() { return predecessor.end(); }
+    bool& hasBarrier() { return has_barrier; }
+    BlockType type() { return btype; }
+    virtual BasicBlock* getEntry()
+    {
+      return (*(children.begin()))->getEntry();
+    }
+    virtual BasicBlock* getExit()
+    {
+      return (*(children.rbegin()))->getExit();
+    }
+
+  public:
+    BlockType btype;
+    Block* fall_through;
+    BlockSets predecessor;
+    BlockSets successor;
+    BlockList children;
+    bool has_barrier;
+    bool mark;
+    bool canBeHandled;
+    //label is for debug
+    int label;
+    /* inversePredicate should be false under two circumstances,
+     * when the fallthrough is the same as the succs:
+     * (1) n->succs == m && block->fallthrough == m
+     * block
+     * | \
+     * |  \
+     * m<--n
+     * (2) m->succs == n && block->fallthrough == n
+     * block
+     * | \
+     * |  \
+     * m-->n
+     * */
+    bool inversePredicate;
+  };
+
+  /* represents basic block */
+  class SimpleBlock: public Block
+  {
+  public:
+    SimpleBlock(BasicBlock *p_bb) : Block(SingleBlockType, BlockList()) { this->p_bb = p_bb; }
+    virtual ~SimpleBlock() {}
+    BasicBlock* getBasicBlock() { return p_bb; }
+    virtual BasicBlock* getEntry() { return p_bb; }
+    virtual BasicBlock* getExit() { return p_bb; }
+    virtual BasicBlock* getFirstBB() { return p_bb; }
+  private:
+    BasicBlock *p_bb;
+  };
+
+  /* a serial of Blocks*/
+  class SerialBlock : public Block
+  {
+  public:
+    SerialBlock(BlockList& children) : Block(SerialBlockType, children) {}
+    virtual ~SerialBlock(){}
+  };
+
+  /* If-Then Block*/
+  class IfThenBlock : public Block
+  {
+  public:
+    IfThenBlock(Block* pred, Block* trueBlock) : Block(IfThenType, InitChildren(pred, trueBlock)) {}
+    virtual ~IfThenBlock() {}
+
+  private:
+    const BlockList InitChildren(Block* pred, Block* trueBlock)
+    {
+      BlockList children;
+      children.push_back(pred);
+      children.push_back(trueBlock);
+      return children;
+    }
+  };
+
+  /* If-Else Block*/
+  class IfElseBlock: public Block
+  {
+  public:
+    IfElseBlock(Block* pred, Block* trueBlock, Block* falseBlock) : Block(IfElseType, InitChildren(pred, trueBlock, falseBlock)) {}
+    virtual ~IfElseBlock() {}
+
+  private:
+    const BlockList InitChildren(Block* pred, Block* trueBlock, Block* falseBlock)
+    {
+      BlockList children;
+      children.push_back(pred);
+      children.push_back(trueBlock);
+      children.push_back(falseBlock);
+      return children;
+    }
+  };
+
+  /* Self loop Block*/
+  class SelfLoopBlock: public Block
+  {
+  public:
+    SelfLoopBlock(Block* block) : Block(SelfLoopType, InitChildren(block)) {}
+    virtual ~SelfLoopBlock() {}
+    virtual BasicBlock* getEntry()
+    {
+      return (*(children.begin()))->getEntry();
+    }
+    virtual BasicBlock* getExit()
+    {
+      return (*(children.begin()))->getExit();
+    }
+
+  private:
+    const BlockList InitChildren(Block * block)
+    {
+      BlockList children;
+      children.push_back(block);
+      return children;
+    }
+  };
+
+  class CFGStructurizer{
+    public:
+      CFGStructurizer(Function* fn) { this->fn = fn; numSerialPatternMatch = 0; numLoopPatternMatch = 0; numIfPatternMatch = 0;}
+      ~CFGStructurizer();
+
+      void StructurizeBlocks();
+
+    private:
+      int  numSerialPatternMatch;
+      int  numLoopPatternMatch;
+      int  numIfPatternMatch;
+
+      void outBlockTypes(BlockType type);
+      void printOrderedBlocks();
+      void blockPatternMatch();
+      int  serialPatternMatch(Block *block);
+      Block* mergeSerialBlock(BlockList& serialBB);
+      void cfgUpdate(Block* mergedBB,  const BlockSets& blockBBs);
+      void replace(Block* mergedBB,  BlockSets serialSets);
+      int  loopPatternMatch(Block *block);
+      Block* mergeLoopBlock(BlockList& loopSets);
+      int  ifPatternMatch(Block *block);
+      int  patternMatch(Block *block);
+
+    private:
+      void handleSelfLoopBlock(Block *loopblock, LabelIndex& whileLabel);
+      void markNeedIf(Block *block, bool status);
+      void markNeedEndif(Block *block, bool status);
+      void markStructuredBlocks(Block *block, bool status);
+      void handleIfBlock(Block *block, LabelIndex& matchingEndifLabel, LabelIndex& matchingElseLabel);
+      void handleThenBlock(Block * block, LabelIndex& endiflabel);
+      void handleThenBlock2(Block *block, Block *elseblock, LabelIndex elseBBLabel);
+      void handleElseBlock(Block * block, LabelIndex& elselabel, LabelIndex& endiflabel);
+      void handleStructuredBlocks();
+      void getStructureSequence(Block *block, std::vector<BasicBlock*> &seq);
+      std::set<int> getStructureBasicBlocksIndex(Block* block, std::vector<BasicBlock *> &bbs);
+      std::set<BasicBlock *> getStructureBasicBlocks(Block *block);
+      Block* insertBlock(Block *p_block);
+      bool checkForBarrier(const BasicBlock* bb);
+      void getLiveIn(BasicBlock& bb, std::set<Register>& livein);
+      void initializeBlocks();
+      void calculateNecessaryLiveout();
+
+    private:
+      Function *fn;
+      std::map<BasicBlock *, Block *> bbmap;
+      std::map<Block *, BasicBlock *> bTobbmap;
+      BlockVector blocks;
+      Block* blocks_entry;
+      gbe::vector<Loop *> loops;
+      BlockList orderedBlks;
+      BlockList::iterator orderIter;
+  };
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 24dda43..0cd1eef 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -90,7 +90,7 @@ MACRO(GENERATE_SOURCE_PY _mod)
 	)
 ENDMACRO(GENERATE_SOURCE_PY)
 
-SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math)
+SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math ocl_simd)
 FOREACH(M ${OCL_PY_GENERATED_MODULES})
     GENERATE_HEADER_PY(${M})
     GENERATE_SOURCE_PY(${M})
@@ -181,7 +181,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
 	)
 ENDMACRO(ADD_LL_TO_BC_TARGET)
 
-SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset)
+SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset ocl_clz)
 FOREACH(f ${OCL_LL_MODULES})
     COPY_THE_LL(${f})
     ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index e886670..a4af4aa 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -36,6 +36,7 @@
 #include "ocl_sync.h"
 #include "ocl_vload.h"
 #include "ocl_workitem.h"
+#include "ocl_simd.h"
 #pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
+#pragma OPENCL EXTENSION cl_khr_fp16 : disable
 #endif
diff --git a/backend/src/libocl/include/ocl_async.h b/backend/src/libocl/include/ocl_async.h
index dd89942..9d5cc06 100644
--- a/backend/src/libocl/include/ocl_async.h
+++ b/backend/src/libocl/include/ocl_async.h
@@ -45,7 +45,7 @@ DEF(double)
 #undef DEFN
 #undef DEF
 
-void wait_group_events (int num_events, event_t *event_list);
+OVERLOADABLE void wait_group_events (int num_events, event_t *event_list);
 
 #define DEFN(TYPE) \
 OVERLOADABLE void prefetch(const global TYPE *p, size_t num);
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index aa3f504..359025b 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -128,14 +128,6 @@ DEF(ulong)
 #undef DEC16
 #undef DEC16X
 
-
-/* Temp to add the SIMD functions here. */
-/////////////////////////////////////////////////////////////////////////////
-// SIMD level function
-/////////////////////////////////////////////////////////////////////////////
-short __gen_ocl_simd_any(short);
-short __gen_ocl_simd_all(short);
-
 struct time_stamp {
   // time tick
   ulong tick;
diff --git a/backend/src/libocl/include/ocl_printf.h b/backend/src/libocl/include/ocl_printf.h
index ffeefb9..27cef27 100644
--- a/backend/src/libocl/include/ocl_printf.h
+++ b/backend/src/libocl/include/ocl_printf.h
@@ -24,9 +24,12 @@
 /* From LLVM 3.4, c string are all in constant address space */
 #if 100*__clang_major__ + __clang_minor__ < 304
 int __gen_ocl_printf_stub(const char * format, ...);
+int __gen_ocl_puts_stub(const char * format);
 #else
 int __gen_ocl_printf_stub(constant char * format, ...);
+int __gen_ocl_puts_stub(constant char * format);
 #endif
 #define printf __gen_ocl_printf_stub
+#define puts __gen_ocl_puts_stub
 
 #endif
diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h
index ed7c6e4..18090d5 100644
--- a/backend/src/libocl/include/ocl_sync.h
+++ b/backend/src/libocl/include/ocl_sync.h
@@ -27,7 +27,7 @@
 #define CLK_GLOBAL_MEM_FENCE (1 << 1)
 
 typedef uint cl_mem_fence_flags;
-void barrier(cl_mem_fence_flags flags);
+OVERLOADABLE void barrier(cl_mem_fence_flags flags);
 void mem_fence(cl_mem_fence_flags flags);
 void read_mem_fence(cl_mem_fence_flags flags);
 void write_mem_fence(cl_mem_fence_flags flags);
diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h
index 487fe68..eb4c3b4 100644
--- a/backend/src/libocl/include/ocl_types.h
+++ b/backend/src/libocl/include/ocl_types.h
@@ -19,6 +19,7 @@
 #define __OCL_TYPES_H__
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #include "ocl_defines.h"
 
 #define NULL 0
@@ -79,6 +80,7 @@ DEF(long);
 DEF(ulong);
 DEF(float);
 DEF(double);
+DEF(half);
 #undef DEF
 
 /////////////////////////////////////////////////////////////////////////////
@@ -87,7 +89,5 @@ DEF(double);
 // FIXME:
 // This is a transitional hack to bypass the LLVM 3.3 built-in types.
 // See the Khronos SPIR specification for handling of these types.
-typedef size_t __event_t;
-#define event_t __event_t
 
 #endif /* __OCL_TYPES_H__ */
diff --git a/backend/src/libocl/include/ocl_workitem.h b/backend/src/libocl/include/ocl_workitem.h
index 7534ee8..84bb1fb 100644
--- a/backend/src/libocl/include/ocl_workitem.h
+++ b/backend/src/libocl/include/ocl_workitem.h
@@ -20,13 +20,13 @@
 
 #include "ocl_types.h"
 
-uint get_work_dim(void);
-uint get_global_size(uint dimindx);
-uint get_global_id(uint dimindx);
-uint get_local_size(uint dimindx);
-uint get_local_id(uint dimindx);
-uint get_num_groups(uint dimindx);
-uint get_group_id(uint dimindx);
-uint get_global_offset(uint dimindx);
+OVERLOADABLE uint get_work_dim(void);
+OVERLOADABLE uint get_global_size(uint dimindx);
+OVERLOADABLE uint get_global_id(uint dimindx);
+OVERLOADABLE uint get_local_size(uint dimindx);
+OVERLOADABLE uint get_local_id(uint dimindx);
+OVERLOADABLE uint get_num_groups(uint dimindx);
+OVERLOADABLE uint get_group_id(uint dimindx);
+OVERLOADABLE uint get_global_offset(uint dimindx);
 
 #endif  /* __OCL_WORKITEM_H__ */
diff --git a/backend/src/libocl/script/gen_vector.py b/backend/src/libocl/script/gen_vector.py
index ffc573a..cb562a2 100755
--- a/backend/src/libocl/script/gen_vector.py
+++ b/backend/src/libocl/script/gen_vector.py
@@ -20,13 +20,14 @@
 
 # This file is to generate inline code to lower down those builtin
 # vector functions to scalar functions.
+from __future__ import print_function
 import re
 import sys
 import os
 
 if len(sys.argv) != 4:
-    print "Invalid argument {0}".format(sys.argv)
-    print "use {0} spec_file_name output_file_name just_proto".format(sys.argv[0])
+    print("Invalid argument {0}".format(sys.argv))
+    print("use {0} spec_file_name output_file_name just_proto".format(sys.argv[0]))
     raise
 
 all_vector = 1,2,3,4,8,16
@@ -56,17 +57,18 @@ all_itype = "char","short","int","long"
 all_utype = "uchar","ushort","uint","ulong"
 all_int_type = all_itype + all_utype
 
-all_float_type = "float","double"
+all_float_type = "float","double","half"
 all_type = all_int_type + all_float_type
 
 # all vector/scalar types
 for t in all_type:
-    exec "{0}n = [\"{0}n\", gen_vector_type([\"{0}\"])]".format(t)
-    exec "s{0} = [\"{0}\", gen_vector_type([\"{0}\"], [1])]".format(t)
+    exec("{0}n = [\"{0}n\", gen_vector_type([\"{0}\"])]".format(t))
+    exec("s{0} = [\"{0}\", gen_vector_type([\"{0}\"], [1])]".format(t))
 
 # Predefined type sets according to the Open CL spec.
 math_gentype = ["math_gentype", gen_vector_type(all_float_type)]
 math_gentypef = ["math_gentypef", gen_vector_type(["float"])]
+math_gentypeh = ["math_gentypeh", gen_vector_type(["half"])]
 math_gentyped = ["math_gentyped", gen_vector_type(["double"])]
 
 half_native_math_gentype = ["half_native_math_gentype", gen_vector_type(["float"])]
@@ -79,6 +81,7 @@ fast_integer_gentype = ["fast_integer_gentype", gen_vector_type(["uint", "int"])
 
 common_gentype = ["common_gentype", gen_vector_type(all_float_type)]
 common_gentypef = ["common_gentypef", gen_vector_type(["float"])]
+common_gentypeh = ["common_gentypeh", gen_vector_type(["half"])]
 common_gentyped = ["common_gentyped", gen_vector_type(["double"])]
 
 relational_gentype = ["relational_gentype", gen_vector_type(all_type)]
@@ -90,14 +93,14 @@ misc_gentypen = ["misc_gentypen", gen_vector_type(all_type, [2, 4, 8, 16])]
 misc_ugentypem = ["misc_ugentypem", gen_vector_type(all_utype, [2, 4, 8, 16])]
 misc_ugentypen = ["misc_ugentypen", gen_vector_type(all_utype, [2, 4, 8, 16])]
 
-all_predefined_type = math_gentype, math_gentypef, math_gentyped,                \
+all_predefined_type = math_gentype, math_gentypef, math_gentyped, math_gentypeh, \
                       half_native_math_gentype, integer_gentype,integer_sgentype,\
                       integer_ugentype, charn, ucharn, shortn, ushortn, intn,    \
-                      uintn, longn, ulongn, floatn, doublen,                     \
+                      uintn, longn, ulongn, floatn, doublen, halfn, common_gentypeh, \
                       fast_integer_gentype, common_gentype, common_gentypef,     \
                       common_gentyped, relational_gentype, relational_igentype,  \
-                      relational_ugentype, schar, suchar, sshort, sint, suint,   \
-                      slong, sulong, sfloat, sdouble, misc_gentypem,              \
+                      relational_ugentype, schar, suchar, sshort, sushort, sint, \
+                      suint, slong, sulong, sfloat, shalf, sdouble, misc_gentypem,  \
                       misc_ugentypem, misc_gentypen, misc_ugentypen
 
 # type dictionary contains all the predefined type sets.
@@ -124,10 +127,12 @@ def check_type(types):
     for t in types:
         memspace, t = stripMemSpace(t)
         if not t in type_dict:
-            print t
-            raise "found invalid type."
+            print(t)
+            raise TypeError("found invalid type.")
 
 def match_unsigned(dtype):
+    if dtype[0] == 'half':
+        return ["ushort", dtype[1]]
     if dtype[0] == 'float':
         return ["uint", dtype[1]]
     if dtype[0] == 'double':
@@ -137,6 +142,8 @@ def match_unsigned(dtype):
     return ['u' + dtype[0], dtype[1]]
 
 def match_signed(dtype):
+    if dtype[0] == 'half':
+        return ["short", dtype[1]]
     if dtype[0] == 'float':
         return ["int", dtype[1]]
     if dtype[0] == 'double':
@@ -187,8 +194,8 @@ def fixup_type(dstType, srcType, n):
         if (len(dstType) == len(srcType)):
             return dstType[n]
 
-    print dstType, srcType
-    raise "type mispatch"
+    print(dstType, srcType)
+    raise TypeError("type mispatch")
 
 class builtinProto():
     valueTypeStr = ""
@@ -226,7 +233,7 @@ class builtinProto():
 
     def init_from_line(self, t):
         self.append('//{0}'.format(t))
-        line = filter(None, re.split(',| |\(', t.rstrip(')\n')))
+        line = [_f for _f in re.split(',| |\(', t.rstrip(')\n')) if _f]
         self.paramCount = 0
         stripped = 0
         memSpace = ''
@@ -310,7 +317,7 @@ class builtinProto():
                 vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i)
                 if vtype[1] != ptype[1]:
                     if ptype[1] != 1:
-                        raise "parameter is not a scalar but has different width with result value."
+                        raise TypeError("parameter is not a scalar but has different width with result value.")
                     if isPointer(ptype):
                         formatStr += '&'
                     formatStr += 'param{0}'.format(n)
@@ -333,7 +340,7 @@ class builtinProto():
 
     def output(self):
         for line in self.outputStr:
-            print line
+            print(line)
 
     def output(self, outFile):
         for line in self.outputStr:
diff --git a/backend/src/libocl/script/ocl_as.sh b/backend/src/libocl/script/ocl_as.sh
index a432189..22212ba 100755
--- a/backend/src/libocl/script/ocl_as.sh
+++ b/backend/src/libocl/script/ocl_as.sh
@@ -31,7 +31,7 @@ else
 fi
 
 # Supported base types and their lengths
-TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4"
+TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4 half:2"
 # Supported vector lengths
 VECTOR_LENGTHS="1 2 3 4 8 16"
 ROUNDING_MODES="rte rtz rtp rtn"
diff --git a/backend/src/libocl/script/ocl_common.def b/backend/src/libocl/script/ocl_common.def
index fac5ef5..0233eb4 100644
--- a/backend/src/libocl/script/ocl_common.def
+++ b/backend/src/libocl/script/ocl_common.def
@@ -1,22 +1,28 @@
 ##common
 gentype clamp (gentype x, gentype minval, gentype maxval)
 gentypef clamp (gentypef x, float minval, float maxval)
+gentypeh clamp (gentypeh x, half minval, half maxval)
 gentyped clamp (gentyped x, double minval, double maxval)
 gentype degrees (gentype radians)
 gentype max (gentype x,  gentype y)
 gentypef max (gentypef x, float y)
+gentypeh max (gentypeh x, half y)
 gentyped max (gentyped x, double y)
 gentype min (gentype x,  gentype y)
 gentypef min (gentypef x,  float y)
+gentypeh min (gentypeh x,  half y)
 gentyped min (gentyped x,  double y)
 gentype mix (gentype x, gentype y, gentype a)
 gentypef mix (gentypef x, gentypef y, float a)
+gentypeh mix (gentypeh x, gentypeh y, half a)
 gentyped mix (gentyped x, gentyped y, double a)
 gentype radians (gentype degrees)
 gentype step (gentype edge, gentype x)
 gentypef step (float edge, gentypef x)
+gentypeh step (half edge, gentypeh x)
 gentyped step (double edge, gentyped x)
 gentype smoothstep (gentype edge0, gentype edge1, gentype x)
 gentypef smoothstep (float edge0, float edge1, gentypef x)
+gentypeh smoothstep (half edge0, half edge1, gentypeh x)
 gentyped smoothstep (double edge0, double edge1, gentyped x)
 gentype sign (gentype x)
diff --git a/backend/src/libocl/script/ocl_convert.sh b/backend/src/libocl/script/ocl_convert.sh
index afaacab..4f720fe 100755
--- a/backend/src/libocl/script/ocl_convert.sh
+++ b/backend/src/libocl/script/ocl_convert.sh
@@ -31,7 +31,7 @@ else
 fi
 
 # Supported base types and their lengths
-TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4"
+TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4 half:2"
 # Supported vector lengths
 VECTOR_LENGTHS="1 2 3 4 8 16"
 ROUNDING_MODES="rte rtz rtp rtn"
@@ -119,6 +119,7 @@ for vector_length in $VECTOR_LENGTHS; do
 done
 
 echo '
+/* The sat cvt supported by HW. */
 #define DEF(DSTTYPE, SRCTYPE) \
 OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
 DEF(char, uchar);
@@ -145,6 +146,12 @@ DEF(int, uint);
 DEF(int, float);
 DEF(uint, int);
 DEF(uint, float);
+DEF(char, half);
+DEF(uchar, half);
+DEF(short, half);
+DEF(ushort, half);
+DEF(int, half);
+DEF(uint, half);
 #undef DEF
 '
 
@@ -266,6 +273,42 @@ DEF(ulong, ulong);
 #undef DEF
 '
 
+# for half to long
+if [ $1"a" = "-pa" ]; then
+    echo '
+       OVERLOADABLE long convert_long_sat(half x);
+       OVERLOADABLE ulong convert_ulong_sat(half x);
+       '
+else
+    echo '
+union _type_half_and_ushort {
+  half hf;
+  ushort us;
+};
+OVERLOADABLE long convert_long_sat(half x) {
+  union _type_half_and_ushort u;
+  u.hf = x;
+  if (u.us == 0x7C00) // +inf
+    return 0x7FFFFFFFFFFFFFFF;
+  if (u.us == 0xFC00) // -inf
+    return 0x8000000000000000;
+
+  return (long)x;
+}
+OVERLOADABLE ulong convert_ulong_sat(half x) {
+  union _type_half_and_ushort u;
+  u.hf = x;
+  if (u.us == 0x7C00) // +inf
+    return 0xFFFFFFFFFFFFFFFF;
+
+  if (x < (half)0.0) {
+    return 0;
+  }
+  return (ulong)x;
+}'
+fi
+
+
 # vector convert_DSTTYPE_sat function
 for vector_length in $VECTOR_LENGTHS; do
     if test $vector_length -eq 1; then continue; fi
@@ -276,7 +319,7 @@ for vector_length in $VECTOR_LENGTHS; do
 
 	for ttype in $TYPES; do
 	    tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
-	    if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+	    if test $tbasetype = "double" -o $tbasetype = "float" -o $tbasetype = "half" ; then continue; fi
 
 	    fvectortype=$fbasetype$vector_length
 	    tvectortype=$tbasetype$vector_length
@@ -323,10 +366,10 @@ done
 
 if [ $1"a" != "-pa" ]; then
 echo '
-float __gen_ocl_rndz(float x);
-float __gen_ocl_rnde(float x);
-float __gen_ocl_rndu(float x);
-float __gen_ocl_rndd(float x);
+CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
+CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
+CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32");
+CONST float __gen_ocl_rndd(float x) __asm("llvm.floor" ".f32");
 OVERLOADABLE float __convert_float_rtz(long x)
 {
   union {
@@ -586,7 +629,7 @@ for vector_length in $VECTOR_LENGTHS; do
 
 	for ttype in $TYPES; do
 	    tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
-	    if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+	    if test $tbasetype = "double" -o $tbasetype = "float" -o $tbasetype = "half" ; then continue; fi
 
 	    if test $vector_length -eq 1; then
 		if [ $1"a" = "-pa" ]; then
diff --git a/backend/src/libocl/script/ocl_math.def b/backend/src/libocl/script/ocl_math.def
index 5617c09..9c65af7 100644
--- a/backend/src/libocl/script/ocl_math.def
+++ b/backend/src/libocl/script/ocl_math.def
@@ -29,9 +29,11 @@ gentype floor (gentype)
 gentype fma (gentype a, gentype b, gentype c)
 gentype fmax (gentype x, gentype y)
 gentypef fmax (gentypef x, float y)
+gentypeh fmax (gentypeh x, half y)
 gentyped fmax (gentyped x, double y)
 gentype fmin (gentype x, gentype y)
 gentypef fmin (gentypef x, float y)
+gentypeh fmin (gentypeh x, half y)
 gentyped fmin (gentyped x, double y)
 gentype fmod (gentype x, gentype y)
 gentype fract (gentype x, __global gentype *iptr)
@@ -43,6 +45,12 @@ floatn frexp (floatn x, __private intn *exp)
 float frexp (float x, __global int *exp)
 float frexp (float x, __local int *exp)
 float frexp (float x, __private int *exp)
+halfn frexp (halfn x, __global intn *exp)
+halfn frexp (halfn x, __local intn *exp)
+halfn frexp (halfn x, __private intn *exp)
+half frexp (half x, __global int *exp)
+half frexp (half x, __local int *exp)
+half frexp (half x, __private int *exp)
 doublen frexp (doublen x, __global intn *exp)
 doublen frexp (doublen x, __local intn *exp)
 doublen frexp (doublen x, __private intn *exp)
@@ -52,11 +60,16 @@ double frexp (double x, __private int *exp)
 gentype hypot (gentype x, gentype y)
 intn ilogb (floatn x)
 int ilogb (float x)
+shortn ilogb (halfn x)
+short ilogb (half x)
 intn ilogb (doublen x)
 int ilogb (double x)
 floatn ldexp (floatn x, intn k)
 floatn ldexp (floatn x, int k)
 float ldexp (float x, int k)
+halfn ldexp (halfn x, intn k)
+halfn ldexp (halfn x, int k)
+half ldexp (half x, int k)
 doublen ldexp (doublen x, intn k)
 doublen ldexp (doublen x, int k)
 double ldexp (double x, int k)
@@ -67,6 +80,12 @@ floatn lgamma_r (floatn x, __private intn *signp)
 float lgamma_r (float x, __global int *signp)
 float lgamma_r (float x, __local int *signp)
 float lgamma_r (float x,   __private int *signp)
+halfn lgamma_r (halfn x, __global intn *signp)
+halfn lgamma_r (halfn x, __local intn *signp)
+halfn lgamma_r (halfn x, __private intn *signp)
+half lgamma_r (half x, __global int *signp)
+half lgamma_r (half x, __local int *signp)
+half lgamma_r (half x,   __private int *signp)
 #doublen lgamma_r (doublen x, __global intn *signp)
 #doublen lgamma_r (doublen x, __local intn *signp)
 #doublen lgamma_r (doublen x, __private intn *signp)
@@ -86,12 +105,16 @@ gentype modf (gentype x, __local gentype *iptr)
 gentype modf (gentype x, __private gentype *iptr)
 floatn nan (uintn nancode)
 float nan (uint nancode)
+halfn nan (ushortn nancode)
+half nan (ushort nancode)
 doublen nan (ulongn nancode)
 double nan (ulong nancode)
 gentype nextafter (gentype x, gentype y)
 gentype pow (gentype x, gentype y)
 floatn pown (floatn x, intn y)
 float pown (float x, int y)
+halfn pown (halfn x, intn y)
+half pown (half x, int y)
 doublen pown (doublen x, intn y)
 double pown (double x, int y)
 gentype powr (gentype x, gentype y)
@@ -102,6 +125,12 @@ floatn remquo (floatn x, floatn y, __private intn *quo)
 float remquo (float x, float y, __global int *quo)
 float remquo (float x, float y, __local int *quo)
 float remquo (float x, float y, __private int *quo)
+halfn remquo (halfn x, halfn y, __global intn *quo)
+halfn remquo (halfn x, halfn y, __local intn *quo)
+halfn remquo (halfn x, halfn y, __private intn *quo)
+half remquo (half x, half y, __global int *quo)
+half remquo (half x, half y, __local int *quo)
+half remquo (half x, half y, __private int *quo)
 doublen remquo (doublen x, doublen y, __global intn *quo)
 doublen remquo (doublen x, doublen y, __local intn *quo)
 doublen remquo (doublen x, doublen y, __private intn *quo)
@@ -110,7 +139,7 @@ double remquo (double x, double y, __local int *quo)
 double remquo (double x, double y, __private int *quo)
 gentype rint (gentype)
 floatn rootn (floatn x, intn y)
-
+halfn rootn (halfn x, intn y)
 doublen rootn (doublen x, intn y)
 doublen rootn (double x, int y)
 gentype round (gentype x)
diff --git a/backend/src/libocl/script/ocl_relational.def b/backend/src/libocl/script/ocl_relational.def
index 379c511..db3ddcf 100644
--- a/backend/src/libocl/script/ocl_relational.def
+++ b/backend/src/libocl/script/ocl_relational.def
@@ -1,31 +1,45 @@
 ##relational
 intn isequal (floatn x, floatn y)
+shortn isequal (halfn x, halfn y)
 longn isequal (doublen x, doublen y)
 intn isnotequal (floatn x, floatn y)
+shortn isnotequal (halfn x, halfn y)
 longn isnotequal (doublen x, doublen y)
 intn isgreater (floatn x, floatn y)
+shortn isgreater (halfn x, halfn y)
 longn isgreater (doublen x, doublen y)
 intn isgreaterequal (floatn x, floatn y)
+shortn isgreaterequal (halfn x, halfn y)
 longn isgreaterequal (doublen x, doublen y)
 intn isless (floatn x, floatn y)
+shortn isless (halfn x, halfn y)
 longn isless (doublen x, doublen y)
 intn islessequal (floatn x, floatn y)
+shortn islessequal (halfn x, halfn y)
 longn islessequal (doublen x, doublen y)
 intn islessgreater (floatn x, floatn y)
+shortn islessgreater (halfn x, halfn y)
 longn islessgreater (doublen x, doublen y)
-intn isfinite (floatn
+intn isfinite (floatn)
+shortn isfinite (halfn)
 longn isfinite (doublen)
 intn isinf (floatn)
+shortn isinf (halfn)
 longn isinf (doublen)
 intn isnan (floatn)
+shortn isnan (halfn)
 longn isnan (doublen)
 intn isnormal (floatn)
+shortn isnormal (halfn)
 longn isnormal (doublen)
 intn isordered (floatn x, floatn y)
+shortn isordered (halfn x, halfn y)
 longn isordered (doublen x, doublen y)
 intn isunordered (floatn x, floatn y)
+shortn isunordered (halfn x, halfn y)
 longn isunordered (doublen x, doublen y)
 intn signbit (floatn)
+shortn signbit (halfn)
 longn signbit (doublen)
 int any (igentype x)
 int all (igentype x)
diff --git a/backend/src/libocl/script/ocl_simd.def b/backend/src/libocl/script/ocl_simd.def
new file mode 100644
index 0000000..e26243e
--- /dev/null
+++ b/backend/src/libocl/script/ocl_simd.def
@@ -0,0 +1,4 @@
+##simd level functions
+floatn intel_sub_group_shuffle(floatn x, uint c)
+intn intel_sub_group_shuffle(intn x, uint c)
+uintn intel_sub_group_shuffle(uintn x, uint c)
diff --git a/backend/src/libocl/src/ocl_async.cl b/backend/src/libocl/src/ocl_async.cl
index 041aaf2..10d0aa4 100644
--- a/backend/src/libocl/src/ocl_async.cl
+++ b/backend/src/libocl/src/ocl_async.cl
@@ -66,7 +66,7 @@ DEF(double)
 #undef DEFN
 #undef DEF
 
-void wait_group_events (int num_events, event_t *event_list) {
+OVERLOADABLE void wait_group_events (int num_events, event_t *event_list) {
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
 }
 
diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
index 4e55fcb..dc3579c 100644
--- a/backend/src/libocl/src/ocl_barrier.ll
+++ b/backend/src/libocl/src/ocl_barrier.ll
@@ -10,7 +10,7 @@ declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
 declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
 declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
 
-define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
+define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
   %1 = icmp eq i32 %flags, 3
   br i1 %1, label %barrier_local_global, label %barrier_local_check
 
diff --git a/backend/src/libocl/src/ocl_clz.ll b/backend/src/libocl/src/ocl_clz.ll
new file mode 100644
index 0000000..a274cde
--- /dev/null
+++ b/backend/src/libocl/src/ocl_clz.ll
@@ -0,0 +1,62 @@
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @clz_s8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+}
+
+define i8 @clz_u8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+}
+
+define i16 @clz_s16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+}
+
+define i16 @clz_u16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+}
+
+define i32 @clz_s32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+}
+
+define i32 @clz_u32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+}
+
+define i64 @clz_s64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0)
+  %cmp = icmp ult i32 %call2, 32
+  %4 = add i32 %call1, 32
+  %5 = select i1 %cmp, i32 %call2, i32 %4
+  %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
+}
+
+define i64 @clz_u64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0)
+  %cmp = icmp ult i32 %call2, 32
+  %4 = add i32 %call1, 32
+  %5 = select i1 %cmp, i32 %call2, i32 %4
+  %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
+}
diff --git a/backend/src/libocl/src/ocl_geometric.cl b/backend/src/libocl/src/ocl_geometric.cl
index e469ff9..886e88c 100644
--- a/backend/src/libocl/src/ocl_geometric.cl
+++ b/backend/src/libocl/src/ocl_geometric.cl
@@ -21,7 +21,7 @@
 #include "ocl_math.h"
 #include "ocl_float.h"
 
-PURE CONST float __gen_ocl_fabs(float x);
+CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32");
 
 OVERLOADABLE float dot(float p0, float p1) {
   return p0 * p1;
@@ -38,13 +38,9 @@ OVERLOADABLE float dot(float4 p0, float4 p1) {
 OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
 
 #define BODY \
-  if(m == 0) \
-    return 0; \
-  if(isinf(m)) \
-    return INFINITY; \
-  if(m < 1) \
-    m = 1; \
-  x /= m; \
+  m = m==0.0f ? 1.0f : m; \
+  m = isinf(m) ? 1.0f : m; \
+  x = x/m; \
   return m * sqrt(dot(x,x));
 OVERLOADABLE float length(float2 x) {
   float m = max(__gen_ocl_fabs(x.s0), __gen_ocl_fabs(x.s1));
@@ -64,30 +60,23 @@ OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
 OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
 OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
 OVERLOADABLE float normalize(float x) {
-  union { float f; unsigned u; } u;
-  u.f = x;
-  if(u.u == 0)
-    return 0.f;
-  if(isnan(x))
-    return NAN;
-  return u.u < 0x7fffffff ? 1.f : -1.f;
+  float m = length(x);
+  m = m == 0.0f ? 1.0f : m;
+  return x / m;
 }
 OVERLOADABLE float2 normalize(float2 x) {
   float m = length(x);
-  if(m == 0)
-    return 0;
+  m = m == 0.0f ? 1.0f : m;
   return x / m;
 }
 OVERLOADABLE float3 normalize(float3 x) {
   float m = length(x);
-  if(m == 0)
-    return 0;
+  m = m == 0.0f ? 1.0f : m;
   return x / m;
 }
 OVERLOADABLE float4 normalize(float4 x) {
   float m = length(x);
-  if(m == 0)
-    return 0;
+  m = m == 0.0f ? 1.0f : m;
   return x / m;
 }
 
diff --git a/backend/src/libocl/src/ocl_workitem.cl b/backend/src/libocl/src/ocl_workitem.cl
index f4629f8..6ddc406 100644
--- a/backend/src/libocl/src/ocl_workitem.cl
+++ b/backend/src/libocl/src/ocl_workitem.cl
@@ -18,7 +18,7 @@
 #include "ocl_workitem.h"
 
 PURE CONST uint __gen_ocl_get_work_dim(void);
-uint get_work_dim(void)
+OVERLOADABLE uint get_work_dim(void)
 {
   return __gen_ocl_get_work_dim();
 }
@@ -37,7 +37,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
 #undef DECL_INTERNAL_WORK_ITEM_FN
 
 #define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
-unsigned NAME(unsigned int dim) {             \
+OVERLOADABLE unsigned NAME(unsigned int dim) {             \
   if (dim == 0) return __gen_ocl_##NAME##0();        \
   else if (dim == 1) return __gen_ocl_##NAME##1();   \
   else if (dim == 2) return __gen_ocl_##NAME##2();   \
@@ -52,6 +52,6 @@ DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
 DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
 #undef DECL_PUBLIC_WORK_ITEM_FN
 
-uint get_global_id(uint dim) {
+OVERLOADABLE uint get_global_id(uint dim) {
   return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
 }
diff --git a/backend/src/libocl/tmpl/ocl_common.tmpl.cl b/backend/src/libocl/tmpl/ocl_common.tmpl.cl
index db7b0d8..76aca2b 100644
--- a/backend/src/libocl/tmpl/ocl_common.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_common.tmpl.cl
@@ -17,12 +17,13 @@
  */
 #include "ocl_common.h"
 #include "ocl_float.h"
+#include "ocl_relational.h"
 
 /////////////////////////////////////////////////////////////////////////////
 // Common Functions
 /////////////////////////////////////////////////////////////////////////////
-PURE CONST float __gen_ocl_fmax(float a, float b);
-PURE CONST float __gen_ocl_fmin(float a, float b);
+PURE CONST OVERLOADABLE float __gen_ocl_fmax(float a, float b);
+PURE CONST OVERLOADABLE float __gen_ocl_fmin(float a, float b);
 
 OVERLOADABLE float step(float edge, float x) {
   return x < edge ? 0.0 : 1.0;
@@ -55,11 +56,64 @@ OVERLOADABLE float smoothstep(float e0, float e1, float x) {
 }
 
 OVERLOADABLE float sign(float x) {
-  if(x > 0)
-    return 1;
-  if(x < 0)
-    return -1;
-  if(x == -0.f)
-    return -0.f;
-  return 0.f;
+// TODO: the best form of implementation is below,
+//      But I find it hard to implement in Beignet now,
+//      So I would put it in the TODO list.
+
+//      cmp.ne.f0  null    x:f  0.0:f
+//      and        ret:ud  x:ud 0x80000000:ud
+//(+f0) or         ret:ud  ret:ud 0x3f800000:ud
+//      cmp.ne.f0  null    x:f  x:f
+//(+f0) mov        ret:f   0.0f
+
+  union {float f; unsigned u;} ieee;
+  ieee.f = x;
+  unsigned k = ieee.u;
+  float r = (k&0x80000000) ? -1.0f : 1.0f;
+  // differentiate +0.0f -0.0f
+  float s = 0.0f * r;
+  s = (x == 0.0f) ? s : r;
+  return isnan(x) ? 0.0f : s;
+}
+
+// Half float version.
+PURE CONST OVERLOADABLE half __gen_ocl_fmax(half a, half b);
+PURE CONST OVERLOADABLE half __gen_ocl_fmin(half a, half b);
+
+OVERLOADABLE half step(half edge, half x) {
+  return x < edge ? 0.0 : 1.0;
+}
+OVERLOADABLE half max(half a, half b) {
+  return __gen_ocl_fmax(a, b);
+}
+OVERLOADABLE half min(half a, half b) {
+  return __gen_ocl_fmin(a, b);
+}
+OVERLOADABLE half mix(half x, half y, half a) {
+  return x + (y-x)*a;
+}
+OVERLOADABLE half clamp(half v, half l, half u) {
+  return max(min(v, u), l);
+}
+OVERLOADABLE half degrees(half radians) {
+  return ((half)(180 / M_PI_F)) * radians;
+}
+OVERLOADABLE half radians(half degrees) {
+  return ((half)(M_PI_F / 180)) * degrees;
+}
+
+OVERLOADABLE half smoothstep(half e0, half e1, half x) {
+  x = clamp((x - e0) / (e1 - e0), (half)0.0, (half)1.0);
+  return x * x * (3 - 2 * x);
+}
+
+OVERLOADABLE half sign(half x) {
+  union {half h; ushort u;} ieee;
+  ieee.h = x;
+  unsigned k = ieee.u;
+  half r = (k&0x8000) ? -1.0 : 1.0;
+  // differentiate +0.0f -0.0f
+  half s = (half)0.0 * r;
+  s = (x == (half)0.0) ? s : r;
+  return isnan(x) ? 0.0 : s;
 }
diff --git a/backend/src/libocl/tmpl/ocl_common.tmpl.h b/backend/src/libocl/tmpl/ocl_common.tmpl.h
index 4a9379d..8e9cec0 100644
--- a/backend/src/libocl/tmpl/ocl_common.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_common.tmpl.h
@@ -34,3 +34,16 @@ OVERLOADABLE float radians(float degrees);
 OVERLOADABLE float smoothstep(float e0, float e1, float x);
 
 OVERLOADABLE float sign(float x);
+
+// Half half version.
+OVERLOADABLE half step(half edge, half x);
+OVERLOADABLE half max(half a, half b);
+OVERLOADABLE half min(half a, half b);
+OVERLOADABLE half mix(half x, half y, half a);
+OVERLOADABLE half clamp(half v, half l, half u);
+
+OVERLOADABLE half degrees(half radians);
+OVERLOADABLE half radians(half degrees);
+OVERLOADABLE half smoothstep(half e0, half e1, half x);
+
+OVERLOADABLE half sign(half x);
diff --git a/backend/src/libocl/tmpl/ocl_defines.tmpl.h b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
index 4e210be..9c53093 100644
--- a/backend/src/libocl/tmpl/ocl_defines.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
@@ -34,5 +34,7 @@
 #define cl_khr_byte_addressable_store
 #define cl_khr_icd
 #define cl_khr_gl_sharing
+#define cl_khr_spir
+#define cl_khr_fp16
 
 #endif /* end of __OCL_COMMON_DEF_H__ */
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 6da0bab..12408eb 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -19,6 +19,8 @@
 
 PURE CONST uint __gen_ocl_fbh(uint);
 PURE CONST uint __gen_ocl_fbl(uint);
+
+
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort);
@@ -26,71 +28,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char);
 
-OVERLOADABLE char clz(char x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 8;
-  return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE uchar clz(uchar x) {
-  if (x == 0)
-    return 8;
-  return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE short clz(short x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 16;
-  return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE ushort clz(ushort x) {
-  if (x == 0)
-    return 16;
-  return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE int clz(int x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 32;
-  return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE uint clz(uint x) {
-  if (x == 0)
-    return 32;
-  return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE long clz(long x) {
-  union { int i[2]; long x; } u;
-  u.x = x;
-  if (u.i[1] & 0x80000000u)
-    return 0;
-  if (u.i[1] == 0 && u.i[0] == 0)
-    return 64;
-  uint v = clz(u.i[1]);
-  if(v == 32)
-    v += clz(u.i[0]);
-  return v;
-}
-
-OVERLOADABLE ulong clz(ulong x) {
-  if (x == 0)
-    return 64;
-  union { uint i[2]; ulong x; } u;
-  u.x = x;
-  uint v = clz(u.i[1]);
-  if(v == 32)
-    v += clz(u.i[0]);
-  return v;
-}
+#define SDEF(TYPE, TYPE_NAME, SIZE)        \
+OVERLOADABLE TYPE clz(TYPE x){ return clz_##TYPE_NAME##SIZE(x);}
+SDEF(char, s, 8);
+SDEF(uchar, u, 8);
+SDEF(short, s, 16);
+SDEF(ushort, u, 16);
+SDEF(int, s, 32);
+SDEF(uint, u, 32);
+SDEF(long, s, 64);
+SDEF(ulong, u, 64);
+#undef SDEF
 
 #define SDEF(TYPE)        \
 OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
@@ -330,7 +278,9 @@ OVERLOADABLE ulong rhadd(ulong x, ulong y) {
   return __gen_ocl_rhadd(x, y);
 }
 
-int __gen_ocl_abs(int x);
+PURE CONST OVERLOADABLE char __gen_ocl_abs(char x);
+PURE CONST OVERLOADABLE short __gen_ocl_abs(short x);
+PURE CONST OVERLOADABLE int __gen_ocl_abs(int x);
 #define DEC(TYPE) OVERLOADABLE u##TYPE abs(TYPE x) { return (u##TYPE) __gen_ocl_abs(x); }
 DEC(int)
 DEC(short)
@@ -348,35 +298,17 @@ DEC(ulong)
 /* Char and short type abs diff */
 /* promote char and short to int and will be no module overflow */
 #define DEC(TYPE, UTYPE) OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y) \
-                         { return (UTYPE) (abs((int)x - (int)y)); }
+      { return y > x ? (y -x) : (x - y); }
 DEC(char, uchar)
 DEC(uchar, uchar)
 DEC(short, ushort)
 DEC(ushort, ushort)
+DEC(int, uint)
+DEC(uint, uint)
+DEC(long, ulong)
+DEC(ulong, ulong)
 #undef DEC
 
-OVERLOADABLE uint abs_diff (uint x, uint y) {
-    /* same signed will never overflow. */
-    return y > x ? (y -x) : (x - y);
-}
-
-OVERLOADABLE uint abs_diff (int x, int y) {
-    /* same signed will never module overflow. */
-    if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
-        return abs(x - y);
-
-    return (abs(x) + abs(y));
-}
-
-OVERLOADABLE ulong abs_diff (long x, long y) {
-  if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
-    return abs(x - y);
-  return abs(x) + abs(y);
-}
-OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
-  return y > x ? (y - x) : (x - y);
-}
-
 
 #define DECL_MIN_MAX_CLAMP(TYPE) \
 OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
index f067b8d..4b3b5ae 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
@@ -45,6 +45,15 @@ OVERLOADABLE uint clz(uint x);
 OVERLOADABLE long clz(long x);
 OVERLOADABLE ulong clz(ulong x);
 
+char   clz_s8(char);
+uchar  clz_u8(uchar);
+short  clz_s16(short);
+ushort clz_u16(ushort);
+int    clz_s32(int);
+uint   clz_u32(uint);
+long   clz_s64(long);
+ulong  clz_u64(ulong);
+
 OVERLOADABLE char popcount(char x);
 OVERLOADABLE uchar popcount(uchar x);
 OVERLOADABLE short popcount(short x);
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index fcc60fd..dc0363d 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -23,19 +23,19 @@
 
 extern constant int __ocl_math_fastpath_flag;
 
-PURE CONST float __gen_ocl_fabs(float x);
-PURE CONST float __gen_ocl_sin(float x);
-PURE CONST float __gen_ocl_cos(float x);
-PURE CONST float __gen_ocl_sqrt(float x);
+CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32");
+CONST float __gen_ocl_sin(float x) __asm("llvm.sin" ".f32");
+CONST float __gen_ocl_cos(float x) __asm("llvm.cos" ".f32");
+CONST float __gen_ocl_sqrt(float x) __asm("llvm.sqrt" ".f32");
 PURE CONST float __gen_ocl_rsqrt(float x);
-PURE CONST float __gen_ocl_log(float x);
-PURE CONST float __gen_ocl_exp(float x);
-PURE CONST float __gen_ocl_pow(float x, float y);
+CONST float __gen_ocl_log(float x) __asm("llvm.log2" ".f32");
+CONST float __gen_ocl_exp(float x) __asm("llvm.exp2" ".f32");
+PURE CONST float __gen_ocl_pow(float x, float y) __asm("llvm.pow" ".f32");
 PURE CONST float __gen_ocl_rcp(float x);
-PURE CONST float __gen_ocl_rndz(float x);
-PURE CONST float __gen_ocl_rnde(float x);
-PURE CONST float __gen_ocl_rndu(float x);
-PURE CONST float __gen_ocl_rndd(float x);
+CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
+CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
+CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32");
+CONST float __gen_ocl_rndd(float x) __asm("llvm.floor" ".f32");
 
 
 /* native functions */
@@ -1280,18 +1280,16 @@ OVERLOADABLE float logb(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_logb(x);
 
-union {float f; unsigned i;} u;
+  union {float f; unsigned i;} u;
   u.f = x;
   int e =  ((u.i & 0x7f800000) >> 23);
-  if(e == 0) {
+  float r1 = e-127;
+  float r2 = -INFINITY;
+  float r3 = x*x;
     /* sub normal or +/-0 */
-    return -INFINITY;
-  } else if(e == 0xff) {
+  float r = e == 0 ? r2 : r1;
     /* inf & nan */
-    return x*x;
-  } else {
-    return (float)(e-127);
-  }
+  return e == 0xff ? r3 : r;
 }
 
 OVERLOADABLE int ilogb(float x) {
@@ -1746,13 +1744,6 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
   }
 }
 
-INLINE_OVERLOADABLE float tgamma(float x) {
-  float y;
-  int s;
-  y=lgamma_r(x,&s);
-  return __gen_ocl_internal_exp(y)*s;
-}
-
 /* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
  * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com.
  */
@@ -2618,7 +2609,7 @@ OVERLOADABLE float ldexp(float x, int n) {
   return __gen_ocl_internal_ldexp(x, n);
 }
 
-PURE CONST float __gen_ocl_mad(float a, float b, float c);
+CONST float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32");
 PURE CONST float __gen_ocl_fmax(float a, float b);
 PURE CONST float __gen_ocl_fmin(float a, float b);
 
@@ -2706,6 +2697,8 @@ OVERLOADABLE float modf(float x, private float *i) { BODY; }
 
 OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
 OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+OVERLOADABLE float __gen_ocl_internal_fmax(half a, half b) { return max(a,b); }
+OVERLOADABLE float __gen_ocl_internal_fmin(half a, half b) { return min(a,b); }
 OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
   float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
   return a > b ? x : b > a ? y : max(x, y);
@@ -2963,6 +2956,95 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   return sn*z;
 }
 
+OVERLOADABLE float tgamma (float x)
+{
+  /* based on glibc __ieee754_gammaf_r by Ulrich Drepper <drepper at cygnus.com> */
+
+  unsigned int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  if (hx == 0xff800000)
+    {
+      /* x == -Inf.  According to ISO this is NaN.  */
+      return NAN;
+    }
+  if ((hx & 0x7f800000) == 0x7f800000)
+    {
+      /* Positive infinity (return positive infinity) or NaN (return
+	 NaN).  */
+      return x;
+    }
+  if (x < 0.0f && __gen_ocl_internal_floor (x) == x)
+    {
+      /* integer x < 0 */
+      return NAN;
+    }
+
+  if (x >= 36.0f)
+    {
+      /* Overflow.  */
+      return INFINITY;
+    }
+  else if (x <= 0.0f && x >= -FLT_EPSILON / 4.0f)
+    {
+      return 1.0f / x;
+    }
+  else
+    {
+      float sinpix = __gen_ocl_internal_sinpi(x);
+      if (x <= -42.0f)
+	/* Underflow.  */
+	{return 0.0f * sinpix /*for sign*/;}
+      int exp2_adj = 0;
+      float x_abs = __gen_ocl_fabs(x);
+      float gam0;
+
+      if (x_abs < 4.0f) {
+        /* gamma = exp(lgamma) is only accurate for small lgamma */
+        float prod,x_adj;
+        if (x_abs < 0.5f) {
+          prod = 1.0f / x_abs;
+          x_adj = x_abs + 1.0f;
+        } else if (x_abs <= 1.5f) {
+          prod = 1.0f;
+          x_adj = x_abs;
+        } else if (x_abs < 2.5f) {
+          x_adj = x_abs - 1.0f;
+          prod = x_adj;
+        } else {
+          x_adj = x_abs - 2.0f;
+          prod = x_adj * (x_abs - 1.0f);
+        }
+        gam0 = __gen_ocl_internal_exp (lgamma (x_adj)) * prod;
+      }
+      else {
+        /* Compute gamma (X) using Stirling's approximation,
+  	 starting by computing pow (X, X) with a power of 2
+  	 factored out to avoid intermediate overflow.  */
+        float x_int = __gen_ocl_internal_round (x_abs);
+        float x_frac = x_abs - x_int;
+        int x_log2;
+        float x_mant = frexp (x_abs, &x_log2);
+        if (x_mant < M_SQRT1_2_F)
+          {
+          x_log2--;
+          x_mant *= 2.0f;
+          }
+        exp2_adj = x_log2 * (int) x_int;
+        float ret = (__gen_ocl_internal_pow(x_mant, x_abs)
+  		   * exp2 (x_log2 * x_frac)
+  		   * __gen_ocl_internal_exp (-x_abs)
+  		   * sqrt (2.0f * M_PI_F / x_abs) );
+
+        float x2 = x_abs * x_abs;
+        float bsum = (0x3.403404p-12f / x2 -0xb.60b61p-12f) / x2 + 0x1.555556p-4f;
+        gam0 = ret + ret * __gen_ocl_internal_expm1 (bsum / x_abs);
+      }
+      if (x > 0.0f) {return __gen_ocl_internal_ldexp (gam0, exp2_adj);}
+      float gam1 = M_PI_F / (-x * sinpix * gam0);
+      return __gen_ocl_internal_ldexp (gam1, -exp2_adj);
+    }
+}
+
 float __gen_ocl_internal_pown(float x, int y) {
   const float
   bp[] = {1.0, 1.5,},
@@ -3510,3 +3592,374 @@ OVERLOADABLE float maxmag(float x, float y) {
 OVERLOADABLE float minmag(float x, float y) {
   return __gen_ocl_internal_minmag(x, y);
 }
+
+
+/* So far, the HW do not support half float math function.
+   We just do the conversion and call the float version here. */
+OVERLOADABLE half cospi(half x) {
+  float _x = (float)x;
+  return (half)cospi(_x);
+}
+OVERLOADABLE half cosh(half x) {
+  float _x = (float)x;
+  return (half)cosh(_x);
+}
+OVERLOADABLE half acos(half x) {
+  float _x = (float)x;
+  return (half)acos(_x);
+}
+OVERLOADABLE half acospi(half x) {
+  float _x = (float)x;
+  return (half)acospi(_x);
+}
+OVERLOADABLE half acosh(half x) {
+  float _x = (float)x;
+  return (half)acosh(_x);
+}
+OVERLOADABLE half sinpi(half x) {
+  float _x = (float)x;
+  return (half)sinpi(_x);
+}
+OVERLOADABLE half sinh(half x) {
+  float _x = (float)x;
+  return (half)sinh(_x);
+}
+OVERLOADABLE half asin(half x) {
+  float _x = (float)x;
+  return (half)asin(_x);
+}
+OVERLOADABLE half asinpi(half x) {
+  float _x = (float)x;
+  return (half)asinpi(_x);
+}
+OVERLOADABLE half asinh(half x) {
+  float _x = (float)x;
+  return (half)asinh(_x);
+}
+OVERLOADABLE half tanpi(half x) {
+  float _x = (float)x;
+  return (half)tanpi(_x);
+}
+OVERLOADABLE half tanh(half x) {
+  float _x = (float)x;
+  return (half)tanh(_x);
+}
+OVERLOADABLE half atan(half x) {
+  float _x = (float)x;
+  return (half)atan(_x);
+}
+OVERLOADABLE half atan2(half y, half x) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)atan2(_x, _y);
+}
+OVERLOADABLE half atan2pi(half y, half x) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)atan2pi(_x, _y);
+}
+OVERLOADABLE half atanpi(half x) {
+  float _x = (float)x;
+  return (half)atanpi(_x);
+}
+OVERLOADABLE half atanh(half x) {
+  float _x = (float)x;
+  return (half)atanh(_x);
+}
+OVERLOADABLE half cbrt(half x) {
+  float _x = (float)x;
+  return (half)cbrt(_x);
+}
+OVERLOADABLE half rint(half x) {
+  float _x = (float)x;
+  return (half)rint(_x);
+}
+OVERLOADABLE half copysign(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)copysign(_x, _y);
+}
+OVERLOADABLE half erf(half x) {
+  float _x = (float)x;
+  return (half)erf(_x);
+}
+OVERLOADABLE half erfc(half x) {
+  float _x = (float)x;
+  return (half)erfc(_x);
+}
+OVERLOADABLE half fmod(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)fmod(_x, _y);
+}
+OVERLOADABLE half remainder(half x, half p) {
+  float _x = (float)x;
+  float _p = (float)p;
+  return (half)remainder(_x, _p);
+}
+OVERLOADABLE half ldexp(half x, int n) {
+  float _x = (float)x;
+  return (half)ldexp(_x, n);
+}
+OVERLOADABLE half powr(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)powr(_x, _y);
+}
+OVERLOADABLE half pow(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)pow(_x, _y);
+}
+//no pow, we use powr instead
+OVERLOADABLE half fabs(half x) {
+  float _x = (float)x;
+  return (half)fabs(_x);
+}
+OVERLOADABLE half trunc(half x) {
+  float _x = (float)x;
+  return (half)trunc(_x);
+}
+OVERLOADABLE half round(half x) {
+  float _x = (float)x;
+  return (half)round(_x);
+}
+OVERLOADABLE half floor(half x) {
+  float _x = (float)x;
+  return (half)floor(_x);
+}
+OVERLOADABLE half ceil(half x) {
+  float _x = (float)x;
+  return (half)ceil(_x);
+}
+OVERLOADABLE half log(half x) {
+  float _x = (float)x;
+  return (half)log(_x);
+}
+OVERLOADABLE half log2(half x) {
+  float _x = (float)x;
+  return (half)log2(_x);
+}
+OVERLOADABLE half log10(half x) {
+  float _x = (float)x;
+  return (half)log10(_x);
+}
+OVERLOADABLE half exp(half x) {
+  float _x = (float)x;
+  return (half)exp(_x);
+}
+OVERLOADABLE half exp10(half x) {
+  float _x = (float)x;
+  return (half)exp10(_x);
+}
+OVERLOADABLE half expm1(half x) {
+  float _x = (float)x;
+  return (half)expm1(_x);
+}
+OVERLOADABLE half fmin(half a, half b) {
+  return __gen_ocl_internal_fmin(a, b);
+}
+OVERLOADABLE half fmax(half a, half b) {
+  return __gen_ocl_internal_fmax(a, b);
+}
+OVERLOADABLE half fma(half a, half b, half c) {
+  float _a = (float)a;
+  float _b = (float)b;
+  float _c = (float)c;
+  return (half)fma(_a, _b, _c);
+}
+OVERLOADABLE half fdim(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)fdim(_x, _y);
+}
+OVERLOADABLE half maxmag(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)maxmag(_x, _y);
+}
+OVERLOADABLE half minmag(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)minmag(_x, _y);
+}
+OVERLOADABLE half exp2(half x) {
+  float _x = (float)x;
+  return (half)exp2(_x);
+}
+OVERLOADABLE half mad(half a, half b, half c) {
+  float _a = (float)a;
+  float _b = (float)b;
+  float _c = (float)c;
+  return (half)mad(_a, _b, _c);
+}
+OVERLOADABLE half sin(half x) {
+  float _x = (float)x;
+  return (half)sin(_x);
+}
+OVERLOADABLE half cos(half x) {
+  float _x = (float)x;
+  return (half)cos(_x);
+}
+OVERLOADABLE half tan(half x) {
+  float _x = (float)x;
+  return (half)tan(_x);
+}
+OVERLOADABLE half tgamma(half x) {
+  float _x = (float)x;
+  return (half)tgamma(_x);
+}
+OVERLOADABLE half lgamma(half x) {
+  float _x = (float)x;
+  return (half)lgamma(_x);
+}
+OVERLOADABLE half lgamma_r(half x, global int *signgamp) {
+  float _x = (float)x;
+  return (half)lgamma_r(_x, signgamp);
+}
+OVERLOADABLE half lgamma_r(half x, local int *signgamp) {
+  float _x = (float)x;
+  return (half)lgamma_r(_x, signgamp);
+}
+OVERLOADABLE half lgamma_r(half x, private int *signgamp) {
+  float _x = (float)x;
+  return (half)lgamma_r(_x, signgamp);
+}
+OVERLOADABLE half log1p(half x) {
+  float _x = (float)x;
+  return (half)log1p(_x);
+}
+OVERLOADABLE half logb(half x) {
+  float _x = (float)x;
+  return (half)logb(_x);
+}
+OVERLOADABLE int ilogb(half x) {
+  float _x = (float)x;
+  return ilogb(_x);
+}
+OVERLOADABLE half nan(ushort code) {
+  return (half)NAN;
+}
+
+OVERLOADABLE half sincos(half x, global half *cosval) {
+  float _x = (float)x;
+  float _cosval;
+  half ret = (half)sincos(_x, &_cosval);
+  *cosval = (half)_cosval;
+  return ret;
+}
+OVERLOADABLE half sincos(half x, local half *cosval) {
+  float _x = (float)x;
+  float _cosval;
+  half ret = (half)sincos(_x, &_cosval);
+  *cosval = (half)_cosval;
+  return ret;
+}
+OVERLOADABLE half sincos(half x, private half *cosval) {
+  float _x = (float)x;
+  float _cosval;
+  half ret = (half)sincos(_x, &_cosval);
+  *cosval = (half)_cosval;
+  return ret;
+}
+
+OVERLOADABLE half sqrt(half x) {
+  float _x = (float)x;
+  return (half)sqrt(_x);
+}
+OVERLOADABLE half rsqrt(half x) {
+  float _x = (float)x;
+  return (half)rsqrt(_x);
+}
+OVERLOADABLE half frexp(half x, global int *exp) {
+  float _x = (float)x;
+  return (half)frexp(_x, exp);
+}
+OVERLOADABLE half frexp(half x, local int *exp) {
+  float _x = (float)x;
+  return (half)frexp(_x, exp);
+}
+OVERLOADABLE half frexp(half x, private int *exp) {
+  float _x = (float)x;
+  return (half)frexp(_x, exp);
+}
+OVERLOADABLE half nextafter(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)nextafter(_x, _y);
+}
+
+OVERLOADABLE half modf(half x, global half *i) {
+  float _x = (float)x;
+  float _i;
+  half ret = (half)modf(_x, &_i);
+  *i = (half)_i;
+  return ret;
+}
+OVERLOADABLE half modf(half x, local half *i) {
+  float _x = (float)x;
+  float _i;
+  half ret = (half)modf(_x, &_i);
+  *i = (half)_i;
+  return ret;
+}
+OVERLOADABLE half modf(half x, private half *i) {
+  float _x = (float)x;
+  float _i;
+  half ret = (half)modf(_x, &_i);
+  *i = (half)_i;
+  return ret;
+}
+
+OVERLOADABLE half hypot(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)hypot(_x, _y);
+}
+
+OVERLOADABLE half fract(half x, global half *p) {
+  float _x = (float)x;
+  float _p;
+  half ret = (half)fract(_x, &_p);
+  *p = (half)_p;
+  return ret;
+}
+OVERLOADABLE half fract(half x, local half *p) {
+  float _x = (float)x;
+  float _p;
+  half ret = (half)fract(_x, &_p);
+  *p = (half)_p;
+  return ret;
+}
+OVERLOADABLE half fract(half x, private half *p) {
+  float _x = (float)x;
+  float _p;
+  half ret = (half)fract(_x, &_p);
+  *p = (half)_p;
+  return ret;
+}
+
+OVERLOADABLE half remquo(half x, half y, global int *quo) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)remquo(_x, _y, quo);
+}
+OVERLOADABLE half remquo(half x, half y, local int *quo) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)remquo(_x, _y, quo);
+}
+OVERLOADABLE half remquo(half x, half y, private int *quo) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)remquo(_x, _y, quo);
+}
+
+OVERLOADABLE half pown(half x, int n) {
+  float _x = (float)x;
+  return (half)pown(_x, n);
+}
+OVERLOADABLE half rootn(half x, int n) {
+  float _x = (float)x;
+  return (half)rootn(_x, n);
+}
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h
index 0075797..90dad1f 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
@@ -117,7 +117,107 @@ OVERLOADABLE float native_sin(float x);
 OVERLOADABLE float native_sqrt(float x);
 OVERLOADABLE float native_tan(float x);
 
-// half
+
+// Half float version.
+OVERLOADABLE half cospi(half x);
+OVERLOADABLE half cosh(half x);
+OVERLOADABLE half acos(half x);
+OVERLOADABLE half acospi(half x);
+OVERLOADABLE half acosh(half x);
+OVERLOADABLE half sinpi(half x);
+OVERLOADABLE half sinh(half x);
+OVERLOADABLE half asin(half x);
+OVERLOADABLE half asinpi(half x);
+OVERLOADABLE half asinh(half x);
+OVERLOADABLE half tanpi(half x);
+OVERLOADABLE half tanh(half x);
+OVERLOADABLE half atan(half x);
+OVERLOADABLE half atan2(half y, half x);
+OVERLOADABLE half atan2pi(half y, half x);
+OVERLOADABLE half atanpi(half x);
+OVERLOADABLE half atanh(half x);
+OVERLOADABLE half cbrt(half x);
+OVERLOADABLE half rint(half x);
+OVERLOADABLE half copysign(half x, half y);
+OVERLOADABLE half erf(half x);
+OVERLOADABLE half erfc(half x);
+OVERLOADABLE half fmod (half x, half y);
+OVERLOADABLE half remainder(half x, half p);
+OVERLOADABLE half ldexp(half x, int n);
+OVERLOADABLE half powr(half x, half y);
+OVERLOADABLE half pow(half x, half y);
+//no pow, we use powr instead
+OVERLOADABLE half fabs(half x);
+OVERLOADABLE half trunc(half x);
+OVERLOADABLE half round(half x);
+OVERLOADABLE half floor(half x);
+OVERLOADABLE half ceil(half x);
+OVERLOADABLE half log(half x);
+OVERLOADABLE half log2(half x);
+OVERLOADABLE half log10(half x);
+OVERLOADABLE half exp(half x);
+OVERLOADABLE half exp10(half x);
+OVERLOADABLE half expm1(half x);
+OVERLOADABLE half fmin(half a, half b);
+OVERLOADABLE half fmax(half a, half b);
+OVERLOADABLE half fma(half a, half b, half c);
+OVERLOADABLE half fdim(half x, half y);
+OVERLOADABLE half maxmag(half x, half y);
+OVERLOADABLE half minmag(half x, half y);
+OVERLOADABLE half exp2(half x);
+OVERLOADABLE half mad(half a, half b, half c);
+OVERLOADABLE half sin(half x);
+OVERLOADABLE half cos(half x);
+OVERLOADABLE half tan(half x);
+OVERLOADABLE half tgamma(half x);
+OVERLOADABLE half lgamma(half x);
+OVERLOADABLE half lgamma_r(half x, global int *signgamp);
+OVERLOADABLE half lgamma_r(half x, local int *signgamp);
+OVERLOADABLE half lgamma_r(half x, private int *signgamp);
+OVERLOADABLE half log1p(half x);
+OVERLOADABLE half logb(half x);
+OVERLOADABLE int ilogb(half x);
+OVERLOADABLE half nan(ushort code);
+OVERLOADABLE half sincos(half x, global half *cosval);
+OVERLOADABLE half sincos(half x, local half *cosval);
+OVERLOADABLE half sincos(half x, private half *cosval);
+OVERLOADABLE half sqrt(half x);
+OVERLOADABLE half rsqrt(half x);
+OVERLOADABLE half frexp(half x, global int *exp);
+OVERLOADABLE half frexp(half x, local int *exp);
+OVERLOADABLE half frexp(half x, private int *exp);
+OVERLOADABLE half nextafter(half x, half y);
+OVERLOADABLE half modf(half x, global half *i);
+OVERLOADABLE half modf(half x, local half *i);
+OVERLOADABLE half modf(half x, private half *i);
+OVERLOADABLE half hypot(half x, half y);
+OVERLOADABLE half fract(half x, global half *p);
+OVERLOADABLE half fract(half x, local half *p);
+OVERLOADABLE half fract(half x, private half *p);
+OVERLOADABLE half remquo(half x, half y, global int *quo);
+OVERLOADABLE half remquo(half x, half y, local int *quo);
+OVERLOADABLE half remquo(half x, half y, private int *quo);
+OVERLOADABLE half pown(half x, int n);
+OVERLOADABLE half rootn(half x, int n);
+
+// native half
+OVERLOADABLE half native_cos(half x);
+OVERLOADABLE half native_divide(half x, half y);
+OVERLOADABLE half native_exp(half x);
+OVERLOADABLE half native_exp2(half x);
+OVERLOADABLE half native_exp10(half x);
+OVERLOADABLE half native_log(half x);
+OVERLOADABLE half native_log2(half x);
+OVERLOADABLE half native_log10(half x);
+OVERLOADABLE half native_powr(half x, half y);
+OVERLOADABLE half native_recip(half x);
+OVERLOADABLE half native_rsqrt(half x);
+OVERLOADABLE half native_sin(half x);
+OVERLOADABLE half native_sqrt(half x);
+OVERLOADABLE half native_tan(half x);
+
+
+// half accuracy
 #define half_cos cos
 #define half_divide native_divide
 #define half_exp native_exp
diff --git a/backend/src/libocl/tmpl/ocl_relational.tmpl.cl b/backend/src/libocl/tmpl/ocl_relational.tmpl.cl
index 1100815..f66b6c1 100644
--- a/backend/src/libocl/tmpl/ocl_relational.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_relational.tmpl.cl
@@ -68,7 +68,6 @@ OVERLOADABLE int isnormal(float x) {
   return (u.u < 0x7F800000) && (u.u >= 0x800000);
 }
 
-
 OVERLOADABLE int isordered(float x, float y) {
   return isequal(x, x) && isequal(y, y);
 }
@@ -82,6 +81,71 @@ OVERLOADABLE int signbit(float x) {
 }
 
 
+// Half float version.
+OVERLOADABLE int isequal(half x, half y) {
+  return x == y;
+}
+
+OVERLOADABLE int isnotequal(half x, half y) {
+  return x != y;
+}
+
+OVERLOADABLE int isgreater(half x, half y) {
+  return x > y;
+}
+
+OVERLOADABLE int isgreaterequal(half x, half y) {
+  return x >= y;
+}
+
+OVERLOADABLE int isless(half x, half y) {
+  return x < y;
+}
+
+OVERLOADABLE int islessequal(half x, half y) {
+  return x <= y;
+}
+
+OVERLOADABLE int islessgreater(half x, half y) {
+  return (x < y) || (x > y);
+}
+
+OVERLOADABLE int isfinite(half x) {
+  union { ushort u; half h; } u;
+  u.h = x;
+  return (u.u & 0x7FFF) < 0x7C00;
+}
+
+OVERLOADABLE int isinf(half x) {
+  union { ushort u; half h; } u;
+  u.h = x;
+  return (u.u & 0x7FFF) == 0x7C00;
+}
+
+OVERLOADABLE int isnan(half x) {
+  return x != x;
+}
+
+OVERLOADABLE int isnormal(half x) {
+  union { ushort u; half h; } u;
+  u.h = x;
+  u.u &= 0x7FFF;
+  return (u.u < 0x7C00) && (u.u >= 0x400);
+}
+
+OVERLOADABLE int isordered(half x, half y) {
+  return isequal(x, x) && isequal(y, y);
+}
+OVERLOADABLE int isunordered(half x, half y) {
+  return isnan(x) || isnan(y);
+}
+OVERLOADABLE int signbit(half x) {
+  union { ushort u; half h; } u;
+  u.h = x;
+  return u.u >> 15;
+}
+
+
 // any
 #define DEC1(type) OVERLOADABLE int any(type a) { return a<0; }
 #define DEC2(type) OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0; }
diff --git a/backend/src/libocl/tmpl/ocl_relational.tmpl.h b/backend/src/libocl/tmpl/ocl_relational.tmpl.h
index 9921317..0ec0cbe 100644
--- a/backend/src/libocl/tmpl/ocl_relational.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_relational.tmpl.h
@@ -34,11 +34,29 @@ OVERLOADABLE int isinf(float x);
 OVERLOADABLE int isnan(float x);
 OVERLOADABLE int isnormal(float x);
 
-
 OVERLOADABLE int isordered(float x, float y);
 OVERLOADABLE int isunordered(float x, float y);
 OVERLOADABLE int signbit(float x);
 
+// Half half version.
+OVERLOADABLE int isequal(half x, half y);
+OVERLOADABLE int isnotequal(half x, half y);
+OVERLOADABLE int isgreater(half x, half y);
+OVERLOADABLE int isgreaterequal(half x, half y);
+OVERLOADABLE int isless(half x, half y);
+OVERLOADABLE int islessequal(half x, half y);
+OVERLOADABLE int islessgreater(half x, half y);
+
+OVERLOADABLE int isfinite(half x);
+OVERLOADABLE int isinf(half x);
+OVERLOADABLE int isnan(half x);
+OVERLOADABLE int isnormal(half x);
+
+OVERLOADABLE int isordered(half x, half y);
+OVERLOADABLE int isunordered(half x, half y);
+OVERLOADABLE int signbit(half x);
+
+
 // any
 #define DEC1(type) OVERLOADABLE int any(type a);
 #define DEC2(type) OVERLOADABLE int any(type a);
@@ -94,6 +112,7 @@ DEF(char) DEF(uchar) DEF(short) DEF(ushort) DEF(int) DEF(uint)
 DEF(long) DEF(ulong)
 #undef DEF
 OVERLOADABLE float bitselect(float a, float b, float c);
+OVERLOADABLE half bitselect(half a, half b, half c);
 
 
 #define DEF(TYPE1, TYPE2) \
@@ -116,4 +135,6 @@ DEF(ulong, long)
 DEF(ulong, ulong)
 DEF(float, int)
 DEF(float, uint)
+DEF(half, short)
+DEF(half, ushort)
 #undef DEF
diff --git a/backend/src/libocl/include/ocl_printf.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
similarity index 61%
copy from backend/src/libocl/include/ocl_printf.h
copy to backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index ffeefb9..b9da5e2 100644
--- a/backend/src/libocl/include/ocl_printf.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2012 - 2014 Intel Corporation
+ * Copyright @ 2015 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -15,18 +15,5 @@
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
  */
-#ifndef __OCL_PRINTF_H__
-#define __OCL_PRINTF_H__
 
-#include "ocl_types.h"
-
-/* The printf function. */
-/* From LLVM 3.4, c string are all in constant address space */
-#if 100*__clang_major__ + __clang_minor__ < 304
-int __gen_ocl_printf_stub(const char * format, ...);
-#else
-int __gen_ocl_printf_stub(constant char * format, ...);
-#endif
-#define printf __gen_ocl_printf_stub
-
-#endif
+#include "ocl_simd.h"
diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
similarity index 66%
copy from backend/src/libocl/include/ocl_sync.h
copy to backend/src/libocl/tmpl/ocl_simd.tmpl.h
index ed7c6e4..67a1cee 100644
--- a/backend/src/libocl/include/ocl_sync.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2012 - 2014 Intel Corporation
+ * Copyright © 2015 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -15,21 +15,20 @@
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
  */
-#ifndef __OCL_SYNC_H__
-#define __OCL_SYNC_H__
+#ifndef __OCL_SIMD_H__
+#define __OCL_SIMD_H__
 
 #include "ocl_types.h"
 
 /////////////////////////////////////////////////////////////////////////////
-// Synchronization functions
+// SIMD level function
 /////////////////////////////////////////////////////////////////////////////
-#define CLK_LOCAL_MEM_FENCE  (1 << 0)
-#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+int sub_group_any(int);
+int sub_group_all(int);
 
-typedef uint cl_mem_fence_flags;
-void barrier(cl_mem_fence_flags flags);
-void mem_fence(cl_mem_fence_flags flags);
-void read_mem_fence(cl_mem_fence_flags flags);
-void write_mem_fence(cl_mem_fence_flags flags);
+uint get_sub_group_size(void);
+uint get_sub_group_id(void);
 
-#endif  /* __OCL_SYNC_H__ */
+OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
+OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 17248c0..ebf4386 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -96,8 +96,7 @@ namespace gbe
             call->getCalledFunction()->getIntrinsicID() != 0)
           continue;
 
-        Value *Callee = call->getCalledValue();
-        const std::string fnName = Callee->getName();
+        std::string fnName = call->getCalledValue()->stripPointerCasts()->getName();
 
         if (!MFS.insert(fnName).second) {
           continue;
@@ -237,6 +236,10 @@ namespace gbe
       kernels.push_back(f);
     }
 
+    /* the SPIR binary datalayout maybe different with beignet's bitcode */
+    if(clonedLib->getDataLayout() != mod->getDataLayout())
+      mod->setDataLayout(clonedLib->getDataLayout());
+
     /* We use beignet's bitcode as dst because it will have a lot of
        lazy functions which will not be loaded. */
     char* errorMsg;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 0487bcb..4905415 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -87,6 +87,7 @@
 #endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
 #include "llvm/PassManager.h"
+#include "llvm/IR/IRBuilder.h"
 #if LLVM_VERSION_MINOR <= 2
 #include "llvm/Intrinsics.h"
 #include "llvm/IntrinsicInst.h"
@@ -154,6 +155,7 @@
 #include "llvm/llvm_gen_backend.hpp"
 #include "ir/context.hpp"
 #include "ir/unit.hpp"
+#include "ir/half.hpp"
 #include "ir/liveness.hpp"
 #include "ir/value.hpp"
 #include "sys/set.hpp"
@@ -182,6 +184,7 @@ namespace gbe
   static bool isScalarType(const Type *type)
   {
     return type->isFloatTy()   ||
+           type->isHalfTy()    ||
            type->isIntegerTy() ||
            type->isDoubleTy()  ||
            type->isPointerTy();
@@ -193,6 +196,8 @@ namespace gbe
     GBE_ASSERT(isScalarType(type));
     if (type->isFloatTy() == true)
       return ir::TYPE_FLOAT;
+    if (type->isHalfTy() == true)
+      return ir::TYPE_HALF;
     if (type->isDoubleTy() == true)
       return ir::TYPE_DOUBLE;
     if (type->isPointerTy() == true) {
@@ -241,7 +246,7 @@ namespace gbe
       return ir::FAMILY_BOOL;
     if (type == Type::getInt8Ty(type->getContext()))
       return ir::FAMILY_BYTE;
-    if (type == Type::getInt16Ty(type->getContext()))
+    if (type == Type::getInt16Ty(type->getContext()) || type->isHalfTy())
       return ir::FAMILY_WORD;
     if (type == Type::getInt32Ty(type->getContext()) || type->isFloatTy())
       return ir::FAMILY_DWORD;
@@ -290,11 +295,8 @@ namespace gbe
     return ir::MEM_GLOBAL;
   }
 
-  static INLINE ir::AddressSpace btiToGen(const ir::BTI &bti) {
-    if (bti.count > 1)
-      return ir::MEM_MIXED;
-    uint8_t singleBti = bti.bti[0];
-    switch (singleBti) {
+  static INLINE ir::AddressSpace btiToGen(const unsigned bti) {
+    switch (bti) {
       case BTI_CONSTANT: return ir::MEM_CONSTANT;
       case BTI_PRIVATE: return  ir::MEM_PRIVATE;
       case BTI_LOCAL: return ir::MEM_LOCAL;
@@ -361,6 +363,7 @@ namespace gbe
       switch (typeID) {
         case Type::IntegerTyID:
         case Type::FloatTyID:
+        case Type::HalfTyID:
         case Type::DoubleTyID:
         case Type::PointerTyID:
           GBE_ASSERT(index == 0);
@@ -373,6 +376,7 @@ namespace gbe
           auto elementTypeID = elementType->getTypeID();
           if (elementTypeID != Type::IntegerTyID &&
               elementTypeID != Type::FloatTyID &&
+              elementTypeID != Type::HalfTyID &&
               elementTypeID != Type::DoubleTyID)
             GBE_ASSERTM(false, "Vectors of elements are not supported");
             return this->_newScalar(value, key, elementType, index, uniform);
@@ -385,6 +389,7 @@ namespace gbe
           auto elementTypeID = elementType->getTypeID();
           if (elementTypeID != Type::IntegerTyID &&
               elementTypeID != Type::FloatTyID &&
+              elementTypeID != Type::HalfTyID &&
               elementTypeID != Type::DoubleTyID)
             GBE_ASSERTM(false, "Strcuts of elements are not supported");
             return this->_newScalar(value, key, elementType, index, uniform);
@@ -485,7 +490,14 @@ namespace gbe
 
     map<Value *, SmallVector<Value *, 4>> pointerOrigMap;
     typedef map<Value *, SmallVector<Value *, 4>>::iterator PtrOrigMapIter;
-
+    // map pointer source to bti
+    map<Value *, unsigned> BtiMap;
+    // map ptr to its bti register
+    map<Value *, Value *> BtiValueMap;
+    // map ptr to its base
+    map<Value *, Value *> pointerBaseMap;
+    std::set<Value *> addrStoreInst;
+    typedef map<Value *, Value *>::iterator PtrBaseMapIter;
     /*! We visit each function twice. Once to allocate the registers and once to
      *  emit the Gen IR instructions
      */
@@ -501,6 +513,7 @@ namespace gbe
     } ConstTypeId;
 
     LoopInfo *LI;
+    Function *Func;
     const Module *TheModule;
     int btiBase;
   public:
@@ -547,23 +560,41 @@ namespace gbe
       bool bKernel = isKernelFunction(F);
       if(!bKernel) return false;
 
+      Func = &F;
+      assignBti(F);
       analyzePointerOrigin(F);
+
       LI = &getAnalysis<LoopInfo>();
       emitFunction(F);
       phiMap.clear();
       globalPointer.clear();
       pointerOrigMap.clear();
+      BtiMap.clear();
+      BtiValueMap.clear();
+      pointerBaseMap.clear();
+      addrStoreInst.clear();
       // Reset for next function
       btiBase = BTI_RESERVED_NUM;
       return false;
     }
     /*! Given a possible pointer value, find out the interested escape like
         load/store or atomic instruction */
-    void findPointerEscape(Value *ptr);
+    void findPointerEscape(Value *ptr, std::set<Value *> &mixedPtr, bool recordMixed, std::vector<Value *> &revisit);
     /*! For all possible pointers, GlobalVariable, function pointer argument,
         alloca instruction, find their pointer escape points */
     void analyzePointerOrigin(Function &F);
-
+    unsigned getNewBti(Value *origin, bool isImage);
+    void assignBti(Function &F);
+    bool isSingleBti(Value *Val);
+    Value *getBtiRegister(Value *v);
+    /*! get the pointer origin */
+    Value *getSinglePointerOrigin(Value *ptr);
+    /*! get the bti base address */
+    Value *getPointerBase(Value *ptr);
+    void processPointerArray(Value *ptr, Value *bti, Value *base);
+    void handleStoreLoadAddress(Function &F);
+
+    MDNode *getKernelFunctionMetadata(Function *F);
     virtual bool doFinalization(Module &M) { return false; }
     /*! handle global variable register allocation (local, constant space) */
     void allocateGlobalVariableRegister(Function &F);
@@ -660,10 +691,10 @@ namespace gbe
     // batch vec4/8/16 load/store
     INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
                   Value *llvmValue, const ir::Register ptr,
-                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
-                  bool dwAligned);
+                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::Register bti,
+                  bool dwAligned, bool fixedBTI);
     // handle load of dword/qword with unaligned address
-    void emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned);
+    void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI);
     void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
     private:
       ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
@@ -675,10 +706,50 @@ namespace gbe
 
   char GenWriter::ID = 0;
 
-  void GenWriter::findPointerEscape(Value *ptr) {
+  static void updatePointerSource(Value *parent, Value *theUser, Value *source, SmallVector<Value *, 4> &pointers) {
+    if (isa<SelectInst>(theUser)) {
+      SelectInst *si = dyn_cast<SelectInst>(theUser);
+      if (si->getTrueValue() == parent)
+        pointers[0] = source;
+      else
+        pointers[1] = source;
+    } else if (isa<PHINode>(theUser)) {
+      PHINode *phi = dyn_cast<PHINode>(theUser);
+      unsigned opNum = phi->getNumIncomingValues();
+      for (unsigned j = 0; j < opNum; j++) {
+        if (phi->getIncomingValue(j) == parent) {
+          pointers[j] = source;
+        }
+      }
+    } else {
+      pointers[0] = source;
+    }
+  }
+
+  bool isMixedPoint(Value *val, SmallVector<Value *, 4> &pointers) {
+    Value *validSrc = NULL;
+    unsigned i = 0;
+    if (pointers.size() < 2) return false;
+    while(i < pointers.size()) {
+      if (pointers[i] != NULL && validSrc != NULL && pointers[i] != validSrc)
+        return true;
+      // when source is same as itself, we don't treat it as a new source
+      // this often occurs for PHINode
+      if (pointers[i] != NULL && validSrc == NULL && pointers[i] != val) {
+        validSrc = pointers[i];
+      }
+      i++;
+    }
+    return false;
+  }
+
+  void GenWriter::findPointerEscape(Value *ptr,  std::set<Value *> &mixedPtr, bool bFirstPass, std::vector<Value *> &revisit) {
     std::vector<Value*> workList;
     std::set<Value *> visited;
+    // a loadInst result may be used as a pointer
+    std::set<LoadInst *> ptrCandidate;
 
+    bool isPointerArray = false;
     if (ptr->use_empty()) return;
 
     workList.push_back(ptr);
@@ -686,7 +757,6 @@ namespace gbe
     for (unsigned i = 0; i < workList.size(); i++) {
       Value *work = workList[i];
       if (work->use_empty()) continue;
-
       for (Value::use_iterator iter = work->use_begin(); iter != work->use_end(); ++iter) {
       // After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
       // which is more straightforward.
@@ -695,10 +765,97 @@ namespace gbe
   #else
         User *theUser = iter->getUser();
   #endif
-        if (visited.find(theUser) != visited.end()) continue;
+        // be careful with the sub operation
+        if (isa<BinaryOperator>(theUser) && dyn_cast<BinaryOperator>(theUser)->getOpcode() == Instruction::Sub) {
+          // check both comes from ptrtoInt, don't need to traverse ptrdiff
+          Value *op0 = theUser->getOperand(0);
+          Value *op1 = theUser->getOperand(1);
+          if ((isa<Instruction>(op0) && dyn_cast<Instruction>(op0)->getOpcode() == Instruction::PtrToInt)
+              &&(isa<Instruction>(op1) && dyn_cast<Instruction>(op1)->getOpcode() == Instruction::PtrToInt)) {
+            continue;
+          }
+        }
+
+        if (isa<Instruction>(theUser)) {
+          // some GlobalVariable maybe used in the function which is not current processed.
+          // such kind of user should be skipped
+          if (dyn_cast<Instruction>(theUser)->getParent()->getParent() != Func)
+            continue;
+        }
+
+        bool visitedInThisSource = visited.find(theUser) != visited.end();
+
+        if (isa<SelectInst>(theUser) || isa<PHINode>(theUser))
+        {
+          // reached from another source, update pointer source
+          PtrOrigMapIter ptrIter = pointerOrigMap.find(theUser);
+          if (ptrIter == pointerOrigMap.end()) {
+            // create new one
+            unsigned capacity = 1;
+            if (isa<SelectInst>(theUser)) capacity = 2;
+            if (isa<PHINode>(theUser)) {
+              PHINode *phi = dyn_cast<PHINode>(theUser);
+              capacity = phi->getNumIncomingValues();
+            }
+
+            SmallVector<Value *, 4> pointers;
+
+            unsigned k = 0;
+            while (k++ < capacity) {
+              pointers.push_back(NULL);
+            }
+
+            updatePointerSource(work, theUser, ptr, pointers);
+            pointerOrigMap.insert(std::make_pair(theUser, pointers));
+          } else {
+            // update pointer source
+            updatePointerSource(work, theUser, ptr, (*ptrIter).second);
+          }
+          ptrIter = pointerOrigMap.find(theUser);
+
+          if (isMixedPoint(theUser, (*ptrIter).second)) {
+            // for the first pass, we need to record the mixed point instruction.
+            // for the second pass, we don't need to go further, the reason is:
+            // we always use its 'direct mixed pointer parent' as origin, if we don't
+            // stop here, we may set wrong pointer origin.
+            if (bFirstPass)
+              mixedPtr.insert(theUser);
+            else
+              continue;
+          }
+          // don't fall into dead loop,
+          if (visitedInThisSource || theUser == ptr) {
+            continue;
+          }
+        }
+
         // pointer address is used as the ValueOperand in store instruction, should be skipped
-        if (StoreInst *load = dyn_cast<StoreInst>(theUser)) {
-          if (load->getValueOperand() == work) {
+        if (StoreInst *store = dyn_cast<StoreInst>(theUser)) {
+          if (store->getValueOperand() == work) {
+            addrStoreInst.insert(store);
+            Value * pointerOperand = store->getPointerOperand();
+            // check whether the pointerOperand already visited or not,
+            // if not visited, then we need to record all the loadInst
+            // on the origin of pointerOperand
+            // if visited, that is the origin of the pointerOperand already
+            // traversed, we need to traverse it again to record all the LoadInst
+            PtrOrigMapIter pointerOpIter = pointerOrigMap.find(pointerOperand);
+            bool pointerVisited = pointerOpIter != pointerOrigMap.end();
+            if (pointerVisited) {
+              revisit.push_back((*pointerOpIter).second[0]);
+            }
+
+            PtrOrigMapIter ptrIter = pointerOrigMap.find(work);
+            if (ptrIter == pointerOrigMap.end()) {
+              // create new one
+              SmallVector<Value *, 4> pointers;
+              pointers.push_back(ptr);
+              pointerOrigMap.insert(std::make_pair(work, pointers));
+            } else {
+              // update the pointer source here,
+              (*ptrIter).second[0] = ptr;
+            }
+
             continue;
           }
         }
@@ -710,16 +867,412 @@ namespace gbe
             Function *F = dyn_cast<CallInst>(theUser)->getCalledFunction();
             if (!F || F->getIntrinsicID() != 0) continue;
           }
+          Value *pointer = NULL;
+          if (isa<LoadInst>(theUser)) {
+            ptrCandidate.insert(cast<LoadInst>(theUser));
+            pointer = dyn_cast<LoadInst>(theUser)->getPointerOperand();
+          } else if (isa<StoreInst>(theUser)) {
+            pointer = dyn_cast<StoreInst>(theUser)->getPointerOperand();
+            // Check whether we have stored an address to this pointer
+            // if yes, we need to traverse the ptrCandidate, as they are loaded pointers
+            if (addrStoreInst.find(theUser) != addrStoreInst.end()) {
+              isPointerArray = true;
+            }
+          } else if (isa<CallInst>(theUser)) {
+            // atomic/read(write)image
+            CallInst *ci = dyn_cast<CallInst>(theUser);
+            pointer = ci->getArgOperand(0);
+          } else {
+            theUser->dump();
+            GBE_ASSERT(0 && "Unknown instruction operating on pointers\n");
+          }
 
-          PtrOrigMapIter ptrIter = pointerOrigMap.find(theUser);
+          // the pointer operand is same as pointer origin, don't add to pointerOrigMap
+          if (ptr == pointer) continue;
+
+          // load/store/atomic instruction, we have reached the end, stop further traversing
+          PtrOrigMapIter ptrIter = pointerOrigMap.find(pointer);
           if (ptrIter == pointerOrigMap.end()) {
             // create new one
             SmallVector<Value *, 4> pointers;
             pointers.push_back(ptr);
-            pointerOrigMap.insert(std::make_pair(theUser, pointers));
+            pointerOrigMap.insert(std::make_pair(pointer, pointers));
           } else {
-            // append it
-            (*ptrIter).second.push_back(ptr);
+            // update the pointer source here,
+            (*ptrIter).second[0] = ptr;
+          }
+        } else {
+          workList.push_back(theUser);
+        }
+      }
+    }
+
+    if (isPointerArray) {
+      GBE_ASSERT((isa<AllocaInst>(ptr) || ptrCandidate.empty())
+                && "storing/loading pointers only support private array");
+      for (auto x : ptrCandidate) {
+        revisit.push_back(x);
+      }
+    }
+    ptrCandidate.clear();
+  }
+
+  bool GenWriter::isSingleBti(Value *Val) {
+    // self + others same --> single
+    // all same  ---> single
+    if (!isa<SelectInst>(Val) && !isa<PHINode>(Val)) {
+      return true;
+    } else {
+      PtrOrigMapIter iter = pointerOrigMap.find(Val);
+      SmallVector<Value *, 4> &pointers = (*iter).second;
+      unsigned srcNum = pointers.size();
+      Value *source = NULL;
+      for (unsigned x = 0; x < srcNum; x++) {
+        // often happens in a PHINode where one source is same as the PHINode itself, skip it
+        if (pointers[x] == Val) continue;
+
+        if (source == NULL) source = pointers[x];
+        else {
+          if (source != pointers[x])
+            return false;
+        }
+      }
+      return true;
+    }
+  }
+  Value *GenWriter::getPointerBase(Value *ptr) {
+    PtrBaseMapIter baseIter = pointerBaseMap.find(ptr);
+    if (baseIter != pointerBaseMap.end()) {
+      return baseIter->second;
+    }
+    typedef std::map<Value *, unsigned>::iterator BtiIter;
+    // for pointers that already assigned a bti, it is the base pointer,
+    BtiIter found = BtiMap.find(ptr);
+    if (found != BtiMap.end()) {
+      if (isa<PointerType>(ptr->getType())) {
+        PointerType *ty = cast<PointerType>(ptr->getType());
+        // only global pointer will have starting address
+        if (ty->getAddressSpace() == 1) {
+          return ptr;
+        } else {
+          return ConstantPointerNull::get(ty);
+        }
+      } else {
+          PointerType *ty = PointerType::get(ptr->getType(), 0);
+          return ConstantPointerNull::get(ty);
+      }
+    }
+
+    PtrOrigMapIter iter = pointerOrigMap.find(ptr);
+    SmallVector<Value *, 4> &pointers = (*iter).second;
+    if (isSingleBti(ptr)) {
+      Value *base = getPointerBase(pointers[0]);
+      pointerBaseMap.insert(std::make_pair(ptr, base));
+      return base;
+    } else {
+      if (isa<SelectInst>(ptr)) {
+          SelectInst *si = dyn_cast<SelectInst>(ptr);
+          IRBuilder<> Builder(si->getParent());
+
+          Value *trueVal = getPointerBase((*iter).second[0]);
+          Value *falseVal = getPointerBase((*iter).second[1]);
+          Builder.SetInsertPoint(si);
+          Value *base = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
+          pointerBaseMap.insert(std::make_pair(ptr, base));
+        return base;
+      } else if (isa<PHINode>(ptr)) {
+          PHINode *phi = dyn_cast<PHINode>(ptr);
+          IRBuilder<> Builder(phi->getParent());
+          Builder.SetInsertPoint(phi);
+
+          PHINode *basePhi = Builder.CreatePHI(ptr->getType(), phi->getNumIncomingValues());
+          unsigned srcNum = pointers.size();
+          for (unsigned x = 0; x < srcNum; x++) {
+            Value *base = NULL;
+            if (pointers[x] != ptr) {
+              base = getPointerBase(pointers[x]);
+            } else {
+              base = basePhi;
+            }
+            IRBuilder<> Builder2(phi->getIncomingBlock(x));
+            BasicBlock *predBB = phi->getIncomingBlock(x);
+            if (predBB->getTerminator())
+              Builder2.SetInsertPoint(predBB->getTerminator());
+
+#if (LLVM_VERSION_MAJOR== 3 && LLVM_VERSION_MINOR < 6)
+  // llvm 3.5 and older version don't have CreateBitOrPointerCast() define
+            Type *srcTy = base->getType();
+            Type *dstTy = ptr->getType();
+            if (srcTy->isPointerTy() && dstTy->isIntegerTy())
+              base = Builder2.CreatePtrToInt(base, dstTy);
+            else if (srcTy->isIntegerTy() && dstTy->isPointerTy())
+              base = Builder2.CreateIntToPtr(base, dstTy);
+            else if (srcTy != dstTy)
+              base = Builder2.CreateBitCast(base, dstTy);
+#else
+            base = Builder2.CreateBitOrPointerCast(base, ptr->getType());
+#endif
+            basePhi->addIncoming(base, phi->getIncomingBlock(x));
+          }
+          pointerBaseMap.insert(std::make_pair(ptr, basePhi));
+          return basePhi;
+      } else {
+        ptr->dump();
+        GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n");
+        return ptr;
+      }
+    }
+  }
+
+  Value *GenWriter::getSinglePointerOrigin(Value *ptr) {
+    typedef std::map<Value *, unsigned>::iterator BtiIter;
+    // for pointers that already assigned a bti, it is the pointer origin,
+    BtiIter found = BtiMap.find(ptr);
+    if (found != BtiMap.end())
+      return ptr;
+    PtrOrigMapIter iter = pointerOrigMap.find(ptr);
+    GBE_ASSERT(iter != pointerOrigMap.end());
+    return iter->second[0];
+  }
+
+  Value *GenWriter::getBtiRegister(Value *Val) {
+    typedef std::map<Value *, unsigned>::iterator BtiIter;
+    typedef std::map<Value *, Value *>::iterator BtiValueIter;
+    BtiIter found = BtiMap.find(Val);
+    BtiValueIter valueIter = BtiValueMap.find(Val);
+    if (valueIter != BtiValueMap.end())
+      return valueIter->second;
+
+    if (found != BtiMap.end()) {
+      // the Val already got assigned an BTI, return it
+      Value *bti = ConstantInt::get(IntegerType::get(Val->getContext(), 32), found->second);
+      BtiValueMap.insert(std::make_pair(Val, bti));
+      return bti;
+    } else {
+      if (isSingleBti(Val)) {
+        PtrOrigMapIter iter = pointerOrigMap.find(Val);
+        GBE_ASSERT(iter != pointerOrigMap.end());
+        Value * bti = getBtiRegister((*iter).second[0]);
+        BtiValueMap.insert(std::make_pair(Val, bti));
+        return bti;
+      } else {
+        if (isa<SelectInst>(Val)) {
+          SelectInst *si = dyn_cast<SelectInst>(Val);
+
+          IRBuilder<> Builder(si->getParent());
+          PtrOrigMapIter iter = pointerOrigMap.find(Val);
+          GBE_ASSERT(iter != pointerOrigMap.end());
+          Value *trueVal = getBtiRegister((*iter).second[0]);
+          Value *falseVal = getBtiRegister((*iter).second[1]);
+          Builder.SetInsertPoint(si);
+          Value *bti = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
+          BtiValueMap.insert(std::make_pair(Val, bti));
+          return bti;
+        } else if (isa<PHINode>(Val)) {
+          PHINode *phi = dyn_cast<PHINode>(Val);
+          IRBuilder<> Builder(phi->getParent());
+          Builder.SetInsertPoint(phi);
+
+          PHINode *btiPhi = Builder.CreatePHI(IntegerType::get(Val->getContext(), 32), phi->getNumIncomingValues());
+          PtrOrigMapIter iter = pointerOrigMap.find(Val);
+          GBE_ASSERT(iter != pointerOrigMap.end());
+          SmallVector<Value *, 4> &pointers = (*iter).second;
+          unsigned srcNum = pointers.size();
+          for (unsigned x = 0; x < srcNum; x++) {
+            Value *bti = NULL;
+            if (pointers[x] != Val) {
+              bti = getBtiRegister(pointers[x]);
+            } else {
+              bti = btiPhi;
+            }
+            btiPhi->addIncoming(bti, phi->getIncomingBlock(x));
+          }
+          BtiValueMap.insert(std::make_pair(Val, btiPhi));
+          return btiPhi;
+        } else {
+          Val->dump();
+          GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n");
+          return Val;
+        }
+      }
+    }
+  }
+
+  unsigned GenWriter::getNewBti(Value *origin, bool isImage) {
+    unsigned new_bti = 0;
+    if (isImage) {
+      new_bti = btiBase;
+      incBtiBase();
+      return new_bti;
+    }
+
+    if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
+      new_bti = btiBase;
+      incBtiBase();
+    } else if (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
+      new_bti = btiBase;
+      incBtiBase();
+    }
+    else if (isa<GlobalVariable>(origin)
+        && dyn_cast<GlobalVariable>(origin)->isConstant()) {
+      new_bti = BTI_CONSTANT;
+    } else {
+      unsigned space = origin->getType()->getPointerAddressSpace();
+      switch (space) {
+        case 0:
+          new_bti = BTI_PRIVATE;
+          break;
+        case 1:
+        {
+          new_bti = btiBase;
+          incBtiBase();
+          break;
+        }
+        case 2:
+          new_bti = BTI_CONSTANT;
+
+          break;
+        case 3:
+          new_bti = BTI_LOCAL;
+          break;
+        default:
+          GBE_ASSERT(0);
+          break;
+      }
+    }
+    return new_bti;
+  }
+
+  MDNode *GenWriter::getKernelFunctionMetadata(Function *F) {
+    NamedMDNode *clKernels = TheModule->getNamedMetadata("opencl.kernels");
+     uint32_t ops = clKernels->getNumOperands();
+      for(uint32_t x = 0; x < ops; x++) {
+        MDNode* node = clKernels->getOperand(x);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+        Value * op = node->getOperand(0);
+#else
+        auto *V = cast<ValueAsMetadata>(node->getOperand(0));
+        Value *op = V ? V->getValue() : NULL;
+#endif
+        if(op == F) {
+          return node;
+        }
+      }
+    return NULL;
+  }
+
+  void GenWriter::assignBti(Function &F) {
+    Module::GlobalListType &globalList = const_cast<Module::GlobalListType &> (TheModule->getGlobalList());
+    for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+      GlobalVariable &v = *i;
+      if(!v.isConstantUsed()) continue;
+
+      BtiMap.insert(std::make_pair(&v, getNewBti(&v, false)));
+    }
+    MDNode *typeNameNode = NULL;
+    MDNode *node = getKernelFunctionMetadata(&F);
+    for(uint j = 0; j < node->getNumOperands() - 1; j++) {
+      MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
+      if (attrNode == NULL) break;
+      MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
+      if (!attrName) continue;
+      if (attrName->getString() == "kernel_arg_type") {
+        typeNameNode = attrNode;
+      }
+    }
+
+    unsigned argID = 0;
+    ir::FunctionArgument::InfoFromLLVM llvmInfo;
+    for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, argID++) {
+      llvmInfo.typeName= (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+      bool isImage = llvmInfo.isImageType();
+      if (I->getType()->isPointerTy() || isImage) {
+        BtiMap.insert(std::make_pair(I, getNewBti(I, isImage)));
+      }
+    }
+
+    BasicBlock &bb = F.getEntryBlock();
+    for (BasicBlock::iterator iter = bb.begin(), iterE = bb.end(); iter != iterE; ++iter) {
+      if (AllocaInst *ai = dyn_cast<AllocaInst>(iter)) {
+        BtiMap.insert(std::make_pair(ai, BTI_PRIVATE));
+      }
+    }
+  }
+
+  void GenWriter::processPointerArray(Value *ptr, Value *bti, Value *base) {
+    std::vector<Value*> workList;
+    std::set<Value *> visited;
+
+    if (ptr->use_empty()) return;
+
+    workList.push_back(ptr);
+
+    for (unsigned i = 0; i < workList.size(); i++) {
+      Value *work = workList[i];
+      if (work->use_empty()) continue;
+
+      for (Value::use_iterator iter = work->use_begin(); iter != work->use_end(); ++iter) {
+      // After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
+      // which is more straightforward.
+  #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+        User *theUser = *iter;
+  #else
+        User *theUser = iter->getUser();
+  #endif
+        if(visited.find(theUser) != visited.end())
+          continue;
+
+        visited.insert(theUser);
+
+        if (isa<LoadInst>(theUser) || isa<StoreInst>(theUser) || isa<CallInst>(theUser)) {
+          if (isa<CallInst>(theUser)) {
+            Function *F = dyn_cast<CallInst>(theUser)->getCalledFunction();
+            if (!F || F->getIntrinsicID() != 0) continue;
+          }
+          bool isLoad; Value *pointerOp;
+
+          IRBuilder<> Builder(cast<Instruction>(theUser)->getParent());
+          if (isa<LoadInst>(theUser)) {
+            pointerOp = dyn_cast<LoadInst>(theUser)->getPointerOperand();
+            isLoad = true;
+          } else {
+            pointerOp = dyn_cast<StoreInst>(theUser)->getPointerOperand();
+            isLoad = false;
+          }
+          Builder.SetInsertPoint(cast<Instruction>(theUser));
+
+          Type *int32Ty = Type::getInt32Ty(ptr->getContext());
+          Value *v1 = Builder.CreatePtrToInt(pointerOp, int32Ty);
+
+          Value *v2 = Builder.CreatePtrToInt(getSinglePointerOrigin(pointerOp), int32Ty);
+          Value *v3 = Builder.CreatePtrToInt(base, int32Ty);
+          Value *v4 = Builder.CreatePtrToInt(bti, int32Ty);
+          // newLocBase = (pointer - origin) + base_start
+          Value *diff = Builder.CreateSub(v1, v2);
+          Value *newLocBase = Builder.CreateAdd(v3, diff);
+          newLocBase = Builder.CreateIntToPtr(newLocBase, Type::getInt32PtrTy(ptr->getContext()));
+          // newLocBti = (pointer - origin) + bti_start
+          Value *newLocBti = Builder.CreateAdd(v4, diff);
+          newLocBti = Builder.CreateIntToPtr(newLocBti, Type::getInt32PtrTy(ptr->getContext()));
+
+          // later GenWriter instruction translation needs this map info
+          BtiValueMap.insert(std::make_pair(newLocBti, ConstantInt::get(Type::getInt32Ty(ptr->getContext()), BTI_PRIVATE)));
+          pointerBaseMap.insert(std::make_pair(newLocBti, ConstantPointerNull::get(cast<PointerType>(pointerOp->getType()))));
+
+          BtiValueMap.insert(std::make_pair(newLocBase, ConstantInt::get(Type::getInt32Ty(ptr->getContext()), BTI_PRIVATE)));
+          pointerBaseMap.insert(std::make_pair(newLocBase, ConstantPointerNull::get(cast<PointerType>(pointerOp->getType()))));
+
+          if (isLoad) {
+            Value *loadedBase = Builder.CreateLoad(newLocBase);
+            Value *loadedBti = Builder.CreateLoad(newLocBti);
+
+            BtiValueMap.insert(std::make_pair(theUser, loadedBti));
+            pointerBaseMap.insert(std::make_pair(theUser, loadedBase));
+          } else {
+            Value *valueOp = cast<StoreInst>(theUser)->getValueOperand();
+            Value *tmp = Builder.CreatePtrToInt(getPointerBase(valueOp), Type::getInt32Ty(ptr->getContext()));
+            Builder.CreateStore(tmp, newLocBase);
+            Builder.CreateStore(getBtiRegister(valueOp), newLocBti);
           }
         } else {
           workList.push_back(theUser);
@@ -729,26 +1282,82 @@ namespace gbe
   }
 
   void GenWriter::analyzePointerOrigin(Function &F) {
+    // used to record where the pointers get mixed (i.e. select or phi instruction)
+    std::set<Value *> mixedPtr;
+    // This is a two-pass algorithm, the 1st pass will try to update the pointer sources for
+    // every instruction reachable from pointers and record mix-point in this pass.
+    // The second pass will start from really mixed-pointer instruction like select or phinode.
+    // and update the sources correctly. For pointers reachable from mixed-pointer, we will set
+    // its direct mixed-pointer parent as its pointer origin.
+
+    std::vector<Value *> revisit;
     // GlobalVariable
     Module::GlobalListType &globalList = const_cast<Module::GlobalListType &> (TheModule->getGlobalList());
     for(auto i = globalList.begin(); i != globalList.end(); i ++) {
       GlobalVariable &v = *i;
       if(!v.isConstantUsed()) continue;
-      findPointerEscape(&v);
+      findPointerEscape(&v, mixedPtr, true, revisit);
     }
     // function argument
     for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
       if (I->getType()->isPointerTy()) {
-        findPointerEscape(I);
+        findPointerEscape(I, mixedPtr, true, revisit);
       }
     }
     // alloca
     BasicBlock &bb = F.getEntryBlock();
     for (BasicBlock::iterator iter = bb.begin(), iterE = bb.end(); iter != iterE; ++iter) {
       if (AllocaInst *ai = dyn_cast<AllocaInst>(iter)) {
-        findPointerEscape(ai);
+        findPointerEscape(ai, mixedPtr, true, revisit);
       }
     }
+    // storing/loading pointer would introduce revisit
+    for (std::vector<Value *>::iterator iter = revisit.begin(); iter != revisit.end(); ++iter) {
+      findPointerEscape(*iter, mixedPtr, true, revisit);
+    }
+
+    // the second pass starts from mixed pointer
+    for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
+      findPointerEscape(*iter, mixedPtr, false, revisit);
+    }
+
+    for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
+      getBtiRegister(*iter);
+    }
+
+    for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
+      getPointerBase(*iter);
+    }
+    handleStoreLoadAddress(F);
+  }
+  void GenWriter::handleStoreLoadAddress(Function &F) {
+    std::set<Value *> processed;
+    for (std::set<Value *>::iterator iter = addrStoreInst.begin(); iter != addrStoreInst.end(); ++iter) {
+      StoreInst *store = cast<StoreInst>(*iter);
+      Value *pointerOp = store->getPointerOperand();
+      Value *base = getSinglePointerOrigin(pointerOp);
+      if (processed.find(base) != processed.end()) {
+        continue;
+      }
+      processed.insert(base);
+
+      if (!isa<AllocaInst>(base)) continue;
+
+      Value *ArraySize = cast<AllocaInst>(base)->getArraySize();
+
+      BasicBlock &entry = F.getEntryBlock();
+      BasicBlock::iterator bbIter = entry.begin();
+      while (isa<AllocaInst>(bbIter)) ++bbIter;
+
+      IRBuilder<> Builder(&entry);
+      Builder.SetInsertPoint(bbIter);
+
+      PointerType * AITy = cast<AllocaInst>(base)->getType();
+      Value * btiArray = Builder.CreateAlloca(AITy->getElementType(), ArraySize, base->getName() + ".bti");
+      Value * pointerBaseArray = Builder.CreateAlloca(AITy->getElementType(), ArraySize, base->getName() + ".pointer-base");
+
+      processPointerArray(base, btiArray, pointerBaseArray);
+    }
   }
 
   void getSequentialData(const ConstantDataSequential *cda, void *ptr, uint32_t &offset) {
@@ -942,6 +1551,8 @@ namespace gbe
         return processSeqConstant<float>(seq, index, CONST_FLOAT);
       } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
         return processSeqConstant<double>(seq, index, CONST_DOUBLE);
+      } else if (Ty == Type::getHalfTy(CPV->getContext())) {
+        GBE_ASSERTM(0, "Const data array never be half float\n");
       }
     } else
 #endif /* LLVM_VERSION_MINOR > 0 */
@@ -968,6 +1579,9 @@ namespace gbe
       } else if (Ty == Type::getFloatTy(CPV->getContext())) {
         const float f32 = 0;
         return ctx.newImmediate(f32);
+      } else if (Ty == Type::getHalfTy(CPV->getContext())) {
+        const ir::half f16 = 0;
+        return ctx.newImmediate(f16);
       } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
         const double f64 = 0;
         return ctx.newImmediate(f64);
@@ -1021,6 +1635,7 @@ namespace gbe
         if (Ty == Type::getInt32Ty(CPV->getContext())) return ctx.newImmediate((uint32_t)0);
         if (Ty == Type::getInt64Ty(CPV->getContext())) return ctx.newImmediate((uint64_t)0);
         if (Ty == Type::getFloatTy(CPV->getContext())) return ctx.newImmediate((float)0);
+        if (Ty == Type::getHalfTy(CPV->getContext())) return ctx.newImmediate((ir::half)0);
         if (Ty == Type::getDoubleTy(CPV->getContext())) return ctx.newImmediate((double)0);
         GBE_ASSERT(0 && "Unsupported undef value type.\n");
       }
@@ -1028,6 +1643,7 @@ namespace gbe
       // Floats and doubles
       switch (typeID) {
         case Type::FloatTyID:
+        case Type::HalfTyID:
         case Type::DoubleTyID:
         {
           ConstantFP *FPC = cast<ConstantFP>(CPV);
@@ -1036,9 +1652,16 @@ namespace gbe
           if (FPC->getType() == Type::getFloatTy(CPV->getContext())) {
             const float f32 = FPC->getValueAPF().convertToFloat();
             return ctx.newImmediate(f32);
-          } else {
+          } else if (FPC->getType() == Type::getDoubleTy(CPV->getContext())) {
             const double f64 = FPC->getValueAPF().convertToDouble();
             return ctx.newImmediate(f64);
+          } else {
+            llvm::APFloat apf = FPC->getValueAPF();
+            llvm::APInt api = apf.bitcastToAPInt();
+            uint64_t v64 = api.getZExtValue();
+            uint16_t v16 = static_cast<uint16_t>(v64);
+            const ir::half f16(v16);
+            return ctx.newImmediate(f16);
           }
         }
         break;
@@ -1075,6 +1698,7 @@ namespace gbe
     switch (typeID) {
       case Type::IntegerTyID:
       case Type::FloatTyID:
+      case Type::HalfTyID:
       case Type::DoubleTyID:
       case Type::PointerTyID:
         regTranslator.newScalar(value, key, 0, uniform);
@@ -1253,11 +1877,9 @@ namespace gbe
                 "Returned value for kernel functions is forbidden");
 
     // Loop over the kernel metadatas to set the required work group size.
-    NamedMDNode *clKernelMetaDatas = TheModule->getNamedMetadata("opencl.kernels");
     size_t reqd_wg_sz[3] = {0, 0, 0};
     size_t hint_wg_sz[3] = {0, 0, 0};
     ir::FunctionArgument::InfoFromLLVM llvmInfo;
-    MDNode *node = NULL;
     MDNode *addrSpaceNode = NULL;
     MDNode *typeNameNode = NULL;
     MDNode *accessQualNode = NULL;
@@ -1267,16 +1889,7 @@ namespace gbe
     std::string functionAttributes;
 
     /* First find the meta data belong to this function. */
-    for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++) {
-      node = clKernelMetaDatas->getOperand(i);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
-      if (node->getOperand(0) == &F) break;
-#else
-      auto *V = cast<ValueAsMetadata>(node->getOperand(0));
-      if (V && V->getValue() == &F) break;
-#endif
-      node = NULL;
-    }
+    MDNode *node = getKernelFunctionMetadata(&F);
 
     /* because "-cl-kernel-arg-info", should always have meta data. */
     if (!F.arg_empty())
@@ -1362,7 +1975,6 @@ namespace gbe
         functionAttributes += " ";
       }
     }
-    ctx.appendSurface(1, ir::ocl::stackbuffer);
 
     ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
 
@@ -1388,7 +2000,9 @@ namespace gbe
         llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
         llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
         llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
-        llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
+        if(argNameNode){
+          llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
+        }
 
         // function arguments are uniform values.
         this->newRegister(I, NULL, true);
@@ -1417,7 +2031,7 @@ namespace gbe
         const ir::Register reg = getRegister(I);
         if (llvmInfo.isImageType()) {
           ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, 4, 4, 0);
-          ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
+          ctx.getFunction().getImageSet()->append(reg, &ctx, BtiMap.find(I)->second);
           collectImageArgs(llvmInfo.accessQual, imageArgsInfo);
           continue;
         }
@@ -1450,10 +2064,7 @@ namespace gbe
             const uint32_t align = getAlignmentByte(unit, pointed);
               switch (addrSpace) {
               case ir::MEM_GLOBAL:
-                globalPointer.insert(std::make_pair(I, btiBase));
-                ctx.appendSurface(btiBase, reg);
-                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
-                incBtiBase();
+                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, BtiMap.find(I)->second);
               break;
               case ir::MEM_LOCAL:
                 ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg,  llvmInfo, ptrSize, align, BTI_LOCAL);
@@ -1552,6 +2163,9 @@ namespace gbe
     // between phi and phiCopy live range. If there is no point that
     // phi & phiCopy are both alive, then we can optimize off the move
     // from phiCopy to phi, and use phiCopy directly instead of phi.
+    // right now, the algorithm is still very conservative, we need to do
+    // aggressive coaleasing for the moves added during phi elimination.
+
     using namespace ir;
     ir::FunctionDAG *dag = new ir::FunctionDAG(liveness);
 
@@ -1563,6 +2177,13 @@ namespace gbe
       const ir::UseSet *phiUse = dag->getRegUse(phi);
       const DefSet *phiDef = dag->getRegDef(phi);
       bool isOpt = true;
+
+      // FIXME, I find under some situation, the phiDef maybe null, seems a bug when building FunctionDAg.
+      // need fix it there.
+      if (phiDef->empty()) continue;
+
+      const ir::BasicBlock *phiDefBB = (*phiDef->begin())->getInstruction()->getParent();
+
       for (auto &x : *phiCopyDef) {
         const ir::Instruction * phiCopyDefInsn = x->getInstruction();
         const ir::BasicBlock *bb = phiCopyDefInsn->getParent();
@@ -1573,6 +2194,62 @@ namespace gbe
           isOpt = false;
           break;
         }
+
+        const ir::Register phiCopySrc = phiCopyDefInsn->getSrc(0);
+        const ir::UseSet *phiCopySrcUse = dag->getRegUse(phiCopySrc);
+        const ir::DefSet *phiCopySrcDef = dag->getRegDef(phiCopySrc);
+
+        // we should only do coaleasing on instruction-def and ssa-value
+        if (phiCopySrcDef->size() == 1 && (*(phiCopySrcDef->begin()))->getType() == ValueDef::DEF_INSN_DST) {
+          const ir::Instruction *phiCopySrcDefInsn = (*(phiCopySrcDef->begin()))->getInstruction();
+          if(bb == phiDefBB && bb == phiCopySrcDefInsn->getParent()) {
+            // phiCopy, phiCopySrc defined in same basicblock as phi
+            // try to coalease phiCopy and phiCopySrc first.
+            // consider below situation:
+            // bb1:
+            //    ...
+            // bb2:
+            //    x = phi [x1, bb1], [x2, bb2]
+            //    x2 = x+1;
+            // after de-ssa:
+            // bb2:
+            //    mov x, x-copy
+            //    add x2, x, 1
+            //    mov x-copy, x2
+            //  obviously x2, x-copy and x2 can be mapped to same virtual register
+
+            ir::BasicBlock::const_iterator iter = ir::BasicBlock::const_iterator(phiCopySrcDefInsn);
+            ir::BasicBlock::const_iterator iterE = bb->end();
+            // check no use of phi in this basicblock between [phiCopySrc def, bb end]
+            bool phiPhiCopySrcInterfere = false;
+            while (iter != iterE) {
+              const ir::Instruction *insn = iter.node();
+              // check phiUse
+              for (unsigned i = 0; i < insn->getSrcNum(); i++) {
+                ir::Register src = insn->getSrc(i);
+                if (src == phi) {
+                  phiPhiCopySrcInterfere = true; break;
+                }
+              }
+              ++iter;
+            }
+            if (!phiPhiCopySrcInterfere) {
+              // phiCopy source can be coaleased with phiCopy
+              const_cast<Instruction *>(phiCopyDefInsn)->remove();
+
+              for (auto &s : *phiCopySrcDef) {
+                const Instruction *phiSrcDefInsn = s->getInstruction();
+                replaceDst(const_cast<Instruction *>(phiSrcDefInsn), phiCopySrc, phiCopy);
+              }
+
+              for (auto &s : *phiCopySrcUse) {
+                const Instruction *phiSrcUseInsn = s->getInstruction();
+                replaceSrc(const_cast<Instruction *>(phiSrcUseInsn), phiCopySrc, phiCopy);
+              }
+            }
+          }
+        }
+
         // If phi is used in the same BB that define the phiCopy,
         // we need carefully check the liveness of phi & phiCopy.
         // Make sure their live ranges do not interfere.
@@ -1602,8 +2279,7 @@ namespace gbe
         }
       }
 
-      // [MOV phi, phiCopy;] can be removed. So we remove it
-      // and replace phi uses with phiCopy
+      // coalease phi and phiCopy 
       if (isOpt) {
         for (auto &x : *phiDef) {
           const_cast<Instruction *>(x->getInstruction())->remove();
@@ -1804,14 +2480,10 @@ namespace gbe
         ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
       } else {
         if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
-          ctx.appendSurface(btiBase, ir::ocl::printfbptr);
-          ctx.getFunction().getPrintfSet()->setBufBTI(btiBase);
-          globalPointer.insert(std::make_pair(&v, incBtiBase()));
+          ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
           regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
         } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
-          ctx.appendSurface(btiBase, ir::ocl::printfiptr);
-          ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase);
-          globalPointer.insert(std::make_pair(&v, incBtiBase()));
+          ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
           regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
         } else if(v.getName().str().substr(0, 4) == ".str") {
           /* When there are multi printf statements in multi kernel fucntions within the same
@@ -2035,6 +2707,7 @@ namespace gbe
 #else
       case CallingConv::C:
       case CallingConv::Fast:
+      case CallingConv::SPIR_KERNEL:
 #endif
         break;
       default:
@@ -2042,6 +2715,7 @@ namespace gbe
     }
 
     ctx.startFunction(F.getName());
+
     ir::Function &fn = ctx.getFunction();
     this->regTranslator.clear();
     this->labelMap.clear();
@@ -2435,6 +3109,12 @@ namespace gbe
           const ir::Register src = this->getRegister(I.getOperand(0));
           ctx.SEL(dstType, dst, src, oneReg, zeroReg);
         }
+        /* For half <---> float conversion, we use F16TO32 or F32TO16, make the code path same. */
+        else if (srcType == ir::TYPE_HALF && dstType == ir::TYPE_FLOAT) {
+          ctx.F16TO32(ir::TYPE_FLOAT, ir::TYPE_U16, getRegister(&I), getRegister(I.getOperand(0)));
+        } else if (srcType == ir::TYPE_FLOAT && dstType == ir::TYPE_HALF) {
+          ctx.F32TO16(ir::TYPE_U16, ir::TYPE_FLOAT, getRegister(&I), getRegister(I.getOperand(0)));
+        }
         // Use a convert for the other cases
         else {
           const ir::Register dst = this->getRegister(&I);
@@ -2613,18 +3293,32 @@ namespace gbe
           case Intrinsic::umul_with_overflow:
             this->newRegister(&I);
           break;
+          case Intrinsic::ctlz:
           case Intrinsic::bswap:
             this->newRegister(&I);
           break;
+          case Intrinsic::fabs:
+          case Intrinsic::sqrt:
+          case Intrinsic::ceil:
+          case Intrinsic::fma:
+          case Intrinsic::trunc:
+          case Intrinsic::rint:
+          case Intrinsic::floor:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+          case Intrinsic::log2:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+            this->newRegister(&I);
+          break;
           default:
           GBE_ASSERTM(false, "Unsupported intrinsics");
         }
         return;
       }
     }
-
     // Get the name of the called function and handle it
-    const std::string fnName = Callee->getName();
+    const std::string fnName = Callee->stripPointerCasts()->getName();
     auto genIntrinsicID = intrinsicMap.find(fnName);
     switch (genIntrinsicID) {
       case GEN_OCL_GET_GROUP_ID0:
@@ -2668,20 +3362,9 @@ namespace gbe
       case GEN_OCL_FBH:
       case GEN_OCL_FBL:
       case GEN_OCL_CBIT:
-      case GEN_OCL_COS:
-      case GEN_OCL_SIN:
-      case GEN_OCL_SQR:
       case GEN_OCL_RSQ:
-      case GEN_OCL_LOG:
-      case GEN_OCL_EXP:
-      case GEN_OCL_POW:
       case GEN_OCL_RCP:
       case GEN_OCL_ABS:
-      case GEN_OCL_FABS:
-      case GEN_OCL_RNDZ:
-      case GEN_OCL_RNDE:
-      case GEN_OCL_RNDU:
-      case GEN_OCL_RNDD:
       case GEN_OCL_GET_IMAGE_WIDTH:
       case GEN_OCL_GET_IMAGE_HEIGHT:
       case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
@@ -2745,7 +3428,6 @@ namespace gbe
       case GEN_OCL_UPSAMPLE_SHORT:
       case GEN_OCL_UPSAMPLE_INT:
       case GEN_OCL_UPSAMPLE_LONG:
-      case GEN_OCL_MAD:
       case GEN_OCL_FMAX:
       case GEN_OCL_FMIN:
       case GEN_OCL_SADD_SAT_CHAR:
@@ -2794,12 +3476,21 @@ namespace gbe
       case GEN_OCL_SAT_CONV_F32_TO_I32:
       case GEN_OCL_SAT_CONV_I32_TO_U32:
       case GEN_OCL_SAT_CONV_F32_TO_U32:
+      case GEN_OCL_SAT_CONV_F16_TO_I8:
+      case GEN_OCL_SAT_CONV_F16_TO_U8:
+      case GEN_OCL_SAT_CONV_F16_TO_I16:
+      case GEN_OCL_SAT_CONV_F16_TO_U16:
+      case GEN_OCL_SAT_CONV_F16_TO_I32:
+      case GEN_OCL_SAT_CONV_F16_TO_U32:
       case GEN_OCL_CONV_F16_TO_F32:
       case GEN_OCL_CONV_F32_TO_F16:
       case GEN_OCL_SIMD_ANY:
       case GEN_OCL_SIMD_ALL:
+      case GEN_OCL_SIMD_SIZE:
       case GEN_OCL_READ_TM:
       case GEN_OCL_REGION:
+      case GEN_OCL_SIMD_ID:
+      case GEN_OCL_SIMD_SHUFFLE:
         this->newRegister(&I);
         break;
       case GEN_OCL_PRINTF:
@@ -2830,19 +3521,46 @@ namespace gbe
     CallSite::arg_iterator AE = CS.arg_end();
     GBE_ASSERT(AI != AE);
 
+    ir::AddressSpace addrSpace;
+
+    Value *llvmPtr = *AI;
+    Value *bti = getBtiRegister(llvmPtr);
+    Value *ptrBase = getPointerBase(llvmPtr);
+    ir::Register pointer = this->getRegister(llvmPtr);
+    ir::Register baseReg = this->getRegister(ptrBase);
+
+    ir::Register btiReg;
+    bool fixedBTI = false;
+    if (isa<ConstantInt>(bti)) {
+      fixedBTI = true;
+      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
+      addrSpace = btiToGen(index);
+      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
+      btiReg = ctx.reg(ir::FAMILY_DWORD);
+      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
+    } else {
+      addrSpace = ir::MEM_MIXED;
+      btiReg = this->getRegister(bti);
+    }
+
+    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+    const ir::Register ptr = ctx.reg(pointerFamily);
+    ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+
     const ir::Register dst = this->getRegister(&I);
 
-    ir::BTI bti;
-    gatherBTI(&I, bti);
-    const ir::AddressSpace addrSpace = btiToGen(bti);
-    vector<ir::Register> src;
     uint32_t srcNum = 0;
+    vector<ir::Register> src;
+    src.push_back(ptr);
+    srcNum++;
+    AI++;
+
     while(AI != AE) {
       src.push_back(this->getRegister(*(AI++)));
       srcNum++;
     }
     const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
-    ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
+    ctx.ATOMIC(opcode, dst, addrSpace, btiReg, fixedBTI, srcTuple);
   }
 
   /* append a new sampler. should be called before any reference to
@@ -2874,6 +3592,13 @@ namespace gbe
     if (Function *F = I.getCalledFunction()) {
       if (F->getIntrinsicID() != 0) {
         const ir::Function &fn = ctx.getFunction();
+
+        // Get the function arguments
+        CallSite CS(&I);
+        CallSite::arg_iterator AI = CS.arg_begin();
+#if GBE_DEBUG
+        CallSite::arg_iterator AE = CS.arg_end();
+#endif /* GBE_DEBUG */
         switch (F->getIntrinsicID()) {
           case Intrinsic::stacksave:
           {
@@ -2947,95 +3672,83 @@ namespace gbe
           case Intrinsic::umul_with_overflow:
           NOT_IMPLEMENTED;
           break;
-          case Intrinsic::bswap:
+          case Intrinsic::ctlz:
           {
-            // FIXME, this is an unoptimized version, could be optimized by
-            // leveraging GEN's register region/indirect address feature.
             Type *llvmDstType = I.getType();
-            uint32_t elementSize = getTypeByteSize(unit, llvmDstType);
-
-            const ir::Register dst0  = this->getRegister(&I);
-            const ir::Register src0 = this->getRegister(I.getOperand(0));
-            switch(elementSize)
-            {
-              case 2:
-                {
-                  ir::Type srcType = getUnsignedType(ctx, llvmDstType);
-                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
-
-                  ir::Register regWMask = ctx.reg( ir::FAMILY_WORD );
-                  const ir::ImmediateIndex wMask = ctx.newIntegerImmediate(0x00FF, ir::TYPE_S16);
-                  ir::Register regShift = ctx.reg( ir::FAMILY_WORD );
-                  const ir::ImmediateIndex shift = ctx.newIntegerImmediate(8, ir::TYPE_S16);
-
-                  ctx.LOADI(ir::TYPE_S16, regWMask, wMask);
-                  ctx.AND(srcType, tmp1, src0, regWMask);
+            ir::Type dstType = getType(ctx, llvmDstType);
+            Type *llvmSrcType = I.getOperand(0)->getType();
+            ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
 
-                  ctx.LOADI(ir::TYPE_S16, regShift, shift);
-                  ctx.SHL(srcType, tmp2, tmp1, regShift);
+            //the llvm.ctlz.i64 is lowered to two llvm.ctlz.i32 call in ocl_clz.ll
+            GBE_ASSERT(srcType != ir::TYPE_U64);
 
-                  ir::Register tmp3 = ctx.reg( getFamily(srcType) );
-                  ctx.SHR(srcType, tmp3, src0, regShift);
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register src = this->getRegister(I.getOperand(0));
+            int imm_value = 0;
+            if(srcType == ir::TYPE_U16) {
+              imm_value = 16;
+            }else if(srcType == ir::TYPE_U8) {
+              imm_value = 24;
+            }
 
-                  ctx.OR(srcType, dst0, tmp2, tmp3);
-                }
-                break;
-              case 4:
-                {
-                  ir::Type srcType = getType(ctx, llvmDstType);
-                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp3 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp4 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp5 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp6 = ctx.reg(getFamily(srcType));
-
-                  ir::Register regDWMask = ctx.reg( ir::FAMILY_DWORD );
-                  ir::Register regShift_8 = ctx.reg( ir::FAMILY_DWORD );
-                  ir::Register regShift_24 = ctx.reg( ir::FAMILY_DWORD );
-                  ir::ImmediateIndex wMask_L = ctx.newIntegerImmediate(0x0000FF00, ir::TYPE_S32);
-                  ir::ImmediateIndex wMask_H = ctx.newIntegerImmediate(0x00FF0000, ir::TYPE_S32);
-                  ir::ImmediateIndex shift_8 = ctx.newIntegerImmediate(8, ir::TYPE_S32);
-                  ir::ImmediateIndex shift_24 = ctx.newIntegerImmediate(24, ir::TYPE_S32);
-
-                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
-                  ctx.SHL(srcType, tmp1, src0, regShift_24);
-
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_L);
-                  ctx.AND(srcType, tmp2, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
-                  ctx.SHL(srcType, tmp3, tmp2, regShift_8);
-
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_H);
-                  ctx.AND(srcType, tmp4, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
-                  ctx.SHR(makeTypeUnsigned(srcType), tmp5, tmp4, regShift_8);
-
-                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
-                  ctx.SHR(makeTypeUnsigned(srcType), tmp6, src0, regShift_24);
-
-                  ir::Register tmp7 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp8 = ctx.reg(getFamily(srcType));
-                  ctx.OR(srcType, tmp7, tmp1, tmp3);
-                  ctx.OR(srcType, tmp8, tmp5, tmp6);
-                  ctx.OR(srcType, dst0, tmp7, tmp8);
-                }
-                break;
-              case 8:
-                NOT_IMPLEMENTED;
-                break;
-              default:
-                GBE_ASSERT(0);
+            if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) {
+              ir::ImmediateIndex imm;
+              ir::Type tmpType = ir::TYPE_S32;
+              imm = ctx.newIntegerImmediate(imm_value, tmpType);
+              const ir::RegisterFamily family = getFamily(tmpType);
+              const ir::Register immReg = ctx.reg(family);
+              ctx.LOADI(ir::TYPE_S32, immReg, imm);
+
+              ir::Register tmp0 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp1 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp2 = ctx.reg(getFamily(tmpType));
+              ctx.CVT(tmpType, srcType, tmp0, src);
+              ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0);
+              ctx.SUB(tmpType, tmp2, tmp1, immReg);
+              ctx.CVT(dstType, tmpType, dst, tmp2);
+            }
+            else
+            {
+              ctx.ALU1(ir::OP_LZD, dstType, dst, src);
             }
           }
           break;
+          case Intrinsic::fma:
+          {
+            ir::Type srcType = getType(ctx, I.getType());
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register src0 = this->getRegister(I.getOperand(0));
+            const ir::Register src1 = this->getRegister(I.getOperand(1));
+            const ir::Register src2 = this->getRegister(I.getOperand(2));
+            ctx.MAD(srcType, dst, src0, src1, src2);
+          }
+          break;
+          case Intrinsic::sqrt: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
+          case Intrinsic::ceil: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
+          case Intrinsic::fabs: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
+          case Intrinsic::trunc: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
+          case Intrinsic::rint: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
+          case Intrinsic::floor: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
+          case Intrinsic::sin: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
+          case Intrinsic::cos: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
+          case Intrinsic::log2: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
+          case Intrinsic::exp2: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
+          case Intrinsic::bswap:
+            this->emitUnaryCallInst(I,CS,ir::OP_BSWAP, getUnsignedType(ctx, I.getType())); break;
+          case Intrinsic::pow:
+          {
+            const ir::Register src0 = this->getRegister(*AI); ++AI;
+            const ir::Register src1 = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
+            break;
+          }
           default: NOT_IMPLEMENTED;
         }
       } else {
         // Get the name of the called function and handle it
         Value *Callee = I.getCalledValue();
-        const std::string fnName = Callee->getName();
+        const std::string fnName = Callee->stripPointerCasts()->getName();
         auto genIntrinsicID = intrinsicMap.find(fnName);
 
         // Get the function arguments
@@ -3046,14 +3759,6 @@ namespace gbe
 #endif /* GBE_DEBUG */
 
         switch (genIntrinsicID) {
-          case GEN_OCL_POW:
-          {
-            const ir::Register src0 = this->getRegister(*AI); ++AI;
-            const ir::Register src1 = this->getRegister(*AI);
-            const ir::Register dst = this->getRegister(&I);
-            ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
-            break;
-          }
           case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
           case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
           case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, getUnsignedType(ctx, (*AI)->getType())); break;
@@ -3061,21 +3766,21 @@ namespace gbe
           {
             const ir::Register src = this->getRegister(*AI);
             const ir::Register dst = this->getRegister(&I);
-            ctx.ALU1(ir::OP_ABS, ir::TYPE_S32, dst, src);
+            ctx.ALU1(ir::OP_ABS, getType(ctx, (*AI)->getType()), dst, src);
             break;
           }
           case GEN_OCL_SIMD_ALL:
           {
             const ir::Register src = this->getRegister(*AI);
             const ir::Register dst = this->getRegister(&I);
-            ctx.ALU1(ir::OP_SIMD_ALL, ir::TYPE_S16, dst, src);
+            ctx.ALU1(ir::OP_SIMD_ALL, ir::TYPE_S32, dst, src);
             break;
           }
           case GEN_OCL_SIMD_ANY:
           {
             const ir::Register src = this->getRegister(*AI);
             const ir::Register dst = this->getRegister(&I);
-            ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S16, dst, src);
+            ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S32, dst, src);
             break;
           }
           case GEN_OCL_READ_TM:
@@ -3098,18 +3803,8 @@ namespace gbe
             ctx.REGION(dst, src, x.getIntegerValue());
             break;
           }
-          case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
-          case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
-          case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
-          case GEN_OCL_EXP: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
-          case GEN_OCL_SQR: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
-          case GEN_OCL_FABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
-          case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
-          case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
-          case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
-          case GEN_OCL_RNDD: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
           case GEN_OCL_FORCE_SIMD8: ctx.setSimdWidth(8); break;
           case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
           case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
@@ -3190,16 +3885,8 @@ namespace gbe
             GBE_ASSERT(isFloatCoord == requiredFloatCoord);
 
             vector<ir::Register> dstTupleData, srcTupleData;
-            for (uint32_t elemID = 0; elemID < 3; elemID++) {
-              ir::Register reg;
-
-              if (elemID < imageDim)
-                reg = this->getRegister(coordVal, elemID);
-              else
-                reg = ir::ocl::invalid;
-
-              srcTupleData.push_back(reg);
-            }
+            for (uint32_t elemID = 0; elemID < imageDim; elemID++)
+              srcTupleData.push_back(this->getRegister(coordVal, elemID));
 
             uint32_t elemNum;
             ir::Type dstType = getVectorInfo(ctx, &I, elemNum);
@@ -3210,9 +3897,9 @@ namespace gbe
               dstTupleData.push_back(reg);
             }
             const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
-            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 3);
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], imageDim);
 
-            ctx.SAMPLE(imageID, dstTuple, srcTuple, dstType == ir::TYPE_FLOAT,
+            ctx.SAMPLE(imageID, dstTuple, srcTuple, imageDim, dstType == ir::TYPE_FLOAT,
                        requiredFloatCoord, sampler, samplerOffset);
             break;
           }
@@ -3231,16 +3918,9 @@ namespace gbe
             vector<ir::Register> srcTupleData;
             GBE_ASSERT(imageDim >= 1 && imageDim <= 3);
 
-            for (uint32_t elemID = 0; elemID < 3; elemID++) {
-              ir::Register reg;
-
-              if (elemID < imageDim)
-                reg = this->getRegister(*AI, elemID);
-              else
-                reg = ir::ocl::invalid;
+            for (uint32_t elemID = 0; elemID < imageDim; elemID++)
+              srcTupleData.push_back(this->getRegister(*AI, elemID));
 
-              srcTupleData.push_back(reg);
-            }
             ++AI; GBE_ASSERT(AI != AE);
             uint32_t elemNum;
             ir::Type srcType = getVectorInfo(ctx, *AI, elemNum);
@@ -3250,8 +3930,8 @@ namespace gbe
               const ir::Register reg = this->getRegister(*AI, elemID);
               srcTupleData.push_back(reg);
             }
-            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 7);
-            ctx.TYPED_WRITE(imageID, srcTuple, srcType, ir::TYPE_U32);
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], imageDim + 4);
+            ctx.TYPED_WRITE(imageID, srcTuple, imageDim + 4, srcType, ir::TYPE_U32);
             break;
           }
           case GEN_OCL_MUL_HI_INT:
@@ -3372,14 +4052,6 @@ namespace gbe
             ctx.I64MADSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1, src2);
             break;
            }
-          case GEN_OCL_MAD: {
-            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
-            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
-            GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
-            const ir::Register dst = this->getRegister(&I);
-            ctx.MAD(getType(ctx, I.getType()), dst, src0, src1, src2);
-            break;
-          }
           case GEN_OCL_FMAX:
           case GEN_OCL_FMIN:{
             GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
@@ -3480,6 +4152,18 @@ namespace gbe
             DEF(ir::TYPE_U32, ir::TYPE_S32);
           case GEN_OCL_SAT_CONV_F32_TO_U32:
             DEF(ir::TYPE_U32, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_F16_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_HALF);
+          case GEN_OCL_SAT_CONV_F16_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_HALF);
+          case GEN_OCL_SAT_CONV_F16_TO_I16:
+            DEF(ir::TYPE_S16, ir::TYPE_HALF);
+          case GEN_OCL_SAT_CONV_F16_TO_U16:
+            DEF(ir::TYPE_U16, ir::TYPE_HALF);
+          case GEN_OCL_SAT_CONV_F16_TO_I32:
+            DEF(ir::TYPE_S32, ir::TYPE_HALF);
+          case GEN_OCL_SAT_CONV_F16_TO_U32:
+            DEF(ir::TYPE_U32, ir::TYPE_HALF);
           case GEN_OCL_CONV_F16_TO_F32:
             ctx.F16TO32(ir::TYPE_FLOAT, ir::TYPE_U16, getRegister(&I), getRegister(I.getOperand(0)));
             break;
@@ -3495,6 +4179,26 @@ namespace gbe
             assert(fmt);
             break;
           }
+          case GEN_OCL_SIMD_SIZE:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU0(ir::OP_SIMD_SIZE, getType(ctx, I.getType()), dst);
+            break;
+          }
+          case GEN_OCL_SIMD_ID:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU0(ir::OP_SIMD_ID, getType(ctx, I.getType()), dst);
+            break;
+          }
+          case GEN_OCL_SIMD_SHUFFLE:
+          {
+            const ir::Register src0 = this->getRegister(*AI); ++AI;
+            const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.SIMD_SHUFFLE(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
           default: break;
         }
       }
@@ -3546,7 +4250,7 @@ namespace gbe
       if (step != 0) {
         ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
         ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
-        ctx.LOADI(ir::TYPE_S32, stepReg, stepImm);
+        ctx.LOADI(ir::TYPE_U32, stepReg, stepImm);
         ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
         ctx.getFunction().pushStackSize(step);
       }
@@ -3573,8 +4277,8 @@ namespace gbe
   void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
                                       Value *llvmValues, const ir::Register ptr,
                                       const ir::AddressSpace addrSpace,
-                                      Type * elemType, bool isLoad, ir::BTI bti,
-                                      bool dwAligned) {
+                                      Type * elemType, bool isLoad, ir::Register bti,
+                                      bool dwAligned, bool fixedBTI) {
     const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
     uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
     uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
@@ -3620,79 +4324,18 @@ namespace gbe
 
       // Emit the instruction
       if (isLoad)
-        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
+        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
       else
-        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
-    }
-  }
-
-  // The idea behind is to search along the use-def chain, and find out all
-  // possible sources of the pointer. Then in later codeGen, we can emit
-  // read/store instructions to these BTIs gathered.
-  void GenWriter::gatherBTI(Value *insn, ir::BTI &bti) {
-    PtrOrigMapIter iter = pointerOrigMap.find(insn);
-    if (iter != pointerOrigMap.end()) {
-      SmallVectorImpl<Value *> &origins = iter->second;
-      uint8_t nBTI = 0;
-      for (unsigned i = 0; i < origins.size(); i++) {
-        uint8_t new_bti = 0;
-        Value *origin = origins[i];
-        // all constant put into constant cache, including __constant & const __private
-        if (isa<GlobalVariable>(origin)
-            && dyn_cast<GlobalVariable>(origin)->isConstant()) {
-          new_bti = BTI_CONSTANT;
-        } else {
-          unsigned space = origin->getType()->getPointerAddressSpace();
-          switch (space) {
-            case 0:
-              new_bti = BTI_PRIVATE;
-              break;
-            case 1:
-            {
-              GlobalPtrIter iter = globalPointer.find(origin);
-              GBE_ASSERT(iter != globalPointer.end());
-              new_bti = iter->second;
-              break;
-            }
-            case 2:
-              new_bti = BTI_CONSTANT;
-              break;
-            case 3:
-              new_bti = BTI_LOCAL;
-              break;
-            default:
-              GBE_ASSERT(0 && "address space not unhandled in gatherBTI()\n");
-              break;
-          }
-        }
-
-        // avoid duplicate
-        bool bFound = false;
-        for (int j = 0; j < nBTI; j++) {
-          if (bti.bti[j] == new_bti) {
-            bFound = true; break;
-          }
-        }
-        if (bFound == false) {
-          bti.bti[nBTI++] = new_bti;
-          bti.count = nBTI;
-        }
-      }
-    } else {
-      insn->dump();
-      std::cerr << "Illegal pointer which is not from a valid memory space." << std::endl;
-      std::cerr << "Aborting..." << std::endl;
-      exit(-1);
+        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
     }
-    GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
   }
+
   // handle load of dword/qword with unaligned address
-  void GenWriter::emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned)
+  void GenWriter::emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI)
   {
     Type *llvmType = llvmValues->getType();
     const ir::Type type = getType(ctx, llvmType);
     unsigned byteSize = getTypeByteSize(unit, llvmType);
-    const ir::Register ptr = this->getRegister(llvmPtr);
 
     Type *elemType = llvmType;
     unsigned elemNum = 1;
@@ -3722,13 +4365,13 @@ namespace gbe
     const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
 
     if (isLoad) {
-      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
+      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
       ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
     } else {
       ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
       // FIXME: byte scatter does not handle correctly vector store, after fix that,
       //        we can directly use on store instruction like:
-      //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
+      //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
       const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
       for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
         const ir::Register reg = byteTupleData[elemID];
@@ -3743,7 +4386,7 @@ namespace gbe
           ctx.LOADI(ir::TYPE_S32, offset, immIndex);
           ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
         }
-       ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
+       ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, bti, reg);
       }
     }
   }
@@ -3756,10 +4399,31 @@ namespace gbe
     Value *llvmValues = getLoadOrStoreValue(I);
     Type *llvmType = llvmValues->getType();
     const bool dwAligned = (I.getAlignment() % 4) == 0;
-    const ir::Register ptr = this->getRegister(llvmPtr);
-    ir::BTI binding;
-    gatherBTI(&I, binding);
-    const ir::AddressSpace addrSpace = btiToGen(binding);
+    ir::AddressSpace addrSpace;
+    const ir::Register pointer = this->getRegister(llvmPtr);
+    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+
+    Value *bti = getBtiRegister(llvmPtr);
+    Value *ptrBase = getPointerBase(llvmPtr);
+    ir::Register baseReg = this->getRegister(ptrBase);
+    bool zeroBase = false;
+    if (isa<ConstantPointerNull>(ptrBase)) {
+      zeroBase = true;
+    }
+
+    ir::Register btiReg;
+    bool fixedBTI = false;
+    if (isa<ConstantInt>(bti)) {
+      fixedBTI = true;
+      unsigned index = cast<ConstantInt>(bti)->getZExtValue();
+      addrSpace = btiToGen(index);
+      ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
+      btiReg = ctx.reg(ir::FAMILY_DWORD);
+      ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
+    } else {
+      addrSpace = ir::MEM_MIXED;
+      btiReg = this->getRegister(bti);
+    }
 
     Type *scalarType = llvmType;
     if (!isScalarType(llvmType)) {
@@ -3767,11 +4431,20 @@ namespace gbe
       scalarType = vectorType->getElementType();
     }
 
+    ir::Register ptr = ctx.reg(pointerFamily);
+    // FIXME: avoid subtraction zero at this stage is not a good idea,
+    // but later ArgumentLower pass need to match exact load/addImm pattern
+    // so, I avoid subtracting zero base to satisfy ArgumentLower pass.
+    if (!zeroBase)
+      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+    else
+      ptr = pointer;
+
     if (!dwAligned
        && (scalarType == IntegerType::get(I.getContext(), 64)
           || scalarType == IntegerType::get(I.getContext(), 32))
        ) {
-      emitUnalignedDQLoadStore(llvmPtr, llvmValues, addrSpace, binding, isLoad, dwAligned);
+      emitUnalignedDQLoadStore(ptr, llvmValues, addrSpace, btiReg, isLoad, dwAligned, fixedBTI);
       return;
     }
     // Scalar is easy. We neednot build register tuples
@@ -3779,9 +4452,9 @@ namespace gbe
       const ir::Type type = getType(ctx, llvmType);
       const ir::Register values = this->getRegister(llvmValues);
       if (isLoad)
-        ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
+        ctx.LOAD(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
       else
-        ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
+        ctx.STORE(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
     }
     // A vector type requires to build a tuple
     else {
@@ -3803,10 +4476,9 @@ namespace gbe
       // The code is going to be fairly different from types to types (based on
       // size of each vector element)
       const ir::Type type = getType(ctx, elemType);
-      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
       const ir::RegisterFamily dataFamily = getFamily(type);
 
-      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT && addrSpace != ir::MEM_MIXED) {
+      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
         // One message is enough here. Nothing special to do
         if (elemNum <= 4) {
           // Build the tuple data in the vector
@@ -3825,19 +4497,19 @@ namespace gbe
 
           // Emit the instruction
           if (isLoad)
-            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
           else
-            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
         }
         // Not supported by the hardware. So, we split the message and we use
         // strided loads and stores
         else {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
         }
       }
       else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
               (dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
       } else {
         for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
           if(regTranslator.isUndefConst(llvmValues, elemID))
@@ -3857,9 +4529,9 @@ namespace gbe
               ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
           }
           if (isLoad)
-           ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
+           ctx.LOAD(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
           else
-           ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
+           ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
         }
       }
     }
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 5724917..1f16557 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -140,9 +140,6 @@ namespace gbe
   /*! Remove/add NoDuplicate function attribute for barrier functions. */
   llvm::ModulePass* createBarrierNodupPass(bool);
 
-  /*! Legalize all wide integer instructions */
-  llvm::FunctionPass* createLegalizePass();
-
   /*! Convert the Intrinsic call to gen function */
   llvm::BasicBlockPass *createIntrinsicLoweringPass();
 
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 8ec8336..cabb225 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -19,20 +19,8 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
 DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 
 // Math function
-DECL_LLVM_GEN_FUNCTION(FABS, __gen_ocl_fabs)
-DECL_LLVM_GEN_FUNCTION(COS, __gen_ocl_cos)
-DECL_LLVM_GEN_FUNCTION(SIN, __gen_ocl_sin)
-DECL_LLVM_GEN_FUNCTION(SQR, __gen_ocl_sqrt)
 DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
-DECL_LLVM_GEN_FUNCTION(LOG, __gen_ocl_log)
-DECL_LLVM_GEN_FUNCTION(EXP, __gen_ocl_exp)
-DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
-DECL_LLVM_GEN_FUNCTION(RNDZ, __gen_ocl_rndz)
-DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
-DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
-DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
-DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
 DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax)
 DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
 
@@ -163,9 +151,19 @@ DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U32, _Z16convert_uint_satf)
 DECL_LLVM_GEN_FUNCTION(CONV_F16_TO_F32, __gen_ocl_f16to32)
 DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)
 
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F16_TO_I8, _Z16convert_char_satDh)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F16_TO_U8, _Z17convert_uchar_satDh)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F16_TO_I16, _Z17convert_short_satDh)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F16_TO_U16, _Z18convert_ushort_satDh)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F16_TO_I32, _Z15convert_int_satDh)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F16_TO_U32, _Z16convert_uint_satDh)
+
 // SIMD level function for internal usage
-DECL_LLVM_GEN_FUNCTION(SIMD_ANY, __gen_ocl_simd_any)
-DECL_LLVM_GEN_FUNCTION(SIMD_ALL, __gen_ocl_simd_all)
+DECL_LLVM_GEN_FUNCTION(SIMD_ANY, sub_group_any)
+DECL_LLVM_GEN_FUNCTION(SIMD_ALL, sub_group_all)
+DECL_LLVM_GEN_FUNCTION(SIMD_SIZE, get_sub_group_size)
+DECL_LLVM_GEN_FUNCTION(SIMD_ID, get_sub_group_id)
+DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle)
 
 DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
 DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
diff --git a/backend/src/llvm/llvm_legalize.cpp b/backend/src/llvm/llvm_legalize.cpp
deleted file mode 100644
index 250fd11..0000000
--- a/backend/src/llvm/llvm_legalize.cpp
+++ /dev/null
@@ -1,704 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Ruiling, Song <ruiling.song at intel.com>
- *
- * Legalize unsupported integer data type i128/i256/...
- * right now, the implementation only consider little-endian system.
- *
- */
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/PassManager.h"
-
-#include "llvm/Config/llvm-config.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/IR/IRBuilder.h"
-#if LLVM_VERSION_MINOR >= 5
-#include "llvm/IR/CFG.h"
-#else
-#include "llvm/Support/CFG.h"
-#endif
-
-
-#include "llvm_gen_backend.hpp"
-
-using namespace llvm;
-
-namespace gbe {
-
-  class Legalize : public FunctionPass {
-  public:
-    Legalize() : FunctionPass(ID) {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
-      initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
-#else
-      initializeDominatorTreePass(*PassRegistry::getPassRegistry());
-#endif
-    }
-    bool runOnFunction(Function& F) {
-      if (!isKernelFunction(F)) return false;
-      return legalizeFunction(F);
-    }
-    Value *getComponent(Value *v, uint32_t i, Type *ty);
-    bool isIncomplete(Value *v);
-    void legalizePHI(IRBuilder <> Builder, Instruction *p);
-    void legalizeSelect(IRBuilder<> &Builder, Instruction *p);
-    void legalizeICmp(IRBuilder<> &Builder, Instruction *p);
-    void legalizeShl(IRBuilder<> &Builder, Instruction *p);
-    void legalizeLShr(IRBuilder<> &Builder, Instruction *p);
-    void legalizeAnd(IRBuilder<> &Builder, Instruction *p);
-    void legalizeOr(IRBuilder<> &Builder, Instruction *p);
-    void legalizeXor(IRBuilder<> &Builder, Instruction *p);
-    void legalizeBitCast(IRBuilder<> &Builder, Instruction *p);
-    void legalizeTrunc(IRBuilder<> &Builder, Instruction *p);
-    void legalizeZExt(IRBuilder<> &Builder, Instruction *p);
-    bool legalizeFunction(Function& F);
-    void splitLargeInteger(APInt op, Type *splitTy, SmallVector<APInt, 16> &split);
-    void splitConstantInt(ConstantInt *c, Type *splitTy, SmallVector<Value*, 16> &split);
-    static char ID;
-  private:
-    std::set<Value *> processed;
-    std::set<PHINode *> incompletePHIs;
-    std::map<Value *, SmallVector<Value*, 16>> valueMap;
-    typedef std::map<Value*, SmallVector<Value*, 16>>::iterator ValueMapIter;
-  };
-
-  void splitAPInt(APInt &data, SmallVectorImpl<APInt> &result, int totalBits, int subBits) {
-    APInt lo = data.getLoBits(totalBits/2).trunc(totalBits/2);
-    APInt hi = data.getHiBits(totalBits/2).trunc(totalBits/2);
-
-    if (totalBits/2 <= subBits) {
-      result.push_back(lo);
-      result.push_back(hi);
-      return;
-    }
-    splitAPInt(lo, result, totalBits/2, subBits);
-    splitAPInt(hi, result, totalBits/2, subBits);
-  }
-
-  void Legalize::splitLargeInteger(APInt data, Type *splitTy, SmallVector<APInt, 16> &split) {
-    unsigned opSz = data.getBitWidth();
-    GBE_ASSERT(opSz > 7 && llvm::isPowerOf2_32(opSz));
-    unsigned subSz = splitTy->getPrimitiveSizeInBits();
-    splitAPInt(data, split, opSz, subSz);
-  }
-
-  void Legalize::splitConstantInt(ConstantInt *c, Type *splitTy, SmallVector<Value*, 16> &split) {
-    SmallVector<APInt, 16> imm;
-    splitLargeInteger(c->getValue(), splitTy, imm);
-    for (unsigned i = 0; i < imm.size(); i++) {
-      split.push_back(ConstantInt::get(splitTy, imm[i]));
-    }
-  }
-
-  bool Legalize::isIncomplete(Value *v) {
-    return valueMap.find(v) == valueMap.end() && !isa<ConstantInt>(v);
-  }
-
-  Value *Legalize::getComponent(Value *v, uint32_t i, Type *ty) {
-    GBE_ASSERT(!isIncomplete(v));
-    if (isa<ConstantInt>(v)) {
-      GBE_ASSERT(ty);
-      ConstantInt *CI = dyn_cast<ConstantInt>(v);
-      SmallVector<APInt, 16> imm;
-      splitLargeInteger(CI->getValue(), ty, imm);
-      return ConstantInt::get(ty, imm[i]);
-    }
-    return valueMap.find(v)->second[i];
-  }
-
-  void Legalize::legalizePHI(IRBuilder <> Builder, Instruction *p) {
-    PHINode *phi = dyn_cast<PHINode>(p);
-    bool incomplete = false, allConst = true;
-    uint32_t compNum = 0;
-    Type *splitTy = NULL;
-    for (unsigned int i = 0; i < phi->getNumIncomingValues(); ++i) {
-      Value *val = phi->getIncomingValue(i);
-      if (isIncomplete(val)) {
-        incomplete = true;
-        break;
-      }
-      if (allConst && valueMap.find(val) != valueMap.end()) {
-        allConst = false;
-        splitTy = valueMap.find(val)->second[0]->getType();
-        compNum = valueMap.find(val)->second.size();
-      }
-    }
-
-    if (incomplete) {
-      // FIME, if a PHINode is totally incomplete which means
-      // we don't even know the base type of this instruction.
-      // Then it will be a little bit difficult to handle here.
-      // Will do it in the future.
-      incompletePHIs.insert(phi);
-      GBE_ASSERT(0 && "unsupported PHI");
-    }
-    else {
-      GBE_ASSERT(!allConst);
-      SmallVector<Value*, 16> v;
-      for (unsigned int i = 0; i < compNum; ++i) {
-        PHINode* res = Builder.CreatePHI(splitTy, phi->getNumIncomingValues());
-
-        // Loop over pairs of operands: [Value*, BasicBlock*]
-        for (unsigned int j = 0; j < phi->getNumIncomingValues(); j++) {
-          BasicBlock* bb = phi->getIncomingBlock(j);
-          res->addIncoming(getComponent(phi->getIncomingValue(j), i, splitTy), bb);
-        }
-        v.push_back(res);
-      }
-      valueMap.insert(std::make_pair(phi, v));
-    }
-  }
-
-  void Legalize::legalizeSelect(IRBuilder<> &Builder, Instruction *p) {
-    SelectInst *sel = dyn_cast<SelectInst>(p);
-    Value *op0 = sel->getOperand(0);
-    Value *op1 = sel->getOperand(1);
-    Value *op2 = sel->getOperand(2);
-
-    ValueMapIter iter1 = valueMap.find(op1);
-    ValueMapIter iter2 = valueMap.find(op2);
-    SmallVector<Value*, 16> v;
-    if (iter1 != valueMap.end() && iter2 != valueMap.end()) {
-      SmallVectorImpl<Value*> &opVec1 = iter1->second;
-      SmallVectorImpl<Value*> &opVec2 = iter2->second;
-
-      GBE_ASSERT(opVec1.size() == opVec2.size());
-
-      for (unsigned i = 0; i < opVec1.size(); i++) {
-        Value *elemV = Builder.CreateSelect(op0, opVec1[i], opVec2[i]);
-        v.push_back(elemV);
-      }
-    } else if (iter1 != valueMap.end()) {
-      SmallVectorImpl<Value*> &opVec1 = iter1->second;
-      Type *splitTy = opVec1[0]->getType();
-      GBE_ASSERT(isa<ConstantInt>(op2));
-      ConstantInt *CI = dyn_cast<ConstantInt>(op2);
-      SmallVector<APInt, 16> imm;
-
-      splitLargeInteger(CI->getValue(), splitTy, imm);
-      for (unsigned i = 0; i < opVec1.size(); i++) {
-        Value *elemV = Builder.CreateSelect(op0, opVec1[i], ConstantInt::get(splitTy, imm[i]));
-        v.push_back(elemV);
-      }
-    } else if (iter2 != valueMap.end()) {
-      SmallVectorImpl<Value*> &opVec2 = iter2->second;
-      Type *splitTy = opVec2[0]->getType();
-      GBE_ASSERT(isa<ConstantInt>(op1));
-      ConstantInt *CI = dyn_cast<ConstantInt>(op1);
-      SmallVector<APInt, 16> imm;
-
-      splitLargeInteger(CI->getValue(), splitTy, imm);
-      for (unsigned i = 0; i < opVec2.size(); i++) {
-        Value *elemV = Builder.CreateSelect(op0, ConstantInt::get(splitTy, imm[i]), opVec2[i]) ;
-        v.push_back(elemV);
-      }
-    } else {
-      p->dump(); GBE_ASSERT(0 && "unsupported select.");
-    }
-    valueMap.insert(std::make_pair(p, v));
-  }
-
-  void Legalize::legalizeICmp(IRBuilder<> &Builder, Instruction *p) {
-    ICmpInst *IC = dyn_cast<ICmpInst>(p);
-    ICmpInst::Predicate pred = IC->getPredicate();
-    // I could not figure out why llvm could generate some
-    // compare instruction on large integers. so here only support equality check
-    GBE_ASSERT(IC->isEquality());
-    Value *op0 = p->getOperand(0);
-    Value *op1 = p->getOperand(1);
-
-    if (isa<ConstantInt>(op0)) {
-      op0 = p->getOperand(1);
-      op1 = p->getOperand(0);
-    }
-
-    if (isa<ConstantInt>(op1)) {
-      ValueMapIter iter = valueMap.find(op0);
-      SmallVectorImpl<Value*> &opVec = iter->second;
-      SmallVector<APInt, 16> imm;
-
-      Value *res = NULL;
-      Type *splitTy = opVec[0]->getType();
-      ConstantInt *CI = dyn_cast<ConstantInt>(op1);
-
-      splitLargeInteger(CI->getValue(), splitTy, imm);
-      for (unsigned i = 0; i < opVec.size(); i++) {
-        Value *tmp = Builder.CreateICmp(pred, opVec[i], ConstantInt::get(splitTy, imm[i]));
-        if (res != NULL) {
-          if (pred == CmpInst::ICMP_EQ)
-            tmp = Builder.CreateAnd(tmp, res);
-          else
-            tmp = Builder.CreateOr(tmp, res);
-        }
-        res = tmp;
-      }
-      p->replaceAllUsesWith(res);
-    } else {
-      ValueMapIter iter0 = valueMap.find(op0);
-      ValueMapIter iter1 = valueMap.find(op1);
-      SmallVectorImpl<Value*> &opVec0 = iter0->second;
-      SmallVectorImpl<Value*> &opVec1 = iter1->second;
-
-      Value *res = NULL;
-      for (unsigned i = 0; i < opVec0.size(); i++) {
-        Value *tmp = Builder.CreateICmp(pred, opVec0[i], opVec1[i]);
-        if (res != NULL) {
-          if (pred == CmpInst::ICMP_EQ)
-            tmp = Builder.CreateAnd(tmp, res);
-          else
-            tmp = Builder.CreateOr(tmp, res);
-        }
-        res = tmp;
-      }
-      p->replaceAllUsesWith(res);
-    }
-  }
-
-  void Legalize::legalizeShl(IRBuilder<> &Builder, Instruction *p) {
-    // only support known bits shift
-    GBE_ASSERT(isa<ConstantInt>(p->getOperand(1)));
-
-    ValueMapIter iter = valueMap.find(p->getOperand(0));
-    GBE_ASSERT(iter != valueMap.end());
-    SmallVectorImpl<Value*> &v0 = iter->second;
-
-    uint64_t shiftBits = dyn_cast<ConstantInt>(p->getOperand(1))->getZExtValue();
-    Type *splitTy = v0[0]->getType();
-
-    unsigned elemNum = v0.size();
-    unsigned szSplit = splitTy->getPrimitiveSizeInBits();
-    unsigned shift = shiftBits / szSplit;
-    unsigned unaligned = shiftBits % szSplit;
-
-    if (unaligned == 0) {
-      SmallVector<Value*, 16> v1;
-      // fill lower bits with zero
-      for (unsigned i = 0; i < shift; i++) {
-        v1.push_back(ConstantInt::get(splitTy, 0));
-      }
-      // do the shift
-      for (unsigned j =0; j < elemNum - shift; j++)
-        v1.push_back(v0[j]);
-
-      valueMap.insert(std::make_pair(p, v1));
-    } else {
-      SmallVector<Value*, 16> v1;
-      // fill lower bits with zero
-      for (unsigned i = 0; i < shift; i++) {
-        v1.push_back(ConstantInt::get(splitTy, 0));
-      }
-      // first one is special, shl is enough.
-      v1.push_back(Builder.CreateShl(v0[0], unaligned));
-
-      for (unsigned i = 0; i < elemNum - shift - 1; i++) {
-        Value *t0 = Builder.CreateLShr(v0[i], ConstantInt::get(v0[0]->getType(), szSplit-unaligned));
-        Value *t1 = Builder.CreateShl(v0[i + 1], ConstantInt::get(v0[i + 1]->getType(), unaligned));
-        Value *t2 = Builder.CreateOr(t0, t1);
-        v1.push_back(t2);
-      }
-      valueMap.insert(std::make_pair(p, v1));
-    }
-  }
-
-  void Legalize::legalizeLShr(IRBuilder<> &Builder, Instruction *p) {
-    Value *op0 = p->getOperand(0);
-    Value *op1 = p->getOperand(1);
-    SmallVector<Value*, 16> result;
-
-    GBE_ASSERT(isa<ConstantInt>(p->getOperand(1)));
-
-    ValueMapIter iter = valueMap.find(op0);
-    GBE_ASSERT(iter != valueMap.end());
-    SmallVectorImpl<Value*> &opVec = iter->second;
-
-    unsigned szTotal = op1->getType()->getPrimitiveSizeInBits();
-    unsigned elemNum = opVec.size();
-    unsigned szSplit = szTotal / elemNum;
-    int64_t shift = dyn_cast<ConstantInt>(op1)->getSExtValue();
-    GBE_ASSERT(shift > 0);
-    unsigned elemShift = shift / szSplit;
-    unsigned unalign = shift % szSplit;
-
-    if (unalign == 0) {
-      // the shift bits is aligned with the split size
-      Constant *zero = ConstantInt::getSigned(opVec[0]->getType(), 0);
-      for (unsigned s = 0; s < elemNum - elemShift; s++)
-        result.push_back(opVec[s + elemShift]);
-
-      for (unsigned s = 0; s < elemShift; s++)
-        result.push_back(zero);
-
-      valueMap.insert(std::make_pair(p, result));
-    } else {
-      // not aligned case
-      for (unsigned s = elemShift; s < elemNum-1; s++) {
-        Value *t0 = Builder.CreateLShr(opVec[s], ConstantInt::get(opVec[s]->getType(), unalign));
-        Value *t1 = Builder.CreateShl(opVec[s + 1], ConstantInt::get(opVec[s + 1]->getType(), szSplit - unalign));
-        Value *t2 = Builder.CreateOr(t0, t1);
-        result.push_back(t2);
-      }
-      // last element only need lshr
-      result.push_back(Builder.CreateLShr(opVec[elemNum-1], ConstantInt::get(opVec[elemNum - 1]->getType(), unalign)));
-
-      for (unsigned s = 0; s < elemShift; s++) {
-        result.push_back(ConstantInt::getSigned(opVec[0]->getType(), 0));
-      }
-      valueMap.insert(std::make_pair(p, result));
-    }
-  }
-
-  void Legalize::legalizeAnd(IRBuilder<> &Builder, Instruction *p) {
-    Value *op0 = p->getOperand(0);
-    Value *op1 = p->getOperand(1);
-
-    if ((isa<UndefValue>(op0) || isa<UndefValue>(op1))) {
-      // I meet some special case as below:
-      //   %82 = zext i32 %81 to i512
-      //   %mask148 = and i512 undef, -4294967296
-      //   %ins149 = or i512 %mask148, %82
-      // I don't know how to split this kind of i512 instruction in a good way,
-      // to simplify the situation, I directly optimize it to zero.
-      // And in later instructions like and/or/shr... that operates on
-      // the value can be optimized.
-      p->replaceAllUsesWith(ConstantInt::get(p->getType(), 0));
-      return;
-    }
-
-    if ((isa<ConstantInt>(op0) && dyn_cast<ConstantInt>(op0)->isZero())
-       || (isa<ConstantInt>(op1) && dyn_cast<ConstantInt>(op1)->isZero())) {
-      // zero & anyValue  ==> zero
-      p->replaceAllUsesWith(ConstantInt::get(p->getType(), 0));
-      return;
-    }
-
-    if (isa<ConstantInt>(op0)) {
-      op0 = p->getOperand(1);
-      op1 = p->getOperand(0);
-    }
-
-    ValueMapIter iter = valueMap.find(op0);
-    SmallVector<Value*, 16> v0 = iter->second;
-    SmallVector<Value*, 16> v1;
-    SmallVector<Value*, 16> v2;
-
-    if (isa<ConstantInt>(op1)) {
-      splitConstantInt(dyn_cast<ConstantInt>(op1), v0[0]->getType(), v1);
-    } else {
-      v1 = valueMap.find(op1)->second;
-    }
-
-    for (unsigned i = 0; i < v0.size(); i++) {
-      ConstantInt *c0 = NULL, *c1 = NULL;
-      if (isa<ConstantInt>(v0[i])) c0 = dyn_cast<ConstantInt>(v0[i]);
-      if (isa<ConstantInt>(v1[i])) c1 = dyn_cast<ConstantInt>(v1[i]);
-
-      if ((c0 &&c0->isZero()) || (c1 && c1->isZero())) {
-        // zero & anyvalue ==> zero
-        v2.push_back(ConstantInt::get(v0[i]->getType(), 0));
-      } else if (c0 && c0->isMinusOne()) {
-        // 1111s & anyvalue ==> anyvalue
-        v2.push_back(v1[i]);
-      } else if (c1 && c1->isMinusOne()) {
-        // 1111s & anyvalue ==> anyvalue
-        v2.push_back(v0[i]);
-      } else {
-        v2.push_back(Builder.CreateAnd(v0[i], v1[i]));
-      }
-    }
-    valueMap.insert(std::make_pair(p, v2));
-  }
-
-  void Legalize::legalizeOr(IRBuilder<> &Builder, Instruction *p) {
-    Value *op0 = p->getOperand(0);
-    Value *op1 = p->getOperand(1);
-
-    if (isa<ConstantInt>(op0)) {
-      op0 = p->getOperand(1);
-      op1 = p->getOperand(0);
-    }
-
-    if (isa<ConstantInt>(op1) && dyn_cast<ConstantInt>(op1)->isZero()) {
-      ValueMapIter iter = valueMap.find(op0);
-      valueMap.insert(std::make_pair(p, iter->second));
-      return;
-    }
-
-    ValueMapIter iter = valueMap.find(op0);
-    SmallVector<Value*, 16> v0 = iter->second;
-    SmallVector<Value*, 16> v1;
-    SmallVector<Value*, 16> v2;
-
-    if (isa<ConstantInt>(op1)) {
-      splitConstantInt(dyn_cast<ConstantInt>(op1), v0[0]->getType(), v1);
-    } else {
-      v1 = valueMap.find(op1)->second;
-    }
-
-    for (unsigned i = 0; i < v0.size(); i++) {
-      ConstantInt *c0 = NULL, *c1 = NULL;
-      if (isa<ConstantInt>(v0[i])) c0 = dyn_cast<ConstantInt>(v0[i]);
-      if (isa<ConstantInt>(v1[i])) c1 = dyn_cast<ConstantInt>(v1[i]);
-
-      if ((c0 &&c0->isZero())) {
-        // zero | anyvalue ==> anyvalue
-        v2.push_back(v1[i]);
-      } else if (c1 && c1->isZero()) {
-        // zero | anyvalue ==> anyvalue
-        v2.push_back(v0[i]);
-      } else if (c0 && c0->isMinusOne()) {
-        // 1111 | anyvalue ==> 1111
-        v2.push_back(c0);
-      } else if (c1 && c1->isMinusOne()) {
-        // 1111 | anyvalue ==> 1111
-        v2.push_back(c1);
-      } else {
-        v2.push_back(Builder.CreateOr(v0[i], v1[i]));
-      }
-    }
-    valueMap.insert(std::make_pair(p, v2));
-  }
-
-  void Legalize::legalizeXor(IRBuilder<> &Builder, Instruction *p) {
-    Value *op0 = p->getOperand(0);
-    Value *op1 = p->getOperand(1);
-
-    if (isa<ConstantInt>(op0)) {
-      op0 = p->getOperand(1);
-      op1 = p->getOperand(0);
-    }
-
-    ValueMapIter iter = valueMap.find(op0);
-    SmallVector<Value*, 16> v0 = iter->second;
-    SmallVector<Value*, 16> v1;
-    SmallVector<Value*, 16> v2;
-
-    if (isa<ConstantInt>(op1)) {
-      splitConstantInt(dyn_cast<ConstantInt>(op1), v0[0]->getType(), v1);
-    } else {
-      v1 = valueMap.find(op1)->second;
-    }
-
-    for (unsigned i = 0; i < v0.size(); i++) {
-      v2.push_back(Builder.CreateXor(v0[i], v1[i]));
-    }
-    valueMap.insert(std::make_pair(p, v2));
-  }
-
-  void Legalize::legalizeBitCast(IRBuilder<> &Builder, Instruction *p) {
-    SmallVector<Value*, 16> split;
-    Type *dstTy = p->getType();
-    Type *srcTy = dyn_cast<CastInst>(p)->getSrcTy();
-
-    if(srcTy->isVectorTy()) {
-      VectorType *vecTy = dyn_cast<VectorType>(srcTy);
-      Type *splitTy = vecTy->getElementType();
-      unsigned elements = srcTy->getPrimitiveSizeInBits()/splitTy->getPrimitiveSizeInBits();
-      // bitcast large integer from vector, so we do extractElement to get split integer
-      unsigned splitSz = splitTy->getPrimitiveSizeInBits();
-      Value *src = p->getOperand(0);
-      // if it is cast from <4 x float> to i128
-      // we cast <4 x float> to <4 x i32> first
-      if (!splitTy->isIntegerTy())
-        src = Builder.CreateBitCast(src, VectorType::get(IntegerType::get(p->getContext(), splitSz), elements));
-
-      for (unsigned i = 0; i < elements; i++) {
-        Value *NV = Builder.CreateExtractElement(src,
-                      ConstantInt::get(IntegerType::get(p->getContext(), 32), i));
-        split.push_back(NV);
-      }
-      valueMap.insert(std::make_pair(p, split));
-    } else if (dstTy->isVectorTy()) {
-      //bitcast from large integer to vector, so we do insertElement to build the vector
-      ValueMapIter iter = valueMap.find(p->getOperand(0));
-      SmallVectorImpl<Value*> &opVec = iter->second;
-      Type *splitTy = opVec[0]->getType();
-      GBE_ASSERT(dstTy->getPrimitiveSizeInBits() % splitTy->getPrimitiveSizeInBits() == 0);
-      GBE_ASSERT(dstTy->getPrimitiveSizeInBits() / splitTy->getPrimitiveSizeInBits() == opVec.size());
-      Value *vec = NULL;
-      Type *idxTy = IntegerType::get(p->getContext(), 32);
-      for (unsigned i = 0; i < opVec.size(); ++i) {
-        Value *tmp = vec ? vec : UndefValue::get(VectorType::get(splitTy, opVec.size()));
-        Value *idx = ConstantInt::get(idxTy, i);
-        vec = Builder.CreateInsertElement(tmp, opVec[i], idx);
-      }
-      Type *elemTy = cast<VectorType>(dstTy)->getElementType();
-      if (elemTy == opVec[0]->getType())
-        p->replaceAllUsesWith(vec);
-      else {
-        Value *newVec = Builder.CreateBitCast(vec, dstTy);
-        p->replaceAllUsesWith(newVec);
-      }
-    } else {
-      p->dump(); GBE_ASSERT(0 && "Unsupported bitcast");
-    }
-  }
-
-  void Legalize::legalizeTrunc(IRBuilder<> &Builder, Instruction *p) {
-    Type *dstTy = p->getType();
-
-    ValueMapIter iter = valueMap.find(p->getOperand(0));
-    SmallVector<Value*, 16> &opVec = iter->second;
-    unsigned szSplit = opVec[0]->getType()->getPrimitiveSizeInBits();
-    unsigned szResult = dstTy->getPrimitiveSizeInBits();
-
-    if(szResult > szSplit) {
-      // the needed bits is larger than what is already split,
-      // we have to merge the split Value, use Shl/Or to do it.
-      int endIdx = (szResult + szSplit-1 )/szSplit;
-      Value * prev = ConstantInt::get(dstTy, 0);
-      for (int i = endIdx - 1; i >=0; i--) {
-        Value * res = Builder.CreateZExt(opVec[i], dstTy);
-        if (i > 0)
-          res = Builder.CreateShl(res, i*szSplit);
-        prev = Builder.CreateOr(res, prev);
-      }
-      Value *newValue = Builder.CreateTrunc(prev, dstTy);
-      p->replaceAllUsesWith(newValue);
-    } else if (szResult == szSplit) {
-      // same bit width, should use bitcast instead of trunc.
-      Value *newValue = Builder.CreateBitCast(opVec[0], dstTy);
-      p->replaceAllUsesWith(newValue);
-    } else {
-      // normal case, trunc to a shorter bit width
-      Value *newValue = Builder.CreateTrunc(opVec[0], dstTy);
-      p->replaceAllUsesWith(newValue);
-    }
-  }
-
-  void Legalize::legalizeZExt(IRBuilder<> &Builder, Instruction *p) {
-    SmallVector<Value*, 16> split;
-    Type *dstTy = dyn_cast<CastInst>(p)->getDestTy();
-    Type *srcTy = p->getOperand(0)->getType();
-    int elements = dstTy->getPrimitiveSizeInBits() / srcTy->getPrimitiveSizeInBits();
-
-    split.push_back(p->getOperand(0));
-    for (int i = 0; i < elements - 1; i++)
-      split.push_back(ConstantInt::getSigned(srcTy, 0));
-
-    valueMap.insert(std::make_pair(p, split));
-  }
-
-  bool Legalize::legalizeFunction(Function &F) {
-    bool changed = false;
-
-    typedef ReversePostOrderTraversal<Function*> RPOTType;
-    RPOTType rpot(&F);
-
-    for (RPOTType::rpo_iterator bb = rpot.begin(), bbE = rpot.end(); bb != bbE; ++bb) {
-      IRBuilder<> Builder(*bb);
-      for (BasicBlock::iterator it = (*bb)->begin(), itE = (*bb)->end(); it != itE; ++it) {
-        Instruction *insn = it;
-        Type *ty = insn->getType();
-        if(ty->isIntegerTy() && ty->getIntegerBitWidth() > 64) {
-          // result is large integer, push back itself and its users
-          changed = true;
-
-          processed.insert(insn);
-
-          for(Value::use_iterator iter = insn->use_begin(); iter != insn->use_end(); ++iter) {
-            // After LLVM 3.5, use_iterator points to 'Use' instead of 'User', which is more straightforward.
-          #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
-            User *theUser = *iter;
-          #else
-            User *theUser = iter->getUser();
-          #endif
-            processed.insert(theUser);
-          }
-        }
-
-        if(processed.empty() || processed.find(insn) == processed.end())
-          continue;
-
-        Builder.SetInsertPoint(insn);
-        switch(insn->getOpcode()) {
-          default: { insn->dump(); GBE_ASSERT(false && "Illegal instruction\n"); break;}
-          case Instruction::PHI:
-            legalizePHI(Builder, insn);
-            break;
-          case Instruction::Select:
-            legalizeSelect(Builder, insn);
-            break;
-          case Instruction::ICmp:
-            legalizeICmp(Builder, insn);
-            break;
-
-          case Instruction::Shl:
-            legalizeShl(Builder, insn);
-            break;
-
-          case Instruction::LShr:
-            legalizeLShr(Builder, insn);
-            break;
-
-          case Instruction::And:
-            legalizeAnd(Builder, insn);
-            break;
-
-          case Instruction::Or:
-            legalizeOr(Builder, insn);
-            break;
-
-          case Instruction::Xor:
-            legalizeXor(Builder, insn);
-            break;
-
-          case Instruction::BitCast:
-            legalizeBitCast(Builder, insn);
-            break;
-
-          case Instruction::Trunc:
-            legalizeTrunc(Builder, insn);
-            break;
-
-          case Instruction::ZExt:
-            legalizeZExt(Builder, insn);
-            break;
-        }
-      }
-    }
-
-    for (Value *v : processed) {
-      if (isa<Instruction>(v)) {
-        dyn_cast<Instruction>(v)->dropAllReferences();
-      }
-    }
-
-    for (Value *v : processed) {
-      if (isa<Instruction>(v)) {
-        dyn_cast<Instruction>(v)->eraseFromParent();
-      }
-    }
-
-    processed.clear();
-    valueMap.clear();
-    incompletePHIs.clear();
-    return changed;
-  }
-
-  FunctionPass* createLegalizePass() {
-    return new Legalize();
-  }
-  char Legalize::ID = 0;
-};
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index ab8e1a0..2f85443 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -552,7 +552,6 @@ error:
     return true;
   }
 
-
   bool PrintfParser::runOnFunction(llvm::Function &F)
   {
     bool changed = false;
@@ -565,6 +564,7 @@ error:
 #else
       case CallingConv::C:
       case CallingConv::Fast:
+      case CallingConv::SPIR_KERNEL:
 #endif
         break;
       default:
@@ -582,6 +582,15 @@ error:
 
     builder = new IRBuilder<>(module->getContext());
 
+    llvm::GlobalValue* gFun = module->getNamedValue("printf");
+    if(gFun) {
+      gFun->setName("__gen_ocl_printf_stub");
+    }
+    llvm::GlobalValue* gFun2 = module->getNamedValue("puts");
+    if(gFun2 ) {
+      gFun2->setName("__gen_ocl_puts_stub");
+    }
+
     /* First find printfs and caculate all slots size of one loop. */
     for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
       for (BasicBlock::iterator instI = B->begin(),
@@ -595,13 +604,13 @@ error:
           continue;
         }
 
-        if (call->getCalledFunction()->getIntrinsicID() != 0)
+        if (call->getCalledFunction() && call->getCalledFunction()->getIntrinsicID() != 0)
           continue;
 
         Value *Callee = call->getCalledValue();
         const std::string fnName = Callee->getName();
 
-        if (fnName != "__gen_ocl_printf_stub")
+        if (fnName != "__gen_ocl_printf_stub" && fnName != "__gen_ocl_puts_stub")
           continue;
 
         if (!parseOnePrintfInstruction(call, pInfo, sizeof_size)) {
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 694b008..bc985c6 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -291,8 +291,17 @@ namespace gbe {
 
   bool Scalarize::IsPerComponentOp(const Instruction* inst)
   {
-    //if (const IntrinsicInst* intr = dyn_cast<const IntrinsicInst>(inst))
-    //    return IsPerComponentOp(intr);
+    if (const IntrinsicInst* intr = dyn_cast<const IntrinsicInst>(inst))
+    {
+        const Intrinsic::ID intrinsicID = (Intrinsic::ID) intr->getIntrinsicID();
+        switch (intrinsicID) {
+          default: return false;
+          case Intrinsic::sqrt:
+          case Intrinsic::ceil:
+          case Intrinsic::trunc:
+              return true;
+        }
+    }
 
     if (inst->isTerminator())
         return false;
@@ -437,13 +446,17 @@ namespace gbe {
       // assumption. This is due to how getDeclaration operates; it only takes
       // a list of types that fit overloadable slots.
       SmallVector<Type*, 8> tys(1, GetBasicType(inst->getType()));
+
       // Call instructions have the decl as a last argument, so skip it
+      SmallVector<Value*, 8> _args;
+
       for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end() - 1; i != e; ++i) {
         tys.push_back(GetBasicType((*i)->getType()));
+        _args.push_back(*i);
       }
 
       Function* f = Intrinsic::getDeclaration(module, intr->getIntrinsicID(), tys);
-      return CallInst::Create(f, args);
+      return CallInst::Create(f, _args);
     }
 
     NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unsupported instruction: ", inst->getOpcode(),
@@ -646,7 +659,18 @@ namespace gbe {
   bool Scalarize::scalarizeFuncCall(CallInst* call) {
     if (Function *F = call->getCalledFunction()) {
       if (F->getIntrinsicID() != 0) {   //Intrinsic functions
-        NOT_IMPLEMENTED;
+        const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
+
+        switch (intrinsicID) {
+          default: GBE_ASSERTM(false, "Unsupported Intrinsic");
+          case Intrinsic::sqrt:
+          case Intrinsic::ceil:
+          case Intrinsic::trunc:
+          {
+            scalarizePerComponent(call);
+          }
+          break;
+        }
       } else {
         Value *Callee = call->getCalledValue();
         const std::string fnName = Callee->getName();
@@ -791,6 +815,7 @@ namespace gbe {
 #else
     case CallingConv::C:
     case CallingConv::Fast:
+    case CallingConv::SPIR_KERNEL:
 #endif
       break;
     default:
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index a4ce4a2..891f2a1 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -61,7 +61,8 @@
 #include "sys/cvar.hpp"
 #include "sys/platform.hpp"
 #include "ir/unit.hpp"
-#include "ir/structural_analysis.hpp"
+#include "ir/function.hpp"
+#include "ir/structurizer.hpp"
 
 #include <clang/CodeGen/CodeGenAction.h>
 
@@ -74,6 +75,7 @@ namespace gbe
 {
   BVAR(OCL_OUTPUT_CFG, false);
   BVAR(OCL_OUTPUT_CFG_ONLY, false);
+  BVAR(OCL_OUTPUT_CFG_GEN_IR, false);
   using namespace llvm;
 
   void runFuntionPass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL)
@@ -158,7 +160,7 @@ namespace gbe
     MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
     MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
     MPM.add(createLoopDeletionPass());          // Delete dead loops
-    MPM.add(createLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
+    MPM.add(createLoopUnrollPass(1024)); //1024, 32, 1024, 512)); //Unroll loops
     if(optLevel > 0) {
       MPM.add(createSROAPass(/*RequiresDomTree*/ false));
       MPM.add(createGVNPass());                 // Remove redundancies
@@ -312,12 +314,15 @@ namespace gbe
     ir::Unit::FunctionSet::const_iterator iter = fs.begin();
     while(iter != fs.end())
     {
-      analysis::ControlTree *ct = new analysis::ControlTree(iter->second);
-      ct->analyze();
-      delete ct;
+      ir::CFGStructurizer *structurizer = new ir::CFGStructurizer(iter->second);
+      structurizer->StructurizeBlocks();
+      delete structurizer;
+      if (OCL_OUTPUT_CFG_GEN_IR)
+        iter->second->outputCFG();
       iter++;
     }
 
+
     delete libraryInfo;
     return true;
   }
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 9a2bd77..3e43a21 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -11,10 +11,12 @@ set (benchmark_sources
   ../utests/utest_file_map.cpp
   ../utests/utest_helper.cpp
   ../utests/vload_bench.cpp
-  enqueue_copy_buf.cpp
+  benchmark_copy_buf.cpp
   benchmark_use_host_ptr_buffer.cpp
   benchmark_read_buffer.cpp
-  benchmark_read_image.cpp)
+  benchmark_read_image.cpp
+  benchmark_copy_buffer_to_image.cpp
+  benchmark_copy_image_to_buffer.cpp)
 
 
 SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
diff --git a/benchmark/benchmark_copy_buf.cpp b/benchmark/benchmark_copy_buf.cpp
new file mode 100644
index 0000000..e21c936
--- /dev/null
+++ b/benchmark/benchmark_copy_buf.cpp
@@ -0,0 +1,51 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+double benchmark_copy_buf(void)
+{
+  size_t i;
+  const size_t sz = 127 *1023 * 1023;
+  const size_t cb = sz;
+  size_t src_off =0, dst_off = 0;
+  struct timeval start,stop;
+
+  cl_char* buf0;
+
+  OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+  buf0 = (cl_char *)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 0, sizeof(char), 0, NULL, NULL, NULL);
+
+  for (i=0; i < sz; i++) {
+    buf0[i]=(rand() & 0xFF);
+  }
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf0, 0, NULL, NULL);
+
+  if (src_off + cb > sz || dst_off + cb > sz) {
+    /* Expect Error. */
+    OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+          src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+  }
+
+  /* Internal kernel will be built for the first time of calling
+   * clEnqueueCopyBuffer, so the first execution time of clEnqueueCopyBuffer
+   * will be much longer. It should not be added to benchmark time. */
+  OCL_ASSERT(CL_SUCCESS == clEnqueueCopyBuffer(queue, buf[0], buf[1],
+        src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+  OCL_FINISH();
+  gettimeofday(&start,0);
+
+  for (i=0; i<100; i++) {
+    OCL_ASSERT(CL_SUCCESS == clEnqueueCopyBuffer(queue, buf[0], buf[1],
+          src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+  }
+  OCL_FINISH();
+
+  gettimeofday(&stop,0);
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(sz * sizeof(char) * 100, elapsed);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buf);
diff --git a/benchmark/benchmark_copy_buffer_to_image.cpp b/benchmark/benchmark_copy_buffer_to_image.cpp
new file mode 100644
index 0000000..2177cfe
--- /dev/null
+++ b/benchmark/benchmark_copy_buffer_to_image.cpp
@@ -0,0 +1,66 @@
+#include <string.h>
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+#define IMAGE_BPP 2
+
+double benchmark_copy_buffer_to_image(void)
+{
+  struct timeval start,stop;
+  const size_t w = 960 * 4;
+  const size_t h = 540 * 4;
+  const size_t sz = IMAGE_BPP * w * h;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup image and buffer
+  buf_data[0] = (unsigned short*) malloc(sz);
+  for (uint32_t i = 0; i < w*h; ++i) {
+    ((unsigned short*)buf_data[0])[i] = (rand() & 0xffff);
+  }
+
+  format.image_channel_order = CL_R;
+  format.image_channel_data_type = CL_UNSIGNED_INT16;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, sz, buf_data[0]);
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+
+  /*copy buffer to image*/
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, 1};
+
+  OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  OCL_FINISH();
+  OCL_MAP_BUFFER_GTT(1);
+  /*check result*/
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+    {
+      OCL_ASSERT(((unsigned short*)buf_data[0])[j * w + i] == ((unsigned short*)buf_data[1])[j * w + i]);
+    }
+  OCL_UNMAP_BUFFER_GTT(1);
+  gettimeofday(&start,0);
+
+  for (uint32_t i=0; i<100; i++) {
+    OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  }
+  OCL_FINISH();
+
+  gettimeofday(&stop,0);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(sz * 100, elapsed);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buffer_to_image);
diff --git a/benchmark/benchmark_copy_image_to_buffer.cpp b/benchmark/benchmark_copy_image_to_buffer.cpp
new file mode 100644
index 0000000..debed09
--- /dev/null
+++ b/benchmark/benchmark_copy_image_to_buffer.cpp
@@ -0,0 +1,64 @@
+#include <string.h>
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+#define IMAGE_BPP 2
+
+double benchmark_copy_image_to_buffer(void)
+{
+  struct timeval start,stop;
+  const size_t w = 960 * 4;
+  const size_t h = 540 * 4;
+  const size_t sz = IMAGE_BPP * w * h;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup image and buffer
+  buf_data[0] = (unsigned short*) malloc(sz);
+  for (uint32_t i = 0; i < w*h; ++i) {
+    ((unsigned short*)buf_data[0])[i] = (rand() & 0xffff);
+  }
+
+  format.image_channel_order = CL_R;
+  format.image_channel_data_type = CL_UNSIGNED_INT16;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = desc.image_width * IMAGE_BPP;
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, sz, NULL);
+
+  /*copy image to buffer*/
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, 1};
+
+  OCL_CALL (clEnqueueCopyImageToBuffer, queue, buf[0], buf[1], origin, region,
+            0, 0, NULL, NULL);
+  OCL_FINISH();
+  OCL_MAP_BUFFER(1);
+  /*check result*/
+  for (uint32_t i = 0; i < w*h; ++i) {
+    OCL_ASSERT(((unsigned short *)buf_data[0])[i] == ((unsigned short *)buf_data[1])[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+  gettimeofday(&start,0);
+
+  for (uint32_t i=0; i<100; i++) {
+    OCL_CALL (clEnqueueCopyImageToBuffer, queue, buf[0], buf[1], origin, region,
+            0, 0, NULL, NULL);
+  }
+  OCL_FINISH();
+
+  gettimeofday(&stop,0);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(sz * 100, elapsed);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_image_to_buffer);
diff --git a/benchmark/benchmark_read_buffer.cpp b/benchmark/benchmark_read_buffer.cpp
index 31a1f59..431f42a 100644
--- a/benchmark/benchmark_read_buffer.cpp
+++ b/benchmark/benchmark_read_buffer.cpp
@@ -1,7 +1,7 @@
 #include "utests/utest_helper.hpp"
 #include <sys/time.h>
 
-int benchmark_read_buffer(void)
+double benchmark_read_buffer(void)
 {
   struct timeval start,stop;
 
@@ -43,7 +43,9 @@ int benchmark_read_buffer(void)
   free(buf_data[0]);
   buf_data[0] = NULL;
 
-  return time_subtract(&stop, &start, 0);
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(sz * sizeof(float) * 2 * 100, elapsed);
 }
 
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_buffer);
diff --git a/benchmark/benchmark_read_image.cpp b/benchmark/benchmark_read_image.cpp
index 48aa987..e3aa5bd 100644
--- a/benchmark/benchmark_read_image.cpp
+++ b/benchmark/benchmark_read_image.cpp
@@ -2,7 +2,7 @@
 #include "utests/utest_helper.hpp"
 #include <sys/time.h>
 
-int benchmark_read_image(void)
+double benchmark_read_image(void)
 {
   struct timeval start,stop;
 
@@ -61,7 +61,9 @@ int benchmark_read_image(void)
   free(buf_data[0]);
   buf_data[0] = NULL;
 
-  return time_subtract(&stop, &start, 0);
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(sz * sizeof(float) * 2 * 100, elapsed);
 }
 
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_image);
diff --git a/benchmark/benchmark_use_host_ptr_buffer.cpp b/benchmark/benchmark_use_host_ptr_buffer.cpp
index 80b6c5c..9e3d155 100644
--- a/benchmark/benchmark_use_host_ptr_buffer.cpp
+++ b/benchmark/benchmark_use_host_ptr_buffer.cpp
@@ -1,7 +1,7 @@
 #include "utests/utest_helper.hpp"
 #include <sys/time.h>
 
-int benchmark_use_host_ptr_buffer(void)
+double benchmark_use_host_ptr_buffer(void)
 {
   struct timeval start,stop;
 
@@ -32,7 +32,9 @@ int benchmark_use_host_ptr_buffer(void)
   free(buf_data[0]);
   buf_data[0] = NULL;
 
-  return time_subtract(&stop, &start, 0);
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(n*sizeof(uint32_t)*100*2, elapsed);
 }
 
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_use_host_ptr_buffer);
diff --git a/benchmark/enqueue_copy_buf.cpp b/benchmark/enqueue_copy_buf.cpp
deleted file mode 100644
index f012cf7..0000000
--- a/benchmark/enqueue_copy_buf.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#include "utests/utest_helper.hpp"
-#include <sys/time.h>
-
-void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
-{
-  unsigned int i;
-  cl_char* buf0;
-
-  OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
-
-  buf0 = (cl_char *)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 0, sizeof(char), 0, NULL, NULL, NULL);
-
-  for (i=0; i < sz; i++) {
-    buf0[i]=(rand() & 0xFF);
-  }
-
-  clEnqueueUnmapMemObject(queue, buf[0], buf0, 0, NULL, NULL);
-
-  if (src_off + cb > sz || dst_off + cb > sz) {
-  /* Expect Error. */
-    OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
-                 src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
-    return;
-  }
-
-  OCL_ASSERT(CL_SUCCESS == clEnqueueCopyBuffer(queue, buf[0], buf[1],
-    src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
-}
-
-int enqueue_copy_buf(void)
-{
-  size_t i;
-  const size_t sz = 127 *1023 * 1023;
-  struct timeval start,stop;
-
-  gettimeofday(&start,0);
-
-  for (i=0; i<10; i++) {
-    test_copy_buf(sz, 0, 0, sz);
-  }
-
-  gettimeofday(&stop,0);
-  return time_subtract(&stop, &start, 0);
-}
-
-MAKE_BENCHMARK_FROM_FUNCTION(enqueue_copy_buf);
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index 57041ba..9a2b516 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -47,9 +47,9 @@ A simple command to install all the above dependencies for ubuntu or debian is:
 
 **The recommended LLVM/CLANG version is 3.5 and/or 3.6**
 
-Based on our test result, LLVM 3.5 has best pass rate on all the test suites. LLVM 3.6
-has slightly less pass rate(caused by one front end bug at clang 3.6) than 3.5 but has
-better performance about 3% to 8% for different cases and different targets.
+Based on our test result, LLVM 3.5 has best pass rate on all the test suites. Compared
+to LLVM 3.5, LLVM 3.6 has a slightly lower pass rate (caused by one front end bug at clang
+3.6) but has better performance (3% to 8% improvement).
 
 For LLVM 3.3 and 3.4, Beignet still support them, but it may be limited to support the
 build and major functions.
@@ -142,7 +142,7 @@ Supported Targets
 
  * 3rd Generation Intel Core Processors
  * Intel “Bay Trail” platforms with Intel HD Graphics
- * 4th Generation Intel Core Processors, need kernel patch currently, see the "Known Issues" section.
+ * 4th Generation Intel Core Processors "Haswell", need kernel patch currently, see the "Known Issues" section.
  * 5th Generation Intel Core Processors "Broadwell".
 
 Known Issues
@@ -163,31 +163,43 @@ Known Issues
   But this command is a little bit dangerous, as if your kernel really hang, then the gpu will lock up
   forever until a reboot.
 
-* Almost all unit tests fail.
-  There is a known issue in some versions of linux kernel which enable register whitelist feature
-  but miss some necessary registers which are required for beignet. For non-HSW platforms, the
-  problematic version are around 3.15 and 3.16 which have commit f0a346b... but haven't commit
-  c9224f... If it is the case, you can apply c9224f... manually and rebuild the kernel or just
-  disable the parse command by invoke the following command (use Ubuntu as an example):
+* "Beignet: self-test failed" and almost all unit tests fail.
+  Linux 3.15 and 3.16 (commits [f0a346b](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=f0a346bdafaf6fc4a51df9ddf1548fd888f860d8)
+  to [c9224fa](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c9224faa59c3071ecfa2d4b24592f4eb61e57069))
+  enable the register whitelist by default but miss some registers needed
+  for Beignet.
+  
+  This can be fixed by upgrading Linux, or by disabling the whitelist:
 
   `# echo 0 > /sys/module/i915/parameters/enable_cmd_parser`
 
-  For HSW platforms, this issue exists in all linux kernel version after 3.15. We always need
-  to execute the above command.
-
-* Some unit test cases, maybe 20 to 30, fail on 4th Generation (HSW) platform.
-  _The 4th Generation Intel Core Processors's support requires some Linux kernel
-  modification_. You need to apply the patch at:  
-  [https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support](https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support)
+  On Haswell hardware, Beignet 1.0.1 to 1.0.3 also required the
+  above workaround on later Linux versions, but this _should not_ be
+  required in current (after [83f8739](http://cgit.freedesktop.org/beignet/commit/?id=83f8739b6fc4893fac60145326052ccb5cf653dc))
+  git master.
+
+* "Beignet: self-test failed" and 15-30 unit tests fail on 4th Generation (Haswell) hardware.
+  On Haswell, shared local memory (\_\_local) does not work at all on
+  Linux <= 4.0, and requires the i915.enable_ppgtt=2 [boot parameter](https://wiki.ubuntu.com/Kernel/KernelBootParameters)
+  on Linux 4.1.
+  
+  This will be fixed in Linux 4.2; older versions can be fixed with
+  [this patch](https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support).
+  
+  If you do not need \_\_local, you can override the self-test with
+  
+  `export OCL_IGNORE_SELF_TEST=1`
+  
+  but using \_\_local after this may silently give wrong results.
 
 * Precision issue.
   Currently Gen does not provide native support of high precision math functions
   required by OpenCL. We provide a software version to achieve high precision,
-  which you can turn on through
+  which you can turn off through
 
-  `# export OCL_STRICT_CONFORMANCE=1`.
+  `# export OCL_STRICT_CONFORMANCE=0`.
 
-  But be careful, this would make your CL kernel run a little longer.
+  This would lose some precision but gain performance.
 
 * cl\_khr\_gl\_sharing.
   This extension highly depends on mesa support. It seems that mesa would not provide
@@ -274,6 +286,7 @@ Fedora Maintainer:
 * Igor Gnatenko
 
 If I missed any other package maintainers, please feel free to contact the mail list.
+
 How to contribute
 -----------------
 You are always welcome to contribute to this project, just need to subscribe
@@ -288,9 +301,11 @@ by running the beignet's unit test.
 
 Documents for OpenCL application developers
 -------------------------------------------
-- [[Cross compile|Beignet/howto/cross-compiler-howto]]
+- [[Cross compile (yocto)|Beignet/howto/cross-compiler-howto]]
+- [[Work with old system without c++11|Beignet/howto/oldgcc-howto]]
 - [[Kernel Optimization Guide|Beignet/optimization-guide]]
 - [[Libva Buffer Sharing|Beignet/howto/libva-buffer-sharing-howto]]
+- [[V4l2 Buffer Sharing|Beignet/howto/v4l2-buffer-sharing-howto]]
 
 The wiki URL is as below:
 [http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/Beignet/Backend.mdwn b/docs/Beignet/Backend.mdwn
index e4259fb..583e5d2 100644
--- a/docs/Beignet/Backend.mdwn
+++ b/docs/Beignet/Backend.mdwn
@@ -45,13 +45,18 @@ Environment variables are used all over the code. Most important ones are:
   Normally, you don't need to set it, we will select suitable simd width for
   a given kernel. Default value is 16.
 
+- `OCL_OUTPUT_KENERL_SOURCE` `(0 or 1)`. Output the building or compiling kernel's
+  source code.
+
 - `OCL_OUTPUT_GEN_IR` `(0 or 1)`. Output Gen IR (scalar intermediate
   representation) code
 
-- `OCL_OUTPUT_LLVM` `(0 or 1)`. Output LLVM code after the lowering passes
+- `OCL_OUTPUT_LLVM_BEFORE_LINK` `(0 or 1)`. Output LLVM code before llvm link
+
+- `OCL_OUTPUT_LLVM_AFTER_LINK` `(0 or 1)`. Output LLVM code after llvm link
 
-- `OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS` `(0 or 1)`. Output LLVM code before the
-  lowering passes
+- `OCL_OUTPUT_LLVM_AFTER_GEN` `(0 or 1)`. Output LLVM code after the lowering
+  passes, Gen IR is generated based on it.
 
 - `OCL_OUTPUT_ASM` `(0 or 1)`. Output Gen ISA
 
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index 8930be9..7aa8e94 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,16 +1,13 @@
 # News
 
-## Apr 17, 2015
-[Beignet 1.0.3](https://01.org/beignet/downloads/beignet-1.0.3-2015-04-17) is released. This is a bug-fix release.
-
-## Mar 16, 2015
-[Beignet 1.0.2](https://01.org/beignet/downloads/beignet-1.0.2-2015-03-16) is released. This is a bug-fix release.
+## Jul 21, 2015
+[Beignet 1.1.0](https://01.org/beignet/downloads/beignet-1.1.0-2015-07-31) is released. This is a major release. Please see the release notes for more information.
 
 ## Jan 19, 2015
-[Beignet 1.0.1](https://01.org/beignet/downloads/beignet-1.0.1-2015-01-19) is released. This is a bug-fix release.  
+[Beignet 1.0.1](https://01.org/beignet/downloads/beignet-1.0.1-2015-01-19) is released. This is a bug-fix release.
 
 ## Nov 14, 2014
-[Beignet 1.0.0](https://01.org/downloads/beignet-1.0.0-2014-11-14) is released. This is a major release. Please see the release notes for more information.
+[Beignet 1.0.0](https://01.org/beignet/downloads/beignet-1.0.0-2014-11-14) is released. This is a major release. Please see the release notes for more information.
 
 ## Sep 15, 2014
 [Beignet 0.9.3](https://01.org/zh/beignet/downloads/beignet-0.9.3-2014-09-15-0) is released. This is a bug-fix release.
diff --git a/docs/howto/cross-compiler-howto.mdwn b/docs/howto/cross-compiler-howto.mdwn
index 535cd9a..d541816 100644
--- a/docs/howto/cross-compiler-howto.mdwn
+++ b/docs/howto/cross-compiler-howto.mdwn
@@ -7,6 +7,8 @@ and OpenCL kernels for a target machine (embedded/handheld devices) in a
 host machine with the help of cross compiler, and also the large-size-reduced
 Beignet driver package for the target machine.
 
+A key part of the Yocto recipe is shown as an example.
+
 Build Beignet with a cross compiler
 -----------------------------------
 
@@ -17,6 +19,10 @@ configure Beignet with cmake.
   Beignet depends on llvm+clang, this option refers to the path of llvm-config,
   llvm-as, llvm-link and clang in the cross compiler environment.
 
+  Please ensure that llvm-config, llvm-as, llvm-link and clang are in the same
+  directory. Please also make sure 'your_llvm_install_dir/llvm-config --libdir'
+  and 'your_llvm_install_dir/llvm-config --includedir' print the right output.
+
 - CMAKE_SKIP_RPATH
   Some cross compiler systems forbid the usage of rpath in binaries/libraries,
   set this option to be TRUE.
@@ -26,8 +32,30 @@ configure Beignet with cmake.
   of Intel Ivybridge GPU, and 0x0f31 is Intel Baytrail GPU. The information can
   be queried with command 'lspci -n'.
 
-- CMAKE_INSTALL_PREFIX
-  This option controls the prefix of installation path.
+- CMAKE_INSTALL_PREFIX and BEIGNET_INSTALL_DIR
+  These two options control the installation path.
+
+
+To build Beignet with Yocto, the recipe looks like:
+
+S = "${WORKDIR}/git"
+BEIGNET_BUILD_DIR = "${S}/build"
+do_configure() {
+    mkdir -p ${BEIGNET_BUILD_DIR}
+    cd ${BEIGNET_BUILD_DIR}
+    cmake \
+        .. \
+        -DLLVM_INSTALL_DIR=${TMPDIR}/sysroots/baytraili/usr/bin/ \
+        -DCMAKE_INSTALL_PREFIX=/usr/ \
+        -DBEIGNET_INSTALL_DIR=/usr/lib/ \
+        -DCMAKE_SKIP_RPATH=TRUE \
+        -DGEN_PCI_ID=0x0f31
+}
+do_compile() {
+    cd ${BEIGNET_BUILD_DIR}
+    oe_runmake
+}
+
 
 Distribution of large-size-reduced Beignet driver package
 ---------------------------------------------------------
@@ -37,7 +65,48 @@ provide only the OpenCL runtime library without OpenCL compiler, and only the
 executable binary kernel is supported on such devices.
 
 It means that just distribute libcl.so and libgbeinterp.so (~320k in total after strip)
-are enough for OpenCL embeded profile in the target machine.
+are enough for the OpenCL embedded profile in the target machine. The whole Beignet
+driver set can be separated into several packages for different usage.
+
+
+Taking Yocto as an example, the recipe looks like:
+SYSROOT_PREPROCESS_FUNCS += "beignet_sysroot_preprocess"
+beignet_sysroot_preprocess() {
+    install -d ${SYSROOT_DESTDIR}${bindir}
+    install -m 0755 ${BEIGNET_BUILD_DIR}/backend/src/gbe_bin_generater ${SYSROOT_DESTDIR}${bindir}
+}
+do_install() {
+    cd ${BEIGNET_BUILD_DIR}
+    oe_runmake DESTDIR=${D} install
+
+    #install OpenCL offline compiler
+    install -d ${D}${bindir}
+    install -m 0755 ${BEIGNET_BUILD_DIR}/backend/src/gbe_bin_generater ${D}${bindir}
+
+    #install utest
+    install -d ${D}${bindir}/beignet
+    install -d ${D}${bindir}/beignet/include
+    install -m 0755 ${BEIGNET_BUILD_DIR}/utests/utest_run ${D}${bindir}/beignet
+    install -m 0755 ${S}/setenv.sh ${D}${bindir}/beignet
+    install -m 0644 ${S}/kernels/*.cl ${D}${bindir}/beignet
+    install -m 0644 ${S}/kernels/*.bmp ${D}${bindir}/beignet
+    install -m 0644 ${S}/kernels/compiler_ceil.bin ${D}${bindir}/beignet
+    install -m 0644 ${S}/kernels/runtime_compile_link.h ${D}${bindir}/beignet
+    install -m 0644 ${S}/kernels/include/runtime_compile_link_inc.h ${D}${bindir}/beignet/include/runtime_compile_link_inc.h
+    install -d ${D}${libdir}
+    install -m 0644 ${BEIGNET_BUILD_DIR}/utests/libutests.so ${D}${libdir}
+}
+do_install_append() {
+    rm -rf ${D}${sysconfdir}
+}
+PACKAGES += "${PN}-compiler ${PN}-test"
+FILES_${PN} = "${libdir}/libcl.so ${libdir}/libgbeinterp.so"
+FILES_${PN}-compiler = "${bindir}/gbe_bin_generater ${libdir}/libgbe.so ${libdir}/beignet.bc ${libdir}/beignet.pch ${libdir}/ocl_stdlib
+.h ${libdir}/include/"
+FILES_${PN}-dev = "${includedir}/CL/"
+FILES_${PN}-test = "${bindir}/beignet/"
+FILES_${PN}-test += "${libdir}/libutests.so"
+
 
 Build OpenCL kernels with OpenCL offline compiler
 -------------------------------------------------
diff --git a/docs/howto/v4l2-buffer-sharing-howto.mdwn b/docs/howto/v4l2-buffer-sharing-howto.mdwn
new file mode 100644
index 0000000..68d37aa
--- /dev/null
+++ b/docs/howto/v4l2-buffer-sharing-howto.mdwn
@@ -0,0 +1,64 @@
+V4l2 Buffer Sharing HowTo
+=========================
+
+Beignet has the extension clGetMemObjectFdIntel to share GPU buffer objects with v4l2, so users
+can utilize OpenCL to do processing on input/output buffers of a v4l2 device without buffer copy.
+
+Prerequisite
+------------
+
+Linux kernel supports DMABUF buffer sharing for v4l2 from version 3.8. DMABUF buffer
+sharing runs well for V4L2_PIX_FMT_MJPEG format on this version, but there is a bug
+for the V4L2_PIX_FMT_YUYV format. Linux kernel 3.19.0-rc1 fixes this bug, so please use at
+least kernel version 3.19.0-rc1 if you want to utilize this feature for the V4L2_PIX_FMT_YUYV
+format.
+
+Steps
+-----
+
+The below official v4l2 document describes the details of sharing DMA buffers between
+v4l devices and other devices using v4l2 as a DMABUF importer:
+[http://linuxtv.org/downloads/v4l-dvb-apis/dmabuf.html](http://linuxtv.org/downloads/v4l-dvb-apis/dmabuf.html)
+Beignet has added the extension clGetMemObjectFdIntel to support this mechanism. Please follow
+the steps as below to utilize DMABUF buffer sharing between v4l devices and Beignet:
+
+- Get the address of this extension by the function:
+  clGetExtensionFunctionAddress("clGetMemObjectFdIntel")
+
+- Create a number of cl buffer objects, invoke clGetMemObjectFdIntel to get these buffer
+  objects' file descriptors.
+
+- Initiate streaming I/O with DMABUF buffer sharing by calling the VIDIOC_REQBUFS v4l2 ioctl.
+
+- Enqueue these buffers by calling the VIDIOC_QBUF, dequeue a buffer by calling VIDIOC_DQBUF,
+  use OpenCL to do processing on this buffer and re-enqueue...
+
+- Close file descriptors of these buffers by close if your program doesn't need DMABUF buffer
+  sharing anymore.
+
+Sample code
+-----------
+
+We have developed an example showing how to share DMA buffers between webcam and Beignet in
+examples/v4l2_buffer_sharing directory. The webcam directly captures V4L2_PIX_FMT_YUYV frames
+into cl buffer objects by way of DMABUF buffer sharing, then a mirror effect is applied to the
+frames by an OpenCL kernel, and finally they are shown on screen via libva.
+
+Steps to build and run this example:
+
+- Update your linux kernel to at least 3.19.0-rc1.
+
+- Make sure there is a webcam connected to your pc.
+
+- Add option -DBUILD_EXAMPLES=ON to enable building examples when running cmake, such as:
+  `> mkdir build`
+  `> cd build`
+  `> cmake -DBUILD_EXAMPLES=ON ../`
+
+- Build source code:
+  `> make`
+
+- Run:
+  `> cd examples`
+  `> . ../utests/setenv.sh`
+  `> ./example-v4l2_buffer_sharing`
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 904f259..850b3d9 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,21 +1,23 @@
-EXEC_PROGRAM(ls ARGS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva" OUTPUT_VARIABLE LS_OUTPUT)
-IF(NOT LS_OUTPUT)
-EXEC_PROGRAM(git "${CMAKE_CURRENT_SOURCE_DIR}/.." ARGS "submodule init")
-EXEC_PROGRAM(git "${CMAKE_CURRENT_SOURCE_DIR}/.." ARGS "submodule update")
-EXEC_PROGRAM(git "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva" ARGS "checkout master")
-ENDIF(NOT LS_OUTPUT)
-
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
                     ${CMAKE_CURRENT_SOURCE_DIR}/../utests
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include
-                    ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va
-                    ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common
                     ${X11_INCLUDE_DIR})
 
+IF(LIBVA_BUF_SH_DEP OR V4L2_BUF_SH_DEP)
+EXECUTE_PROCESS(COMMAND ls "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva" OUTPUT_VARIABLE LS_RESULT)
+IF ("LS_RESULT" STREQUAL "")
+EXECUTE_PROCESS(COMMAND git submodule init WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+EXECUTE_PROCESS(COMMAND git submodule update WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+EXECUTE_PROCESS(COMMAND git checkout master WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva)
+ENDIF ("LS_RESULT" STREQUAL "")
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va
+                    ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common)
+
 link_directories (${LIBVA_LIBDIR}
                   ${LIBVA-X11_LIBDIR})
 
-set (examples_sources
+set (va_ocl_basic_sources
   ../utests/utest_error.c
   ../utests/utest_assert.cpp
   ../utests/utest_file_map.cpp
@@ -23,13 +25,20 @@ set (examples_sources
   ./thirdparty/libva/test/common/va_display.c
   ./thirdparty/libva/test/common/va_display_x11.c)
 
-
 ADD_DEFINITIONS(-DHAVE_VA_X11)
-ADD_DEFINITIONS(-DINPUT_NV12_DEFAULT="${CMAKE_CURRENT_SOURCE_DIR}/libva_buffer_sharing/256_128.nv12")
 
-ADD_LIBRARY(va_ocl_basic SHARED ${examples_sources})
+ADD_LIBRARY(va_ocl_basic SHARED ${va_ocl_basic_sources})
 
 TARGET_LINK_LIBRARIES(va_ocl_basic cl m va va-x11 ${X11_X11_LIB})
 
+IF(LIBVA_BUF_SH_DEP)
+ADD_DEFINITIONS(-DINPUT_NV12_DEFAULT="${CMAKE_CURRENT_SOURCE_DIR}/libva_buffer_sharing/256_128.nv12")
 ADD_EXECUTABLE(example-libva_buffer_sharing ./libva_buffer_sharing/libva_buffer_sharing.cpp)
 TARGET_LINK_LIBRARIES(example-libva_buffer_sharing va_ocl_basic)
+ENDIF(LIBVA_BUF_SH_DEP)
+
+IF(V4L2_BUF_SH_DEP)
+ADD_EXECUTABLE(example-v4l2_buffer_sharing ./v4l2_buffer_sharing/v4l2_buffer_sharing.cpp)
+TARGET_LINK_LIBRARIES(example-v4l2_buffer_sharing va_ocl_basic)
+ENDIF(V4L2_BUF_SH_DEP)
+ENDIF(LIBVA_BUF_SH_DEP OR V4L2_BUF_SH_DEP)
diff --git a/examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp b/examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp
new file mode 100644
index 0000000..42ab642
--- /dev/null
+++ b/examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp
@@ -0,0 +1,590 @@
+/*
+ ** Copyright (c) 2012, 2015 Intel Corporation. All Rights Reserved.
+ **
+ ** Permission is hereby granted, free of charge, to any person obtaining a
+ ** copy of this software and associated documentation files (the
+ ** "Software"), to deal in the Software without restriction, including
+ ** without limitation the rights to use, copy, modify, merge, publish,
+ ** distribute, sub license, and/or sell copies of the Software, and to
+ ** permit persons to whom the Software is furnished to do so, subject to
+ ** the following conditions:
+ **
+ ** The above copyright notice and this permission notice (including the
+ ** next paragraph) shall be included in all copies or substantial portions
+ ** of the Software.
+ **
+ ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ ** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ ** IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ ** ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ **/
+
+#include <getopt.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <linux/videodev2.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include <inttypes.h>
+#include <ctype.h>
+
+#include <va/va.h>
+#include <va/va_drmcommon.h>
+
+#include "va_display.h"
+#include "utest_helper.hpp"
+
+using namespace std;
+
+#define BUFFER_NUM_DEFAULT 5
+#define VIDEO_NODE_DEFAULT "/dev/video0"
+#define WIDTH_DEFAULT 640
+#define HEIGHT_DEFAULT 480
+
+#define CHECK_VASTATUS(va_status,func)                                  \
+  if (va_status != VA_STATUS_SUCCESS) {                                   \
+    fprintf(stderr, "status = %d, %s: %s(line %d) failed, exit\n",va_status, __func__, func, __LINE__); \
+    exit(1);                                                            \
+  }
+
+#define CHECK_CLSTATUS(status,func)                                  \
+  if (status != CL_SUCCESS) {                                   \
+    fprintf(stderr, "status = %d, %s: %s(line %d) failed, exit\n", status, __func__, func, __LINE__); \
+    exit(1);                                                            \
+  }
+
+#define CHECK_V4L2ERROR(ret, STR)                               \
+  if (ret){                             \
+    fprintf(stderr, STR);            \
+    perror(" ");                            \
+    fprintf(stderr, "ret = %d, %s: %s(line %d) failed, exit\n", ret, __func__, STR, __LINE__);      \
+    exit(1);                                  \
+  }
+
+VADisplay	va_dpy;
+cl_int cl_status;
+VAStatus va_status;
+VASurfaceID nv12_surface_id;
+VAImage nv12_image;
+
+int dev_fd;
+uint64_t image_size;
+unsigned int pitch;
+cl_mem *import_buf = NULL;
+typedef cl_int (OCLGETMEMOBJECTFD)(cl_context, cl_mem, int *);
+OCLGETMEMOBJECTFD *oclGetMemObjectFd = NULL;
+
+int frame_count = 0;
+struct v4l2_options{
+  const char *dev_name;
+  unsigned int width, height;
+  unsigned int spec_res;
+  unsigned int buffer_num;
+  unsigned int do_list;
+} vo;
+int *import_buf_fd = NULL;
+
+static const char short_options[] = "d:r:b:lh";
+
+static const struct option
+long_options[] = {
+  { "device", required_argument, NULL, 'd' },
+  { "help",   no_argument,       NULL, 'h' },
+  { "resolution", required_argument,       NULL, 'r' },
+  { "buffer_num",  required_argument, NULL, 'b' },
+  { "list",  no_argument, NULL, 'l' },
+  { 0, 0, 0, 0 }
+};
+
+static void usage(FILE *fp, int argc, char **argv)
+{
+  fprintf(fp,
+      "This example aims to demostrate the usage of DMABUF buffer sharing between v4l2 and Beignet.\n"
+      "For more details, please read docs/howto/v4l2-buffer-sharing-howto.mdwn.\n"
+      "Usage: %s [options]\n\n"
+      "Options:\n"
+      "-d | --device=<dev>  Specify device by <dev> instead of /dev/video0\n"
+      "-h | --help          Print this message\n"
+      "-r | --resolution=<width,height>    Set image resolution\n"
+      "-b | --buffer_num=<num>  Set number of buffers\n"
+      "-l | --list  List available resolution of format 'V4L2_PIX_FMT_YUYV'\n"
+      "",
+      argv[0]);
+}
+
+static void list_resolution(){
+  int ret;
+  struct v4l2_capability cap;
+  struct v4l2_frmsizeenum frm_sz;
+
+  dev_fd = open(vo.dev_name, O_RDWR | O_NONBLOCK, 0);
+  if (dev_fd < 0) {
+    fprintf(stderr, "Can not open %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+
+  memset(&cap, 0, sizeof(cap));
+  ret = ioctl(dev_fd, VIDIOC_QUERYCAP, &cap);
+  CHECK_V4L2ERROR(ret, "VIDIOC_QUERYCAP");
+
+  if(!(cap.capabilities & V4L2_CAP_VIDEO_CAPTURE)){
+    fprintf(stderr, "The device is not video capture device\n");
+    exit(1);
+  }
+  if(!(cap.capabilities & V4L2_CAP_STREAMING)){
+    fprintf(stderr, "The device does not support streaming i/o\n");
+    exit(1);
+  }
+
+  printf("Supported resolution under pixel format 'V4L2_PIX_FMT_YUYV':\n");
+  frm_sz.pixel_format = V4L2_PIX_FMT_YUYV;
+  frm_sz.index = 0;
+  bool extra_info = true;
+  while (ioctl(dev_fd, VIDIOC_ENUM_FRAMESIZES, &frm_sz) == 0) {
+    if (frm_sz.type == V4L2_FRMSIZE_TYPE_DISCRETE) {
+      if(extra_info){
+        printf("(width, height) = \n");
+        extra_info = false;
+      }
+      printf("(%d, %d)", frm_sz.discrete.width, frm_sz.discrete.height);
+      printf("\n");
+    }
+    else if (frm_sz.type == V4L2_FRMSIZE_TYPE_STEPWISE) {
+      printf("(width, height) from (%d, %d) to (%d, %d) with step (%d, %d)",
+          frm_sz.stepwise.min_width,
+          frm_sz.stepwise.min_height,
+          frm_sz.stepwise.max_width,
+          frm_sz.stepwise.max_height,
+          frm_sz.stepwise.step_width,
+          frm_sz.stepwise.step_height);
+      continue;
+    }
+    frm_sz.index++;
+  }
+
+  ret = close(dev_fd);
+  if (ret) {
+    fprintf(stderr, "Failed to close %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+}
+
+static void analyse_args(int argc, char *argv[])
+{
+  vo.dev_name = NULL;
+  vo.width = 0;
+  vo.height = 0;
+  vo.spec_res = 0;
+  vo.buffer_num = BUFFER_NUM_DEFAULT;
+  vo.do_list = 0;
+
+  int c, idx;
+  for (;;) {
+
+    c = getopt_long(argc, argv,
+        short_options, long_options, &idx);
+
+    if (-1 == c)
+      break;
+
+    switch (c) {
+      case 0:
+        break;
+
+      case 'd':
+        vo.dev_name = optarg;
+        break;
+
+      case '?':
+      case 'h':
+        usage(stdout, argc, argv);
+        exit(0);
+
+      case 'r':
+        sscanf(optarg, "%d,%d", &vo.width, &vo.height);
+        vo.spec_res = 1;
+        break;
+
+      case 'b':
+        vo.buffer_num = strtoul(optarg, NULL, 0);
+        break;
+
+      case 'l':
+        vo.do_list = 1;
+        break;
+
+      default:
+        usage(stderr, argc, argv);
+        exit(1);
+    }
+  }
+
+  if(!vo.dev_name){
+    printf("Haven't specified device, use default device: %s\n",
+        VIDEO_NODE_DEFAULT);
+  }
+  if(!vo.dev_name)
+    vo.dev_name = VIDEO_NODE_DEFAULT;
+  if(vo.do_list){
+    list_resolution();
+    exit(0);
+  }
+  if(!vo.spec_res){
+    printf("Haven't specified resolution, use default resolution: (width,height) = (%d, %d)\n",
+        WIDTH_DEFAULT, HEIGHT_DEFAULT);
+    vo.width = WIDTH_DEFAULT;
+    vo.height = HEIGHT_DEFAULT;
+  }
+  return;
+}
+
+static void initialize_va_ocl(){
+  int major_ver, minor_ver;
+
+  printf("\n***********************libva info: ***********************\n");
+  fflush(stdout);
+  va_dpy = va_open_display();
+  va_status = vaInitialize(va_dpy, &major_ver, &minor_ver);
+  CHECK_VASTATUS(va_status, "vaInitialize");
+
+  VASurfaceAttrib forcc;
+  forcc.type =VASurfaceAttribPixelFormat;
+  forcc.flags=VA_SURFACE_ATTRIB_SETTABLE;
+  forcc.value.type=VAGenericValueTypeInteger;
+  forcc.value.value.i = VA_FOURCC_NV12;
+  va_status = vaCreateSurfaces(va_dpy, VA_RT_FORMAT_YUV420,
+                               vo.width, vo.height,
+                               &nv12_surface_id, 1, &forcc, 1);
+  CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+  VAImageFormat image_fmt;
+  image_fmt.fourcc = VA_FOURCC_NV12;
+  image_fmt.byte_order = VA_LSB_FIRST;
+  image_fmt.bits_per_pixel = 12;
+  va_status = vaCreateImage(va_dpy, &image_fmt, vo.width, vo.height, &nv12_image);
+  CHECK_VASTATUS(va_status, "vaCreateImage");
+
+  //ocl initialization: basic & create kernel & get extension
+  printf("\n***********************OpenCL info: ***********************\n");
+  if ((cl_status = cl_test_init("runtime_yuy2_processing.cl", "runtime_yuy2_processing", SOURCE)) != 0){
+    fprintf(stderr, "cl_test_init error\n");
+    exit(1);
+  }
+
+#ifdef CL_VERSION_1_2
+  oclGetMemObjectFd = (OCLGETMEMOBJECTFD *)clGetExtensionFunctionAddressForPlatform(platform, "clGetMemObjectFdIntel");
+#else
+  oclGetMemObjectFd = (OCLGETMEMOBJECTFD *)clGetExtensionFunctionAddress("clGetMemObjectFdIntel");
+#endif
+  if(!oclGetMemObjectFd){
+    fprintf(stderr, "Failed to get extension clGetMemObjectFdIntel\n");
+    exit(1);
+  }
+  printf("\n***********************************************************\n");
+}
+
+static void create_dmasharing_buffers()
+{
+  if(import_buf_fd == NULL)
+    import_buf_fd = (int *)malloc(sizeof(int) * vo.buffer_num);
+  if(import_buf == NULL){
+    import_buf = (cl_mem *)malloc(sizeof(cl_mem) * vo.buffer_num);
+  }
+
+  for (unsigned int i = 0; i < vo.buffer_num; ++i){
+    import_buf[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE, image_size, NULL, &cl_status);
+    CHECK_CLSTATUS(cl_status, "clCreateBuffer");
+
+    //get cl buffer object's fd
+    cl_status = oclGetMemObjectFd(ctx, import_buf[i], &import_buf_fd[i]);
+    CHECK_CLSTATUS(cl_status, "clGetMemObjectFdIntel");
+  }
+}
+
+static void release_va_ocl(){
+  va_status = vaDestroySurfaces(va_dpy,&nv12_surface_id,1);
+  CHECK_VASTATUS(va_status, "vaDestroySurfaces");
+  va_status = vaDestroyImage(va_dpy, nv12_image.image_id);
+  CHECK_VASTATUS(va_status, "vaDestroyImage");
+  va_status = vaTerminate(va_dpy);
+  CHECK_VASTATUS(va_status, "vaTerminate");
+  va_close_display(va_dpy);
+
+  int ret;
+  for (unsigned int i = 0; i < vo.buffer_num; ++i) {
+    ret = close(import_buf_fd[i]);
+    if (ret) {
+      fprintf(stderr, "Failed to close import_buf[%u]'s fd: %s\n", i, strerror(errno));
+    }
+    cl_status = clReleaseMemObject(import_buf[i]);
+    CHECK_CLSTATUS(cl_status, "clReleaseMemObject");
+  }
+}
+
+static void process_show_frame(int index)
+{
+  //process import_buf[index] by ocl
+  size_t global_size[2];
+  global_size[0] = vo.width * 2 / 4;
+  global_size[1] = vo.height;
+  cl_status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &import_buf[index]);
+  CHECK_CLSTATUS(cl_status, "clSetKernelArg");
+  cl_status = clSetKernelArg(kernel, 1, sizeof(int), &vo.height);
+  CHECK_CLSTATUS(cl_status, "clSetKernelArg");
+  cl_status = clSetKernelArg(kernel, 2, sizeof(int), &pitch);
+  CHECK_CLSTATUS(cl_status, "clSetKernelArg");
+  cl_status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
+                                     global_size, NULL, 0, NULL, NULL);
+  CHECK_CLSTATUS(cl_status, "clEnqueueNDRangeKernel");
+  cl_status = clFinish(queue);
+  CHECK_CLSTATUS(cl_status, "clFinish");
+
+  //create corresponding VASurface
+  VASurfaceID yuy2_surface_id;
+  VASurfaceAttrib sa[2];
+  sa[0].type = VASurfaceAttribMemoryType;
+  sa[0].flags = VA_SURFACE_ATTRIB_SETTABLE;
+  sa[0].value.type = VAGenericValueTypeInteger;
+  sa[0].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
+  sa[1].type = VASurfaceAttribExternalBufferDescriptor;
+  sa[1].flags = VA_SURFACE_ATTRIB_SETTABLE;
+  sa[1].value.type = VAGenericValueTypePointer;
+  VASurfaceAttribExternalBuffers sa_eb;
+  sa_eb.pixel_format = VA_FOURCC_YUY2;
+  sa_eb.width = vo.width;
+  sa_eb.height = vo.height;
+  sa_eb.data_size = image_size;
+  sa_eb.num_planes = 1;
+  sa_eb.pitches[0] = pitch;
+  sa_eb.offsets[0] = 0;
+  sa_eb.num_buffers = 1;
+  sa_eb.buffers = (unsigned long *)malloc(sizeof(unsigned long) * sa_eb.num_buffers);
+  sa_eb.buffers[0] = import_buf_fd[index];
+  sa_eb.flags = 0;
+  sa[1].value.value.p = &sa_eb;
+  va_status = vaCreateSurfaces(va_dpy, VA_RT_FORMAT_YUV422,
+                               vo.width, vo.height,
+                               &yuy2_surface_id, 1, sa, 2);
+  CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+  //convert to NV12 format
+  va_status = vaGetImage (va_dpy, yuy2_surface_id, 0, 0,
+                          vo.width, vo.height, nv12_image.image_id);
+  CHECK_VASTATUS(va_status, "vaGetImage");
+  va_status = vaPutImage(va_dpy, nv12_surface_id, nv12_image.image_id,
+                         0, 0, vo.width, vo.height, 0, 0,
+                         vo.width, vo.height);
+  CHECK_VASTATUS(va_status, "vaPutImage");
+
+  //show by vaPutsurface
+  VARectangle src_rect, dst_rect;
+  src_rect.x      = 0;
+  src_rect.y      = 0;
+  src_rect.width  = vo.width;
+  src_rect.height = vo.height;
+  dst_rect        = src_rect;
+  va_status = va_put_surface(va_dpy, nv12_surface_id, &src_rect, &dst_rect);
+  CHECK_VASTATUS(va_status, "vaPutSurface");
+
+  vaDestroySurfaces(va_dpy,&yuy2_surface_id,1);
+  CHECK_VASTATUS(va_status, "vaDestroySurfaces");
+  free(sa_eb.buffers);
+  return;
+}
+
+static void init_dmabuf(void){
+  int ret;
+  struct v4l2_requestbuffers reqbuf;
+
+  memset(&reqbuf, 0, sizeof(reqbuf));
+  reqbuf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  reqbuf.memory = V4L2_MEMORY_DMABUF;
+  reqbuf.count = vo.buffer_num;
+
+  ret = ioctl(dev_fd, VIDIOC_REQBUFS, &reqbuf);
+  if(ret == -1 && errno == EINVAL){
+    fprintf(stderr, "Video capturing or DMABUF streaming is not supported\n");
+    exit(1);
+  }
+  else
+    CHECK_V4L2ERROR(ret, "VIDIOC_REQBUFS");
+
+  create_dmasharing_buffers();
+  printf("Succeed to create %d dma buffers \n", vo.buffer_num);
+
+}
+
+static void init_device(void){
+
+  int ret;
+  struct v4l2_capability cap;
+  struct v4l2_format format;
+
+  dev_fd = open(vo.dev_name, O_RDWR | O_NONBLOCK, 0);
+  if (dev_fd < 0) {
+    fprintf(stderr, "Can not open %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+
+  memset(&cap, 0, sizeof(cap));
+  ret = ioctl(dev_fd, VIDIOC_QUERYCAP, &cap);
+  CHECK_V4L2ERROR(ret, "VIDIOC_QUERYCAP");
+  if(!(cap.capabilities & V4L2_CAP_STREAMING)){
+    fprintf(stderr, "The device does not support streaming i/o\n");
+    exit(1);
+  }
+
+  memset(&format, 0, sizeof(format));
+  format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  format.fmt.pix.width = vo.width;
+  format.fmt.pix.height = vo.height;
+  format.fmt.pix.pixelformat = V4L2_PIX_FMT_YUYV;
+  format.fmt.pix.field = V4L2_FIELD_ANY;
+
+  ret = ioctl(dev_fd, VIDIOC_S_FMT, &format);
+  CHECK_V4L2ERROR(ret, "VIDIOC_S_FMT");
+
+  ret = ioctl(dev_fd, VIDIOC_G_FMT, &format);
+  CHECK_V4L2ERROR(ret, "VIDIOC_G_FMT");
+  if(format.fmt.pix.pixelformat != V4L2_PIX_FMT_YUYV){
+    fprintf(stderr, "V4L2_PIX_FMT_YUYV format is not supported by %s\n", vo.dev_name);
+    exit(1);
+  }
+  if(format.fmt.pix.width != vo.width  || format.fmt.pix.height != vo.height){
+    fprintf(stderr, "This resolution is not supported, please go through supported resolution by command './main -l'\n");
+    exit(1);
+  }
+  printf("Input image format: (width, height) = (%u, %u), pixel format = %.4s\n",
+      format.fmt.pix.width, format.fmt.pix.height, (char*)&format.fmt.pix.pixelformat);
+  image_size = format.fmt.pix.sizeimage;
+	pitch = format.fmt.pix.bytesperline;
+}
+
+static void start_capturing(void){
+  int ret;
+  for (unsigned int i = 0; i < vo.buffer_num; ++i) {
+    struct v4l2_buffer buf;
+
+    memset(&buf, 0, sizeof(buf));
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_DMABUF;
+    buf.index = i;
+    buf.m.fd = import_buf_fd[i];
+    ret = ioctl(dev_fd, VIDIOC_QBUF, &buf);
+    CHECK_V4L2ERROR(ret, "VIDIOC_QBUF");
+  }
+
+  int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  ret = ioctl(dev_fd, VIDIOC_STREAMON, &type);
+  CHECK_V4L2ERROR(ret, "VIDIOC_STREAMON");
+}
+
+static void mainloop(void){
+  int ret;
+  struct v4l2_buffer buf;
+  int index;
+
+  while (1) {
+    frame_count++;
+    printf("******************Frame %d\n", frame_count);
+    fd_set fds;
+    struct timeval tv;
+    int r;
+
+    FD_ZERO(&fds);
+    FD_SET(dev_fd, &fds);
+
+    /* Timeout. */
+    tv.tv_sec = 2;
+    tv.tv_usec = 0;
+
+
+    r = select(dev_fd + 1, &fds, NULL, NULL, &tv);
+
+    if (-1 == r) {
+      if (EINTR == errno)
+        continue;
+      perror("select");
+    }
+
+    if(r == 0){
+      fprintf(stderr, "Select timeout\n");
+      exit(1);
+    }
+
+    memset(&buf, 0, sizeof(buf));
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_DMABUF;
+    ret = ioctl(dev_fd, VIDIOC_DQBUF, &buf);
+    CHECK_V4L2ERROR(ret, "VIDIOC_DQBUF");
+    index = buf.index;
+
+    //process by ocl and show on screen by libva
+    process_show_frame(index);
+
+    //Then queue this buffer(buf.index) by QBUF
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_DMABUF;
+    buf.m.fd = import_buf_fd[index];
+    buf.index = index;
+
+    ret = ioctl(dev_fd, VIDIOC_QBUF, &buf);
+    CHECK_V4L2ERROR(ret, "VIDIOC_QBUF");
+  }
+}
+
+static void stop_capturing(void)
+{
+  int ret;
+  int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+
+  ret = ioctl(dev_fd, VIDIOC_STREAMOFF, &type);
+  CHECK_V4L2ERROR(ret, "VIDIOC_STREAMOFF");
+}
+
+static void uninit_device(void){
+  free(import_buf_fd);
+  free(import_buf);
+  int ret = close(dev_fd);
+  if (ret) {
+    fprintf(stderr, "Failed to close %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+}
+
+int main(int argc, char *argv[])
+{
+  analyse_args(argc, argv);
+
+  init_device();
+  initialize_va_ocl();
+  init_dmabuf();
+
+  start_capturing();
+  mainloop();
+
+  stop_capturing();
+  release_va_ocl();
+  uninit_device();
+
+  return 0;
+}
diff --git a/kernels/compiler_argument_structure_indirect.cl b/kernels/compiler_argument_structure_indirect.cl
index c4b062f..6fd7873 100644
--- a/kernels/compiler_argument_structure_indirect.cl
+++ b/kernels/compiler_argument_structure_indirect.cl
@@ -1,7 +1,7 @@
-struct hop { int x[16]; };
+struct hop { int a, x[16]; };
 
 __kernel void
-compiler_argument_structure(__global int *dst, struct hop h)
+compiler_argument_structure_indirect(__global int *dst, struct hop h)
 {
   int id = (int)get_global_id(0);
   dst[id] = h.x[get_local_id(0)];
diff --git a/kernels/compiler_argument_structure_select.cl b/kernels/compiler_argument_structure_select.cl
new file mode 100644
index 0000000..295acf4
--- /dev/null
+++ b/kernels/compiler_argument_structure_select.cl
@@ -0,0 +1,18 @@
+typedef struct {
+  int  offset;
+  int  threshold0;
+  int  threshold1;
+}hop;
+
+__kernel void compiler_argument_structure_select(__global int *dst, hop h)
+{
+  int i = get_global_id (0);
+  int threshold=0;
+  if (i == 0)  {
+    threshold = h.threshold0;
+  } else {
+    threshold = h.threshold1;
+  }
+  dst[i] = threshold;
+}
+
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
index dddde44..4beb436 100644
--- a/kernels/compiler_async_copy.cl
+++ b/kernels/compiler_async_copy.cl
@@ -5,10 +5,10 @@ compiler_async_copy_##TYPE(__global TYPE *dst, __global TYPE *src, __local TYPE
   event_t event; \
   int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0); \
   int i; \
-  event = async_work_group_copy((__local TYPE*)localBuffer, (__global const TYPE*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 ); \
+  event = async_work_group_copy((__local TYPE*)localBuffer, (__global const TYPE*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, 0 ); \
   wait_group_events( 1, &event ); \
 \
-  event = async_work_group_copy((__global TYPE*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const TYPE*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 ); \
+  event = async_work_group_copy((__global TYPE*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const TYPE*)localBuffer, (size_t)copiesPerWorkgroup, 0 ); \
   wait_group_events( 1, &event ); \
 }
 
diff --git a/kernels/compiler_async_stride_copy.cl b/kernels/compiler_async_stride_copy.cl
index a926588..5dbb559 100644
--- a/kernels/compiler_async_stride_copy.cl
+++ b/kernels/compiler_async_stride_copy.cl
@@ -4,13 +4,13 @@ compiler_async_stride_copy(__global char4 *dst, __global char4 *src, __local cha
   event_t event;
   int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
   int i;
-  event = async_work_group_strided_copy( (__local char4*)localBuffer, (__global const char4*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+  event = async_work_group_strided_copy( (__local char4*)localBuffer, (__global const char4*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, 0 );
   wait_group_events( 1, &event );
 
   for(i=0; i<copiesPerWorkItem; i++)
     localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (char4)(3);
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  event = async_work_group_strided_copy((__global char4*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const char4*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+  event = async_work_group_strided_copy((__global char4*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const char4*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, 0 );
   wait_group_events( 1, &event );
 }
diff --git a/kernels/compiler_bswap.cl b/kernels/compiler_bswap.cl
index 97313b1..3a0a373 100644
--- a/kernels/compiler_bswap.cl
+++ b/kernels/compiler_bswap.cl
@@ -1,13 +1,17 @@
-#define TEST_TYPE(TYPE, LENGTH)                                       \
-kernel void compiler_bswap_##TYPE(global TYPE * src, global TYPE * dst){ \
-   dst[get_global_id(0)]= __builtin_bswap##LENGTH(src[get_global_id(0)]); \
-   dst[get_global_id(0)]= __builtin_bswap##LENGTH(dst[get_global_id(0)] -1 ); \
-}
+kernel void compiler_bswap(global uint * src0, global uint * dst0, global ushort * src1, global ushort * dst1,
+    int src2, global int * dst2,  short src3, global short * dst3) {
+  if (get_global_id(0) % 2 == 0) {
+    dst0[get_global_id(0)] = __builtin_bswap32(src0[get_global_id(0)]);
+  } else {
+    dst0[get_global_id(0)] = src0[get_global_id(0)];
+  }
 
+  dst1[get_global_id(0)] = __builtin_bswap16(src1[get_global_id(0)]);
+  if (get_global_id(0) % 2 == 1) {
+    dst1[get_global_id(0)] = __builtin_bswap16(dst1[get_global_id(0)] + 1);
+  }
 
-TEST_TYPE(short, 16)
-TEST_TYPE(ushort, 16)
-TEST_TYPE(int, 32)
-TEST_TYPE(uint, 32)
+  dst2[get_global_id(0)] = __builtin_bswap32(src2);
+  dst3[get_global_id(0)] = __builtin_bswap16(src3);
+}
 
-#undef TEST_TYPE
diff --git a/kernels/compiler_ceil32.spir b/kernels/compiler_ceil32.spir
new file mode 100644
index 0000000..ee64834
Binary files /dev/null and b/kernels/compiler_ceil32.spir differ
diff --git a/kernels/compiler_clz.cl b/kernels/compiler_clz.cl
new file mode 100644
index 0000000..4b06178
--- /dev/null
+++ b/kernels/compiler_clz.cl
@@ -0,0 +1,16 @@
+#define COMPILER_CLZ(TYPE) \
+    kernel void compiler_clz_##TYPE(global TYPE* src, global TYPE* dst)   \
+{                                                \
+  __global TYPE* A = &src[get_global_id(0)];    \
+  __global TYPE* B = &dst[get_global_id(0)];    \
+  *B =  clz(*A);   \
+}
+
+COMPILER_CLZ(ulong)
+COMPILER_CLZ(uint)
+COMPILER_CLZ(ushort)
+COMPILER_CLZ(uchar)
+COMPILER_CLZ(long)
+COMPILER_CLZ(int)
+COMPILER_CLZ(short)
+COMPILER_CLZ(char)
diff --git a/kernels/compiler_clz_int.cl b/kernels/compiler_clz_int.cl
deleted file mode 100644
index 0f17f86..0000000
--- a/kernels/compiler_clz_int.cl
+++ /dev/null
@@ -1,5 +0,0 @@
-kernel void compiler_clz_int(global int *src, global int *dst) {
-  int i = get_global_id(0);
-  dst[i] = clz(src[i]);
-}
-
diff --git a/kernels/compiler_clz_short.cl b/kernels/compiler_clz_short.cl
deleted file mode 100644
index 1ecf7a9..0000000
--- a/kernels/compiler_clz_short.cl
+++ /dev/null
@@ -1,5 +0,0 @@
-kernel void compiler_clz_short(global short *src, global short *dst) {
-  int i = get_global_id(0);
-  dst[i] = clz(src[i]);
-}
-
diff --git a/kernels/compiler_get_sub_group_id.cl b/kernels/compiler_get_sub_group_id.cl
new file mode 100644
index 0000000..10033ff
--- /dev/null
+++ b/kernels/compiler_get_sub_group_id.cl
@@ -0,0 +1,8 @@
+__kernel void compiler_get_sub_group_id(global int *dst)
+{
+  int i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_sub_group_size();
+
+  dst[i+1] = get_sub_group_id();
+}
diff --git a/kernels/compiler_get_sub_group_size.cl b/kernels/compiler_get_sub_group_size.cl
new file mode 100644
index 0000000..4d5e3eb
--- /dev/null
+++ b/kernels/compiler_get_sub_group_size.cl
@@ -0,0 +1,5 @@
+__kernel void compiler_get_sub_group_size(global int *dst)
+{
+  int i = get_global_id(0);
+  dst[i] = get_sub_group_size();
+}
diff --git a/kernels/compiler_half.cl b/kernels/compiler_half.cl
new file mode 100644
index 0000000..dc22766
--- /dev/null
+++ b/kernels/compiler_half.cl
@@ -0,0 +1,11 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_half_basic(global half *src, global half *dst) {
+  int i = get_global_id(0);
+  half hf = 2.5;
+  half val = src[i];
+  val = val + hf;
+  val = val*val;
+  val = val/(half)1.8;
+  dst[i] = val;
+}
+
diff --git a/kernels/compiler_half_convert.cl b/kernels/compiler_half_convert.cl
new file mode 100644
index 0000000..c28921e
--- /dev/null
+++ b/kernels/compiler_half_convert.cl
@@ -0,0 +1,56 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+kernel void compiler_half_to_long_sat(global half *src, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_long_sat(src[i]);
+}
+
+kernel void compiler_ulong_to_half(global ulong *src, global half *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_half(src[i]);
+}
+
+kernel void compiler_half_to_long(global half *src, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_long(src[i]);
+}
+
+kernel void compiler_int_to_half(global int *src, global half *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_half(src[i]);
+}
+
+kernel void compiler_uchar_to_half(global uchar *src, global half *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_half(src[i]);
+}
+
+kernel void compiler_half_to_uint_sat(global half *src, global uint *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_uint(src[i]);
+}
+
+kernel void compiler_half_to_ushort_sat(global half *src, global ushort *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_ushort(src[i]);
+}
+
+kernel void compiler_half_to_char_sat(global half *src, global char *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_char_sat(src[i]);
+}
+
+kernel void compiler_half2_as_int(global half2 *src, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = as_int(src[i]);
+}
+
+kernel void compiler_half_as_char2(global half *src, global char2 *dst) {
+  int i = get_global_id(0);
+  dst[i] = as_char2(src[i]);
+}
+
+kernel void compiler_half_to_float(global half4 *src, global float4 *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_float4(src[i]);
+}
diff --git a/kernels/compiler_half_math.cl b/kernels/compiler_half_math.cl
new file mode 100644
index 0000000..a11a956
--- /dev/null
+++ b/kernels/compiler_half_math.cl
@@ -0,0 +1,28 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define MATH_KERNEL_ARG1(NAME) \
+    kernel void compiler_half_math_##NAME(global half *src, global half *dst) { \
+    int i = get_global_id(0); \
+    dst[i] = NAME(src[i]); \
+  }
+
+MATH_KERNEL_ARG1(sin);
+MATH_KERNEL_ARG1(cos);
+MATH_KERNEL_ARG1(sinh);
+MATH_KERNEL_ARG1(cosh);
+MATH_KERNEL_ARG1(tan);
+MATH_KERNEL_ARG1(log10);
+MATH_KERNEL_ARG1(log);
+MATH_KERNEL_ARG1(trunc);
+MATH_KERNEL_ARG1(exp);
+MATH_KERNEL_ARG1(sqrt);
+MATH_KERNEL_ARG1(ceil);
+
+#define MATH_KERNEL_ARG2(NAME) \
+    kernel void compiler_half_math_##NAME(global half4 *src0, global half4 *src1, global half4 *dst) { \
+    int i = get_global_id(0); \
+    dst[i] = NAME(src0[i], src1[i]); \
+  }
+MATH_KERNEL_ARG2(fmod);
+MATH_KERNEL_ARG2(fmax);
+MATH_KERNEL_ARG2(fmin);
diff --git a/kernels/compiler_half_relation.cl b/kernels/compiler_half_relation.cl
new file mode 100644
index 0000000..dfb01e6
--- /dev/null
+++ b/kernels/compiler_half_relation.cl
@@ -0,0 +1,10 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+kernel void compiler_half_isnan(global half2 *src, global short2 *dst) {
+  int i = get_global_id(0);
+  dst[i] = isnan(src[i]);
+}
+
+kernel void compiler_half_isinf(global half *src, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = isinf(src[i]);
+}
diff --git a/kernels/compiler_long_div.cl b/kernels/compiler_long_div.cl
new file mode 100644
index 0000000..b55263c
--- /dev/null
+++ b/kernels/compiler_long_div.cl
@@ -0,0 +1,12 @@
+kernel void compiler_long_div(__global long *srcA, __global long *srcB, __global long *dst)
+{
+    int tid = get_global_id(0);
+    dst[tid] = srcA[tid] / srcB[tid];
+}
+
+kernel void compiler_long_rem(__global long *srcA, __global long *srcB, __global long *dst)
+{
+    int tid = get_global_id(0);
+    dst[tid] = srcA[tid] % srcB[tid];
+}
+
diff --git a/kernels/compiler_long_hi_sat.cl b/kernels/compiler_long_hi_sat.cl
new file mode 100644
index 0000000..66e4ab9
--- /dev/null
+++ b/kernels/compiler_long_hi_sat.cl
@@ -0,0 +1,19 @@
+kernel void compiler_long_mul_hi(global long *src, global long *dst, long num0, long num1) {
+    int i = get_local_id(0);
+    long c;
+
+    if (i % 2 == 0) {
+      c = mul_hi(src[i],  num0);
+    } else {
+      c = mul_hi(src[i],  num1);
+    }
+    dst[i] = c;
+}
+
+kernel void compiler_long_mul_sat(global long *src, global long *dst, long num0, long num1) {
+    int i = get_local_id(0);
+    long c;
+
+    c = mad_sat(src[i],  num0, num1);
+    dst[i] = c;
+}
diff --git a/kernels/compiler_long_not.cl b/kernels/compiler_long_not.cl
new file mode 100644
index 0000000..39ce77b
--- /dev/null
+++ b/kernels/compiler_long_not.cl
@@ -0,0 +1,6 @@
+__kernel void compiler_long_not_vec8(__global ulong8 *src, __global long8 *dst)
+{
+  int tid = get_global_id(0);
+  dst[tid] = !src[tid];
+}
+
diff --git a/kernels/compiler_simd_all.cl b/kernels/compiler_simd_all.cl
deleted file mode 100644
index 504710b..0000000
--- a/kernels/compiler_simd_all.cl
+++ /dev/null
@@ -1,12 +0,0 @@
-__kernel void compiler_simd_all(global int *src, global int *dst)
-{
-  int i = get_global_id(0);
-  if (i % 2 == 1) {
-    if (__gen_ocl_simd_all((src[i] < 12) && (src[i] > 0)))
-      dst[i] = 1;
-    else
-      dst[i] = 2;
-  }
-  else
-    dst[i] = 3;
-}
diff --git a/kernels/compiler_simd_any.cl b/kernels/compiler_simd_any.cl
deleted file mode 100644
index 3b04f82..0000000
--- a/kernels/compiler_simd_any.cl
+++ /dev/null
@@ -1,15 +0,0 @@
-__kernel void compiler_simd_any(global int *src, global int *dst)
-{
-  int i = get_global_id(0);
-
-  if (i % 2 == 1) {
-    if (__gen_ocl_simd_any(src[i] == 5) || __gen_ocl_simd_any(src[i] == 9))
-      dst[i] = 1;
-    else if (__gen_ocl_simd_any(src[i] == 6))
-      dst[i] = 0;
-    else
-      dst[i] = 2;
-  }
-  else
-    dst[i] = 3;
-}
diff --git a/kernels/compiler_sub_group_all.cl b/kernels/compiler_sub_group_all.cl
new file mode 100644
index 0000000..30db5bc
--- /dev/null
+++ b/kernels/compiler_sub_group_all.cl
@@ -0,0 +1,12 @@
+__kernel void compiler_sub_group_all(global int *src, global int *dst)
+{
+  int i = get_global_id(0);
+  if (i % 2 == 1) {
+    if (sub_group_all((src[i] < 12) && (src[i] > 0)))
+      dst[i] = 1;
+    else
+      dst[i] = 2;
+  }
+  else
+    dst[i] = 3;
+}
diff --git a/kernels/compiler_sub_group_any.cl b/kernels/compiler_sub_group_any.cl
new file mode 100644
index 0000000..15702db
--- /dev/null
+++ b/kernels/compiler_sub_group_any.cl
@@ -0,0 +1,15 @@
+__kernel void compiler_sub_group_any(global int *src, global int *dst)
+{
+  int i = get_global_id(0);
+
+  if (i % 2 == 1) {
+    if (sub_group_any(src[i] == 5) || sub_group_any(src[i] == 9))
+      dst[i] = 1;
+    else if (sub_group_any(src[i] == 6))
+      dst[i] = 0;
+    else
+      dst[i] = 2;
+  }
+  else
+    dst[i] = 3;
+}
diff --git a/kernels/compiler_sub_group_shuffle.cl b/kernels/compiler_sub_group_shuffle.cl
new file mode 100644
index 0000000..75adde3
--- /dev/null
+++ b/kernels/compiler_sub_group_shuffle.cl
@@ -0,0 +1,18 @@
+__kernel void compiler_sub_group_shuffle(global int *dst, int c)
+{
+  int i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_sub_group_size();
+  dst++;
+
+  int from = i;
+  int j = get_sub_group_size() - get_sub_group_id() - 1;
+  int o0 = get_sub_group_id();
+  int o1 = intel_sub_group_shuffle(from, c);
+  int o2 = intel_sub_group_shuffle(from, 5);
+  int o3 = intel_sub_group_shuffle(from, j);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
diff --git a/kernels/runtime_yuy2_processing.cl b/kernels/runtime_yuy2_processing.cl
new file mode 100644
index 0000000..1478e65
--- /dev/null
+++ b/kernels/runtime_yuy2_processing.cl
@@ -0,0 +1,15 @@
+__kernel void
+runtime_yuy2_processing(__global uchar *src,
+                        int image_height,
+                        int image_pitch)
+{
+  int gx = get_global_id(0);
+  int gy = get_global_id(1);
+
+  int src_y = image_height / 2 + gy;
+  int mirror_y = image_height - src_y;
+
+  uchar4 mirror_val = *(__global uchar4*)(src + mirror_y*image_pitch + gx*4);
+  *(__global uchar4*)(src + src_y*image_pitch + gx*4) = mirror_val;
+
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d4181d8..40a9afb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,7 +3,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}
                     ${DRM_INCLUDE_DIRS}/../
                     ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include
-                    ${MESA_SOURCE_INCLUDES})
+                    ${MESA_SOURCE_INCLUDES}
+                    ${LLVM_INCLUDE_DIR})
 
 macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
 foreach (KF ${KERNEL_FILES})
@@ -15,13 +16,13 @@ foreach (KF ${KERNEL_FILES})
     add_custom_command(
       OUTPUT ${output_file}
       COMMAND rm -rf ${output_file}
-      COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} -t${GEN_PCI_ID}
+      COMMAND ${GBE_BIN_GENERATER} -s -o${output_file} -t${GEN_PCI_ID} ${input_file}
       DEPENDS ${input_file} ${GBE_BIN_FILE})
   else(GEN_PCI_ID)
     add_custom_command(
       OUTPUT ${output_file}
       COMMAND rm -rf ${output_file}
-      COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file}
+      COMMAND ${GBE_BIN_GENERATER} -s -o${output_file} ${input_file}
       DEPENDS ${input_file} ${GBE_BIN_FILE})
   endif(GEN_PCI_ID)
 endforeach (KF)
@@ -50,7 +51,7 @@ cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_arra
 cl_internal_copy_image_2d_array_to_2d_array cl_internal_copy_image_2d_array_to_2d
 cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array
 cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
-cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
+cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_2d_align16 cl_internal_copy_buffer_to_image_3d
 cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d
@@ -118,6 +119,16 @@ SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DHAS_USERPTR ${CMAKE_C_FLAGS}")
 endif (DRM_INTEL_USERPTR)
 
+if (DRM_INTEL_EU_TOTAL)
+SET(CMAKE_CXX_FLAGS "-DHAS_EU_TOTAL ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_EU_TOTAL ${CMAKE_C_FLAGS}")
+endif (DRM_INTEL_EU_TOTAL)
+
+if (DRM_INTEL_SUBSLICE_TOTAL)
+SET(CMAKE_CXX_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_C_FLAGS}")
+endif (DRM_INTEL_SUBSLICE_TOTAL)
+
 set(GIT_SHA1 "git_sha1.h")
 add_custom_target(${GIT_SHA1} ALL
   COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh
diff --git a/src/cl_api.c b/src/cl_api.c
index 972c687..69eb0bc 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -85,7 +85,7 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
       cl_event_new_enqueue_callback(e, data, num, wait_list);
     }
   }
-  queue->current_event = e;
+  set_current_event(queue, e);
   return status;
 }
 
@@ -195,7 +195,7 @@ clGetPlatformInfo(cl_platform_id    platform,
                   size_t *          param_value_size_ret)
 {
   /* Only one platform. This is easy */
-  if (UNLIKELY(platform != NULL && platform != intel_platform))
+  if (UNLIKELY(platform != NULL && platform != cl_get_platform_default()))
     return CL_INVALID_PLATFORM;
 
   return cl_get_platform_info(platform,
@@ -217,7 +217,7 @@ clGetDeviceIDs(cl_platform_id platform,
   /* Check parameter consistency */
   if (UNLIKELY(devices == NULL && num_devices == NULL))
     return CL_INVALID_VALUE;
-  if (UNLIKELY(platform && platform != intel_platform))
+  if (UNLIKELY(platform && platform != cl_get_platform_default()))
     return CL_INVALID_PLATFORM;
   if (UNLIKELY(devices && num_entries == 0))
     return CL_INVALID_VALUE;
@@ -941,6 +941,7 @@ clBuildProgram(cl_program            program,
   /* TODO support create program from binary */
   assert(program->source_type == FROM_LLVM ||
          program->source_type == FROM_SOURCE ||
+         program->source_type == FROM_LLVM_SPIR ||
          program->source_type == FROM_BINARY);
   if((err = cl_program_build(program, options)) != CL_SUCCESS) {
     goto error;
@@ -2980,6 +2981,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
   data->type = EnqueueNDRangeKernel;
   data->queue = command_queue;
 
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
     if (event && (*event)->type != CL_COMMAND_USER
@@ -3192,7 +3194,7 @@ void*
 clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
                               const char *func_name)
 {
-  if (UNLIKELY(platform != NULL && platform != intel_platform))
+  if (UNLIKELY(platform != NULL && platform != cl_get_platform_default()))
     return NULL;
   return internal_clGetExtensionFunctionAddress(func_name);
 }
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 5543976..4e4ebfb 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -78,8 +78,9 @@ cl_command_queue_delete(cl_command_queue queue)
 
   // If there is a valid last event, we need to give it a chance to
   // call the call-back function.
-  if (queue->last_event && queue->last_event->user_cb)
-    cl_event_update_status(queue->last_event, 1);
+  cl_event last_event = get_last_event(queue);
+  if (last_event && last_event->user_cb)
+    cl_event_update_status(last_event, 1);
   /* Remove it from the list */
   assert(queue->ctx);
   pthread_mutex_lock(&queue->ctx->queue_lock);
@@ -140,16 +141,16 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
     cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
-                        image->intel_fmt, image->image_type,
+                        image->intel_fmt, image->image_type, image->bpp,
                         image->w, image->h, image->depth,
-                        image->row_pitch, (cl_gpgpu_tiling)image->tiling);
+                        image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
     // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
     // on demand.
     if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
       cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset,
-                          image->intel_fmt, image->image_type,
+                          image->intel_fmt, image->image_type, image->bpp,
                           image->w, image->h, image->depth,
-                          image->row_pitch, (cl_gpgpu_tiling)image->tiling);
+                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
   }
   return CL_SUCCESS;
 }
@@ -207,7 +208,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
   /* Check that the user did not forget any argument */
   TRY (cl_kernel_check_args, k);
 
-  if (ver == 7 || ver == 75 || ver == 8)
+  if (ver == 7 || ver == 75 || ver == 8 || ver == 9)
     TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
   else
     FATAL ("Unknown Gen Device");
@@ -259,10 +260,14 @@ cl_command_queue_flush(cl_command_queue queue)
   // be released at the call back function, no other function will access
   // the event any more. If we don't do this here, we will leak that event
   // and all the corresponding buffers which is really bad.
-  if (queue->last_event && queue->last_event->user_cb)
-    cl_event_update_status(queue->last_event, 1);
-  if (queue->current_event && err == CL_SUCCESS)
-    err = cl_event_flush(queue->current_event);
+  cl_event last_event = get_last_event(queue);
+  if (last_event && last_event->user_cb)
+    cl_event_update_status(last_event, 1);
+  cl_event current_event = get_current_event(queue);
+  if (current_event && err == CL_SUCCESS) {
+    err = cl_event_flush(current_event);
+    set_current_event(queue, NULL);
+  }
   cl_invalid_thread_gpgpu(queue);
   return err;
 }
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 91c941c..2cd6739 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -40,8 +40,6 @@ struct _cl_command_queue {
   cl_event* wait_events;               /* Point to array of non-complete user events that block this command queue */
   cl_int    wait_events_num;           /* Number of Non-complete user events */
   cl_int    wait_events_size;          /* The size of array that wait_events point to */
-  cl_event  last_event;                /* The last event in the queue, for enqueue mark used */
-  cl_event  current_event;             /* Current event. */
   cl_command_queue_properties  props;  /* Queue properties */
   cl_command_queue prev, next;         /* We chain the command queues together */
   void *thread_data;                   /* Used to store thread context data */
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 4adbd2b..89f39b3 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -210,6 +210,14 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
 #undef UPLOAD
 
+  /* get_sub_group_id needs it */
+  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LANE_ID, 0)) >= 0) {
+    const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
+    uint32_t *laneid = (uint32_t *) (ker->curbe + offset);
+    int32_t i;
+    for (i = 0; i < (int32_t) simd_sz; ++i) laneid[i] = i;
+  }
+
   /* Write identity for the stack pointer. This is required by the stack pointer
    * computation in the kernel
    */
diff --git a/src/cl_context.c b/src/cl_context.c
index 0f08e6a..c45e0aa 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -68,7 +68,7 @@ cl_context_properties_process(const cl_context_properties *prop,
     case CL_CONTEXT_PLATFORM:
       CHECK (set_cl_context_platform);
       cl_props->platform_id = *(prop + 1);
-      if (UNLIKELY((cl_platform_id) cl_props->platform_id != intel_platform)) {
+      if (UNLIKELY((cl_platform_id) cl_props->platform_id != cl_get_platform_default())) {
         err = CL_INVALID_PLATFORM;
         goto error;
       }
@@ -149,6 +149,7 @@ cl_create_context(const cl_context_properties *  properties,
   /* Save the user callback and user data*/
   ctx->pfn_notify = pfn_notify;
   ctx->user_data = user_data;
+  cl_driver_set_atomic_flag(ctx->drv, ctx->device->atomic_test_result);
 
 exit:
   if (errcode_ret != NULL)
@@ -198,16 +199,16 @@ cl_context_delete(cl_context ctx)
 
   /* delete the internal programs. */
   for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
-    if (ctx->internel_kernels[i]) {
-      cl_kernel_delete(ctx->internel_kernels[i]);
-      ctx->internel_kernels[i] = NULL;
+    if (ctx->internal_kernels[i]) {
+      cl_kernel_delete(ctx->internal_kernels[i]);
+      ctx->internal_kernels[i] = NULL;
 
       assert(ctx->internal_prgs[i]);
       cl_program_delete(ctx->internal_prgs[i]);
       ctx->internal_prgs[i] = NULL;
     }
 
-    if (ctx->internel_kernels[i]) {
+    if (ctx->internal_kernels[i]) {
       cl_kernel_delete(ctx->built_in_kernels[i]);
       ctx->built_in_kernels[i] = NULL;
     }
@@ -268,72 +269,26 @@ cl_context_get_bufmgr(cl_context ctx)
 }
 
 cl_kernel
-cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kernel, const char * str_option)
-{
-  cl_int ret;
-  if (!ctx->internal_prgs[index]) {
-    size_t length = strlen(str_kernel) + 1;
-    ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
-
-    if (!ctx->internal_prgs[index])
-      return NULL;
-
-    ret = cl_program_build(ctx->internal_prgs[index], str_option);
-    if (ret != CL_SUCCESS)
-      return NULL;
-
-    ctx->internal_prgs[index]->is_built = 1;
-
-    /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
-    if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
-      int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
-      for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
-        if (index != i) {
-          assert(ctx->internal_prgs[i] == NULL);
-          assert(ctx->internel_kernels[i] == NULL);
-          cl_program_add_ref(ctx->internal_prgs[index]);
-          ctx->internal_prgs[i] = ctx->internal_prgs[index];
-        }
-
-        if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
-          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
-                                                              "__cl_fill_region_align8_2", NULL);
-        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
-          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
-                                                              "__cl_fill_region_align8_4", NULL);
-        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
-          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
-                                                              "__cl_fill_region_align8_8", NULL);
-        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
-          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
-                                                              "__cl_fill_region_align8_16", NULL);
-        } else
-          assert(0);
-      }
-    } else {
-      ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
-    }
-  }
-
-  return ctx->internel_kernels[index];
-}
-
-cl_kernel
 cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
                   const char * str_kernel, size_t size, const char * str_option)
 {
   cl_int ret;
   cl_int binary_status = CL_SUCCESS;
-  if (!ctx->internal_prgs[index]) {
+  cl_kernel ker;
+  pthread_mutex_lock(&ctx->program_lock);
+  if (ctx->internal_prgs[index] == NULL) {
     ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
       &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
 
-    if (!ctx->internal_prgs[index])
-      return NULL;
-
+    if (!ctx->internal_prgs[index]) {
+      ker = NULL;
+      goto unlock;
+    }
     ret = cl_program_build(ctx->internal_prgs[index], str_option);
-    if (ret != CL_SUCCESS)
-      return NULL;
+    if (ret != CL_SUCCESS) {
+      ker = NULL;
+      goto unlock;
+    }
 
     ctx->internal_prgs[index]->is_built = 1;
 
@@ -343,30 +298,33 @@ cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
       for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
         if (index != i) {
           assert(ctx->internal_prgs[i] == NULL);
-          assert(ctx->internel_kernels[i] == NULL);
+          assert(ctx->internal_kernels[i] == NULL);
           cl_program_add_ref(ctx->internal_prgs[index]);
           ctx->internal_prgs[i] = ctx->internal_prgs[index];
         }
 
         if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
-          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
                                                               "__cl_fill_region_align8_2", NULL);
         } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
-          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
                                                               "__cl_fill_region_align8_4", NULL);
         } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
-          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
                                                               "__cl_fill_region_align8_8", NULL);
         } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
-          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
                                                               "__cl_fill_region_align8_16", NULL);
         } else
           assert(0);
       }
     } else {
-      ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+      ctx->internal_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
     }
   }
+  ker = ctx->internal_kernels[index];
 
-  return ctx->internel_kernels[index];
+unlock:
+  pthread_mutex_unlock(&ctx->program_lock);
+  return cl_kernel_dup(ker);
 }
diff --git a/src/cl_context.h b/src/cl_context.h
index fdbfd2a..ef94823 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -63,6 +63,7 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
   CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
   CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
   CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
@@ -114,7 +115,7 @@ struct _cl_context {
   pthread_mutex_t event_lock;       /* To allocate and deallocate events */
   cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
                                     /* All programs internal used, for example clEnqueuexxx api use */
-  cl_kernel  internel_kernels[CL_INTERNAL_KERNEL_MAX];
+  cl_kernel  internal_kernels[CL_INTERNAL_KERNEL_MAX];
                                     /* All kernels  for clenqueuexxx api, for example clEnqueuexxx api use */
   cl_program built_in_prgs;  /*all built-in kernels belongs to this program only*/
   cl_kernel  built_in_kernels[CL_INTERNAL_KERNEL_MAX];
@@ -163,9 +164,6 @@ extern cl_int cl_context_ND_kernel(cl_context,
 /* Used for allocation */
 extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
 
-/* Get the internal used kernel */
-extern cl_kernel cl_context_get_static_kernel(cl_context ctx, cl_int index, const char *str_kernel, const char * str_option);
-
 /* Get the internal used kernel from binary*/
 extern cl_kernel cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
                   const char * str_kernel, size_t size, const char * str_option);
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index 0d25ca4..b7b64c0 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -228,7 +228,63 @@
    devid == PCI_CHIP_BROADWLL_U_GT3)
 
 #define IS_BROADWELL(devid) (IS_BRW_GT1(devid) || IS_BRW_GT2(devid) || IS_BRW_GT3(devid))
-#define IS_GEN8(devid)      IS_BROADWELL(devid)
+
+#define PCI_CHIP_CHV_0 0x22B0
+#define PCI_CHIP_CHV_1 0x22B1
+#define PCI_CHIP_CHV_2 0x22B2
+#define PCI_CHIP_CHV_3 0x22B3
+#define IS_CHERRYVIEW(devid) \
+  (devid == PCI_CHIP_CHV_0 ||   \
+   devid == PCI_CHIP_CHV_1 || \
+   devid == PCI_CHIP_CHV_2 || \
+   devid == PCI_CHIP_CHV_3)
+
+#define IS_GEN8(devid)      (IS_BROADWELL(devid) || IS_CHERRYVIEW(devid))
+
+/* SKL */
+#define PCI_CHIP_SKYLAKE_ULT_GT1	0x1906   /* Intel(R) Skylake ULT - GT1 */
+#define PCI_CHIP_SKYLAKE_ULT_GT2	0x1916   /* Intel(R) Skylake ULT - GT2 */
+#define PCI_CHIP_SKYLAKE_ULT_GT3	0x1926   /* Intel(R) Skylake ULT - GT3 */
+#define PCI_CHIP_SKYLAKE_ULT_GT2F	0x1921   /* Intel(R) Skylake ULT - GT2F */
+#define PCI_CHIP_SKYLAKE_ULX_GT1	0x190E   /* Intel(R) Skylake ULX - GT1 */
+#define PCI_CHIP_SKYLAKE_ULX_GT2	0x191E   /* Intel(R) Skylake ULX - GT2 */
+#define PCI_CHIP_SKYLAKE_DT_GT1		0x1902   /* Intel(R) Skylake Desktop - GT1 */
+#define PCI_CHIP_SKYLAKE_DT_GT2		0x1912   /* Intel(R) Skylake Desktop - GT2 */
+#define PCI_CHIP_SKYLAKE_HALO_GT1 	0x190B   /* Intel(R) Skylake HALO - GT1 */
+#define PCI_CHIP_SKYLAKE_HALO_GT2	0x191B   /* Intel(R) Skylake HALO - GT2 */
+#define PCI_CHIP_SKYLAKE_HALO_GT3	0x192B   /* Intel(R) Skylake HALO - GT3 */
+#define PCI_CHIP_SKYLAKE_HALO_GT4	0x193B   /* Intel(R) Skylake HALO - GT4 */
+#define PCI_CHIP_SKYLAKE_SRV_GT1	0x190A   /* Intel(R) Skylake Server - GT1 */
+#define PCI_CHIP_SKYLAKE_SRV_GT2	0x191A   /* Intel(R) Skylake Server - GT2 */
+#define PCI_CHIP_SKYLAKE_SRV_GT3	0x192A   /* Intel(R) Skylake Server - GT3 */
+#define PCI_CHIP_SKYLAKE_SRV_GT4	0x193A   /* Intel(R) Skylake Server - GT4 */
+
+#define IS_SKL_GT1(devid)               \
+  (devid == PCI_CHIP_SKYLAKE_ULT_GT1 ||   \
+   devid == PCI_CHIP_SKYLAKE_ULX_GT1 || \
+   devid == PCI_CHIP_SKYLAKE_DT_GT1 || \
+   devid == PCI_CHIP_SKYLAKE_HALO_GT1 || \
+   devid == PCI_CHIP_SKYLAKE_SRV_GT1)
+
+#define IS_SKL_GT2(devid)               \
+  (devid == PCI_CHIP_SKYLAKE_ULT_GT2 ||   \
+   devid == PCI_CHIP_SKYLAKE_ULT_GT2F ||   \
+   devid == PCI_CHIP_SKYLAKE_ULX_GT2 || \
+   devid == PCI_CHIP_SKYLAKE_DT_GT2 || \
+   devid == PCI_CHIP_SKYLAKE_HALO_GT2 || \
+   devid == PCI_CHIP_SKYLAKE_SRV_GT2)
+
+#define IS_SKL_GT3(devid)               \
+  (devid == PCI_CHIP_SKYLAKE_ULT_GT3 ||   \
+   devid == PCI_CHIP_SKYLAKE_HALO_GT3 || \
+   devid == PCI_CHIP_SKYLAKE_SRV_GT3)
+
+#define IS_SKL_GT4(devid)               \
+  (devid == PCI_CHIP_SKYLAKE_HALO_GT4 || \
+   devid == PCI_CHIP_SKYLAKE_SRV_GT4)
+
+#define IS_SKYLAKE(devid) (IS_SKL_GT1(devid) || IS_SKL_GT2(devid) || IS_SKL_GT3(devid) || IS_SKL_GT4(devid))
+#define IS_GEN9(devid)      IS_SKYLAKE(devid)
 
 #endif /* __CL_DEVICE_DATA_H__ */
 
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 0fd4a69..e9e2c16 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -26,6 +26,7 @@
 #include "cl_khr_icd.h"
 #include "cl_thread.h"
 #include "CL/cl.h"
+#include "CL/cl_ext.h"
 #include "cl_gbe_loader.h"
 #include "cl_alloc.h"
 
@@ -140,6 +141,62 @@ static struct _cl_device_id intel_brw_gt3_device = {
 #include "cl_gen75_device.h"
 };
 
+//Cherryview devices share the same PCI id; the max_compute_unit and max_thread_per_unit must be read from drm
+static struct _cl_device_id intel_chv_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 8,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 2,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+/* XXX we clone brw now */
+static struct _cl_device_id intel_skl_gt1_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 6,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 2,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_skl_gt2_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 24,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 3,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_skl_gt3_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 48,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 6,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_skl_gt4_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 72,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 9,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
 
 LOCAL cl_device_id
 cl_get_gt_device(void)
@@ -294,8 +351,9 @@ cl_get_gt_device(void)
       DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 reserved");
 has_break:
       device->vendor_id = device_id;
-      device->platform = intel_platform;
+      device->platform = cl_get_platform_default();
       ret = device;
+      cl_intel_platform_get_default_extension(ret);
       break;
 
     case PCI_CHIP_IVYBRIDGE_GT1:
@@ -306,8 +364,9 @@ has_break:
       DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge S GT1");
 ivb_gt1_break:
       intel_ivb_gt1_device.vendor_id = device_id;
-      intel_ivb_gt1_device.platform = intel_platform;
+      intel_ivb_gt1_device.platform = cl_get_platform_default();
       ret = &intel_ivb_gt1_device;
+      cl_intel_platform_get_default_extension(ret);
       break;
 
     case PCI_CHIP_IVYBRIDGE_GT2:
@@ -318,16 +377,18 @@ ivb_gt1_break:
       DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge S GT2");
 ivb_gt2_break:
       intel_ivb_gt2_device.vendor_id = device_id;
-      intel_ivb_gt2_device.platform = intel_platform;
+      intel_ivb_gt2_device.platform = cl_get_platform_default();
       ret = &intel_ivb_gt2_device;
+      cl_intel_platform_get_default_extension(ret);
       break;
 
     case PCI_CHIP_BAYTRAIL_T:
       DECL_INFO_STRING(baytrail_t_device_break, intel_baytrail_t_device, name, "Intel(R) HD Graphics Bay Trail-T");
 baytrail_t_device_break:
       intel_baytrail_t_device.vendor_id = device_id;
-      intel_baytrail_t_device.platform = intel_platform;
+      intel_baytrail_t_device.platform = cl_get_platform_default();
       ret = &intel_baytrail_t_device;
+      cl_intel_platform_get_default_extension(ret);
       break;
 
     case PCI_CHIP_BROADWLL_M_GT1:
@@ -341,9 +402,11 @@ baytrail_t_device_break:
     case PCI_CHIP_BROADWLL_U_GT1:
       DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell ULX GT1");
 brw_gt1_break:
+      /* For Gen8 and later, half float is supported and we will enable cl_khr_fp16. */
       intel_brw_gt1_device.vendor_id = device_id;
-      intel_brw_gt1_device.platform = intel_platform;
+      intel_brw_gt1_device.platform = cl_get_platform_default();
       ret = &intel_brw_gt1_device;
+      cl_intel_platform_enable_fp16_extension(ret);
       break;
 
     case PCI_CHIP_BROADWLL_M_GT2:
@@ -358,24 +421,99 @@ brw_gt1_break:
       DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics BroadWell ULX GT2");
 brw_gt2_break:
       intel_brw_gt2_device.vendor_id = device_id;
-      intel_brw_gt2_device.platform = intel_platform;
+      intel_brw_gt2_device.platform = cl_get_platform_default();
       ret = &intel_brw_gt2_device;
+      cl_intel_platform_enable_fp16_extension(ret);
       break;
 
     case PCI_CHIP_BROADWLL_M_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Mobile GT2");
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Mobile GT3");
     case PCI_CHIP_BROADWLL_D_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell U-Processor GT2");
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell U-Processor GT3");
     case PCI_CHIP_BROADWLL_S_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Server GT2");
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Server GT3");
     case PCI_CHIP_BROADWLL_W_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Workstation GT2");
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Workstation GT3");
     case PCI_CHIP_BROADWLL_U_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell ULX GT2");
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell ULX GT3");
 brw_gt3_break:
       intel_brw_gt3_device.vendor_id = device_id;
-      intel_brw_gt3_device.platform = intel_platform;
+      intel_brw_gt3_device.platform = cl_get_platform_default();
       ret = &intel_brw_gt3_device;
+      cl_intel_platform_enable_fp16_extension(ret);
+      break;
+
+    case PCI_CHIP_CHV_0:
+    case PCI_CHIP_CHV_1:
+    case PCI_CHIP_CHV_2:
+    case PCI_CHIP_CHV_3:
+      DECL_INFO_STRING(chv_break, intel_chv_device, name, "Intel(R) HD Graphics Cherryview");
+chv_break:
+      intel_chv_device.vendor_id = device_id;
+      intel_chv_device.platform = cl_get_platform_default();
+      ret = &intel_chv_device;
+      cl_intel_platform_enable_fp16_extension(ret);
+      break;
+
+
+    case PCI_CHIP_SKYLAKE_ULT_GT1:
+      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake ULT GT1");
+    case PCI_CHIP_SKYLAKE_ULX_GT1:
+      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake ULX GT1");
+    case PCI_CHIP_SKYLAKE_DT_GT1:
+      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake Desktop GT1");
+    case PCI_CHIP_SKYLAKE_HALO_GT1:
+      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake Halo GT1");
+    case PCI_CHIP_SKYLAKE_SRV_GT1:
+      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake Server GT1");
+skl_gt1_break:
+      intel_skl_gt1_device.vendor_id = device_id;
+      intel_skl_gt1_device.platform = cl_get_platform_default();
+      ret = &intel_skl_gt1_device;
+      cl_intel_platform_enable_fp16_extension(ret);
+      break;
+
+    case PCI_CHIP_SKYLAKE_ULT_GT2:
+      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake ULT GT2");
+    case PCI_CHIP_SKYLAKE_ULT_GT2F:
+      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake ULT GT2F");
+    case PCI_CHIP_SKYLAKE_ULX_GT2:
+      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake ULX GT2");
+    case PCI_CHIP_SKYLAKE_DT_GT2:
+      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Desktop GT2");
+    case PCI_CHIP_SKYLAKE_HALO_GT2:
+      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Halo GT2");
+    case PCI_CHIP_SKYLAKE_SRV_GT2:
+      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Server GT2");
+skl_gt2_break:
+      intel_skl_gt2_device.vendor_id = device_id;
+      intel_skl_gt2_device.platform = cl_get_platform_default();
+      ret = &intel_skl_gt2_device;
+      cl_intel_platform_enable_fp16_extension(ret);
+      break;
+
+    case PCI_CHIP_SKYLAKE_ULT_GT3:
+      DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake ULT GT3");
+    case PCI_CHIP_SKYLAKE_HALO_GT3:
+      DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Halo GT3");
+    case PCI_CHIP_SKYLAKE_SRV_GT3:
+      DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Server GT3");
+skl_gt3_break:
+      intel_skl_gt3_device.vendor_id = device_id;
+      intel_skl_gt3_device.platform = cl_get_platform_default();
+      ret = &intel_skl_gt3_device;
+      cl_intel_platform_enable_fp16_extension(ret);
+      break;
+
+    case PCI_CHIP_SKYLAKE_HALO_GT4:
+      DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Halo GT4");
+    case PCI_CHIP_SKYLAKE_SRV_GT4:
+      DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Server GT4");
+skl_gt4_break:
+      intel_skl_gt4_device.vendor_id = device_id;
+      intel_skl_gt4_device.platform = cl_get_platform_default();
+      ret = &intel_skl_gt4_device;
+      cl_intel_platform_enable_fp16_extension(ret);
       break;
 
     case PCI_CHIP_SANDYBRIDGE_BRIDGE:
@@ -405,24 +543,8 @@ brw_gt3_break:
     ret->profile_sz = strlen(ret->profile) + 1;
   }
 
-#ifdef HAS_USERPTR
-  cl_driver dummy = cl_driver_new(NULL);
-  cl_buffer_mgr bufmgr = cl_driver_get_bufmgr(dummy);
-
-  const size_t sz = 4096;
-  void* host_ptr = cl_aligned_malloc(sz, 4096);;
-  if (host_ptr != NULL) {
-    cl_buffer bo = cl_buffer_alloc_userptr(bufmgr, "CL memory object", host_ptr, sz, 0);
-    if (bo == NULL)
-      ret->host_unified_memory = CL_FALSE;
-    else
-      cl_buffer_unreference(bo);
-    cl_free(host_ptr);
-  }
-  else
-    ret->host_unified_memory = CL_FALSE;
-  cl_driver_delete(dummy);
-#endif
+  /* Apply any driver-dependent updates to the device info */
+  cl_driver_update_device_info(ret);
 
   struct sysinfo info;
   if (sysinfo(&info) == 0) {
@@ -436,6 +558,83 @@ brw_gt3_break:
   return ret;
 }
 
+/* Runs a small kernel to check that the device works; returns
+ * SELF_TEST_PASS: success.
+ * SELF_TEST_SLM_FAIL: SLM results mismatch;
+ * SELF_TEST_ATOMIC_FAIL: HSW kernel enqueue failed because atomics in L3 could not be enabled.
+ * SELF_TEST_OTHER_FAIL: other failure, e.g. a runtime API call failed. */
+LOCAL cl_self_test_res
+cl_self_test(cl_device_id device, cl_self_test_res atomic_in_l3_flag)
+{
+  cl_int status;
+  cl_context ctx;
+  cl_command_queue queue;
+  cl_program program;
+  cl_kernel kernel;
+  cl_mem buffer;
+  cl_event kernel_finished;
+  size_t n = 3;
+  cl_int test_data[3] = {3, 7, 5};
+  const char* kernel_source = "__kernel void self_test(__global int *buf) {"
+  "  __local int tmp[3];"
+  "  tmp[get_local_id(0)] = buf[get_local_id(0)];"
+  "  barrier(CLK_LOCAL_MEM_FENCE);"
+  "  buf[get_global_id(0)] = tmp[2 - get_local_id(0)] + buf[get_global_id(0)];"
+  "}"; // using __local to catch the "no SLM on Haswell" problem
+  static int tested = 0;
+  static cl_self_test_res ret = SELF_TEST_OTHER_FAIL;
+  if (tested != 0)
+    return ret;
+  tested = 1;
+  ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
+  cl_driver_set_atomic_flag(ctx->drv, atomic_in_l3_flag);
+  if (status == CL_SUCCESS) {
+    queue = clCreateCommandQueue(ctx, device, 0, &status);
+    if (status == CL_SUCCESS) {
+      program = clCreateProgramWithSource(ctx, 1, &kernel_source, NULL, &status);
+      if (status == CL_SUCCESS) {
+        status = clBuildProgram(program, 1, &device, "", NULL, NULL);
+        if (status == CL_SUCCESS) {
+          kernel = clCreateKernel(program, "self_test", &status);
+          if (status == CL_SUCCESS) {
+            buffer = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n*4, test_data, &status);
+            if (status == CL_SUCCESS) {
+              status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
+              if (status == CL_SUCCESS) {
+                status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &n, &n, 0, NULL, &kernel_finished);
+                if (status == CL_SUCCESS) {
+                  status = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, n*4, test_data, 1, &kernel_finished, NULL);
+                  if (status == CL_SUCCESS) {
+                    if (test_data[0] == 8 && test_data[1] == 14 && test_data[2] == 8){
+                      ret = SELF_TEST_PASS;
+                    } else {
+                      ret = SELF_TEST_SLM_FAIL;
+                      printf("Beignet: self-test failed: (3, 7, 5) + (5, 7, 3) returned (%i, %i, %i)\n"
+                             "See README.md or http://www.freedesktop.org/wiki/Software/Beignet/\n",
+                             test_data[0], test_data[1], test_data[2]);
+
+                    }
+                  }
+                } else{
+                  ret = SELF_TEST_ATOMIC_FAIL;
+                  // An atomic failure means we must re-test SLM with the atomic-in-L3 feature disabled.
+                  tested = 0;
+                }
+              }
+            }
+            clReleaseMemObject(buffer);
+          }
+          clReleaseKernel(kernel);
+        }
+      }
+      clReleaseProgram(program);
+    }
+    clReleaseCommandQueue(queue);
+  }
+  clReleaseContext(ctx);
+  return ret;
+}
+
 LOCAL cl_int
 cl_get_device_ids(cl_platform_id    platform,
                   cl_device_type    device_type,
@@ -447,6 +646,29 @@ cl_get_device_ids(cl_platform_id    platform,
 
   /* Do we have a usable device? */
   device = cl_get_gt_device();
+  if (device) {
+    cl_self_test_res ret = cl_self_test(device, SELF_TEST_PASS);
+    if (ret == SELF_TEST_ATOMIC_FAIL) {
+      device->atomic_test_result = ret;
+      ret = cl_self_test(device, ret);
+      printf("Beignet: warning - disable atomic in L3 feature.\n");
+    }
+
+    if(ret == SELF_TEST_SLM_FAIL) {
+      int disable_self_test = 0;
+      // can't use BVAR (backend/src/sys/cvar.hpp) here as it's C++
+      const char *env = getenv("OCL_IGNORE_SELF_TEST");
+      if (env != NULL) {
+        sscanf(env, "%i", &disable_self_test);
+      }
+      if (disable_self_test) {
+        printf("Beignet: Warning - overriding self-test failure\n");
+      } else {
+        printf("Beignet: disabling non-working device\n");
+        device = 0;
+      }
+    }
+  }
   if (!device) {
     if (num_devices)
       *num_devices = 0;
@@ -458,8 +680,6 @@ cl_get_device_ids(cl_platform_id    platform,
       *num_devices = 1;
     if (devices) {
       *devices = device;
-      (*devices)->extensions = intel_platform->extensions;
-      (*devices)->extensions_sz = intel_platform->extensions_sz;
     }
     return CL_SUCCESS;
   }
@@ -504,7 +724,12 @@ cl_get_device_info(cl_device_id     device,
                device != &intel_hsw_gt3_device &&
                device != &intel_brw_gt1_device &&
                device != &intel_brw_gt2_device &&
-               device != &intel_brw_gt3_device
+               device != &intel_brw_gt3_device &&
+               device != &intel_chv_device &&
+               device != &intel_skl_gt1_device &&
+               device != &intel_skl_gt2_device &&
+               device != &intel_skl_gt3_device &&
+               device != &intel_skl_gt4_device
                ))
     return CL_INVALID_DEVICE;
 
@@ -547,6 +772,7 @@ cl_get_device_info(cl_device_id     device,
     DECL_FIELD(MEM_BASE_ADDR_ALIGN, mem_base_addr_align)
     DECL_FIELD(MIN_DATA_TYPE_ALIGN_SIZE, min_data_type_align_size)
     DECL_FIELD(SINGLE_FP_CONFIG, single_fp_config)
+    DECL_FIELD(HALF_FP_CONFIG, half_fp_config)
     DECL_FIELD(DOUBLE_FP_CONFIG, double_fp_config)
     DECL_FIELD(GLOBAL_MEM_CACHE_TYPE, global_mem_cache_type)
     DECL_FIELD(GLOBAL_MEM_CACHELINE_SIZE, global_mem_cache_line_size)
@@ -609,7 +835,12 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
                device != &intel_hsw_gt3_device &&
                device != &intel_brw_gt1_device &&
                device != &intel_brw_gt2_device &&
-               device != &intel_brw_gt3_device))
+               device != &intel_brw_gt3_device &&
+               device != &intel_chv_device &&
+               device != &intel_skl_gt1_device &&
+               device != &intel_skl_gt2_device &&
+               device != &intel_skl_gt3_device &&
+               device != &intel_skl_gt4_device))
     return CL_INVALID_DEVICE;
   if (ver == NULL)
     return CL_SUCCESS;
@@ -621,8 +852,11 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
         || device == &intel_hsw_gt3_device) {
     *ver = 75;
   } else if (device == &intel_brw_gt1_device || device == &intel_brw_gt2_device
-        || device == &intel_brw_gt3_device) {
+        || device == &intel_brw_gt3_device || device == &intel_chv_device) {
     *ver = 8;
+  } else if (device == &intel_skl_gt1_device || device == &intel_skl_gt2_device
+        || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device) {
+    *ver = 9;
   } else
     return CL_INVALID_VALUE;
 
@@ -704,7 +938,12 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
                device != &intel_hsw_gt3_device &&
                device != &intel_brw_gt1_device &&
                device != &intel_brw_gt2_device &&
-               device != &intel_brw_gt3_device))
+               device != &intel_brw_gt3_device &&
+               device != &intel_chv_device &&
+               device != &intel_skl_gt1_device &&
+               device != &intel_skl_gt2_device &&
+               device != &intel_skl_gt3_device &&
+               device != &intel_skl_gt4_device))
     return CL_INVALID_DEVICE;
 
   CHECK_KERNEL(kernel);
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index ee6a8e6..6daa31c 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -66,6 +66,7 @@ struct _cl_device_id {
   cl_uint  mem_base_addr_align;
   cl_uint  min_data_type_align_size;
   cl_device_fp_config single_fp_config;
+  cl_device_fp_config half_fp_config;
   cl_device_fp_config double_fp_config;
   cl_device_mem_cache_type global_mem_cache_type;
   cl_uint  global_mem_cache_line_size;
@@ -93,7 +94,7 @@ struct _cl_device_id {
   const char *version;
   const char *profile;
   const char *opencl_c_version;
-  const char *extensions;
+  const char extensions[256];
   const char *driver_version;
   const char *built_in_kernels;
   size_t name_sz;
@@ -113,6 +114,7 @@ struct _cl_device_id {
   cl_device_affinity_domain    affinity_domain;
   cl_device_partition_property partition_type[3];
   cl_uint      device_reference_count;
+  uint32_t atomic_test_result;
 };
 
 /* Get a device from the given platform */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index c39821e..1ab4dff 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -49,6 +49,15 @@ extern cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr;
 typedef uint32_t (cl_driver_get_ver_cb)(cl_driver);
 extern cl_driver_get_ver_cb *cl_driver_get_ver;
 
+typedef enum cl_self_test_res{
+  SELF_TEST_PASS = 0,
+  SELF_TEST_SLM_FAIL  = 1,
+  SELF_TEST_ATOMIC_FAIL = 2,
+  SELF_TEST_OTHER_FAIL = 3,
+} cl_self_test_res;
+/* Set the atomic enable/disable flag in the driver */
+typedef void (cl_driver_set_atomic_flag_cb)(cl_driver, int);
+extern cl_driver_set_atomic_flag_cb *cl_driver_set_atomic_flag;
 /**************************************************************************
  * GPGPU command streamer
  **************************************************************************/
@@ -145,11 +154,13 @@ typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
                                       cl_buffer obj_bo,
                                       uint32_t obj_bo_offset,
                                       uint32_t format,
+                                      uint32_t bpp,
                                       uint32_t type,
                                       int32_t w,
                                       int32_t h,
                                       int32_t depth,
                                       int pitch,
+                                      int32_t slice_pitch,
                                       cl_gpgpu_tiling tiling);
 
 extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
@@ -374,6 +385,10 @@ extern cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align;
 typedef int (cl_driver_get_device_id_cb)(void);
 extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
 
+/* Update the device info */
+typedef void (cl_driver_update_device_info_cb)(cl_device_id device);
+extern cl_driver_update_device_info_cb *cl_driver_update_device_info;
+
 /**************************************************************************
  * cl_khr_gl_sharing.
  **************************************************************************/
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 2b68539..b77acdc 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -25,7 +25,9 @@ LOCAL cl_driver_new_cb *cl_driver_new = NULL;
 LOCAL cl_driver_delete_cb *cl_driver_delete = NULL;
 LOCAL cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL;
 LOCAL cl_driver_get_ver_cb *cl_driver_get_ver = NULL;
+LOCAL cl_driver_set_atomic_flag_cb *cl_driver_set_atomic_flag = NULL;
 LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
+LOCAL cl_driver_update_device_info_cb *cl_driver_update_device_info = NULL;
 
 /* Buffer */
 LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
diff --git a/src/cl_event.c b/src/cl_event.c
index b4734b2..bbc1776 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -56,7 +56,7 @@ int cl_event_flush(cl_event event)
     event->gpgpu = NULL;
   }
   cl_gpgpu_event_flush(event->gpgpu_event);
-  event->queue->last_event = event;
+  set_last_event(event->queue, event);
   return err;
 }
 
@@ -117,8 +117,8 @@ void cl_event_delete(cl_event event)
   if (atomic_dec(&event->ref_n) > 1)
     return;
 
-  if(event->queue && event->queue->last_event == event)
-    event->queue->last_event = NULL;
+  if(event->queue && get_last_event(event->queue) == event)
+    set_last_event(event->queue, NULL);
 
   /* Call all user's callback if haven't execute */
   cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE status will force all callbacks that are not executed to run
@@ -221,7 +221,7 @@ cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
       err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
       goto exit;
     }
-    if(event && *event == event_wait_list[i])
+    if(event && event == &event_wait_list[i])
       goto error;
     if(event_wait_list[i]->ctx != ctx)
       goto error;
@@ -568,9 +568,9 @@ cl_int cl_event_marker_with_wait_list(cl_command_queue queue,
     return CL_SUCCESS;
   }
 
-  if(queue->last_event && queue->last_event->gpgpu_event) {
-    cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
-  }
+  cl_event last_event = get_last_event(queue);
+  if(last_event && last_event->gpgpu_event)
+    cl_gpgpu_event_update_status(last_event->gpgpu_event, 1);
 
   cl_event_set_status(e, CL_COMPLETE);
   return CL_SUCCESS;
@@ -605,9 +605,9 @@ cl_int cl_event_barrier_with_wait_list(cl_command_queue queue,
     return CL_SUCCESS;
   }
 
-  if(queue->last_event && queue->last_event->gpgpu_event) {
-    cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
-  }
+  cl_event last_event = get_last_event(queue);
+  if(last_event && last_event->gpgpu_event)
+    cl_gpgpu_event_update_status(last_event->gpgpu_event, 1);
 
   cl_event_set_status(e, CL_COMPLETE);
   return CL_SUCCESS;
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index d07a525..3eb303f 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -1,17 +1,22 @@
+#include "llvm/Config/llvm-config.h"
 #ifdef HAS_EGL
 #include "EGL/egl.h"
 #include "EGL/eglext.h"
 #endif
 
 #include "cl_platform_id.h"
+#include "cl_device_id.h"
 #include "cl_internals.h"
 #include "CL/cl.h"
 #include "cl_utils.h"
 
 #include <stdlib.h>
 #include <string.h>
+#include <assert.h>
 
-static struct cl_extensions intel_extensions =
+/* These extensions are common to all Intel GPU platforms.
+   Every device may have its own additional extensions. */
+static struct cl_extensions intel_platform_extensions =
 {
   {
 #define DECL_EXT(name) \
@@ -34,8 +39,14 @@ void check_opt1_extension(cl_extensions_t *extensions)
 {
   int id;
   for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++)
+  {
     if (id == EXT_ID(khr_icd))
       extensions->extensions[id].base.ext_enabled = 1;
+#if  LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+    if (id == EXT_ID(khr_spir))
+      extensions->extensions[id].base.ext_enabled = 1;
+#endif
+  }
 }
 
 void
@@ -62,7 +73,7 @@ process_extension_str(cl_extensions_t *extensions)
   int str_offset = 0;
   int id;
 
-  extensions->ext_str[str_max] = '\0';
+  memset(extensions->ext_str, 0, sizeof(extensions->ext_str));
 
   for(id = 0; id < cl_khr_extension_id_max; id++)
   {
@@ -84,24 +95,51 @@ process_extension_str(cl_extensions_t *extensions)
 }
 
 LOCAL void
-cl_intel_platform_extension_init(cl_platform_id intel_platform)
+cl_intel_platform_get_default_extension(cl_device_id device)
+{
+  cl_platform_id pf = device->platform;
+  memcpy((char*)device->extensions,
+       pf->internal_extensions->ext_str, sizeof(device->extensions));
+  device->extensions_sz = strlen(pf->internal_extensions->ext_str) + 1;
+}
+
+LOCAL void
+cl_intel_platform_enable_fp16_extension(cl_device_id device)
 {
-  static int initialized = 0;
+  cl_extensions_t new_ext;
+  cl_platform_id pf = device->platform;
+  int id;
+  assert(pf);
+
+  memcpy(&new_ext, pf->internal_extensions, sizeof(new_ext));
 
-  if (initialized) {
-    intel_platform->internal_extensions = &intel_extensions;
-    intel_platform->extensions = intel_extensions.ext_str;
-    return;
+  for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++) {
+    if (id == EXT_ID(khr_fp16))
+      new_ext.extensions[id].base.ext_enabled = 1;
   }
-  check_basic_extension(&intel_extensions);
-  check_opt1_extension(&intel_extensions);
-  check_gl_extension(&intel_extensions);
-  check_intel_extension(&intel_extensions);
-  process_extension_str(&intel_extensions);
 
-  intel_platform->internal_extensions = &intel_extensions;
-  intel_platform->extensions = intel_extensions.ext_str;
+  process_extension_str(&new_ext);
+
+  memcpy((char*)device->extensions, new_ext.ext_str, sizeof(device->extensions));
+  device->extensions_sz = strlen(new_ext.ext_str) + 1;
+}
+
+LOCAL void
+cl_intel_platform_extension_init(cl_platform_id intel_platform)
+{
+  static int ext_initialized = 0;
+
+  /* The extensions should only be initialized once. */
+  assert(!ext_initialized);
+  check_basic_extension(&intel_platform_extensions);
+  check_opt1_extension(&intel_platform_extensions);
+  check_gl_extension(&intel_platform_extensions);
+  check_intel_extension(&intel_platform_extensions);
+  process_extension_str(&intel_platform_extensions);
+  ext_initialized = 1;
 
-  initialized = 1;
+  intel_platform->internal_extensions = &intel_platform_extensions;
+  intel_platform->extensions = intel_platform_extensions.ext_str;
+  intel_platform->extensions_sz = strlen(intel_platform->extensions) + 1;
   return;
 }
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index e6cdce8..b4544e2 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -94,3 +94,7 @@ typedef struct cl_extensions {
 
 extern void
 cl_intel_platform_extension_init(cl_platform_id intel_platform);
+extern void
+cl_intel_platform_enable_fp16_extension(cl_device_id device);
+extern void
+cl_intel_platform_get_default_extension(cl_device_id device);
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 0950327..4b43c20 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -75,6 +75,7 @@
 .platform = NULL, /* == intel_platform (set when requested) */
 /* IEEE 754, XXX does IVB support CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT? */
 .single_fp_config = CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
+.half_fp_config = CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST ,
 .printf_buffer_size = 1 * 1024 * 1024,
 .interop_user_sync = CL_TRUE,
 
diff --git a/src/cl_image.c b/src/cl_image.c
index 9907f90..d58bdf3 100644
--- a/src/cl_image.c
+++ b/src/cl_image.c
@@ -134,6 +134,12 @@ cl_image_get_intel_format(const cl_image_format *fmt)
         case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32_UINT;
         default: return INTEL_UNSUPPORTED_FORMAT;
       };
+    case CL_RG:
+      switch (type) {
+        case CL_UNORM_INT8:     return I965_SURFACEFORMAT_R8G8_UNORM;
+        case CL_UNSIGNED_INT8:  return I965_SURFACEFORMAT_R8G8_UINT;
+        default: return INTEL_UNSUPPORTED_FORMAT;
+      };
 #if 0
     case CL_RG:
     case CL_RA:
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 31eb4c1..b5671bd 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -663,7 +663,8 @@ cl_image_tiling_t cl_get_default_tiling(cl_driver drv)
 
   if (!initialized) {
     // FIXME, need to find out the performance diff's root cause on BDW.
-    if(cl_driver_get_ver(drv) == 8)
+    // SKL's 3D Image can't use TILE_X, so use TILE_Y as default
+    if(cl_driver_get_ver(drv) == 8 || cl_driver_get_ver(drv) == 9)
       tiling = CL_TILE_Y;
     char *tilingStr = getenv("OCL_TILING");
     if (tilingStr != NULL) {
@@ -842,7 +843,8 @@ _cl_mem_new_image(cl_context ctx,
       image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
     aligned_slice_pitch = 0;
   else
-    aligned_slice_pitch = aligned_pitch * ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
+    // SKL must use the tiling mode's aligned height to compute slice_pitch; IVB through BDW use CL_NO_TILE's aligned height.
+    aligned_slice_pitch = aligned_pitch * ALIGN(h, cl_buffer_get_tiling_align(ctx, tiling, 2));
 
   cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
                     intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
@@ -1201,6 +1203,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
     cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
     ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    cl_kernel_delete(ker);
     return ret;
   }
 
@@ -1241,6 +1244,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
     cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
     ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    cl_kernel_delete(ker);
     return ret;
   }
 
@@ -1270,6 +1274,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
     cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
     ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    cl_kernel_delete(ker);
     return ret;
   }
 
@@ -1298,6 +1303,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
     cl_kernel_set_arg(ker, 9, sizeof(int), &src_less);
     ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    cl_kernel_delete(ker);
     return ret;
   }
 
@@ -1370,6 +1376,7 @@ cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image
   cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
 
   ret = cl_command_queue_ND_range(queue, ker, 3, global_off, global_sz, local_sz);
+  cl_kernel_delete(ker);
   return ret;
 }
 
@@ -1472,6 +1479,7 @@ cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
     cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
 
   ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  cl_kernel_delete(ker);
   return ret;
 }
 
@@ -1544,7 +1552,7 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
 
   ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
-
+  cl_kernel_delete(ker);
   return ret;
 }
 
@@ -1694,6 +1702,8 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
   ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
 
 fail:
+
+  cl_kernel_delete(ker);
   if (fixupDataType) {
     src_image->intel_fmt = savedIntelFmt;
     dst_image->intel_fmt = savedIntelFmt;
@@ -1795,6 +1805,7 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
 
 fail:
 
+  cl_kernel_delete(ker);
   image->intel_fmt = intel_fmt;
   image->bpp = bpp;
   image->w = w_saved;
@@ -1814,6 +1825,10 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   uint32_t intel_fmt, bpp;
   cl_image_format fmt;
   size_t origin0, region0;
+  size_t kn_src_offset;
+  int align16 = 0;
+  size_t align_size = 1;
+  size_t w_saved = 0;
 
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
@@ -1824,24 +1839,48 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(image->base.ctx == buffer->ctx);
 
-  fmt.image_channel_order = CL_R;
-  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
   intel_fmt = image->intel_fmt;
   bpp = image->bpp;
-  image->intel_fmt = cl_image_get_intel_format(&fmt);
-  image->w = image->w * image->bpp;
-  image->bpp = 1;
+  w_saved = image->w;
   region0 = region[0] * bpp;
-  origin0 = dst_origin[0] * bpp;
+  kn_src_offset = src_offset;
+  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+      ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)){
+    fmt.image_channel_order = CL_RGBA;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+    align16 = 1;
+    align_size = 16;
+  }
+  else{
+    fmt.image_channel_order = CL_R;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+    align_size = 1;
+  }
+  image->intel_fmt = cl_image_get_intel_format(&fmt);
+  image->w = (image->w * image->bpp) / align_size;
+  image->bpp = align_size;
+  region0 = (region[0] * bpp) / align_size;
+  origin0 = (dst_origin[0] * bpp) / align_size;
+  kn_src_offset /= align_size;
   global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
 
   /* setup the kernel and run. */
   if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    if(align16){
+      extern char cl_internal_copy_buffer_to_image_2d_align16_str[];
+      extern size_t cl_internal_copy_buffer_to_image_2d_align16_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
+                cl_internal_copy_buffer_to_image_2d_align16_str,
+                (size_t)cl_internal_copy_buffer_to_image_2d_align16_str_size, NULL);
+    }
+    else{
       extern char cl_internal_copy_buffer_to_image_2d_str[];
       extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
 
       ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
           cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
+    }
   }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
       extern char cl_internal_copy_buffer_to_image_3d_str[];
       extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
@@ -1860,13 +1899,14 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
   cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
   cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
-  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
 
   ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  cl_kernel_delete(ker);
 
   image->intel_fmt = intel_fmt;
   image->bpp = bpp;
-  image->w = image->w / bpp;
+  image->w = w_saved;
 
   return ret;
 }
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
index a97c00f..d7a1f68 100644
--- a/src/cl_platform_id.c
+++ b/src/cl_platform_id.c
@@ -41,8 +41,19 @@ static struct _cl_platform_id intel_platform_data = {
 
 #undef DECL_INFO_STRING
 
-/* Intel platform (only GPU now) */
-cl_platform_id const intel_platform = &intel_platform_data;
+/* Intel platform (only GPU now).
+   It is used as default when the API's platform ptr is NULL */
+static cl_platform_id intel_platform = NULL;
+LOCAL cl_platform_id
+cl_get_platform_default(void)
+{
+  if (intel_platform)
+    return intel_platform;
+
+  intel_platform = &intel_platform_data;
+  cl_intel_platform_extension_init(intel_platform);
+  return intel_platform;
+}
 
 LOCAL cl_int
 cl_get_platform_ids(cl_uint          num_entries,
@@ -52,29 +63,28 @@ cl_get_platform_ids(cl_uint          num_entries,
   if (num_platforms != NULL)
     *num_platforms = 1;
 
-  cl_intel_platform_extension_init(intel_platform);
   /* Easy right now, only one platform is supported */
   if(platforms)
-    *platforms = intel_platform;
-  intel_platform->extensions_sz = strlen(intel_platform->extensions) + 1;
+    *platforms = cl_get_platform_default();
+
   return CL_SUCCESS;
 }
 
 #define DECL_FIELD(CASE,FIELD)                                  \
   case JOIN(CL_,CASE):                                          \
-    if (param_value_size < intel_platform->JOIN(FIELD,_sz))     \
+    if (param_value_size < cl_get_platform_default()->JOIN(FIELD,_sz))     \
       return CL_INVALID_VALUE;                                  \
     if (param_value_size_ret != NULL)                           \
-      *param_value_size_ret = intel_platform->JOIN(FIELD,_sz);  \
+      *param_value_size_ret = cl_get_platform_default()->JOIN(FIELD,_sz);  \
     memcpy(param_value,                                         \
-           intel_platform->FIELD,                               \
-           intel_platform->JOIN(FIELD,_sz));                    \
+           cl_get_platform_default()->FIELD,                               \
+           cl_get_platform_default()->JOIN(FIELD,_sz));                    \
       return CL_SUCCESS;
 
 #define GET_FIELD_SZ(CASE,FIELD)                                \
   case JOIN(CL_,CASE):                                          \
     if (param_value_size_ret != NULL)                           \
-      *param_value_size_ret = intel_platform->JOIN(FIELD,_sz);  \
+      *param_value_size_ret = cl_get_platform_default()->JOIN(FIELD,_sz);  \
     return CL_SUCCESS;
 
 LOCAL cl_int
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
index 7b78db1..865317a 100644
--- a/src/cl_platform_id.h
+++ b/src/cl_platform_id.h
@@ -44,8 +44,8 @@ struct _cl_platform_id {
   struct cl_extensions *internal_extensions;
 };
 
-/* Platform implemented by this run-time */
-extern cl_platform_id const intel_platform;
+/* Return the default platform */
+extern cl_platform_id cl_get_platform_default(void);
 
 /* Return the valid platform */
 extern cl_int cl_get_platform_ids(cl_uint          num_entries,
diff --git a/src/cl_program.c b/src/cl_program.c
index c30f85e..db53757 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -231,7 +231,21 @@ cl_program_create_from_binary(cl_context             ctx,
   program->binary_sz = lengths[0];
   program->source_type = FROM_BINARY;
 
-  if(isBitcode((unsigned char*)program->binary+1, (unsigned char*)program->binary+program->binary_sz)) {
+  if(isBitcode((unsigned char*)program->binary, (unsigned char*)program->binary+program->binary_sz)) {
+
+    char* typed_binary;
+    TRY_ALLOC(typed_binary, cl_calloc(lengths[0]+1, sizeof(char)));
+    memcpy(typed_binary+1, binaries[0], lengths[0]);
+    *typed_binary = 1;
+    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->device->vendor_id, typed_binary, program->binary_sz+1);
+    cl_free(typed_binary);
+    if (UNLIKELY(program->opaque == NULL)) {
+      err = CL_INVALID_PROGRAM;
+      goto error;
+    }
+
+    program->source_type = FROM_LLVM_SPIR;
+  }else if(isBitcode((unsigned char*)program->binary+1, (unsigned char*)program->binary+program->binary_sz)) {
     if(*program->binary == 1){
       program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
     }else if(*program->binary == 2){
@@ -499,6 +513,9 @@ cl_program_build(cl_program p, const char *options)
       memcpy(p->build_opts, options, strlen(options));
 
       p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+      if (strstr(options, "-x spir")) {
+        p->source_type = FROM_LLVM_SPIR;
+      }
     }
   }
 
@@ -526,7 +543,7 @@ cl_program_build(cl_program p, const char *options)
 
     /* Create all the kernels */
     TRY (cl_program_load_gen_program, p);
-  } else if (p->source_type == FROM_LLVM) {
+  } else if (p->source_type == FROM_LLVM || p->source_type == FROM_LLVM_SPIR) {
     if (!CompilerSupported()) {
       err = CL_COMPILER_NOT_AVAILABLE;
       goto error;
diff --git a/src/cl_program.h b/src/cl_program.h
index 3ab7acd..7af0206 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -33,7 +33,8 @@ struct _gbe_program;
 enum {
   FROM_SOURCE = 0,
   FROM_LLVM = 1,
-  FROM_BINARY = 2
+  FROM_BINARY = 2,
+  FROM_LLVM_SPIR = 3
 };
 
 /* This maps an OCL file containing some kernels */
diff --git a/src/cl_thread.c b/src/cl_thread.c
index 0d99574..5e5a351 100644
--- a/src/cl_thread.c
+++ b/src/cl_thread.c
@@ -45,6 +45,8 @@ typedef struct _thread_spec_data {
   cl_gpgpu gpgpu ;
   int valid;
   void* thread_batch_buf;
+  cl_event last_event;
+  cl_event current_event;
   int thread_magic;
 } thread_spec_data;
 
@@ -106,6 +108,34 @@ static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int
   return spec;
 }
 
+cl_event get_current_event(cl_command_queue queue)
+{
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+  assert(spec && spec->thread_magic == thread_magic);
+  return spec->current_event;
+}
+
+cl_event get_last_event(cl_command_queue queue)
+{
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+  assert(spec && spec->thread_magic == thread_magic);
+  return spec->last_event;
+}
+
+void set_current_event(cl_command_queue queue, cl_event e)
+{
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+  assert(spec && spec->thread_magic == thread_magic);
+  spec->current_event = e;
+}
+
+void set_last_event(cl_command_queue queue, cl_event e)
+{
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+  assert(spec && spec->thread_magic == thread_magic);
+  spec->last_event = e;
+}
+
 void* cl_thread_data_create(void)
 {
   queue_thread_private* thread_private = CALLOC(queue_thread_private);
diff --git a/src/cl_thread.h b/src/cl_thread.h
index 7b48a26..d77526b 100644
--- a/src/cl_thread.h
+++ b/src/cl_thread.h
@@ -44,4 +44,9 @@ void* cl_get_thread_batch_buf(cl_command_queue queue);
 /* take current gpgpu from the thread gpgpu pool. */
 cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue);
 
+cl_event get_current_event(cl_command_queue queue);
+cl_event get_last_event(cl_command_queue queue);
+void set_current_event(cl_command_queue queue, cl_event e);
+void set_last_event(cl_command_queue queue, cl_event e);
+
 #endif /* __CL_THREAD_H__ */
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
index 044c004..6ada30c 100644
--- a/src/intel/intel_defines.h
+++ b/src/intel/intel_defines.h
@@ -88,6 +88,7 @@
 #define PIPELINE_SELECT_3D              0
 #define PIPELINE_SELECT_MEDIA           1
 #define PIPELINE_SELECT_GPGPU           2
+#define PIPELINE_SELECT_MASK            (3 << 8)
 
 #define UF0_CS_REALLOC                  (1 << 13)
 #define UF0_VFE_REALLOC                 (1 << 12)
@@ -303,7 +304,6 @@
 
 #define URB_SIZE(intel)         (IS_IGDNG(intel->device_id) ? 1024 : \
                                  IS_G4X(intel->device_id) ? 384 : 256)
-
 // HSW
 #define HSW_SCRATCH1_OFFSET                      (0xB038)
 #define HSW_ROW_CHICKEN3_HDC_OFFSET              (0xE49C)
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 1d5b4d9..9c72777 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -171,7 +171,9 @@ intel_driver_init(intel_driver_t *driver, int dev_fd)
   else
     FATAL ("Unsupported Gen for emulation");
 #else
-  if (IS_GEN8(driver->device_id))
+  if (IS_GEN9(driver->device_id))
+    driver->gen_ver = 9;
+  else if (IS_GEN8(driver->device_id))
     driver->gen_ver = 8;
   else if (IS_GEN75(driver->device_id))
     driver->gen_ver = 75;
@@ -446,6 +448,12 @@ intel_driver_get_ver(struct intel_driver *drv)
   return drv->gen_ver;
 }
 
+static void
+intel_driver_set_atomic_flag(intel_driver_t *drv, int atomic_flag)
+{
+  drv->atomic_test_result = atomic_flag;
+}
+
 static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
 static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
 
@@ -472,6 +480,13 @@ static uint32_t intel_buffer_get_tiling_align(cl_context ctx, uint32_t tiling_mo
       ret = 512;
     } else if (dim == 1) { //tileX height in number of rows
       ret = 8;
+    }  else if (dim == 2) { //height to calculate slice pitch
+      if (gen_ver == 9) //SKL same as tileY height
+        ret = 8;
+      else if (gen_ver == 8)  //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
+        ret = 4;
+      else
+        ret = 2;
     } else
       assert(0);
     break;
@@ -481,13 +496,20 @@ static uint32_t intel_buffer_get_tiling_align(cl_context ctx, uint32_t tiling_mo
       ret = 128;
     } else if (dim == 1) { //tileY height in number of rows
       ret = 32;
+    } else if (dim == 2) { //height to calculate slice pitch
+      if (gen_ver == 9) //SKL same as tileY height
+        ret = 32;
+      else if (gen_ver == 8) //IVB, HSW, BDW same as CL_NO_TILE vertical alignment
+        ret = 4;
+      else
+        ret = 2;
     } else
       assert(0);
     break;
 
   case CL_NO_TILE:
-    if (dim == 1) { //vertical alignment
-      if (gen_ver == 8)
+    if (dim == 1 || dim == 2) { //vertical alignment
+      if (gen_ver == 8 || gen_ver == 9) // SKL 1D arrays need a qpitch alignment of 4
         ret = 4;
       else
         ret = 2;
@@ -748,14 +770,80 @@ static int intel_buffer_set_tiling(cl_buffer bo,
   return ret;
 }
 
+#define CHV_CONFIG_WARNING \
+        "Warning: can't get GPU's configurations, will use the minimal one. Please update your drm to 2.4.59+ and linux kernel to 4.0.0+.\n"
+static void
+intel_update_device_info(cl_device_id device)
+{
+  intel_driver_t *driver;
+
+  driver = intel_driver_new();
+  assert(driver != NULL);
+  if (intel_driver_open(driver, NULL) != CL_SUCCESS) {
+    intel_driver_delete(driver);
+    return;
+  }
+
+#ifdef HAS_USERPTR
+  const size_t sz = 4096;
+  void *host_ptr;
+
+  host_ptr = cl_aligned_malloc(sz, 4096);
+  if (host_ptr != NULL) {
+    cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
+      "CL memory object", host_ptr, sz, 0);
+    if (bo == NULL)
+      device->host_unified_memory = CL_FALSE;
+    else
+      drm_intel_bo_unreference((drm_intel_bo*)bo);
+    cl_free(host_ptr);
+  }
+  else
+    device->host_unified_memory = CL_FALSE;
+#endif
+
+#ifdef HAS_EU_TOTAL
+  unsigned int eu_total;
+
+  /* Prefer driver-queried max compute units if supported */
+  if (!drm_intel_get_eu_total(driver->fd, &eu_total))
+    device->max_compute_unit = eu_total;
+  else if (IS_CHERRYVIEW(device->vendor_id))
+    printf(CHV_CONFIG_WARNING);
+#else
+  if (IS_CHERRYVIEW(device->vendor_id))
+    printf(CHV_CONFIG_WARNING);
+#endif
+
+#ifdef HAS_SUBSLICE_TOTAL
+  unsigned int subslice_total;
+
+  /* Prefer driver-queried subslice count if supported */
+  if (!drm_intel_get_subslice_total(driver->fd, &subslice_total))
+    device->sub_slice_count = subslice_total;
+  else if (IS_CHERRYVIEW(device->vendor_id))
+    printf(CHV_CONFIG_WARNING);
+#else
+  if (IS_CHERRYVIEW(device->vendor_id))
+    printf(CHV_CONFIG_WARNING);
+#endif
+
+  intel_driver_context_destroy(driver);
+  intel_driver_close(driver);
+  intel_driver_terminate(driver);
+  intel_driver_delete(driver);
+}
+
 LOCAL void
 intel_setup_callbacks(void)
 {
   cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new;
   cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete;
   cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
+  cl_driver_set_atomic_flag = (cl_driver_set_atomic_flag_cb *) intel_driver_set_atomic_flag;
   cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
   cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
+  cl_driver_update_device_info = (cl_driver_update_device_info_cb *) intel_update_device_info;
   cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
   cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
   cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
index f972ec8..51f0e0d 100644
--- a/src/intel/intel_driver.h
+++ b/src/intel/intel_driver.h
@@ -89,6 +89,7 @@ typedef struct intel_driver
   Display *x11_display;
   struct dri_state *dri_ctx;
   struct intel_gpgpu_node *gpgpu_list;
+  int atomic_test_result;
 } intel_driver_t;
 
 #define SET_BLOCKED_SIGSET(DRIVER)   do {                     \
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index e7898a2..901bd98 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -107,6 +107,9 @@ intel_gpgpu_load_idrt_t *intel_gpgpu_load_idrt = NULL;
 typedef void (intel_gpgpu_pipe_control_t)(intel_gpgpu_t *gpgpu);
 intel_gpgpu_pipe_control_t *intel_gpgpu_pipe_control = NULL;
 
+typedef void (intel_gpgpu_select_pipeline_t)(intel_gpgpu_t *gpgpu);
+intel_gpgpu_select_pipeline_t *intel_gpgpu_select_pipeline = NULL;
+
 static void
 intel_gpgpu_sync(void *buf)
 {
@@ -245,13 +248,21 @@ error:
 }
 
 static void
-intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
+intel_gpgpu_select_pipeline_gen7(intel_gpgpu_t *gpgpu)
 {
   BEGIN_BATCH(gpgpu->batch, 1);
   OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
   ADVANCE_BATCH(gpgpu->batch);
 }
 
+static void
+intel_gpgpu_select_pipeline_gen9(intel_gpgpu_t *gpgpu)
+{
+  BEGIN_BATCH(gpgpu->batch, 1);
+  OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MASK | PIPELINE_SELECT_GPGPU);
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
 static uint32_t
 intel_gpgpu_get_cache_ctrl_gen7()
 {
@@ -268,6 +279,13 @@ intel_gpgpu_get_cache_ctrl_gen8()
 {
   return tcc_llc_ec_l3 | mtllc_wb;
 }
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen9()
+{
+  // Pre-defined cache control registers for Gen9:
+  //L3CC: WB; LeCC: WB; TC: LLC/eLLC;
+  return (0x9 << 1);
+}
 
 static void
 intel_gpgpu_set_base_address_gen7(intel_gpgpu_t *gpgpu)
@@ -347,6 +365,55 @@ intel_gpgpu_set_base_address_gen8(intel_gpgpu_t *gpgpu)
     ADVANCE_BATCH(gpgpu->batch);
 }
 
+static void
+intel_gpgpu_set_base_address_gen9(intel_gpgpu_t *gpgpu)
+{
+    const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
+    BEGIN_BATCH(gpgpu->batch, 19);
+    OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 17);
+    /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
+    OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY);    /* General State Base Addr   */
+    OUT_BATCH(gpgpu->batch, 0);
+    OUT_BATCH(gpgpu->batch, 0 | (def_cc << 16));
+    /* 0, State Mem Obj CC */
+    /* We use a state base address for the surface heap since IVB clamp the
+     * binding table pointer at 11 bits. So, we cannot use pointers directly while
+     * using the surface heap
+     */
+    assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0);
+    OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
+              I915_GEM_DOMAIN_SAMPLER,
+              I915_GEM_DOMAIN_SAMPLER,
+              gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
+    OUT_BATCH(gpgpu->batch, 0);
+    OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
+              I915_GEM_DOMAIN_RENDER,
+              I915_GEM_DOMAIN_RENDER,
+              (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); /* Dynamic State Base Addr */
+    OUT_BATCH(gpgpu->batch, 0);
+    OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
+    OUT_BATCH(gpgpu->batch, 0);
+    //OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
+    OUT_RELOC(gpgpu->batch, (drm_intel_bo *)gpgpu->ker->bo,
+              I915_GEM_DOMAIN_INSTRUCTION,
+              I915_GEM_DOMAIN_INSTRUCTION,
+              0 + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
+    OUT_BATCH(gpgpu->batch, 0);
+
+    OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+    /* According to mesa i965 driver code, we must set the dynamic state access upper bound
+     * to a valid bound value, otherwise, the border color pointer may be rejected and you
+     * may get incorrect border color. This is a known hardware bug. */
+    OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+    OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+    OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+    /* Bindless surface state base address */
+    OUT_BATCH(gpgpu->batch, (def_cc << 4) | BASE_ADDRESS_MODIFY);
+    OUT_BATCH(gpgpu->batch, 0);
+    OUT_BATCH(gpgpu->batch, 0xfffff000);
+    ADVANCE_BATCH(gpgpu->batch);
+}
+
 uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
   return size / 1024 - 1;
 }
@@ -452,6 +519,7 @@ intel_gpgpu_load_vfe_state_gen8(intel_gpgpu_t *gpgpu)
   OUT_BATCH(gpgpu->batch, 0);
   OUT_BATCH(gpgpu->batch, 0);
   OUT_BATCH(gpgpu->batch, 0);
+
   ADVANCE_BATCH(gpgpu->batch);
 }
 
@@ -577,6 +645,25 @@ intel_gpgpu_pipe_control_gen75(intel_gpgpu_t *gpgpu)
 }
 
 static void
+intel_gpgpu_pipe_control_gen8(intel_gpgpu_t *gpgpu)
+{
+  gen8_pipe_control_t* pc = (gen8_pipe_control_t*)
+    intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen8_pipe_control_t));
+  memset(pc, 0, sizeof(*pc));
+  pc->dw0.length = SIZEOF32(gen8_pipe_control_t) - 2;
+  pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
+  pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
+  pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
+  pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
+  pc->dw1.render_target_cache_flush_enable = 1;
+  pc->dw1.texture_cache_invalidation_enable = 1;
+  pc->dw1.cs_stall = 1;
+  pc->dw1.dc_flush_enable = 1;
+  //pc->dw1.instruction_cache_invalidate_enable = 1;
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
 intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 {
   BEGIN_BATCH(gpgpu->batch, 9);
@@ -632,16 +719,23 @@ static void
 intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 {
   /* still set L3 in batch buffer for fulsim. */
-  BEGIN_BATCH(gpgpu->batch, 15);
-  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
-  /* FIXME: KMD always disable the atomic in L3 for some reason.
-     I checked the spec, and don't think we need that workaround now.
-     Before I send a patch to kernel, let's just enable it here. */
-  OUT_BATCH(gpgpu->batch, HSW_SCRATCH1_OFFSET);
-  OUT_BATCH(gpgpu->batch, 0);                         /* enable atomic in L3 */
-  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
-  OUT_BATCH(gpgpu->batch, HSW_ROW_CHICKEN3_HDC_OFFSET);
-  OUT_BATCH(gpgpu->batch, (1 << 6ul) << 16);          /* enable atomic in L3 */
+  if(gpgpu->drv->atomic_test_result != SELF_TEST_ATOMIC_FAIL)
+  {
+    BEGIN_BATCH(gpgpu->batch, 15);
+    OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+    /* FIXME: the KMD always disables atomics in L3 for some reason.
+       I checked the spec and don't think we need that workaround now.
+       Until a kernel patch lands, just enable it here. */
+    OUT_BATCH(gpgpu->batch, HSW_SCRATCH1_OFFSET);
+    OUT_BATCH(gpgpu->batch, 0);                         /* enable atomic in L3 */
+    OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+    OUT_BATCH(gpgpu->batch, HSW_ROW_CHICKEN3_HDC_OFFSET);
+    OUT_BATCH(gpgpu->batch, (1 << 6ul) << 16);          /* enable atomic in L3 */
+  }
+  else
+  {
+    BEGIN_BATCH(gpgpu->batch, 9);
+  }
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
   OUT_BATCH(gpgpu->batch, 0x08800000);
@@ -1047,7 +1141,9 @@ static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_
   uint32_t surface_type;
   if (((IS_IVYBRIDGE(gpgpu->drv->device_id) ||
         IS_HASWELL(gpgpu->drv->device_id) ||
-        IS_BROADWELL(gpgpu->drv->device_id))) &&
+        IS_BROADWELL(gpgpu->drv->device_id) ||
+        IS_CHERRYVIEW(gpgpu->drv->device_id) ||
+        IS_SKYLAKE(gpgpu->drv->device_id))) &&
       index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM &&
       type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
     surface_type = I965_SURFACE_2D;
@@ -1063,10 +1159,12 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
                               uint32_t obj_bo_offset,
                               uint32_t format,
                               cl_mem_object_type type,
+                              uint32_t bpp,
                               int32_t w,
                               int32_t h,
                               int32_t depth,
                               int32_t pitch,
+                              int32_t slice_pitch,
                               int32_t tiling)
 {
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
@@ -1109,10 +1207,12 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
                               uint32_t obj_bo_offset,
                               uint32_t format,
                               cl_mem_object_type type,
+                              uint32_t bpp,
                               int32_t w,
                               int32_t h,
                               int32_t depth,
                               int32_t pitch,
+                              int32_t slice_pitch,
                               int32_t tiling)
 {
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
@@ -1157,10 +1257,12 @@ intel_gpgpu_bind_image_gen8(intel_gpgpu_t *gpgpu,
                             uint32_t obj_bo_offset,
                             uint32_t format,
                             cl_mem_object_type type,
+                            uint32_t bpp,
                             int32_t w,
                             int32_t h,
                             int32_t depth,
                             int32_t pitch,
+                            int32_t slice_pitch,
                             int32_t tiling)
 {
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
@@ -1216,6 +1318,82 @@ intel_gpgpu_bind_image_gen8(intel_gpgpu_t *gpgpu,
 }
 
 static void
+intel_gpgpu_bind_image_gen9(intel_gpgpu_t *gpgpu,
+                            uint32_t index,
+                            dri_bo* obj_bo,
+                            uint32_t obj_bo_offset,
+                            uint32_t format,
+                            cl_mem_object_type type,
+                            uint32_t bpp,
+                            int32_t w,
+                            int32_t h,
+                            int32_t depth,
+                            int32_t pitch,
+                            int32_t slice_pitch,
+                            int32_t tiling)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen8_surface_state_t *ss = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
+  memset(ss, 0, sizeof(*ss));
+  ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+  ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
+  ss->ss0.surface_format = format;
+  if (intel_is_surface_array(type) && ss->ss0.surface_type == I965_SURFACE_1D) {
+    ss->ss0.surface_array = 1;
+    ss->ss1.surface_qpitch = (slice_pitch/bpp + 3)/4;   //align_h
+  }
+
+  if (intel_is_surface_array(type) && ss->ss0.surface_type == I965_SURFACE_2D) {
+    ss->ss0.surface_array = 1;
+    ss->ss1.surface_qpitch = (slice_pitch/pitch + 3)/4;
+  }
+
+  if(ss->ss0.surface_type == I965_SURFACE_3D)
+    ss->ss1.surface_qpitch = (slice_pitch/pitch + 3)/4;
+
+  ss->ss0.horizontal_alignment = 1;
+  ss->ss0.vertical_alignment = 1;
+
+  if (tiling == GPGPU_TILE_X) {
+    ss->ss0.tile_mode = GEN8_TILEMODE_XMAJOR;
+  } else if (tiling == GPGPU_TILE_Y) {
+    ss->ss0.tile_mode = GEN8_TILEMODE_YMAJOR;
+  } else
+    assert(tiling == GPGPU_NO_TILE); /* W-tiling is not supported yet. */
+
+  ss->ss2.width = w - 1;
+  ss->ss2.height = h - 1;
+  ss->ss3.depth = depth - 1;
+
+  ss->ss8.surface_base_addr_lo = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff;
+  ss->ss9.surface_base_addr_hi = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff;
+
+  ss->ss4.render_target_view_ext = depth - 1;
+  ss->ss4.min_array_elt = 0;
+  ss->ss3.surface_pitch = pitch - 1;
+
+  ss->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl();
+  ss->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
+  ss->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
+  ss->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
+  ss->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
+  ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+                               index * surface_state_sz;
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_RENDER,
+                    I915_GEM_DOMAIN_RENDER,
+                    obj_bo_offset,
+                    gpgpu->aux_offset.surface_heap_offset +
+                    heap->binding_table[index] +
+                    offsetof(gen8_surface_state_t, ss8),
+                    obj_bo);
+
+  assert(index < GEN_MAX_SURFACES);
+}
+
+static void
 intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
                      uint32_t internal_offset, uint32_t size, uint8_t bti)
 {
@@ -1357,6 +1535,50 @@ intel_gpgpu_build_idrt_gen8(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
   desc->desc6.slm_sz = slm_sz;
 }
 
+static void
+intel_gpgpu_build_idrt_gen9(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
+{
+  gen8_interface_descriptor_t *desc;
+
+  desc = (gen8_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset);
+
+  memset(desc, 0, sizeof(*desc));
+  desc->desc0.kernel_start_pointer = 0; /* reloc */
+  desc->desc2.single_program_flow = 0;
+  desc->desc2.floating_point_mode = 0; /* use IEEE-754 rule */
+  desc->desc6.rounding_mode = 0; /* round to nearest even */
+
+  assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0);
+  desc->desc3.sampler_state_pointer = gpgpu->aux_offset.sampler_state_offset >> 5;
+  desc->desc4.binding_table_entry_count = 0; /* no prefetch */
+  desc->desc4.binding_table_pointer = 0;
+  desc->desc5.curbe_read_len = kernel->curbe_sz / 32;
+  desc->desc5.curbe_read_offset = 0;
+
+  /* Barriers / SLM are automatically handled on Gen7+ */
+  size_t slm_sz = kernel->slm_sz;
+  /* Per bspec, group_threads_num must not be 0 even when the barrier is disabled. */
+  desc->desc6.group_threads_num = kernel->thread_n;
+  desc->desc6.barrier_enable = kernel->use_slm;
+  if (slm_sz == 0)
+    slm_sz = 0;
+  else if (slm_sz <= 1*KB)
+    slm_sz = 1;
+  else if (slm_sz <= 2*KB)
+    slm_sz = 2;
+  else if (slm_sz <= 4*KB)
+    slm_sz = 3;
+  else if (slm_sz <= 8*KB)
+    slm_sz = 4;
+  else if (slm_sz <= 16*KB)
+    slm_sz = 5;
+  else if (slm_sz <= 32*KB)
+    slm_sz = 6;
+  else
+    slm_sz = 7;
+  desc->desc6.slm_sz = slm_sz;
+}
+
 static int
 intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
 {
@@ -1935,13 +2157,15 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
   cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
 
-  if (IS_BROADWELL(device_id)) {
+  if (IS_BROADWELL(device_id) || IS_CHERRYVIEW(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen8;
     intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
     cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen8;
     intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
     intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //BDW need not restore SLM, same as gen7
     intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
+    if(IS_CHERRYVIEW(device_id))
+      intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
     intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen8;
     intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen8;
     intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8;
@@ -1950,7 +2174,27 @@ intel_set_gpgpu_callbacks(int device_id)
     intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen8;
     intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8;
     cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
-    intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen7;
+    intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8;
+    intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
+    return;
+  }
+  if (IS_SKYLAKE(device_id)) {
+    cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen9;
+    intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
+    cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen9;
+    intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
+    intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //SKL need not restore SLM, same as gen7
+    intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
+    intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen9;
+    intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen8;
+    intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8;
+    cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen8;
+    intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen9;
+    intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen8;
+    intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8;
+    cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
+    intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen8;
+    intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen9;
     return;
   }
 
@@ -1960,6 +2204,7 @@ intel_set_gpgpu_callbacks(int device_id)
   intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen7;
   intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen7;
   intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen7;
+  intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
 
   if (IS_HASWELL(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index 258fbb9..fd6a82b 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -142,8 +142,9 @@ typedef struct gen8_interface_descriptor
   } desc5;
 
   struct {
-    uint32_t group_threads_num:8;        /* 0..64, 0 - no barrier use */
-    uint32_t barrier_return_byte:8;
+    uint32_t group_threads_num:10;       /* threads per group (widened to 10 bits), 0 - no barrier use */
+    uint32_t pad:5;
+    uint32_t global_barrier_enable:1;
     uint32_t slm_sz:5;                   /* 0..16 - 0K..64K */
     uint32_t barrier_enable:1;
     uint32_t rounding_mode:2;
@@ -498,6 +499,62 @@ typedef struct gen6_pipe_control
   } dw4;
 } gen6_pipe_control_t;
 
+typedef struct gen8_pipe_control
+{
+  struct {
+    uint32_t length : BITFIELD_RANGE(0, 7);
+    uint32_t reserved : BITFIELD_RANGE(8, 15);
+    uint32_t instruction_subopcode : BITFIELD_RANGE(16, 23);
+    uint32_t instruction_opcode : BITFIELD_RANGE(24, 26);
+    uint32_t instruction_pipeline : BITFIELD_RANGE(27, 28);
+    uint32_t instruction_type : BITFIELD_RANGE(29, 31);
+  } dw0;
+
+  struct {
+    uint32_t depth_cache_flush_enable : BITFIELD_BIT(0);
+    uint32_t stall_at_pixel_scoreboard : BITFIELD_BIT(1);
+    uint32_t state_cache_invalidation_enable : BITFIELD_BIT(2);
+    uint32_t constant_cache_invalidation_enable : BITFIELD_BIT(3);
+    uint32_t vf_cache_invalidation_enable : BITFIELD_BIT(4);
+    uint32_t dc_flush_enable : BITFIELD_BIT(5);
+    uint32_t protected_memory_app_id : BITFIELD_BIT(6);
+    uint32_t pipe_control_flush_enable : BITFIELD_BIT(7);
+    uint32_t notify_enable : BITFIELD_BIT(8);
+    uint32_t indirect_state_pointers_disable : BITFIELD_BIT(9);
+    uint32_t texture_cache_invalidation_enable : BITFIELD_BIT(10);
+    uint32_t instruction_cache_invalidate_enable : BITFIELD_BIT(11);
+    uint32_t render_target_cache_flush_enable : BITFIELD_BIT(12);
+    uint32_t depth_stall_enable : BITFIELD_BIT(13);
+    uint32_t post_sync_operation : BITFIELD_RANGE(14, 15);
+    uint32_t generic_media_state_clear : BITFIELD_BIT(16);
+    uint32_t synchronize_gfdt_surface : BITFIELD_BIT(17);
+    uint32_t tlb_invalidate : BITFIELD_BIT(18);
+    uint32_t global_snapshot_count_reset : BITFIELD_BIT(19);
+    uint32_t cs_stall : BITFIELD_BIT(20);
+    uint32_t store_data_index : BITFIELD_BIT(21);
+    uint32_t protected_memory_enable : BITFIELD_BIT(22);
+    uint32_t reserved : BITFIELD_RANGE(23, 31);
+  } dw1;
+
+  struct {
+    uint32_t reserved : BITFIELD_RANGE(0, 1);
+    uint32_t destination_address_type : BITFIELD_BIT(2);
+    uint32_t address : BITFIELD_RANGE(3, 31);
+  } dw2;
+
+  struct {
+    uint32_t data;
+  } dw3;
+
+  struct {
+    uint32_t data;
+  } dw4;
+
+  struct {
+    uint32_t data;
+  } dw5;
+} gen8_pipe_control_t;
+
 typedef struct gen6_sampler_state
 {
   struct {
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
new file mode 100644
index 0000000..5b32cd5
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
@@ -0,0 +1,18 @@
+kernel void __cl_copy_buffer_to_image_2d_align16(__write_only image2d_t image, global uint4* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  uint4 color = (uint4)(0);
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  src_offset += j * region0 + i;
+  color = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
+
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 9cbf260..e7a9e26 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -2,8 +2,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include)
 
 ##### Math Function Part:
-EXEC_PROGRAM(mkdir ${CMAKE_CURRENT_SOURCE_DIR} ARGS generated -p)
-EXEC_PROGRAM(${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR} ARGS utest_math_gen.py OUTPUT_VARIABLE GEN_MATH_STRING)
+EXECUTE_PROCESS(COMMAND mkdir generated -p WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+EXECUTE_PROCESS(COMMAND ${PYTHON_EXECUTABLE} utest_math_gen.py WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE GEN_MATH_STRING)
 string(REGEX REPLACE " " ";" ADDMATHFUNC ${GEN_MATH_STRING})
 
 string(REGEX REPLACE "generated/([^\ ]*)\\.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../kernels/\\1.cl" KERNEL_MATH_LIST ${GEN_MATH_STRING})
@@ -39,6 +39,8 @@ set (utests_sources
   compiler_box_blur.cpp
   compiler_insert_to_constant.cpp
   compiler_argument_structure.cpp
+  compiler_argument_structure_indirect.cpp
+  compiler_argument_structure_select.cpp
   compiler_arith_shift_right.cpp
   compiler_mixed_pointer.cpp
   compiler_array0.cpp
@@ -49,8 +51,6 @@ set (utests_sources
   compiler_array4.cpp
   compiler_byte_scatter.cpp
   compiler_ceil.cpp
-  compiler_clz_short.cpp
-  compiler_clz_int.cpp
   compiler_popcount.cpp
   compiler_convert_uchar_sat.cpp
   compiler_copy_buffer.cpp
@@ -115,6 +115,7 @@ set (utests_sources
   compiler_write_only_shorts.cpp
   compiler_switch.cpp
   compiler_bswap.cpp
+  compiler_clz.cpp
   compiler_math.cpp
   compiler_atomic_functions.cpp
   compiler_async_copy.cpp
@@ -172,6 +173,9 @@ set (utests_sources
   runtime_compile_link.cpp
   compiler_long.cpp
   compiler_long_2.cpp
+  compiler_long_not.cpp
+  compiler_long_hi_sat.cpp
+  compiler_long_div.cpp
   compiler_long_convert.cpp
   compiler_long_shl.cpp
   compiler_long_shr.cpp
@@ -179,17 +183,19 @@ set (utests_sources
   compiler_long_mult.cpp
   compiler_long_cmp.cpp
   compiler_long_bitcast.cpp
+  compiler_half.cpp
   compiler_function_argument3.cpp
   compiler_function_qualifiers.cpp
   compiler_bool_cross_basic_block.cpp
   compiler_private_const.cpp
   compiler_private_data_overflow.cpp
   compiler_getelementptr_bitcast.cpp
-  compiler_simd_any.cpp
-  compiler_simd_all.cpp
+  compiler_sub_group_any.cpp
+  compiler_sub_group_all.cpp
   compiler_time_stamp.cpp
   compiler_double_precision.cpp
   load_program_from_gen_bin.cpp
+  load_program_from_spir.cpp
   get_arg_info.cpp
   profiling_exec.cpp
   enqueue_copy_buf.cpp
@@ -205,7 +211,10 @@ set (utests_sources
   compiler_assignment_operation_in_if.cpp
   vload_bench.cpp
   runtime_use_host_ptr_buffer.cpp
-  runtime_alloc_host_ptr_buffer.cpp)
+  runtime_alloc_host_ptr_buffer.cpp
+  compiler_get_sub_group_size.cpp
+  compiler_get_sub_group_id.cpp
+  compiler_sub_group_shuffle.cpp)
 
 if (LLVM_VERSION_NODOT VERSION_GREATER 34)
   SET(utests_sources
@@ -214,12 +223,12 @@ if (LLVM_VERSION_NODOT VERSION_GREATER 34)
 endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
 
 if (X11_FOUND)
-  set(utests_sources
+  SET(utests_sources
       ${utests_sources}
       runtime_climage_from_boname.cpp)
   SET(UTESTS_REQUIRED_X11_LIB ${X11_LIBRARIES} ${XEXT_LIBRARIES})
 else()
-SET(UTESTS_REQUIRED_X11_LIB "")
+  SET(UTESTS_REQUIRED_X11_LIB "")
 endif (X11_FOUND)
 
 SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
index a18f31e..f586448 100644
--- a/utests/builtin_pow.cpp
+++ b/utests/builtin_pow.cpp
@@ -28,6 +28,7 @@ static void builtin_pow(void)
 {
   // Setup kernel and buffers
   int k, i, index_cur;
+  float ULPSIZE_NO_FAST_MATH = 16.0;
   float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
 
   for(i=0; i<count_input_ori;i++)
@@ -37,10 +38,10 @@ static void builtin_pow(void)
       input_data2[i*count_input_ori+k] = ori_data[k];
     }
 
-  const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
-  float ULPSIZE_FACTOR = 16.0;
-  if (env_strict == NULL || strcmp(env_strict, "0") == 0)
-    ULPSIZE_FACTOR = 10000.;
+  cl_device_fp_config fp_config;
+  clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, 0);
+  bool denormals_supported = fp_config & CL_FP_DENORM;
+  float ULPSIZE_FACTOR = select_ulpsize(ULPSIZE_FAST_MATH,ULPSIZE_NO_FAST_MATH);
 
   OCL_CREATE_KERNEL("builtin_pow");
 
@@ -75,7 +76,9 @@ static void builtin_pow(void)
 #if udebug
       if ( (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])) ||
            (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])) ||
-           (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR)   )
+           (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR
+           && (denormals_supported || gpu_data[index_cur]!=0 || std::fpclassify(cpu_data[index_cur])!=FP_SUBNORMAL) ) )
+
       {
         printf_c("%d/%d: x:%f, y:%f -> gpu:%f  cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
       }
@@ -88,7 +91,8 @@ static void builtin_pow(void)
        OCL_ASSERT(isnan(gpu_data[index_cur]));
      else
      {
-       OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR);
+       OCL_ASSERT((fabs(gpu_data[index_cur] - cpu_data[index_cur]) < cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR) ||
+       (!denormals_supported && gpu_data[index_cur]==0 && std::fpclassify(cpu_data[index_cur])==FP_SUBNORMAL) );
      }
 #endif
     }
diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
index 47cc5f4..db9ab3c 100644
--- a/utests/builtin_tgamma.cpp
+++ b/utests/builtin_tgamma.cpp
@@ -6,6 +6,7 @@ void builtin_tgamma(void)
 {
   const int n = 1024;
   float src[n];
+  float ULPSIZE_NO_FAST_MATH = 16.0;
 
   // Setup kernel and buffers
   OCL_CREATE_KERNEL("builtin_tgamma");
@@ -15,15 +16,17 @@ void builtin_tgamma(void)
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
   globals[0] = n;
   locals[0] = 16;
-  const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
-  float ULPSIZE_FACTOR = 16.0;
-  if (env_strict == NULL || strcmp(env_strict, "0") == 0)
-    ULPSIZE_FACTOR = 10000.;
+  float ULPSIZE_FACTOR = select_ulpsize(ULPSIZE_FAST_MATH,ULPSIZE_NO_FAST_MATH);
 
-  for (int j = 0; j < 1024; j ++) {
+  cl_device_fp_config fp_config;
+  clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, 0);
+  bool denormals_supported = fp_config & CL_FP_DENORM;
+  float max_ulp = 0, max_ulp_at = 0;
+
+  for (int j = 0; j < 128; j ++) {
     OCL_MAP_BUFFER(0);
     for (int i = 0; i < n; ++i) {
-      src[i] = ((float*)buf_data[0])[i] = (j*n+i+1) * 0.001f;
+      src[i] = ((float*)buf_data[0])[i] = j - 64 + i*0.001f;
     }
     OCL_UNMAP_BUFFER(0);
 
@@ -32,7 +35,14 @@ void builtin_tgamma(void)
     OCL_MAP_BUFFER(1);
     float *dst = (float*)buf_data[1];
     for (int i = 0; i < n; ++i) {
-      float cpu = tgammaf(src[i]);
+      float cpu = tgamma(src[i]);
+      if (!denormals_supported && std::fpclassify(cpu)==FP_SUBNORMAL && dst[i]==0) {
+        cpu = 0;
+      }
+      if (fabsf(cpu - dst[i]) > cl_FLT_ULP(cpu) * max_ulp) {
+        max_ulp = fabsf(cpu - dst[i]) / cl_FLT_ULP(cpu);
+        max_ulp_at = src[i];
+      }
       if (isinf(cpu)) {
         OCL_ASSERT(isinf(dst[i]));
       } else if (fabsf(cpu - dst[i]) >= cl_FLT_ULP(cpu) * ULPSIZE_FACTOR) {
@@ -42,6 +52,7 @@ void builtin_tgamma(void)
     }
     OCL_UNMAP_BUFFER(1);
   }
+  printf("max error=%f ulp at x=%f ", max_ulp, max_ulp_at);
 }
 
 MAKE_UTEST_FROM_FUNCTION(builtin_tgamma);
diff --git a/utests/compiler_argument_structure_indirect.cpp b/utests/compiler_argument_structure_indirect.cpp
index a4584d5..b54432e 100644
--- a/utests/compiler_argument_structure_indirect.cpp
+++ b/utests/compiler_argument_structure_indirect.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
 
-struct hop { int x[16]; };
+struct hop { int a, x[16]; };
 
 void compiler_argument_structure_indirect(void)
 {
@@ -21,8 +21,9 @@ void compiler_argument_structure_indirect(void)
   OCL_MAP_BUFFER(0);
 
   // Check results
-  for (uint32_t i = 0; i < n; ++i)
-    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 7);
+  for (uint32_t i = 0; i < n; ++i ) {
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == (i%16));
+  }
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_argument_structure_indirect);
diff --git a/utests/compiler_argument_structure_indirect.cpp b/utests/compiler_argument_structure_select.cpp
similarity index 51%
copy from utests/compiler_argument_structure_indirect.cpp
copy to utests/compiler_argument_structure_select.cpp
index a4584d5..b46e745 100644
--- a/utests/compiler_argument_structure_indirect.cpp
+++ b/utests/compiler_argument_structure_select.cpp
@@ -1,17 +1,23 @@
 #include "utest_helper.hpp"
 
-struct hop { int x[16]; };
+struct hop{
+  int  offset;
+  int  threshold0;
+  int  threshold1;
+};
 
-void compiler_argument_structure_indirect(void)
+void compiler_argument_structure_select(void)
 {
   const size_t n = 2048;
   hop h;
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_argument_structure_indirect");
+  OCL_CREATE_KERNEL("compiler_argument_structure_select");
   OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  for (int i = 0; i < 16; ++i) h.x[i] = i;
+  h.offset = 2;
+  h.threshold0 = 5;
+  h.threshold1 = 7;
   OCL_SET_ARG(1, sizeof(hop), &h);
 
   // Run the kernel
@@ -21,9 +27,11 @@ void compiler_argument_structure_indirect(void)
   OCL_MAP_BUFFER(0);
 
   // Check results
-  for (uint32_t i = 0; i < n; ++i)
+  OCL_ASSERT(((uint32_t*)buf_data[0])[0] == 5);
+  for (uint32_t i = 1; i < n; ++i ) {
     OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 7);
+  }
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_argument_structure_indirect);
+MAKE_UTEST_FROM_FUNCTION(compiler_argument_structure_select);
 
diff --git a/utests/compiler_bswap.cpp b/utests/compiler_bswap.cpp
index 9475b99..3af9ef5 100644
--- a/utests/compiler_bswap.cpp
+++ b/utests/compiler_bswap.cpp
@@ -1,7 +1,6 @@
 #include "utest_helper.hpp"
 #include "string.h"
 
-namespace {
 #define cpu_htons(A)     ((((uint16_t)(A) & 0xff00) >> 8) | \
     (((uint16_t)(A) & 0x00ff) << 8))
 #define cpu_htonl(A)     ((((uint32_t)(A) & 0xff000000) >> 24) | \
@@ -9,108 +8,190 @@ namespace {
     (((uint32_t)(A) & 0x0000ff00) << 8) | \
     (((uint32_t)(A) & 0x000000ff) << 24))
 
+
+template <typename T> static void gen_rand_val(T & val)
+{
+  val = static_cast<T>(rand());//(0xAABBCCDD);//
+}
+
 template <typename T> static void cpu(int global_id, T *src, T *dst)
 {
-    T f = src[global_id];
-    T g = 0;
-    if(sizeof(T) == sizeof(int16_t))
-      g = cpu_htons(f);
-    else if(sizeof(T) == sizeof(int32_t))
-      g = cpu_htonl(f);
-    dst[global_id] = g;
+  T f = src[global_id];
+  T g = 0;
+  if (sizeof(T) == sizeof(int16_t))
+    g = cpu_htons(f);
+  else if (sizeof(T) == sizeof(int32_t))
+    g = cpu_htonl(f);
+  dst[global_id] = g;
 }
 
-template <typename T> static void gen_rand_val (T & val)
+template <typename T> static void cpu(int global_id, T src, T *dst)
 {
-    val = static_cast<T>(rand() );
+  T f = src;
+  T g = 0;
+  if (sizeof(T) == sizeof(int16_t))
+    g = cpu_htons(f);
+  else if (sizeof(T) == sizeof(int32_t))
+    g = cpu_htonl(f);
+  dst[global_id] = g;
 }
 
-template <typename T>
-inline static void print_data (T& val)
+template <typename T> inline static void print_data(T& val)
 {
-    if(sizeof(T) == sizeof(uint16_t))
-        printf(" %hx", val);
-    else
-        printf(" %x", val);
+  if(sizeof(T) == sizeof(uint16_t))
+    printf(" 0x%hx", val);
+  else
+    printf(" 0x%x", val);
 }
 
-template <typename T> static void dump_data (T* src, T* dst, int n)
+template <typename T> static void dump_data(T* raw, T* cpu, T* gpu, int n)
 {
-    printf("\nRaw: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(((T *)buf_data[0])[i]);
-    }
+  printf("\nRaw: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(raw[i]);
+  }
 
-    printf("\nCPU: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(dst[i]);
-    }
-    printf("\nGPU: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(((T *)buf_data[1])[i]);
-    }
+  printf("\nCPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(cpu[i]);
+  }
+  printf("\nGPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(gpu[i]);
+  }
 }
 
-template<typename T>
-void test(const char *kernel_name)
+template <typename T> static void dump_data(T raw, T* cpu, T* gpu, int n)
 {
-  const size_t n = 64;
-  T cpu_dst[n];
-  T cpu_src[n];
+  printf("\nRaw: \n");
+  print_data(raw);
+
+  printf("\nCPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(cpu[i]);
+  }
+  printf("\nGPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(gpu[i]);
+  }
+}
+
+void compiler_bswap(void)
+{
+  const size_t n = 32;
+  uint32_t src0[n];
+  uint16_t src1[n];
+  uint32_t dst0[n];
+  uint16_t dst1[n];
+  int32_t src2 = static_cast<int32_t>(rand());
+  int32_t dst2[n];
+  int16_t src3 = static_cast<int16_t>(rand());
+  int16_t dst3[n];
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", kernel_name);
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", "compiler_bswap");
+  OCL_CREATE_BUFFER(buf[0], 0, sizeof(src0), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, sizeof(dst0), NULL);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
 
+  OCL_CREATE_BUFFER(buf[2], 0, sizeof(src1), NULL);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_CREATE_BUFFER(buf[3], 0, sizeof(dst1), NULL);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+
+  OCL_SET_ARG(4, sizeof(int32_t), &src2);
+  OCL_CREATE_BUFFER(buf[4], 0, sizeof(dst2), NULL);
+  OCL_SET_ARG(5, sizeof(cl_mem), &buf[4]);
+
+  OCL_SET_ARG(6, sizeof(int16_t), &src3);
+  OCL_CREATE_BUFFER(buf[5], 0, sizeof(dst3), NULL);
+  OCL_SET_ARG(7, sizeof(cl_mem), &buf[5]);
+
   OCL_MAP_BUFFER(0);
   for (int32_t i = 0; i < (int32_t) n; ++i) {
-    gen_rand_val(cpu_src[i]);
+    gen_rand_val(src0[i]);
   }
-
-  memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+  memcpy(buf_data[0], src0, sizeof(src0));
+  OCL_UNMAP_BUFFER(0);
 
   /* Clear the dst buffer to avoid random data. */
   OCL_MAP_BUFFER(1);
-  memset(buf_data[1], 0, sizeof(T) * n);
+  memset(buf_data[1], 0, sizeof(dst0));
   OCL_UNMAP_BUFFER(1);
 
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    gen_rand_val(src1[i]);
+  }
+  memcpy(buf_data[2], src1, sizeof(src1));
+  OCL_UNMAP_BUFFER(2);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(3);
+  memset(buf_data[3], 0, sizeof(dst1));
+  OCL_UNMAP_BUFFER(3);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(4);
+  memset(buf_data[4], 0, sizeof(dst2));
+  OCL_UNMAP_BUFFER(4);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(5);
+  memset(buf_data[5], 0, sizeof(dst3));
+  OCL_UNMAP_BUFFER(5);
+
   globals[0] = n;
   locals[0] = 16;
   OCL_NDRANGE(1);
 
   // Run on CPU
-  for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu(i, cpu_src, cpu_dst);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    if (i%2) {
+      dst0[i] = src0[i];
+      continue;
+    }
+    cpu(i, src0, dst0);
+  }
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    cpu(i, src1, dst1);
+
+    if (i%2) {
+      dst1[i] = dst1[i] + 1;
+      cpu(i, dst1, dst1);
+    }
+  }
 
+  // Run on CPU
   for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu_dst[i] = cpu_dst[i] -1;
+    cpu(i, src2, dst2);
 
   // Run on CPU
   for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu(i, cpu_dst, cpu_dst);
+    cpu(i, src3, dst3);
 
   OCL_MAP_BUFFER(1);
- // dump_data(cpu_src, cpu_dst, n);
+  //dump_data(src0, dst0, (uint32_t *)buf_data[1], n);
+  OCL_ASSERT(!memcmp(buf_data[1], dst0, sizeof(dst0)));
+  OCL_UNMAP_BUFFER(1);
 
-  OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+  OCL_MAP_BUFFER(3);
+  //dump_data(src1, dst1, (uint16_t *)buf_data[3], n);
+  OCL_ASSERT(!memcmp(buf_data[3], dst1, sizeof(dst1)));
+  OCL_UNMAP_BUFFER(3);
 
-  OCL_UNMAP_BUFFER(1);
-  OCL_UNMAP_BUFFER(0);
-}
+  OCL_MAP_BUFFER(4);
+  //dump_data(src2, dst2, (int32_t *)buf_data[4], n);
+  OCL_ASSERT(!memcmp(buf_data[4], dst2, sizeof(dst2)));
+  OCL_UNMAP_BUFFER(4);
 
+  OCL_MAP_BUFFER(5);
+  //dump_data(src3, dst3, (int16_t *)buf_data[5], n);
+  OCL_ASSERT(!memcmp(buf_data[5], dst3, sizeof(dst3)));
+  OCL_UNMAP_BUFFER(5);
 }
 
-#define compiler_bswap(type, kernel) \
-static void compiler_bswap_ ##type(void)\
-{\
-  test<type>(# kernel);\
-}\
-MAKE_UTEST_FROM_FUNCTION(compiler_bswap_ ## type);
-
-compiler_bswap(int16_t, compiler_bswap_short)
-compiler_bswap(uint16_t, compiler_bswap_ushort)
-compiler_bswap(int32_t, compiler_bswap_int)
-compiler_bswap(uint32_t, compiler_bswap_uint)
+MAKE_UTEST_FROM_FUNCTION(compiler_bswap);
diff --git a/utests/compiler_clz.cpp b/utests/compiler_clz.cpp
new file mode 100644
index 0000000..9116608
--- /dev/null
+++ b/utests/compiler_clz.cpp
@@ -0,0 +1,144 @@
+#include "utest_helper.hpp"
+
+namespace {
+
+template<typename T>
+T get_max();
+
+#define DEF_TEMPLATE_MAX(TYPE, NAME)                                \
+template <>                                                         \
+TYPE get_max<TYPE>()                                                \
+{                                                                   \
+  static TYPE max = CL_##NAME##_MAX;                                \
+  return max;                                                       \
+}                                                                   \
+                                                                    \
+template <>                                                         \
+u##TYPE get_max<u##TYPE>()                                          \
+{                                                                   \
+  static u##TYPE max = CL_U##NAME##_MAX;                            \
+  return max;                                                       \
+}
+
+DEF_TEMPLATE_MAX(int8_t, CHAR)
+DEF_TEMPLATE_MAX(int16_t, SHRT)
+DEF_TEMPLATE_MAX(int32_t, INT)
+DEF_TEMPLATE_MAX(int64_t, LONG)
+
+template<typename T>
+T get_min();
+
+#define DEF_TEMPLATE_MIN(TYPE, NAME)                                \
+template <>                                                         \
+TYPE get_min<TYPE>()                                                \
+{                                                                   \
+  static TYPE min = CL_##NAME##_MIN;                                \
+  return min;                                                       \
+}                                                                   \
+                                                                    \
+template <>                                                         \
+u##TYPE get_min<u##TYPE>()                                          \
+{                                                                   \
+  static u##TYPE min = 0;                                           \
+  return min;                                                       \
+}
+
+DEF_TEMPLATE_MIN(int8_t, CHAR)
+DEF_TEMPLATE_MIN(int16_t, SHRT)
+DEF_TEMPLATE_MIN(int32_t, INT)
+DEF_TEMPLATE_MIN(int64_t, LONG)
+
+template<typename U>
+void test(const char *kernel_name, int s_type)
+{
+  const size_t n = 64;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_clz", kernel_name);
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(U), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(U), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  U max = get_max<U>();
+  U min = get_min<U>();
+
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i) {
+      ((U*)buf_data[0])[i] = max >> i;
+      if(i == sizeof(U)*8)
+        ((U*)buf_data[0])[i] = min;
+  }
+
+  OCL_UNMAP_BUFFER(0);
+
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(1);
+  // for unsigned type.
+  if(s_type == 0)
+  {
+    for (uint32_t i = 0; i < n; ++i) {
+      if(sizeof(U) == 1 && i < 8 )
+        OCL_ASSERT(((U*)buf_data[1])[i] == i );
+      else if(sizeof(U) == 2 && i < 16 )
+        OCL_ASSERT(((U*)buf_data[1])[i] == i );
+      else if(sizeof(U) == 4 && i < 32 )
+        OCL_ASSERT(((U*)buf_data[1])[i] == i );
+      else if(sizeof(U) == 8 && i < 64 )
+        OCL_ASSERT(((U*)buf_data[1])[i] == i );
+    }
+  }
+  else  // signed type
+  {
+    for (uint32_t i = 0; i < n; ++i) {
+      if(sizeof(U) == 1)
+      {
+        if( i < 8 )
+          OCL_ASSERT(((U*)buf_data[1])[i] == i+1 );
+        else if( i == 8 )
+          OCL_ASSERT(((U*)buf_data[1])[i] == 0 );
+      }
+      else if(sizeof(U) == 2)
+      {
+        if( i < 16 )
+          OCL_ASSERT(((U*)buf_data[1])[i] == i+1 );
+        else if( i == 16 )
+          OCL_ASSERT(((U*)buf_data[1])[i] == 0 );
+      }
+      else if(sizeof(U) == 4)
+      {
+        if( i < 32 )
+          OCL_ASSERT(((U*)buf_data[1])[i] == i+1 );
+        else if( i == 32 )
+          OCL_ASSERT(((U*)buf_data[1])[i] == 0 );
+      }
+      else if(sizeof(U) == 8)
+      {
+        if( i < 63 )
+          OCL_ASSERT(((U*)buf_data[1])[i] == i+1 );
+      }
+    }
+  }
+  OCL_UNMAP_BUFFER(1);
+
+}
+
+}
+
+#define compiler_clz(type, kernel, s_type)\
+static void compiler_clz_ ##type(void)\
+{\
+  test<type>(# kernel, s_type);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_clz_ ## type);
+
+compiler_clz(uint64_t, compiler_clz_ulong, 0)
+compiler_clz(uint32_t, compiler_clz_uint, 0)
+compiler_clz(uint16_t, compiler_clz_ushort, 0)
+compiler_clz(uint8_t, compiler_clz_uchar, 0)
+compiler_clz(int64_t, compiler_clz_long, 1)
+compiler_clz(int32_t, compiler_clz_int, 1)
+compiler_clz(int16_t, compiler_clz_short, 1)
+compiler_clz(int8_t, compiler_clz_char, 1)
diff --git a/utests/compiler_clz_int.cpp b/utests/compiler_clz_int.cpp
deleted file mode 100644
index c12cfc6..0000000
--- a/utests/compiler_clz_int.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#include "utest_helper.hpp"
-
-void compiler_clz_int(void)
-{
-  const int n = 32;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_clz_int");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  globals[0] = n;
-  locals[0] = 16;
-
-  OCL_MAP_BUFFER(0);
-  ((int*)buf_data[0])[0] = 0;
-  for (int32_t i = 1; i < (int32_t) n; ++i)
-    ((int*)buf_data[0])[i] = 0xffffffffu >> i;
-  OCL_UNMAP_BUFFER(0);
-
-  OCL_NDRANGE(1);
-
-  OCL_MAP_BUFFER(1);
-  OCL_ASSERT(((int*)buf_data[1])[0] == 32);
-  for (int i = 1; i < n; ++i)
-    OCL_ASSERT(((int*)buf_data[1])[i] == i);
-  OCL_UNMAP_BUFFER(1);
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_clz_int);
diff --git a/utests/compiler_clz_short.cpp b/utests/compiler_clz_short.cpp
deleted file mode 100644
index eb3a370..0000000
--- a/utests/compiler_clz_short.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#include "utest_helper.hpp"
-
-void compiler_clz_short(void)
-{
-  const size_t n = 16;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_clz_short");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(short), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  globals[0] = n;
-  locals[0] = 16;
-
-  OCL_MAP_BUFFER(0);
-  ((short*)buf_data[0])[0] = 0;
-  for (int32_t i = 1; i < (int32_t) n; ++i)
-    ((short*)buf_data[0])[i] = 0xffffu >> i;
-  OCL_UNMAP_BUFFER(0);
-
-  OCL_NDRANGE(1);
-
-  OCL_MAP_BUFFER(1);
-  OCL_ASSERT(((short*)buf_data[1])[0] == 16);
-  for (unsigned i = 1; i < (unsigned) n; ++i)
-    OCL_ASSERT(((short*)buf_data[1])[i] == (short)i);
-  OCL_UNMAP_BUFFER(1);
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_clz_short);
diff --git a/utests/compiler_get_sub_group_id.cpp b/utests/compiler_get_sub_group_id.cpp
new file mode 100644
index 0000000..0d88d29
--- /dev/null
+++ b/utests/compiler_get_sub_group_id.cpp
@@ -0,0 +1,33 @@
+#include "utest_helper.hpp"
+
+void compiler_get_sub_group_id(void)
+{
+  const size_t n = 256;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_get_sub_group_id");
+  OCL_CREATE_BUFFER(buf[0], 0, (n+1) * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) (n+1); ++i)
+    ((int*)buf_data[0])[i] = -1;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  int* dst = (int *)buf_data[0];
+  OCL_ASSERT(8 == dst[0] || 16 == dst[0]);
+  for (int32_t i = 1; i < (int32_t) n; ++i){
+    OCL_ASSERT((i-1) % dst[0] == dst[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_get_sub_group_id);
diff --git a/utests/compiler_get_sub_group_size.cpp b/utests/compiler_get_sub_group_size.cpp
new file mode 100644
index 0000000..20339d7
--- /dev/null
+++ b/utests/compiler_get_sub_group_size.cpp
@@ -0,0 +1,32 @@
+#include "utest_helper.hpp"
+
+void compiler_get_sub_group_size(void)
+{
+  const size_t n = 256;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_get_sub_group_size");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    ((int*)buf_data[0])[i] = -1;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  int* dst = (int *)buf_data[0];
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    OCL_ASSERT(8 == dst[i] || 16 == dst[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_get_sub_group_size);
diff --git a/utests/compiler_half.cpp b/utests/compiler_half.cpp
new file mode 100644
index 0000000..e8ed286
--- /dev/null
+++ b/utests/compiler_half.cpp
@@ -0,0 +1,924 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include "utest_helper.hpp"
+
+static uint32_t __half_to_float(uint16_t h, bool* isInf = NULL, bool* infSign = NULL)
+{
+  struct __FP32 {
+    uint32_t mantissa:23;
+    uint32_t exponent:8;
+    uint32_t sign:1;
+  };
+  struct __FP16 {
+    uint32_t mantissa:10;
+    uint32_t exponent:5;
+    uint32_t sign:1;
+  };
+  uint32_t f;
+  __FP32 o;
+  memset(&o, 0, sizeof(o));
+  __FP16 i;
+  memcpy(&i, &h, sizeof(uint16_t));
+
+  if (isInf)
+    *isInf = false;
+  if (infSign)
+    *infSign = false;
+
+  if (i.exponent == 0 && i.mantissa == 0) // (Signed) zero
+    o.sign = i.sign;
+  else {
+    if (i.exponent == 0) { // Denormal (converts to normalized)
+      // Adjust mantissa so it's normalized (and keep
+      // track of exponent adjustment)
+      int e = -1;
+      uint m = i.mantissa;
+      do {
+        e++;
+        m <<= 1;
+      } while ((m & 0x400) == 0);
+
+      o.mantissa = (m & 0x3ff) << 13;
+      o.exponent = 127 - 15 - e;
+      o.sign = i.sign;
+    } else if (i.exponent == 0x1f) { // Inf/NaN
+      // NOTE: Both can be handled with same code path
+      // since we just pass through mantissa bits.
+      o.mantissa = i.mantissa << 13;
+      o.exponent = 255;
+      o.sign = i.sign;
+
+      if (isInf) {
+        *isInf = (i.mantissa == 0);
+        if (infSign)
+          *infSign = !i.sign;
+      }
+    } else { // Normalized number
+      o.mantissa = i.mantissa << 13;
+      o.exponent = 127 - 15 + i.exponent;
+      o.sign = i.sign;
+    }
+  }
+
+  memcpy(&f, &o, sizeof(uint32_t));
+  return f;
+}
+
+
+static uint16_t __float_to_half(uint32_t x)
+{
+  uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
+  uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
+  unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
+
+  /* If zero, or denormal, or exponent underflows too much for a denormal
+   * half, return signed zero. */
+  if (e < 103)
+    return bits;
+
+  /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
+  if (e > 142) {
+    bits |= 0x7c00u;
+    /* If exponent was 0xff and one mantissa bit was set, it means NaN,
+     * not Inf, so make sure we set one mantissa bit too. */
+    bits |= e == 255 && (x & 0x007fffffu);
+    return bits;
+  }
+
+  /* If exponent underflows but not too much, return a denormal */
+  if (e < 113) {
+    m |= 0x0800u;
+    /* Extra rounding may overflow and set mantissa to 0 and exponent
+     * to 1, which is OK. */
+    bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
+    return bits;
+  }
+
+  bits |= ((e - 112) << 10) | (m >> 1);
+  /* Extra rounding. An overflow will set mantissa to 0 and increment
+   * the exponent, which is OK. */
+  bits += m & 1;
+  return bits;
+}
+
+static int check_half_device(void)
+{
+  std::string extStr;
+  size_t param_value_size;
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, &param_value_size);
+  std::vector<char> param_value(param_value_size);
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+           param_value.empty() ? NULL : &param_value.front(), &param_value_size);
+  if (!param_value.empty())
+    extStr = std::string(&param_value.front(), param_value_size-1);
+
+  if (std::strstr(extStr.c_str(), "cl_khr_fp16") == NULL) {
+    printf("No cl_khr_fp16, Skip!");
+    return 0;
+  }
+
+  return 1;
+}
+
+void compiler_half_basic(void)
+{
+  const size_t n = 16;
+  uint16_t hsrc[n];
+  float fsrc[n], fdst[n];
+  float f = 2.5;
+  uint32_t tmp_f;
+
+  if (!check_half_device())
+    return;
+
+  memcpy(&tmp_f, &f, sizeof(float));
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half", "compiler_half_basic");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    fsrc[i] = 10.1 * i;
+    memcpy(&tmp_f, &fsrc[i], sizeof(float));
+    hsrc[i] = __float_to_half(tmp_f);
+  }
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    fdst[i] = fsrc[i] + f;
+    fdst[i] = fdst[i]*fdst[i];
+    fdst[i] = fdst[i]/1.8;
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, sizeof(hsrc));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    tmp_f = __half_to_float(((uint16_t *)buf_data[1])[i]);
+    memcpy(&f, &tmp_f, sizeof(float));
+    printf("%f %f\n", f, fdst[i]);
+    OCL_ASSERT(fabs(f - fdst[i]) <= 0.01 * fabs(fdst[i]) || (fdst[i] == 0.0 && f == 0.0));
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_half_basic);
+
+
+#define HALF_MATH_TEST_1ARG(NAME, CPPNAME, RANGE_L, RANGE_H)            \
+  void compiler_half_math_##NAME(void)                                  \
+  {                                                                     \
+    const size_t n = 16;                                                \
+    uint16_t hsrc[n];                                                   \
+    float fsrc[n], fdst[n];                                             \
+    uint32_t tmp_f;                                                     \
+    float f;                                                            \
+                                                                        \
+    if (!check_half_device())                                           \
+      return;                                                           \
+                                                                        \
+    OCL_CREATE_KERNEL_FROM_FILE("compiler_half_math", "compiler_half_math_" #NAME); \
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);           \
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);           \
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);                            \
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);                            \
+    globals[0] = n;                                                     \
+    locals[0] = 16;                                                     \
+                                                                        \
+    for (int32_t i = 0; i < (int32_t) n; ++i) {                         \
+      fsrc[i] = RANGE_L + ((rand()%1000) / 1000.0f ) * ((RANGE_H) - (RANGE_L)); \
+      memcpy(&tmp_f, &fsrc[i], sizeof(float));                          \
+      hsrc[i] = __float_to_half(tmp_f);                                 \
+    }                                                                   \
+                                                                        \
+    for (int32_t i = 0; i < (int32_t) n; ++i) {                         \
+      /* printf("Float is %f\n", fsrc[i]); */                           \
+      fdst[i] = CPPNAME(fsrc[i]);                                       \
+    }                                                                   \
+                                                                        \
+    OCL_MAP_BUFFER(0);                                                  \
+    OCL_MAP_BUFFER(1);                                                  \
+    memcpy(buf_data[0], hsrc, sizeof(hsrc));                            \
+    memset(buf_data[1], 0, sizeof(hsrc));                               \
+    OCL_UNMAP_BUFFER(0);                                                \
+    OCL_UNMAP_BUFFER(1);                                                \
+    OCL_NDRANGE(1);                                                     \
+                                                                        \
+    OCL_MAP_BUFFER(1);                                                  \
+    for (int32_t i = 0; i < (int32_t) n; ++i) {                         \
+      bool isInf, infSign;                                              \
+      tmp_f = __half_to_float(((uint16_t *)buf_data[1])[i], &isInf, &infSign); \
+      memcpy(&f, &tmp_f, sizeof(float));                                \
+      /*printf("%.15f %.15f, diff is %%%f\n", f, fdst[i], (fabs(f - fdst[i])/fabs(fdst[i]))); */ \
+      OCL_ASSERT(((fabs(fdst[i]) < 6e-8f) && (fabs(f) < 6e-8f)) ||      \
+                 (fabs(f - fdst[i]) <= 0.03 * fabs(fdst[i])) ||         \
+                 (isInf && ((infSign && fdst[i] > 65504.0f) || (!infSign && fdst[i] < -65504.0f))) || \
+                 (isnan(f) && isnan(fdst[i])));                         \
+    }                                                                   \
+    OCL_UNMAP_BUFFER(1);                                                \
+  }                                                                     \
+  MAKE_UTEST_FROM_FUNCTION(compiler_half_math_##NAME);
+
+HALF_MATH_TEST_1ARG(sin, sinf, -10, 10);
+HALF_MATH_TEST_1ARG(cos, cosf, -10, 10);
+HALF_MATH_TEST_1ARG(sinh, sinh, -10, 10);
+HALF_MATH_TEST_1ARG(cosh, cosh, -10, 10);
+HALF_MATH_TEST_1ARG(tan, tanf, -3.14/2, 3.14/2);
+HALF_MATH_TEST_1ARG(log10, log10f, 0.1, 100);
+HALF_MATH_TEST_1ARG(log, logf, 0.01, 1000);
+HALF_MATH_TEST_1ARG(trunc, truncf, -1000, 1000);
+HALF_MATH_TEST_1ARG(exp, expf, -19.0, 20.0);
+HALF_MATH_TEST_1ARG(sqrt, sqrtf, -19.0, 10.0);
+HALF_MATH_TEST_1ARG(ceil, ceilf, -19.0, 20.0);
+
+#define HALF_MATH_TEST_2ARG(NAME, CPPNAME, RANGE_L, RANGE_H)            \
+  void compiler_half_math_##NAME(void)                                  \
+  {                                                                     \
+    const size_t n = 16*4;                                              \
+    uint16_t hsrc0[n], hsrc1[n];                                        \
+    float fsrc0[n], fsrc1[n], fdst[n];                                  \
+    uint32_t tmp_f;                                                     \
+    float f;                                                            \
+                                                                        \
+    if (!check_half_device())                                           \
+      return;                                                           \
+                                                                        \
+    OCL_CREATE_KERNEL_FROM_FILE("compiler_half_math", "compiler_half_math_" #NAME); \
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);           \
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);           \
+    OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint16_t), NULL);           \
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);                            \
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);                            \
+    OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);                            \
+    globals[0] = n;                                                     \
+    locals[0] = 16;                                                     \
+                                                                        \
+    for (int32_t i = 0; i < (int32_t) n; ++i) {                         \
+      fsrc0[i] = RANGE_L + (((RANGE_H) - (RANGE_L))/n) * i;            \
+      memcpy(&tmp_f, &fsrc0[i], sizeof(float));                         \
+      hsrc0[i] = __float_to_half(tmp_f);                                \
+      fsrc1[i] = RANGE_L + ((rand()%1000) / 1000.0f ) * ((RANGE_H) - (RANGE_L));            \
+      memcpy(&tmp_f, &fsrc1[i], sizeof(float));                         \
+      hsrc1[i] = __float_to_half(tmp_f);                                \
+    }                                                                   \
+                                                                        \
+    for (int32_t i = 0; i < (int32_t) n; ++i) {                         \
+      /* printf("Float is %f   %f\n", fsrc0[i], fsrc1[i]);*/            \
+      fdst[i] = CPPNAME(fsrc0[i], fsrc1[i]);                            \
+    }                                                                   \
+                                                                        \
+    OCL_MAP_BUFFER(0);                                                  \
+    OCL_MAP_BUFFER(1);                                                  \
+    OCL_MAP_BUFFER(2);                                                  \
+    memcpy(buf_data[0], hsrc0, sizeof(hsrc0));                          \
+    memcpy(buf_data[1], hsrc1, sizeof(hsrc1));                          \
+    memset(buf_data[2], 0, sizeof(hsrc0));                              \
+    OCL_UNMAP_BUFFER(0);                                                \
+    OCL_UNMAP_BUFFER(1);                                                \
+    OCL_UNMAP_BUFFER(2);                                                \
+    OCL_NDRANGE(1);                                                     \
+                                                                        \
+    OCL_MAP_BUFFER(2);                                                  \
+    for (int32_t i = 0; i < (int32_t) n; ++i) {                         \
+    bool isInf, infSign;                                                \
+    tmp_f = __half_to_float(((uint16_t *)buf_data[2])[i], &isInf, &infSign); \
+    memcpy(&f, &tmp_f, sizeof(float));                                  \
+    /*printf("%.15f %.15f, diff is %%%f\n", f, fdst[i], (fabs(f - fdst[i])/fabs(fdst[i]))); */ \
+    OCL_ASSERT(((fabs(fdst[i]) < 6e-8f) && (fabs(f) < 6e-8f)) ||        \
+               (fabs(f - fdst[i]) <= 0.03 * fabs(fdst[i])) ||           \
+               (isInf && ((infSign && fdst[i] > 65504.0f) || (!infSign && fdst[i] < -65504.0f))) || \
+               (isnan(f) && isnan(fdst[i])));                           \
+    }                                                                   \
+    OCL_UNMAP_BUFFER(2);                                                \
+  }                                                                     \
+  MAKE_UTEST_FROM_FUNCTION(compiler_half_math_##NAME);
+
+HALF_MATH_TEST_2ARG(fmod, fmod, 1.0, 500.0);
+HALF_MATH_TEST_2ARG(fmax, fmax, -10.0, 20.0);
+HALF_MATH_TEST_2ARG(fmin, fmin, -10.0, 20.0);
+
+void compiler_half_isnan(void)
+{
+  const size_t n = 16*2;
+  uint16_t hsrc[n];
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_relation", "compiler_half_isnan");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    hsrc[i] = 0xFF00;
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, sizeof(uint16_t)*n);
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%d\n", ((uint16_t *)buf_data[1])[i]);
+    OCL_ASSERT(((int16_t *)buf_data[1])[i] == -1);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_isnan);
+
+void compiler_half_isinf(void)
+{
+  const size_t n = 16;
+  uint16_t hsrc[n];
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_relation", "compiler_half_isinf");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n/2; ++i) {
+    hsrc[i] = 0x7C00;
+  }
+  for (int32_t i = n/2; i < (int32_t) n; ++i) {
+    hsrc[i] = 0xFC00;
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, sizeof(int)*n);
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%d\n", ((int *)buf_data[1])[i]);
+    OCL_ASSERT(((int *)buf_data[1])[i] == 1);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_isinf);
+
+
+void compiler_half_to_float(void)
+{
+  const size_t n = 16*4;
+  uint16_t hsrc[n];
+  float fdst[n];
+  uint32_t tmp_f;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_half_to_float");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    fdst[i] = 13.1 * i;
+    memcpy(&tmp_f, &fdst[i], sizeof(float));
+    hsrc[i] = __float_to_half(tmp_f);
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0.0f, sizeof(fdst));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%f %f, abs is %f\n", (((float *)buf_data[1])[i]), fdst[i], fabs((((float *)buf_data[1])[i]) - fdst[i]));
+    OCL_ASSERT((fabs((((float *)buf_data[1])[i]) - fdst[i]) < 0.001 * fabs(fdst[i])) ||
+               (fdst[i] == 0.0 && (((float *)buf_data[1])[i]) == 0.0));
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_to_float);
+
+void compiler_half_as_char2(void)
+{
+  const size_t n = 16;
+  uint16_t hsrc[n];
+  uint8_t* csrc = (uint8_t*)hsrc;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_half_as_char2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    hsrc[i] = (i&0x0f)<<8 | ((i+1)&0x0f);
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, sizeof(hsrc));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n*2; ++i) {
+    //printf("%d   %d\n", (((uint8_t *)buf_data[1])[i]), csrc[i]);
+    OCL_ASSERT((((uint8_t *)buf_data[1])[i]) == csrc[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_as_char2);
+
+void compiler_half2_as_int(void)
+{
+  const size_t n = 16*2;
+  uint16_t hsrc[n];
+  int* isrc = (int*)hsrc;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_half2_as_int");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    hsrc[i] = (i&0x0f)<<8 | ((i+1)&0x0f);
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, sizeof(hsrc));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n/2; ++i) {
+    //printf("%d   %d\n", (((int *)buf_data[1])[i]), isrc[i]);
+    OCL_ASSERT((((int *)buf_data[1])[i]) == isrc[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half2_as_int);
+
+void compiler_half_to_char_sat(void)
+{
+  const size_t n = 16;
+  uint16_t hsrc[n];
+  float fsrc[n];
+  char dst[n];
+  uint32_t tmp_f;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_half_to_char_sat");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(char), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    fsrc[i] = -200.1f + 30.5f * i;
+    memcpy(&tmp_f, &fsrc[i], sizeof(float));
+    hsrc[i] = __float_to_half(tmp_f);
+    if (fsrc[i] <= -128.0f) {
+      dst[i] = -128;
+    } else if (fsrc[i] >= 127.0f) {
+      dst[i] = 127;
+    } else {
+      dst[i] = (char)fsrc[i];
+    }
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, sizeof(dst));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%d     %d\n", (((char *)buf_data[1])[i]), dst[i]);
+    OCL_ASSERT((((char *)buf_data[1])[i]) == dst[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_to_char_sat);
+
+void compiler_half_to_ushort_sat(void)
+{
+  const size_t n = 16;
+  uint16_t hsrc[n];
+  float fsrc[n];
+  uint16_t dst[n];
+  uint32_t tmp_f;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_half_to_ushort_sat");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    fsrc[i] = -100.1f + 10.3f * i;
+    memcpy(&tmp_f, &fsrc[i], sizeof(float));
+    hsrc[i] = __float_to_half(tmp_f);
+    if (fsrc[i] <= 0.0f) {
+      dst[i] = 0;
+    } else {
+      dst[i] = (uint16_t)fsrc[i];
+    }
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, sizeof(dst));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%u     %u\n", (((uint16_t *)buf_data[1])[i]), dst[i]);
+    OCL_ASSERT((((uint16_t *)buf_data[1])[i]) == dst[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_to_ushort_sat);
+
+void compiler_half_to_uint_sat(void)
+{
+  const size_t n = 16;
+  uint16_t hsrc[n];
+  float fsrc[n];
+  uint32_t dst[n];
+  uint32_t tmp_f;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_half_to_uint_sat");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    fsrc[i] = -10.1f + 13.965f * i;
+    memcpy(&tmp_f, &fsrc[i], sizeof(float));
+    hsrc[i] = __float_to_half(tmp_f);
+    if (fsrc[i] <= 0.0f) {
+      dst[i] = 0;
+    } else {
+      dst[i] = (uint32_t)fsrc[i];
+    }
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, sizeof(dst));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%u     %u\n", (((uint32_t *)buf_data[1])[i]), dst[i]);
+    OCL_ASSERT((((uint32_t *)buf_data[1])[i]) == dst[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_to_uint_sat);
+
+void compiler_uchar_to_half(void)
+{
+  const size_t n = 16;
+  uint8_t hsrc[n];
+  float fdst[n];
+  uint32_t tmp_f;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_uchar_to_half");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint8_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    hsrc[i] = 5*i;
+    fdst[i] = (float)hsrc[i];
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, n*sizeof(uint16_t));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    float f;
+    tmp_f = __half_to_float(((uint16_t *)buf_data[1])[i]);
+    memcpy(&f, &tmp_f, sizeof(float));
+    //printf("%f     %f\n", f, fdst[i]);
+    OCL_ASSERT(f == fdst[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_uchar_to_half);
+
+void compiler_int_to_half(void)
+{
+  const size_t n = 16;
+  int hsrc[n];
+  float fdst[n];
+  uint32_t tmp_f;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_int_to_half");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    hsrc[i] = 51*i;
+    fdst[i] = (float)hsrc[i];
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, n*sizeof(uint16_t));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    float f;
+    tmp_f = __half_to_float(((uint16_t *)buf_data[1])[i]);
+    memcpy(&f, &tmp_f, sizeof(float));
+    //printf("%f     %f\n", f, fdst[i]);
+    OCL_ASSERT(f == fdst[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_int_to_half);
+
+void compiler_half_to_long(void)
+{
+  const size_t n = 16;
+  uint16_t hsrc[n];
+  int64_t ldst[n];
+  uint32_t tmp_f;
+  float f;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_half_to_long");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    f = -100.1f + 10.3f * i;
+    memcpy(&tmp_f, &f, sizeof(float));
+    hsrc[i] = __float_to_half(tmp_f);
+    ldst[i] = (int64_t)f;
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, n*sizeof(uint64_t));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%ld	   %ld\n", (((int64_t *)buf_data[1])[i]), ldst[i]);
+    OCL_ASSERT((((int64_t *)buf_data[1])[i]) == ldst[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_to_long);
+
+void compiler_ulong_to_half(void)
+{
+  const size_t n = 16;
+  uint64_t src[n];
+  float fdst[n];
+  uint32_t tmp_f;
+  float f;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_ulong_to_half");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint16_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src[i] = 10 + 126*i;
+    fdst[i] = (float)src[i];
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src, sizeof(src));
+  memset(buf_data[1], 0, n*sizeof(uint16_t));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    tmp_f = __half_to_float(((uint16_t *)buf_data[1])[i]);
+    memcpy(&f, &tmp_f, sizeof(float));
+    //printf("%f    %f\n", f, fdst[i]);
+    OCL_ASSERT(f == fdst[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_ulong_to_half);
+
+void compiler_half_to_long_sat(void)
+{
+  const size_t n = 16;
+  uint16_t hsrc[n];
+  int64_t ldst[n];
+  uint32_t tmp_f;
+  float f;
+
+  if (!check_half_device())
+    return;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_half_convert", "compiler_half_to_long_sat");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 1; i < (int32_t) n-1; ++i) {
+    f = -100.1f + 10.3f * i;
+    memcpy(&tmp_f, &f, sizeof(float));
+    hsrc[i] = __float_to_half(tmp_f);
+    ldst[i] = (int64_t)f;
+  }
+  hsrc[0] = 0xFC00; //-inf;
+  ldst[0] = 0x8000000000000000;
+  hsrc[n-1] = 0x7C00; //inf;
+  ldst[n-1] = 0x7FFFFFFFFFFFFFFF;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], hsrc, sizeof(hsrc));
+  memset(buf_data[1], 0, n*sizeof(uint64_t));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%lx	   %lx\n", (((int64_t *)buf_data[1])[i]), ldst[i]);
+    OCL_ASSERT((((int64_t *)buf_data[1])[i]) == ldst[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_half_to_long_sat);
diff --git a/utests/compiler_long_div.cpp b/utests/compiler_long_div.cpp
new file mode 100644
index 0000000..233817b
--- /dev/null
+++ b/utests/compiler_long_div.cpp
@@ -0,0 +1,88 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_div(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_div", "compiler_long_div");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src1[i] = ((int64_t)rand() << 32) + rand();
+    src2[i] = ((int64_t)rand() << 32) + rand();
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("ref is %lx,    res is %lx\n", src1[i] / src2[i] , ((int64_t *)buf_data[2])[i]);
+    OCL_ASSERT(src1[i] / src2[i] == ((int64_t *)buf_data[2])[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_div);
+
+void compiler_long_rem(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_div", "compiler_long_rem");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src1[i] = ((int64_t)rand() << 32) + rand();
+    src2[i] = ((int64_t)rand() << 32) + rand();
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("ref is %lx,    res is %lx\n", src1[i] / src2[i] , ((int64_t *)buf_data[2])[i]);
+    OCL_ASSERT(src1[i] % src2[i] == ((int64_t *)buf_data[2])[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_rem);
diff --git a/utests/compiler_long_hi_sat.cpp b/utests/compiler_long_hi_sat.cpp
new file mode 100644
index 0000000..1c57d0c
--- /dev/null
+++ b/utests/compiler_long_hi_sat.cpp
@@ -0,0 +1,187 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+static void __u64_mul_u64(uint64_t sourceA, uint64_t sourceB, uint64_t &destLow, uint64_t &destHi)
+{
+  uint64_t lowA, lowB;
+  uint64_t highA, highB;
+
+  lowA = sourceA & 0xffffffff;
+  highA = sourceA >> 32;
+  lowB = sourceB & 0xffffffff;
+  highB = sourceB >> 32;
+
+  uint64_t aHibHi = highA * highB;
+  uint64_t aHibLo = highA * lowB;
+  uint64_t aLobHi = lowA * highB;
+  uint64_t aLobLo = lowA * lowB;
+
+  uint64_t aLobLoHi = aLobLo >> 32;
+  uint64_t aLobHiLo = aLobHi & 0xFFFFFFFFULL;
+  aHibLo += aLobLoHi + aLobHiLo;
+
+  destHi = aHibHi + (aHibLo >> 32 ) + (aLobHi >> 32);    // Can't overflow
+  destLow = (aHibLo << 32) | ( aLobLo & 0xFFFFFFFFULL);
+}
+
+static void __64_mul_64(int64_t sourceA, int64_t sourceB, uint64_t &destLow, int64_t &destHi)
+{
+  int64_t aSign = sourceA >> 63;
+  int64_t bSign = sourceB >> 63;
+  int64_t resultSign = aSign ^ bSign;
+
+  // take absolute values of the argument
+  sourceA = (sourceA ^ aSign) - aSign;
+  sourceB = (sourceB ^ bSign) - bSign;
+
+  uint64_t hi;
+  __u64_mul_u64( (uint64_t) sourceA, (uint64_t) sourceB, destLow, hi );
+
+  // Fix the sign
+  if( resultSign ) {
+    destLow ^= resultSign;
+    hi ^= resultSign;
+    destLow -= resultSign;
+    //carry if necessary
+    if( 0 == destLow )
+      hi -= resultSign;
+  }
+
+  destHi = (int64_t) hi;
+}
+
+static void __mad_sat(int64_t sourceA, int64_t sourceB, int64_t sourceC, int64_t& dst)
+{
+  cl_long multHi;
+  cl_ulong multLo;
+  __64_mul_64(sourceA, sourceB, multLo, multHi);
+  cl_ulong sum = multLo + sourceC;
+
+  // carry if overflow
+  if(sourceC >= 0) {
+    if(multLo > sum) {
+      multHi++;
+      if(CL_LONG_MIN == multHi) {
+        multHi = CL_LONG_MAX;
+        sum = CL_ULONG_MAX;
+      }
+    }
+  } else {
+    if( multLo < sum ) {
+      multHi--;
+      if( CL_LONG_MAX == multHi ) {
+        multHi = CL_LONG_MIN;
+        sum = 0;
+      }
+    }
+  }
+
+  // saturate
+  if( multHi > 0 )
+    sum = CL_LONG_MAX;
+  else if ( multHi == 0 && sum > CL_LONG_MAX)
+    sum = CL_LONG_MAX;
+  else if ( multHi == -1 && sum < (cl_ulong)CL_LONG_MIN)
+    sum = CL_LONG_MIN;
+  else if( multHi < -1 )
+    sum = CL_LONG_MIN;
+
+  dst = (cl_long) sum;
+}
+
+void compiler_long_mul_hi(void)
+{
+  const size_t n = 32;
+  int64_t src[n];
+  int64_t num0 = 0xF00A00CED0090B0CUL;
+  int64_t num1 = 0x7FABCD57FC098FC1UL;
+  memset(src, 0, sizeof(int64_t) * n);
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_hi_sat", "compiler_long_mul_hi");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_long), &num0);
+  OCL_SET_ARG(3, sizeof(cl_long), &num1);
+  globals[0] = n;
+  locals[0] = 32;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    uint64_t a = rand();
+    a = a <<32 | a;
+    src[i] = a;
+  }
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, sizeof(uint64_t) * n);
+  OCL_UNMAP_BUFFER(0);
+
+  uint64_t res_lo;
+  int64_t res_hi;
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    if (i % 2 == 0)
+      __64_mul_64(src[i], num0, res_lo, res_hi);
+    else
+      __64_mul_64(src[i], num1, res_lo, res_hi);
+
+    OCL_ASSERT(((int64_t *)(buf_data[1]))[i] == res_hi);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+void compiler_long_mul_sat(void)
+{
+  const size_t n = 32;
+  int64_t src[n];
+  int64_t num0 = 0xF00000CED8090B0CUL;
+  int64_t num1 = 0x0000000000098FC1UL;
+  memset(src, 0, sizeof(int64_t) * n);
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_hi_sat", "compiler_long_mul_sat");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_long), &num0);
+  OCL_SET_ARG(3, sizeof(cl_long), &num1);
+  globals[0] = n;
+  locals[0] = 32;
+
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    uint64_t a = rand();
+    a = a <<32 | a;
+    src[i] = a;
+  }
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, sizeof(uint64_t) * n);
+  OCL_UNMAP_BUFFER(0);
+
+  int64_t res;
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    __mad_sat(src[i], num0, num1, res);
+
+    OCL_ASSERT(((int64_t *)(buf_data[1]))[i] == res);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_mul_hi);
+MAKE_UTEST_FROM_FUNCTION(compiler_long_mul_sat);
diff --git a/utests/compiler_long_not.cpp b/utests/compiler_long_not.cpp
new file mode 100644
index 0000000..64e4253
--- /dev/null
+++ b/utests/compiler_long_not.cpp
@@ -0,0 +1,52 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_not_vec8(void)
+{
+  const size_t n = 64;
+  const int v = 8; 
+  int64_t src[n * v];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_not", "compiler_long_not_vec8");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t) * v, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t) * v, NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t) * v, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int32_t i = 0; i < (int32_t) n*v; ++i) {
+    if (i % 3 == 0)
+      src[i] = 0x0UL;
+    else
+      src[i] = ((int64_t)rand() << 32) + rand();
+
+    //	printf(" 0x%lx", src[i]);
+  }
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, sizeof(int64_t) * n * v);
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+  uint64_t res;
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n*v; ++i) {
+    res = 0xffffffffffffffffUL;
+    if (src[i])
+      res = 0x0;
+
+    OCL_ASSERT(((uint64_t *)(buf_data[1]))[i] == res);
+    //printf("ref is 0x%lx, result is 0x%lx\n", res, ((int64_t *)(buf_data[1]))[i]);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_not_vec8);
diff --git a/utests/compiler_simd_all.cpp b/utests/compiler_sub_group_all.cpp
similarity index 87%
rename from utests/compiler_simd_all.cpp
rename to utests/compiler_sub_group_all.cpp
index 086c54f..d8e4130 100644
--- a/utests/compiler_simd_all.cpp
+++ b/utests/compiler_sub_group_all.cpp
@@ -1,11 +1,11 @@
 #include "utest_helper.hpp"
 
-void compiler_simd_all(void)
+void compiler_sub_group_all(void)
 {
   const size_t n = 40;
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_simd_all");
+  OCL_CREATE_KERNEL("compiler_sub_group_all");
   OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
   OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -40,4 +40,4 @@ void compiler_simd_all(void)
   OCL_UNMAP_BUFFER(1);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_simd_all);
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_all);
diff --git a/utests/compiler_simd_any.cpp b/utests/compiler_sub_group_any.cpp
similarity index 87%
rename from utests/compiler_simd_any.cpp
rename to utests/compiler_sub_group_any.cpp
index dcc5ef1..98b1bdd 100644
--- a/utests/compiler_simd_any.cpp
+++ b/utests/compiler_sub_group_any.cpp
@@ -1,11 +1,11 @@
 #include "utest_helper.hpp"
 
-void compiler_simd_any(void)
+void compiler_sub_group_any(void)
 {
   const size_t n = 40;
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_simd_any");
+  OCL_CREATE_KERNEL("compiler_sub_group_any");
   OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
   OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -40,4 +40,4 @@ void compiler_simd_any(void)
   OCL_UNMAP_BUFFER(1);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_simd_any);
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_any);
diff --git a/utests/compiler_sub_group_shuffle.cpp b/utests/compiler_sub_group_shuffle.cpp
new file mode 100644
index 0000000..4ba8b99
--- /dev/null
+++ b/utests/compiler_sub_group_shuffle.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+void compiler_sub_group_shuffle(void)
+{
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_sub_group_shuffle");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  int c = 3;
+  OCL_SET_ARG(1, sizeof(int), &c);
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((int*)buf_data[0])[i] = -1;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  int* dst = (int *)buf_data[0];
+  int suggroupsize = dst[0];
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+
+  dst++;
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / suggroupsize;
+    int index = i % suggroupsize;
+    OCL_ASSERT(index == dst[4*i]);
+    OCL_ASSERT((round * suggroupsize + c) == dst[4*i+1]);
+    OCL_ASSERT((round * suggroupsize + 5) == dst[4*i+2]);
+    OCL_ASSERT((round * suggroupsize + (suggroupsize - index - 1)) == dst[4*i+3]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle);
diff --git a/utests/load_program_from_spir.cpp b/utests/load_program_from_spir.cpp
new file mode 100644
index 0000000..8ea1cd4
--- /dev/null
+++ b/utests/load_program_from_spir.cpp
@@ -0,0 +1,90 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+
+using namespace std;
+
+static void cpu(int global_id, float *src, float *dst) {
+    dst[global_id] = ceilf(src[global_id]);
+}
+
+static void test_load_program_from_spir(void)
+{
+    size_t param_value_size;
+    std::string extensionStr;
+    OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_EXTENSIONS, 0, 0, &param_value_size);
+    std::vector<char> param_value(param_value_size);
+    OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_EXTENSIONS, param_value_size, param_value.empty() ? NULL : &param_value.front(), &param_value_size);
+    if (!param_value.empty())
+      extensionStr = std::string(&param_value.front(), param_value_size-1);
+
+    if (!std::strstr(extensionStr.c_str(), "cl_khr_spir")) {
+      return;
+    }
+
+    const size_t n = 16;
+    float cpu_dst[16], cpu_src[16];
+    cl_int status;
+    cl_int binary_status;
+    char *ker_path = NULL;
+
+    cl_file_map_t *fm = cl_file_map_new();
+    ker_path = cl_do_kiss_path("compiler_ceil32.spir", device);
+    OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
+
+    const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);
+    const size_t sz = cl_file_map_size(fm);
+
+    program = clCreateProgramWithBinary(ctx, 1,
+              &device, &sz, &src, &binary_status, &status);
+
+    OCL_ASSERT(program && status == CL_SUCCESS);
+
+    /* OCL requires to build the program even if it is created from a binary */
+    OCL_ASSERT(clBuildProgram(program, 1, &device, "-x spir", NULL, NULL) == CL_SUCCESS);
+
+    kernel = clCreateKernel(program, "compiler_ceil", &status);
+    OCL_ASSERT(status == CL_SUCCESS);
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    globals[0] = 16;
+    locals[0] = 16;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+        OCL_UNMAP_BUFFER(0);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(1);
+
+#if 0
+        printf("#### GPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", ((float *)buf_data[1])[i]);
+        printf("\n#### CPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", cpu_dst[i]);
+        printf("\n");
+#endif
+
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+        OCL_UNMAP_BUFFER(1);
+    }
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_spir);
diff --git a/utests/setenv.sh.in b/utests/setenv.sh.in
index ac06b10..67e3bf1 100644
--- a/utests/setenv.sh.in
+++ b/utests/setenv.sh.in
@@ -6,3 +6,5 @@ export OCL_PCH_PATH=@LOCAL_OCL_PCH_OBJECT@
 export OCL_KERNEL_PATH=@CMAKE_CURRENT_SOURCE_DIR@/../kernels
 export OCL_GBE_PATH=@LOCAL_GBE_OBJECT_DIR@
 export OCL_INTERP_PATH=@LOCAL_INTERP_OBJECT_DIR@
+#disable self-test so we can get something more precise than "doesn't work"
+export OCL_IGNORE_SELF_TEST=1
diff --git a/utests/utest.hpp b/utests/utest.hpp
index b028b64..7ae8b87 100644
--- a/utests/utest.hpp
+++ b/utests/utest.hpp
@@ -30,6 +30,7 @@
 #include "utest_exception.hpp"
 #include <vector>
 #include <iostream>
+#include <iomanip>
 
 /*! struct for statistics */
 struct RStatistics
@@ -135,10 +136,10 @@ struct UTest
 
 #define BENCHMARK(EXPR) \
  do { \
-    int ret = 0;\
+    double ret = 0;\
     try { \
       ret = EXPR; \
-      std::cout << "    [Result: " << ret << "]    [SUCCESS]" << std::endl; \
+      std::cout << "    [Result: " << std::fixed<< std::setprecision(3) << ret << " GB/S]    [SUCCESS]" << std::endl; \
       UTest::retStatistics.passCount += 1; \
     } \
     catch (Exception e) { \
@@ -147,5 +148,8 @@ struct UTest
       UTest::retStatistics.failCount++; \
     } \
   } while (0)
+
+#define BANDWIDTH(BYTES, MSEC) \
+  (((double)(BYTES)) / ((MSEC) * 1e6))
 #endif /* __UTEST_UTEST_HPP__ */
 
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index 510c41a..c220575 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -1,4 +1,5 @@
 #!/usr/bin/python
+from __future__ import print_function
 import os,sys,re
 
 FLT_MAX_POSI='0x1.fffffep127f'
@@ -107,12 +108,8 @@ def udebug(ulpSize,returnType,function):
     static const char* INFORNAN;
     static %s ULPSIZE, ULPSIZE_FACTOR;
 
-    const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
-
-    if (env_strict == NULL || strcmp(env_strict, "0") == 0)
-      ULPSIZE_FACTOR = 1000;
-    else
-      ULPSIZE_FACTOR = %s;
+    float ULPSIZE_NO_FAST_MATH = %s;
+    ULPSIZE_FACTOR = select_ulpsize(ULPSIZE_FAST_MATH,ULPSIZE_NO_FAST_MATH);
     
     if (isinf(cpu_data[index])){
       INFORNAN="INF";
@@ -146,11 +143,11 @@ def udebug(ulpSize,returnType,function):
 #else
     if (isinf(cpu_data[index])){
       sprintf(log, "%s expect:%s\\n", log, INFORNAN);
-      OCL_ASSERTM(isinf(gpu_data[index]) || !env_strict,log);
+      OCL_ASSERTM(isinf(gpu_data[index]),log);
       }
     else if (isnan(cpu_data[index])){
       sprintf(log, "%s expect:%s\\n", log, INFORNAN);
-      OCL_ASSERTM(isnan(gpu_data[index]) || !env_strict,log);
+      OCL_ASSERTM(isnan(gpu_data[index]),log);
       }
     else{
       sprintf(log, "%s expect:%s\\n", log, ULPSIZE);
@@ -326,7 +323,7 @@ which can print more values and information to assist debuging the issue.
     file_object.close()
 
   def nameForCmake(self,content,namesuffix):
-    print("generated/%s_%s.cpp"%(self.fileName,namesuffix)),
+    print("generated/%s_%s.cpp"%(self.fileName,namesuffix),end=" ")
 
   def utestFunc(self,index):
     funcLines=[]
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 591054e..8f772fd 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -53,6 +53,7 @@ cl_mem buf[MAX_BUFFER_N] = {};
 void *buf_data[MAX_BUFFER_N] = {};
 size_t globals[3] = {};
 size_t locals[3] = {};
+float ULPSIZE_FAST_MATH = 10000.;
 
 #ifdef HAS_EGL
 Display    *xDisplay;
@@ -681,7 +682,7 @@ int cl_INT_ULP(int int_number)
   return 0;
 }
 
-int time_subtract(struct timeval *y, struct timeval *x, struct timeval *result)
+double time_subtract(struct timeval *y, struct timeval *x, struct timeval *result)
 {
   if ( x->tv_sec > y->tv_sec )
     return   -1;
@@ -699,6 +700,17 @@ int time_subtract(struct timeval *y, struct timeval *x, struct timeval *result)
     }
   }
 
-  int msec = 1000.0*(y->tv_sec - x->tv_sec) + (y->tv_usec - x->tv_usec)/1000.0;
+  double msec = 1000.0*(y->tv_sec - x->tv_sec) + (y->tv_usec - x->tv_usec)/1000.0;
   return msec;
-}
\ No newline at end of file
+}
+
+float select_ulpsize(float ULPSIZE_FAST_MATH, float ULPSIZE_NO_FAST_MATH)
+{
+  const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
+
+  float ULPSIZE_FACTOR = ULPSIZE_NO_FAST_MATH;
+  if (env_strict != NULL && strcmp(env_strict, "0") == 0 )
+        ULPSIZE_FACTOR = ULPSIZE_FAST_MATH;
+
+  return ULPSIZE_FACTOR;
+}
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 5d8e835..3b17606 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -165,6 +165,7 @@ extern cl_mem buf[MAX_BUFFER_N];
 extern void* buf_data[MAX_BUFFER_N];
 extern size_t globals[3];
 extern size_t locals[3];
+extern float ULPSIZE_FAST_MATH;
 
 enum {
   SOURCE = 0,
@@ -231,7 +232,10 @@ extern float cl_FLT_ULP(float float_number);
 extern int cl_INT_ULP(int int_number);
 
 /* subtract the time */
-int time_subtract(struct timeval *y, struct timeval *x, struct timeval *result);
+double time_subtract(struct timeval *y, struct timeval *x, struct timeval *result);
+
+/* check ulpsize */
+float select_ulpsize(float ULPSIZE_FAST_MATH, float ULPSIZE_NO_FAST_MATH);
 
 #endif /* __UTEST_HELPER_HPP__ */
 
diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
index 71a031f..83edcc3 100755
--- a/utests/utest_math_gen.py
+++ b/utests/utest_math_gen.py
@@ -161,19 +161,19 @@ static float atanpi(float x){
 } '''
   atanpiUtests = func('atanpi','atanpi',[atanpi_input_type],atanpi_output_type,[atanpi_input_values],'4 * FLT_ULP',atanpi_cpu_func)
   
-#  ##### gentype atan2pi(gentype y, gentype x)
-#  atan2pi_base_values = base_input_values
-#  atan2pi_input_values1 = []
-#  atan2pi_input_values2 = []
-#  atan2pi_input_values1,atan2pi_input_values2=gene2ValuesLoop(atan2pi_input_values1,atan2pi_input_values2,atan2pi_base_values)
-#  atan2pi_input_type1 = ['float','float2','float4','float8','float16']
-#  atan2pi_input_type2 = ['float','float2','float4','float8','float16']
-#  atan2pi_output_type = ['float','float2','float4','float8','float16']
-#  atan2pi_cpu_func='''
-#static float atan2pi(float y, float x){
-#  return atan2(y,x)/M_PI;
-#} '''
-#  atan2piUtests = func('atan2pi','atan2pi',[atan2pi_input_type1,atan2pi_input_type2],atan2pi_output_type,[atan2pi_input_values1,atan2pi_input_values2],'6 * FLT_ULP',atan2pi_cpu_func)
+  ##### gentype atan2pi(gentype y, gentype x)
+  atan2pi_base_values = base_input_values
+  atan2pi_input_values1 = []
+  atan2pi_input_values2 = []
+  atan2pi_input_values1,atan2pi_input_values2=gene2ValuesLoop(atan2pi_input_values1,atan2pi_input_values2,atan2pi_base_values)
+  atan2pi_input_type1 = ['float','float2','float4','float8','float16']
+  atan2pi_input_type2 = ['float','float2','float4','float8','float16']
+  atan2pi_output_type = ['float','float2','float4','float8','float16']
+  atan2pi_cpu_func='''
+static float atan2pi(float y, float x){
+  return atan2(y,x)/M_PI;
+} '''
+  atan2piUtests = func('atan2pi','atan2pi',[atan2pi_input_type1,atan2pi_input_type2],atan2pi_output_type,[atan2pi_input_values1,atan2pi_input_values2],'6 * FLT_ULP',atan2pi_cpu_func)
   
   ##### gentype cbrt(gentype)
   cbrt_input_values = base_input_values
@@ -342,6 +342,18 @@ static float atanpi(float x){
   ilogb_output_type = ['int','int2','int4','int8','int16']
   ilogbUtests = func('ilogb','ilogb',[ilogb_input_type],ilogb_output_type,[ilogb_input_values],'0 * INT_ULP')
 
+  #### floatn ldexp(floatnx, intnk)
+  ldexp_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, 0.5, 1, 0.0,1500.24,-1500.24]
+  ldexp_input_values2 = [-1,-2,-3,4,5,6,7,8,10,12,14,16,12]
+  ldexp_input_type1 = ['float','float2','float4','float8','float16']
+  ldexp_input_type2 = ['int','int2','int4','int8','int16']
+  ldexp_output_type = ['float','float2','float4','float8','float16']
+  ldexp_cpu_func='''
+static float ldexp(float x, int y){
+    return x * exp2(y);
+} '''
+  ldexpUtests = func('ldexp','ldexp',[ldexp_input_type1,ldexp_input_type2],ldexp_output_type,[ldexp_input_values1,ldexp_input_values2],'0 * FLT_ULP', ldexp_cpu_func)
+
   ##### gentype lgamma(gentype x)
   lgamma_input_values = base_input_values
   lgamma_input_type = ['float','float2','float4','float8','float16']
diff --git a/utests/vload_bench.cpp b/utests/vload_bench.cpp
index a7703fc..ddfaaee 100644
--- a/utests/vload_bench.cpp
+++ b/utests/vload_bench.cpp
@@ -34,8 +34,8 @@ static double vload_bench(const char *kernelFunc, uint32_t N, uint32_t offset, b
     OCL_FINISH();
     gettimeofday(&end, NULL);
     double elapsed = (end.tv_sec - start.tv_sec) * 1e6 + (end.tv_usec - start.tv_usec);
-    double bandwidth = (globals[0] * (N_ITERATIONS) * sizeof(T) * N) / elapsed;
-    printf("\t%2.1fGB/S\n", bandwidth/1000.);
+    double bandwidth = (globals[0] * (N_ITERATIONS) * sizeof(T) * N) / (elapsed * 1000.);
+    printf("\t%2.1fGB/S\n", bandwidth);
     return bandwidth;
   } else {
     // Check result
@@ -71,7 +71,7 @@ VLOAD_TEST(float, float)
 #endif
 
 #define VLOAD_BENCH(T, kT) \
-static int vload_bench_ ##kT(void) \
+static double vload_bench_ ##kT(void) \
 { \
   uint8_t vectorSize[] = {2, 3, 4, 8, 16}; \
   double totBandwidth = 0; \

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-commits mailing list