[beignet] 02/04: Imported Upstream version 1.0.1

Fri Jan 23 22:15:01 UTC 2015

This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit c1348132b77b46eacb2a28384b5afccd5f4f2759
Author: Andreas Beckmann <anbe at debian.org>
Date:   Fri Jan 23 23:10:10 2015 +0100

    Imported Upstream version 1.0.1
---
 CMake/FindStandaloneGbeCompiler.cmake        |   35 +
 CMakeLists.txt                               |   81 +--
 GetGenID.sh                                  |   26 +
 backend/CMakeLists.txt                       |   20 +-
 backend/src/CMakeLists.txt                   |   61 +-
 backend/src/backend/gen/gen_mesa_disasm.c    |    5 +-
 backend/src/backend/gen75_context.cpp        |   17 +-
 backend/src/backend/gen75_context.hpp        |    1 -
 backend/src/backend/gen8_context.cpp         |    4 -
 backend/src/backend/gen8_context.hpp         |    1 -
 backend/src/backend/gen_context.cpp          |    1 -
 backend/src/backend/gen_context.hpp          |    2 -
 backend/src/backend/gen_insn_selection.cpp   |  110 ++-
 backend/src/backend/gen_program.cpp          |   11 +-
 backend/src/backend/program.cpp              |   29 +-
 backend/src/backend/program.h                |    8 +-
 backend/src/backend/program.hpp              |   13 +-
 backend/src/gbe_bin_generater.cpp            |    2 +-
 backend/src/ir/constant.cpp                  |    3 +-
 backend/src/ir/constant.hpp                  |    5 +-
 backend/src/ir/context.hpp                   |    4 +-
 backend/src/ir/function.hpp                  |   51 +-
 backend/src/ir/image.cpp                     |   98 +--
 backend/src/ir/immediate.cpp                 |   34 +-
 backend/src/ir/immediate.hpp                 |   73 +-
 backend/src/ir/instruction.cpp               |    1 -
 backend/src/ir/instruction.hpp               |    1 -
 backend/src/ir/liveness.hpp                  |    8 +-
 backend/src/ir/printf.cpp                    |   28 +-
 backend/src/ir/printf.hpp                    |   41 +-
 backend/src/ir/sampler.cpp                   |   20 +-
 backend/src/ir/sampler.hpp                   |    4 +-
 backend/src/ir/structural_analysis.cpp       |   66 +-
 backend/src/ir/structural_analysis.hpp       |    4 -
 backend/src/ir/unit.hpp                      |    8 +-
 backend/src/libocl/include/ocl_float.h       |    3 +-
 backend/src/libocl/include/ocl_image.h       |    8 -
 backend/src/libocl/include/ocl_types.h       |   27 +-
 backend/src/libocl/src/ocl_image.cl          |  808 ++++++++++++---------
 backend/src/libocl/src/ocl_memcpy.ll         |  177 +++++
 backend/src/libocl/tmpl/ocl_integer.tmpl.cl  |   10 +-
 backend/src/libocl/tmpl/ocl_math.tmpl.cl     | 1010 ++++++++++++++------------
 backend/src/libocl/tmpl/ocl_math.tmpl.h      |   16 +-
 backend/src/llvm/llvm_bitcode_link.cpp       |    7 +
 backend/src/llvm/llvm_gen_backend.cpp        |  831 +++++++++++----------
 backend/src/llvm/llvm_gen_backend.hpp        |   40 +-
 backend/src/llvm/llvm_gen_ocl_function.hxx   |   36 +-
 backend/src/llvm/llvm_intrinsic_lowering.cpp |    4 +-
 backend/src/llvm/llvm_passes.cpp             |   96 +--
 backend/src/llvm/llvm_printf_parser.cpp      |   20 +-
 backend/src/llvm/llvm_sampler_fix.cpp        |  144 ++++
 backend/src/llvm/llvm_scalarize.cpp          |   58 +-
 backend/src/llvm/llvm_to_gen.cpp             |    1 +
 backend/src/sys/hash_map.hpp                 |   82 ---
 benchmark/CMakeLists.txt                     |    5 +-
 benchmark/benchmark_read_buffer.cpp          |   49 ++
 benchmark/benchmark_read_image.cpp           |   67 ++
 benchmark/benchmark_use_host_ptr_buffer.cpp  |   38 +
 benchmark/enqueue_copy_buf.cpp               |   24 +-
 docs/NEWS.mdwn                               |    5 +-
 docs/howto/oldgcc-howto.mdwn                 |   58 ++
 docs/optimization-guide.mdwn                 |  106 ++-
 kernels/compiler_array4.cl                   |    9 +
 kernels/compiler_bswap.cl                    |    1 +
 kernels/compiler_private_const.cl            |    9 +
 kernels/compiler_read_buffer.cl              |   15 +
 kernels/compiler_read_image.cl               |   25 +
 kernels/runtime_alloc_host_ptr_buffer.cl     |    6 +
 kernels/runtime_climage_from_boname.cl       |    8 +
 setup_fulsim_hsw.sh                          |    5 -
 setup_fulsim_ivb.sh                          |    5 -
 setup_perfsim_ivb.sh                         |    4 -
 src/CMakeLists.txt                           |    5 +-
 src/cl_command_queue.c                       |  224 +-----
 src/cl_command_queue.h                       |    4 -
 src/cl_command_queue_gen7.c                  |    4 +-
 src/cl_device_id.c                           |   52 +-
 src/cl_driver.h                              |    4 +-
 src/cl_enqueue.c                             |   32 +-
 src/cl_event.c                               |   73 +-
 src/cl_gen75_device.h                        |    2 +-
 src/cl_gen7_device.h                         |    2 +-
 src/cl_gt_device.h                           |   12 +-
 src/cl_kernel.c                              |   13 +-
 src/cl_mem.c                                 |  128 ++--
 src/cl_mem.h                                 |   10 +-
 src/cl_program.c                             |    6 +-
 src/intel/intel_batchbuffer.c                |    7 -
 src/intel/intel_batchbuffer.h                |    1 -
 src/intel/intel_defines.h                    |    4 +
 src/intel/intel_driver.c                     |   42 +-
 src/intel/intel_driver.h                     |    5 -
 src/intel/intel_gpgpu.c                      |   91 ++-
 utests/CMakeLists.txt                        |   69 +-
 utests/builtin_pow.cpp                       |    4 +-
 utests/builtin_tgamma.cpp                    |    2 +-
 utests/compare_image_2d_and_1d_array.cpp     |    2 +-
 utests/compiler_array4.cpp                   |   45 ++
 utests/compiler_bswap.cpp                    |    7 +
 utests/compiler_displacement_map_element.cpp |    4 +-
 utests/compiler_fill_gl_image.cpp            |    2 +-
 utests/compiler_fill_image_2d_array.cpp      |    4 +-
 utests/compiler_overflow.cpp                 |   52 ++
 utests/compiler_private_const.cpp            |   27 +
 utests/compiler_saturate.cpp                 |    2 +-
 utests/compiler_saturate_sub.cpp             |    2 +-
 utests/get_cl_info.cpp                       |   40 +-
 utests/runtime_alloc_host_ptr_buffer.cpp     |   25 +
 utests/runtime_climage_from_boname.cpp       |  212 ++++++
 utests/runtime_use_host_ptr_buffer.cpp       |    8 +-
 utests/sub_buffer.cpp                        |    9 +-
 utests/utest.cpp                             |   10 +-
 utests/utest.hpp                             |    1 +
 utests/utest_generator.py                    |  104 ++-
 utests/utest_helper.cpp                      |   24 +-
 utests/utest_helper.hpp                      |    3 +
 utests/utest_math_gen.py                     |    5 +-
 utests/vload_bench.cpp                       |    4 +-
 118 files changed, 3747 insertions(+), 2334 deletions(-)

diff --git a/CMake/FindStandaloneGbeCompiler.cmake b/CMake/FindStandaloneGbeCompiler.cmake
new file mode 100644
index 0000000..c27980e
--- /dev/null
+++ b/CMake/FindStandaloneGbeCompiler.cmake
@@ -0,0 +1,35 @@
+# Find the standalone gbe compiler (gbe_bin_generater); configuration fails if it is missing.
+#
+# STANDALONE_GBE_COMPILER_DIR - in: only directory searched when set; out: directory of the compiler found
+# STANDALONE_GBE_COMPILER     - out: full file name of standalone compiler
+# GEN_PCI_ID                  - Gen's PCI ID; when not preset it is read from GetGenID.sh (requires lspci)
+
+IF (STANDALONE_GBE_COMPILER_DIR)
+  FIND_PROGRAM(STANDALONE_GBE_COMPILER
+             NAMES gbe_bin_generater
+             DOC "standalone gbe compiler executable"
+             PATHS ${STANDALONE_GBE_COMPILER_DIR} NO_DEFAULT_PATH)
+ELSE (STANDALONE_GBE_COMPILER_DIR)
+  FIND_PROGRAM(STANDALONE_GBE_COMPILER
+             NAMES gbe_bin_generater
+             DOC "standalone gbe compiler executable"
+             PATHS /usr/local/lib/beignet/)
+ENDIF (STANDALONE_GBE_COMPILER_DIR)
+
+IF (STANDALONE_GBE_COMPILER)
+  MESSAGE(STATUS "Looking for standalone gbe compiler - found at ${STANDALONE_GBE_COMPILER}")
+  STRING(REGEX REPLACE "(.*)/.*" "\\1" STANDALONE_GBE_COMPILER_DIR "${STANDALONE_GBE_COMPILER}")
+  IF (NOT GEN_PCI_ID)
+    Find_Program(LSPCI lspci)
+    IF (LSPCI)
+      MESSAGE(STATUS "Looking for lspci - found")
+    ELSE (LSPCI)
+      MESSAGE(FATAL_ERROR "Looking for lspci - not found")
+    ENDIF (LSPCI)
+    EXECUTE_PROCESS(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/GetGenID.sh"
+             OUTPUT_VARIABLE GEN_PCI_ID OUTPUT_STRIP_TRAILING_WHITESPACE)  # drop the newline echoed after the id
+    MESSAGE(STATUS "Platform Gen PCI id is " ${GEN_PCI_ID})
+  ENDIF (NOT GEN_PCI_ID)
+ELSE (STANDALONE_GBE_COMPILER)
+  MESSAGE(FATAL_ERROR "Looking for standalone gbe compiler - not found")
+ENDIF (STANDALONE_GBE_COMPILER)
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c68187..bfe0431 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 # compiler choose,now support ICC,GCC CLANG compiler
-set (COMPILER "GCC" CACHE INT "Compiler to choose on Linux (GCC,ICC,CLANG)")
 if (COMPILER STREQUAL "GCC")
+  find_program(CMAKE_C_COMPILER NAMES gcc)
+  find_program(CMAKE_CXX_COMPILER NAMES g++)
 elseif (COMPILER STREQUAL "CLANG")
   set (CMAKE_C_COMPILER   "clang")
   set (CMAKE_CXX_COMPILER "clang++")
@@ -17,10 +18,16 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
 set (LIBCL_DRIVER_VERSION_MAJOR 1)
 set (LIBCL_DRIVER_VERSION_MINOR 0)
-set (LIBCL_DRIVER_VERSION_PATCH 0)
+set (LIBCL_DRIVER_VERSION_PATCH 1)
 set (LIBCL_C_VERSION_MAJOR 1)
 set (LIBCL_C_VERSION_MINOR 2)
-
+if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")  # map the detected C++ compiler id onto COMPILER
+  set(COMPILER "CLANG")
+elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")  # bare variable name: still a valid test when the id is empty
+  set(COMPILER "GCC")
+elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+  set(COMPILER "ICC")
+endif()
 configure_file (
   "src/OCLConfig.h.in"
   "src/OCLConfig.h"
@@ -38,9 +45,7 @@ endif (NOT LIB_INSTALL_DIR)
 if (NOT BEIGNET_INSTALL_DIR)
   set (BEIGNET_INSTALL_DIR "${LIB_INSTALL_DIR}/beignet/")
 endif (NOT BEIGNET_INSTALL_DIR)
-SET(EMULATE_IVB false CACHE BOOL "To emulate IVB")
-SET(EMULATE_SNB false CACHE BOOL "To emulate SNB")
-SET(EMULATE_HSW false CACHE BOOL "To emulate HSW")
+
 ADD_DEFINITIONS(-D__$(USER)__)
 
 # Force Release with debug info
@@ -50,29 +55,9 @@ endif (NOT CMAKE_BUILD_TYPE)
 set (CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "assure config" FORCE)
 message(STATUS "Building mode: " ${CMAKE_BUILD_TYPE})
 
-IF (EMULATE_HSW)
-  SET (USE_FULSIM "true")
-  ADD_DEFINITIONS(-DEMULATE_GEN=75)
-ELSEIF (EMULATE_IVB)
-  SET (USE_FULSIM "true")
-  ADD_DEFINITIONS(-DEMULATE_GEN=7)
-ELSEIF (EMULATE_SNB)
-  SET (USE_FULSIM "true")
-  ADD_DEFINITIONS(-DEMULATE_GEN=6)
-ELSE (EMULATE_IVB)
-  SET (USE_FULSIM "false")
-  ADD_DEFINITIONS(-DEMULATE_GEN=0)
-ENDIF (EMULATE_HSW)
-
 # XXX now hard coded to enable the clamp to border workaround for IVB.
 ADD_DEFINITIONS(-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
 
-IF (USE_FULSIM)
-  ADD_DEFINITIONS(-DUSE_FULSIM=1)
-ELSE (USE_FULSIM)
-  ADD_DEFINITIONS(-DUSE_FULSIM=0)
-ENDIF (USE_FULSIM)
-
 # compiler flag setting
 if (COMPILER STREQUAL "GCC")
   set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall -mfpmath=sse -Wcast-align -Wl,-E")
@@ -81,7 +66,7 @@ elseif (COMPILER STREQUAL "CLANG")
 elseif (COMPILER STREQUAL "ICC")
   set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS}  -wd2928 -Wall -fPIC -fstrict-aliasing -fp-model fast -msse4.1 -Wl,-E")
 endif ()
-set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -std=c++0x -Wno-invalid-offsetof -fno-rtti")
+set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -std=c++0x -Wno-invalid-offsetof")
 set (CMAKE_C_FLAGS "${CMAKE_C_CXX_FLAGS}")
 set (CMAKE_CXX_FLAGS_DEBUG          "-O0 -g -DGBE_DEBUG=1")
 set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
@@ -92,9 +77,16 @@ set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
 set (CMAKE_C_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
 set (CMAKE_C_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
 
-# Front end stuff we need
-#INCLUDE(CMake/FindLLVM.cmake)
-Find_Package(LLVM 3.3)
+
+IF (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
+  Find_Package(StandaloneGbeCompiler)
+ELSE (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
+  # Front end stuff we need
+  #INCLUDE(CMake/FindLLVM.cmake)
+  Find_Package(LLVM 3.3)
+  SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
+ENDIF (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
+
 
 set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-undefined ${LLVM_LDFLAGS}")
 
@@ -121,13 +113,12 @@ IF(DRM_INTEL_FOUND)
   INCLUDE_DIRECTORIES(${DRM_INTEL_INCLUDE_DIRS})
   MESSAGE(STATUS "Looking for DRM Intel - found at ${DRM_INTEL_PREFIX} ${DRM_INTEL_VERSION}")
   #userptr support starts from 2.4.57, but 2.4.58 is the actual stable release
-  #FIXME userptr has randome fail for some cases, need further investigating.
-  #IF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
-  #  MESSAGE(STATUS "Enable userptr support")
-  #  SET(DRM_INTEL_USERPTR "enable")
-  #ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
-  #  MESSAGE(STATUS "Disable userptr support")
-  #ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+  IF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+    MESSAGE(STATUS "Enable userptr support")
+    SET(DRM_INTEL_USERPTR "enable")
+  ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+    MESSAGE(STATUS "Disable userptr support")
+  ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
 ELSE(DRM_INTEL_FOUND)
   MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
 ENDIF(DRM_INTEL_FOUND)
@@ -172,12 +163,12 @@ ELSE(EGL_FOUND)
 ENDIF(EGL_FOUND)
 
 # cl_khr_gl_sharing requires to build with mesa source
-Find_Package(MesaSrc)
-IF(MESA_SOURCE_FOUND)
-  MESSAGE(STATUS "Looking for mesa source code - found at ${MESA_SOURCE_PREFIX}")
-ELSE(MESA_SOURCE_FOUND)
-  MESSAGE(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
-ENDIF(MESA_SOURCE_FOUND)
+#Find_Package(MesaSrc)
+#IF(MESA_SOURCE_FOUND)
+#  MESSAGE(STATUS "Looking for mesa source code - found at ${MESA_SOURCE_PREFIX}")
+#ELSE(MESA_SOURCE_FOUND)
+#  MESSAGE(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
+#ENDIF(MESA_SOURCE_FOUND)
 
 Find_Package(OCLIcd)
 IF(OCLIcd_FOUND)
@@ -197,7 +188,11 @@ ADD_SUBDIRECTORY(include)
 ADD_SUBDIRECTORY(backend)
 ADD_SUBDIRECTORY(src)
 ADD_SUBDIRECTORY(utests)
+
+# compile benchmark only if standalone compiler is not provided
+IF (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
 ADD_SUBDIRECTORY(benchmark)
+ENDIF (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
 
 SET(CPACK_PACKAGE_VERSION_MAJOR "${LIBCL_DRIVER_VERSION_MAJOR}")
 SET(CPACK_PACKAGE_VERSION_MINOR "${LIBCL_DRIVER_VERSION_MINOR}")
diff --git a/GetGenID.sh b/GetGenID.sh
new file mode 100755
index 0000000..7acf9bd
--- /dev/null
+++ b/GetGenID.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Print the id of the first Intel (vendor 8086) PCI device reported by lspci
+# whose id appears in genpciid, then exit 0.
+# When no such device is present, print nothing and exit 1.
+
+# PCI device ids this script recognises.
+genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26)
+
+# Device ids of every [8086:xxxx] entry in the lspci listing, in listing order.
+pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F ] '{print $1}'))
+
+for id1 in "${pciid[@]}"
+do
+    for id2 in "${genpciid[@]}"
+    do
+        if [ "${id1}" == "${id2}" ]
+        then
+            echo "${id1}"
+            exit 0
+        fi
+    done
+done
+
+# Nothing matched: keep stdout empty, as before, but report the failure
+# through the exit status so callers can tell the two outcomes apart.
+exit 1
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index bf96baf..915d60f 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -36,14 +36,28 @@ include_directories (${CMAKE_CURRENT_BINARY_DIR})
 # Project source code
 ##############################################################
 add_subdirectory (src)
+
+if (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
+set(LOCAL_OCL_BITCODE_BIN ${STANDALONE_GBE_COMPILER_DIR}/beignet.bc)
+set(LOCAL_OCL_HEADER_DIR ${STANDALONE_GBE_COMPILER_DIR}/include)
+set(LOCAL_OCL_PCH_OBJECT ${STANDALONE_GBE_COMPILER_DIR}/beignet.pch)
+endif (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
+
 set(LOCAL_OCL_BITCODE_BIN "${LOCAL_OCL_BITCODE_BIN}" PARENT_SCOPE)
 set(LOCAL_OCL_HEADER_DIR "${LOCAL_OCL_HEADER_DIR}" PARENT_SCOPE)
 set(LOCAL_OCL_PCH_OBJECT "${LOCAL_OCL_PCH_OBJECT}" PARENT_SCOPE)
-
 set(LOCAL_GBE_OBJECT_DIR ${LOCAL_GBE_OBJECT_DIR} PARENT_SCOPE)
 set(LOCAL_INTERP_OBJECT_DIR ${LOCAL_INTERP_OBJECT_DIR} PARENT_SCOPE)
 
 set (GBE_BIN_GENERATER
-     env OCL_BITCODE_LIB_PATH=${LOCAL_OCL_BITCODE_BIN} OCL_HEADER_FILE_DIR=${LOCAL_OCL_HEADER_DIR} OCL_PCH_PATH=${LOCAL_OCL_PCH_OBJECT} LD_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src/gbe_bin_generater
-     PARENT_SCOPE)
+     env OCL_BITCODE_LIB_PATH=${LOCAL_OCL_BITCODE_BIN} OCL_HEADER_FILE_DIR=${LOCAL_OCL_HEADER_DIR} OCL_PCH_PATH=${LOCAL_OCL_PCH_OBJECT})
 
+if (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
+set (GBE_BIN_GENERATER
+     ${GBE_BIN_GENERATER} ${STANDALONE_GBE_COMPILER_DIR}/gbe_bin_generater
+     PARENT_SCOPE)
+else (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
+set (GBE_BIN_GENERATER
+     ${GBE_BIN_GENERATER} LD_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src/gbe_bin_generater
+     PARENT_SCOPE)
+endif (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index b4555f1..ce83c62 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -9,19 +9,21 @@ configure_file (
     "GBEConfig.h"
 )
 
+#do not involve libocl if the standalone compiler is given
+if (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
 add_subdirectory(libocl)
+add_dependencies(beignet_bitcode libocl)
+endif ()
+
 set (LOCAL_GBE_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbe.so" PARENT_SCOPE)
 set (LOCAL_INTERP_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbeinterp.so" PARENT_SCOPE)
 set (LOCAL_OCL_BITCODE_BIN "${OCL_OBJECT_DIR}/beignet.bc" PARENT_SCOPE)
 set (LOCAL_OCL_HEADER_DIR "${OCL_OBJECT_DIR}/include/" PARENT_SCOPE)
 set (LOCAL_OCL_PCH_OBJECT "${OCL_OBJECT_DIR}/beignet.local.pch" PARENT_SCOPE)
 
-add_dependencies(beignet_bitcode libocl)
-
 set (GBE_SRC
     ${ocl_blob_file}
     sys/vector.hpp
-    sys/hash_map.hpp
     sys/map.hpp
     sys/set.hpp
     sys/intrusive_list.hpp
@@ -73,6 +75,7 @@ set (GBE_SRC
     backend/program.cpp
     backend/program.hpp
     backend/program.h
+    llvm/llvm_sampler_fix.cpp
     llvm/llvm_bitcode_link.cpp
     llvm/llvm_gen_backend.cpp
     llvm/llvm_passes.cpp
@@ -117,27 +120,29 @@ set (GBE_SRC
     backend/gen8_encoder.cpp
     )
 
+set (GBE_LINK_LIBRARIES
+    ${DRM_INTEL_LIBRARIES}
+    ${DRM_LIBRARIES}
+    ${CLANG_LIBRARIES}
+    ${LLVM_MODULE_LIBS}
+    ${LLVM_SYSTEM_LIBS}
+    ${CMAKE_THREAD_LIBS_INIT}
+    ${CMAKE_DL_LIBS}
+    )
 
 include_directories (.)
 link_directories (${LLVM_LIBRARY_DIRS} ${DRM_LIBDIR})
 include_directories(${LLVM_INCLUDE_DIRS})
-add_library (gbe SHARED ${GBE_SRC})
-
 
-target_link_libraries(
-                      gbe
-                      ${DRM_INTEL_LIBRARIES}
-                      ${DRM_LIBRARIES}
-                      ${CLANG_LIBRARIES}
-                      ${LLVM_MODULE_LIBS}
-                      ${LLVM_SYSTEM_LIBS}
-                      ${CMAKE_THREAD_LIBS_INIT}
-                      ${CMAKE_DL_LIBS})
+#do not build libgbe.so if the standalone compiler is given
+if (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
+add_library (gbe SHARED ${GBE_SRC})
+target_link_libraries(gbe ${GBE_LINK_LIBRARIES})
+add_dependencies(gbe beignet_bitcode)
+endif (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
 
 add_library(gbeinterp SHARED gbe_bin_interpreter.cpp)
 
-add_dependencies(gbe beignet_bitcode)
-
 if (LLVM_VERSION_NODOT VERSION_EQUAL 34)
   find_library(TERMINFO NAMES tinfo ncurses)
   if (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
@@ -149,11 +154,33 @@ if (LLVM_VERSION_NODOT VERSION_EQUAL 34)
 endif(LLVM_VERSION_NODOT VERSION_EQUAL 34)
 
 link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
+
+#do not build nor install if the standalone compiler is given
+if (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
+if (BUILD_STANDALONE_GBE_COMPILER STREQUAL "true")
+macro(remove_cxx_flag flag)
+  string(REPLACE "${flag}" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+endmacro()
+
+remove_cxx_flag("-Wl,-E")
+ADD_EXECUTABLE(gbe_bin_generater gbe_bin_generater.cpp ${GBE_SRC})
+set_target_properties(gbe_bin_generater PROPERTIES LINK_FLAGS "-static")
+TARGET_LINK_LIBRARIES(gbe_bin_generater ${GBE_LINK_LIBRARIES})
+
+ADD_CUSTOM_TARGET(gbecompiler.tgz ALL
+    COMMAND tar zcf ${OCL_OBJECT_DIR}/gbecompiler.tgz gbe_bin_generater -C ${OCL_OBJECT_DIR} beignet.bc -C ${OCL_OBJECT_DIR} beignet.pch -C ${OCL_OBJECT_DIR} include
+    DEPENDS gbe_bin_generater beignet_bitcode
+    )
+
+else ()
 ADD_EXECUTABLE(gbe_bin_generater gbe_bin_generater.cpp)
 TARGET_LINK_LIBRARIES(gbe_bin_generater gbe)
+endif ()
 
 install (TARGETS gbe LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
-install (TARGETS gbeinterp LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
 install (FILES ${OCL_OBJECT_DIR}/beignet.bc DESTINATION ${BEIGNET_INSTALL_DIR})
 install (FILES ${OCL_OBJECT_DIR}/beignet.pch DESTINATION ${BEIGNET_INSTALL_DIR})
 install (FILES ${OCL_HEADER_FILES} DESTINATION ${BEIGNET_INSTALL_DIR}/include)
+endif (NOT (USE_STANDALONE_GBE_COMPILER STREQUAL "true"))
+
+install (TARGETS gbeinterp LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 877c102..4822de3 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -67,6 +67,7 @@ static const struct {
   [GEN_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_FBH] = { .name = "fbh", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_FBL] = { .name = "fbl", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_CBIT] = { .name = "cbit", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_F16TO32] = { .name = "f16to32", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_F32TO16] = { .name = "f32to16", .nsrc = 1, .ndst = 1 },
 
@@ -515,8 +516,8 @@ static int gen_version;
 #define SCRATCH_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.block_size)
 #define SCRATCH_RW_CHANNEL_MODE(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.channel_mode)
 #define SCRATCH_RW_MSG_TYPE(inst)  GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.msg_type)
-#define DWORD_RW_BTI(inst)         GEN_BITS_FIELD(inst, bits3.gen7_dword_rw.msg_type)
-#define DWORD_RW_MSG_TYPE(inst)    GEN_BITS_FIELD(inst, bits3.gen7_dword_rw.bti)
+#define DWORD_RW_BTI(inst)         GEN_BITS_FIELD(inst, bits3.gen7_dword_rw.bti)
+#define DWORD_RW_MSG_TYPE(inst)    GEN_BITS_FIELD(inst, bits3.gen7_dword_rw.msg_type)
 #define MSG_GW_SUBFUNC(inst)       GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.subfunc)
 #define MSG_GW_NOTIFY(inst)        GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.notify)
 #define MSG_GW_ACKREQ(inst)        GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.ackreq)
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index 1f9591e..a830260 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -39,21 +39,22 @@ namespace gbe
     if(kernel->getUseSLM() == false)
       return;
 
-    const GenRegister slm_offset = ra->genReg(GenRegister::ud1grf(ir::ocl::slmoffset));
     const GenRegister slm_index = GenRegister::ud1grf(0, 0);
-    //the slm index is hold in r0.0 24-27 bit, in 4K unit, shift left 12 to get byte unit
+    //the slm index is held in bits 24-27 of r0.0, in 4K units; move it to bits 8-11 of sr0.1.
     p->push();
       p->curr.execWidth = 1;
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->SHR(slm_offset, slm_index, GenRegister::immud(12));
+      GenRegister sr0 = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+                                    GEN_ARF_STATE,
+                                    1,
+                                    GEN_TYPE_UD,
+                                    GEN_VERTICAL_STRIDE_8,
+                                    GEN_WIDTH_8,
+                                    GEN_HORIZONTAL_STRIDE_1);
+      p->SHR(sr0, slm_index, GenRegister::immud(16));
     p->pop();
   }
 
-  void Gen75Context::allocSLMOffsetCurbe(void) {
-    if(fn.getUseSLM())
-      allocCurbeReg(ir::ocl::slmoffset, GBE_CURBE_SLM_OFFSET);
-  }
-
   uint32_t Gen75Context::alignScratchSize(uint32_t size){
     if(size == 0)
       return 0;
diff --git a/backend/src/backend/gen75_context.hpp b/backend/src/backend/gen75_context.hpp
index 19fadb2..deaefc2 100644
--- a/backend/src/backend/gen75_context.hpp
+++ b/backend/src/backend/gen75_context.hpp
@@ -55,7 +55,6 @@ namespace gbe
 
   private:
     virtual void emitSLMOffset(void);
-    virtual void allocSLMOffsetCurbe(void);
     virtual void newSelection(void);
   };
 }
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 6e138e8..776c92b 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -39,10 +39,6 @@ namespace gbe
     return;
   }
 
-  void Gen8Context::allocSLMOffsetCurbe(void) {
-    return;
-  }
-
   uint32_t Gen8Context::alignScratchSize(uint32_t size){
     if(size == 0)
       return 0;
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index 925e080..49193f5 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -55,7 +55,6 @@ namespace gbe
 
   private:
     virtual void emitSLMOffset(void);
-    virtual void allocSLMOffsetCurbe(void);
     virtual void newSelection(void);
   };
 }
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index a473451..3fab9c8 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1816,7 +1816,6 @@ namespace gbe
     allocCurbeReg(one, GBE_CURBE_ONE);
     if (stackUse.size() != 0)
       allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
-    allocSLMOffsetCurbe();
     // Go over the arguments and find the related patch locations
     const uint32_t argNum = fn.argNum();
     for (uint32_t argID = 0u; argID < argNum; ++argID) {
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 41489a0..45347b9 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -214,8 +214,6 @@ namespace gbe
     void buildPatchList(void);
     /*! Calc the group's slm offset from R0.0, to work around HSW SLM bug*/
     virtual void emitSLMOffset(void) { };
-    /*! allocate group's slm offset in curbe, only for HSW */
-    virtual void allocSLMOffsetCurbe(void) { };
     /*! new selection of device */
     virtual void newSelection(void);
     friend class GenRegAllocator;               //!< need to access errCode directly.
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index cd968c0..f83edf5 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -340,9 +340,6 @@ namespace gbe
     INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov);
     /*! spill a register (insert spill/unspill instructions) */
     INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
-    /*! should add per thread offset to the local memory address when load/store/atomic */
-    bool needPatchSLMAddr() const { return patchSLMAddr; }
-    void setPatchSLMAddr(bool b) { patchSLMAddr = b; }
     bool has32X32Mul() const { return bHas32X32Mul; }
     void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
     /*! indicate whether a register is a scalar/uniform register. */
@@ -629,7 +626,6 @@ namespace gbe
   private:
     /*! Auxiliary label for if/endif. */ 
     uint16_t currAuxLabel;
-    bool patchSLMAddr;
     bool bHas32X32Mul;
     INLINE ir::LabelIndex newAuxLabel()
     {
@@ -670,7 +666,7 @@ namespace gbe
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
     stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
-    patchSLMAddr(false), bHas32X32Mul(false)
+    bHas32X32Mul(false)
   {
     const ir::Function &fn = ctx.getFunction();
     this->regNum = fn.regNum();
@@ -1464,14 +1460,13 @@ namespace gbe
   } selectionLibraryInitializer;
 
   bool Selection::Opaque::isRoot(const ir::Instruction &insn) const {
-    if (insn.getDstNum() > 1 ||
-        insn.hasSideEffect() ||
+    if (insn.hasSideEffect() ||
         insn.isMemberOf<ir::BranchInstruction>() ||
         insn.isMemberOf<ir::LabelInstruction>())
     return true;
 
     // No side effect, not a branch and no destination? Impossible
-    GBE_ASSERT(insn.getDstNum() == 1);
+    GBE_ASSERT(insn.getDstNum() >= 1);
 
     // Root if alive outside the block.
     // XXX we should use Value and not registers in liveness info
@@ -1747,7 +1742,6 @@ namespace gbe
   }
 
   Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
-    this->opaque->setPatchSLMAddr(true);
   }
 
   Selection8::Selection8(GenContext &ctx) : Selection(ctx) {
@@ -2844,7 +2838,6 @@ namespace gbe
                    vector<GenRegister> &dst2,
                    GenRegister addr,
                    uint32_t valueNum,
-                   ir::AddressSpace space,
                    ir::BTI bti) const
     {
       for (uint32_t x = 0; x < bti.count; x++) {
@@ -2852,7 +2845,7 @@ namespace gbe
           for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
             dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
 
-        GenRegister temp = getRelativeAddress(sel, addr, space, bti.bti[x]);
+        GenRegister temp = getRelativeAddress(sel, addr, bti.bti[x]);
         sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
         if(x > 0) {
           sel.push();
@@ -2878,7 +2871,7 @@ namespace gbe
       vector<GenRegister> dst2(valueNum);
       for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
-      readDWord(sel, dst, dst2, addr, valueNum, insn.getAddressSpace(), bti);
+      readDWord(sel, dst, dst2, addr, valueNum, bti);
     }
 
     void emitDWordGather(Selection::Opaque &sel,
@@ -2925,7 +2918,7 @@ namespace gbe
       GBE_ASSERT(valueNum == 1);
       GBE_ASSERT(bti.count == 1);
       vector<GenRegister> dst(valueNum);
-      GenRegister tmpAddr = getRelativeAddress(sel, addr, insn.getAddressSpace(), bti.bti[0]);
+      GenRegister tmpAddr = getRelativeAddress(sel, addr, bti.bti[0]);
       for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
       sel.READ64(tmpAddr, dst.data(), valueNum, bti.bti[0]);
@@ -2996,7 +2989,7 @@ namespace gbe
         tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]);
       }
 
-      readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+      readDWord(sel, tmp, tmp2, address, tmpRegNum, bti);
 
       for(uint32_t i = 0; i < tmpRegNum; i++) {
         unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
@@ -3099,7 +3092,7 @@ namespace gbe
               sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
             sel.pop();
           }
-          readDWord(sel, t1, t2, alignedAddr, width, insn.getAddressSpace(), bti);
+          readDWord(sel, t1, t2, alignedAddr, width, bti);
           remainedReg -= width;
           pos += width;
         } while(remainedReg);
@@ -3124,7 +3117,7 @@ namespace gbe
           if (x > 0)
             tmp = sel.selReg(sel.reg(family, simdWidth == 1), insn.getValueType());
 
-          GenRegister addr = getRelativeAddress(sel, address, insn.getAddressSpace(), bti.bti[x]);
+          GenRegister addr = getRelativeAddress(sel, address, bti.bti[x]);
           readByteAsDWord(sel, elemSize, addr, tmp, simdWidth, bti.bti[x]);
           if (x > 0) {
             sel.push();
@@ -3151,8 +3144,8 @@ namespace gbe
       sel.INDIRECT_MOVE(dst, src);
     }
 
-    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, ir::AddressSpace space, uint8_t bti) const {
-      if(space == ir::MEM_LOCAL || space == ir::MEM_CONSTANT)
+    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti) const {
+      if (bti == 0xfe || bti == BTI_CONSTANT)
         return address;
 
       sel.push();
@@ -3162,11 +3155,18 @@ namespace gbe
       sel.pop();
       return temp;
     }
+    // check whether all binded table index point to constant memory
+    INLINE bool isAllConstant(const ir::BTI &bti) const {
+      for (int x = 0; x < bti.count; x++) {
+         if (bti.bti[x] != BTI_CONSTANT)
+           return false;
+      }
+      return true;
+    }
 
     INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
       using namespace ir;
       GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
-      const AddressSpace space = insn.getAddressSpace();
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
                  insn.getAddressSpace() == MEM_CONSTANT ||
                  insn.getAddressSpace() == MEM_PRIVATE ||
@@ -3174,19 +3174,10 @@ namespace gbe
       //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
-        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
-        sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
-        address = temp;
-      }
-      BTI bti;
-      if (space == MEM_CONSTANT || space == MEM_LOCAL) {
-        bti.bti[0] = space == MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
-        bti.count = 1;
-      } else {
-        bti = insn.getBTI();
-      }
-      if (space == MEM_CONSTANT) {
+      const BTI &bti = insn.getBTI();
+      bool allConstant = isAllConstant(bti);
+
+      if (allConstant) {
         // XXX TODO read 64bit constant through constant cache
         // Per HW Spec, constant cache messages can read at least DWORD data.
         // So, byte/short data type, we have to read through data cache.
@@ -3291,40 +3282,34 @@ namespace gbe
       }
     }
 
+    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti) const {
+      if(bti == 0xfe)
+        return address;
+
+      sel.push();
+        sel.curr.noMask = 1;
+        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
+      sel.pop();
+      return temp;
+    }
+
     INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
-      const AddressSpace space = insn.getAddressSpace();
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
       GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
-      if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
-        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
-        sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
-        address = temp;
-      }
-      if(space == MEM_LOCAL) {
+
+      BTI bti = insn.getBTI();
+      for (int x = 0; x < bti.count; x++) {
+        GenRegister temp = getRelativeAddress(sel, address, bti.bti[x]);
         if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-          this->emitWrite64(sel, insn, address, 0xfe);
+          this->emitWrite64(sel, insn, temp, bti.bti[x]);
         else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-          this->emitUntypedWrite(sel, insn, address,  0xfe);
-        else
-          this->emitByteScatter(sel, insn, elemSize, address, 0xfe);
-      } else {
-        BTI bti = insn.getBTI();
-        for (int x = 0; x < bti.count; x++) {
-          GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
-          sel.push();
-            sel.curr.noMask = 1;
-            sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti.bti[x]), ir::TYPE_U32)));
-          sel.pop();
-          if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-            this->emitWrite64(sel, insn, temp, bti.bti[x]);
-          else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-            this->emitUntypedWrite(sel, insn, temp,  bti.bti[x]);
-          else {
-            this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x]);
-          }
+          this->emitUntypedWrite(sel, insn, temp,  bti.bti[x]);
+        else {
+          this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x]);
         }
       }
       return true;
@@ -3734,11 +3719,6 @@ namespace gbe
       GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
       GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
       if(space == MEM_LOCAL) {
-        if (sel.needPatchSLMAddr()) {
-          GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-          sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
-          src0 = temp;
-        }
         sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
       } else {
         ir::BTI b = insn.getBTI();
@@ -3996,8 +3976,8 @@ namespace gbe
         msgLen = srcNum;
       }
       // We switch to a fixup bti for linear filter on a image1d array sampling.
-      uint32_t bti = insn.getImageIndex() + (insn.getSamplerOffset() == 2 ? BTI_MAX_IMAGE_NUM : 0);
-      if (bti > 253) {
+      uint32_t bti = insn.getImageIndex() + (insn.getSamplerOffset() == 2 ? BTI_WORKAROUND_IMAGE_OFFSET : 0);
+      if (bti > BTI_MAX_ID) {
         std::cerr << "Too large bti " << bti;
         return false;
       }
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 05d830a..2e391e1 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -22,6 +22,7 @@
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
+#ifdef GBE_COMPILER_AVAILABLE
 #include "llvm/Config/llvm-config.h"
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
 #include "llvm/LLVMContext.h"
@@ -45,6 +46,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/IRReader/IRReader.h"
+#endif
 
 #include "backend/program.h"
 #include "backend/gen_program.h"
@@ -56,10 +58,12 @@
 #include "backend/gen/gen_mesa_disasm.h"
 #include "backend/gen_reg_allocation.hpp"
 #include "ir/unit.hpp"
+
+#ifdef GBE_COMPILER_AVAILABLE
 #include "llvm/llvm_to_gen.hpp"
 #include "llvm/llvm_gen_backend.hpp"
-
 #include <clang/CodeGen/CodeGenAction.h>
+#endif
 
 #include <cstring>
 #include <sstream>
@@ -135,7 +139,6 @@ namespace gbe {
     bool limitRegisterPressure;
   } codeGenStrategy[] = {
     {16, 0, false},
-    {16, 10, false},
     {8, 0, false},
     {8, 8, false},
     {8, 16, false},
@@ -151,9 +154,9 @@ namespace gbe {
     uint32_t codeGen = 0;
     GenContext *ctx = NULL;
     if (fn->getSimdWidth() == 8) {
-      codeGen = 2;
+      codeGen = 1;
     } else if (fn->getSimdWidth() == 16) {
-      codeGenNum = 2;
+      codeGenNum = 1;
     } else if (fn->getSimdWidth() == 0) {
       codeGen = 0;
     } else
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 971071e..38ce9c8 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -31,12 +31,16 @@
 #include "ir/value.hpp"
 #include "ir/unit.hpp"
 #include "ir/printf.hpp"
+
+#ifdef GBE_COMPILER_AVAILABLE
 #include "llvm/llvm_to_gen.hpp"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Threading.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/IR/LLVMContext.h"
+#endif
+
 #include <cstring>
 #include <algorithm>
 #include <fstream>
@@ -46,6 +50,7 @@
 #include <unistd.h>
 #include <mutex>
 
+#ifdef GBE_COMPILER_AVAILABLE
 /* Not defined for LLVM 3.0 */
 #if !defined(LLVM_VERSION_MAJOR)
 #define LLVM_VERSION_MAJOR 3
@@ -75,6 +80,8 @@
 #endif  /* LLVM_VERSION_MINOR <= 2 */
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/Support/raw_ostream.h>
+#endif
+
 #include "src/GBEConfig.h"
 
 namespace gbe {
@@ -99,7 +106,8 @@ namespace gbe {
 
   Program::Program(void) : constantSet(NULL) {}
   Program::~Program(void) {
-    for (auto &kernel : kernels) GBE_DELETE(kernel.second);
+    for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it)
+      GBE_DELETE(it->second);
     if (constantSet) delete constantSet;
   }
 
@@ -184,8 +192,8 @@ namespace gbe {
     }
 
     OUT_UPDATE_SZ(ker_num);
-    for (auto ker : kernels) {
-      size_t sz = ker.second->serializeToBin(outs);
+    for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it) {
+      size_t sz = it->second->serializeToBin(outs);
       if (!sz)
         return 0;
 
@@ -268,7 +276,8 @@ namespace gbe {
     }
 
     OUT_UPDATE_SZ(patches.size());
-    for (auto patch : patches) {
+    for (size_t i = 0; i < patches.size(); ++i) {
+      const PatchInfo& patch = patches[i];
       unsigned int tmp;
       tmp = patch.type;
       OUT_UPDATE_SZ(tmp);
@@ -439,8 +448,8 @@ namespace gbe {
       constantSet->printStatus(indent + 4, outs);
     }
 
-    for (auto ker : kernels) {
-      ker.second->printStatus(indent + 4, outs);
+    for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it) {
+      it->second->printStatus(indent + 4, outs);
     }
 
     outs << spaces << "================ End Program ================" << "\n";
@@ -474,7 +483,8 @@ namespace gbe {
 
     outs << spaces_nl << "  Patches Number is " << patches.size() << "\n";
     num = 0;
-    for (auto patch : patches) {
+    for (size_t i = 0; i < patches.size(); ++i) {
+      PatchInfo& patch = patches[i];
       num++;
       outs << spaces_nl << "  patch " << num << ":\n";
       outs << spaces_nl << "      type value: "<< patch.type << "\n";
@@ -1058,12 +1068,13 @@ namespace gbe {
 
   static void kernelOutputPrintf(void * printf_info, void* index_addr,
                                  void* buf_addr, size_t global_wk_sz0,
-                                 size_t global_wk_sz1, size_t global_wk_sz2)
+                                 size_t global_wk_sz1, size_t global_wk_sz2,
+                                 size_t output_sz)
   {
     if (printf_info == NULL) return;
     ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
     ps->outputPrintf(index_addr, buf_addr, global_wk_sz0,
-                         global_wk_sz1, global_wk_sz2);
+                         global_wk_sz1, global_wk_sz2, output_sz);
   }
 
   static void kernelGetCompileWorkGroupSize(gbe_kernel gbeKernel, size_t wg_size[3]) {
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index a457d52..dc5662f 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -66,8 +66,10 @@ enum gbe_get_arg_info_value {
 #define BTI_CONSTANT 0
 #define BTI_PRIVATE 1
 #define BTI_RESERVED_NUM 2
-#define BTI_MAX_IMAGE_NUM 128
-#define BTI_MAX_ID (BTI_MAX_IMAGE_NUM + BTI_RESERVED_NUM - 1)
+#define BTI_MAX_READ_IMAGE_ARGS 128
+#define BTI_MAX_WRITE_IMAGE_ARGS  8
+#define BTI_WORKAROUND_IMAGE_OFFSET 128
+#define BTI_MAX_ID 253
 
 /*! Constant buffer values (ie values to setup in the constant buffer) */
 enum gbe_curbe_type {
@@ -155,7 +157,7 @@ typedef uint32_t (gbe_get_printf_sizeof_size_cb)(void* printf_info);
 extern gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size;
 
 typedef void (gbe_output_printf_cb) (void* printf_info, void* index_addr, void* buf_addr,
-                         size_t global_wk_sz0, size_t global_wk_sz1, size_t global_wk_sz2);
+              size_t global_wk_sz0, size_t global_wk_sz1, size_t global_wk_sz2, size_t outbuf_sz);
 extern gbe_output_printf_cb* gbe_output_printf;
 
 /*! Create a new program from the given source code (zero terminated string) */
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 4e6b275..cff2463 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -32,7 +32,6 @@
 #include "ir/function.hpp"
 #include "ir/printf.hpp"
 #include "ir/sampler.hpp"
-#include "sys/hash_map.hpp"
 #include "sys/vector.hpp"
 #include <string>
 
@@ -156,10 +155,10 @@ namespace gbe {
     }
 
     void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
-                      size_t global_wk_sz1, size_t global_wk_sz2) {
+                      size_t global_wk_sz1, size_t global_wk_sz2, size_t output_sz) {
       if(printfSet)
         printfSet->outputPrintf(index_addr, buf_addr, global_wk_sz0,
-                                global_wk_sz1, global_wk_sz2);
+                                global_wk_sz1, global_wk_sz2, output_sz);
     }
 
     ir::FunctionArgument::InfoFromLLVM* getArgInfo(uint32_t id) const { return &args[id].info; }
@@ -251,7 +250,7 @@ namespace gbe {
     uint32_t getKernelNum(void) const { return kernels.size(); }
     /*! Get the kernel from its name */
     Kernel *getKernel(const std::string &name) const {
-      auto it = kernels.find(name);
+      map<std::string, Kernel*>::const_iterator it = kernels.find(name);
       if (it == kernels.end())
         return NULL;
       else
@@ -261,9 +260,9 @@ namespace gbe {
     Kernel *getKernel(uint32_t ID) const {
       uint32_t currID = 0;
       Kernel *kernel = NULL;
-      for (const auto &pair : kernels) {
+      for (map<std::string, Kernel*>::const_iterator it = kernels.begin(); it != kernels.end(); ++it) {
         if (currID == ID) {
-          kernel = pair.second;
+          kernel = it->second;
           break;
         }
         currID++;
@@ -307,7 +306,7 @@ namespace gbe {
     /*! Allocate an empty kernel. */
     virtual Kernel *allocateKernel(const std::string &name) = 0;
     /*! Kernels sorted by their name */
-    hash_map<std::string, Kernel*> kernels;
+    map<std::string, Kernel*> kernels;
     /*! Global (constants) outside any kernel */
     ir::ConstantSet *constantSet;
     /*! Use custom allocators */
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index f8e45fb..f4be488 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -310,7 +310,7 @@ int main (int argc, const char **argv)
     deque<int> used_index;
 
     if (argc < 2) {
-        cout << "Usage: kernel_path [-pbuild_parameter]\n[-obin_path]" << endl;
+        cout << "Usage: kernel_path [-pbuild_parameter] [-obin_path] [-tgen_pci_id]" << endl;
         return 0;
     }
 
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
index 6ef8ea6..fa4e14a 100644
--- a/backend/src/ir/constant.cpp
+++ b/backend/src/ir/constant.cpp
@@ -56,7 +56,8 @@ namespace ir {
     }
 
     OUT_UPDATE_SZ(constants.size());
-    for (auto const &cnst : constants) {
+    for (size_t i = 0; i < constants.size(); ++i) {
+      Constant& cnst = constants[i];
       size_t bytes = sizeof(cnst.getName().size())        //name length self
                      + cnst.getName().size()*sizeof(char) //name
                      + sizeof(cnst.getSize())             //size
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
index f5f172d..0891d7b 100644
--- a/backend/src/ir/constant.hpp
+++ b/backend/src/ir/constant.hpp
@@ -76,12 +76,13 @@ namespace ir {
     Constant& getConstant(size_t i) { return constants[i]; }
     /*! Get a special constant */
     Constant& getConstant(const std::string & name) {
-      for (auto & c : constants) {
+      for (size_t i = 0; i < constants.size(); ++i) {
+        Constant& c = constants[i];
         if (c.getName() == name)
           return c;
       }
       GBE_ASSERT(false);
-      return *(Constant *)nullptr;
+      return *(Constant *)NULL;
     }
     /*! Number of bytes of serialized constant data */
     size_t getDataSize(void) const { return data.size(); }
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index 485d558..cf5109d 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -74,11 +74,11 @@ namespace ir {
       return fn->newImmediate(imm);
     }
     /*! Create a new immediate value */
-    INLINE ImmediateIndex newImmediate(vector<ImmediateIndex>indexVector) {
+    INLINE ImmediateIndex newImmediate(vector<ImmediateIndex>indexVector, Type dstType) {
       vector<const Immediate*> immVector;
       for( uint32_t i = 0; i < indexVector.size(); i++)
         immVector.push_back(&fn->getImmediate(indexVector[i]));
-      const Immediate imm(immVector);
+      const Immediate imm(immVector, dstType);
       return fn->newImmediate(imm);
     }
     /*! Create an integer immediate value */
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 0381095..2a3d067 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -180,6 +180,35 @@ namespace ir {
       std::string accessQual;
       std::string typeQual;
       std::string argName; // My different from arg->getName()
+
+      bool isImage1dT() const {
+        return typeName.compare("image1d_t") == 0;
+      }
+      bool isImage1dArrayT() const {
+        return typeName.compare("image1d_array_t") == 0;
+      }
+      bool isImage1dBufferT() const {
+        return typeName.compare("image1d_buffer_t") == 0;
+      }
+      bool isImage2dT() const {
+        return typeName.compare("image2d_t") == 0;
+      }
+      bool isImage2dArrayT() const {
+        return typeName.compare("image2d_array_t") == 0;
+      }
+      bool isImage3dT() const {
+        return typeName.compare("image3d_t") == 0;
+      }
+
+      bool isImageType() const {
+        return isImage1dT() || isImage1dArrayT() || isImage1dBufferT() ||
+               isImage2dT() || isImage2dArrayT() || isImage3dT();
+      }
+
+      bool isSamplerType() const {
+        return typeName.compare("sampler_t") == 0;
+      }
+
     };
 
     /*! Create a function input argument */
@@ -324,12 +353,20 @@ namespace ir {
      *  this is not an input argument
      */
     INLINE const FunctionArgument *getArg(const Register &reg) const {
-      for (auto arg : args) if (arg->reg == reg) return arg;
+      for (size_t i = 0; i < args.size(); ++i) {
+        const FunctionArgument* arg = args[i];
+        if (arg->reg == reg)
+          return arg;
+      }
       return NULL;
     }
 
     INLINE FunctionArgument *getArg(const Register &reg) {
-      for (auto arg : args) if (arg->reg == reg) return arg;
+      for (size_t i = 0; i < args.size(); ++i) {
+        FunctionArgument* arg = args[i];
+        if (arg->reg == reg)
+          return arg;
+      }
       return NULL;
     }
 
@@ -388,12 +425,18 @@ namespace ir {
     /*! Apply the given functor on all basic blocks */
     template <typename T>
     INLINE void foreachBlock(const T &functor) const {
-      for (auto block : blocks) functor(*block);
+      for (size_t i = 0; i < blocks.size(); ++i) {
+        BasicBlock* block = blocks[i];
+        functor(*block);
+      }
     }
     /*! Apply the given functor on all instructions */
     template <typename T>
     INLINE void foreachInstruction(const T &functor) const {
-      for (auto block : blocks) block->foreach(functor);
+      for (size_t i = 0; i < blocks.size(); ++i) {
+        BasicBlock* block = blocks[i];
+        block->foreach(functor);
+      }
     }
     /*! Does it use SLM */
     INLINE bool getUseSLM(void) const { return this->useSLM; }
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index d28a72a..8976a68 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -58,7 +58,7 @@ namespace ir {
 
   void ImageSet::appendInfo(ImageInfoKey key, uint32_t offset)
   {
-    auto it = indexMap.find(key.index);
+    map<uint32_t, struct ImageInfo *>::iterator it = indexMap.find(key.index);
     assert(it != indexMap.end());
     struct ImageInfo *imageInfo = it->second;
     setInfoOffset4Type(imageInfo, key.type, offset);
@@ -67,8 +67,8 @@ namespace ir {
   void ImageSet::clearInfo()
   {
     struct ImageInfo *imageInfo;
-    for(auto &it : indexMap) {
-      imageInfo = it.second;
+    for (map<uint32_t, struct ImageInfo *>::iterator it = indexMap.begin(); it != indexMap.end(); ++it) {
+      imageInfo = it->second;
       imageInfo->wSlot = -1;
       imageInfo->hSlot = -1;
       imageInfo->depthSlot = -1;
@@ -78,7 +78,7 @@ namespace ir {
   }
   int32_t ImageSet::getInfoOffset(ImageInfoKey key) const
   {
-    auto it = indexMap.find(key.index);
+    map<uint32_t, struct ImageInfo *>::const_iterator it = indexMap.find(key.index);
     if (it == indexMap.end())
       return -1;
     struct ImageInfo *imageInfo = it->second;
@@ -87,20 +87,20 @@ namespace ir {
 
   uint32_t ImageSet::getIdx(const Register imageReg) const
   {
-    auto it = regMap.find(imageReg);
+    map<Register, struct ImageInfo *>::const_iterator it = regMap.find(imageReg);
     GBE_ASSERT(it != regMap.end());
     return it->second->idx;
   }
 
   void ImageSet::getData(struct ImageInfo *imageInfos) const {
       int id = 0;
-      for(auto &it : regMap)
-        imageInfos[id++] = *it.second;
+      for (map<Register, struct ImageInfo *>::const_iterator it = regMap.begin(); it != regMap.end(); ++it)
+        imageInfos[id++] = *(it->second);
   }
 
   ImageSet::~ImageSet() {
-    for(auto &it : regMap)
-      GBE_DELETE(it.second);
+    for (map<Register, struct ImageInfo *>::const_iterator it = regMap.begin(); it != regMap.end(); ++it)
+      GBE_DELETE(it->second);
   }
 
 #define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
@@ -113,29 +113,29 @@ namespace ir {
     OUT_UPDATE_SZ(magic_begin);
 
     OUT_UPDATE_SZ(regMap.size());
-    for (auto iter : regMap) {
-      OUT_UPDATE_SZ(iter.first);
-      OUT_UPDATE_SZ(iter.second->arg_idx);
-      OUT_UPDATE_SZ(iter.second->idx);
-      OUT_UPDATE_SZ(iter.second->wSlot);
-      OUT_UPDATE_SZ(iter.second->hSlot);
-      OUT_UPDATE_SZ(iter.second->depthSlot);
-      OUT_UPDATE_SZ(iter.second->dataTypeSlot);
-      OUT_UPDATE_SZ(iter.second->channelOrderSlot);
-      OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+    for (map<Register, struct ImageInfo *>::const_iterator it = regMap.begin(); it != regMap.end(); ++it) {
+      OUT_UPDATE_SZ(it->first);
+      OUT_UPDATE_SZ(it->second->arg_idx);
+      OUT_UPDATE_SZ(it->second->idx);
+      OUT_UPDATE_SZ(it->second->wSlot);
+      OUT_UPDATE_SZ(it->second->hSlot);
+      OUT_UPDATE_SZ(it->second->depthSlot);
+      OUT_UPDATE_SZ(it->second->dataTypeSlot);
+      OUT_UPDATE_SZ(it->second->channelOrderSlot);
+      OUT_UPDATE_SZ(it->second->dimOrderSlot);
     }
 
     OUT_UPDATE_SZ(indexMap.size());
-    for (auto iter : indexMap) {
-      OUT_UPDATE_SZ(iter.first);
-      OUT_UPDATE_SZ(iter.second->arg_idx);
-      OUT_UPDATE_SZ(iter.second->idx);
-      OUT_UPDATE_SZ(iter.second->wSlot);
-      OUT_UPDATE_SZ(iter.second->hSlot);
-      OUT_UPDATE_SZ(iter.second->depthSlot);
-      OUT_UPDATE_SZ(iter.second->dataTypeSlot);
-      OUT_UPDATE_SZ(iter.second->channelOrderSlot);
-      OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+    for (map<uint32_t, struct ImageInfo *>::iterator it = indexMap.begin(); it != indexMap.end(); ++it) {
+      OUT_UPDATE_SZ(it->first);
+      OUT_UPDATE_SZ(it->second->arg_idx);
+      OUT_UPDATE_SZ(it->second->idx);
+      OUT_UPDATE_SZ(it->second->wSlot);
+      OUT_UPDATE_SZ(it->second->hSlot);
+      OUT_UPDATE_SZ(it->second->depthSlot);
+      OUT_UPDATE_SZ(it->second->dataTypeSlot);
+      OUT_UPDATE_SZ(it->second->channelOrderSlot);
+      OUT_UPDATE_SZ(it->second->dimOrderSlot);
     }
 
     OUT_UPDATE_SZ(magic_end);
@@ -211,31 +211,31 @@ namespace ir {
     outs << spaces_nl  << "  ImageSet Map: [reg, arg_idx, idx, wSlot, hSlot, depthSlot, "
                 "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
     outs << spaces_nl << "     regMap size: " << regMap.size() << "\n";
-    for (auto iter : regMap) {
-      outs << spaces_nl << "         [" << iter.first << ", "
-           << iter.second->arg_idx << ", "
-           << iter.second->idx << ", "
-           << iter.second->wSlot << ", "
-           << iter.second->hSlot << ", "
-           << iter.second->depthSlot << ", "
-           << iter.second->dataTypeSlot << ", "
-           << iter.second->channelOrderSlot << ", "
-           << iter.second->dimOrderSlot << "]" << "\n";
+    for (map<Register, struct ImageInfo *>::const_iterator it = regMap.begin(); it != regMap.end(); ++it) {
+      outs << spaces_nl << "         [" << it->first << ", "
+           << it->second->arg_idx << ", "
+           << it->second->idx << ", "
+           << it->second->wSlot << ", "
+           << it->second->hSlot << ", "
+           << it->second->depthSlot << ", "
+           << it->second->dataTypeSlot << ", "
+           << it->second->channelOrderSlot << ", "
+           << it->second->dimOrderSlot << "]" << "\n";
    }
 
    outs << spaces_nl << "  ImageSet Map: [index, arg_idx, idx, wSlot, hSlot, depthSlot, "
            "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
    outs << spaces_nl << "     regMap size: " << indexMap.size() << "\n";
-   for (auto iter : indexMap) {
-     outs << spaces_nl << "         [" << iter.first << ", "
-          << iter.second->arg_idx << ", "
-          << iter.second->idx << ", "
-          << iter.second->wSlot << ", "
-          << iter.second->hSlot << ", "
-          << iter.second->depthSlot << ", "
-          << iter.second->dataTypeSlot << ", "
-          << iter.second->channelOrderSlot << ", "
-          << iter.second->dimOrderSlot << ", " << "\n";
+   for (map<uint32_t, struct ImageInfo *>::iterator it = indexMap.begin(); it != indexMap.end(); ++it) {
+     outs << spaces_nl << "         [" << it->first << ", "
+          << it->second->arg_idx << ", "
+          << it->second->idx << ", "
+          << it->second->wSlot << ", "
+          << it->second->hSlot << ", "
+          << it->second->depthSlot << ", "
+          << it->second->dataTypeSlot << ", "
+          << it->second->channelOrderSlot << ", "
+          << it->second->dimOrderSlot << ", " << "\n";
    }
 
    outs << spaces << "------------- End ImageSet -------------" << "\n";
diff --git a/backend/src/ir/immediate.cpp b/backend/src/ir/immediate.cpp
index 7d26925..1aac9bc 100644
--- a/backend/src/ir/immediate.cpp
+++ b/backend/src/ir/immediate.cpp
@@ -132,7 +132,7 @@ using namespace ir;
     }
 
     Immediate Immediate::less (const Immediate &left, const Immediate &right) {
-      GBE_ASSERT(left.getType() > TYPE_BOOL && left.getType() <= TYPE_U64);
+      GBE_ASSERT(left.getType() > TYPE_BOOL && left.getType() <= TYPE_DOUBLE);
       switch (left.getType()) {
         default:
           GBE_ASSERT(0);
@@ -149,6 +149,30 @@ using namespace ir;
       }
     }
 
+    Immediate Immediate::extract (const Immediate &left, const Immediate &right, Type dstType) {
+      GBE_ASSERT(left.getType() > TYPE_BOOL && left.getType() <= TYPE_DOUBLE);
+      GBE_ASSERT(dstType == left.getType());
+      uint32_t index = right.getIntegerValue();
+      GBE_ASSERT(index >= 0 && index < left.getElemNum());
+      if (left.type != IMM_TYPE_COMP) {
+        switch (left.getType()) {
+          default:
+            GBE_ASSERT(0);
+          case TYPE_BOOL:   return Immediate(left.data.b[index]);
+          case TYPE_S8:     return Immediate(left.data.s8[index]);
+          case TYPE_U8:     return Immediate(left.data.u8[index]);
+          case TYPE_S16:    return Immediate(left.data.s16[index]);
+          case TYPE_U16:    return Immediate(left.data.u16[index]);
+          case TYPE_S32:    return Immediate(left.data.s32[index]);
+          case TYPE_U32:    return Immediate(left.data.u32[index]);
+          case TYPE_S64:    return Immediate(left.data.s64[index]);
+          case TYPE_U64:    return Immediate(left.data.u64[index]);
+          case TYPE_FLOAT:  return Immediate(left.data.f32[index]);
+          case TYPE_DOUBLE: return Immediate(left.data.f64[index]);
+        }
+      } else
+        return *left.data.immVec[index];
+    }
 
     Immediate::Immediate(ImmOpCode op, const Immediate &left, const Immediate &right, Type dstType) {
       switch (op) {
@@ -180,7 +204,7 @@ using namespace ir;
         case IMM_LSHR:
         {
           if (left.getElemNum() == 1)
-            lshr(left, right);
+            *this = lshr(left, right);
           else {
             GBE_ASSERT(right.getIntegerValue() <= (left.getElemNum() * left.getTypeSize() * 8));
             GBE_ASSERT(right.getIntegerValue() % (left.getTypeSize() * 8) == 0);
@@ -216,16 +240,17 @@ using namespace ir;
         case IMM_OLT: *this = less(left, right); break;
         case IMM_OGT: *this = left > right; break;
         case IMM_ORD: *this = (left == left) && (right == right); break;
+        case IMM_EXTRACT: *this = extract(left, right, dstType); break;
       }
       // If the dst type is large int, we will not change the imm type to large int.
       GBE_ASSERT(type == (ImmType)dstType || dstType == TYPE_LARGE_INT || dstType == TYPE_BOOL);
     }
 
-    Immediate::Immediate(const vector<const Immediate*> immVec) {
+    Immediate::Immediate(const vector<const Immediate*> immVec, Type dstType) {
       if (immVec.size() == 1) {
         *this = *immVec[0];
       } else if (!(immVec[0]->isCompType()) && immVec[0]->elemNum == 1) {
-        this->type = immVec[0]->type;
+        this->type = (ImmType)dstType;
         this->elemNum = immVec.size();
         if (immVec[0]->getTypeSize() * immVec.size() < 8)
           this->data.p = &this->defaultData;
@@ -238,6 +263,7 @@ using namespace ir;
           p += immVec[i]->getTypeSize();
         }
       } else {
+        GBE_ASSERT(0);
         this->type = IMM_TYPE_COMP;
         if (immVec.size() * sizeof(Immediate*) < 8)
           this->data.p = &this->defaultData;
diff --git a/backend/src/ir/immediate.hpp b/backend/src/ir/immediate.hpp
index 1f18a4c..756806b 100644
--- a/backend/src/ir/immediate.hpp
+++ b/backend/src/ir/immediate.hpp
@@ -56,7 +56,11 @@ namespace ir {
     IMM_FPTOUI,
     IMM_FPTOSI,
     IMM_SITOFP,
-    IMM_UITOFP
+    IMM_UITOFP,
+    IMM_EXTRACT,
+    IMM_SEXT,
+    IMM_ZEXT,
+    IMM_FPEXT
   } ImmOpCode;
 
   typedef enum {
@@ -80,6 +84,8 @@ namespace ir {
   public:
     INLINE Immediate(void) { }
 
+    Immediate & operator= (const Immediate &);
+
     INLINE Type getType(void) const {
       return (Type)type;
     }
@@ -158,7 +164,7 @@ namespace ir {
     DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE, elemNum)
 #undef DECL_CONSTRUCTOR
 
-    Immediate(const vector<const Immediate*> immVec);
+    Immediate(const vector<const Immediate*> immVec, Type dstType);
 
     INLINE int64_t getIntegerValue(void) const {
       switch (type) {
@@ -176,6 +182,22 @@ namespace ir {
       }
     }
 
+    INLINE uint64_t getUnsignedIntegerValue(void) const {
+      switch (type) {
+        default:
+          GBE_ASSERT(0 && "Invalid immediate type.\n");
+        case TYPE_BOOL: return *data.b;
+        case TYPE_S8:   return *data.s8;
+        case TYPE_U8:   return *data.u8;
+        case TYPE_S16:  return *data.s16;
+        case TYPE_U16:  return *data.u16;
+        case TYPE_S32:  return *data.s32;
+        case TYPE_U32:  return *data.u32;
+        case TYPE_S64:  return *data.s64;
+        case TYPE_U64:  return *data.u64;
+      }
+    }
+
     INLINE float getFloatValue(void) const {
       GBE_ASSERT(type == IMM_TYPE_FLOAT);
       return *data.f32;
@@ -208,13 +230,54 @@ namespace ir {
           copy(other, 0, 1);
           break;
         case IMM_BITCAST:
-          *this = other;
-          type = (ImmType)dstType;
+          if (other.type != IMM_TYPE_COMP) {
+            *this = other;
+            type = (ImmType)dstType;
+          } else {
+            vector<const Immediate*> immVec;
+            for(uint32_t i = 0; i < other.getElemNum(); i++)
+              immVec.push_back(other.data.immVec[i]);
+            *this = Immediate(immVec, dstType);
+          }
           break;
         case IMM_FPTOUI: *this = Immediate((uint32_t)*other.data.f32); break;
         case IMM_FPTOSI: *this = Immediate((int32_t)*other.data.f32); break;
         case IMM_UITOFP: *this = Immediate((float)*other.data.u32); break;
         case IMM_SITOFP: *this = Immediate((float)*other.data.s32); break;
+        case IMM_SEXT: // sign-extend the integer (or bool) constant 'other' to dstType
+        {
+          int64_t value = other.getIntegerValue();
+          if (other.getType() == TYPE_BOOL) value = -value; // presumably true reads back as 1, so it becomes -1 (all ones)
+          switch (dstType) {
+            default:
+              GBE_ASSERT(0 && "Illegal sext constant expression");
+            case TYPE_S8:     *this = Immediate((int8_t)value); break;
+            case TYPE_S16:    *this = Immediate((int16_t)value); break;
+            case TYPE_S32:    *this = Immediate((int32_t)value); break;
+            case TYPE_S64:    *this = Immediate((int64_t)value); break;
+          }
+          break; // leave the outer switch: must not fall through into the IMM_ZEXT arm
+        }
+        case IMM_ZEXT: // zero-extend the integer constant 'other' to the unsigned dstType
+        {
+          uint64_t value = other.getUnsignedIntegerValue();
+          switch (dstType) {
+            default:
+              GBE_ASSERT(0 && "Illegal zext constant expression");
+            case TYPE_U8:     *this = Immediate((uint8_t)value); break;
+            case TYPE_U16:    *this = Immediate((uint16_t)value); break;
+            case TYPE_U32:    *this = Immediate((uint32_t)value); break;
+            case TYPE_U64:    *this = Immediate((uint64_t)value); break;
+          }
+          break;
+        }
+        case IMM_FPEXT:
+        {
+          GBE_ASSERT(other.getType() == TYPE_FLOAT && dstType == TYPE_DOUBLE);
+          double value = other.getFloatValue();
+          *this = Immediate(value);
+          break;
+        }
       }
     }
 
@@ -246,7 +309,6 @@ namespace ir {
       const Immediate *immVec[];
       void *p;
     } data;     //!< Value to store
-    Immediate & operator= (const Immediate &);
     Immediate operator+ (const Immediate &) const;
     Immediate operator- (const Immediate &) const;
     Immediate operator* (const Immediate &) const;
@@ -265,6 +327,7 @@ namespace ir {
     Immediate operator>> (const Immediate &) const;
     static Immediate lshr (const Immediate &left, const Immediate &right);
     static Immediate less (const Immediate &left, const Immediate &right);
+    static Immediate extract (const Immediate &left, const Immediate &right, Type dstType);
 
     void copy(const Immediate &other, int32_t offset, uint32_t num);
     GBE_CLASS(Immediate);
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 2bd0061..82e7dda 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1255,7 +1255,6 @@ namespace ir {
       case MEM_LOCAL: return out << "local";
       case MEM_CONSTANT: return out << "constant";
       case MEM_PRIVATE: return out << "private";
-      case IMAGE: return out << "image";
       case MEM_INVALID: return out << "invalid";
     };
     return out;
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 11e9509..47312f5 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -58,7 +58,6 @@ namespace ir {
     MEM_LOCAL,      //!< Local memory (thread group memory)
     MEM_CONSTANT,   //!< Immutable global memory
     MEM_PRIVATE,    //!< Per thread private memory
-    IMAGE,          //!< For texture image.
     MEM_INVALID
   };
 
diff --git a/backend/src/ir/liveness.hpp b/backend/src/ir/liveness.hpp
index 1bc66fe..4a7dc4e 100644
--- a/backend/src/ir/liveness.hpp
+++ b/backend/src/ir/liveness.hpp
@@ -100,8 +100,8 @@ namespace ir {
     template <DataFlowDirection dir, typename T>
     void foreach(const T &functor) {
       // Iterate on all blocks
-      for (const auto &pair : liveness) {
-        BlockInfo &info = *pair.second;
+      for (Info::iterator pair = liveness.begin(); pair != liveness.end(); ++pair) {
+        BlockInfo &info = *(pair->second);
         const BasicBlock &bb = info.bb;
         const BlockSet *set = NULL;
         if (dir == DF_SUCC)
@@ -109,8 +109,8 @@ namespace ir {
         else
           set = &bb.getPredecessorSet();
         // Iterate over all successors
-        for (auto other : *set) {
-          auto otherInfo = liveness.find(other);
+        for (BlockSet::iterator other = (*set).begin(); other != (*set).end(); ++other) {
+          Info::iterator otherInfo = liveness.find(*other);
           GBE_ASSERT(otherInfo != liveness.end() && otherInfo->second != NULL);
           functor(info, *otherInfo->second);
         }
diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
index 7b670f7..fa108dc 100644
--- a/backend/src/ir/printf.cpp
+++ b/backend/src/ir/printf.cpp
@@ -35,11 +35,11 @@ namespace gbe
     {
       fmts.push_back(*fmt);
 
-      for (auto &f : fmts.back()) {
-        if (f.type == PRINTF_SLOT_TYPE_STRING)
+      for (PrintfFmt::iterator f = fmts.back().begin(); f != fmts.back().end(); ++f) {
+        if (f->type == PRINTF_SLOT_TYPE_STRING)
           continue;
 
-        slots.push_back(&f);
+        slots.push_back(&(*f));
       }
 
       /* Update the total size of size. */
@@ -105,30 +105,36 @@ namespace gbe
 #define PRINT_SOMETHING(target_ty, conv)  do {                          \
       if (!vec_i)                                                       \
         pf_str = pf_str + std::string(#conv);                           \
-      printf(pf_str.c_str(),                                            \
-             ((target_ty *)((char *)buf_addr + sizeOfSize * global_wk_sz0 * global_wk_sz1 * global_wk_sz2 * n \
-                                              + slot.state->out_buf_sizeof_offset * \
-                                                         global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
-             [(k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i) * vec_num + vec_i]);\
+      char *ptr = ((char *)buf_addr + sizeOfSize * global_wk_sz0 * global_wk_sz1 * global_wk_sz2 * n \
+                   + slot.state->out_buf_sizeof_offset *                \
+                   global_wk_sz0 * global_wk_sz1 * global_wk_sz2);      \
+      target_ty* obj_ptr = ((target_ty *)ptr) + (k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i) * vec_num + vec_i; \
+      if ((char *)obj_ptr + sizeof(target_ty) > (char *)buf_addr + output_sz) {            \
+        printf("\n\n!!!The printf message is out of range because of the limited buffer, ignore.\n"); \
+        return;                                                         \
+      }                                                                 \
+      printf(pf_str.c_str(),  *obj_ptr);                                \
     } while (0)
 
 
     void PrintfSet::outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
-                                 size_t global_wk_sz1, size_t global_wk_sz2)
+                                 size_t global_wk_sz1, size_t global_wk_sz2, size_t output_sz)
     {
       LockOutput lock;
       size_t i, j, k;
       std::string pf_str;
       int stmt = 0;
 
-      for (auto &pf : fmts) {
+      for (size_t count = 0; count < fmts.size(); ++count) {
+        PrintfFmt& pf = fmts[count];
         for (i = 0; i < global_wk_sz0; i++) {
           for (j = 0; j < global_wk_sz1; j++) {
             for (k = 0; k < global_wk_sz2; k++) {
               int loop_num = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
                                                  + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
               for (int n = 0; n < loop_num; n++) {
-                for (auto &slot : pf) {
+                for (PrintfFmt::iterator pfit = pf.begin(); pfit != pf.end(); ++pfit) {
+                  PrintfSlot& slot = *pfit;
                   pf_str = "";
                   int vec_num;
 
diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
index b9f7619..f6c6bcf 100644
--- a/backend/src/ir/printf.hpp
+++ b/backend/src/ir/printf.hpp
@@ -75,6 +75,33 @@ namespace gbe
       char conversion_specifier;
       int out_buf_sizeof_offset;  // Should *global_total_size to get the full offset.
       std::string str;            //if %s, the string store here.
+
+      PrintfState(void) { // default state: every scalar field is zeroed
+        left_justified = 0;
+        sign_symbol = 0;
+        alter_form = 0;
+        zero_padding = 0;
+        vector_n = 0;
+        min_width = 0;
+        precision = 0;
+        length_modifier = 0;
+        conversion_specifier = 0;
+        out_buf_sizeof_offset = 0; // 'str' is not assigned here; it keeps its default-constructed (empty) value
+      }
+
+      PrintfState(const PrintfState & other) { // member-by-member copy of 'other'
+        left_justified = other.left_justified;
+        sign_symbol = other.sign_symbol;
+        alter_form = other.alter_form;
+        zero_padding = other.zero_padding;
+        vector_n = other.vector_n;
+        min_width = other.min_width;
+        precision = other.precision;
+        length_modifier = other.length_modifier;
+        conversion_specifier = other.conversion_specifier;
+        out_buf_sizeof_offset = other.out_buf_sizeof_offset;
+        str = other.str; // std::string assignment, so the copy gets its own string contents
+      }
     };
 
     enum {
@@ -106,8 +133,7 @@ namespace gbe
 
       PrintfSlot(PrintfState * st) {
         type = PRINTF_SLOT_TYPE_STATE;
-        state = (PrintfState *)malloc(sizeof(PrintfState));
-        memcpy(state, st, sizeof(PrintfState));
+        state = new PrintfState(*st);
       }
 
       PrintfSlot(const PrintfSlot & other) {
@@ -119,8 +145,7 @@ namespace gbe
           type = PRINTF_SLOT_TYPE_STRING;
         } else if (other.type == PRINTF_SLOT_TYPE_STATE) {
           type = PRINTF_SLOT_TYPE_STATE;
-          state = (PrintfState *)malloc(sizeof(PrintfState));
-          memcpy(state, other.state, sizeof(PrintfState));
+          state = new PrintfState(*other.state);
         } else {
           type = PRINTF_SLOT_TYPE_NONE;
           ptr = NULL;
@@ -146,11 +171,13 @@ namespace gbe
     {
     public:
       PrintfSet(const PrintfSet& other) {
-        for (auto &f : other.fmts) {
+        for (size_t i = 0; i < other.fmts.size(); ++i) {
+          const PrintfFmt& f = other.fmts[i];
           fmts.push_back(f);
         }
 
-        for (auto &s : other.slots) {
+        for (size_t i = 0; i < other.slots.size(); ++i) {
+          PrintfSlot* s = other.slots[i];
           slots.push_back(s);
         }
 
@@ -226,7 +253,7 @@ namespace gbe
       }
 
       void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
-                        size_t global_wk_sz1, size_t global_wk_sz2);
+                        size_t global_wk_sz1, size_t global_wk_sz2, size_t output_sz);
 
     private:
       vector<PrintfFmt> fmts;
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
index ba42acb..a4e1ddd 100644
--- a/backend/src/ir/sampler.cpp
+++ b/backend/src/ir/sampler.cpp
@@ -49,15 +49,11 @@ namespace ir {
     ir::FunctionArgument *arg =  ctx->getFunction().getArg(samplerReg);
     GBE_ASSERT(arg != NULL);
 
-    // XXX As LLVM 3.2/3.1 doesn't have a new data type for the sampler_t, we have to fix up the argument
-    // type here. Once we switch to the LLVM and use the new data type sampler_t, we can remove this
-    // work around.
-    arg->type = ir::FunctionArgument::SAMPLER;
-    arg->info.typeName = "sampler_t";
+    GBE_ASSERT(arg->type == ir::FunctionArgument::SAMPLER);
     int32_t id = ctx->getFunction().getArgID(arg);
     GBE_ASSERT(id < (1 << __CLK_SAMPLER_ARG_BITS));
 
-    auto it = samplerMap.find(SAMPLER_ID(id));
+    map<uint32_t, uint32_t>::iterator it = samplerMap.find(SAMPLER_ID(id));
     if (it != samplerMap.end()) {
       return it->second;
     }
@@ -75,9 +71,9 @@ namespace ir {
     OUT_UPDATE_SZ(magic_begin);
 
     OUT_UPDATE_SZ(samplerMap.size());
-    for (auto iter : samplerMap) {
-      OUT_UPDATE_SZ(iter.first);
-      OUT_UPDATE_SZ(iter.second);
+    for (map<uint32_t, uint32_t>::iterator it = samplerMap.begin(); it != samplerMap.end(); ++it) {
+      OUT_UPDATE_SZ(it->first);
+      OUT_UPDATE_SZ(it->second);
     }
 
     OUT_UPDATE_SZ(magic_end);
@@ -127,9 +123,9 @@ namespace ir {
     outs << spaces_nl << "  SamplerSet Map: [index, sampler_reg, sampler_slot]\n";
     outs << spaces_nl << "     samplerMap size: " << samplerMap.size() << "\n";
 
-    for (auto iter : samplerMap) {
-      outs << spaces_nl <<  "     [" << iter.first << ", "
-           << iter.second << "]\n";
+    for (map<uint32_t, uint32_t>::iterator it = samplerMap.begin(); it != samplerMap.end(); ++it) {
+      outs << spaces_nl <<  "     [" << it->first << ", "
+           << it->second << "]\n";
     }
 
     outs << spaces << "------------- End SamplerSet -------------" << "\n";
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
index a23f871..85e6d54 100644
--- a/backend/src/ir/sampler.hpp
+++ b/backend/src/ir/sampler.hpp
@@ -48,8 +48,8 @@ namespace ir {
     size_t getDataSize(void) { return samplerMap.size(); }
     size_t getDataSize(void) const { return samplerMap.size(); }
     void getData(uint32_t *samplers) const {
-      for(auto &it : samplerMap)
-        samplers[it.second] = it.first;
+      for (map<uint32_t, uint32_t>::const_iterator it = samplerMap.begin(); it != samplerMap.end(); ++it)
+        samplers[it->second] = it->first;
     }
 
     void operator = (const SamplerSet& other) {
diff --git a/backend/src/ir/structural_analysis.cpp b/backend/src/ir/structural_analysis.cpp
index 1e98629..4c7e3d2 100644
--- a/backend/src/ir/structural_analysis.cpp
+++ b/backend/src/ir/structural_analysis.cpp
@@ -667,21 +667,6 @@ namespace analysis
   }
 
 
-  bool ControlTree::pathBack(Node* m, Node* n)
-  {
-    for(NodeSet::const_iterator iter = n->preds().begin(); iter!= n->preds().end(); iter++)
-    {
-      if(isBackedge(*iter, n))
-      {
-        visited.clear();
-        if(path(m, *iter, n))
-          return true;
-      }
-    }
-
-    return false;
-  }
-
   /* this algorithm is from Muchnick's textbook(sec 7.7) (Advanced Compiler Design and Implementation) */
   Node* ControlTree::acyclicRegionType(Node* node, NodeSet& nset)
   {
@@ -841,7 +826,6 @@ namespace analysis
     return NULL;
   }
 
-
   bool ControlTree::path(Node *from, Node *to, Node *notthrough)
   {
 
@@ -863,9 +847,6 @@ namespace analysis
   }
 
 
-  /* this algorithm could work right, but it is quite inefficient, and
-   * we are not handling any cyclic regions at this moment, so here just
-   * ignore the identification of cyclic regions. */
   Node * ControlTree::cyclicRegionType(Node *node, NodeList &nset)
   {
     /* check for self-loop */
@@ -883,6 +864,9 @@ namespace analysis
         return NULL;
     }
 
+    //FIXME: as our IR could only handle self loop, the while loop node
+    //is disabled to avoid performance regression by the path function.
+#if 0
     /* check for improper region */
     for(NodeList::const_iterator m = nset.begin(); m != nset.end(); m++)
     {
@@ -907,6 +891,8 @@ namespace analysis
         return insertNode(p);
       }
     }
+#endif
+
     return NULL;
   }
 
@@ -1029,23 +1015,44 @@ namespace analysis
           if(nset.find(entry) != nset.end())
             entry = region;
         }
-        // FIXME loop optimization is still buggy and under development, now disable it by default.
         else
         {
-#if 0
           reachUnder.clear();
           nset.clear();
-          for(NodeList::const_iterator m = post_order.begin(); m != post_order.end(); m++)
-          {
-            if(*m != n && pathBack(*m, n))
+
+          //reuse the loop info from llvm gatherLoopInfo.
+          const gbe::vector<ir::Loop *> &loops = fn->getLoops();
+          if(loops.size() == 0){
+            post_ctr++;
+            continue;
+          }
+
+          Node* loop_header = NULL;
+          //if n is basic block node, query the llvm loop info to find the loop whose loop header is n;
+          if(n->type() == BasicBlock){
+            for (auto l : loops) {
+              ir::BasicBlock &a = fn->getBlock(l->bbs[0]);
+              loop_header = bbmap.find(&a)->second;
+
+              if(loop_header == n){
+                for (auto bb : l->bbs) {
+                  ir::BasicBlock &tmp = fn->getBlock(bb);
+                  Node* node_ = bbmap.find(&tmp)->second;
+                  reachUnder.push_front(node_);
+                  nset.insert(node_);
+                }
+                break;
+              }
+            }
+          }else{
+          //n is a compacted node; for a self loop it would have a successor pointing to itself.
+            if(n->succs().find(n) != n->succs().end())
             {
-              reachUnder.push_front(*m);
-              nset.insert(*m);
+                reachUnder.push_front(n);
+                nset.insert(n);
             }
           }
 
-          reachUnder.push_front(n);
-          nset.insert(n);
           region = cyclicRegionType(n, reachUnder);
 
           if(NULL != region)
@@ -1060,9 +1067,6 @@ namespace analysis
           {
             post_ctr++;
           }
-#else
-          post_ctr++;
-#endif
         }
       }
 
diff --git a/backend/src/ir/structural_analysis.hpp b/backend/src/ir/structural_analysis.hpp
index dc2f3c2..7aaa533 100644
--- a/backend/src/ir/structural_analysis.hpp
+++ b/backend/src/ir/structural_analysis.hpp
@@ -300,10 +300,6 @@ namespace analysis
     bool isCyclic(Node*);
     /* is this a back edge? */
     bool isBackedge(const Node*, const Node*);
-    /* returns true if there is a node k such that there is a
-     * (possibly empty) path from m to k that does not pass through n
-     * and an edge k->n that is a back edge, and false otherwise. */
-    bool pathBack(Node*, Node*);
     /* check if there is a barrier in a basic block */
     bool checkForBarrier(const ir::BasicBlock*);
     /* insert while instruction at the proper position of Node */
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index b5b0fa9..8ff858d 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -26,7 +26,6 @@
 
 #include "ir/constant.hpp"
 #include "ir/register.hpp"
-#include "sys/hash_map.hpp"
 #include "sys/map.hpp"
 
 namespace gbe {
@@ -41,7 +40,7 @@ namespace ir {
   class Unit : public NonCopyable
   {
   public:
-    typedef hash_map<std::string, Function*> FunctionSet;
+    typedef map<std::string, Function*> FunctionSet;
     /*! Create an empty unit */
     Unit(PointerSize pointerSize = POINTER_32_BITS);
     /*! Release everything (*including* the function pointers) */
@@ -57,7 +56,8 @@ namespace ir {
     /*! Apply the given functor on all the functions */
     template <typename T>
     INLINE void apply(const T &functor) const {
-      for (const auto &pair : functions) functor(*pair.second);
+      for (FunctionSet::const_iterator it = functions.begin(); it != functions.end(); ++it)
+        functor(*(it->second));
     }
     /*! Return the size of the pointers manipulated */
     INLINE PointerSize getPointerSize(void) const { return pointerSize; }
@@ -76,7 +76,7 @@ namespace ir {
     bool getValid() { return valid; }
   private:
     friend class ContextInterface; //!< Can free modify the unit
-    hash_map<std::string, Function*> functions; //!< All the defined functions
+    FunctionSet functions; //!< All the defined functions
     ConstantSet constantSet; //!< All the constants defined in the unit
     PointerSize pointerSize; //!< Size shared by all pointers
     GBE_CLASS(Unit);
diff --git a/backend/src/libocl/include/ocl_float.h b/backend/src/libocl/include/ocl_float.h
index 65577a3..916233b 100644
--- a/backend/src/libocl/include/ocl_float.h
+++ b/backend/src/libocl/include/ocl_float.h
@@ -91,6 +91,7 @@ INLINE_OVERLOADABLE int __ocl_finitef (float x){
 #define M_2_SQRTPI_F 1.1283791670955126F
 #define M_SQRT2_F    1.4142135623730951F
 #define M_SQRT1_2_F  0.7071067811865476F
-
+#define FP_ILOGB0    (-0x7FFFFFFF-1)
+#define FP_ILOGBNAN  FP_ILOGB0
 
 #endif /* __OCL_FLOAT_H__ */
diff --git a/backend/src/libocl/include/ocl_image.h b/backend/src/libocl/include/ocl_image.h
index 3637d56..39106cf 100644
--- a/backend/src/libocl/include/ocl_image.h
+++ b/backend/src/libocl/include/ocl_image.h
@@ -35,21 +35,13 @@ OVERLOADABLE float4 read_imagef(image1d_t cl_image, const sampler_t sampler, flo
 OVERLOADABLE float4 read_imagef(image1d_t cl_image, int coord);
 OVERLOADABLE void write_imagef(image1d_t cl_image, int coord, float4 color);
 OVERLOADABLE void write_imagef(image1d_t cl_image, float coord, float4 color);
-OVERLOADABLE int4 read_imagei(image1d_buffer_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE int4 read_imagei(image1d_buffer_t cl_image, const sampler_t sampler, float coord);
 OVERLOADABLE int4 read_imagei(image1d_buffer_t cl_image, int coord);
 OVERLOADABLE void write_imagei(image1d_buffer_t cl_image, int coord, int4 color);
-OVERLOADABLE void write_imagei(image1d_buffer_t cl_image, float coord, int4 color);
-OVERLOADABLE uint4 read_imageui(image1d_buffer_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE uint4 read_imageui(image1d_buffer_t cl_image, const sampler_t sampler, float coord);
 OVERLOADABLE uint4 read_imageui(image1d_buffer_t cl_image, int coord);
 OVERLOADABLE void write_imageui(image1d_buffer_t cl_image, int coord, uint4 color);
 OVERLOADABLE void write_imageui(image1d_buffer_t cl_image, float coord, uint4 color);
-OVERLOADABLE float4 read_imagef(image1d_buffer_t cl_image, const sampler_t sampler, int coord);
-OVERLOADABLE float4 read_imagef(image1d_buffer_t cl_image, const sampler_t sampler, float coord);
 OVERLOADABLE float4 read_imagef(image1d_buffer_t cl_image, int coord);
 OVERLOADABLE void write_imagef(image1d_buffer_t cl_image, int coord, float4 color);
-OVERLOADABLE void write_imagef(image1d_buffer_t cl_image, float coord, float4 color);
 
 OVERLOADABLE int get_image_channel_data_type(image1d_t image);
 OVERLOADABLE int get_image_channel_order(image1d_t image);
diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h
index 63478c9..3b4ce97 100644
--- a/backend/src/libocl/include/ocl_types.h
+++ b/backend/src/libocl/include/ocl_types.h
@@ -82,36 +82,11 @@ DEF(double);
 #undef DEF
 
 /////////////////////////////////////////////////////////////////////////////
-// OpenCL built-in image types
+// OpenCL built-in event types
 /////////////////////////////////////////////////////////////////////////////
 // FIXME:
 // This is a transitional hack to bypass the LLVM 3.3 built-in types.
 // See the Khronos SPIR specification for handling of these types.
-#define __texture __attribute__((address_space(4)))
-struct _image1d_t;
-typedef __texture struct _image1d_t* __image1d_t;
-struct _image1d_buffer_t;
-typedef __texture struct _image1d_buffer_t* __image1d_buffer_t;
-struct _image1d_array_t;
-typedef __texture struct _image1d_array_t* __image1d_array_t;
-struct _image2d_t;
-typedef __texture struct _image2d_t* __image2d_t;
-struct _image2d_array_t;
-typedef __texture struct _image2d_array_t* __image2d_array_t;
-struct _image3d_t;
-typedef __texture struct _image3d_t* __image3d_t;
-typedef const ushort __sampler_t;
-#define image1d_t __image1d_t
-#define image1d_buffer_t __image1d_buffer_t
-#define image1d_array_t __image1d_array_t
-#define image2d_t __image2d_t
-#define image2d_array_t __image2d_array_t
-#define image3d_t __image3d_t
-#define sampler_t __sampler_t
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL built-in event types
-/////////////////////////////////////////////////////////////////////////////
 typedef size_t __event_t;
 #define event_t __event_t
 
diff --git a/backend/src/libocl/src/ocl_image.cl b/backend/src/libocl/src/ocl_image.cl
index fd421bf..a8dbc92 100644
--- a/backend/src/libocl/src/ocl_image.cl
+++ b/backend/src/libocl/src/ocl_image.cl
@@ -19,385 +19,518 @@
 #include "ocl_math.h"
 #include "ocl_integer.h"
 #include "ocl_common.h"
+#include "ocl_convert.h"
+
+#define int1 int
+#define float1 float
+
+///////////////////////////////////////////////////////////////////////////////
+// Beignet builtin functions.
+///////////////////////////////////////////////////////////////////////////////
+
+#define DECL_GEN_OCL_RW_IMAGE(image_type, n) \
+  OVERLOADABLE int4 __gen_ocl_read_imagei(image_type image, sampler_t sampler,            \
+                                          float ##n coord, uint sampler_offset);          \
+  OVERLOADABLE int4 __gen_ocl_read_imagei(image_type image, sampler_t sampler,            \
+                                          int ##n coord, uint sampler_offset);            \
+  OVERLOADABLE uint4 __gen_ocl_read_imageui(image_type image, sampler_t sampler,          \
+                                            float ##n coord, uint sampler_offset);        \
+  OVERLOADABLE uint4 __gen_ocl_read_imageui(image_type image, sampler_t sampler,          \
+                                            int ##n coord, uint sampler_offset);          \
+  OVERLOADABLE float4 __gen_ocl_read_imagef(image_type image, sampler_t sampler,          \
+                                            float ##n coord, uint sampler_offset);        \
+  OVERLOADABLE float4 __gen_ocl_read_imagef(image_type image, sampler_t sampler,          \
+                                            int ##n coord, uint sampler_offset);          \
+  OVERLOADABLE void __gen_ocl_write_imagei(image_type image, int ##n coord , int4 color); \
+  OVERLOADABLE void __gen_ocl_write_imageui(image_type image, int ##n coord, uint4 color);\
+  OVERLOADABLE void __gen_ocl_write_imagef(image_type image, int ##n coord, float4 color);
+
+#define DECL_GEN_OCL_QUERY_IMAGE(image_type) \
+  OVERLOADABLE int __gen_ocl_get_image_width(image_type image);                           \
+  OVERLOADABLE int __gen_ocl_get_image_height(image_type image);                          \
+  OVERLOADABLE int __gen_ocl_get_image_channel_data_type(image_type image);               \
+  OVERLOADABLE int __gen_ocl_get_image_channel_order(image_type image);                   \
+  OVERLOADABLE int __gen_ocl_get_image_depth(image_type image);                           \
+
+DECL_GEN_OCL_RW_IMAGE(image1d_t, 1)
+DECL_GEN_OCL_RW_IMAGE(image1d_buffer_t, 2)
+DECL_GEN_OCL_RW_IMAGE(image1d_array_t, 2)
+DECL_GEN_OCL_RW_IMAGE(image1d_array_t, 4)
+DECL_GEN_OCL_RW_IMAGE(image2d_t, 2)
+DECL_GEN_OCL_RW_IMAGE(image2d_array_t, 3)
+DECL_GEN_OCL_RW_IMAGE(image3d_t, 3)
+DECL_GEN_OCL_RW_IMAGE(image2d_array_t, 4)
+DECL_GEN_OCL_RW_IMAGE(image3d_t, 4)
+
+DECL_GEN_OCL_QUERY_IMAGE(image1d_t)
+DECL_GEN_OCL_QUERY_IMAGE(image1d_buffer_t)
+DECL_GEN_OCL_QUERY_IMAGE(image1d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(image2d_t)
+DECL_GEN_OCL_QUERY_IMAGE(image2d_array_t)
+DECL_GEN_OCL_QUERY_IMAGE(image3d_t)
+///////////////////////////////////////////////////////////////////////////////
+// helper functions to validate array index.
+///////////////////////////////////////////////////////////////////////////////
+INLINE_OVERLOADABLE float2 __gen_validate_array_index(float2 coord, image1d_array_t image)
+{
+  float array_size = __gen_ocl_get_image_depth(image);
+  coord.s1 = clamp(rint(coord.s1), 0.f, array_size - 1.f);
+  return coord;
+}
 
-// 1D read
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
-
-// 2D & 1D Array read
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
-
-// 3D & 2D Array read
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
-
-// 1D write
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, float4 color);
-
-// 2D & 1D Array write
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
-
-// 3D & 2D Array write
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
-
-int __gen_ocl_get_image_width(uint surface_id);
-int __gen_ocl_get_image_height(uint surface_id);
-int __gen_ocl_get_image_channel_data_type(uint surface_id);
-int __gen_ocl_get_image_channel_order(uint surface_id);
-int __gen_ocl_get_image_depth(uint surface_id);
+INLINE_OVERLOADABLE float4 __gen_validate_array_index(float4 coord, image2d_array_t image)
+{
+  float array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
+  return coord;
+}
 
-// 2D 3D Image Common Macro
-#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
-#define GEN_FIX_1 1
-#else
-#define GEN_FIX_1 0
-#endif
+INLINE_OVERLOADABLE float3 __gen_validate_array_index(float3 coord, image2d_array_t image)
+{
+  float array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(rint(coord.s2), 0.f, array_size - 1.f);
+  return coord;
+}
 
-#define GET_IMAGE(cl_image, surface_id) \
-    uint surface_id = (uint)cl_image
-OVERLOADABLE float __gen_compute_array_index(const float index, image1d_array_t image)
+INLINE_OVERLOADABLE int2 __gen_validate_array_index(int2 coord, image1d_array_t image)
 {
-  GET_IMAGE(image, surface_id);
-  float array_size = __gen_ocl_get_image_depth(surface_id);
-  return clamp(rint(index), 0.f, array_size - 1.f);
+  int array_size = __gen_ocl_get_image_depth(image);
+  coord.s1 = clamp(coord.s1, 0, array_size - 1);
+  return coord;
 }
 
-OVERLOADABLE float __gen_compute_array_index(float index, image2d_array_t image)
+INLINE_OVERLOADABLE int4 __gen_validate_array_index(int4 coord, image2d_array_t image)
 {
-  GET_IMAGE(image, surface_id);
-  float array_size = __gen_ocl_get_image_depth(surface_id);
-  return clamp(rint(index), 0.f, array_size - 1.f);
+  int array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(coord.s2, 0, array_size - 1);
+  return coord;
 }
 
-OVERLOADABLE int __gen_compute_array_index(int index, image1d_array_t image)
+INLINE_OVERLOADABLE int3 __gen_validate_array_index(int3 coord, image2d_array_t image)
 {
-  GET_IMAGE(image, surface_id);
-  int array_size = __gen_ocl_get_image_depth(surface_id);
-  return clamp(index, 0, array_size - 1);
+  int array_size = __gen_ocl_get_image_depth(image);
+  coord.s2 = clamp(coord.s2, 0, array_size - 1);
+  return coord;
 }
 
-OVERLOADABLE int __gen_compute_array_index(int index, image2d_array_t image)
+// For non-array image types, nothing needs to be done.
+#define GEN_VALIDATE_ARRAY_INDEX(coord_type, image_type) \
+INLINE_OVERLOADABLE coord_type __gen_validate_array_index(coord_type coord, image_type image) \
+{ \
+  return coord; \
+}
+
+GEN_VALIDATE_ARRAY_INDEX(float, image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(int, image1d_t)
+GEN_VALIDATE_ARRAY_INDEX(float2, image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(int2, image2d_t)
+GEN_VALIDATE_ARRAY_INDEX(float4, image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int4, image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float3, image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(int3, image3d_t)
+GEN_VALIDATE_ARRAY_INDEX(float, image1d_buffer_t)
+GEN_VALIDATE_ARRAY_INDEX(int, image1d_buffer_t)
+
+///////////////////////////////////////////////////////////////////////////////
+// Helper functions to work around some coordinate boundary issues.
+// The major issue on Gen7/Gen7.5 is that the sample message cannot sample
+// integer type surfaces correctly with CLK_ADDRESS_CLAMP and CLK_FILTER_NEAREST.
+// The workaround is to use an LD message instead of the normal sample message.
+///////////////////////////////////////////////////////////////////////////////
+
+bool __gen_ocl_sampler_need_fix(sampler_t);
+bool __gen_ocl_sampler_need_rounding_fix(sampler_t);
+
+bool __gen_sampler_need_fix(const sampler_t sampler)
 {
-  GET_IMAGE(image, surface_id);
-  int array_size = __gen_ocl_get_image_depth(surface_id);
-  return clamp(index, 0, array_size - 1);
+  return __gen_ocl_sampler_need_fix(sampler);
 }
 
-#define DECL_READ_IMAGE0(int_clamping_fix,                                   \
-                        image_type, type, suffix, coord_type, n)             \
-  OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
-                                               const sampler_t sampler,      \
-                                               coord_type coord)             \
-  {                                                                          \
-    GET_IMAGE(cl_image, surface_id);                                         \
-    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai);                          \
-    if (int_clamping_fix &&                                                  \
-        ((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) &&             \
-        ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST))               \
-            return   __gen_ocl_read_image ##suffix(                          \
-                        EXPEND_READ_COORD(surface_id, sampler, coord));      \
-    return  __gen_ocl_read_image ##suffix(                                   \
-                    EXPEND_READ_COORDF(surface_id, sampler, coord), 0);      \
-  }
+bool __gen_sampler_need_rounding_fix(const sampler_t sampler)
+{
+  return __gen_ocl_sampler_need_rounding_fix(sampler);
+}
+
+INLINE_OVERLOADABLE float __gen_fixup_float_coord(float tmpCoord)
+{
+  if (tmpCoord < 0 && tmpCoord > -0x1p-20f)
+    tmpCoord += -0x1p-9f;
+  return tmpCoord;
+}
+
+INLINE_OVERLOADABLE float2 __gen_fixup_float_coord(float2 tmpCoord)
+{
+  if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)
+    tmpCoord.s0 += -0x1p-9f;
+  if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)
+    tmpCoord.s1 += -0x1p-9f;
+  return tmpCoord;
+}
+
+INLINE_OVERLOADABLE float3 __gen_fixup_float_coord(float3 tmpCoord)
+{
+  if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)
+    tmpCoord.s0 += -0x1p-9f;
+  if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)
+    tmpCoord.s1 += -0x1p-9f;
+  if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20f)
+    tmpCoord.s2 += -0x1p-9f;
+  return tmpCoord;
+}
+
+INLINE_OVERLOADABLE float4 __gen_fixup_float_coord(float4 tmpCoord)
+{
+  if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)
+    tmpCoord.s0 += -0x1p-9f;
+  if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)
+    tmpCoord.s1 += -0x1p-9f;
+  if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20f)
+    tmpCoord.s2 += -0x1p-9f;
+  return tmpCoord;
+}
+
+// Functions to denormalize coordinates. This is needed when we have to use
+// the LD message (sampler offset is non-zero) and the coordinates are
+// normalized.
+INLINE_OVERLOADABLE float __gen_denormalize_coord(const image1d_t image, float srcCoord)
+{
+  return srcCoord * __gen_ocl_get_image_width(image);
+}
+
+INLINE_OVERLOADABLE float2 __gen_denormalize_coord(const image1d_array_t image, float2 srcCoord)
+{
+  srcCoord.s0 = srcCoord.s0 * __gen_ocl_get_image_width(image);
+  return srcCoord;
+}
+
+INLINE_OVERLOADABLE float __gen_denormalize_coord(const image1d_buffer_t image, float srcCoord)
+{
+  return srcCoord * __gen_ocl_get_image_width(image);
+}
+
+INLINE_OVERLOADABLE float2 __gen_denormalize_coord(const image2d_t image, float2 srcCoord)
+{
+  srcCoord.s0 = srcCoord.s0 * __gen_ocl_get_image_width(image);
+  srcCoord.s1 = srcCoord.s1 * __gen_ocl_get_image_height(image);
+  return srcCoord;
+}
+
+INLINE_OVERLOADABLE float3 __gen_denormalize_coord(const image2d_array_t image, float3 srcCoord)
+{
+  srcCoord.s0 = srcCoord.s0 * __gen_ocl_get_image_width(image);
+  srcCoord.s1 = srcCoord.s1 * __gen_ocl_get_image_height(image);
+  return srcCoord;
+}
+
+INLINE_OVERLOADABLE float3 __gen_denormalize_coord(const image3d_t image, float3 srcCoord)
+{
+  srcCoord.s0 = srcCoord.s0 * __gen_ocl_get_image_width(image);
+  srcCoord.s1 = srcCoord.s1 * __gen_ocl_get_image_height(image);
+  srcCoord.s2 = srcCoord.s2 * __gen_ocl_get_image_depth(image);
+  return srcCoord;
+}
+
+INLINE_OVERLOADABLE float4 __gen_denormalize_coord(const image2d_array_t image, float4 srcCoord)
+{
+  srcCoord.s0 = srcCoord.s0 * __gen_ocl_get_image_width(image);
+  srcCoord.s1 = srcCoord.s1 * __gen_ocl_get_image_height(image);
+  return srcCoord;
+}
+
+INLINE_OVERLOADABLE float4 __gen_denormalize_coord(const image3d_t image, float4 srcCoord)
+{
+  srcCoord.s0 = srcCoord.s0 * __gen_ocl_get_image_width(image);
+  srcCoord.s1 = srcCoord.s1 * __gen_ocl_get_image_height(image);
+  srcCoord.s2 = srcCoord.s2 * __gen_ocl_get_image_depth(image);
+  return srcCoord;
+}
 
-#define DECL_READ_IMAGE1(float_coord_rounding_fix, int_clamping_fix,         \
-                        image_type, type, suffix, coord_type, n)             \
-  OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
-                                               const sampler_t sampler,      \
-                                               coord_type coord)             \
-  {                                                                          \
-    GET_IMAGE(cl_image, surface_id);                                         \
-    GET_IMAGE_ARRAY_SIZE(cl_image, coord, float, ai)                         \
-    coord_type tmpCoord = coord;                                             \
-    if (float_coord_rounding_fix | int_clamping_fix) {                       \
-      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              \
-          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        \
-        if (float_coord_rounding_fix                                         \
-            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {              \
-          FIXUP_FLOAT_COORD(tmpCoord);                                       \
-        }                                                                    \
-        if (int_clamping_fix) {                                              \
-            coord_type intCoord;                                             \
-            if (sampler & CLK_NORMALIZED_COORDS_TRUE) {                      \
-              DENORMALIZE_COORD(surface_id, intCoord, tmpCoord);             \
-            } else                                                           \
-              intCoord = tmpCoord;                                           \
-            return   __gen_ocl_read_image ##suffix(                          \
-                       EXPEND_READ_COORDI(surface_id, sampler, intCoord));\
-       }                                                                     \
-      }                                                                      \
-    }                                                                        \
-    return  __gen_ocl_read_image ##suffix(                                   \
-                        EXPEND_READ_COORDF(surface_id, sampler, tmpCoord), 0);\
+// After denormalizing, we have to fix up the negative boundary.
+INLINE_OVERLOADABLE float __gen_fixup_neg_boundary(float coord)
+{
+  return coord < 0 ? -1 : coord;
+}
+
+INLINE_OVERLOADABLE float2 __gen_fixup_neg_boundary(float2 coord)
+{
+  coord.s0 = coord.s0 < 0 ? -1 : coord.s0;
+  coord.s1 = coord.s1 < 0 ? -1 : coord.s1;
+  return coord;
+}
+
+INLINE_OVERLOADABLE float4 __gen_fixup_neg_boundary(float4 coord)
+{
+  coord.s0 = coord.s0 < 0 ? -1 : coord.s0;
+  coord.s1 = coord.s1 < 0 ? -1 : coord.s1;
+  coord.s2 = coord.s2 < 0 ? -1 : coord.s2;
+  return coord;
+}
+
+INLINE_OVERLOADABLE float3 __gen_fixup_neg_boundary(float3 coord)
+{
+  coord.s0 = coord.s0 < 0 ? -1 : coord.s0;
+  coord.s1 = coord.s1 < 0 ? -1 : coord.s1;
+  coord.s2 = coord.s2 < 0 ? -1 : coord.s2;
+  return coord;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Built-in Image Read/Write Functions
+///////////////////////////////////////////////////////////////////////////////
+
+// 2D 3D Image Common Macro
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+#define GEN_FIX_FLOAT_ROUNDING 1
+#define GEN_FIX_INT_CLAMPING 1
+#else
+#define GEN_FIX_FLOAT_ROUNDING 0
+#define GEN_FIX_INT_CLAMPING 0
+#endif
+
+#define convert_float1 convert_float
+#define convert_int1 convert_int
+
+// For integer coordinates
+#define DECL_READ_IMAGE0(int_clamping_fix, image_type,                        \
+                         image_data_type, suffix, coord_type, n)              \
+  OVERLOADABLE image_data_type read_image ##suffix(image_type cl_image,       \
+                                        const sampler_t sampler,              \
+                                        coord_type coord)                     \
+  {                                                                           \
+    coord = __gen_validate_array_index(coord, cl_image);                      \
+    if (int_clamping_fix && __gen_sampler_need_fix(sampler))                  \
+      return __gen_ocl_read_image ##suffix(cl_image, sampler,                 \
+                                           convert_int ##n(coord), 1);        \
+    return __gen_ocl_read_image ##suffix(cl_image, sampler,                   \
+                                         convert_float ##n (coord), 0);       \
   }
 
-#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type, n)   \
-  OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
-                                               coord_type coord)             \
-  {                                                                          \
-    GET_IMAGE(cl_image, surface_id);                                         \
-    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai)                           \
-    return __gen_ocl_read_image ##suffix(                                    \
-           EXPEND_READ_COORDF(surface_id,                                    \
-                             CLK_NORMALIZED_COORDS_FALSE                     \
-                             | CLK_ADDRESS_NONE                              \
-                             | CLK_FILTER_NEAREST, (float)coord), 0);        \
+// For float coordinates
+#define DECL_READ_IMAGE1(int_clamping_fix, image_type,                        \
+                         image_data_type, suffix, coord_type, n)              \
+  OVERLOADABLE image_data_type read_image ##suffix(image_type cl_image,       \
+                                        const sampler_t sampler,              \
+                                        coord_type coord)                     \
+  {                                                                           \
+    coord_type tmpCoord = __gen_validate_array_index(coord, cl_image);        \
+    if (GEN_FIX_FLOAT_ROUNDING | int_clamping_fix) {                          \
+      if (__gen_sampler_need_fix(sampler)) {                                  \
+        if (GEN_FIX_FLOAT_ROUNDING &&                                         \
+            __gen_sampler_need_rounding_fix(sampler))                         \
+          tmpCoord = __gen_fixup_float_coord(tmpCoord);                       \
+        if (int_clamping_fix) {                                               \
+            if (!__gen_sampler_need_rounding_fix(sampler))                    \
+              tmpCoord = __gen_denormalize_coord(cl_image, tmpCoord);         \
+            tmpCoord = __gen_fixup_neg_boundary(tmpCoord);                    \
+            return __gen_ocl_read_image ##suffix(                             \
+                     cl_image, sampler, convert_int ##n(tmpCoord), 1);        \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    return  __gen_ocl_read_image ##suffix(cl_image, sampler,                  \
+                                          convert_float ##n (tmpCoord), 0);   \
   }
 
-#define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
-  OVERLOADABLE void write_image ##suffix(image_type cl_image, coord_type coord, type color)\
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    __gen_ocl_write_image ##suffix(EXPEND_WRITE_COORD(surface_id, coord, color));\
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, image_data_type,                \
+                                  suffix, coord_type, n)                      \
+  OVERLOADABLE image_data_type read_image ##suffix(image_type cl_image,       \
+                                               coord_type coord)              \
+  {                                                                           \
+    coord = __gen_validate_array_index(coord, cl_image);                      \
+    sampler_t defaultSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE \
+                               | CLK_FILTER_NEAREST;                          \
+    return __gen_ocl_read_image ##suffix(                                     \
+             cl_image, defaultSampler, convert_float ##n (coord), 0);         \
   }
 
-#define DECL_IMAGE_INFO_COMMON(image_type)    \
-  OVERLOADABLE  int get_image_channel_data_type(image_type image)\
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_channel_data_type(surface_id); \
-  }\
-  OVERLOADABLE  int get_image_channel_order(image_type image)\
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_channel_order(surface_id); \
-  } \
-  OVERLOADABLE int get_image_width(image_type image) \
-  { \
-    GET_IMAGE(image, surface_id); \
-    return __gen_ocl_get_image_width(surface_id);  \
+#define DECL_WRITE_IMAGE(image_type, image_data_type, suffix, coord_type)     \
+  OVERLOADABLE void write_image ##suffix(image_type cl_image,                 \
+                                         coord_type coord,                    \
+                                         image_data_type color)               \
+  {                                                                           \
+    coord_type fixedCoord = __gen_validate_array_index(coord, cl_image);      \
+    __gen_ocl_write_image ##suffix(cl_image, fixedCoord, color);              \
   }
 
+#define DECL_IMAGE(int_clamping_fix, image_type, image_data_type, suffix, n)  \
+  DECL_READ_IMAGE0(int_clamping_fix, image_type,                              \
+                   image_data_type, suffix, int ##n, n)                       \
+  DECL_READ_IMAGE1(int_clamping_fix, image_type,                              \
+                   image_data_type, suffix, float ##n, n)                     \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, image_data_type, suffix, int ##n, n)  \
+  DECL_WRITE_IMAGE(image_type, image_data_type, suffix, int ##n)              \
+
 // 1D
-#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix)                       \
-  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int, 1)               \
-  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float, 1)  \
-  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int, 1)                        \
-  DECL_WRITE_IMAGE(image_type, type, suffix, int)                                    \
-  DECL_WRITE_IMAGE(image_type, type, suffix, float)
-
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord, 1
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord < 0 ? -1 : coord), 1
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord = srcCoord * __gen_ocl_get_image_width(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord, color
-#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
-
-#define FIXUP_FLOAT_COORD(tmpCoord)                            \
-  {                                                            \
-    if (tmpCoord < 0 && tmpCoord > -0x1p-20f)                  \
-      tmpCoord += -0x1p-9f;                                     \
+#define DECL_IMAGE_TYPE(image_type, n)                                        \
+  DECL_IMAGE(GEN_FIX_INT_CLAMPING, image_type, int4, i, n)                    \
+  DECL_IMAGE(GEN_FIX_INT_CLAMPING, image_type, uint4, ui, n)                  \
+  DECL_IMAGE(0, image_type, float4, f, n)
+
+DECL_IMAGE_TYPE(image1d_t, 1)
+DECL_IMAGE_TYPE(image2d_t, 2)
+DECL_IMAGE_TYPE(image3d_t, 4)
+DECL_IMAGE_TYPE(image3d_t, 3)
+DECL_IMAGE_TYPE(image2d_array_t, 4)
+DECL_IMAGE_TYPE(image2d_array_t, 3)
+
+#define DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(image_type, image_data_type,       \
+                                  suffix, coord_type)                         \
+  OVERLOADABLE image_data_type read_image ##suffix(image_type cl_image,       \
+                                               coord_type coord)              \
+  {                                                                           \
+    sampler_t defaultSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE \
+                               | CLK_FILTER_NEAREST;                          \
+    int2 effectCoord;                                                         \
+    effectCoord.s0 = coord % 8192;                                            \
+    effectCoord.s1 = coord / 8192;                                            \
+    return __gen_ocl_read_image ##suffix(                                     \
+             cl_image, defaultSampler, convert_float2(effectCoord), 0);       \
   }
 
-DECL_IMAGE(GEN_FIX_1, image1d_t, int4, i)
-DECL_IMAGE(GEN_FIX_1, image1d_t, uint4, ui)
-DECL_IMAGE(0, image1d_t, float4, f)
-DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, int4, i)
-DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, uint4, ui)
-DECL_IMAGE(0, image1d_buffer_t, float4, f)
+#define DECL_IMAGE_1DBuffer(int_clamping_fix, image_data_type, suffix)        \
+  DECL_READ_IMAGE1D_BUFFER_NOSAMPLER(image1d_buffer_t, image_data_type,       \
+                                     suffix, int)                             \
+  DECL_WRITE_IMAGE(image1d_buffer_t, image_data_type, suffix, int)
+
+DECL_IMAGE_1DBuffer(GEN_FIX_INT_CLAMPING, int4, i)
+DECL_IMAGE_1DBuffer(GEN_FIX_INT_CLAMPING, uint4, ui)
+DECL_IMAGE_1DBuffer(0, float4, f)
+
+// For 1D Array:
+// The fixup_1darray_coord functions convert a 1D array coord to a 2D array coord;
+// the caller must set the sampler offset to 2 when using the converted coord.
+// This works around an image 1D array restriction: ai cannot be set in the
+// LD message. We solve it by faking the same image as a 2D array and then
+// accessing it via the LD message as a 3D surface, treating ai as the w coordinate.
+INLINE_OVERLOADABLE float4 __gen_fixup_1darray_coord(float2 coord, image1d_array_t image)
+{
+  float4 newCoord;
+  newCoord.s0 = coord.s0 < 0 ? -1 : coord.s0;
+  newCoord.s1 = 0;
+  newCoord.s2 = coord.s1;
+  newCoord.s3 = 0;
+  return newCoord;
+}
 
-// 1D Info
-DECL_IMAGE_INFO_COMMON(image1d_t)
-DECL_IMAGE_INFO_COMMON(image1d_buffer_t)
+INLINE_OVERLOADABLE int4 __gen_fixup_1darray_coord(int2 coord, image1d_array_t image)
+{
+  int4 newCoord;
+  newCoord.s0 = coord.s0;
+  newCoord.s1 = 0;
+  newCoord.s2 = coord.s1;
+  newCoord.s3 = 0;
+  return newCoord;
+}
 
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDF
-#undef EXPEND_READ_COORDI
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-#undef DECL_IMAGE
-// End of 1D
-
-#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n)                       \
-  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int ##n, n)              \
-  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n, n) \
-  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n, n)                       \
-  DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)                                  \
-  DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
-// 2D
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, 1
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), \
-                                               (int)(coord.s1 < 0 ? -1 : coord.s1), 1
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
-                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
-
-#define FIXUP_FLOAT_COORD(tmpCoord)                            \
-  {                                                            \
-    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
-      tmpCoord.s0 += -0x1p-9f;                                  \
-    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)            \
-      tmpCoord.s1 += -0x1p-9f;                                 \
+// For integer coordinates
+#define DECL_READ_IMAGE0_1DArray(int_clamping_fix,                            \
+                                 image_data_type, suffix, coord_type)         \
+  OVERLOADABLE image_data_type read_image ##suffix(image1d_array_t cl_image,  \
+                                        const sampler_t sampler,              \
+                                        coord_type coord)                     \
+  {                                                                           \
+    coord = __gen_validate_array_index(coord, cl_image);                      \
+    if (int_clamping_fix && __gen_sampler_need_fix(sampler)) {                \
+      int4 newCoord = __gen_fixup_1darray_coord(coord, cl_image);             \
+      return __gen_ocl_read_image ##suffix(cl_image, sampler, newCoord, 2);   \
+    }                                                                         \
+    return  __gen_ocl_read_image ##suffix(cl_image, sampler,                  \
+                                          convert_float2 (coord), 0);         \
   }
 
-DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2)
-DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)
-DECL_IMAGE(0, image2d_t, float4, f, 2)
-
-// 1D Array
-#undef GET_IMAGE_ARRAY_SIZE
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDF
-#undef EXPEND_READ_COORDI
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, (int)0, ai, 2
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)ai
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), 0, (int)ai, 2
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, __gen_compute_array_index(coord.s1, cl_image), color
-#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
-  coord_type ai = __gen_compute_array_index(coord.s1, image);
-
-#define FIXUP_FLOAT_COORD(tmpCoord)                            \
-  {                                                            \
-    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
-      tmpCoord.s0 += -0x1p-9f;                                  \
+// For float coordinates
+#define DECL_READ_IMAGE1_1DArray(int_clamping_fix, image_data_type,           \
+                                 suffix, coord_type)                          \
+  OVERLOADABLE image_data_type read_image ##suffix(image1d_array_t cl_image,  \
+                                        const sampler_t sampler,              \
+                                        coord_type coord)                     \
+  {                                                                           \
+    coord_type tmpCoord = __gen_validate_array_index(coord, cl_image);        \
+    if (GEN_FIX_FLOAT_ROUNDING | int_clamping_fix) {                          \
+      if (__gen_sampler_need_fix(sampler)) {                                  \
+        if (GEN_FIX_FLOAT_ROUNDING &&                                         \
+            __gen_sampler_need_rounding_fix(sampler))                         \
+          tmpCoord = __gen_fixup_float_coord(tmpCoord);                       \
+        if (int_clamping_fix) {                                               \
+            if (!__gen_sampler_need_rounding_fix(sampler))                    \
+              tmpCoord = __gen_denormalize_coord(cl_image, tmpCoord);         \
+            float4 newCoord = __gen_fixup_1darray_coord(tmpCoord, cl_image);  \
+            return __gen_ocl_read_image ##suffix(                             \
+                     cl_image, sampler, convert_int4(newCoord), 2);         \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    return  __gen_ocl_read_image ##suffix(cl_image, sampler,                \
+                                          convert_float2 (tmpCoord), 0);      \
   }
 
-DECL_IMAGE(GEN_FIX_1, image1d_array_t, int4, i, 2)
-DECL_IMAGE(GEN_FIX_1, image1d_array_t, uint4, ui, 2)
-DECL_IMAGE(0, image1d_array_t, float4, f, 2)
+#define DECL_IMAGE_1DArray(int_clamping_fix, image_data_type, suffix)         \
+  DECL_READ_IMAGE0_1DArray(int_clamping_fix, image_data_type, suffix, int2)   \
+  DECL_READ_IMAGE1_1DArray(int_clamping_fix, image_data_type,                 \
+                           suffix, float2)                                    \
+  DECL_READ_IMAGE_NOSAMPLER(image1d_array_t, image_data_type, suffix, int2, 2)\
+  DECL_WRITE_IMAGE(image1d_array_t, image_data_type, suffix, int2)            \
+
+DECL_IMAGE_1DArray(GEN_FIX_INT_CLAMPING, int4, i)
+DECL_IMAGE_1DArray(GEN_FIX_INT_CLAMPING, uint4, ui)
+DECL_IMAGE_1DArray(0, float4, f)
+
+///////////////////////////////////////////////////////////////////////////////
+// Built-in Image Query Functions
+///////////////////////////////////////////////////////////////////////////////
+#define DECL_IMAGE_INFO_COMMON(image_type)                                    \
+  OVERLOADABLE  int get_image_channel_data_type(image_type image)             \
+  {                                                                           \
+    return __gen_ocl_get_image_channel_data_type(image);                 \
+  }                                                                           \
+  OVERLOADABLE  int get_image_channel_order(image_type image)                 \
+  {                                                                           \
+    return __gen_ocl_get_image_channel_order(image);                     \
+  }                                                                           \
+  OVERLOADABLE int get_image_width(image_type image)                          \
+  {                                                                           \
+    return __gen_ocl_get_image_width(image);                             \
+  }
 
-// 2D Info
+DECL_IMAGE_INFO_COMMON(image1d_t)
+DECL_IMAGE_INFO_COMMON(image1d_buffer_t)
+DECL_IMAGE_INFO_COMMON(image1d_array_t)
 DECL_IMAGE_INFO_COMMON(image2d_t)
+DECL_IMAGE_INFO_COMMON(image3d_t)
+DECL_IMAGE_INFO_COMMON(image2d_array_t)
+
+// 2D extra Info
 OVERLOADABLE int get_image_height(image2d_t image)
 {
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_height(surface_id);
+  return __gen_ocl_get_image_height(image);
 }
 OVERLOADABLE int2 get_image_dim(image2d_t image)
 {
   return (int2){get_image_width(image), get_image_height(image)};
 }
+// End of 2D
 
-// 1D Array info
-DECL_IMAGE_INFO_COMMON(image1d_array_t)
-OVERLOADABLE size_t get_image_array_size(image1d_array_t image)
-{
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_depth(surface_id);
-}
-
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDI
-#undef EXPEND_READ_COORDF
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-#undef GET_IMAGE_ARRAY_SIZE
-// End of 2D and 1D Array
-
-// 3D
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2, 1
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)coord.s2
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
-                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)(coord.s2 < 0 ? -1 : coord.s2), 1
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
-                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id); \
-                                                  dstCoord.z = srcCoord.z * __gen_ocl_get_image_depth(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
-
-#define FIXUP_FLOAT_COORD(tmpCoord)                             \
-  {                                                             \
-    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)              \
-      tmpCoord.s0 += -0x1p-9f;                                   \
-    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)              \
-      tmpCoord.s1 += -0x1p-9f;                                   \
-    if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20f)              \
-      tmpCoord.s2 += -0x1p-9f;                                   \
-  }
-#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
-
-DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)
-DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
-DECL_IMAGE(0, image3d_t, float4, f, 4)
-
-DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)
-DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
-DECL_IMAGE(0, image3d_t, float4, f, 3)
-
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDF
-#undef EXPEND_READ_COORDI
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-#undef GET_IMAGE_ARRAY_SIZE
-
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, ai, 1
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)ai
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
-                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)ai, 1
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
-                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, __gen_compute_array_index(coord.s2, cl_image), color
-
-#define FIXUP_FLOAT_COORD(tmpCoord)                             \
-  {                                                             \
-    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)              \
-      tmpCoord.s0 += -0x1p-9f;                                   \
-    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)              \
-      tmpCoord.s1 += -0x1p-9f;                                   \
-  }
-#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
-  coord_type ai = __gen_compute_array_index(coord.s2, image);
-
-// 2D Array
-DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 4)
-DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 4)
-DECL_IMAGE(0, image2d_array_t, float4, f, 4)
-
-DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 3)
-DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 3)
-DECL_IMAGE(0, image2d_array_t, float4, f, 3)
-
-// 3D Info
-DECL_IMAGE_INFO_COMMON(image3d_t)
+// 3D extra Info
 OVERLOADABLE int get_image_height(image3d_t image)
 {
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_height(surface_id);
+  return __gen_ocl_get_image_height(image);
 }
 OVERLOADABLE int get_image_depth(image3d_t image)
 {
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_depth(surface_id);
+  return __gen_ocl_get_image_depth(image);
 }
 OVERLOADABLE int4 get_image_dim(image3d_t image)
 {
-  return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0};
+  return (int4) (get_image_width(image),
+                 get_image_height(image),
+                 get_image_depth(image),
+                 0);
 }
 
-// 2D Array Info
-DECL_IMAGE_INFO_COMMON(image2d_array_t)
+// 2D Array extra Info
 OVERLOADABLE int get_image_height(image2d_array_t image)
 {
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_height(surface_id);
+  return __gen_ocl_get_image_height(image);
 }
 OVERLOADABLE int2 get_image_dim(image2d_array_t image)
 {
@@ -405,25 +538,12 @@ OVERLOADABLE int2 get_image_dim(image2d_array_t image)
 }
 OVERLOADABLE size_t get_image_array_size(image2d_array_t image)
 {
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_depth(surface_id);
+  return __gen_ocl_get_image_depth(image);
 }
 
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDF
-#undef EXPEND_READ_COORDI
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-#undef GET_IMAGE_ARRAY_SIZE
-// End of 3D and 2D Array
-
-#undef DECL_IMAGE
-#undef DECL_READ_IMAGE
-#undef DECL_READ_IMAGE_NOSAMPLER
-#undef DECL_WRITE_IMAGE
-#undef GEN_FIX_1
-// End of Image
-
-
-#undef GET_IMAGE
+// 1D Array info
+OVERLOADABLE size_t get_image_array_size(image1d_array_t image)
+{
+  return __gen_ocl_get_image_depth(image);
+}
+// End of 1DArray
diff --git a/backend/src/libocl/src/ocl_memcpy.ll b/backend/src/libocl/src/ocl_memcpy.ll
index fbc44d1..b3fadb2 100644
--- a/backend/src/libocl/src/ocl_memcpy.ll
+++ b/backend/src/libocl/src/ocl_memcpy.ll
@@ -550,3 +550,180 @@ while.body:                                       ; preds = %entry, %while.body
 while.end:                                        ; preds = %while.body, %entry
   ret void
 }
+
+define void @__gen_memcpy_gc_align(i8 addrspace(1)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(2)* %add.ptr to i32 addrspace(2)*
+  %1 = load i32 addrspace(2)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.1
+  %3 = load i8 addrspace(2)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pc_align(i8 addrspace(0)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(2)* %add.ptr to i32 addrspace(2)*
+  %1 = load i32 addrspace(2)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.1
+  %3 = load i8 addrspace(2)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_lc_align(i8 addrspace(3)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(2)* %add.ptr to i32 addrspace(2)*
+  %1 = load i32 addrspace(2)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(2)* %src, i32 %index.1
+  %3 = load i8 addrspace(2)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pc(i8 addrspace(0)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(2)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(2)*
+  %3 = load i8 addrspace(2)* %2, align 1
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_gc(i8 addrspace(1)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(2)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(2)*
+  %3 = load i8 addrspace(2)* %2, align 1
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_lc(i8 addrspace(3)* %dst, i8 addrspace(2)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  %cmp4 = icmp eq i32 %size, 0
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %0 = ptrtoint i8 addrspace(2)* %src to i32
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(2)*
+  %3 = load i8 addrspace(2)* %2, align 1
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 992727f..28b9c20 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -87,15 +87,11 @@ OVERLOADABLE ulong clz(ulong x) {
   return v;
 }
 
-OVERLOADABLE char popcount(char x) {
-  return x == 0 ? 0 : x < 0?__gen_ocl_cbit(x) - 24 : __gen_ocl_cbit(x);
-}
-OVERLOADABLE short popcount(short x) {
-  return x == 0 ? 0 : x < 0?__gen_ocl_cbit(x) - 16 : __gen_ocl_cbit(x);
-}
 #define SDEF(TYPE)        \
-OVERLOADABLE TYPE popcount(TYPE x){ return x == 0? 0:__gen_ocl_cbit(x);}
+OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
+SDEF(char);
 SDEF(uchar);
+SDEF(short);
 SDEF(ushort);
 SDEF(int);
 SDEF(uint);
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 236fa0b..49c4efa 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -19,6 +19,7 @@
 #include "ocl_float.h"
 #include "ocl_relational.h"
 #include "ocl_common.h"
+#include "ocl_integer.h"
 
 constant int __ocl_math_fastpath_flag = 1;
 
@@ -399,340 +400,161 @@ float __gen_ocl_scalbnf (float x, int n){
   return x*twom25;
 }
 
-
-__constant const float PIo2[] = {
-  1.5703125000e+00, /* 0x3fc90000 */
-  4.5776367188e-04, /* 0x39f00000 */
-  2.5987625122e-05, /* 0x37da0000 */
-  7.5437128544e-08, /* 0x33a20000 */
-  6.0026650317e-11, /* 0x2e840000 */
-  7.3896444519e-13, /* 0x2b500000 */
-  5.3845816694e-15, /* 0x27c20000 */
-  5.6378512969e-18, /* 0x22d00000 */
-  8.3009228831e-20, /* 0x1fc40000 */
-  3.2756352257e-22, /* 0x1bc60000 */
-  6.3331015649e-25, /* 0x17440000 */
+const __constant unsigned int two_over_pi[] = {
+0, 0, 0xA2F, 0x983, 0x6E4, 0xe44, 0x152, 0x9FC,
+0x275, 0x7D1, 0xF53, 0x4DD, 0xC0D, 0xB62,
+0x959, 0x93C, 0x439, 0x041, 0xFE5, 0x163,
 };
 
+// The main idea is from "Radian Reduction for Trigonometric Functions"
+// written by Mary H. Payne and Robert N. Hanek. Also another reference
+// is "A Continued-Fraction Analysis of Trigonometric Argument Reduction"
+// written by Roger Alan Smith, who gave the worst case in this paper.
+// For single-precision float, the worst case is x = 0x1.47d0fep34, where
+// there are 29 leading zero bits in the fraction part of x*(2.0/pi). So we
+// need at least 29 (leading zeros) + 24 (fraction) + 12 (integer) + guard
+// bits, that is, 65 + guard bits. As we calculate in 12*7 = 84 bits, we have
+// about 19 guard bits. If we need further precision, we may need more
+// guard bits.
+// Note we place two 0 in two_over_pi, which is used to handle input less
+// than 0x1.0p23
+
+int payne_hanek(float x, float *y) {
+  union { float f; unsigned u;} ieee;
+  ieee.f = x;
+  unsigned u = ieee.u;
+  int k = ((u & 0x7f800000) >> 23)-127;
+  int ma = (u & 0x7fffff) | 0x800000;
+  unsigned  high, low;
+  high = (ma & 0xfff000) >> 12;
+  low = ma & 0xfff;
+
+  // To tune the macros below, you need to fully understand the algorithm
+#define CALC_BLOCKS 7
+#define ZERO_BITS 2
 
-int __kernel_rem_pio2f(float *x, float *y, int e0, int nx, int prec, const __constant int *ipio2)
-{
-  /* copied from fdlibm */
-const float
-zero   = 0.0,
-one    = 1.0,
-two8   =  2.5600000000e+02, /* 0x43800000 */
-twon8  =  3.9062500000e-03; /* 0x3b800000 */
-
-  int init_jk[3]; /* initial value for jk */
-  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
-  float z,fw,f[20],fq[20],q[20];
-  init_jk[0] = 4; init_jk[1] = 7; init_jk[2] = 9;
-    /* initialize jk*/
-  jk = init_jk[prec];
-  jp = jk;
-
-    /* determine jx,jv,q0, note that 3>q0 */
-  jx =  nx-1;
-  jv = (e0-3)/8; if(jv<0) jv=0;
-  q0 =  e0-8*(jv+1);
-
-    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
-  j = jv-jx; m = jx+jk;
-  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (float) ipio2[j];
-
-    /* compute q[0],q[1],...q[jk] */
-  for (i=0;i<=jk;i++) {
-      for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
-  }
-
-  jz = jk;
-recompute:
-    /* distill q[] into iq[] reversingly */
-  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
-      fw    =  (float)((int)(twon8* z));
-      iq[i] =  (int)(z-two8*fw);
-      z     =  q[j-1]+fw;
-  }
-
-    /* compute n */
-  z  = __gen_ocl_scalbnf(z,q0);   /* actual value of z */
-  z -= (float)8.0*__gen_ocl_internal_floor(z*(float)0.125); /* trim off integer >= 8 */
-  n  = (int) z;
-  z -= (float)n;
-  ih = 0;
-  if(q0>0) {  /* need iq[jz-1] to determine n */
-      i  = (iq[jz-1]>>(8-q0)); n += i;
-      iq[jz-1] -= i<<(8-q0);
-      ih = iq[jz-1]>>(7-q0);
-  }
-  else if(q0==0) ih = iq[jz-1]>>8;
-  else if(z>=(float)0.5) ih=2;
-
-  if(ih>0) {  /* q > 0.5 */
-      n += 1; carry = 0;
-      for(i=0;i<jz ;i++) {  /* compute 1-q */
-    j = iq[i];
-    if(carry==0) {
-        if(j!=0) {
-      carry = 1; iq[i] = 0x100- j;
-        }
-    } else  iq[i] = 0xff - j;
-      }
-      if(q0>0) {    /* rare case: chance is 1 in 12 */
-          switch(q0) {
-          case 1:
-           iq[jz-1] &= 0x7f; break;
-        case 2:
-           iq[jz-1] &= 0x3f; break;
-          }
-      }
-      if(ih==2) {
-    z = one - z;
-    if(carry!=0) z -= __gen_ocl_scalbnf(one,q0);
-      }
-  }
+  unsigned result[CALC_BLOCKS];
 
-    /* check if recomputation is needed */
-  if(z==zero) {
-      j = 0;
-      for (i=jz-1;i>=jk;i--) j |= iq[i];
-      if(j==0) { /* need recomputation */
-    for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
+  // round down, note we need 2 bits integer precision
+  int index = (k-23-2) < 0 ? (k-23-2-11)/12 : (k-23-2)/12;
 
-    for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
-        f[jx+i] = (float) ipio2[jv+i];
-        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
-        q[i] = fw;
-    }
-    jz += k;
-    goto recompute;
-      }
+  for (int i = 0; i < CALC_BLOCKS; i++) {
+    result[i] =  low * two_over_pi[index+i+ZERO_BITS] ;
+    result[i] +=  high * two_over_pi[index+i+1+ZERO_BITS];
   }
 
-    /* chop off zero terms */
-  if(z==(float)0.0) {
-      jz -= 1; q0 -= 8;
-      while(iq[jz]==0) { jz--; q0-=8;}
-  } else { /* break z into 8-bit if necessary */
-      z = __gen_ocl_scalbnf(z,-q0);
-      if(z>=two8) {
-    fw = (float)((int)(twon8*z));
-    iq[jz] = (int)(z-two8*fw);
-    jz += 1; q0 += 8;
-    iq[jz] = (int) fw;
-      } else iq[jz] = (int) z ;
+  for (int i = CALC_BLOCKS-1; i > 0; i--) {
+    int temp = result[i] >> 12;
+    result[i]  -= temp << 12;
+    result[i-1] += temp;
   }
+#undef CALC_BLOCKS
+#undef ZERO_BITS
 
-    /* convert integer "bit" chunk to floating-point value */
-  fw = __gen_ocl_scalbnf(one,q0);
-  for(i=jz;i>=0;i--) {
-      q[i] = fw*(float)iq[i]; fw*=twon8;
-  }
+  // get number of integer digits in result[0], note we only consider 12 valid bits
+  // and also it means the fraction digits in result[0] is (12-intDigit)
 
-    /* compute PIo2[0,...,jp]*q[jz,...,0] */
-  for(i=jz;i>=0;i--) {
-      for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
-      fq[jz-i] = fw;
-  }
+  int intDigit = index*(-12) + (k-23);
 
-    /* compress fq[] into y[] */
-  switch(prec) {
-      case 0:
-    fw = 0.0;
-    for (i=jz;i>=0;i--) fw += fq[i];
-    y[0] = (ih==0)? fw: -fw;
-    break;
-      case 1:
-      case 2:
-    fw = 0.0;
-    for (i=jz;i>=0;i--) fw += fq[i];
-    y[0] = (ih==0)? fw: -fw;
-    fw = fq[0]-fw;
-    for (i=1;i<=jz;i++) fw += fq[i];
-    y[1] = (ih==0)? fw: -fw;
-    break;
-      case 3: /* painful */
-    for (i=jz;i>0;i--) {
-        fw      = fq[i-1]+fq[i];
-        fq[i]  += fq[i-1]-fw;
-        fq[i-1] = fw;
-    }
-    for (i=jz;i>1;i--) {
-        fw      = fq[i-1]+fq[i];
-        fq[i]  += fq[i-1]-fw;
-        fq[i-1] = fw;
-    }
-    for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
-    if(ih==0) {
-        y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
-    } else {
-        y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
-    }
-  }
-  return n&7;
+  // The integer bits may lie entirely in result[0], or be split between
+  // result[0] and result[1]. So we merge successive blocks into wider words,
+  // which makes the coding easier.
+
+  unsigned b0 = (result[0] << 12) | result[1];
+  unsigned b1 = (result[2] << 12) | result[3];
+  unsigned b2 = (result[4] << 12) | result[5];
+  unsigned b3 = (result[6] << 12);
+
+  unsigned intPart = b0 >> (24-intDigit);
+
+  unsigned fract1 = ((b0 << intDigit) | (b1 >> (24-intDigit))) & 0xffffff;
+  unsigned fract2 = ((b1 << intDigit) | (b2 >> (24-intDigit))) & 0xffffff;
+  unsigned fract3 = ((b2 << intDigit) | (b3 >> (24-intDigit))) & 0xffffff;
+
+  // Is the fraction larger than 0.5, i.e. the angle larger than pi/4? If so,
+  // we transform from [0,pi/2] to [-pi/4, pi/4] through -(1.0-fract)
+  int largerPiBy4 = ((fract1 & 0x800000) != 0);
+  int sign = largerPiBy4 ? 1 : 0;
+  intPart = largerPiBy4 ? (intPart+1) : intPart;
+
+  fract1 = largerPiBy4 ? (fract1 ^ 0x00ffffff) : fract1;
+  fract2 = largerPiBy4 ? (fract2 ^ 0x00ffffff) : fract2;
+  fract3 = largerPiBy4 ? (fract3 ^ 0x00ffffff) : fract3;
 
+  int leadingZero = (fract1 == 0);
+
+  // +1 is for the hidden bit 1 in floating-point format
+  int exponent = leadingZero ? -(24+1) : -(0+1);
+
+  fract1 = leadingZero ? fract2 : fract1;
+  fract2 = leadingZero ? fract3 : fract2;
+
+  // fract1 may have leading zeros; account for them in the exponent
+  int shift = clz(fract1)-8;
+  exponent += -shift;
+
+  float pio2 = 0x1.921fb6p+0;
+  unsigned fdigit = ((fract1 << shift) | (fract2 >> (24-shift))) & 0xffffff;
+
+  // we know that denormal number will not appear here
+  ieee.u = (sign << 31) | ((exponent+127) << 23) | (fdigit & 0x7fffff);
+  *y = ieee.f * pio2;
+  return intPart;
 }
 
-__constant const int npio2_hw[32] = {
-0x3fc90f00, 0x40490f00, 0x4096cb00, 0x40c90f00, 0x40fb5300, 0x4116cb00,
-0x412fed00, 0x41490f00, 0x41623100, 0x417b5300, 0x418a3a00, 0x4196cb00,
-0x41a35c00, 0x41afed00, 0x41bc7e00, 0x41c90f00, 0x41d5a000, 0x41e23100,
-0x41eec200, 0x41fb5300, 0x4203f200, 0x420a3a00, 0x42108300, 0x4216cb00,
-0x421d1400, 0x42235c00, 0x4229a500, 0x422fed00, 0x42363600, 0x423c7e00,
-0x4242c700, 0x42490f00
-};
+int argumentReduceSmall(float x, float * remainder) {
+  union {
+    float f;
+    unsigned u;
+  } ieee;
 
-__constant const int two_over_pi[22*9] = {
-0xA2, 0xF9, 0x83, 0x6E, 0x4E, 0x44, 0x15, 0x29, 0xFC,
-0x27, 0x57, 0xD1, 0xF5, 0x34, 0xDD, 0xC0, 0xDB, 0x62,
-0x95, 0x99, 0x3C, 0x43, 0x90, 0x41, 0xFE, 0x51, 0x63,
-0xAB, 0xDE, 0xBB, 0xC5, 0x61, 0xB7, 0x24, 0x6E, 0x3A,
-0x42, 0x4D, 0xD2, 0xE0, 0x06, 0x49, 0x2E, 0xEA, 0x09,
-0xD1, 0x92, 0x1C, 0xFE, 0x1D, 0xEB, 0x1C, 0xB1, 0x29,
-0xA7, 0x3E, 0xE8, 0x82, 0x35, 0xF5, 0x2E, 0xBB, 0x44,
-0x84, 0xE9, 0x9C, 0x70, 0x26, 0xB4, 0x5F, 0x7E, 0x41,
-0x39, 0x91, 0xD6, 0x39, 0x83, 0x53, 0x39, 0xF4, 0x9C,
-0x84, 0x5F, 0x8B, 0xBD, 0xF9, 0x28, 0x3B, 0x1F, 0xF8,
-0x97, 0xFF, 0xDE, 0x05, 0x98, 0x0F, 0xEF, 0x2F, 0x11,
-0x8B, 0x5A, 0x0A, 0x6D, 0x1F, 0x6D, 0x36, 0x7E, 0xCF,
-0x27, 0xCB, 0x09, 0xB7, 0x4F, 0x46, 0x3F, 0x66, 0x9E,
-0x5F, 0xEA, 0x2D, 0x75, 0x27, 0xBA, 0xC7, 0xEB, 0xE5,
-0xF1, 0x7B, 0x3D, 0x07, 0x39, 0xF7, 0x8A, 0x52, 0x92,
-0xEA, 0x6B, 0xFB, 0x5F, 0xB1, 0x1F, 0x8D, 0x5D, 0x08,
-0x56, 0x03, 0x30, 0x46, 0xFC, 0x7B, 0x6B, 0xAB, 0xF0,
-0xCF, 0xBC, 0x20, 0x9A, 0xF4, 0x36, 0x1D, 0xA9, 0xE3,
-0x91, 0x61, 0x5E, 0xE6, 0x1B, 0x08, 0x65, 0x99, 0x85,
-0x5F, 0x14, 0xA0, 0x68, 0x40, 0x8D, 0xFF, 0xD8, 0x80,
-0x4D, 0x73, 0x27, 0x31, 0x06, 0x06, 0x15, 0x56, 0xCA,
-0x73, 0xA8, 0xC9, 0x60, 0xE2, 0x7B, 0xC0, 0x8C, 0x6B,
-};
+  float twoByPi = 2.0f/3.14159265f;
+  float piBy2_1h = (float) 0xc90/0x1.0p11,
+        piBy2_1l = (float) 0xfda/0x1.0p23,
+        piBy2_2h = (float) 0xa22/0x1.0p35,
+        piBy2_2l = (float) 0x168/0x1.0p47,
+        piBy2_3h = (float) 0xc23/0x1.0p59,
+        piBy2_3l = (float) 0x4c4/0x1.0p71;
 
+  float y = (float)(int)(twoByPi * x + 0.5f);
+  ieee.f = y;
+  ieee.u = ieee.u & 0xfffff000;
 
-int __ieee754_rem_pio2f(float x, float *y) {
-  /* copied from fdlibm */
-  float z,w,t,r,fn;
-  float tx[3];
-
-const float half_value = 5.0000000e-1;
-const float zero =  0.0000000000;
-const float two8 =  2.5600000000e+02;
-const float invpio2 =  6.3661980629e-01;
-const float pio2_1  =  1.5707855225e+00;
-const float pio2_1t =  1.0804334124e-05;
-const float pio2_2  =  1.0804273188e-05;
-const float pio2_2t =  6.0770999344e-11;
-const float pio2_3  =  6.0770943833e-11;
-const float pio2_3t =  6.1232342629e-17;
-  int e0,i,j,nx,n,ix,hx;
+  float yh = ieee.f;
+  float yl = y - yh;
+  float rem = x - yh*piBy2_1h - yh*piBy2_1l - yl*piBy2_1h - yl*piBy2_1l;
+  rem = rem - yh*piBy2_2h - yh*piBy2_2l + yl*piBy2_2h + yl*piBy2_2l;
+  rem = rem - yh*piBy2_3h - yh*piBy2_3l - yl*piBy2_3h - yl*piBy2_3l;
 
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix<=0x3f490fd8)   /* |x| ~<= pi/4 , no need for reduction */
-      {y[0] = x; y[1] = 0; return 0;}
-  if(ix<0x4016cbe4) {  /* |x| < 3pi/4, special case with n=+-1 */
-      if(hx>0) {
-    z = x - pio2_1;
-    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
-        y[0] = z - pio2_1t;
-        y[1] = (z-y[0])-pio2_1t;
-    } else {    /* near pi/2, use 24+24+24 bit pi */
-        z -= pio2_2;
-        y[0] = z - pio2_2t;
-        y[1] = (z-y[0])-pio2_2t;
-    }
-    return 1;
-      } else {  /* negative x */
-    z = x + pio2_1;
-    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
-        y[0] = z + pio2_1t;
-        y[1] = (z-y[0])+pio2_1t;
-    } else {    /* near pi/2, use 24+24+24 bit pi */
-        z += pio2_2;
-        y[0] = z + pio2_2t;
-        y[1] = (z-y[0])+pio2_2t;
-    }
-    return -1;
-      }
-  }
-  if(ix<=0x43490f80) { /* |x| ~<= 2^7*(pi/2), medium size */
-      t  = __gen_ocl_fabs(x);
-      n  = (int) (t*invpio2+half_value);
-      fn = (float)n;
-      r  = t-fn*pio2_1;
-      w  = fn*pio2_1t;  /* 1st round good to 40 bit */
-      if(n<32&&(ix&0xffffff00)!=npio2_hw[n-1]) {
-    y[0] = r-w; /* quick check no cancellation */
-      } else {
-          uint high;
-          j  = ix>>23;
-          y[0] = r-w;
-    GEN_OCL_GET_FLOAT_WORD(high,y[0]);
-          i = j-((high>>23)&0xff);
-          if(i>8) {  /* 2nd iteration needed, good to 57 */
-        t  = r;
-        w  = fn*pio2_2;
-        r  = t-w;
-        w  = fn*pio2_2t-((t-r)-w);
-        y[0] = r-w;
-        GEN_OCL_GET_FLOAT_WORD(high,y[0]);
-        i = j-((high>>23)&0xff);
-        if(i>25)  { /* 3rd iteration need, 74 bits acc */
-          t  = r; /* will cover all possible cases */
-          w  = fn*pio2_3;
-          r  = t-w;
-          w  = fn*pio2_3t-((t-r)-w);
-          y[0] = r-w;
-        }
-    }
-      }
-      y[1] = (r-y[0])-w;
-      if(hx<0)  {y[0] = -y[0]; y[1] = -y[1]; return -n;}
-      else   return n;
-  }
-    /*
-     * all other (large) arguments
-     */
-  if(ix>=0x7f800000) {    /* x is inf or NaN */
-      y[0]=y[1]=x-x; return 0;
-  }
-    /* set z = scalbn(|x|,ilogb(x)-7) */
-  e0  = (ix>>23)-134;   /* e0 = ilogb(z)-7; */
-  GEN_OCL_SET_FLOAT_WORD(z, ix - ((int)(e0<<23)));
-  for(i=0;i<2;i++) {
-    tx[i] = (float)((int)(z));
-    z     = (z-tx[i])*two8;
+  *remainder = rem;
+  return (int)y;
+}
+
+
+int __ieee754_rem_pio2f(float x, float *y) {
+  if (x < 4000.0f) {
+    return argumentReduceSmall(x, y);
+  } else {
+    return payne_hanek(x, y);
   }
-  tx[2] = z;
-  nx = 3;
-  while(tx[nx-1]==zero) nx--; /* skip zero term */
-  n  =  __kernel_rem_pio2f(tx,y,e0,nx,2,two_over_pi);
-  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
-  return n;
 }
 
-OVERLOADABLE float __kernel_sinf(float x, float y, int iy)
+OVERLOADABLE float __kernel_sinf(float x)
 {
   /* copied from fdlibm */
-const float
-half_value =  5.0000000000e-01,/* 0x3f000000 */
-S1  = -1.6666667163e-01, /* 0xbe2aaaab */
-S2  =  8.3333337680e-03, /* 0x3c088889 */
-S3  = -1.9841270114e-04, /* 0xb9500d01 */
-S4  =  2.7557314297e-06, /* 0x3638ef1b */
-S5  = -2.5050759689e-08, /* 0xb2d72f34 */
-S6  =  1.5896910177e-10; /* 0x2f2ec9d3 */
+  const float
+  half_value =  5.0000000000e-01,/* 0x3f000000 */
+  S1  = -1.6666667163e-01, /* 0xbe2aaaab */
+  S2  =  8.3333337680e-03, /* 0x3c088889 */
+  S3  = -1.9841270114e-04, /* 0xb9500d01 */
+  S4  =  2.7557314297e-06, /* 0x3638ef1b */
+  S5  = -2.5050759689e-08, /* 0xb2d72f34 */
+  S6  =  1.5896910177e-10; /* 0x2f2ec9d3 */
   float z,r,v;
-  int ix;
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  ix &= 0x7fffffff;     /* high word of x */
-  if(ix<0x32000000)     /* |x| < 2**-27 */
-     {if((int)x==0) return x;}    /* generate inexact */
   z =  x*x;
   v =  z*x;
   r =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
-  if(iy==0) return x+v*(S1+z*r);
-  else      return x-((z*(half_value*y-v*r)-y)-v*S1);
+  return x+v*(S1+z*r);
 }
 
 float __kernel_cosf(float x, float y)
@@ -746,19 +568,10 @@ float __kernel_cosf(float x, float y)
   C4  = -2.7557314297e-07, /* 0xb493f27c */
   C5  =  2.0875723372e-09, /* 0x310f74f6 */
   C6  = -1.1359647598e-11; /* 0xad47d74e */
-  const float pio2_hi = 0x1.92p0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
   float a,hz,z,r,qx;
   int ix;
   GEN_OCL_GET_FLOAT_WORD(ix,x);
   ix &= 0x7fffffff;     /* ix = |x|'s high word*/
-  if(ix<0x32000000) {     /* if x < 2**27 */
-      if(((int)x)==0) return one;   /* generate inexact */
-  }
-
-  if(x < 0.0f) { x= -x; y = -y; }
-  if(ix > 0x3f490fdb) { /* |x|>pi/4*/
-    return -__kernel_sinf(x-pio2_hi-pio2_mid-pio2_low, y, 1);
-  }
   z  = x*x;
   r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
   if(ix < 0x3e99999a)       /* if |x| < 0.3 */
@@ -775,29 +588,26 @@ OVERLOADABLE float sin(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_sin(x);
 
-  /* copied from fdlibm */
-  float y[2],z=0.0;
+  float y,z=0.0;
   int n, ix;
 
+  float negative = x < 0.0f? -1.0f : 1.0f;
+  x = negative * x;
+
   GEN_OCL_GET_FLOAT_WORD(ix,x);
 
-    /* |x| ~< pi/4 */
   ix &= 0x7fffffff;
-  if(ix <= 0x3f490fd8) return __kernel_sinf(x,z,0);
 
     /* sin(Inf or NaN) is NaN */
-  else if (ix>=0x7f800000) return x-x;
+  if (ix>=0x7f800000) return x-x;
 
     /* argument reduction needed */
   else {
-      n = __ieee754_rem_pio2f(x,y);
-      switch(n&3) {
-    case 0: return  __kernel_sinf(y[0],y[1],1);
-    case 1: return  __kernel_cosf(y[0],y[1]);
-    case 2: return -__kernel_sinf(y[0],y[1],1);
-    default:
-      return -__kernel_cosf(y[0],y[1]);
-      }
+      n = __ieee754_rem_pio2f(x,&y);
+      float s = __kernel_sinf(y);
+      float c = __kernel_cosf(y,0.0f);
+      float ret = (n&1) ? negative*c : negative*s;
+      return (n&3)> 1? -1.0f*ret : ret;
   }
 }
 
@@ -805,29 +615,32 @@ OVERLOADABLE float cos(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_cos(x);
 
-  /* copied from fdlibm */
-  float y[2],z=0.0;
+  float y,z=0.0;
   int n, ix;
-
+  x = __gen_ocl_fabs(x);
   GEN_OCL_GET_FLOAT_WORD(ix,x);
 
-    /* |x| ~< pi/4 */
   ix &= 0x7fffffff;
-  if(ix <= 0x3f490fd8) return __kernel_cosf(x,z);
 
     /* cos(Inf or NaN) is NaN */
-  else if (ix>=0x7f800000) return x-x;
+  if (ix>=0x7f800000) return x-x;
 
     /* argument reduction needed */
   else {
-      n = __ieee754_rem_pio2f(x,y);
-      switch(n&3) {
-    case 0: return  __kernel_cosf(y[0],y[1]);
-    case 1: return  -__kernel_sinf(y[0],y[1],1);
-    case 2: return -__kernel_cosf(y[0],y[1]);
-    default:
-      return __kernel_sinf(y[0],y[1],1);
-      }
+      n = __ieee754_rem_pio2f(x,&y);
+      n &= 3;
+      float c = __kernel_cosf(y, 0.0f);
+      float s = __kernel_sinf(y);
+      float v = (n&1) ? s : c;
+      /* n&3   return
+          0    cos(y)
+          1   -sin(y)
+          2   -cos(y)
+          3    sin(y)
+      */
+      int mask = (n>>1) ^ n;
+      float sign = (mask&1) ? -1.0f : 1.0f;
+      return sign * v;
   }
 }
 
@@ -908,46 +721,27 @@ float __kernel_tanf(float x, float y, int iy)
 
 OVERLOADABLE float tan(float x)
 {
-
     if (__ocl_math_fastpath_flag)
       return __gen_ocl_internal_fastpath_tan(x);
 
-  /* copied from fdlibm */
-        const float pio2_hi = 0x1.92p-0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
-        const float pio4  =  7.8539812565e-01;
-        float y[2],z=0.0;
-        int n, ix;
+    float y,z=0.0;
+    int n, ix;
+    float negative = x < 0.0f? -1.0f : 1.0f;
+    x = negative * x;
 
-        GEN_OCL_GET_FLOAT_WORD(ix,x);
+    GEN_OCL_GET_FLOAT_WORD(ix,x);
 
-    /* |x| ~< pi/4 */
-        ix &= 0x7fffffff;
-        if(ix <= 0x3f490fda) return __kernel_tanf(x,z,1);
+    ix &= 0x7fffffff;
 
     /* tan(Inf or NaN) is NaN */
-        else if (ix>=0x7f800000) return x-x;            /* NaN */
+    if (ix>=0x7f800000) return x-x;            /* NaN */
 
     /* argument reduction needed */
-      else {
-        n = __ieee754_rem_pio2f(x,y);
-
-        x = y[0];
-        float m = y[1];
-        int iy = 1-((n&1)<<1);
-        GEN_OCL_GET_FLOAT_WORD(ix,x);
-        float sign = 1.0f;
-        if(ix < 0) {
-          x = -x; m = -m;
-          sign = -1.0f;
-        }
-
-        if(x > pio4) {/* reduce x to less than pi/4 through (pi/2-x) */
-          float t = __kernel_tanf(pio2_hi-x+pio2_mid+pio2_low, -m, 1);
-          if(iy == -1) return sign*(-t); else return sign*1/t;
-        } else
-            return __kernel_tanf(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
+    else {
+      n = __ieee754_rem_pio2f(x,&y);
+      return negative * __kernel_tanf(y,0.0f,1-((n&1)<<1)); /*   1 -- n even
                                                               -1 -- n odd */
-      }
+    }
 }
 
 OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
@@ -967,13 +761,13 @@ OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
     return __kernel_cosf(m*M_PI_F, 0.0f);
    case 1:
    case 2:
-    return __kernel_sinf((0.5f-m)*M_PI_F, 0.0f, 0);
+    return __kernel_sinf((0.5f-m)*M_PI_F);
    case 3:
    case 4:
     return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
    case 5:
    case 6:
-    return __kernel_sinf((m-1.5f)*M_PI_F, 0.0f, 0);
+    return __kernel_sinf((m-1.5f)*M_PI_F);
    default:
     return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
    }
@@ -994,18 +788,18 @@ OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
 
   switch(ix) {
    case 0:
-    return sign*__kernel_sinf(m*M_PI_F, 0.0f, 0);
+    return sign*__kernel_sinf(m*M_PI_F);
    case 1:
    case 2:
     return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
    case 3:
    case 4:
-    return -sign*__kernel_sinf((m-1.0f)*M_PI_F, 0.0f, 0);
+    return -sign*__kernel_sinf((m-1.0f)*M_PI_F);
    case 5:
    case 6:
     return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
    default:
-    return -sign*__kernel_sinf((2.0f-m)*M_PI_F, 0.0f, 0);
+    return -sign*__kernel_sinf((2.0f-m)*M_PI_F);
    }
 
 }
@@ -1499,8 +1293,7 @@ union {float f; unsigned i;} u;
     return (float)(e-127);
   }
 }
-#define FP_ILOGB0 (-0x7FFFFFFF-1)
-#define FP_ILOGBNAN FP_ILOGB0
+
 OVERLOADABLE int ilogb(float x) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_ilogb(x);
@@ -1528,7 +1321,33 @@ OVERLOADABLE float nan(uint code) {
   return NAN;
 }
 OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
-  return native_tan(x * M_PI_F);
+  float sign = 1.0f;
+  int ix;
+  if(isinf(x)) return NAN;
+  if(x < 0.0f) { x = -x; sign = -1.0f; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 0.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  int n = __gen_ocl_internal_floor(m*4.0f);
+  if(m == 0.5f) {
+    return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
+  }
+  if(m == 0.0f) {
+    return (ix&0x1) == 0 ? 0.0f : -0.0f;
+  }
+
+  switch(n) {
+    case 0:
+      return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
+    case 1:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+    case 2:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+    default:
+      return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
+  }
 }
 OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
   /* copied from fdlibm */
@@ -1833,63 +1652,6 @@ OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
 }
 
 OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
-  uint ix = as_uint(x), iy = as_uint(y),
-       pos_zero = 0, neg_zero = 0x80000000u,
-       pos_inf = 0x7f800000, neg_inf = 0xff800000u;
-  if(iy == pos_zero) {
-    if(ix == pos_zero)
-      return 0;
-    if(ix == neg_zero)
-      return 1;
-    if(x < 0)
-      return 1;
-    if(x > 0)
-      return 0;
-  }
-  if(iy == neg_zero) {
-    if(ix == pos_zero)
-      return -0.f;
-    if(ix == neg_zero)
-      return -1;
-    if(x < 0)
-      return -1;
-    if(x > 0)
-      return -0.f;
-  }
-  if((ix & 0x7fffffff) == 0) {
-    if(y < 0)
-      return -.5f;
-    if(y > 0)
-      return .5f;
-  }
-  if(ix == pos_inf) {
-    if(y > 0 && iy != pos_inf)
-      return 0;
-    if(y < 0 && iy != neg_inf)
-      return -0.f;
-  }
-  if(ix == neg_inf) {
-    if(y > 0 && iy != pos_inf)
-      return 1;
-    if(y < 0 && iy != neg_inf)
-      return -1;
-  }
-  if(iy == pos_inf) {
-    if(ix == pos_inf)
-      return 0.25f;
-    if(ix == neg_inf)
-      return 0.75f;
-    if(x >= 0 || x <= 0)
-      return 0.5f;
-  }
-  if(iy == neg_inf) {
-    if(ix == pos_inf)
-      return -0.25f;
-    if(ix == neg_inf)
-      return -0.75f;
-    if(x >= 0 || x <= 0)
-      return -0.5f;
-  }
   return __gen_ocl_internal_atan2(y, x) / M_PI_F;
 }
 OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
@@ -1901,7 +1663,6 @@ OVERLOADABLE float __gen_ocl_internal_round(float x) {
   return y;
 }
 OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
 OVERLOADABLE float __gen_ocl_internal_rint(float x) {
   return __gen_ocl_rnde(x);
 }
@@ -1916,8 +1677,15 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
   float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
   u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
   twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
-  ivln2	 =	1.4426950216e+00; /* 0x3fb8aa3b =1/ln2 */
-  float y,hi=0.0,lo=0.0,t;
+  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one = 1.0,
+  huge = 1.0e+30,
+  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+  P2 = -2.7777778450e-03, /* 0xbb360b61 */
+  P3 = 6.6137559770e-05, /* 0x388ab355 */
+  P4 = -1.6533901999e-06, /* 0xb5ddea0e */
+  P5 =	4.1381369442e-08; /* 0x3331bb4c */
+  float y,hi=0.0,lo=0.0,c,t;
   int k=0,xsb;
   unsigned hx;
   float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
@@ -1933,16 +1701,17 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
 
   /* filter out non-finite argument */
   if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
-    // native_exp already handled this
-    return native_exp(x);
+    if(hx>0x7f800000)
+      return x+x;			/* NaN */
+    if(hx==0x7f800000)
+      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
+    if(x > o_threshold) return huge*huge; /* overflow */
+    if(x < u_threshold) return twom100*twom100; /* underflow */
   }
-
   /* argument reduction */
   if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
     if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
-      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0);
-      lo= xsb == 1? ln2LO_1 : ln2LO_0;
-      k = 1-xsb-xsb;
+      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
     } else {
       float tmp = xsb == 1 ? half_1 : half_0;
       k  = ivln2*x+tmp;
@@ -1952,8 +1721,18 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
     }
     x  = hi - lo;
   }
+  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
+    if(huge+x>one) return one+x;/* trigger inexact */
+  }
+  else k = 0;
 
-  y = native_exp(x);
+  /* x is now in primary range */
+  t  = x*x;
+  c  = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+  if(k==0)
+    return one-((x*c)/(c-(float)2.0)-x);
+  else
+    y = one-((lo-(x*c)/((float)2.0-c))-hi);
   if(k >= -125) {
     unsigned hy;
     GEN_OCL_GET_FLOAT_WORD(hy,y);
@@ -2630,7 +2409,6 @@ OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
 }
 
 OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
-  if(!__ocl_finitef(x)||x==(float)0.0) return x;
   x = __gen_ocl_scalbnf(x,n);
   return x;
 }
@@ -2668,10 +2446,10 @@ OVERLOADABLE float __gen_ocl_internal_exp10(float x){
   P[3] = 2.034649854009453E+000;
   P[4] = 2.650948748208892E+000;
   P[5] = 2.302585167056758E+000;
-  if( isinf(x))
-    return INFINITY;
 
-  if( x < -MAXL10 )return 0.0;
+  if( x < -MAXL10 ) return 0.0;
+
+  if( isinf(x))  return INFINITY;
   /* The following is necessary because range reduction blows up: */
   if( x == 0 )return 1.0;
 
@@ -2836,6 +2614,7 @@ OVERLOADABLE float ldexp(float x, int n) {
   if (__ocl_math_fastpath_flag)
     return __gen_ocl_internal_fastpath_ldexp(x, n);
 
+  if (x == (float)0.0f) x = 0.0f;
   return __gen_ocl_internal_ldexp(x, n);
 }
 
@@ -2942,6 +2721,21 @@ OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
     return y;
   return x > y ? (x - y) : +0.f;
 }
+/*
+ * The high-precision pow/pown implementations are copied from the msun library.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
 
 OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   float z,ax,z_h,z_l,p_h,p_l;
@@ -2991,6 +2785,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   }
    /* y==zero: x**0 = 1 */
   if(iy==0) return one;
+  /* pow(+1, y) returns 1 for any y, even a NAN */
   if(hx==0x3f800000) return one;
   /* +-NaN return x+y */
   if(ix > 0x7f800000 || iy > 0x7f800000)
@@ -3090,6 +2885,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
     GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
     t_l = ax - (t_h-bp[k]);
     s_l = v*((u-s_h*t_h)-s_h*t_l);
+
     /* compute log(ax) */
     s2 = s*s;
     r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
@@ -3097,7 +2893,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
     s2  = s_h*s_h;
     t_h = 3.0f+s2+r;
     GEN_OCL_GET_FLOAT_WORD(is,t_h);
-    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xfffff000);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
     t_l = r-((t_h-3.0f)-s2);
     /* u+v = s*(1+...) */
     u = s_h*t_h;
@@ -3105,7 +2901,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
     /* 2/(3log2)*(s+...) */
     p_h = u+v;
     GEN_OCL_GET_FLOAT_WORD(is,p_h);
-    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xfffff000);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
     p_l = v-(p_h-u);
     z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
     z_l = cp_l*p_h+p_l*cp+dp_l[k];
@@ -3113,13 +2909,13 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
     t = (float)n;
     t1 = (((z_h+z_l)+dp_h[k])+t);
     GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
     t2 = z_l-(((t1-t)-dp_h[k])-z_h);
   }
 
   /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
   GEN_OCL_GET_FLOAT_WORD(is,y);
-  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
   p_l = (y-y1)*t1+y*t2;
   p_h = y1*t1;
   z = p_l+p_h;
@@ -3167,6 +2963,209 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   return sn*z;
 }
 
+float __gen_ocl_internal_pown(float x, int y) {
+  const float
+  bp[] = {1.0, 1.5,},
+  dp_h[] = { 0.0, 5.84960938e-01,}, /* 0x3f15c000 */
+  dp_l[] = { 0.0, 1.56322085e-06,}, /* 0x35d1cfdc */
+  zero    =  0.0,
+  one =  1.0,
+  two =  2.0,
+  two24 =  16777216.0,  /* 0x4b800000 */
+  huge  =  1.0e30,
+  tiny    =  1.0e-30,
+    /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3) */
+  L1  =  6.0000002384e-01, /* 0x3f19999a */
+  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
+  L3  =  3.3333334327e-01, /* 0x3eaaaaab */
+  L4  =  2.7272811532e-01, /* 0x3e8ba305 */
+  L5  =  2.3066075146e-01, /* 0x3e6c3255 */
+  L6  =  2.0697501302e-01, /* 0x3e53f142 */
+  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
+  P2   = -2.7777778450e-03, /* 0xbb360b61 */
+  P3   =  6.6137559770e-05, /* 0x388ab355 */
+  P4   = -1.6533901999e-06, /* 0xb5ddea0e */
+  P5   =  4.1381369442e-08, /* 0x3331bb4c */
+  lg2  =  6.9314718246e-01, /* 0x3f317218 */
+  lg2_h  =  0x1.62ep-1,
+  lg2_l  =  0x1.0bfbe8p-15,
+  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
+  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+
+  float z,ax,z_h,z_l,p_h,p_l;
+  float y1,t1,t2,r,s,t,u,v,w;
+  int i,j,k,yisint,n;
+  int hx,ix,iy,is;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  iy = y > 0 ? y&0x7fffffff : (-y)&0x7fffffff;
+    /* y==zero: x**0 = 1 */
+  if(y==0) return one;
+
+    /* +-NaN return NAN */
+  if(ix > 0x7f800000)
+    return NAN;
+
+    /* determine if y is an odd int
+     * yisint = 1 ... y is an odd int
+     * yisint = 2 ... y is an even int
+     */
+    yisint = y&1 ? 1 : 2;
+
+  if (y == 1) return x;
+  if (y == -1) return one/x;
+  if (y == 2) return x*x;
+
+  ax   = __gen_ocl_fabs(x);
+
+   /* special value of x */
+  if(ix==0x7f800000||ix==0||ix==0x3f800000){
+      z = ax;     /*x is +-0,+-inf,+-1*/
+      if(y<0) z = one/z; /* z = (1/|x|) */
+      if(hx<0) {
+      if(yisint==1)
+        z = -z;   /* (x<0)**odd = -(|x|**odd) */
+      }
+      return z;
+  }
+
+  float sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+  if(((((unsigned)hx>>31)-1)|(yisint-1))==0)
+      sn = -one; /* (-ve)**(odd int) */
+
+    /* |y| is huge */
+  if(iy>0x08000000) { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one */
+      if(ix<0x3f7ffff8) return (y<0)? sn*huge*huge:tiny*tiny;
+      if(ix>0x3f800007) return (y>0)? sn*huge*huge:tiny*tiny;
+    /* now |1-x| is tiny <= 2**-20, suffice to compute
+     log(x) by x-x^2/2+x^3/3-x^4/4 */
+      t = ax-1;   /* t has 20 trailing zeros */
+      w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
+      u = ivln2_h*t;  /* ivln2_h has 16 sig. bits */
+      v = t*ivln2_l-w*ivln2;
+      t1 = u+v;
+      GEN_OCL_GET_FLOAT_WORD(is,t1);
+      GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+      t2 = v-(t1-u);
+  } else {
+    float s2,s_h,s_l,t_h,t_l;
+    n = 0;
+    /* take care of subnormal numbers (the scaling below is commented out) */
+//      if(ix<0x00800000)
+//    {ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+    n  += ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+    /* determine interval */
+    ix = j|0x3f800000;    /* normalize ix */
+    if(j<=0x1cc471) k=0;  /* |x|<sqrt(3/2) */
+    else if(j<0x5db3d7) k=1;  /* |x|<sqrt(3)   */
+    else {k=0;n+=1;ix -= 0x00800000;}
+    GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+    /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+    u = ax-bp[k];   /* bp[0]=1.0, bp[1]=1.5 */
+    v = one/(ax+bp[k]);
+    s = u*v;
+    s_h = s;
+    GEN_OCL_GET_FLOAT_WORD(is,s_h);
+    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+
+    /* t_h=ax+bp[k] High */
+    GEN_OCL_SET_FLOAT_WORD(t_h, (((ix>>1)|0x20000000)+0x00400000+(k<<21)) &0xfffff000);
+    t_l = ax - (t_h-bp[k]);
+    s_l = v*((u-s_h*t_h)-s_h*t_l);
+
+
+    /* compute log(ax) */
+    s2 = s*s;
+    r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+    r += s_l*(s_h+s);
+    s2  = s_h*s_h;
+    t_h = (float)3.0+s2+r;
+    GEN_OCL_GET_FLOAT_WORD(is,t_h);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+    t_l = r-((t_h-(float)3.0)-s2);
+    /* u+v = s*(1+...) */
+    u = s_h*t_h;
+    v = s_l*t_h+t_l*s;
+    /* 2/(3log2)*(s+...) */
+    p_h = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,p_h);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+    p_l = v-(p_h-u);
+    z_h = cp_h*p_h;   /* cp_h+cp_l = 2/(3*log2) */
+    z_l = cp_l*p_h+p_l*cp+dp_l[k];
+    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+    t = (float)n;
+    t1 = (((z_h+z_l)+dp_h[k])+t);
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+  }
+
+  /* split up y into y1+y2+y3 and compute (y1+y2+y3)*(t1+t2) */
+
+  float fy = (float)y;
+  float y3 = (float)(y-(int)fy);
+  GEN_OCL_GET_FLOAT_WORD(is,fy);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+
+  p_l = (fy-y1)*t1 + y3*t1 + fy*t2 + y3*t2;
+  p_h = y1*t1;
+  z = p_l+p_h;
+
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  if (j>0x43000000)       /* if z > 128 */
+      return sn*huge*huge;       /* overflow */
+  else if (j==0x43000000) {     /* if z == 128 */
+      if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
+  }
+  else if ((j&0x7fffffff)>0x43160000)   /* z <= -150 */
+      return sn*tiny*tiny;       /* underflow */
+  else if (j==0xc3160000){      /* z == -150 */
+      if(p_l<=z-p_h) return sn*tiny*tiny;    /* underflow */
+  }
+    /*
+     * compute 2**(p_h+p_l)
+     */
+  i = j&0x7fffffff;
+  k = (i>>23)-0x7f;
+  n = 0;
+  if(i>0x3f000000) {    /* if |z| > 0.5, set n = [z+0.5] */
+      n = j+(0x00800000>>(k+1));
+      k = ((n&0x7fffffff)>>23)-0x7f;  /* new k for n */
+      GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+      n = ((n&0x007fffff)|0x00800000)>>(23-k);
+      if(j<0) n = -n;
+      p_h -= t;
+
+      z -= n;
+  }
+
+  t = z;
+  GEN_OCL_GET_FLOAT_WORD(is,t);
+  GEN_OCL_SET_FLOAT_WORD(t,is&0xfffff000);
+  u = t*lg2_h;
+  v = (p_l-(t-p_h))*lg2+t*lg2_l;
+  z = u+v;
+  w = v-(z-u);
+  t  = z*z;
+  t1  = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+  r  = (z*t1)/(t1-two)-(w+z*w);
+  z  = one-(r-z);
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  j += (n<<23);
+  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);  /* subnormal output */
+  else GEN_OCL_SET_FLOAT_WORD(z,j);
+  return sn*z;
+}
 
 OVERLOADABLE float hypot(float x, float y) {
   if (__ocl_math_fastpath_flag)
@@ -3216,6 +3215,8 @@ OVERLOADABLE float fract(float x, private float *p) { BODY; }
   int n,hx,hy,hz,ix,iy,sx,i,sy; \
   uint q,sxy; \
   Zero[0] = 0.0;Zero[1] = -0.0; \
+  if (x == 0.0f) { x = 0.0f; }; \
+  if (y == 0.0f) { y = 0.0f; }\
   GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_GET_FLOAT_WORD(hy,y); \
   sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
   hx ^=sx; hy &= 0x7fffffff; \
@@ -3295,30 +3296,99 @@ OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
 OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
 #undef BODY
 
+OVERLOADABLE float powr(float x, float y) {
+  unsigned int hx, sx, hy, sy;
+
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_pow(x,y);
+  else {
+    if (isnan(x) || isnan(y)) return NAN;
+    GEN_OCL_GET_FLOAT_WORD(hx,x);
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    sx = (hx & 0x80000000) >> 31;
+    sy = (hy & 0x80000000) >> 31;
+
+    if ((hx&0x7fffffff) < 0x00800000) {	   /* x < 2**-126  */
+      x = 0.0f;/* Gen does not support subnormal numbers for now */
+      hx = hx &0x80000000;
+    }
+    if ((hy&0x7fffffff) < 0x00800000) {	  /* y < 2**-126  */
+      y = 0.0;/* Gen does not support subnormal numbers for now */
+      hy = hy &0x80000000;
+    }
+
+    // (x < 0) ** y = NAN (y!=0)
+    if ((sx && (hx & 0x7fffffff))) return NAN;
+
+    // +/-0 ** +/-0 = NAN
+    if ( !(hx&0x7fffffff) && !(hy&0x7fffffff)) return NAN;
+
+    // +inf ** +/-0 = NAN
+    if ( ((hx & 0x7f800000) ==0x7f800000) && !(hy&0x7fffffff)) return NAN;
+
+    // others except nan/inf/0 ** 0 = 1.0
+    if (!(hy&0x7fffffff)) return 1.0f;
+
+    // +1 ** inf = NAN; +1 ** finite = 1;
+    if (hx == 0x3f800000) {
+      return isinf(y) ? NAN : 1.0f;
+    }
+
+    if ( !(hx & 0x7fffffff)) {
+        // +/-0 ** y<0 = +inf
+        // +/-0 ** y>0 = +0
+      return sy ? INFINITY : 0.0f;
+    }
+
+    return __gen_ocl_internal_pow(x,y);
+  }
+}
+
 OVERLOADABLE float pown(float x, int n) {
-  if (x == 0.f && n == 0)
-    return 1.f;
-  if (x < 0.f && (n&1) )
-    return -powr(-x, n);
-  return powr(x, n);
+  if (__ocl_math_fastpath_flag) {
+    if (x == 0.f && n == 0)
+      return 1.f;
+    if (x < 0.f && (n&1) )
+      return -powr(-x, n);
+    return powr(x, n);
+  } else {
+    int ix;
+    GEN_OCL_GET_FLOAT_WORD(ix, x);
+    float sign = ix < 0 ? -1.0f : 1.0f;
+    if (x == 0.0f) x = sign * 0.0f;
+
+    return __gen_ocl_internal_pown(x, n);
+  }
 }
 
 OVERLOADABLE float pow(float x, float y) {
-  int n;
-  if (x == 0.f && y == 0.f)
-    return 1.f;
-  if (x >= 0.f)
-    return powr(x, y);
-  n = y;
-  if ((float)n == y)//is exact integer
-    return pown(x, n);
-  return NAN;
+  if (!__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_pow(x,y);
+  else {
+    int n;
+    if (x == 0.f && y == 0.f)
+      return 1.f;
+    if (x >= 0.f)
+      return powr(x, y);
+    n = y;
+    if ((float)n == y)//is exact integer
+      return pown(x, n);
+    return NAN;
+  }
 }
 
 OVERLOADABLE float rootn(float x, int n) {
   float ax,re;
   int sign = 0;
+  int hx;
   if( n == 0 )return NAN;
+
+  GEN_OCL_GET_FLOAT_WORD(hx, x);
+  // Gen does not support denorm, flush to zero
+  if ((hx & 0x7fffffff) < 0x00800000) {
+    x = hx < 0 ? -0.0f : 0.0f;
+  }
+
   //rootn ( x, n )  returns a NaN for x < 0 and n is even.
   if( x < 0 && 0 == (n&1) )
     return NAN;
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h
index 69ee3f3..0075797 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
@@ -117,4 +117,18 @@ OVERLOADABLE float native_sin(float x);
 OVERLOADABLE float native_sqrt(float x);
 OVERLOADABLE float native_tan(float x);
 
-// half  not supported now.
+// half
+#define half_cos cos
+#define half_divide native_divide
+#define half_exp native_exp
+#define half_exp2 native_exp2
+#define half_exp10 native_exp10
+#define half_log native_log
+#define half_log2 native_log2
+#define half_log10 native_log10
+#define half_powr powr
+#define half_recip native_recip
+#define half_rsqrt native_rsqrt
+#define half_sin sin
+#define half_sqrt native_sqrt
+#define half_tan tan
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index f5e9f81..d3058d6 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -170,6 +170,13 @@ namespace gbe
     builtinFuncs.push_back("__gen_memset_g_align");
     builtinFuncs.push_back("__gen_memset_l_align");
 
+    builtinFuncs.push_back("__gen_memcpy_pc");
+    builtinFuncs.push_back("__gen_memcpy_gc");
+    builtinFuncs.push_back("__gen_memcpy_lc");
+
+    builtinFuncs.push_back("__gen_memcpy_pc_align");
+    builtinFuncs.push_back("__gen_memcpy_gc_align");
+    builtinFuncs.push_back("__gen_memcpy_lc_align");
 
     for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) {
       if (SF->isDeclaration()) continue;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 558491f..8d22c4e 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -257,9 +257,10 @@ namespace gbe
   /*! Get number of element to process dealing either with a vector or a scalar
    *  value
    */
-  static ir::Type getVectorInfo(ir::Context &ctx, Type *llvmType, Value *value, uint32_t &elemNum, bool useUnsigned = false)
+  static ir::Type getVectorInfo(ir::Context &ctx, Value *value, uint32_t &elemNum, bool useUnsigned = false)
   {
     ir::Type type;
+    Type *llvmType = value->getType();
     if (llvmType->isVectorTy() == true) {
       VectorType *vectorType = cast<VectorType>(llvmType);
       Type *elementType = vectorType->getElementType();
@@ -285,7 +286,6 @@ namespace gbe
       case 1: return ir::MEM_GLOBAL;
       case 2: return ir::MEM_CONSTANT;
       case 3: return ir::MEM_LOCAL;
-      case 4: return ir::IMAGE;
     }
     GBE_ASSERT(false);
     return ir::MEM_GLOBAL;
@@ -464,10 +464,16 @@ namespace gbe
      */
     set<const Value*> conditionSet;
     map<const Value*, int> globalPointer;
+    typedef map<const Value*, int>::iterator GlobalPtrIter;
+
     /*!
      *  <phi,phiCopy> node information for later optimization
      */
     map<const ir::Register, const ir::Register> phiMap;
+
+    map<Value *, SmallVector<Value *, 4>> pointerOrigMap;
+    typedef map<Value *, SmallVector<Value *, 4>>::iterator PtrOrigMapIter;
+
     /*! We visit each function twice. Once to allocate the registers and once to
      *  emit the Gen IR instructions
      */
@@ -529,14 +535,22 @@ namespace gbe
       bool bKernel = isKernelFunction(F);
       if(!bKernel) return false;
 
+      analyzePointerOrigin(F);
       LI = &getAnalysis<LoopInfo>();
       emitFunction(F);
       phiMap.clear();
       globalPointer.clear();
+      pointerOrigMap.clear();
       // Reset for next function
       btiBase = BTI_RESERVED_NUM;
       return false;
     }
+    /*! Given a possible pointer value, find the escape points of interest,
+        such as load/store or atomic instructions */
+    void findPointerEscape(Value *ptr);
+    /*! For all possible pointers (GlobalVariable, function pointer argument,
+        alloca instruction), find their pointer escape points */
+    void analyzePointerOrigin(Function &F);
 
     virtual bool doFinalization(Module &M) { return false; }
     /*! handle global variable register allocation (local, constant space) */
@@ -615,6 +629,7 @@ namespace gbe
     void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
 
     uint8_t appendSampler(CallSite::arg_iterator AI);
+    uint8_t getImageID(CallInst &I);
 
     // These instructions are not supported at all
     void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
@@ -647,6 +662,83 @@ namespace gbe
   };
 
   char GenWriter::ID = 0;
+
+  /*! Follow the users of ptr transitively and record ptr in pointerOrigMap
+   *  as an origin of each load, store and direct non-intrinsic call reached.
+   *  A store that only uses the tracked value as its stored operand is not
+   *  followed. */
+  void GenWriter::findPointerEscape(Value *ptr) {
+    std::vector<Value*> workList;
+    std::set<Value *> visited;
+
+    // Nothing to analyze for a value without users.
+    if (ptr->use_empty()) return;
+
+    workList.push_back(ptr);
+
+    // workList grows while it is being scanned, hence the index-based loop.
+    for (unsigned i = 0; i < workList.size(); i++) {
+      Value *work = workList[i];
+      if (work->use_empty()) continue;
+
+      for (Value::use_iterator iter = work->use_begin(); iter != work->use_end(); ++iter) {
+      // After LLVM 3.5, use_iterator points to 'Use' instead of 'User',
+      // which is more straightforward.
+  #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+        User *theUser = *iter;
+  #else
+        User *theUser = iter->getUser();
+  #endif
+        if (visited.find(theUser) != visited.end()) continue;
+        // pointer address is used as the ValueOperand in store instruction, should be skipped
+        if (StoreInst *store = dyn_cast<StoreInst>(theUser)) {
+          if (store->getValueOperand() == work) {
+            continue;
+          }
+        }
+
+        visited.insert(theUser);
+
+        if (CallInst *call = dyn_cast<CallInst>(theUser)) {
+          // Indirect calls and LLVM intrinsics are neither recorded nor followed.
+          Function *callee = call->getCalledFunction();
+          if (!callee || callee->getIntrinsicID() != 0) continue;
+        } else if (!isa<LoadInst>(theUser) && !isa<StoreInst>(theUser)) {
+          // Not a memory access itself: keep following its users.
+          workList.push_back(theUser);
+          continue;
+        }
+
+        // Load, store or direct call: remember ptr as one of its origins.
+        // operator[] creates the (empty) origin list on first use.
+        pointerOrigMap[theUser].push_back(ptr);
+      }
+    }
+  }
+
+  /*! Run findPointerEscape on every pointer origin considered for F: module
+   *  globals in use, pointer-typed arguments and entry-block allocas. */
+  void GenWriter::analyzePointerOrigin(Function &F) {
+    // Global variables for which isConstantUsed() holds.
+    Module::GlobalListType &globals = const_cast<Module::GlobalListType &> (TheModule->getGlobalList());
+    for (auto gv = globals.begin(); gv != globals.end(); ++gv) {
+      if (gv->isConstantUsed())
+        findPointerEscape(&*gv);
+    }
+
+    // Function arguments of pointer type.
+    for (Function::arg_iterator arg = F.arg_begin(), argEnd = F.arg_end(); arg != argEnd; ++arg) {
+      if (!arg->getType()->isPointerTy()) continue;
+      findPointerEscape(arg);
+    }
+
+    // Alloca instructions; only the entry block of F is scanned.
+    BasicBlock &entryBlock = F.getEntryBlock();
+    for (BasicBlock::iterator inst = entryBlock.begin(), instEnd = entryBlock.end(); inst != instEnd; ++inst) {
+      if (AllocaInst *allocaInst = dyn_cast<AllocaInst>(inst)) findPointerEscape(allocaInst);
+    }
+  }
+
   void getSequentialData(const ConstantDataSequential *cda, void *ptr, uint32_t &offset) {
     StringRef data = cda->getRawDataValues();
     memcpy((char*)ptr+offset, data.data(), data.size());
@@ -757,7 +849,7 @@ namespace gbe
       if(!v.isConstantUsed()) continue;
       const char *name = v.getName().data();
       unsigned addrSpace = v.getType()->getAddressSpace();
-      if(addrSpace == ir::AddressSpace::MEM_CONSTANT) {
+      if(addrSpace == ir::AddressSpace::MEM_CONSTANT || v.isConstant()) {
         GBE_ASSERT(v.hasInitializer());
         const Constant *c = v.getInitializer();
         Type * type = c->getType();
@@ -811,7 +903,7 @@ namespace gbe
       vector<ir::ImmediateIndex> immVector;
       for (uint32_t i = 0; i < cv->getNumOperands(); i++)
         immVector.push_back(processConstantImmIndex(cv->getOperand(i)));
-      return ctx.newImmediate(immVector);
+      return ctx.newImmediate(immVector, getType(ctx, cv->getType()->getElementType()));
     }
   }
 
@@ -954,11 +1046,25 @@ namespace gbe
 
     if (dyn_cast<ConstantExpr>(CPV)) {
       ConstantExpr *ce = dyn_cast<ConstantExpr>(CPV);
+
+      if (!isScalarType(ce->getType())) {
+        VectorType *vecType = cast<VectorType>(ce->getType());
+        GBE_ASSERT(ce->getOpcode() == Instruction::BitCast);
+        GBE_ASSERT(isScalarType(vecType->getElementType()));
+        ir::Type elemType = getType(ctx, vecType->getElementType());
+
+        const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
+        const ir::Immediate imm = ctx.getImmediate(immIndex);
+        GBE_ASSERT(vecType->getNumElements() == imm.getElemNum() &&
+                   getTypeByteSize(unit, vecType->getElementType()) == imm.getTypeSize());
+        return ctx.processImm(ir::IMM_BITCAST, immIndex, elemType);
+      }
       ir::Type type = getType(ctx, ce->getType());
       switch (ce->getOpcode()) {
         default:
           ce->dump();
           GBE_ASSERT(0 && "unsupported ce opcode.\n");
+        case Instruction::FPTrunc:
         case Instruction::Trunc:
         {
           const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
@@ -975,6 +1081,9 @@ namespace gbe
         case Instruction::FPToSI:
         case Instruction::SIToFP:
         case Instruction::UIToFP:
+        case Instruction::SExt:
+        case Instruction::ZExt:
+        case Instruction::FPExt:
         {
           const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
           switch (ce->getOpcode()) {
@@ -984,15 +1093,26 @@ namespace gbe
             case Instruction::FPToSI: return ctx.processImm(ir::IMM_FPTOSI, immIndex, type);
             case Instruction::SIToFP: return ctx.processImm(ir::IMM_SITOFP, immIndex, type);
             case Instruction::UIToFP: return ctx.processImm(ir::IMM_UITOFP, immIndex, type);
+            case Instruction::SExt:  return ctx.processImm(ir::IMM_SEXT, immIndex, type);
+            case Instruction::ZExt:  return ctx.processImm(ir::IMM_ZEXT, immIndex, type);
+            case Instruction::FPExt: return ctx.processImm(ir::IMM_FPEXT, immIndex, type);
           }
         }
+
+        case Instruction::ExtractElement:
         case Instruction::FCmp:
         case Instruction::ICmp:
+        case Instruction::FAdd:
         case Instruction::Add:
         case Instruction::Sub:
+        case Instruction::FSub:
         case Instruction::Mul:
+        case Instruction::FMul:
         case Instruction::SDiv:
+        case Instruction::UDiv:
+        case Instruction::FDiv:
         case Instruction::SRem:
+        case Instruction::FRem:
         case Instruction::Shl:
         case Instruction::AShr:
         case Instruction::LShr:
@@ -1005,15 +1125,23 @@ namespace gbe
           default:
             //ce->dump();
             GBE_ASSERTM(0, "Unsupported constant expression.\n");
+
+          case Instruction::ExtractElement:
+            return ctx.processImm(ir::IMM_EXTRACT, lhs, rhs, type);
           case Instruction::Add:
+          case Instruction::FAdd:
             return ctx.processImm(ir::IMM_ADD, lhs, rhs, type);
+          case Instruction::FSub:
           case Instruction::Sub:
             return ctx.processImm(ir::IMM_SUB, lhs, rhs, type);
           case Instruction::Mul:
+          case Instruction::FMul:
             return ctx.processImm(ir::IMM_MUL, lhs, rhs, type);
           case Instruction::SDiv:
+          case Instruction::FDiv:
             return ctx.processImm(ir::IMM_DIV, lhs, rhs, type);
           case Instruction::SRem:
+          case Instruction::FRem:
             return ctx.processImm(ir::IMM_REM, lhs, rhs, type);
           case Instruction::Shl:
             return ctx.processImm(ir::IMM_SHL, lhs, rhs, type);
@@ -1126,37 +1254,18 @@ namespace gbe
       return pointer_reg;
     }
     else if (expr->getOpcode() == Instruction::GetElementPtr) {
-      uint32_t TypeIndex;
       uint32_t constantOffset = 0;
 
       Value *pointer = val;
       CompositeType* CompTy = cast<CompositeType>(pointer->getType());
       for(uint32_t op=1; op<expr->getNumOperands(); ++op) {
-        uint32_t offset = 0;
+        int32_t TypeIndex;
         ConstantInt* ConstOP = dyn_cast<ConstantInt>(expr->getOperand(op));
-        GBE_ASSERT(ConstOP);
+        if (ConstOP == NULL)
+          goto error;
         TypeIndex = ConstOP->getZExtValue();
-        if (op == 1) {
-          if (TypeIndex != 0) {
-            Type *elementType = (cast<PointerType>(pointer->getType()))->getElementType();
-            uint32_t elementSize = getTypeByteSize(unit, elementType);
-            uint32_t align = getAlignmentByte(unit, elementType);
-            elementSize += getPadding(elementSize, align);
-            offset += elementSize * TypeIndex;
-          }
-        } else {
-          for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
-          {
-            Type* elementType = CompTy->getTypeAtIndex(ty_i);
-            uint32_t align = getAlignmentByte(unit, elementType);
-            offset += getPadding(offset, align);
-            offset += getTypeByteSize(unit, elementType);
-          }
-          const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
-          offset += getPadding(offset, align);
-        }
-
-        constantOffset += offset;
+        GBE_ASSERT(TypeIndex >= 0);
+        constantOffset += getGEPConstOffset(unit, CompTy, TypeIndex);
         CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
       }
 
@@ -1172,10 +1281,11 @@ namespace gbe
       ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg);
       return reg;
     }
-    else {
-      GBE_ASSERT(0 && "Unsupported constant expression");
-      return regTranslator.getScalar(val, elemID);
-    }
+
+error:
+    expr->dump();
+    GBE_ASSERT(0 && "Unsupported constant expression");
+    return regTranslator.getScalar(val, elemID);
   }
 
   ir::Register GenWriter::getConstantRegister(Constant *c, uint32_t elemID) {
@@ -1309,6 +1419,32 @@ namespace gbe
     }
   }
 
+  /*! Running counts of the image arguments seen so far, split by access kind */
+  struct ImageArgsInfo{
+    uint32_t readImageArgs;   // incremented by collectImageArgs for read images
+    uint32_t writeImageArgs;  // incremented by collectImageArgs for write images
+  };
+
+  /*! Count one image argument as read or write according to accessQual.
+   *  A qualifier containing "read", or neither "read" nor "write", counts
+   *  as a read image (default is read_only per spec). */
+  static void collectImageArgs(std::string& accessQual, ImageArgsInfo& imageArgsInfo)
+  {
+    const bool hasRead = accessQual.find("read") != std::string::npos;
+    const bool hasWrite = accessQual.find("write") != std::string::npos;
+
+    // "read" takes precedence, so only a write without read is a write image.
+    if (!hasRead && hasWrite)
+    {
+      imageArgsInfo.writeImageArgs++;
+      GBE_ASSERT(imageArgsInfo.writeImageArgs <= BTI_MAX_WRITE_IMAGE_ARGS);
+      return;
+    }
+
+    imageArgsInfo.readImageArgs++;
+    GBE_ASSERT(imageArgsInfo.readImageArgs <= BTI_MAX_READ_IMAGE_ARGS);
+  }
+
   void GenWriter::emitFunctionPrototype(Function &F)
   {
     GBE_ASSERTM(F.hasStructRetAttr() == false,
@@ -1415,6 +1551,7 @@ namespace gbe
     // Loop over the arguments and output registers for them
     if (!F.arg_empty()) {
       uint32_t argID = 0;
+      ImageArgsInfo imageArgsInfo = {};
       Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
 
       // Insert a new register for each function argument
@@ -1427,18 +1564,13 @@ namespace gbe
 
         llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
         llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
-        if (llvmInfo.typeName.find("image") != std::string::npos &&
-            llvmInfo.typeName.find("*") != std::string::npos) {
-          uint32_t start = llvmInfo.typeName.find("image");
-          uint32_t end = llvmInfo.typeName.find("*");
-          llvmInfo.typeName = llvmInfo.typeName.substr(start, end - start);
-        }
         llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
         llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
         llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
 
         // function arguments are uniform values.
         this->newRegister(I, NULL, true);
+
         // add support for vector argument.
         if(type->isVectorTy()) {
           VectorType *vectorType = cast<VectorType>(type);
@@ -1461,6 +1593,19 @@ namespace gbe
         GBE_ASSERTM(isScalarType(type) == true,
                     "vector type in the function argument is not supported yet");
         const ir::Register reg = getRegister(I);
+        if (llvmInfo.isImageType()) {
+          ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, 4, 4, 0);
+          ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
+          collectImageArgs(llvmInfo.accessQual, imageArgsInfo);
+          continue;
+        }
+
+        if (llvmInfo.isSamplerType()) {
+          ctx.input(argName, ir::FunctionArgument::SAMPLER, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
+          (void)ctx.getFunction().getSamplerSet()->append(reg, &ctx);
+          continue;
+        }
+
         if (type->isPointerTy() == false)
           ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
         else {
@@ -1495,10 +1640,6 @@ namespace gbe
               case ir::MEM_CONSTANT:
                 ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg,  llvmInfo, ptrSize, align, 0x2);
               break;
-              case ir::IMAGE:
-                ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, ptrSize, align, 0x0);
-                ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
-              break;
               default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
             }
           }
@@ -1832,7 +1973,7 @@ namespace gbe
         this->newRegister(const_cast<GlobalVariable*>(&v));
         ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
         ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32));
-      } else if(addrSpace == ir::MEM_CONSTANT) {
+      } else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) {
         GBE_ASSERT(v.hasInitializer());
         this->newRegister(const_cast<GlobalVariable*>(&v));
         ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
@@ -1850,7 +1991,7 @@ namespace gbe
           ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase);
           globalPointer.insert(std::make_pair(&v, incBtiBase()));
           regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
-	} else if(v.getName().str().substr(0, 4) == ".str") {
+        } else if(v.getName().str().substr(0, 4) == ".str") {
           /* When there are multi printf statements in multi kernel fucntions within the same
              translate unit, if they have the same sting parameter, such as
              kernel_func1 () {
@@ -1863,12 +2004,12 @@ namespace gbe
              So when translating the kernel_func1, we can not unref that global var, so we will
              get here. Just ignore it to avoid assert. */
         } else {
-          GBE_ASSERT(0);
+          GBE_ASSERT(0 && "Unsupported private memory access pattern");
         }
       }
     }
-
   }
+
   static INLINE void findAllLoops(LoopInfo * LI, std::vector<std::pair<Loop*, int>> &lp)
   {
       for (Loop::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) {
@@ -1941,6 +2082,25 @@ namespace gbe
       fn.addLoop(loopBBs, loopExits);
     }
   }
+
+
+  // Number of successors of bb's terminator instruction.
+  static unsigned getChildNo(BasicBlock *bb) {
+    return bb->getTerminator()->getNumSuccessors();
+  }
+
+  // Return the successor of bb at position index, or NULL when index is not
+  // smaller than the number of successors of bb's terminator.
+  static BasicBlock *getChildPossible(BasicBlock *bb, unsigned index) {
+    TerminatorInst *terminator = bb->getTerminator();
+
+    // Guard first: getSuccessor is only called with an in-range index.
+    if (index >= terminator->getNumSuccessors())
+      return NULL;
+
+    return terminator->getSuccessor(index);
+  }
+
 /*!
 
   Sorting Basic blocks is mainly used to solve register liveness issue, take a
@@ -1958,7 +2118,7 @@ namespace gbe
   |
    -->7
 
-  A register %10 defined in bb4, and used in bb5 & bb6. In normal liveness
+  1.) A register %10 defined in bb4, and used in bb5 & bb6. In normal liveness
   analysis, %10 is not alive in bb3. But under simd execution model, after
   executing bb4, some channel jump through bb5 to bb3, other channel may jump
   to bb6, we must execute bb3 first, then bb6, to avoid missing instructions.
@@ -1969,25 +2129,80 @@ namespace gbe
   we can see the bb3 will be placed after bb5 & bb6. The liveness calculation
   is just as normal and will be correct.
 
-  Another advantage of sorting basic blocks is reducing register pressure.
+  2.) Another advantage of sorting basic blocks is reducing register pressure.
   In the above CFG, a register defined in bb3 and used in bb7 will be
   alive through 3,4,5,6,7. But in fact it should be only alive in bb3 and bb7.
   After topological sorting, this kind of register would be only alive in bb3
   and bb7. Register pressure in 4,5,6 is reduced.
-*/
 
+  3.) Classical post-order traversal will automatically choose an order for the
+  successors of a basic block, but this order may be hard to handle. Take a look
+  at the CFG below:
+
+       1 <-----
+      /        |
+      2 --> 4 -
+      |
+      3
+      |
+      5
+  In the post-order traversal, it may be: 5->4->3->2->1, as 4 and 3 do not have
+  a strict order. This is a serious issue: a value defined in bb3 and used in bb5
+  may be overwritten in bb1. Remember the simd execution model? Some lanes
+  may execute bb4 after other lanes finish bb3, and then jump to bb1, but the live
+  range of the register does not cover bb1. What we do here is, for a loop
+  exit (here bb3), we always make sure it is visited first in the post-order
+  traversal; for the graph, that means 5->3->4->2->1. Then a value defined in bb3
+  and used in bb5 will not interfere with any other values defined in the loop.
+  FIXME: For an irreducible graph, we need to identify it and convert it to a reducible graph.
+*/
   void GenWriter::sortBasicBlock(Function &F) {
-    typedef ReversePostOrderTraversal<Function*> RPOTType;
-    RPOTType rpot(&F);
-    Function::BasicBlockListType &bbList = F.getBasicBlockList();
+    BasicBlock &entry = F.getEntryBlock();
+    std::vector<BasicBlock *> visitStack; // explicit depth-first search stack
+    std::vector<BasicBlock *> sorted;     // blocks in post-order, filled as they are popped
+    std::set<BasicBlock *> visited;       // blocks already pushed on visitStack
+
+    visitStack.push_back(&entry);
+    visited.insert(&entry);
+
+    while (!visitStack.empty()) {
+      BasicBlock *top = visitStack.back();
+      unsigned childNo = getChildNo(top);
+      GBE_ASSERT(childNo <= 2); // only child0/child1 are examined below
+
+      BasicBlock *child0 = getChildPossible(top, 0);
+      BasicBlock *child1 = getChildPossible(top, 1);
+      if(childNo == 2) {
+        Loop *loop = LI->getLoopFor(top);
+        // Visit the loop exit node first, so that the loop exit block is placed
+        // after the blocks of the loop in the 'reverse post-order' list.
+        if (loop && loop->contains(child0) && !loop->contains(child1)) {
+          BasicBlock *tmp = child0; child0 = child1; child1 = tmp; // swap: the exit (old child1) is tried first
+        }
+      }
 
-    for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) {
-      (*bbI)->removeFromParent();
+      if (child0 != NULL && visited.find(child0) == visited.end()) {
+        visitStack.push_back(child0);
+        visited.insert(child0);
+      } else if (child1 != NULL && visited.find(child1) == visited.end()) {
+        visitStack.push_back(child1);
+        visited.insert(child1);
+      } else {
+        // No unvisited successor is left: top is finished, emit it in post-order.
+        sorted.push_back(visitStack.back());
+        visitStack.pop_back();
+      }
     }
-    for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) {
-      bbList.push_back(*bbI);
+
+    Function::BasicBlockListType &bbList = F.getBasicBlockList();
+    for (std::vector<BasicBlock *>::iterator iter = sorted.begin(); iter != sorted.end(); ++iter) {
+      (*iter)->removeFromParent();
+    }
+
+    for (std::vector<BasicBlock *>::reverse_iterator iter = sorted.rbegin(); iter != sorted.rend(); ++iter) { // reverse of post-order
+      bbList.push_back(*iter);
     }
   }
+
   void GenWriter::emitFunction(Function &F)
   {
     switch (F.getCallingConv()) {
@@ -2324,8 +2539,8 @@ namespace gbe
         Value *srcValue = I.getOperand(0);
         Value *dstValue = &I;
         uint32_t srcElemNum = 0, dstElemNum = 0 ;
-        ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, srcElemNum);
-        ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), dstValue, dstElemNum);
+        ir::Type srcType = getVectorInfo(ctx, srcValue, srcElemNum);
+        ir::Type dstType = getVectorInfo(ctx, dstValue, dstElemNum);
         // As long and double are not compatible in register storage
         // and we do not support double yet, simply put an assert here
         GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE));
@@ -2587,16 +2802,8 @@ namespace gbe
 
     // Get the name of the called function and handle it
     const std::string fnName = Callee->getName();
-    auto it = instrinsicMap.map.find(fnName);
-    // FIXME, should create a complete error reporting mechanism
-    // when found error in beignet managed passes including Gen pass.
-    if (it == instrinsicMap.map.end()) {
-      std::cerr << "Unresolved symbol: " << fnName << std::endl;
-      std::cerr << "Aborting..." << std::endl;
-      exit(-1);
-    }
-    GBE_ASSERT(it != instrinsicMap.map.end());
-    switch (it->second) {
+    auto genIntrinsicID = intrinsicMap.find(fnName);
+    switch (genIntrinsicID) {
       case GEN_OCL_GET_GROUP_ID0:
         regTranslator.newScalarProxy(ir::ocl::groupid0, dst); break;
       case GEN_OCL_GET_GROUP_ID1:
@@ -2693,39 +2900,17 @@ namespace gbe
       case GEN_OCL_LGBARRIER:
         ctx.getFunction().setUseSLM(true);
         break;
-      case GEN_OCL_WRITE_IMAGE_I_1D:
-      case GEN_OCL_WRITE_IMAGE_UI_1D:
-      case GEN_OCL_WRITE_IMAGE_F_1D:
-      case GEN_OCL_WRITE_IMAGE_I_2D:
-      case GEN_OCL_WRITE_IMAGE_UI_2D:
-      case GEN_OCL_WRITE_IMAGE_F_2D:
-      case GEN_OCL_WRITE_IMAGE_I_3D:
-      case GEN_OCL_WRITE_IMAGE_UI_3D:
-      case GEN_OCL_WRITE_IMAGE_F_3D:
+      case GEN_OCL_WRITE_IMAGE_I:
+      case GEN_OCL_WRITE_IMAGE_UI:
+      case GEN_OCL_WRITE_IMAGE_F:
         break;
-      case GEN_OCL_READ_IMAGE_I_1D:
-      case GEN_OCL_READ_IMAGE_UI_1D:
-      case GEN_OCL_READ_IMAGE_F_1D:
-      case GEN_OCL_READ_IMAGE_I_2D:
-      case GEN_OCL_READ_IMAGE_UI_2D:
-      case GEN_OCL_READ_IMAGE_F_2D:
-      case GEN_OCL_READ_IMAGE_I_3D:
-      case GEN_OCL_READ_IMAGE_UI_3D:
-      case GEN_OCL_READ_IMAGE_F_3D:
-
-      case GEN_OCL_READ_IMAGE_I_1D_I:
-      case GEN_OCL_READ_IMAGE_UI_1D_I:
-      case GEN_OCL_READ_IMAGE_F_1D_I:
-      case GEN_OCL_READ_IMAGE_I_2D_I:
-      case GEN_OCL_READ_IMAGE_UI_2D_I:
-      case GEN_OCL_READ_IMAGE_F_2D_I:
-      case GEN_OCL_READ_IMAGE_I_3D_I:
-      case GEN_OCL_READ_IMAGE_UI_3D_I:
-      case GEN_OCL_READ_IMAGE_F_3D_I:
+      case GEN_OCL_READ_IMAGE_I:
+      case GEN_OCL_READ_IMAGE_UI:
+      case GEN_OCL_READ_IMAGE_F:
       {
         // dst is a 4 elements vector. We allocate all 4 registers here.
         uint32_t elemNum;
-        (void)getVectorInfo(ctx, I.getType(), &I, elemNum);
+        (void)getVectorInfo(ctx, &I, elemNum);
         GBE_ASSERT(elemNum == 4);
         this->newRegister(&I);
         break;
@@ -2822,7 +3007,7 @@ namespace gbe
     const ir::Register dst = this->getRegister(&I);
 
     ir::BTI bti;
-    gatherBTI(*AI, bti);
+    gatherBTI(&I, bti);
     vector<ir::Register> src;
     uint32_t srcNum = 0;
     while(AI != AE) {
@@ -2843,7 +3028,7 @@ namespace gbe
       // This is not a kernel argument sampler, we need to append it to sampler set,
       // and allocate a sampler slot for it.
       const ir::Immediate &x = processConstantImm(CPV);
-      GBE_ASSERTM(x.getType() == ir::TYPE_U16 || x.getType() == ir::TYPE_S16, "Invalid sampler type");
+      GBE_ASSERTM(x.getType() == ir::TYPE_U32 || x.getType() == ir::TYPE_S32, "Invalid sampler type");
 
       index = ctx.getFunction().getSamplerSet()->append(x.getIntegerValue(), &ctx);
     } else {
@@ -2853,6 +3038,11 @@ namespace gbe
     return index;
   }
 
+  uint8_t GenWriter::getImageID(CallInst &I) {
+    const ir::Register imageReg = this->getRegister(I.getOperand(0)); // operand 0 of the call is taken as the image
+    return ctx.getFunction().getImageSet()->getIdx(imageReg); // index of that register in the function's image set
+  }
+
   void GenWriter::emitCallInst(CallInst &I) {
     if (Function *F = I.getCalledFunction()) {
       if (F->getIntrinsicID() != 0) {
@@ -2905,7 +3095,8 @@ namespace gbe
             ctx.ADD(dst0Type, dst0, src0, src1);
 
             ir::Register overflow = this->getRegister(&I, 1);
-            ctx.LT(dst0Type, overflow, dst0, src1);
+            const ir::Type unsignedType = makeTypeUnsigned(dst0Type);
+            ctx.LT(unsignedType, overflow, dst0, src1);
           }
           break;
           case Intrinsic::usub_with_overflow:
@@ -2965,51 +3156,43 @@ namespace gbe
                 break;
               case 4:
                 {
-                  ir::Type srcType = getUnsignedType(ctx, llvmDstType);
+                  ir::Type srcType = getType(ctx, llvmDstType);
                   ir::Register tmp1 = ctx.reg(getFamily(srcType));
                   ir::Register tmp2 = ctx.reg(getFamily(srcType));
                   ir::Register tmp3 = ctx.reg(getFamily(srcType));
                   ir::Register tmp4 = ctx.reg(getFamily(srcType));
                   ir::Register tmp5 = ctx.reg(getFamily(srcType));
                   ir::Register tmp6 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp7 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp8 = ctx.reg(getFamily(srcType));
 
                   ir::Register regDWMask = ctx.reg( ir::FAMILY_DWORD );
-                  ir::Register regShift = ctx.reg( ir::FAMILY_DWORD );
-                  ir::ImmediateIndex wMask = ctx.newIntegerImmediate(0x000000FF, ir::TYPE_S32);
-                  ir::ImmediateIndex shift = ctx.newIntegerImmediate(24, ir::TYPE_S32);
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask);
-                  ctx.AND(srcType, tmp1, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift, shift);
-                  ctx.SHL(srcType, tmp2, tmp1, regShift);
+                  ir::Register regShift_8 = ctx.reg( ir::FAMILY_DWORD );
+                  ir::Register regShift_24 = ctx.reg( ir::FAMILY_DWORD );
+                  ir::ImmediateIndex wMask_L = ctx.newIntegerImmediate(0x0000FF00, ir::TYPE_S32);
+                  ir::ImmediateIndex wMask_H = ctx.newIntegerImmediate(0x00FF0000, ir::TYPE_S32);
+                  ir::ImmediateIndex shift_8 = ctx.newIntegerImmediate(8, ir::TYPE_S32);
+                  ir::ImmediateIndex shift_24 = ctx.newIntegerImmediate(24, ir::TYPE_S32);
+
+                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
+                  ctx.SHL(srcType, tmp1, src0, regShift_24);
+
+                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_L);
+                  ctx.AND(srcType, tmp2, src0, regDWMask);
+                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
+                  ctx.SHL(srcType, tmp3, tmp2, regShift_8);
 
-                  wMask = ctx.newIntegerImmediate(0x0000FF00, ir::TYPE_S32);
-                  shift = ctx.newIntegerImmediate(8, ir::TYPE_S32);
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask);
-                  ctx.AND(srcType, tmp3, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift, shift);
-                  ctx.SHL(srcType, tmp4, tmp3, regShift);
-
-                  wMask = ctx.newIntegerImmediate(0x00FF0000, ir::TYPE_S32);
-                  shift = ctx.newIntegerImmediate(8, ir::TYPE_S32);
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask);
-                  ctx.AND(srcType, tmp5, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift, shift);
-                  ctx.SHR(srcType, tmp6, tmp5, regShift);
-
-                  wMask = ctx.newIntegerImmediate(0xFF000000, ir::TYPE_S32);
-                  shift = ctx.newIntegerImmediate(24, ir::TYPE_S32);
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask);
-                  ctx.AND(srcType, tmp7, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift, shift);
-                  ctx.SHR(srcType, tmp8, tmp7, regShift);
-
-                  ir::Register tmp9 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp10 = ctx.reg(getFamily(srcType));
-                  ctx.OR(srcType, tmp9, tmp2, tmp4);
-                  ctx.OR(srcType, tmp10, tmp6, tmp8);
-                  ctx.OR(srcType, dst0, tmp9, tmp10);
+                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_H);
+                  ctx.AND(srcType, tmp4, src0, regDWMask);
+                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
+                  ctx.SHR(makeTypeUnsigned(srcType), tmp5, tmp4, regShift_8);
+
+                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
+                  ctx.SHR(makeTypeUnsigned(srcType), tmp6, src0, regShift_24);
+
+                  ir::Register tmp7 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp8 = ctx.reg(getFamily(srcType));
+                  ctx.OR(srcType, tmp7, tmp1, tmp3);
+                  ctx.OR(srcType, tmp8, tmp5, tmp6);
+                  ctx.OR(srcType, dst0, tmp7, tmp8);
                 }
                 break;
               case 8:
@@ -3023,12 +3206,10 @@ namespace gbe
           default: NOT_IMPLEMENTED;
         }
       } else {
-        int image_dim;
         // Get the name of the called function and handle it
         Value *Callee = I.getCalledValue();
         const std::string fnName = Callee->getName();
-        auto it = instrinsicMap.map.find(fnName);
-        GBE_ASSERT(it != instrinsicMap.map.end());
+        auto genIntrinsicID = intrinsicMap.find(fnName);
 
         // Get the function arguments
         CallSite CS(&I);
@@ -3037,7 +3218,7 @@ namespace gbe
         CallSite::arg_iterator AE = CS.arg_end();
 #endif /* GBE_DEBUG */
 
-        switch (it->second) {
+        switch (genIntrinsicID) {
           case GEN_OCL_POW:
           {
             const ir::Register src0 = this->getRegister(*AI); ++AI;
@@ -3139,196 +3320,111 @@ namespace gbe
           case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
           case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
           {
-            GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
+            const uint8_t imageID = getImageID(I);
+            GBE_ASSERT(AI != AE); ++AI;
             const ir::Register reg = this->getRegister(&I, 0);
-            int infoType = it->second - GEN_OCL_GET_IMAGE_WIDTH;
-            const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
-            ir::ImageInfoKey key(surfaceID, infoType);
+            int infoType = genIntrinsicID - GEN_OCL_GET_IMAGE_WIDTH;
+            ir::ImageInfoKey key(imageID, infoType);
             const ir::Register infoReg = ctx.getFunction().getImageSet()->appendInfo(key, &ctx);
-            ctx.GET_IMAGE_INFO(infoType, reg, surfaceID, infoReg);
+            ctx.GET_IMAGE_INFO(infoType, reg, imageID, infoReg);
             break;
           }
 
-          case GEN_OCL_READ_IMAGE_I_1D:
-          case GEN_OCL_READ_IMAGE_UI_1D:
-          case GEN_OCL_READ_IMAGE_F_1D:
-          case GEN_OCL_READ_IMAGE_I_1D_I:
-          case GEN_OCL_READ_IMAGE_UI_1D_I:
-          case GEN_OCL_READ_IMAGE_F_1D_I:
-            image_dim = 1;
-            goto handle_read_image;
-          case GEN_OCL_READ_IMAGE_I_2D:
-          case GEN_OCL_READ_IMAGE_UI_2D:
-          case GEN_OCL_READ_IMAGE_F_2D:
-          case GEN_OCL_READ_IMAGE_I_2D_I:
-          case GEN_OCL_READ_IMAGE_UI_2D_I:
-          case GEN_OCL_READ_IMAGE_F_2D_I:
-            image_dim = 2;
-            goto handle_read_image;
-          case GEN_OCL_READ_IMAGE_I_3D:
-          case GEN_OCL_READ_IMAGE_UI_3D:
-          case GEN_OCL_READ_IMAGE_F_3D:
-          case GEN_OCL_READ_IMAGE_I_3D_I:
-          case GEN_OCL_READ_IMAGE_UI_3D_I:
-          case GEN_OCL_READ_IMAGE_F_3D_I:
-            image_dim = 3;
-handle_read_image:
+          case GEN_OCL_READ_IMAGE_I:
+          case GEN_OCL_READ_IMAGE_UI:
+          case GEN_OCL_READ_IMAGE_F:
           {
-            GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
-            const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
+            const uint8_t imageID = getImageID(I);
+            GBE_ASSERT(AI != AE); ++AI;
             GBE_ASSERT(AI != AE);
             const uint8_t sampler = this->appendSampler(AI);
-            ++AI;
-
-            ir::Register ucoord;
-            ir::Register vcoord;
-            ir::Register wcoord;
-
-            GBE_ASSERT(AI != AE); ucoord = this->getRegister(*AI); ++AI;
-            if (image_dim > 1) {
-              GBE_ASSERT(AI != AE);
-              vcoord = this->getRegister(*AI);
-              ++AI;
-            } else {
-              vcoord = ir::ocl::invalid;
-            }
-
-            if (image_dim > 2) {
-              GBE_ASSERT(AI != AE);
-              wcoord = this->getRegister(*AI);
-              ++AI;
-            } else {
-              wcoord = ir::ocl::invalid;
-            }
+            ++AI; GBE_ASSERT(AI != AE);
+            uint32_t coordNum;
+            const ir::Type coordType = getVectorInfo(ctx, *AI, coordNum);
+            if (coordNum == 4)
+              coordNum = 3;
+            const uint32_t imageDim = coordNum;
+            GBE_ASSERT(imageDim >= 1 && imageDim <= 3);
 
-            vector<ir::Register> dstTupleData, srcTupleData;
-            const uint32_t elemNum = 4;
-            for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-              const ir::Register reg = this->getRegister(&I, elemID);
-              dstTupleData.push_back(reg);
-            }
-            srcTupleData.push_back(ucoord);
-            srcTupleData.push_back(vcoord);
-            srcTupleData.push_back(wcoord);
             uint8_t samplerOffset = 0;
+            Value *coordVal = *AI;
+            ++AI; GBE_ASSERT(AI != AE);
+            Value *samplerOffsetVal = *AI;
 #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
-            GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
+            Constant *CPV = dyn_cast<Constant>(samplerOffsetVal);
             assert(CPV);
             const ir::Immediate &x = processConstantImm(CPV);
             GBE_ASSERTM(x.getType() == ir::TYPE_U32 || x.getType() == ir::TYPE_S32, "Invalid sampler type");
             samplerOffset = x.getIntegerValue();
 #endif
-            const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
-            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 3);
+            bool isFloatCoord = coordType == ir::TYPE_FLOAT;
+            bool requiredFloatCoord = samplerOffset == 0;
 
-            ir::Type dstType = ir::TYPE_U32;
-
-            switch(it->second) {
-              case GEN_OCL_READ_IMAGE_I_1D:
-              case GEN_OCL_READ_IMAGE_UI_1D:
-              case GEN_OCL_READ_IMAGE_I_2D:
-              case GEN_OCL_READ_IMAGE_UI_2D:
-              case GEN_OCL_READ_IMAGE_I_3D:
-              case GEN_OCL_READ_IMAGE_UI_3D:
-              case GEN_OCL_READ_IMAGE_I_1D_I:
-              case GEN_OCL_READ_IMAGE_UI_1D_I:
-              case GEN_OCL_READ_IMAGE_I_2D_I:
-              case GEN_OCL_READ_IMAGE_UI_2D_I:
-              case GEN_OCL_READ_IMAGE_I_3D_I:
-              case GEN_OCL_READ_IMAGE_UI_3D_I:
-                dstType = ir::TYPE_U32;
-                break;
-              case GEN_OCL_READ_IMAGE_F_1D:
-              case GEN_OCL_READ_IMAGE_F_2D:
-              case GEN_OCL_READ_IMAGE_F_3D:
-              case GEN_OCL_READ_IMAGE_F_1D_I:
-              case GEN_OCL_READ_IMAGE_F_2D_I:
-              case GEN_OCL_READ_IMAGE_F_3D_I:
-                dstType = ir::TYPE_FLOAT;
-                break;
-              default:
-                GBE_ASSERT(0); // never been here.
+            GBE_ASSERT(isFloatCoord == requiredFloatCoord);
+
+            vector<ir::Register> dstTupleData, srcTupleData;
+            for (uint32_t elemID = 0; elemID < 3; elemID++) {
+              ir::Register reg;
+
+              if (elemID < imageDim)
+                reg = this->getRegister(coordVal, elemID);
+              else
+                reg = ir::ocl::invalid;
+
+              srcTupleData.push_back(reg);
             }
 
-            bool isFloatCoord = it->second <= GEN_OCL_READ_IMAGE_F_3D;
+            uint32_t elemNum;
+            ir::Type dstType = getVectorInfo(ctx, &I, elemNum);
+            GBE_ASSERT(elemNum == 4);
+
+            for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+              const ir::Register reg = this->getRegister(&I, elemID);
+              dstTupleData.push_back(reg);
+            }
+            const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 3);
 
-            ctx.SAMPLE(surfaceID, dstTuple, srcTuple, dstType == ir::TYPE_FLOAT,
-                       isFloatCoord, sampler, samplerOffset);
+            ctx.SAMPLE(imageID, dstTuple, srcTuple, dstType == ir::TYPE_FLOAT,
+                       requiredFloatCoord, sampler, samplerOffset);
             break;
           }
 
-          case GEN_OCL_WRITE_IMAGE_I_1D:
-          case GEN_OCL_WRITE_IMAGE_UI_1D:
-          case GEN_OCL_WRITE_IMAGE_F_1D:
-            image_dim = 1;
-            goto handle_write_image;
-          case GEN_OCL_WRITE_IMAGE_I_2D:
-          case GEN_OCL_WRITE_IMAGE_UI_2D:
-          case GEN_OCL_WRITE_IMAGE_F_2D:
-            image_dim = 2;
-            goto handle_write_image;
-          case GEN_OCL_WRITE_IMAGE_I_3D:
-          case GEN_OCL_WRITE_IMAGE_UI_3D:
-          case GEN_OCL_WRITE_IMAGE_F_3D:
-            image_dim = 3;
-handle_write_image:
+          case GEN_OCL_WRITE_IMAGE_I:
+          case GEN_OCL_WRITE_IMAGE_UI:
+          case GEN_OCL_WRITE_IMAGE_F:
           {
-            GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
-            const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
-            ir::Register ucoord, vcoord, wcoord;
-
-            GBE_ASSERT(AI != AE); ucoord = this->getRegister(*AI); ++AI;
+            const uint8_t imageID = getImageID(I);
+            GBE_ASSERT(AI != AE); ++AI; GBE_ASSERT(AI != AE);
+            uint32_t coordNum;
+            (void)getVectorInfo(ctx, *AI, coordNum);
+            if (coordNum == 4)
+              coordNum = 3;
+            const uint32_t imageDim = coordNum;
+            vector<ir::Register> srcTupleData;
+            GBE_ASSERT(imageDim >= 1 && imageDim <= 3);
 
-            if (image_dim > 1) {
-              GBE_ASSERT(AI != AE);
-              vcoord = this->getRegister(*AI);
-              ++AI;
-            } else
-              vcoord = ir::ocl::invalid;
-
-            if (image_dim > 2) {
-              GBE_ASSERT(AI != AE);
-              wcoord = this->getRegister(*AI);
-              ++AI;
-            } else {
-              wcoord = ir::ocl::invalid;
-            }
+            for (uint32_t elemID = 0; elemID < 3; elemID++) {
+              ir::Register reg;
 
-            GBE_ASSERT(AI != AE);
-            vector<ir::Register> srcTupleData;
+              if (elemID < imageDim)
+                reg = this->getRegister(*AI, elemID);
+              else
+                reg = ir::ocl::invalid;
 
-            srcTupleData.push_back(ucoord);
-            srcTupleData.push_back(vcoord);
-            srcTupleData.push_back(wcoord);
+              srcTupleData.push_back(reg);
+            }
+            ++AI; GBE_ASSERT(AI != AE);
+            uint32_t elemNum;
+            ir::Type srcType = getVectorInfo(ctx, *AI, elemNum);
+            GBE_ASSERT(elemNum == 4);
 
-            const uint32_t elemNum = 4;
             for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
               const ir::Register reg = this->getRegister(*AI, elemID);
               srcTupleData.push_back(reg);
             }
             const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 7);
-
-            ir::Type srcType = ir::TYPE_U32;
-
-            switch(it->second) {
-              case GEN_OCL_WRITE_IMAGE_I_1D:
-              case GEN_OCL_WRITE_IMAGE_UI_1D:
-              case GEN_OCL_WRITE_IMAGE_I_2D:
-              case GEN_OCL_WRITE_IMAGE_UI_2D:
-              case GEN_OCL_WRITE_IMAGE_I_3D:
-              case GEN_OCL_WRITE_IMAGE_UI_3D:
-                srcType = ir::TYPE_U32;
-                break;
-              case GEN_OCL_WRITE_IMAGE_F_1D:
-              case GEN_OCL_WRITE_IMAGE_F_2D:
-              case GEN_OCL_WRITE_IMAGE_F_3D:
-                srcType = ir::TYPE_FLOAT;
-                break;
-              default:
-                GBE_ASSERT(0); // never been here.
-            }
-
-            ctx.TYPED_WRITE(surfaceID, srcTuple, srcType, ir::TYPE_U32);
+            ctx.TYPED_WRITE(imageID, srcTuple, srcType, ir::TYPE_U32);
             break;
           }
           case GEN_OCL_MUL_HI_INT:
@@ -3466,7 +3562,7 @@ handle_write_image:
             //Becasue cmp's sources are same as sel's source, so cmp instruction and sel
             //instruction will be merged to one sel_cmp instruction in the gen selection
             //Add two intruction here for simple.
-            if(it->second == GEN_OCL_FMAX)
+            if(genIntrinsicID == GEN_OCL_FMAX)
               ctx.GE(getType(ctx, I.getType()), cmp, src0, src1);
             else
               ctx.LT(getType(ctx, I.getType()), cmp, src0, src1);
@@ -3704,96 +3800,62 @@ handle_write_image:
   }
 
   // The idea behind is to search along the use-def chain, and find out all
-  // possible source of the pointer. Then in later codeGen, we can emit
-  // read/store instructions to these btis gathered.
-  void GenWriter::gatherBTI(Value *pointer, ir::BTI &bti) {
-    typedef map<const Value*, int>::iterator GlobalPtrIter;
-    Value *p;
-    size_t idx = 0;
-    int nBTI = 0;
-    std::vector<Value*> candidates;
-    candidates.push_back(pointer);
-    std::set<Value*> processed;
-
-    while (idx < candidates.size()) {
-      bool isPrivate = false;
-      bool needNewBTI = true;
-      p = candidates[idx];
-
-      while (dyn_cast<User>(p) && !dyn_cast<GlobalVariable>(p)) {
-
-        if (processed.find(p) == processed.end()) {
-          processed.insert(p);
+  // possible sources of the pointer. Then in later codeGen, we can emit
+  // read/store instructions to these BTIs gathered.
+  void GenWriter::gatherBTI(Value *insn, ir::BTI &bti) {
+    PtrOrigMapIter iter = pointerOrigMap.find(insn); // origins recorded for this instruction
+    if (iter != pointerOrigMap.end()) {
+      SmallVectorImpl<Value *> &origins = iter->second;
+      uint8_t nBTI = 0; // number of distinct BTIs stored into bti.bti so far
+      for (unsigned i = 0; i < origins.size(); i++) {
+        uint8_t new_bti = 0;
+        Value *origin = origins[i];
+        // all constants are put into the constant cache, including __constant & const __private
+        if (isa<GlobalVariable>(origin)
+            && cast<GlobalVariable>(origin)->isConstant()) {
+          new_bti = BTI_CONSTANT;
         } else {
-          // This use-def chain falls into a loop,
-          // it does not introduce a new buffer source.
-          needNewBTI = false;
-          break;
-        }
-
-        if (dyn_cast<SelectInst>(p)) {
-          SelectInst *sel = cast<SelectInst>(p);
-          p = sel->getTrueValue();
-          candidates.push_back(sel->getFalseValue());
-          continue;
-        }
-
-        if (dyn_cast<PHINode>(p)) {
-          PHINode* phi = cast<PHINode>(p);
-          int n = phi->getNumIncomingValues();
-          for (int j = 1; j < n; j++)
-            candidates.push_back(phi->getIncomingValue(j));
-          p = phi->getIncomingValue(0);
-          continue;
-        }
-
-        if (dyn_cast<AllocaInst>(p)) {
-          isPrivate = true;
-          break;
+          unsigned space = origin->getType()->getPointerAddressSpace();
+          switch (space) { // pick the BTI from the origin's LLVM address space
+            case 0:
+              new_bti = BTI_PRIVATE;
+              break;
+            case 1: // per-pointer BTI registered in globalPointer
+            {
+              GlobalPtrIter gpIter = globalPointer.find(origin);
+              GBE_ASSERT(gpIter != globalPointer.end());
+              new_bti = gpIter->second;
+              break;
+            }
+            case 2:
+              new_bti = BTI_CONSTANT;
+              break;
+            case 3: // __local memory
+              new_bti = 0xfe;
+              break;
+            default:
+              GBE_ASSERT(0 && "address space not handled in gatherBTI()\n");
+              break;
+          }
         }
-        p = cast<User>(p)->getOperand(0);
-      }
 
-      if (needNewBTI == false) {
-        // go to next possible pointer source
-        idx++; continue;
-      }
-
-      uint8_t new_bti = 0;
-      if (isPrivate) {
-        new_bti = BTI_PRIVATE;
-      } else {
-        if(isa<Argument>(p) && dyn_cast<Argument>(p)->hasByValAttr()) {
-          // structure value implementation is not complete now,
-          // they are now treated as push constant, so, the load/store
-          // here is not as meaningful.
-          bti.bti[0] = BTI_PRIVATE;
-          bti.count = 1;
-          break;
-        }
-        Type *ty = p->getType();
-        if(ty->getPointerAddressSpace() == 3) {
-          // __local memory
-          new_bti = 0xfe;
-        } else {
-          // __global memory
-          GlobalPtrIter iter = globalPointer.find(p);
-          GBE_ASSERT(iter != globalPointer.end());
-          new_bti = iter->second;
+        // avoid duplicates: each BTI is stored at most once
+        bool bFound = false;
+        for (int j = 0; j < nBTI; j++) {
+          if (bti.bti[j] == new_bti) {
+            bFound = true; break;
+          }
         }
-      }
-      // avoid duplicate
-      bool bFound = false;
-      for (int j = 0; j < nBTI; j++) {
-        if (bti.bti[j] == new_bti) {
-          bFound = true; break;
+        if (bFound == false) {
+          GBE_ASSERT(nBTI < MAX_MIXED_POINTER); // same bound as the final assert, checked before the store
+          bti.bti[nBTI++] = new_bti; bti.count = nBTI;
         }
       }
-      if (bFound == false) {
-        bti.bti[nBTI++] = new_bti;
-        bti.count = nBTI;
-      }
-      idx++;
+    } else { // no origin recorded for this instruction: report it and abort
+      insn->dump();
+      std::cerr << "Illegal pointer which is not from a valid memory space." << std::endl;
+      std::cerr << "Aborting..." << std::endl;
+      exit(-1);
     }
     GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
   }
@@ -3810,9 +3872,8 @@ handle_write_image:
     const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
     const ir::Register ptr = this->getRegister(llvmPtr);
     ir::BTI binding;
-    if(addrSpace == ir::MEM_GLOBAL || addrSpace == ir::MEM_PRIVATE) {
-      gatherBTI(llvmPtr, binding);
-    }
+    gatherBTI(&I, binding);
+
     // Scalar is easy. We neednot build register tuples
     if (isScalarType(llvmType) == true) {
       const ir::Type type = getType(ctx, llvmType);
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 5dac3ea..e127996 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -26,12 +26,17 @@
 #ifndef __GBE_LLVM_GEN_BACKEND_HPP__
 #define __GBE_LLVM_GEN_BACKEND_HPP__
 
+#include <cxxabi.h>
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/LoopPass.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/Instructions.h"
+#endif
 #include "sys/platform.hpp"
 #include "sys/map.hpp"
-#include "sys/hash_map.hpp"
 #include <algorithm>
 
 // LLVM Type
@@ -51,7 +56,7 @@ namespace gbe
 
   /*! Build the hash map for OCL functions on Gen */
   struct OCLIntrinsicMap {
-    /*! Build the intrinsic hash map */
+    /*! Build the intrinsic map */
     OCLIntrinsicMap(void) {
 #define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
   map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
@@ -59,14 +64,35 @@ namespace gbe
 #undef DECL_LLVM_GEN_FUNCTION
     }
     /*! Sort intrinsics with their names */
-    hash_map<std::string, OCLInstrinsic> map;
+    gbe::map<std::string, OCLInstrinsic> map;
+    /*! Exact-name lookup, then demangled base name; aborts if both miss */
+    OCLInstrinsic find(const std::string symbol) const {
+      auto it = map.find(symbol);
+      if (it == map.end()) {
+        int status = -1;
+        char *realName = abi::__cxa_demangle(symbol.c_str(), NULL, NULL, &status);
+        if (status == 0 && realName != NULL) {
+          const std::string realFnName(realName);
+          it = map.find(realFnName.substr(0, realFnName.find("("))); // drop the "(args...)" suffix
+        }
+        free(realName); // demangled buffer is malloc()ed and owned by the caller; free(NULL) is a no-op
+      }
+      // FIXME, should create a complete error reporting mechanism
+      // when found error in beignet managed passes including Gen pass.
+      if (it == map.end()) {
+        std::cerr << "Unresolved symbol: " << symbol << std::endl;
+        std::cerr << "Aborting..." << std::endl;
+        exit(-1);
+      }
+      return it->second;
+    }
   };
 
   /*! Sort the OCL Gen instrinsic functions (built on pre-main) */
-  static const OCLIntrinsicMap instrinsicMap;
+  static const OCLIntrinsicMap intrinsicMap;
 
   /*! Pad the offset */
-  uint32_t getPadding(uint32_t offset, uint32_t align);
+  int32_t getPadding(int32_t offset, int32_t align);
 
   /*! Get the type alignment in bytes */
   uint32_t getAlignmentByte(const ir::Unit &unit, llvm::Type* Ty);
@@ -77,6 +103,9 @@ namespace gbe
   /*! Get the type size in bytes */
   uint32_t getTypeByteSize(const ir::Unit &unit, llvm::Type* Ty);
 
+  /*! Get GEP constant offset for the specified operand.*/
+  int32_t getGEPConstOffset(const ir::Unit &unit, llvm::CompositeType *CompTy, int32_t TypeIndex);
+
   /*! whether this is a kernel function */
   bool isKernelFunction(const llvm::Function &f);
 
@@ -107,6 +136,7 @@ namespace gbe
   /* customized loop unrolling pass. */
   llvm::LoopPass *createCustomLoopUnrollPass();
 #endif
+  llvm::FunctionPass* createSamplerFixPass();
 
   /*! Add all the function call of ocl to our bitcode. */
   llvm::Module* runBitCodeLinker(llvm::Module *mod, bool strictMath);
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 7434c78..8ec8336 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -46,38 +46,14 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8,  __gen_ocl_force_simd8)
 DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)
 
 // To read_image functions.
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_1D, _Z21__gen_ocl_read_imageijtfj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_1D, _Z22__gen_ocl_read_imageuijtfj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_1D, _Z21__gen_ocl_read_imagefjtfj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_2D, _Z21__gen_ocl_read_imageijtffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_2D, _Z22__gen_ocl_read_imageuijtffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_2D, _Z21__gen_ocl_read_imagefjtffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D, _Z21__gen_ocl_read_imageijtfffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D, _Z22__gen_ocl_read_imageuijtfffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D, _Z21__gen_ocl_read_imagefjtfffj)
-// work around read image with the LD message. The coords are integer type.
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_1D_I, _Z21__gen_ocl_read_imageijtij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_1D_I, _Z22__gen_ocl_read_imageuijtij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_1D_I, _Z21__gen_ocl_read_imagefjtij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_2D_I, _Z21__gen_ocl_read_imageijtiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_2D_I, _Z22__gen_ocl_read_imageuijtiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_2D_I, _Z21__gen_ocl_read_imagefjtiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D_I, _Z21__gen_ocl_read_imageijtiiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D_I, _Z22__gen_ocl_read_imageuijtiiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D_I, _Z21__gen_ocl_read_imagefjtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I, __gen_ocl_read_imagei)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI, __gen_ocl_read_imageui)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F, __gen_ocl_read_imagef)
 
 // To write_image functions.
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_1D, _Z22__gen_ocl_write_imageijiDv4_i)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_1D, _Z23__gen_ocl_write_imageuijiDv4_j)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_1D, _Z22__gen_ocl_write_imagefjiDv4_f)
-
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_2D, _Z22__gen_ocl_write_imageijiiDv4_i)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_2D, _Z23__gen_ocl_write_imageuijiiDv4_j)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_2D, _Z22__gen_ocl_write_imagefjiiDv4_f)
-
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_3D, _Z22__gen_ocl_write_imageijiiiDv4_i)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_3D, _Z23__gen_ocl_write_imageuijiiiDv4_j)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_3D, _Z22__gen_ocl_write_imagefjiiiDv4_f)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I, __gen_ocl_write_imagei)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI, __gen_ocl_write_imageui)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F, __gen_ocl_write_imagef)
 
 // To get image info function
 DECL_LLVM_GEN_FUNCTION(GET_IMAGE_WIDTH, __gen_ocl_get_image_width)
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index 52f99c1..7d1f8f0 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -72,10 +72,12 @@ namespace gbe {
             return 'p';
           case 1:
             return 'g';
+          case 2:
+            return 'c';
           case 3:
             return 'l';
           default:
-            assert("Non support address space");
+            assert(0 && "Non support address space");
             return '\0';
         }
       }
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index f9fda4d..d315d53 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -126,7 +126,7 @@ namespace gbe
     return bKernel;
   }
 
-  uint32_t getPadding(uint32_t offset, uint32_t align) {
+  int32_t getPadding(int32_t offset, int32_t align) { // amount to add to offset to reach a multiple of align; align == 0 would divide by zero
     return (align - (offset % align)) % align; 
   }
 
@@ -220,24 +220,44 @@ namespace gbe
     return size_bit/8;
   }
 
+  int32_t getGEPConstOffset(const ir::Unit &unit, CompositeType *CompTy, int32_t TypeIndex) { // byte offset contributed by one constant GEP index into CompTy
+    int32_t offset = 0;
+    SequentialType * seqType = dyn_cast<SequentialType>(CompTy);
+    if (seqType != NULL) { // sequential type: every element has the same padded size
+      if (TypeIndex != 0) {
+        Type *elementType = seqType->getElementType();
+        uint32_t elementSize = getTypeByteSize(unit, elementType);
+        uint32_t align = getAlignmentByte(unit, elementType);
+        elementSize += getPadding(elementSize, align); // round the element size up to its alignment
+        offset = elementSize * TypeIndex;
+      }
+    } else { // otherwise it must be a struct: accumulate the members between index 0 and TypeIndex
+      int32_t step = TypeIndex > 0 ? 1 : -1; // walk direction; also multiplies each alignment and size below
+      GBE_ASSERT(CompTy->isStructTy());
+      for(int32_t ty_i=0; ty_i != TypeIndex; ty_i += step)
+      {
+        Type* elementType = CompTy->getTypeAtIndex(ty_i);
+        uint32_t align = getAlignmentByte(unit, elementType);
+        offset += getPadding(offset, align * step); // pad up to this member's alignment
+        offset += getTypeByteSize(unit, elementType) * step; // then step over the member itself
+      }
+
+      // add padding for the accessed type
+      const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+      offset += getPadding(offset, align * step);
+    }
+    return offset;
+  }
+
   class GenRemoveGEPPasss : public BasicBlockPass
   {
 
    public:
     static char ID;
-#define FORMER_VERSION 0
-#if FORMER_VERSION
-   GenRemoveGEPPasss(map<const Value *, const Value *>& 
-                                       parentCompositePointer)
-     : BasicBlockPass(ID),
-     parentPointers(parentCompositePointer) {}
-    map<const Value *, const Value *>& parentPointers;
-#else
-   GenRemoveGEPPasss(const ir::Unit &unit) :
-     BasicBlockPass(ID),
-     unit(unit) {}
-  const ir::Unit &unit;
-#endif
+    GenRemoveGEPPasss(const ir::Unit &unit) : // builds the pass around an existing ir::Unit
+      BasicBlockPass(ID),
+      unit(unit) {} // binds the reference member to the caller's unit; nothing is copied
+    const ir::Unit &unit; // non-owning reference: the unit must outlive this pass
     void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.setPreservesCFG();
     }
@@ -267,51 +287,22 @@ namespace gbe
   {
     const uint32_t ptrSize = unit.getPointerSize();
     Value* parentPointer = GEPInst->getOperand(0);
-#if FORMER_VERSION
-    Value* topParent = parentPointer;
-#endif
     CompositeType* CompTy = cast<CompositeType>(parentPointer->getType());
 
     Value* currentAddrInst = 
       new PtrToIntInst(parentPointer, IntegerType::get(GEPInst->getContext(), ptrSize), "", GEPInst);
 
-    uint32_t constantOffset = 0;
+    int32_t constantOffset = 0;
 
     for(uint32_t op=1; op<GEPInst->getNumOperands(); ++op)
     {
-      uint32_t TypeIndex;
-      //we have a constant struct/array acces
-      if(ConstantInt* ConstOP = dyn_cast<ConstantInt>(GEPInst->getOperand(op)))
-      {
-        uint32_t offset = 0;
+      int32_t TypeIndex;
+      ConstantInt* ConstOP = dyn_cast<ConstantInt>(GEPInst->getOperand(op));
+      if (ConstOP != NULL) {
         TypeIndex = ConstOP->getZExtValue();
-        if (op == 1) {
-          if (TypeIndex != 0) {
-            Type *elementType = (cast<PointerType>(parentPointer->getType()))->getElementType();
-            uint32_t elementSize = getTypeByteSize(unit, elementType);
-            uint32_t align = getAlignmentByte(unit, elementType);
-            elementSize += getPadding(elementSize, align);
-            offset += elementSize * TypeIndex;
-          }
-        } else {
-          for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
-          {
-            Type* elementType = CompTy->getTypeAtIndex(ty_i);
-            uint32_t align = getAlignmentByte(unit, elementType);
-            offset += getPadding(offset, align);
-            offset += getTypeByteSize(unit, elementType);
-          }
-
-          //add getPaddingding for accessed type
-          const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
-          offset += getPadding(offset, align);
-        }
-
-        constantOffset += offset;
+        constantOffset += getGEPConstOffset(unit, CompTy, TypeIndex);
       }
-      // none constant index (=> only array/verctor allowed)
-      else
-      {
+      else {
         // we only have array/vectors here, 
         // therefore all elements have the same size
         TypeIndex = 0;
@@ -382,13 +373,6 @@ namespace gbe
     GEPInst->dropAllReferences();
     GEPInst->eraseFromParent();
 
-#if FORMER_VERSION
-    //insert new pointer into parent list
-    while(parentPointers.find(topParent)!=parentPointers.end())
-      topParent = parentPointers.find(topParent)->second;
-    parentPointers[intToPtrInst] = topParent;
-#endif
-
     return true;
   }
 
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 11e9633..8fec683 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -617,13 +617,25 @@ error:
 
     if (!pbuf_ptr) {
       /* alloc a new buffer ptr to collect the print output. */
-      Type *ptrTy = Type::getInt32PtrTy(module->getContext());
-      llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
+      Type *ptrTy = Type::getInt32PtrTy(module->getContext(), 1);
+      llvm::Constant *pBuf = new GlobalVariable(*module, ptrTy, false,
+                                GlobalVariable::ExternalLinkage,
+                                nullptr,
+                                StringRef("__gen_ocl_printf_buf"),
+                                nullptr,
+                                GlobalVariable::NotThreadLocal,
+                                1);
       pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
     }
     if (!index_buf_ptr) {
-      Type *ptrTy = Type::getInt32PtrTy(module->getContext());
-      llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
+      Type *ptrTy = Type::getInt32PtrTy(module->getContext(), 1);
+      llvm::Constant *pBuf = new GlobalVariable(*module, ptrTy, false,
+                                GlobalVariable::ExternalLinkage,
+                                nullptr,
+                                StringRef("__gen_ocl_printf_index_buf"),
+                                nullptr,
+                                GlobalVariable::NotThreadLocal,
+                                1);
       index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
     }
 
diff --git a/backend/src/llvm/llvm_sampler_fix.cpp b/backend/src/llvm/llvm_sampler_fix.cpp
new file mode 100644
index 0000000..8c76324
--- /dev/null
+++ b/backend/src/llvm/llvm_sampler_fix.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This pass resolves calls to __gen_ocl_sampler_need_fix() and
+ * __gen_ocl_sampler_need_rounding_fix(). For some special
+ * sampler types, extra workaround operations are needed to
+ * make sure we get the correct pixel value; for other
+ * samplers, the workaround code is not needed.
+ */
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/IRBuilder.h"
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CFG.h"
+#endif
+
+#include "llvm/Analysis/ConstantsScanner.h"
+
+#include "llvm_gen_backend.hpp"
+#include "ocl_common_defines.h"
+
+using namespace llvm;
+
+namespace gbe {
+
+  class SamplerFix : public FunctionPass {
+  public:
+    SamplerFix() : FunctionPass(ID) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+      initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
+#else
+      initializeDominatorTreePass(*PassRegistry::getPassRegistry());
+#endif
+    }
+
+    bool visitCallInst(CallInst *I) {
+      Value *Callee = I->getCalledValue();
+      const std::string fnName = Callee->getName();
+      bool changed = false;
+      Type *boolTy = IntegerType::get(I->getContext(), 1);
+      Type *i32Ty = IntegerType::get(I->getContext(), 32);
+
+      if (fnName.compare("__gen_ocl_sampler_need_fix") == 0) {
+
+        //  return (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) &&
+        //          ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST));
+        bool needFix = true;
+        Value *needFixVal;
+        if (dyn_cast<ConstantInt>(I->getOperand(0))) {
+          const ConstantInt *ci = dyn_cast<ConstantInt>(I->getOperand(0));
+          uint32_t samplerInt = ci->getZExtValue();
+          needFix = ((samplerInt & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP &&
+                     (samplerInt & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST);
+          needFixVal = ConstantInt::get(boolTy, needFix);
+        } else {
+          IRBuilder<> Builder(I->getParent());
+
+          Builder.SetInsertPoint(I);
+          Value *addressMask = ConstantInt::get(i32Ty, __CLK_ADDRESS_MASK);
+          Value *addressMode = Builder.CreateAnd(I->getOperand(0), addressMask);
+          Value *clampInt =  ConstantInt::get(i32Ty, CLK_ADDRESS_CLAMP);
+          Value *isClampMode = Builder.CreateICmpEQ(addressMode, clampInt);
+          Value *filterMask = ConstantInt::get(i32Ty, __CLK_FILTER_MASK);
+          Value *filterMode = Builder.CreateAnd(I->getOperand(0), filterMask);
+          Value *nearestInt = ConstantInt::get(i32Ty, CLK_FILTER_NEAREST);
+          Value *isNearestMode = Builder.CreateICmpEQ(filterMode, nearestInt);
+          needFixVal = Builder.CreateAnd(isClampMode, isNearestMode);
+        }
+
+        I->replaceAllUsesWith(needFixVal);
+        changed = true;
+      } else if (fnName.compare("__gen_ocl_sampler_need_rounding_fix") == 0) {
+
+        //  return ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0);
+        bool needFix = true;
+        Value *needFixVal;
+        if (dyn_cast<ConstantInt>(I->getOperand(0))) {
+          const ConstantInt *ci = dyn_cast<ConstantInt>(I->getOperand(0));
+          uint32_t samplerInt = ci->getZExtValue();
+          needFix = samplerInt & CLK_NORMALIZED_COORDS_TRUE;
+          needFixVal = ConstantInt::get(boolTy, needFix);
+        } else {
+          IRBuilder<> Builder(I->getParent());
+          Builder.SetInsertPoint(I);
+          Value *normalizeMask = ConstantInt::get(i32Ty, CLK_NORMALIZED_COORDS_TRUE);
+          Value *normalizeMode = Builder.CreateAnd(I->getOperand(0), normalizeMask);
+          needFixVal = Builder.CreateICmpEQ(normalizeMode, ConstantInt::get(i32Ty, 0));
+        }
+        I->replaceAllUsesWith(needFixVal);
+        changed = true;
+      }
+      return changed;
+    }
+
+    bool runOnFunction(Function& F) {
+      bool changed = false;
+      std::set<Instruction*> deadInsnSet;
+      for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+        if (dyn_cast<CallInst>(&*I)) {
+          if (visitCallInst(dyn_cast<CallInst>(&*I))) {
+            changed = true;
+            deadInsnSet.insert(&*I);
+          }
+        }
+      }
+      for (auto it: deadInsnSet)
+        it->eraseFromParent();
+      return changed;
+    }
+
+    static char ID;
+  };
+
+  FunctionPass* createSamplerFixPass() {
+    return new SamplerFix();
+  }
+  char SamplerFix::ID = 0;
+};
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 5450a2b..cf2939d 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -636,60 +636,32 @@ namespace gbe {
       } else {
         Value *Callee = call->getCalledValue();
         const std::string fnName = Callee->getName();
-        auto it = instrinsicMap.map.find(fnName);
-        // FIXME, should create a complete error reporting mechanism
-        // when found error in beignet managed passes including Gen pass.
-        if (it == instrinsicMap.map.end()) {
-          std::cerr << "Unresolved symbol: " << fnName << std::endl;
-          std::cerr << "Aborting..." << std::endl;
-          exit(-1);
-        }
-        GBE_ASSERT(it != instrinsicMap.map.end());
+        auto genIntrinsicID = intrinsicMap.find(fnName);
 
         // Get the function arguments
         CallSite CS(call);
-        CallSite::arg_iterator CI = CS.arg_begin() + 2;
+        CallSite::arg_iterator CI = CS.arg_begin() + 1;
 
-        switch (it->second) {
+        switch (genIntrinsicID) {
           default: break;
-          case GEN_OCL_READ_IMAGE_I_1D:
-          case GEN_OCL_READ_IMAGE_UI_1D:
-          case GEN_OCL_READ_IMAGE_F_1D:
-          case GEN_OCL_READ_IMAGE_I_2D:
-          case GEN_OCL_READ_IMAGE_UI_2D:
-          case GEN_OCL_READ_IMAGE_F_2D:
-          case GEN_OCL_READ_IMAGE_I_3D:
-          case GEN_OCL_READ_IMAGE_UI_3D:
-          case GEN_OCL_READ_IMAGE_F_3D:
-
-	  case GEN_OCL_READ_IMAGE_I_1D_I:
-          case GEN_OCL_READ_IMAGE_UI_1D_I:
-          case GEN_OCL_READ_IMAGE_F_1D_I:
-          case GEN_OCL_READ_IMAGE_I_2D_I:
-          case GEN_OCL_READ_IMAGE_UI_2D_I:
-          case GEN_OCL_READ_IMAGE_F_2D_I:
-          case GEN_OCL_READ_IMAGE_I_3D_I:
-          case GEN_OCL_READ_IMAGE_UI_3D_I:
-          case GEN_OCL_READ_IMAGE_F_3D_I:
-          case GEN_OCL_GET_IMAGE_WIDTH:
-          case GEN_OCL_GET_IMAGE_HEIGHT:
+          case GEN_OCL_READ_IMAGE_I:
+          case GEN_OCL_READ_IMAGE_UI:
+          case GEN_OCL_READ_IMAGE_F:
           {
+            ++CI;
+            if ((*CI)->getType()->isVectorTy()) 
+              *CI = InsertToVector(call, *CI);
             setAppendPoint(call);
             extractFromVector(call);
             break;
           }
-          case GEN_OCL_WRITE_IMAGE_I_3D:
-          case GEN_OCL_WRITE_IMAGE_UI_3D:
-          case GEN_OCL_WRITE_IMAGE_F_3D:
-            CI++;
-          case GEN_OCL_WRITE_IMAGE_I_2D:
-          case GEN_OCL_WRITE_IMAGE_UI_2D:
-          case GEN_OCL_WRITE_IMAGE_F_2D:
-            CI++;
-          case GEN_OCL_WRITE_IMAGE_I_1D:
-          case GEN_OCL_WRITE_IMAGE_UI_1D:
-          case GEN_OCL_WRITE_IMAGE_F_1D:
+          case GEN_OCL_WRITE_IMAGE_I:
+          case GEN_OCL_WRITE_IMAGE_UI:
+          case GEN_OCL_WRITE_IMAGE_F:
           {
+            if ((*CI)->getType()->isVectorTy()) 
+              *CI = InsertToVector(call, *CI);
+            ++CI;
             *CI = InsertToVector(call, *CI);
             break;
           }
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index eb75ba1..e3b3822 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -120,6 +120,7 @@ namespace gbe
     MPM.add(createTypeBasedAliasAnalysisPass());
     MPM.add(createBasicAliasAnalysisPass());
     MPM.add(createIntrinsicLoweringPass());
+    MPM.add(createSamplerFixPass());
     MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
 
     MPM.add(createIPSCCPPass());              // IP SCCP
diff --git a/backend/src/sys/hash_map.hpp b/backend/src/sys/hash_map.hpp
deleted file mode 100644
index e153cf3..0000000
--- a/backend/src/sys/hash_map.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/* 
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
- */
-
-/**
- * \file hash_map.hpp
- *
- * \author Benjamin Segovia <benjamin.segovia at intel.com>
- */
-
-#ifndef __GBE_HASH_MAP_HPP__
-#define __GBE_HASH_MAP_HPP__
-
-#include "sys/platform.hpp"
-
-#ifdef __MSVC__
-#include <unordered_map>
-#else
-#include <tr1/unordered_map>
-#endif /* __MSVC__ */
-
-namespace gbe
-{
-  /*! Add specific allocator to the hash map */
-  template <class Key,
-            class T,
-            class Hash = std::hash<Key>,
-            class Pred = std::equal_to<Key>>
-  class hash_map : public std::tr1::unordered_map<Key,T,Hash,Pred,Allocator<std::pair<const Key,T>>>,
-                   public NonCopyable
-  {
-  public:
-    // Typedefs
-    typedef std::pair<const Key, T> value_type;
-    typedef Allocator<value_type> allocator_type;
-    typedef std::tr1::unordered_map<Key,T,Hash,Pred,allocator_type> parent_type;
-    typedef typename allocator_type::size_type size_type;
-    typedef Key key_type;
-    typedef T mapped_type;
-    typedef Hash hasher;
-    typedef Pred key_equal;
-
-    /*! Default constructor */
-    INLINE explicit hash_map(size_type n = 3,
-                             const hasher& hf = hasher(),
-                             const key_equal& eql = key_equal(),
-                             const allocator_type& a = allocator_type()) :
-      parent_type(n, hf, eql, a) {}
-    /*! Iteration constructor */
-    template <class InputIterator>
-    INLINE hash_map(InputIterator first,
-                    InputIterator last,
-                    size_type n = 3,
-                    const hasher& hf = hasher(),
-                    const key_equal& eql = key_equal(),
-                    const allocator_type& a = allocator_type()) :
-      parent_type(first,last,n,hf,eql,a) {}
-#if 0
-    /*! Copy constructor */
-    INLINE hash_map(const hash_map &other) : parent_type(other) {}
-#endif
-    GBE_CLASS(hash_map);
-  };
-} /* namespace gbe */
-
-#endif /* __GBE_HASH_MAP_HPP__ */
-
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 0a959c8..9a2bd77 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -11,7 +11,10 @@ set (benchmark_sources
   ../utests/utest_file_map.cpp
   ../utests/utest_helper.cpp
   ../utests/vload_bench.cpp
-  enqueue_copy_buf.cpp)
+  enqueue_copy_buf.cpp
+  benchmark_use_host_ptr_buffer.cpp
+  benchmark_read_buffer.cpp
+  benchmark_read_image.cpp)
 
 
 SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
diff --git a/benchmark/benchmark_read_buffer.cpp b/benchmark/benchmark_read_buffer.cpp
new file mode 100644
index 0000000..31a1f59
--- /dev/null
+++ b/benchmark/benchmark_read_buffer.cpp
@@ -0,0 +1,49 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+int benchmark_read_buffer(void)
+{
+  struct timeval start,stop;
+
+  const size_t n = 1024 * 1024;
+  int count = 16;
+  const size_t sz = 4 * n * count;
+
+  OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, sz * sizeof(float), NULL);
+
+  OCL_CREATE_KERNEL("compiler_read_buffer");
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (size_t i = 0; i < sz; i ++) {
+    ((float *)(buf_data[0]))[i] = rand();
+    ((float *)(buf_data[1]))[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Setup kernel and buffers
+  globals[0] = n;
+  locals[0] = 256;
+
+  gettimeofday(&start,0);
+  for (size_t i=0; i<100; i++) {
+    OCL_NDRANGE(1);
+  }
+  OCL_FINISH();
+  gettimeofday(&stop,0);
+
+  clReleaseMemObject(buf[0]);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  return time_subtract(&stop, &start, 0);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_buffer);
diff --git a/benchmark/benchmark_read_image.cpp b/benchmark/benchmark_read_image.cpp
new file mode 100644
index 0000000..48aa987
--- /dev/null
+++ b/benchmark/benchmark_read_image.cpp
@@ -0,0 +1,67 @@
+#include <string.h>
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+int benchmark_read_image(void)
+{
+  struct timeval start,stop;
+
+  const size_t x_count = 4;
+  const size_t y_count = 4;
+  const size_t w = 1024;
+  const size_t h = 1024;
+  const size_t sz = 4 * x_count * y_count * w * h;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("compiler_read_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(float) * sz);
+  buf_data[1] = (uint32_t*) malloc(sizeof(float) * sz);
+  for (uint32_t i = 0; i < sz; ++i) {
+    ((float*)buf_data[0])[i] = rand();
+    ((float*)buf_data[1])[i] = rand();
+  }
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_FLOAT;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w * x_count;
+  desc.image_height = h * y_count;
+  desc.image_row_pitch = desc.image_width * sizeof(float) * 4;
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+  OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[1]);
+  OCL_CREATE_BUFFER(buf[2], 0, sz * sizeof(float), NULL);
+
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+  free(buf_data[1]);
+  buf_data[1] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+
+  gettimeofday(&start,0);
+  for (size_t i=0; i<100; i++) {
+    OCL_NDRANGE(2);
+  }
+  OCL_FINISH();
+  gettimeofday(&stop,0);
+
+  clReleaseMemObject(buf[0]);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  return time_subtract(&stop, &start, 0);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_image);
diff --git a/benchmark/benchmark_use_host_ptr_buffer.cpp b/benchmark/benchmark_use_host_ptr_buffer.cpp
new file mode 100644
index 0000000..7ede576
--- /dev/null
+++ b/benchmark/benchmark_use_host_ptr_buffer.cpp
@@ -0,0 +1,38 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+int benchmark_use_host_ptr_buffer(void)
+{
+  struct timeval start,stop;
+
+  const size_t n = 4096*4096;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("runtime_use_host_ptr_buffer");
+
+  int ret = posix_memalign(&buf_data[0], 4096, sizeof(uint32_t) * n);
+  OCL_ASSERT(ret == 0);
+
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = i;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_USE_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = n;
+  locals[0] = 256;
+
+  gettimeofday(&start,0);
+  for (size_t i=0; i<100; i++) {
+    OCL_NDRANGE(1);
+    void* mapptr = (int*)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_READ, 0, n*sizeof(uint32_t), 0, NULL, NULL, NULL);
+    clEnqueueUnmapMemObject(queue, buf[0], mapptr, 0, NULL, NULL);
+  }
+  gettimeofday(&stop,0);
+
+  clReleaseMemObject(buf[0]);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  return time_subtract(&stop, &start, 0);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_use_host_ptr_buffer);
diff --git a/benchmark/enqueue_copy_buf.cpp b/benchmark/enqueue_copy_buf.cpp
index 0d0d4df..f012cf7 100644
--- a/benchmark/enqueue_copy_buf.cpp
+++ b/benchmark/enqueue_copy_buf.cpp
@@ -28,28 +28,6 @@ void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
     src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
 }
 
-int tim_subtract(struct timeval *y, struct timeval *x, struct timeval *result){
-  if ( x->tv_sec > y->tv_sec )
-    return   -1;
-
-  if ((x->tv_sec == y->tv_sec) && (x->tv_usec > y->tv_usec))
-    return   -1;
-
-  if ( result != NULL){
-    result->tv_sec = ( y->tv_sec - x->tv_sec );
-    result->tv_usec = ( y->tv_usec - x->tv_usec );
-
-    if (result->tv_usec < 0){
-      result->tv_sec --;
-      result->tv_usec += 1000000;
-    }
-  }
-
-  int msec = 1000.0*(y->tv_sec - x->tv_sec) + (y->tv_usec - x->tv_usec)/1000.0;
-  return msec;
-}
-
-
 int enqueue_copy_buf(void)
 {
   size_t i;
@@ -63,7 +41,7 @@ int enqueue_copy_buf(void)
   }
 
   gettimeofday(&stop,0);
-  return tim_subtract(&stop, &start, 0);
+  return time_subtract(&stop, &start, 0);
 }
 
 MAKE_BENCHMARK_FROM_FUNCTION(enqueue_copy_buf);
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index 0231d27..2af2dda 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,7 +1,10 @@
 # News
 
+## Jan 19, 2015
+[Beignet 1.0.1](https://01.org/beignet/downloads/beignet-1.0.1-2015-01-19) is released. This is a bug-fix release.  
+
 ## Nov 14, 2014
-[Beignet 1.0.0](https://01.org/beignet/downloads/beignet-1.0.0-2014-11-14) is released. This is a major release. Please see the release notes for more information.
+[Beignet 1.0.0](https://01.org/downloads/beignet-1.0.0-2014-11-14) is released. This is a major release. Please see the release notes for more information.
 
 ## Sep 15, 2014
 [Beignet 0.9.3](https://01.org/zh/beignet/downloads/beignet-0.9.3-2014-09-15-0) is released. This is a bug-fix release.
diff --git a/docs/howto/oldgcc-howto.mdwn b/docs/howto/oldgcc-howto.mdwn
new file mode 100644
index 0000000..72e3537
--- /dev/null
+++ b/docs/howto/oldgcc-howto.mdwn
@@ -0,0 +1,58 @@
+Old Gcc Support HowTo
+====================
+
+The Beignet GBE compiler requires C++11 features, but some platforms ship an old gcc
+version that does not provide them. The solution is to build a standalone GBE compiler on
+a system that supports C++11, and then build Beignet with the pre-built standalone compiler.
+The standalone compiler is also needed during OpenCL application development. Here
+are the brief steps.
+
+Build standalone GBE compiler in a system supporting C++11 features
+-------------------------------------------------------------------
+
+The build process is as usual except to add a CMake option "-DBUILD_STANDALONE_GBE_COMPILER=true".
+After build finishes, there is a tar ball at your_build_dir/backend/src/libocl/usr/local/lib/beignet/gbecompiler.tgz,
+this is the standalone compiler package that will be used next.
+
+
+Distribute the package to your target system without C++11 features
+-------------------------------------------------------------------
+
+Copy gbecompiler.tgz to your target system (the preferred location is /usr/local/lib/beignet)
+and extract it:
+tar zxvf gbecompiler.tgz
+
+Please check that the following files and directory exist in /usr/local/lib/beignet/ after extraction:
+beignet.bc  beignet.pch  gbe_bin_generater  include
+
+
+Build Beignet on your target system with the standalone compiler
+-------------------------------------------------------------------
+
+The build process is as usual, except that you add the CMake option "-DUSE_STANDALONE_GBE_COMPILER=true".
+
+If your_path_for_compiler is not /usr/local/lib/beignet, you also need to add the CMake option
+"-DSTANDALONE_GBE_COMPILER_DIR=/your_path_for_compiler/".
+
+
+Develop OpenCL applications on your target machine
+--------------------------------------------------
+
+First, write the OpenCL kernel source code. Second, compile the kernel source code to a
+kernel binary with the standalone compiler gbe_bin_generater (see the usage below). Finally,
+invoke clCreateProgramWithBinary to load the kernel binary and execute it. The utest case
+load_program_from_bin_file demonstrates the whole procedure.
+
+gbe_bin_generater INFILE [-pbuild_parameter] -oOUTFILE -tGEN_PCI_ID
+
+For example, the following command builds the OpenCL kernel source in file 'mykernel.cl'
+for Ivybridge with pci_id 0x0162, and writes the result (executable binary kernel)
+into file 'mykernel.bin'.
+
+gbe_bin_generater mykernel.cl -omykernel.bin -t0x0162
+
+If the standalone compiler is not located at /usr/local/lib/beignet, you need to set the
+environment variables below to execute gbe_bin_generater.
+OCL_BITCODE_LIB_PATH=/your_path_for_compiler/beignet.bc
+OCL_HEADER_FILE_DIR=/your_path_for_compiler/include
+OCL_PCH_PATH=/your_path_for_compiler/beignet.pch
diff --git a/docs/optimization-guide.mdwn b/docs/optimization-guide.mdwn
index 8fb29a6..5f648fb 100644
--- a/docs/optimization-guide.mdwn
+++ b/docs/optimization-guide.mdwn
@@ -1,28 +1,106 @@
 Optimization Guide
 ====================
 
-All the SIMD optimization principle also apply to Beignet optimization.  
-Furthermore, there are some special tips for Beignet optimization.
+All the SIMD optimization principles, such as avoiding branching and not wasting 
+SIMD lanes, also apply to Beignet optimization on the Gen platform. Furthermore,
+there are some special tips for Beignet optimization.
 
 1. It is recommended to choose multiple of 16 work group size. Too much SLM usage may reduce parallelism at group level. 
    If kernel uses large amount SLM, it's better to choose large work group size. Please refer the following table for recommendations
-   with some SLM usage.  
+   with some SLM usage.
+
 | Amount of SLM | 0  | 4K | 8K  | 16K | 32K |  
-| WorkGroup size| 16 | 64 | 128 | 256 | 512 |
+| WorkGroup size| 16 | 64 | 128 | 256 | 512 |  
+
+   Actually, a good method is to pass in a NULL local work size parameter to let the driver determine the best work group size for you.
+
+1. Using shorter data types could give better performance. Some detailed tips are below.
+  1. Use uchar16/ushort8/uint4 as much as possible.
+  1. If the data has to be DWORD (4 bytes) unaligned, it's better to use vload16 (for char) or vload8 (for short) to load the data.
+  1. Reading/writing scalar char/short will be very slow and may lead to even worse performance than using the DW type.
+
+1. Avoid heavily strided global/constant memory access.
+  
+  Some examples are as below (assume the data is a cache line aligned global/constant uint buffer, and the work group size is 16 with SIMD16 mode):  
+  `uint x = data[get_global_id(0)];      //best, only read one cache line, no bandwidth waste`  
+  `uint x = data[get_global_id(0) + 1]; //bad, cross 2 cache lines, not good, waste half of the bandwidth`  
+  `uint x = data[get_global_id(0) * 16]; //worst, cross 16 cache lines, waste 15/16 bandwidth.`  
+
+1. Avoid dynamically indexed private buffers if possible.
+  Currently, private buffer access in the beignet backend is very slow. Many small private buffers could be optimized by the compiler.
+  But the following type of dynamically indexed private buffer could not be optimized:
+
+`
+     uint private_buffer[32];
+     for (i = 0; i < xid; i++)    {
+       int dynamic_idx = src[xid];
+       private_buffer[dynamic_idx % 10] = src1[xid];
+       ...
+     }
+`
+
+   The following case is OK.
+
+`
+     ...
+     uint private_buffer[32];
+     for (i = 0; i < xid; i++)    {
+       private_buffer[xid % 32] = src1[xid];
+       ...
+     }
+`
+
+
+1. Use SLM to reduce the memory bandwidth requirement if possible.
+  
+  On the Gen platform, SLM is in the GPU's L3 cache. If it could be used to  
+  share data between work items, it could reduce the memory bandwidth  
+  on the system memory bus. This will be a big win for many I/O-intensive  
+  kernels.
+
+1. Avoid SLM bank conflicts.
+
+  SLM is banked at a DWORD granularity, with 16 banks in total. Accesses to the same
+  bank with different addresses will lead to a conflict, which should be avoided.
+  The worst case is the last example below.
+
+  Some examples are as below (assume the data is a cache line aligned global/constant uint buffer, and the work group size is 16 with SIMD16 mode):  
+  `uint x = data[get_global_id(0)];      //best, no bank conflicts, no bandwidth waste`  
+  `uint x = data[get_global_id(0) + 1]; //best, no bank conflicts, no bandwidth waste`  
+  `uint x = data[get_global_id(0) * 2]; //bad, work item (id) and (id + 8) conflict to each other, waste half of the bandwidth`  
+  `uint x = data[get_global_id(0) * 16]; //worst, all work items conflicts on the zero bank, waste 15/16 bandwidth.`  
+
+1. Zero copy on buffer creation. (Only available in the git master branch and the Release\_v1.0 branch).
+
+  Use CL\_MEM\_USE\_HOST\_PTR to create the buffer, and pass in a page-aligned  
+  host pointer whose size is also a multiple of the page size. Beignet  
+  will leverage userptr to create a buffer object by using that  
+  host buffer directly. If possible, you can also use the CL\_MEM\_ALLOC\_HOST\_PTR  
+  flag to let the driver allocate a userptr-qualified buffer, which could  
+  guarantee zero copy on the buffer.
+
+  Please note that this feature requires a kernel newer than 3.16 and a libdrm version newer than 2.4.57.
+
+1. Use float data type as much as possible.
+
+  The two ALUs of one EU could both handle float data, but only one of them could handle non-float data.
+
+1. Avoid using long.
+
+  GEN7 and Gen7.5 don't support long natively. And Gen8's native long support is still under development.
+
+1. Declare small constant buffer with content in the kernel if possible.
 
-2. GEN7's read/write on global memory with DWORD and DWORD4 are significantly faster than read/write on BYTE/WORD.  
-   Use DWORD or DWORD4 to access data in global memory if possible. If you cannot avoid the byte/word access, try to do it on SLM.
+  For a small constant buffer, it's better to declare it in the kernel directly with "const \_\_constant". The compiler may optimize it if the buffer is defined inside kernel.
 
-3. Use float data type as much as possible.
+1. Avoid unnecessary synchronizations.
 
-4. Avoid using long. GEN7's performance for long integer is poor.
+  Both in the runtime and in the kernel. For example, clFinish and clWaitForEvents in runtime and barrier() in the kernel.
 
-5. If there is a small constant buffer, define it in the kernel instead of using the constant buffer argument if possible.  
-   The compiler may optimize it if the buffer is defined inside kernel.
+1. Consider native version of math built-ins, such as native\_sin, native\_cos, if your kernel is not precision sensitive.
 
-6. Avoid unnecessary synchronizations, both in the runtime and in the kernel.  For examples, clFinish and clWaitForEvents in runtime  
-   and barrier() in the kernel.
+1. Use fma()/mad() as much as possible.
 
-7. Consider native version of math built-ins, such as native\_sin, native\_cos, if your kernel is not precision sensitive.
+1. Try to eliminate branching as much as possible.
 
-8. Try to eliminate branching as much as possible. For example using min, max, clamp or select built-ins instead of if/else if possible.
+  For example using min, max, clamp or select built-ins instead of if/else if possible.
diff --git a/kernels/compiler_array4.cl b/kernels/compiler_array4.cl
new file mode 100644
index 0000000..6ddc973
--- /dev/null
+++ b/kernels/compiler_array4.cl
@@ -0,0 +1,9 @@
+__kernel void
+compiler_array4(__global int4 *src4, __global int4 *dst4, int offset)
+{
+  int i;
+  int final[16];
+  __global int *dst = (__global int *)(dst4 + offset + get_global_id(0));
+  __global int *src = (__global int *)(src4 + offset + get_global_id(0));
+  dst[-4] = src[-4];
+}
diff --git a/kernels/compiler_bswap.cl b/kernels/compiler_bswap.cl
index 9ef0e6b..97313b1 100644
--- a/kernels/compiler_bswap.cl
+++ b/kernels/compiler_bswap.cl
@@ -1,6 +1,7 @@
 #define TEST_TYPE(TYPE, LENGTH)                                       \
 kernel void compiler_bswap_##TYPE(global TYPE * src, global TYPE * dst){ \
    dst[get_global_id(0)]= __builtin_bswap##LENGTH(src[get_global_id(0)]); \
+   dst[get_global_id(0)]= __builtin_bswap##LENGTH(dst[get_global_id(0)] -1 ); \
 }
 
 
diff --git a/kernels/compiler_private_const.cl b/kernels/compiler_private_const.cl
new file mode 100644
index 0000000..7fad369
--- /dev/null
+++ b/kernels/compiler_private_const.cl
@@ -0,0 +1,9 @@
+constant int x[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+__kernel void
+compiler_private_const( __global int *dst)
+{
+  const int array0[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+
+  dst[get_global_id(0)] = array0[get_global_id(0)] + x[get_global_id(0)];
+}
+
diff --git a/kernels/compiler_read_buffer.cl b/kernels/compiler_read_buffer.cl
new file mode 100644
index 0000000..4d3183a
--- /dev/null
+++ b/kernels/compiler_read_buffer.cl
@@ -0,0 +1,15 @@
+#define COUNT 16
+
+__kernel void
+compiler_read_buffer(__global float4* src0, __global float4* src1, __global float4* dst)
+{
+  float4 sum = 0;
+  int offset = 0, i = 0;
+  int id = (int)get_global_id(0);
+  int sz = (int)get_global_size(0);
+  for(i=0; i<COUNT; i++) {
+    sum = sum + src0[offset + id] + src1[offset + id];
+    offset += sz;
+  }
+  dst[id] = sum;
+}
diff --git a/kernels/compiler_read_image.cl b/kernels/compiler_read_image.cl
new file mode 100644
index 0000000..f059743
--- /dev/null
+++ b/kernels/compiler_read_image.cl
@@ -0,0 +1,25 @@
+#define X_COUNT 4
+#define Y_COUNT 4
+
+__kernel void
+compiler_read_image(__read_only image2d_t src0, __read_only image2d_t src1, __global float4* dst)
+{
+  float4 sum = 0;
+  int2 coord;
+  int x_sz = (int)get_global_size(0);
+  int y_sz = (int)get_global_size(1);
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE| CLK_ADDRESS_CLAMP| CLK_FILTER_NEAREST;
+  int i, j;
+
+  int x = (int)get_global_id(0);
+  int y = (int)get_global_id(1);
+
+  for(i=0; i<X_COUNT; i++) {
+    coord.x = x + i * x_sz;
+    for(j=0; j<Y_COUNT; j++) {
+      coord.y = y + j * y_sz;
+      sum = sum + read_imagef(src0, sampler, coord) + read_imagef(src1, sampler, coord);
+    }
+  }
+  dst[y * x_sz + x] = sum;
+}
diff --git a/kernels/runtime_alloc_host_ptr_buffer.cl b/kernels/runtime_alloc_host_ptr_buffer.cl
new file mode 100644
index 0000000..290241d
--- /dev/null
+++ b/kernels/runtime_alloc_host_ptr_buffer.cl
@@ -0,0 +1,6 @@
+__kernel void
+runtime_alloc_host_ptr_buffer(__global int* buf)
+{
+  int id = (int)get_global_id(0);
+  buf[id] = id / 2;
+}
diff --git a/kernels/runtime_climage_from_boname.cl b/kernels/runtime_climage_from_boname.cl
new file mode 100644
index 0000000..25baee6
--- /dev/null
+++ b/kernels/runtime_climage_from_boname.cl
@@ -0,0 +1,8 @@
+/* Each work-item writes the constant 0.34f to every channel of its own texel. */
+__kernel void
+runtime_climage_from_boname(__write_only image2d_t dst)
+{
+  const int2 pos = (int2)((int)get_global_id(0), (int)get_global_id(1));
+  const float4 fill = (float4)(0.34f);
+  write_imagef(dst, pos, fill);
+}
diff --git a/setup_fulsim_hsw.sh b/setup_fulsim_hsw.sh
deleted file mode 100644
index 140be66..0000000
--- a/setup_fulsim_hsw.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-export INTEL_DEVID_OVERRIDE=0x0094
-export DEVICE=hsw_m0
-export OCL_FULSIM_RUN=1
-export OCL_FULSIM_DEBUG_MODE=$1
-
diff --git a/setup_fulsim_ivb.sh b/setup_fulsim_ivb.sh
deleted file mode 100644
index 9df9082..0000000
--- a/setup_fulsim_ivb.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-export INTEL_DEVID_OVERRIDE=0x0166     # or, 0x0112
-export DEVICE=ivb_m_gt2                #     snb_gt2 for SNB GT2 desktop
-export OCL_SIMULATOR=1                 # 0 -> HW, 1 -> fulsim, 2 -> perfsim
-export OCL_FULSIM_DEBUG_MODE=$1
-
diff --git a/setup_perfsim_ivb.sh b/setup_perfsim_ivb.sh
deleted file mode 100644
index 4cfdd1a..0000000
--- a/setup_perfsim_ivb.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-export INTEL_DEVID_OVERRIDE=0x0166     # or, 0x0112
-export DEVICE=ivb_m_gt2                #     snb_gt2 for SNB GT2 desktop
-export OCL_SIMULATOR=2                 # 0 -> HW, 1 -> fulsim, 2 -> perfsim
-
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7182bad..a55f84d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,18 +10,19 @@ foreach (KF ${KERNEL_FILES})
   set (input_file ${KERNEL_PATH}/${KF}.cl)
   set (output_file ${KERNEL_PATH}/${KF}_str.c)
   list (APPEND KERNEL_STR_FILES ${output_file})
+  list (GET GBE_BIN_GENERATER -1 GBE_BIN_FILE)
   if(GEN_PCI_ID)
     add_custom_command(
       OUTPUT ${output_file}
       COMMAND rm -rf ${output_file}
       COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} -t${GEN_PCI_ID}
-      DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
+      DEPENDS ${input_file} ${GBE_BIN_FILE})
   else(GEN_PCI_ID)
     add_custom_command(
       OUTPUT ${output_file}
       COMMAND rm -rf ${output_file}
       COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file}
-      DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
+      DEPENDS ${input_file} ${GBE_BIN_FILE})
   endif(GEN_PCI_ID)
 endforeach (KF)
 endmacro (MakeKernelBinStr)
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 12530d7..89afa07 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -90,10 +90,6 @@ cl_command_queue_delete(cl_command_queue queue)
     if (queue->ctx->queues == queue)
       queue->ctx->queues = queue->next;
   pthread_mutex_unlock(&queue->ctx->queue_lock);
-  if (queue->fulsim_out != NULL) {
-    cl_mem_delete(queue->fulsim_out);
-    queue->fulsim_out = NULL;
-  }
 
   cl_thread_data_destroy(queue);
   queue->thread_data = NULL;
@@ -146,7 +142,7 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
     // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
     // on demand.
     if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
-      cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_MAX_IMAGE_NUM, image->base.bo, image->offset,
+      cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset,
                           image->intel_fmt, image->image_type,
                           image->w, image->h, image->depth,
                           image->row_pitch, (cl_gpgpu_tiling)image->tiling);
@@ -179,197 +175,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   return CL_SUCCESS;
 }
 
-
-#if USE_FULSIM
-extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
-extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr, FILE*);
-extern void aub_exec_dump_raw_file(cl_buffer, size_t offset, size_t sz);
-
-static void
-cl_run_fulsim(void)
-{
-  const char *run_it = getenv("OCL_SIMULATOR");
-  const char *debug_mode = getenv("OCL_FULSIM_DEBUG_MODE");
-  if (run_it == NULL || strcmp(run_it, "1")) return;
-
-#if EMULATE_GEN == 7 /* IVB */
-  if (debug_mode == NULL || strcmp(debug_mode, "1"))
-    system("wine AubLoad.exe dump.aub -device ivbB0");
-  else
-    system("wine AubLoad.exe dump.aub -device ivbB0 -debug");
-#elif EMULATE_GEN == 75 /* HSW */
-  if (debug_mode == NULL || strcmp(debug_mode, "1"))
-    system("wine AubLoad.exe dump.aub -device hsw.h.a0");
-  else
-    system("wine AubLoad.exe dump.aub -device hsw.h.a0 -debug");
-#else
-#error "Unknown device"
-#endif
-}
-
-/* Each buffer is dump using several chunks of this size */
-static const size_t chunk_sz = 8192u;
-
-static cl_int
-cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k)
-{
-  cl_int err = CL_SUCCESS;
-  cl_mem mem = NULL;
-  int i;
-  size_t j;
-
-  /* Bind user defined surface */
-  for (i = 0; i < k->arg_n; ++i) {
-    size_t chunk_n, chunk_remainder;
-    if (interp_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
-      continue;
-    mem = (cl_mem) k->args[i].mem;
-    CHECK_MEM(mem);
-    chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz;
-    chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz;
-    for (j = 0; j < chunk_n; ++j)
-      aub_exec_dump_raw_file(mem->bo, j * chunk_sz, chunk_sz);
-    if (chunk_remainder)
-      aub_exec_dump_raw_file(mem->bo, chunk_n * chunk_sz, chunk_remainder);
-  }
-error:
-  return err;
-}
-
-struct bmphdr {
-  /* 2 bytes of magic here, "BM", total header size is 54 bytes! */
-  int filesize;      /*  4 total file size incl header */
-  short as0, as1;    /*  8 app specific */
-  int bmpoffset;     /* 12 ofset of bmp data  */
-  int headerbytes;   /* 16 bytes in header from this point (40 actually) */
-  int width;         /* 20  */
-  int height;        /* 24  */
-  short nplanes;     /* 26 no of color planes */
-  short bpp;         /* 28 bits/pixel */
-  int compression;   /* 32 BI_RGB = 0 = no compression */
-  int sizeraw;       /* 36 size of raw bmp file, excluding header, incl padding */
-  int hres;          /* 40 horz resolutions pixels/meter */
-  int vres;          /* 44 */
-  int npalcolors;    /* 48 No of colors in palette */
-  int nimportant;    /* 52 No of important colors */
-  /* raw b, g, r data here, dword aligned per scan line */
-};
-
-static int*
-cl_read_bmp(const char *filename, int *width, int *height)
-{
-  int n;
-  struct bmphdr hdr;
-
-  FILE *fp = fopen(filename, "rb");
-  assert(fp);
-
-  char magic[2];
-  n = fread(&magic[0], 1, 2, fp);
-  assert(n == 2 && magic[0] == 'B' && magic[1] == 'M');
-
-  n = fread(&hdr, 1, sizeof(hdr), fp);
-  assert(n == sizeof(hdr));
-
-  assert(hdr.width > 0 &&
-         hdr.height > 0 &&
-         hdr.nplanes == 1
-         && hdr.compression == 0);
-
-  int *rgb32 = (int *) cl_malloc(hdr.width * hdr.height * sizeof(int));
-  assert(rgb32);
-  int x, y;
-
-  int *dst = rgb32;
-  for (y = 0; y < hdr.height; y++) {
-    for (x = 0; x < hdr.width; x++) {
-      assert(!feof(fp));
-      int b = (getc(fp) & 0x0ff);
-      int g = (getc(fp) & 0x0ff);
-      int r = (getc(fp) & 0x0ff);
-      *dst++ = (r | (g << 8) | (b << 16) | 0xff000000);	/* abgr */
-    }
-    while (x & 3) {
-      getc(fp);
-      x++;
-    }
-  }
-  fclose(fp);
-  *width = hdr.width;
-  *height = hdr.height;
-  return rgb32;
-}
-
-static char*
-cl_read_dump(const char *name, size_t *size)
-{
-  char *raw = NULL, *dump = NULL;
-  size_t i, sz;
-  int w, h;
-  if ((raw = (char*) cl_read_bmp(name, &w, &h)) == NULL)
-    return NULL;
-  sz = w * h;
-  dump = (char*) cl_malloc(sz);
-  assert(dump);
-  for (i = 0; i < sz; ++i)
-    dump[i] = raw[4*i];
-  cl_free(raw);
-  if (size)
-    *size = sz;
-  return dump;
-}
-
-static cl_int
-cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k)
-{
-  cl_int err = CL_SUCCESS;
-  cl_mem mem = NULL;
-  char *from = NULL, *to = NULL;
-  size_t size, j, chunk_n, chunk_remainder;
-  int i, curr = 0;
-  /* Bind user defined surface */
-  for (i = 0; i < k->arg_n; ++i) {
-    if (interp_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
-      continue;
-    mem = (cl_mem) k->args[i].mem;
-    CHECK_MEM(mem);
-    assert(mem->bo);
-    chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz;
-    chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz;
-    to = cl_mem_map(mem, 1);
-    for (j = 0; j < chunk_n; ++j) {
-      char name[256];
-      sprintf(name, "dump%03i.bmp", curr);
-#ifdef NDEBUG
-      from = cl_read_dump(name, NULL);
-#else
-      from = cl_read_dump(name, &size);
-      assert(size == chunk_sz);
-#endif /* NDEBUG */
-      memcpy(to + j*chunk_sz, from, chunk_sz);
-      cl_free(from);
-      curr++;
-    }
-    if (chunk_remainder) {
-      char name[256];
-      sprintf(name, "dump%03i.bmp", curr);
-#ifdef NDEBUG
-      from = cl_read_dump(name, NULL);
-#else
-      from = cl_read_dump(name, &size);
-      assert(size == chunk_remainder);
-#endif /* NDEBUG */
-      memcpy(to + chunk_n*chunk_sz, from, chunk_remainder);
-      cl_free(from);
-      curr++;
-    }
-    cl_mem_unmap(mem);
-  }
-error:
-  return err;
-}
-#endif
-
 extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
 
 static cl_int
@@ -398,33 +203,11 @@ cl_command_queue_ND_range(cl_command_queue queue,
   /* Check that the user did not forget any argument */
   TRY (cl_kernel_check_args, k);
 
-#if USE_FULSIM
-  cl_buffer_mgr bufmgr = NULL;
-  FILE *file = NULL;
-  const char *run_it = getenv("OCL_SIMULATOR");
-  if (run_it != NULL && strcmp(run_it, "1") == 0) {
-    file = fopen("dump.aub", "wb");
-    FATAL_IF (file == NULL, "Unable to open file dump.aub");
-    bufmgr = cl_context_get_bufmgr(queue->ctx);
-    drm_intel_bufmgr_gem_set_aubfile(bufmgr, file);
-  }
-#endif /* USE_FULSIM */
-
   if (ver == 7 || ver == 75 || ver == 8)
     TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
   else
     FATAL ("Unknown Gen Device");
 
-#if USE_FULSIM
-  if (run_it != NULL && strcmp(run_it, "1") == 0) {
-    TRY (cl_fulsim_dump_all_surfaces, queue, k);
-    drm_intel_bufmgr_gem_stop_aubfile(bufmgr);
-    fclose(file);
-    cl_run_fulsim();
-    TRY (cl_fulsim_read_all_surfaces, queue, k);
-  }
-#endif /* USE_FULSIM */
-
 error:
   return err;
 }
@@ -433,7 +216,8 @@ LOCAL void
 cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
 {
   size_t global_wk_sz[3];
-  void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz);
+  size_t outbuf_sz = 0;
+  void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz, &outbuf_sz);
 
   cl_gpgpu_flush(gpgpu);
 
@@ -444,7 +228,7 @@ cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
       buf_addr = cl_gpgpu_map_printf_buffer(gpgpu, 1);
 
     interp_output_printf(printf_info, index_addr, buf_addr, global_wk_sz[0],
-                      global_wk_sz[1], global_wk_sz[2]);
+                      global_wk_sz[1], global_wk_sz[2], outbuf_sz);
 
     cl_gpgpu_unmap_printf_buffer(gpgpu, 0);
     if (interp_get_printf_sizeof_size(printf_info))
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 7ec1b6f..47dae4a 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -46,7 +46,6 @@ struct _cl_command_queue {
   cl_command_queue prev, next;         /* We chain the command queues together */
   void *thread_data;                   /* Used to store thread context data */
   cl_mem perf;                         /* Where to put the perf counters */
-  cl_mem fulsim_out;                   /* Fulsim will output this buffer */
 };
 
 /* The macro to get the thread specified gpgpu struct. */
@@ -77,9 +76,6 @@ extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
 /* The memory object where to report the performance */
 extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
 
-/* Fulsim will dump this buffer (mostly to check its consistency */
-cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem);
-
 /* Flush for the command queue */
 extern cl_int cl_command_queue_flush(cl_command_queue);
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index ba015ca..eec39b4 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -31,7 +31,7 @@
 #include <string.h>
 
 #define MAX_GROUP_SIZE_IN_HALFSLICE   512
-static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+128; }
+static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+256; }
 
 /* "Varing" payload is the part of the curbe that changes accross threads in the
  *  same work group. Right now, it consists in local IDs and block IPs
@@ -281,7 +281,7 @@ cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num,
   if (buf_size < 1*1024)
     buf_size = 1*1024*1024;
   else
-    buf_size = 4*1024*1024; //at most.
+    buf_size = 16*1024*1024; //at most.
 
   if (offset > 0) {
     if (cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset, interp_get_printf_buf_bti(printf_info)) != 0)
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 522c3c5..3032a38 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -27,11 +27,13 @@
 #include "cl_thread.h"
 #include "CL/cl.h"
 #include "cl_gbe_loader.h"
+#include "cl_alloc.h"
 
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <sys/sysinfo.h>
 
 #ifndef CL_VERSION_1_2
 #define CL_DEVICE_BUILT_IN_KERNELS 0x103F
@@ -42,8 +44,8 @@ static struct _cl_device_id intel_ivb_gt2_device = {
   .max_compute_unit = 16,
   .max_thread_per_unit = 8,
   .sub_slice_count = 2,
-  .max_work_item_sizes = {1024, 1024, 1024},
-  .max_work_group_size = 1024,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
   .max_clock_frequency = 1000,
 #include "cl_gen7_device.h"
 };
@@ -53,8 +55,8 @@ static struct _cl_device_id intel_ivb_gt1_device = {
   .max_compute_unit = 6,
   .max_thread_per_unit = 6,
   .sub_slice_count = 1,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
+  .max_work_item_sizes = {256, 256, 256},
+  .max_work_group_size = 256,
   .max_clock_frequency = 1000,
 #include "cl_gen7_device.h"
 };
@@ -64,8 +66,8 @@ static struct _cl_device_id intel_baytrail_t_device = {
   .max_compute_unit = 4,
   .max_thread_per_unit = 8,
   .sub_slice_count = 1,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
+  .max_work_item_sizes = {256, 256, 256},
+  .max_work_group_size = 256,
   .max_clock_frequency = 1000,
 #include "cl_gen7_device.h"
 };
@@ -76,8 +78,8 @@ static struct _cl_device_id intel_hsw_gt1_device = {
   .max_compute_unit = 10,
   .max_thread_per_unit = 7,
   .sub_slice_count = 1,
-  .max_work_item_sizes = {1024, 1024, 1024},
-  .max_work_group_size = 1024,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
   .max_clock_frequency = 1000,
 #include "cl_gen75_device.h"
 };
@@ -87,8 +89,8 @@ static struct _cl_device_id intel_hsw_gt2_device = {
   .max_compute_unit = 20,
   .max_thread_per_unit = 7,
   .sub_slice_count = 2,
-  .max_work_item_sizes = {1024, 1024, 1024},
-  .max_work_group_size = 1024,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
   .max_clock_frequency = 1000,
 #include "cl_gen75_device.h"
 };
@@ -98,8 +100,8 @@ static struct _cl_device_id intel_hsw_gt3_device = {
   .max_compute_unit = 40,
   .max_thread_per_unit = 7,
   .sub_slice_count = 4,
-  .max_work_item_sizes = {1024, 1024, 1024},
-  .max_work_group_size = 1024,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
   .max_clock_frequency = 1000,
 #include "cl_gen75_device.h"
 };
@@ -110,7 +112,7 @@ static struct _cl_device_id intel_brw_gt1_device = {
   .max_compute_unit = 12,
   .max_thread_per_unit = 7,
   .sub_slice_count = 2,
-  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
 #include "cl_gen75_device.h"
@@ -121,7 +123,7 @@ static struct _cl_device_id intel_brw_gt2_device = {
   .max_compute_unit = 24,
   .max_thread_per_unit = 7,
   .sub_slice_count = 3,
-  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
 #include "cl_gen75_device.h"
@@ -132,7 +134,7 @@ static struct _cl_device_id intel_brw_gt3_device = {
   .max_compute_unit = 48,
   .max_thread_per_unit = 7,
   .sub_slice_count = 6,
-  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
 #include "cl_gen75_device.h"
@@ -407,21 +409,29 @@ brw_gt3_break:
   cl_buffer_mgr bufmgr = cl_driver_get_bufmgr(dummy);
 
   const size_t sz = 4096;
-  void* host_ptr = NULL;
-  int err = posix_memalign(&host_ptr, 4096, sz);
-  if (err == 0) {
+  void* host_ptr = cl_aligned_malloc(sz, 4096); /* page-sized probe buffer for the userptr check */
+  if (host_ptr != NULL) {
     cl_buffer bo = cl_buffer_alloc_userptr(bufmgr, "CL memory object", host_ptr, sz, 0);
     if (bo == NULL)
       ret->host_unified_memory = CL_FALSE;
     else
       cl_buffer_unreference(bo);
-    free(host_ptr);
+    cl_free(host_ptr);
   }
   else
     ret->host_unified_memory = CL_FALSE;
   cl_driver_delete(dummy);
 #endif
 
+  struct sysinfo info;
+  if (sysinfo(&info) == 0) {
+    /* Advertise min(installed RAM, 2 GiB); totalram is in mem_unit-sized units, so widen before scaling. */
+    const uint64_t two_gb = 2ull * 1024 * 1024 * 1024;
+    const uint64_t totalram = (uint64_t)info.totalram * info.mem_unit;
+    ret->global_mem_size = (totalram > two_gb) ? two_gb : totalram;
+    ret->max_mem_alloc_size = ret->global_mem_size / 2;
+  }
+
   return ret;
 }
 
@@ -669,9 +679,9 @@ cl_get_kernel_max_wg_sz(cl_kernel kernel)
     if(thread_cnt > 64)
       thread_cnt = 64;
     work_group_size = thread_cnt * simd_width;
-    if(work_group_size > kernel->program->ctx->device->max_work_group_size)
-      work_group_size = kernel->program->ctx->device->max_work_group_size;
   }
+  if(work_group_size > kernel->program->ctx->device->max_work_group_size)
+    work_group_size = kernel->program->ctx->device->max_work_group_size;
   return work_group_size;
 }
 
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 97ca559..c88b9be 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -266,7 +266,7 @@ typedef int (cl_gpgpu_set_printf_info_cb)(cl_gpgpu, void *, size_t*);
 extern cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info;
 
 /* Get the last printfset pointer */
-typedef void* (cl_gpgpu_get_printf_info_cb)(cl_gpgpu, size_t*);
+typedef void* (cl_gpgpu_get_printf_info_cb)(cl_gpgpu, size_t*, size_t*);
 extern cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info;
 
 /* Will spawn all threads */
@@ -305,7 +305,7 @@ extern cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture;
 typedef cl_buffer (cl_buffer_get_buffer_from_libva_cb)(cl_context ctx, unsigned int bo_name, size_t *sz);
 extern cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva;
 
-typedef cl_buffer (cl_buffer_get_image_from_libva_cb)(cl_context ctx, unsigned int bo_name, struct _cl_mem_image *image, unsigned int offset);
+typedef cl_buffer (cl_buffer_get_image_from_libva_cb)(cl_context ctx, unsigned int bo_name, struct _cl_mem_image *image);
 extern cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva;
 
 /* Unref a buffer and destroy it if no more ref */
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 5798e20..d51592c 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -40,8 +40,15 @@ cl_int cl_enqueue_read_buffer(enqueue_data* data)
     if (cl_buffer_get_subdata(mem->bo, data->offset + buffer->sub_offset,
 			       data->size, data->ptr) != 0)
       err = CL_MAP_FAILURE;
-  } else
-    memcpy(data->ptr, (char*)mem->host_ptr + data->offset + buffer->sub_offset, data->size);
+  } else {
+    void* src_ptr = cl_mem_map_auto(mem, 0);
+    if (src_ptr == NULL)
+      err = CL_MAP_FAILURE;
+    else {
+      memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
+      cl_mem_unmap_auto(mem);
+    }
+  }
   return err;
 }
 
@@ -99,13 +106,28 @@ error:
 
 cl_int cl_enqueue_write_buffer(enqueue_data *data)
 {
+  cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
          mem->type == CL_MEM_SUBBUFFER_TYPE);
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
 
-  return cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset,
-			   data->size, data->const_ptr);
+  if (!mem->is_userptr) {
+    /* Not a userptr object: hand the copy to the driver's subdata call. */
+    if (cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset,
+			   data->size, data->const_ptr) != 0)
+      err = CL_MAP_FAILURE;
+  } else {
+    char* dst_ptr = (char*)cl_mem_map_auto(mem, 1);
+    if (dst_ptr != NULL) {
+      memcpy(dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
+      cl_mem_unmap_auto(mem);
+    } else {
+      err = CL_MAP_FAILURE;
+    }
+  }
+
+  return err;
 }
 
 cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
@@ -240,7 +262,7 @@ cl_int cl_enqueue_map_buffer(enqueue_data *data)
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
 
   if (mem->is_userptr)
-    ptr = mem->host_ptr;
+    ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
   else {
     if(data->unsync_map == 1)
       //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
diff --git a/src/cl_event.c b/src/cl_event.c
index e20342a..f70e531 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -436,9 +436,6 @@ void cl_event_set_status(cl_event event, cl_int status)
     event->status = status;
   pthread_mutex_unlock(&event->ctx->event_lock);
 
-  if(event->status <= CL_COMPLETE)
-    cl_event_delete(event);
-
   /* Call user callback */
   user_cb = event->user_cb;
   while(user_cb) {
@@ -449,46 +446,48 @@ void cl_event_set_status(cl_event event, cl_int status)
     user_cb = user_cb->next;
   }
 
-  if(event->type != CL_COMMAND_USER)
-    return;
+  if(event->type == CL_COMMAND_USER) {
+    /* Check all defer enqueue */
+    enqueue_callback *cb, *enqueue_cb = event->waits_head;
+    while(enqueue_cb) {
+      /* Remove this user event in enqueue_cb, update the header if needed. */
+      cl_event_remove_user_event(&enqueue_cb->wait_user_events, event);
+      cl_event_delete(event);
+
+      /* Still wait on other user events */
+      if(enqueue_cb->wait_user_events != NULL) {
+        enqueue_cb = enqueue_cb->next;
+        continue;
+      }
 
-  /* Check all defer enqueue */
-  enqueue_callback *cb, *enqueue_cb = event->waits_head;
-  while(enqueue_cb) {
-    /* Remove this user event in enqueue_cb, update the header if needed. */
-    cl_event_remove_user_event(&enqueue_cb->wait_user_events, event);
-    cl_event_delete(event);
+      //remove user event from enqueue_cb's ctx
+      cl_command_queue_remove_event(enqueue_cb->event->queue, event);
+      cl_command_queue_remove_barrier_event(enqueue_cb->event->queue, event);
 
-    /* Still wait on other user events */
-    if(enqueue_cb->wait_user_events != NULL) {
+      /* All user events complete, now wait enqueue events */
+      ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
+          enqueue_cb->event->queue);
+      assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
+      ret = ~ret;
+      cb = enqueue_cb;
       enqueue_cb = enqueue_cb->next;
-      continue;
-    }
 
-    //remove user event frome enqueue_cb's ctx
-    cl_command_queue_remove_event(enqueue_cb->event->queue, event);
-    cl_command_queue_remove_barrier_event(enqueue_cb->event->queue, event);
-
-    /* All user events complete, now wait enqueue events */
-    ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
-        enqueue_cb->event->queue);
-    assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
-    ret = ~ret;
-    cb = enqueue_cb;
-    enqueue_cb = enqueue_cb->next;
-
-    /* Call the pending operation */
-    evt = cb->event;
-    /* TODO: if this event wait on several events, one event's
-       status is error, the others is complete, what's the status
-       of this event? Can't find the description in OpenCL spec.
-       Simply update to latest finish wait event.*/
-    cl_event_set_status(cb->event, status);
-    if(evt->emplict == CL_FALSE) {
-      cl_event_delete(evt);
+      /* Call the pending operation */
+      evt = cb->event;
+      /* TODO: if this event wait on several events, one event's
+         status is error, the others is complete, what's the status
+         of this event? Can't find the description in OpenCL spec.
+         Simply update to latest finish wait event.*/
+      cl_event_set_status(cb->event, status);
+      if(evt->emplict == CL_FALSE) {
+        cl_event_delete(evt);
+      }
     }
+    event->waits_head = NULL;
   }
-  event->waits_head = NULL;
+
+  if(event->status <= CL_COMPLETE)
+    cl_event_delete(event);
 }
 
 void cl_event_update_status(cl_event event, int wait)
diff --git a/src/cl_gen75_device.h b/src/cl_gen75_device.h
index d6743a4..8cf2dcd 100644
--- a/src/cl_gen75_device.h
+++ b/src/cl_gen75_device.h
@@ -19,7 +19,7 @@
 
 /* Common fields for both SNB devices (either GT1 or GT2)
  */
-.max_parameter_size = 1024, 
+.max_parameter_size = 1024,
 .global_mem_cache_line_size = 128, /* XXX */
 .global_mem_cache_size = 8 << 10, /* XXX */
 .local_mem_type = CL_GLOBAL,
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 470531a..6857f8a 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -18,7 +18,7 @@
  */
 
 /* Common fields for both IVB devices (either GT1 or GT2) */
-.max_parameter_size = 1024, 
+.max_parameter_size = 1024,
 .global_mem_cache_line_size = 128, /* XXX */
 .global_mem_cache_size = 8 << 10, /* XXX */
 .local_mem_type = CL_GLOBAL,
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 37abfd2..0950327 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -24,7 +24,7 @@
 .max_1d_global_work_sizes = {1024 * 1024 * 256, 1, 1},
 .max_2d_global_work_sizes = {8192, 8192, 1},
 .max_3d_global_work_sizes = {8192, 8192, 2048},
-.preferred_vector_width_char = 8,
+.preferred_vector_width_char = 16,
 .preferred_vector_width_short = 8,
 .preferred_vector_width_int = 4,
 .preferred_vector_width_long = 2,
@@ -40,24 +40,24 @@
 .native_vector_width_half = 8,
 .preferred_wg_sz_mul = 16,
 .address_bits = 32,
-.max_mem_alloc_size = 256 * 1024 * 1024,
+.max_mem_alloc_size = 512 * 1024 * 1024,
 .image_support = CL_TRUE,
-.max_read_image_args = 128,
-.max_write_image_args = 8,
+.max_read_image_args = BTI_MAX_READ_IMAGE_ARGS,
+.max_write_image_args = BTI_MAX_WRITE_IMAGE_ARGS,
 .image_max_array_size = 2048,
 .image2d_max_width = 8192,
 .image2d_max_height = 8192,
 .image3d_max_width = 8192,
 .image3d_max_height = 8192,
 .image3d_max_depth = 2048,
-.image_mem_size = 8192,
+.image_mem_size = 65536,
 .max_samplers = 16,
 .mem_base_addr_align = sizeof(cl_long) * 16 * 8,
 .min_data_type_align_size = sizeof(cl_long) * 16,
 .double_fp_config = 0,
 .global_mem_cache_type = CL_READ_WRITE_CACHE,
 .global_mem_size = 1024 * 1024 * 1024,
-.max_constant_buffer_size = 512 << 10,
+.max_constant_buffer_size = 128 * 1024 * 1024,
 .max_constant_args = 8,
 .error_correction_support = CL_FALSE,
 #ifdef HAS_USERPTR
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index a869515..331d250 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -114,11 +114,8 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   arg_sz = interp_kernel_get_arg_size(k->opaque, index);
 
   if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
-    if (arg_sz == 2 && arg_type == GBE_ARG_VALUE && sz == sizeof(cl_sampler)) {
-      /* FIXME, this is a workaround for the case when a kernel arg
-         defined a sampler_t but doesn't use it.*/
-      arg_type = GBE_ARG_SAMPLER;
-    } else
+    /* A size mismatch is tolerated only for a sampler passed as a cl_sampler handle. */
+    if (arg_type != GBE_ARG_SAMPLER || sz != sizeof(cl_sampler))
       return CL_INVALID_ARG_SIZE;
   }
 
@@ -182,8 +179,9 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     k->args[index].sampler = sampler;
     cl_set_sampler_arg_slot(k, index, sampler);
     offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-    assert(offset + 2 <= k->curbe_sz);
-    memcpy(k->curbe + offset, &sampler->clkSamplerValue, 2);
+    //assert(arg_sz == 4);
+    assert(offset + 4 <= k->curbe_sz);
+    memcpy(k->curbe + offset, &sampler->clkSamplerValue, 4);
     return CL_SUCCESS;
   }
 
@@ -338,6 +336,7 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   /* Get image data & size */
   k->image_sz = interp_kernel_get_image_size(k->opaque);
   assert(k->sampler_sz <= GEN_MAX_SURFACES);
+  assert(k->image_sz <= ctx->device->max_read_image_args + ctx->device->max_write_image_args);
   if (k->image_sz > 0) {
     TRY_ALLOC_NO_ERR(k->images, cl_calloc(k->image_sz, sizeof(k->images[0])));
     interp_kernel_get_image_data(k->opaque, k->images);
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 0fbd304..3225fd2 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -190,10 +190,18 @@ cl_get_image_info(cl_mem mem,
     *(size_t *)param_value = image->slice_pitch;
     break;
   case CL_IMAGE_WIDTH:
-    *(size_t *)param_value = image->w;
+
+    if (mem->type == CL_MEM_BUFFER1D_IMAGE_TYPE) {
+      struct _cl_mem_buffer1d_image *buffer1d_image = (struct _cl_mem_buffer1d_image*) image;
+      *(size_t *)param_value = buffer1d_image->size;
+    } else
+      *(size_t *)param_value = image->w;
     break;
   case CL_IMAGE_HEIGHT:
-    *(size_t *)param_value = IS_1D(image) ? 0 : image->h;
+    if (mem->type == CL_MEM_BUFFER1D_IMAGE_TYPE)
+      *(size_t *)param_value = 0;
+    else
+      *(size_t *)param_value = IS_1D(image) ? 0 : image->h;
     break;
   case CL_IMAGE_DEPTH:
     *(size_t *)param_value = IS_3D(image) ? image->depth : 0;
@@ -243,6 +251,10 @@ cl_mem_allocate(enum cl_mem_type type,
     struct _cl_mem_gl_image *gl_image = NULL;
     TRY_ALLOC (gl_image, CALLOC(struct _cl_mem_gl_image));
     mem = &gl_image->base.base;
+  } else if (type == CL_MEM_BUFFER1D_IMAGE_TYPE) {
+    struct _cl_mem_buffer1d_image *buffer1d_image = NULL;
+    TRY_ALLOC(buffer1d_image, CALLOC(struct _cl_mem_buffer1d_image));
+    mem = &buffer1d_image->base.base;
   } else {
     struct _cl_mem_buffer *buffer = NULL;
     TRY_ALLOC (buffer, CALLOC(struct _cl_mem_buffer));
@@ -266,16 +278,26 @@ cl_mem_allocate(enum cl_mem_type type,
 
 #ifdef HAS_USERPTR
     if (ctx->device->host_unified_memory) {
+      int page_size = getpagesize();
       /* currently only cl buf is supported, will add cl image support later */
-      if ((flags & CL_MEM_USE_HOST_PTR) && host_ptr != NULL) {
-        /* userptr not support tiling */
-        if (!is_tiled) {
-          int page_size = getpagesize();
-          if ((((unsigned long)host_ptr | sz) & (page_size - 1)) == 0) {
-            mem->is_userptr = 1;
-            mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", host_ptr, sz, 0);
+      if (type == CL_MEM_BUFFER_TYPE) {
+        if (flags & CL_MEM_USE_HOST_PTR) {
+          assert(host_ptr != NULL);
+          /* userptr not support tiling */
+          if (!is_tiled) {
+            if ((((unsigned long)host_ptr | sz) & (page_size - 1)) == 0) {
+              mem->is_userptr = 1;
+              mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", host_ptr, sz, 0);
+            }
           }
         }
+        else if (flags & CL_MEM_ALLOC_HOST_PTR) {
+          const size_t alignedSZ = ALIGN(sz, page_size);
+          void* internal_host_ptr = cl_aligned_malloc(alignedSZ, page_size);
+          mem->host_ptr = internal_host_ptr;
+          mem->is_userptr = 1;
+          mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", internal_host_ptr, alignedSZ, 0);
+        }
       }
     }
 
@@ -377,22 +399,6 @@ cl_mem_new_buffer(cl_context ctx,
     goto error;
   }
 
-  /* CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR
-     are mutually exclusive. */
-  if (UNLIKELY(flags & CL_MEM_ALLOC_HOST_PTR &&
-               flags & CL_MEM_USE_HOST_PTR)) {
-    err = CL_INVALID_HOST_PTR;
-    goto error;
-  }
-
-  /* CL_MEM_COPY_HOST_PTR and CL_MEM_USE_HOST_PTR
-     are mutually exclusive. */
-  if (UNLIKELY(flags & CL_MEM_COPY_HOST_PTR &&
-               flags & CL_MEM_USE_HOST_PTR)) {
-    err = CL_INVALID_HOST_PTR;
-    goto error;
-  }
-
   if ((err = cl_get_device_info(ctx->device,
                                 CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                                 sizeof(max_mem_size),
@@ -416,13 +422,17 @@ cl_mem_new_buffer(cl_context ctx,
     goto error;
 
   /* Copy the data if required */
-  if (flags & CL_MEM_COPY_HOST_PTR)
-    cl_buffer_subdata(mem->bo, 0, sz, data);
+  if (flags & CL_MEM_COPY_HOST_PTR) {
+    if (mem->is_userptr)
+      memcpy(mem->host_ptr, data, sz);
+    else
+      cl_buffer_subdata(mem->bo, 0, sz, data);
+  }
 
   if ((flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr)
     cl_buffer_subdata(mem->bo, 0, sz, data);
 
-  if (flags & CL_MEM_USE_HOST_PTR || flags & CL_MEM_COPY_HOST_PTR)
+  if (flags & CL_MEM_USE_HOST_PTR)
     mem->host_ptr = data;
 
 exit:
@@ -637,11 +647,15 @@ cl_mem_copy_image(struct _cl_mem_image *image,
   cl_mem_unmap_auto((cl_mem)image);
 }
 
-cl_image_tiling_t cl_get_default_tiling(void)
+cl_image_tiling_t cl_get_default_tiling(cl_driver drv)
 {
   static int initialized = 0;
   static cl_image_tiling_t tiling = CL_TILE_X;
+
   if (!initialized) {
+    // FIXME, need to find out the performance diff's root cause on BDW.
+    if(cl_driver_get_ver(drv) == 8)
+      tiling = CL_TILE_Y;
     char *tilingStr = getenv("OCL_TILING");
     if (tilingStr != NULL) {
       switch (tilingStr[0]) {
@@ -676,6 +690,7 @@ _cl_mem_new_image(cl_context ctx,
   cl_mem_object_type image_type = orig_image_type;
   uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
   size_t sz = 0, aligned_pitch = 0, aligned_slice_pitch = 0, aligned_h = 0;
+  size_t origin_width = w;  // original width, kept for the image1d buffer workaround.
   cl_image_tiling_t tiling = CL_NO_TILE;
 
   /* Check flags consistency */
@@ -708,8 +723,7 @@ _cl_mem_new_image(cl_context ctx,
       image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)))
     DO_IMAGE_ERROR;
 
-  if (image_type == CL_MEM_OBJECT_IMAGE1D ||
-      image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+  if (image_type == CL_MEM_OBJECT_IMAGE1D) {
     size_t min_pitch = bpp * w;
     if (data && pitch == 0)
       pitch = min_pitch;
@@ -722,19 +736,30 @@ _cl_mem_new_image(cl_context ctx,
     if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
     if (UNLIKELY(!data && slice_pitch != 0)) DO_IMAGE_ERROR;
     tiling = CL_NO_TILE;
-  } else if (image_type == CL_MEM_OBJECT_IMAGE2D) {
+  } else if (image_type == CL_MEM_OBJECT_IMAGE2D ||
+             image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+
+    if (image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+      if (UNLIKELY(w > ctx->device->image_mem_size)) DO_IMAGE_ERROR;
+      /* This is an image1d buffer which exceeds normal image size restriction.
+         We have to use a 2D image to simulate this 1D image. */
+      h = (w + ctx->device->image2d_max_width - 1) / ctx->device->image2d_max_width;
+      w = w > ctx->device->image2d_max_width ? ctx->device->image2d_max_width : w;
+      tiling = CL_NO_TILE;
+    } else if (cl_driver_get_ver(ctx->drv) != 6) {
+      /* Pick up tiling mode (we do only linear on SNB) */
+      tiling = cl_get_default_tiling(ctx->drv);
+    }
+
     size_t min_pitch = bpp * w;
     if (data && pitch == 0)
       pitch = min_pitch;
+
     if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
     if (UNLIKELY(h > ctx->device->image2d_max_height)) DO_IMAGE_ERROR;
     if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
     if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
 
-    /* Pick up tiling mode (we do only linear on SNB) */
-    if (cl_driver_get_ver(ctx->drv) != 6)
-      tiling = cl_get_default_tiling();
-
     depth = 1;
   } else if (image_type == CL_MEM_OBJECT_IMAGE3D ||
              image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
@@ -743,7 +768,7 @@ _cl_mem_new_image(cl_context ctx,
       h = 1;
       tiling = CL_NO_TILE;
     } else if (cl_driver_get_ver(ctx->drv) != 6)
-      tiling = cl_get_default_tiling();
+      tiling = cl_get_default_tiling(ctx->drv);
 
     size_t min_pitch = bpp * w;
     if (data && pitch == 0)
@@ -785,11 +810,20 @@ _cl_mem_new_image(cl_context ctx,
   if(tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
     tiling = CL_NO_TILE;
     aligned_pitch = w * bpp;
-    aligned_h     = h;
+    aligned_h     = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
     sz = aligned_pitch * aligned_h * depth;
   }
 
-  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, &err);
+  if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)
+    mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, &err);
+  else {
+    mem = cl_mem_allocate(CL_MEM_BUFFER1D_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, &err);
+    if (mem != NULL && err == CL_SUCCESS) {
+      struct _cl_mem_buffer1d_image *buffer1d_image = (struct _cl_mem_buffer1d_image *)mem;
+      buffer1d_image->size = origin_width;;
+    }
+  }
+
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
@@ -1081,6 +1115,9 @@ cl_mem_delete(cl_mem mem)
     cl_buffer_unreference(mem->bo);
   }
 
+  if (mem->is_userptr && (mem->flags & CL_MEM_ALLOC_HOST_PTR))
+    cl_free(mem->host_ptr);
+
   cl_free(mem);
 }
 
@@ -1818,8 +1855,13 @@ cl_mem_map_auto(cl_mem mem, int write)
 {
   if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
     return cl_mem_map_gtt(mem);
-  else
-    return cl_mem_map(mem, write);
+  else {
+    if (mem->is_userptr) {
+      cl_buffer_wait_rendering(mem->bo);
+      return mem->host_ptr;
+    }else
+      return cl_mem_map(mem, write);
+  }
 }
 
 LOCAL cl_int
@@ -1829,7 +1871,7 @@ cl_mem_unmap_auto(cl_mem mem)
     cl_buffer_unmap_gtt(mem->bo);
     mem->mapped_gtt = 0;
   }
-  else
+  else if (!mem->is_userptr)
     cl_buffer_unmap(mem->bo);
   return CL_SUCCESS;
 }
@@ -1910,7 +1952,7 @@ LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx,
 
   image = cl_mem_image(mem);
 
-  mem->bo = cl_buffer_get_image_from_libva(ctx, bo_name, image, offset);
+  mem->bo = cl_buffer_get_image_from_libva(ctx, bo_name, image);
 
   image->w = width;
   image->h = height;
diff --git a/src/cl_mem.h b/src/cl_mem.h
index ac1175d..fd50220 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -72,6 +72,7 @@ enum cl_mem_type {
   CL_MEM_SUBBUFFER_TYPE,
   CL_MEM_IMAGE_TYPE,
   CL_MEM_GL_IMAGE_TYPE,
+  CL_MEM_BUFFER1D_IMAGE_TYPE
 };
 #define IS_IMAGE(mem) (mem->type >= CL_MEM_IMAGE_TYPE)
 #define IS_GL_IMAGE(mem) (mem->type == CL_MEM_GL_IMAGE_TYPE)
@@ -86,13 +87,13 @@ typedef  struct _cl_mem {
   size_t size;              /* original request size, not alignment size, used in constant buffer */
   cl_context ctx;           /* Context it belongs to */
   cl_mem_flags flags;       /* Flags specified at the creation time */
-  void * host_ptr;          /* Pointer of the host mem specified by CL_MEM_ALLOC_HOST_PTR */
+  void * host_ptr;          /* Pointer of the host mem specified by CL_MEM_ALLOC_HOST_PTR, CL_MEM_USE_HOST_PTR */
   cl_mapped_ptr* mapped_ptr;/* Store the mapped addresses and size by caller. */
   int mapped_ptr_sz;        /* The array size of mapped_ptr. */
   int map_ref;              /* The mapped count. */
   uint8_t mapped_gtt;       /* This object has mapped gtt, for unmap. */
   cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
-  uint8_t is_userptr;    /* CL_MEM_USE_HOST_PTR is enabled*/
+  uint8_t is_userptr;       /* bo wraps host memory via userptr (CL_MEM_USE_HOST_PTR or CL_MEM_ALLOC_HOST_PTR) */
 } _cl_mem;
 
 struct _cl_mem_image {
@@ -117,6 +118,11 @@ struct _cl_mem_gl_image {
   uint32_t texture;
 };
 
+struct _cl_mem_buffer1d_image {
+  struct _cl_mem_image base;
+  uint32_t size;
+};
+
 inline static void
 cl_mem_image_init(struct _cl_mem_image *image, size_t w, size_t h,
                   cl_mem_object_type image_type,
diff --git a/src/cl_program.c b/src/cl_program.c
index fa67ef2..c30f85e 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -738,9 +738,9 @@ cl_program_compile(cl_program            p,
 
     if (UNLIKELY(p->opaque == NULL)) {
       if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
-        err = CL_INVALID_BUILD_OPTIONS;
+        err = CL_INVALID_COMPILER_OPTIONS;
       else
-        err = CL_BUILD_PROGRAM_FAILURE;
+        err = CL_COMPILE_PROGRAM_FAILURE;
       goto error;
     }
 
@@ -758,8 +758,6 @@ cl_program_compile(cl_program            p,
 
 error:
   p->build_status = CL_BUILD_ERROR;
-  cl_program_delete(p);
-  p = NULL;
   return err;
 }
 
diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c
index a65ac86..dcc8d98 100644
--- a/src/intel/intel_batchbuffer.c
+++ b/src/intel/intel_batchbuffer.c
@@ -158,13 +158,6 @@ intel_batchbuffer_emit_reloc(intel_batchbuffer_t *batch,
   intel_batchbuffer_emit_dword(batch, bo->offset + delta);
 }
 
-LOCAL void
-intel_batchbuffer_emit_mi_flush(intel_batchbuffer_t *batch)
-{
-  intel_batchbuffer_require_space(batch, 4);
-  intel_batchbuffer_emit_dword(batch, MI_FLUSH | STATE_INSTRUCTION_CACHE_INVALIDATE);
-}
-
 LOCAL intel_batchbuffer_t*
 intel_batchbuffer_new(intel_driver_t *intel)
 {
diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h
index 121c824..0071be6 100644
--- a/src/intel/intel_batchbuffer.h
+++ b/src/intel/intel_batchbuffer.h
@@ -96,7 +96,6 @@ extern void intel_batchbuffer_emit_reloc(intel_batchbuffer_t*,
                                          uint32_t read_domains,
                                          uint32_t write_domains,
                                          uint32_t delta);
-extern void intel_batchbuffer_emit_mi_flush(intel_batchbuffer_t*);
 extern void intel_batchbuffer_init(intel_batchbuffer_t*, struct intel_driver*);
 extern void intel_batchbuffer_terminate(intel_batchbuffer_t*);
 extern void intel_batchbuffer_flush(intel_batchbuffer_t*);
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
index e983718..044c004 100644
--- a/src/intel/intel_defines.h
+++ b/src/intel/intel_defines.h
@@ -304,6 +304,10 @@
 #define URB_SIZE(intel)         (IS_IGDNG(intel->device_id) ? 1024 : \
                                  IS_G4X(intel->device_id) ? 384 : 256)
 
+// HSW
+#define HSW_SCRATCH1_OFFSET                      (0xB038)
+#define HSW_ROW_CHICKEN3_HDC_OFFSET              (0xE49C)
+
 // L3 cache stuff 
 #define GEN7_L3_SQC_REG1_ADDRESS_OFFSET          (0XB010)
 #define GEN7_L3_CNTL_REG2_ADDRESS_OFFSET         (0xB020)
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index c370c66..21546d9 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -109,13 +109,29 @@ error:
 /* just used for maximum relocation number in drm_intel */
 #define BATCH_SIZE 0x4000
 
+/* set OCL_DUMP_AUB=1 to get aub file */
+static void
+intel_driver_aub_dump(intel_driver_t *driver)
+{
+  char *val;
+  val = getenv("OCL_DUMP_AUB");
+  if (!val)
+    return;
+  if (atoi(val) != 0) {
+    drm_intel_bufmgr_gem_set_aub_filename(driver->bufmgr,
+					  "beignet.aub");
+    drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
+  }
+}
+
 static void
 intel_driver_memman_init(intel_driver_t *driver)
 {
   driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
   assert(driver->bufmgr);
-  //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
   drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
+  driver->device_id = drm_intel_bufmgr_gem_get_devid(driver->bufmgr);
+  intel_driver_aub_dump(driver);
 }
 
 static void
@@ -139,11 +155,7 @@ intel_driver_init(intel_driver_t *driver, int dev_fd)
   driver->fd = dev_fd;
   driver->locked = 0;
   pthread_mutex_init(&driver->ctxmutex, NULL);
-#ifndef NDEBUG
-  int res =
-#endif /* NDEBUG */
-  intel_driver_get_param(driver, I915_PARAM_CHIPSET_ID, &driver->device_id);
-  assert(res);
+
   intel_driver_memman_init(driver);
   intel_driver_context_init(driver);
 
@@ -251,20 +263,6 @@ intel_driver_close(intel_driver_t *intel)
 }
 
 LOCAL int
-intel_driver_get_param(intel_driver_t *driver, int param, int *value)
-{
-  int ret;
-  struct drm_i915_getparam gp;
-
-  memset(&gp, 0, sizeof(struct drm_i915_getparam));
-  gp.param = param;
-  gp.value = value;
-
-  ret = drmCommandWriteRead(driver->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
-  return ret == 0;
-}
-
-LOCAL int
 intel_driver_is_active(intel_driver_t *driver) {
   return driver->fd >= 0;
 }
@@ -675,15 +673,13 @@ cl_buffer intel_share_buffer_from_libva(cl_context ctx,
 
 cl_buffer intel_share_image_from_libva(cl_context ctx,
                                        unsigned int bo_name,
-                                       struct _cl_mem_image *image,
-                                       unsigned int offset)
+                                       struct _cl_mem_image *image)
 {
   drm_intel_bo *intel_bo;
   uint32_t intel_tiling, intel_swizzle_mode;
 
   intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
 
-  intel_bo->offset += offset;
   drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
   image->tiling = get_cl_tiling(intel_tiling);
 
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
index 61653db..f972ec8 100644
--- a/src/intel/intel_driver.h
+++ b/src/intel/intel_driver.h
@@ -61,8 +61,6 @@
 
 #define MI_NOOP                                 (CMD_MI | 0)
 #define MI_BATCH_BUFFER_END                     (CMD_MI | (0xA << 23))
-#define MI_FLUSH                                (CMD_MI | (0x4 << 23))
-#define STATE_INSTRUCTION_CACHE_INVALIDATE      (0x1 << 0)
 
 #define XY_COLOR_BLT_CMD                        (CMD_2D | (0x50 << 22) | 0x04)
 #define XY_COLOR_BLT_WRITE_ALPHA                (1 << 21)
@@ -143,9 +141,6 @@ extern int intel_driver_terminate(intel_driver_t*);
 /* simple check if driver was initialized (checking fd should suffice) */
 extern int intel_driver_is_active(intel_driver_t*);
 
-/* query device parameters using driver ioctl */
-extern int intel_driver_get_param(intel_driver_t*, int param, int *value);
-
 /* init the call backs used by the ocl driver */
 extern void intel_setup_callbacks(void);
 
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index b6e19db..479077c 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -104,6 +104,9 @@ intel_gpgpu_load_curbe_buffer_t *intel_gpgpu_load_curbe_buffer = NULL;
 typedef void (intel_gpgpu_load_idrt_t)(intel_gpgpu_t *gpgpu);
 intel_gpgpu_load_idrt_t *intel_gpgpu_load_idrt = NULL;
 
+typedef void (intel_gpgpu_pipe_control_t)(intel_gpgpu_t *gpgpu);
+intel_gpgpu_pipe_control_t *intel_gpgpu_pipe_control = NULL;
+
 static void
 intel_gpgpu_sync(void *buf)
 {
@@ -289,13 +292,6 @@ intel_gpgpu_set_base_address_gen7(intel_gpgpu_t *gpgpu)
 
   OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
   OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
-  /* If we output an AUB file, we limit the total size to 64MB */
-#if USE_FULSIM
-  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
-  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
-  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
-  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
-#else
   OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
   /* According to mesa i965 driver code, we must set the dynamic state access upper bound
    * to a valid bound value, otherwise, the border color pointer may be rejected and you
@@ -303,7 +299,6 @@ intel_gpgpu_set_base_address_gen7(intel_gpgpu_t *gpgpu)
   OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
   OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
   OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
-#endif /* USE_FULSIM */
   ADVANCE_BATCH(gpgpu->batch);
 }
 
@@ -341,13 +336,7 @@ intel_gpgpu_set_base_address_gen8(intel_gpgpu_t *gpgpu)
               I915_GEM_DOMAIN_INSTRUCTION,
               0 + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
     OUT_BATCH(gpgpu->batch, 0);
-    /* If we output an AUB file, we limit the total size to 64MB */
-#if USE_FULSIM
-    OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
-    OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
-    OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
-    OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
-#else
+
     OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
     /* According to mesa i965 driver code, we must set the dynamic state access upper bound
      * to a valid bound value, otherwise, the border color pointer may be rejected and you
@@ -355,7 +344,6 @@ intel_gpgpu_set_base_address_gen8(intel_gpgpu_t *gpgpu)
     OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
     OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
     OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
-#endif /* USE_FULSIM */
     ADVANCE_BATCH(gpgpu->batch);
 }
 
@@ -542,7 +530,7 @@ intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx)
 }
 
 static void
-intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
+intel_gpgpu_pipe_control_gen7(intel_gpgpu_t *gpgpu)
 {
   gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
     intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
@@ -561,12 +549,40 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
 }
 
 static void
+intel_gpgpu_pipe_control_gen75(intel_gpgpu_t *gpgpu)
+{
+  gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
+    intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
+  memset(pc, 0, sizeof(*pc));
+  pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
+  pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
+  pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
+  pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
+  pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
+  pc->dw1.cs_stall = 1;
+  pc->dw1.dc_flush_enable = 1;
+
+  pc = (gen6_pipe_control_t*)
+    intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
+  memset(pc, 0, sizeof(*pc));
+  pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
+  pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
+  pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
+  pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
+  pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
+  pc->dw1.render_target_cache_flush_enable = 1;
+  pc->dw1.texture_cache_invalidation_enable = 1;
+  pc->dw1.cs_stall = 1;
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
 intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 {
   BEGIN_BATCH(gpgpu->batch, 9);
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
-  OUT_BATCH(gpgpu->batch, 0x00730000);
+  OUT_BATCH(gpgpu->batch, 0x00A00000);
 
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
@@ -616,10 +632,19 @@ static void
 intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 {
   /* still set L3 in batch buffer for fulsim. */
-  BEGIN_BATCH(gpgpu->batch, 9);
+  BEGIN_BATCH(gpgpu->batch, 15);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  /* FIXME: KMD always disables atomics in L3 for some reason.
+     I checked the spec, and don't think we need that workaround now.
+     Until a patch is sent to the kernel, let's just enable it here. */
+  OUT_BATCH(gpgpu->batch, HSW_SCRATCH1_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0);                         /* enable atomic in L3 */
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, HSW_ROW_CHICKEN3_HDC_OFFSET);
+  OUT_BATCH(gpgpu->batch, (1 << 6ul) << 16);          /* enable atomic in L3 */
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
-  OUT_BATCH(gpgpu->batch, 0x00610000);
+  OUT_BATCH(gpgpu->batch, 0x08800000);
 
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
@@ -745,7 +770,6 @@ static void
 intel_gpgpu_flush_batch_buffer(intel_batchbuffer_t *batch)
 {
   assert(batch);
-  intel_batchbuffer_emit_mi_flush(batch);
   intel_batchbuffer_flush(batch);
 }
 
@@ -1003,11 +1027,11 @@ static int
 intel_get_surface_type(cl_mem_object_type type)
 {
   switch (type) {
-  case CL_MEM_OBJECT_IMAGE1D_BUFFER:
   case CL_MEM_OBJECT_IMAGE1D:
   case CL_MEM_OBJECT_IMAGE1D_ARRAY:
     return I965_SURFACE_1D;
 
+  case CL_MEM_OBJECT_IMAGE1D_BUFFER:
   case CL_MEM_OBJECT_IMAGE2D:
   case CL_MEM_OBJECT_IMAGE2D_ARRAY:
     return I965_SURFACE_2D;
@@ -1031,7 +1055,7 @@ static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_
   if (((IS_IVYBRIDGE(gpgpu->drv->device_id) ||
         IS_HASWELL(gpgpu->drv->device_id) ||
         IS_BROADWELL(gpgpu->drv->device_id))) &&
-      index >= BTI_MAX_IMAGE_NUM + BTI_RESERVED_NUM &&
+      index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM &&
       type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
     surface_type = I965_SURFACE_2D;
   else
@@ -1063,7 +1087,7 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
     ss->ss0.surface_array_spacing = 1;
   }
   ss->ss0.surface_format = format;
-  ss->ss1.base_addr = obj_bo->offset;
+  ss->ss1.base_addr = obj_bo->offset + obj_bo_offset;
   ss->ss2.width = w - 1;
 
   ss->ss2.height = h - 1;
@@ -1108,7 +1132,7 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
     ss->ss0.surface_array_spacing = 1;
   }
   ss->ss0.surface_format = format;
-  ss->ss1.base_addr = obj_bo->offset;
+  ss->ss1.base_addr = obj_bo->offset + obj_bo_offset;
   ss->ss2.width = w - 1;
   ss->ss2.height = h - 1;
   ss->ss3.depth = depth - 1;
@@ -1170,8 +1194,8 @@ intel_gpgpu_bind_image_gen8(intel_gpgpu_t *gpgpu,
   ss->ss2.height = h - 1;
   ss->ss3.depth = depth - 1;
 
-  ss->ss8.surface_base_addr_lo = obj_bo->offset64 & 0xffffffff;
-  ss->ss9.surface_base_addr_hi = (obj_bo->offset64 >> 32) & 0xffffffff;
+  ss->ss8.surface_base_addr_lo = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff;
+  ss->ss9.surface_base_addr_hi = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff;
 
   ss->ss4.render_target_view_ext = depth - 1;
   ss->ss4.min_array_elt = 0;
@@ -1595,6 +1619,9 @@ intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu,
   OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
   OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
   ADVANCE_BATCH(gpgpu->batch);
+
+  if (IS_IVYBRIDGE(gpgpu->drv->device_id))
+    intel_gpgpu_pipe_control(gpgpu);
 }
 
 static void
@@ -1645,6 +1672,8 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
   OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
   OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
   ADVANCE_BATCH(gpgpu->batch);
+
+  intel_gpgpu_pipe_control(gpgpu);
 }
 
 static intel_event_t*
@@ -1867,11 +1896,14 @@ intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info, size_t * gl
 }
 
 static void*
-intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu, size_t * global_sz)
+intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu, size_t * global_sz, size_t *outbuf_sz)
 {
   global_sz[0] = gpgpu->global_wk_sz[0];
   global_sz[1] = gpgpu->global_wk_sz[1];
   global_sz[2] = gpgpu->global_wk_sz[2];
+
+  if (gpgpu->printf_b.bo)
+    *outbuf_sz = gpgpu->printf_b.bo->size;
   return gpgpu->printf_info;
 }
 
@@ -1925,6 +1957,7 @@ intel_set_gpgpu_callbacks(int device_id)
     intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen8;
     intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8;
     cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
+    intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen7;
     return;
   }
 
@@ -1943,6 +1976,7 @@ intel_set_gpgpu_callbacks(int device_id)
     intel_gpgpu_post_action = intel_gpgpu_post_action_gen75;
     intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb
     intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen75;
+    intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen75;
   }
   else if (IS_IVYBRIDGE(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
@@ -1957,5 +1991,6 @@ intel_set_gpgpu_callbacks(int device_id)
     intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7;
     intel_gpgpu_post_action = intel_gpgpu_post_action_gen7;
     intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen7;
+    intel_gpgpu_pipe_control = intel_gpgpu_pipe_control_gen7;
   }
 }
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 8cc8b43..8c42f1a 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -18,8 +18,18 @@ configure_file (
 
 #XXX only need GL if required
 link_directories (${LLVM_LIBRARY_DIR} ${OPENGL_LIBDIR} ${DRM_LIBDIR})
-set (utests_sources
+
+set (utests_basic_sources
   utest_error.c
+  utest_assert.cpp
+  utest.cpp
+  utest_file_map.cpp
+  utest_helper.cpp)
+
+# the test case with binary kernel
+set (utests_binary_kernel_sources load_program_from_bin_file.cpp enqueue_built_in_kernels.cpp)
+
+set (utests_sources
   compiler_basic_arithmetic.cpp
   compiler_displacement_map_element.cpp
   compiler_mandelbrot.cpp
@@ -36,6 +46,7 @@ set (utests_sources
   compiler_array1.cpp
   compiler_array2.cpp
   compiler_array3.cpp
+  compiler_array4.cpp
   compiler_byte_scatter.cpp
   compiler_ceil.cpp
   compiler_clz_short.cpp
@@ -103,7 +114,6 @@ set (utests_sources
   compiler_write_only.cpp
   compiler_write_only_shorts.cpp
   compiler_switch.cpp
-  compiler_overflow.cpp
   compiler_bswap.cpp
   compiler_math.cpp
   compiler_atomic_functions.cpp
@@ -171,13 +181,13 @@ set (utests_sources
   compiler_function_argument3.cpp
   compiler_function_qualifiers.cpp
   compiler_bool_cross_basic_block.cpp
+  compiler_private_const.cpp
   compiler_private_data_overflow.cpp
   compiler_getelementptr_bitcast.cpp
   compiler_simd_any.cpp
   compiler_simd_all.cpp
   compiler_time_stamp.cpp
   compiler_double_precision.cpp
-  load_program_from_bin_file.cpp
   load_program_from_gen_bin.cpp
   get_arg_info.cpp
   profiling_exec.cpp
@@ -185,7 +195,6 @@ set (utests_sources
   enqueue_copy_buf_unaligned.cpp
   test_printf.cpp
   enqueue_fill_buf.cpp
-  enqueue_built_in_kernels.cpp
   builtin_kernel_max_global_size.cpp
   image_1D_buffer.cpp
   compare_image_2d_and_1d_array.cpp
@@ -195,30 +204,38 @@ set (utests_sources
   compiler_assignment_operation_in_if.cpp
   vload_bench.cpp
   runtime_use_host_ptr_buffer.cpp
-  utest_assert.cpp
-  utest.cpp
-  utest_file_map.cpp
-  utest_helper.cpp)
+  runtime_alloc_host_ptr_buffer.cpp)
+
+if (LLVM_VERSION_NODOT VERSION_GREATER 34)
+  SET(utests_sources
+      ${utests_sources}
+      compiler_overflow.cpp)
+endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
+
+if (X11_FOUND)
+  set(utests_sources
+      ${utests_sources}
+      runtime_climage_from_boname.cpp)
+  SET(UTESTS_REQUIRED_X11_LIB ${X11_LIBRARIES} ${XEXT_LIBRARIES})
+else()
+SET(UTESTS_REQUIRED_X11_LIB "")
+endif (X11_FOUND)
 
 SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
 
+list (GET GBE_BIN_GENERATER -1 GBE_BIN_FILE)
 if(GEN_PCI_ID)
   ADD_CUSTOM_COMMAND(
   OUTPUT ${kernel_bin}.bin
   COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin -t${GEN_PCI_ID}
-  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl)
+  DEPENDS ${GBE_BIN_FILE} ${kernel_bin}.cl)
 else(GEN_PCI_ID)
   ADD_CUSTOM_COMMAND(
   OUTPUT ${kernel_bin}.bin
   COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin
-  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl)
+  DEPENDS ${GBE_BIN_FILE} ${kernel_bin}.cl)
 endif(GEN_PCI_ID)
 
-if (DRM_INTEL_USERPTR)
-SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR ${CMAKE_CXX_FLAGS}")
-SET(CMAKE_C_FLAGS "-DHAS_USERPTR ${CMAKE_C_FLAGS}")
-endif (DRM_INTEL_USERPTR)
-
 ADD_CUSTOM_TARGET(kernel_bin.bin
     DEPENDS ${kernel_bin}.bin)
 
@@ -234,21 +251,27 @@ add_custom_target(utest_generator
     )
 
 if (EGL_FOUND AND MESA_SOURCE_FOUND)
-SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
-SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
-SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
-SET(UTESTS_REQUIRED_EGL_LIB ${EGL_LIBRARIES})
+  SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
+  SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
+  SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
+  SET(UTESTS_REQUIRED_EGL_LIB ${EGL_LIBRARIES})
 else()
-SET(UTESTS_REQUIRED_EGL_LIB "")
+  SET(UTESTS_REQUIRED_EGL_LIB "")
 endif()
 
+if (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
+  SET(utests_sources ${utests_basic_sources} ${utests_binary_kernel_sources})
+else ()
+  SET(utests_sources ${utests_basic_sources} ${utests_binary_kernel_sources} ${ADDMATHFUNC} ${utests_sources})
+endif ()
+
 if (COMPILER STREQUAL "CLANG")
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-tautological-compare")
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-tautological-compare")
 endif ()
 
-ADD_LIBRARY(utests SHARED ${ADDMATHFUNC} ${utests_sources})
+ADD_LIBRARY(utests SHARED ${utests_sources})
 
-TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT})
+TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT} ${UTESTS_REQUIRED_X11_LIB})
 
 ADD_EXECUTABLE(utest_run utest_run.cpp)
 TARGET_LINK_LIBRARIES(utest_run utests)
diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
index 93863a5..a18f31e 100644
--- a/utests/builtin_pow.cpp
+++ b/utests/builtin_pow.cpp
@@ -38,7 +38,7 @@ static void builtin_pow(void)
     }
 
   const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
-  float ULPSIZE_FACTOR = 1.0;
+  float ULPSIZE_FACTOR = 16.0;
   if (env_strict == NULL || strcmp(env_strict, "0") == 0)
     ULPSIZE_FACTOR = 10000.;
 
@@ -75,7 +75,7 @@ static void builtin_pow(void)
 #if udebug
       if ( (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])) ||
            (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])) ||
-           (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR.)   )
+           (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR)   )
       {
         printf_c("%d/%d: x:%f, y:%f -> gpu:%f  cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
       }
diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
index 16dac97..47cc5f4 100644
--- a/utests/builtin_tgamma.cpp
+++ b/utests/builtin_tgamma.cpp
@@ -16,7 +16,7 @@ void builtin_tgamma(void)
   globals[0] = n;
   locals[0] = 16;
   const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
-  float ULPSIZE_FACTOR = 1.0;
+  float ULPSIZE_FACTOR = 16.0;
   if (env_strict == NULL || strcmp(env_strict, "0") == 0)
     ULPSIZE_FACTOR = 10000.;
 
diff --git a/utests/compare_image_2d_and_1d_array.cpp b/utests/compare_image_2d_and_1d_array.cpp
index a2de507..dfa4273 100644
--- a/utests/compare_image_2d_and_1d_array.cpp
+++ b/utests/compare_image_2d_and_1d_array.cpp
@@ -61,7 +61,7 @@ static void compare_image_2d_and_1d_array(void)
   globals[0] = 32;
   globals[1] = 16;
   locals[0] = 32;
-  locals[1] = 16;
+  locals[1] = 8;
   OCL_NDRANGE(2);
 
   // Check result
diff --git a/utests/compiler_array4.cpp b/utests/compiler_array4.cpp
new file mode 100644
index 0000000..51b6d60
--- /dev/null
+++ b/utests/compiler_array4.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+  dst[global_id * 4] = src[global_id * 4];
+}
+
+void compiler_array4(void)
+{
+  const size_t n = 16;
+  int cpu_dst[64], cpu_src[64];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_array4");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t) * 4, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t) * 4, NULL);
+  uint32_t offset = 1;
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(offset), &offset);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i * 4] = ((int32_t*)buf_data[0])[i * 4] = rand() % 16;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i) {
+      OCL_ASSERT(((int32_t*)buf_data[1])[i * 4] == cpu_dst[i * 4]);
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array4);
diff --git a/utests/compiler_bswap.cpp b/utests/compiler_bswap.cpp
index b5986b9..9475b99 100644
--- a/utests/compiler_bswap.cpp
+++ b/utests/compiler_bswap.cpp
@@ -85,6 +85,13 @@ void test(const char *kernel_name)
   for (int32_t i = 0; i < (int32_t) n; ++i)
     cpu(i, cpu_src, cpu_dst);
 
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    cpu_dst[i] = cpu_dst[i] -1;
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    cpu(i, cpu_dst, cpu_dst);
+
   OCL_MAP_BUFFER(1);
  // dump_data(cpu_src, cpu_dst, n);
 
diff --git a/utests/compiler_displacement_map_element.cpp b/utests/compiler_displacement_map_element.cpp
index 98041ec..f031d99 100644
--- a/utests/compiler_displacement_map_element.cpp
+++ b/utests/compiler_displacement_map_element.cpp
@@ -1,8 +1,8 @@
 #include "utest_helper.hpp"
 
 typedef unsigned int uint;
-constexpr int W = 16, H = 16;
-constexpr int SIZE = W * H;
+const int W = 16, H = 16;
+const int SIZE = W * H;
 uint in_1[SIZE];
 uint disp_map[SIZE];
 uint out_1[SIZE];
diff --git a/utests/compiler_fill_gl_image.cpp b/utests/compiler_fill_gl_image.cpp
index 87d2fcd..f1eb8e7 100644
--- a/utests/compiler_fill_gl_image.cpp
+++ b/utests/compiler_fill_gl_image.cpp
@@ -70,7 +70,7 @@ static void compiler_fill_gl_image(void)
     for (uint32_t i = 0; i < w; i++)
       OCL_ASSERT(resultColor[j * w + i] == color);
   OCL_UNMAP_BUFFER(0);
-  delete resultColor;
+  delete[] resultColor;
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_fill_gl_image);
diff --git a/utests/compiler_fill_image_2d_array.cpp b/utests/compiler_fill_image_2d_array.cpp
index 649b416..fc09362 100644
--- a/utests/compiler_fill_image_2d_array.cpp
+++ b/utests/compiler_fill_image_2d_array.cpp
@@ -37,9 +37,9 @@ static void compiler_fill_image_2d_array(void)
   globals[0] = w/2;
   locals[0] = 16;
   globals[1] = h;
-  locals[1] = 8;
+  locals[1] = 4;
   globals[2] = array;
-  locals[2] = 8;
+  locals[2] = 4;
   OCL_NDRANGE(3);
 
   // Check result
diff --git a/utests/compiler_overflow.cpp b/utests/compiler_overflow.cpp
index 1404cfe..5517b5a 100644
--- a/utests/compiler_overflow.cpp
+++ b/utests/compiler_overflow.cpp
@@ -56,6 +56,9 @@ void test(const char *kernel_name, int func_type)
 
   U max = get_max<U>();
 
+  // test add and sub overflow when src1 is 1:
+  // uadd.with.overflow: max + 1
+  // usub.with.overflow: 0 - 1
   OCL_MAP_BUFFER(0);
   for (uint32_t i = 0; i < n; ++i) {
     if(func_type == 0) {
@@ -101,6 +104,55 @@ void test(const char *kernel_name, int func_type)
       OCL_ASSERT(0);
   }
   OCL_UNMAP_BUFFER(2);
+
+  // test add and sub overflow when src1 is max:
+  // uadd.with.overflow: max + max
+  // usub.with.overflow: 0 - max
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i) {
+    if(func_type == 0) {
+      ((T*)buf_data[0])[i].x = max;
+      ((T*)buf_data[0])[i].y = max;
+      ((T*)buf_data[0])[i].z = max;
+      ((T*)buf_data[0])[i].w = i;
+    }else if(func_type == 1) {
+      ((T*)buf_data[0])[i].x = 0;
+      ((T*)buf_data[0])[i].y = 0;
+      ((T*)buf_data[0])[i].z = 0;
+      ((T*)buf_data[0])[i].w = n+2-i;
+    }else
+      OCL_ASSERT(0);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i) {
+      ((T*)buf_data[1])[i].x = max;
+      ((T*)buf_data[1])[i].y = max;
+      ((T*)buf_data[1])[i].z = max;
+      ((T*)buf_data[1])[i].w = 1;
+  }
+  OCL_UNMAP_BUFFER(1);
+
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (uint32_t i = 0; i < 16; ++i) {
+    // printf("%u,%u,%u,%u\n", ((T*)buf_data[2])[i].x,((T*)buf_data[2])[i].y, ((T*)buf_data[2])[i].z, ((T*)buf_data[2])[i].w  );
+    if(func_type == 0) {
+      OCL_ASSERT(((T*)buf_data[2])[i].x == max-1);
+      OCL_ASSERT(((T*)buf_data[2])[i].y == max);
+      OCL_ASSERT(((T*)buf_data[2])[i].z == max);
+      OCL_ASSERT(((T*)buf_data[2])[i].w == i+2);
+    }else if(func_type == 1) {
+      OCL_ASSERT(((T*)buf_data[2])[i].x == 1);
+      OCL_ASSERT(((T*)buf_data[2])[i].y == 0);
+      OCL_ASSERT(((T*)buf_data[2])[i].z == 0);
+      OCL_ASSERT(((T*)buf_data[2])[i].w == n-i);
+    }else
+      OCL_ASSERT(0);
+  }
+  OCL_UNMAP_BUFFER(2);
 }
 
 }
diff --git a/utests/compiler_private_const.cpp b/utests/compiler_private_const.cpp
new file mode 100644
index 0000000..d5fa982
--- /dev/null
+++ b/utests/compiler_private_const.cpp
@@ -0,0 +1,27 @@
+#include "utest_helper.hpp"
+
+void compiler_private_const(void)
+{
+  const size_t n = 16;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_private_const");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = n;
+  locals[0] = n;
+
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  for (size_t i = 0; i < n; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[0])[i] == (int32_t)(i * 2));
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_private_const);
+
+
diff --git a/utests/compiler_saturate.cpp b/utests/compiler_saturate.cpp
index 6880df0..97420db 100644
--- a/utests/compiler_saturate.cpp
+++ b/utests/compiler_saturate.cpp
@@ -2,7 +2,7 @@
 
 namespace {
 
-constexpr int n = 16;
+const int n = 16;
 
 // declaration only, we should create each template specification for each type.
 template<typename T>
diff --git a/utests/compiler_saturate_sub.cpp b/utests/compiler_saturate_sub.cpp
index 1c95e2d..5c57625 100644
--- a/utests/compiler_saturate_sub.cpp
+++ b/utests/compiler_saturate_sub.cpp
@@ -2,7 +2,7 @@
 
 namespace {
 
-constexpr int n = 16;
+const int n = 16;
 
 // declaration only, we should create each template specification for each type.
 template<typename T>
diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
index 807739b..e2dc0d7 100644
--- a/utests/get_cl_info.cpp
+++ b/utests/get_cl_info.cpp
@@ -160,8 +160,8 @@ Info_Result<T>* cast_as(void *info)
 	cl_int ret; \
 	size_t ret_size; \
 	\
-	Info_Result<TYPE>* info = cast_as<TYPE>(x.second); \
-	ret = FUNC (__VA_ARGS__, x.first, \
+	Info_Result<TYPE>* info = cast_as<TYPE>(x->second); \
+	ret = FUNC (__VA_ARGS__, x->first, \
 		info->size, info->get_ret(), &ret_size); \
 	OCL_ASSERT((!ret)); \
 	OCL_ASSERT((info->check_result())); \
@@ -221,8 +221,8 @@ void get_program_info(void)
     maps.insert(make_pair(CL_PROGRAM_BINARIES,
                           (void *)(new Info_Result<char **>(&expect_source, &sz, 1))));
 
-    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
-        switch (x.first) {
+    for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+        switch (x->first) {
         case CL_PROGRAM_REFERENCE_COUNT:
         case CL_PROGRAM_NUM_DEVICES:
             CALL_PROGINFO_AND_RET(cl_uint);
@@ -245,7 +245,7 @@ void get_program_info(void)
         default:
             break;
         }
-    });
+    }
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_program_info);
@@ -296,8 +296,8 @@ void get_queue_info(void)
                           (void *)(new Info_Result<cl_command_queue_properties>(
                                        ((cl_command_queue_properties)prop)))));
 
-    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
-        switch (x.first) {
+    for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+        switch (x->first) {
         case CL_QUEUE_CONTEXT:
             CALL_QUEUEINFO_AND_RET(cl_context);
             break;
@@ -313,7 +313,7 @@ void get_queue_info(void)
         default:
             break;
         }
-    });
+    }
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_queue_info);
@@ -345,8 +345,8 @@ void get_program_build_info(void)
     maps.insert(make_pair(CL_PROGRAM_BUILD_LOG, /* not supported now, just "" */
                           (void *)(new Info_Result<char *>(log, sz))));
 
-    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
-        switch (x.first) {
+    for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+        switch (x->first) {
         case CL_PROGRAM_BUILD_STATUS:
             CALL_PROG_BUILD_INFO_AND_RET(cl_build_status);
             break;
@@ -359,7 +359,7 @@ void get_program_build_info(void)
         default:
             break;
         }
-    });
+    }
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_program_build_info);
@@ -409,8 +409,8 @@ void get_context_info(void)
                           (void *)(new Info_Result<char*>(
                                        (const char*)NULL, 100*sizeof(cl_context_properties)))));
 
-    std::for_each(maps.begin(), maps.end(), [](pair<cl_context_info, void *> x) {
-        switch (x.first) {
+    for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+        switch (x->first) {
         case CL_CONTEXT_NUM_DEVICES:
             CALL_CONTEXTINFO_AND_RET(cl_uint);
             break;
@@ -426,7 +426,7 @@ void get_context_info(void)
         default:
             break;
         }
-    });
+    }
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_context_info);
@@ -469,8 +469,8 @@ void get_kernel_info(void)
     maps.insert(make_pair(CL_KERNEL_FUNCTION_NAME,
                           (void *)(new Info_Result<char*>(expected_name, strlen(expected_name)+1))));
 
-    std::for_each(maps.begin(), maps.end(), [](pair<cl_kernel_info, void *> x) {
-        switch (x.first) {
+    for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+        switch (x->first) {
         case CL_KERNEL_PROGRAM:
             CALL_KERNELINFO_AND_RET(cl_program);
             break;
@@ -489,7 +489,7 @@ void get_kernel_info(void)
         default:
             break;
         }
-    });
+    }
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_kernel_info);
@@ -600,8 +600,8 @@ void get_mem_info(void)
     maps.insert(make_pair(CL_MEM_OFFSET,
                           (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
 
-    std::for_each(maps.begin(), maps.end(), [](pair<cl_mem_info, void *> x) {
-        switch (x.first) {
+    for (map<cl_program_info, void *>::iterator x = maps.begin(); x != maps.end(); ++x) {
+        switch (x->first) {
         case CL_MEM_TYPE:
             CALL_GETMEMINFO_AND_RET(cl_mem_object_type);
             break;
@@ -633,7 +633,7 @@ void get_mem_info(void)
         default:
             break;
         }
-    });
+    }
 
     clEnqueueUnmapMemObject(queue, buf[0], map_ptr, 0, NULL, NULL);
 }
diff --git a/utests/runtime_alloc_host_ptr_buffer.cpp b/utests/runtime_alloc_host_ptr_buffer.cpp
new file mode 100644
index 0000000..793682b
--- /dev/null
+++ b/utests/runtime_alloc_host_ptr_buffer.cpp
@@ -0,0 +1,25 @@
+#include "utest_helper.hpp"
+
+static void runtime_alloc_host_ptr_buffer(void)
+{
+  const size_t n = 4096*100;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("runtime_alloc_host_ptr_buffer");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_ALLOC_HOST_PTR, n * sizeof(uint32_t), NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = n;
+  locals[0] = 256;
+  OCL_NDRANGE(1);
+
+  // Check result
+  uint32_t* mapptr = (uint32_t*)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_READ, 0, n*sizeof(uint32_t), 0, NULL, NULL, NULL);
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(mapptr[i] == i / 2);
+  clEnqueueUnmapMemObject(queue, buf[0], mapptr, 0, NULL, NULL);
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_alloc_host_ptr_buffer);
diff --git a/utests/runtime_climage_from_boname.cpp b/utests/runtime_climage_from_boname.cpp
new file mode 100644
index 0000000..4e7f06a
--- /dev/null
+++ b/utests/runtime_climage_from_boname.cpp
@@ -0,0 +1,212 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+extern "C"
+{
+#include <X11/Xlibint.h>
+#include <X11/Xlib.h>
+#include <xf86drm.h>
+#include <intel_bufmgr.h>
+#include <drm.h>
+#include <drm_sarea.h>
+#include <X11/Xmd.h>
+#include <X11/Xregion.h>
+#include <X11/extensions/Xext.h>
+#include <X11/extensions/extutil.h>
+}
+
+// Part of the following code is copied from beignet/src/x11/
+typedef struct {
+    CARD8   reqType;
+    CARD8   dri2Reqtype;
+    CARD16  length B16;
+    CARD32  window B32;
+    CARD32  magic B32;
+} xDRI2AuthenticateReq;
+#define sz_xDRI2AuthenticateReq   12
+
+typedef struct {
+    BYTE    type;   /* X_Reply */
+    BYTE    pad1;
+    CARD16  sequenceNumber B16;
+    CARD32  length B32;
+    CARD32  authenticated B32;
+    CARD32  pad2 B32;
+    CARD32  pad3 B32;
+    CARD32  pad4 B32;
+    CARD32  pad5 B32;
+    CARD32  pad6 B32;
+} xDRI2AuthenticateReply;
+#define sz_xDRI2AuthenticateReply	32
+
+#define X_DRI2Authenticate		2
+
+static char va_dri2ExtensionName[] = "DRI2";
+static XExtensionInfo _va_dri2_info_data;
+static XExtensionInfo *va_dri2Info = &_va_dri2_info_data;
+static XEXT_GENERATE_CLOSE_DISPLAY (VA_DRI2CloseDisplay, va_dri2Info)
+static /* const */ XExtensionHooks va_dri2ExtensionHooks = {
+    NULL,				/* create_gc */
+    NULL,				/* copy_gc */
+    NULL,				/* flush_gc */
+    NULL,				/* free_gc */
+    NULL,				/* create_font */
+    NULL,				/* free_font */
+    VA_DRI2CloseDisplay,		/* close_display */
+    NULL,				/* wire_to_event */
+    NULL,				/* event_to_wire */
+    NULL,				/* error */
+    NULL,				/* error_string */
+};
+
+static XEXT_GENERATE_FIND_DISPLAY (DRI2FindDisplay, va_dri2Info,
+				   va_dri2ExtensionName,
+				   &va_dri2ExtensionHooks,
+				   0, NULL)
+
+static Bool VA_DRI2Authenticate(Display *dpy, XID window, drm_magic_t magic)
+{
+    XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+    xDRI2AuthenticateReq *req;
+    xDRI2AuthenticateReply rep;
+
+    XextCheckExtension (dpy, info, va_dri2ExtensionName, False);
+
+    LockDisplay(dpy);
+    GetReq(DRI2Authenticate, req);
+    req->reqType = info->codes->major_opcode;
+    req->dri2Reqtype = X_DRI2Authenticate;
+    req->window = window;
+    req->magic = magic;
+
+    if (!_XReply(dpy, (xReply *)&rep, 0, xFalse)) {
+	UnlockDisplay(dpy);
+	SyncHandle();
+	return False;
+    }
+
+    UnlockDisplay(dpy);
+    SyncHandle();
+
+    return rep.authenticated;
+}
+
+
+void runtime_climage_from_boname(void)
+{
+  const int w = 1024;
+  const int h = 256;
+  const int hStart = 128;
+  const int offset = hStart * w;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("runtime_climage_from_boname");
+
+  int fd = open("/dev/dri/card0", O_RDWR);
+  OCL_ASSERT(fd>0);
+
+  drm_magic_t magic;
+  drmGetMagic(fd, &magic);
+
+  Display* dpy = XOpenDisplay(NULL);
+  if (dpy == NULL) {
+    fprintf(stderr, " Can't open Display, skipping.\n");
+    return; 
+  }
+  XID root = RootWindow(dpy, DefaultScreen(dpy));
+
+  Bool auth = VA_DRI2Authenticate(dpy, root, magic);
+  OCL_ASSERT(auth);
+
+  drm_intel_bufmgr* bufmgr = drm_intel_bufmgr_gem_init(fd, 1024);
+  OCL_ASSERT(bufmgr != NULL);
+
+  drm_intel_bo * bo = drm_intel_bo_alloc(bufmgr, "runtime_climage_from_boname", w*h, 0);
+  OCL_ASSERT(bo != NULL);
+
+  drm_intel_bo_map(bo, 0);
+  unsigned char* addr = (unsigned char*)bo->virt;
+  memset(addr, 0xCD, w*h);
+  drm_intel_bo_unmap(bo);
+
+  unsigned int boName = 0;
+  drm_intel_bo_flink(bo, &boName);
+
+  cl_image_format fmt;
+  fmt.image_channel_order = CL_R;
+  fmt.image_channel_data_type = CL_UNORM_INT8;
+
+  cl_libva_image imageParam;
+  imageParam.fmt = fmt;
+  imageParam.bo_name = boName;
+  imageParam.offset = offset;
+  imageParam.width = w;
+  imageParam.height = h - hStart;
+  imageParam.row_pitch = w;
+
+  cl_mem dst = clCreateImageFromLibvaIntel(ctx, &imageParam, NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &dst);
+  globals[0] = w;
+  globals[1] = h-hStart;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  OCL_FINISH();
+
+  drm_intel_bo_map(bo, 0);
+  addr = (unsigned char*)bo->virt;
+  for (int i = 0; i < hStart; ++i) {
+    for (int j = 0; j < w; ++j) {
+      OCL_ASSERT(addr[j+i*w]==0xCD);
+    }
+  }
+  for (int i = hStart; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      OCL_ASSERT(addr[j+i*w]==(unsigned char)(0.34*255+0.5));
+    }
+  }
+  drm_intel_bo_unmap(bo);
+
+
+  // Run the kernel for the second time
+  OCL_SET_ARG(0, sizeof(cl_mem), &dst);
+  globals[0] = w;
+  globals[1] = h-hStart;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  OCL_FINISH();
+
+  drm_intel_bo_map(bo, 0);
+  addr = (unsigned char*)bo->virt;
+  for (int i = 0; i < hStart; ++i) {
+    for (int j = 0; j < w; ++j) {
+      OCL_ASSERT(addr[j+i*w]==0xCD);
+    }
+  }
+  for (int i = hStart; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      OCL_ASSERT(addr[j+i*w]==(unsigned char)(0.34*255+0.5));
+    }
+  }
+  drm_intel_bo_unmap(bo);
+
+  clReleaseMemObject(dst);
+  drm_intel_bo_unreference(bo);
+  drm_intel_bufmgr_destroy(bufmgr);
+  XCloseDisplay(dpy);
+  close(fd);
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_climage_from_boname);
diff --git a/utests/runtime_use_host_ptr_buffer.cpp b/utests/runtime_use_host_ptr_buffer.cpp
index 4603f90..79273c3 100644
--- a/utests/runtime_use_host_ptr_buffer.cpp
+++ b/utests/runtime_use_host_ptr_buffer.cpp
@@ -20,17 +20,11 @@ static void runtime_use_host_ptr_buffer(void)
   OCL_NDRANGE(1);
 
   // Check result
-
-#ifdef HAS_USERPTR
-  OCL_FINISH();
-#else
   void* mapptr = (int*)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_READ, 0, n*sizeof(uint32_t), 0, NULL, NULL, NULL);
   OCL_ASSERT(mapptr == buf_data[0]);
-  clEnqueueUnmapMemObject(queue, buf[0], mapptr, 0, NULL, NULL);
-#endif
-
   for (uint32_t i = 0; i < n; ++i)
     OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i / 2);
+  clEnqueueUnmapMemObject(queue, buf[0], mapptr, 0, NULL, NULL);
 
   free(buf_data[0]);
   buf_data[0] = NULL;
diff --git a/utests/sub_buffer.cpp b/utests/sub_buffer.cpp
index d32fd65..6228034 100644
--- a/utests/sub_buffer.cpp
+++ b/utests/sub_buffer.cpp
@@ -15,6 +15,7 @@ void sub_buffer_check(void)
     error = clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(address_align ), &address_align, NULL );
     OCL_ASSERT(error == CL_SUCCESS);
 
+    max_alloc_size /= 8;
     main_buf_content = (char *)malloc(sizeof(char) * max_alloc_size);
 
     for (cl_ulong i = 0; i < max_alloc_size; i++) {
@@ -25,8 +26,8 @@ void sub_buffer_check(void)
     OCL_ASSERT(error == CL_SUCCESS);
 
     /* Test read sub buffer. */
-    for (cl_ulong sz = 64; sz < max_alloc_size; sz*=4) {
-        for (cl_ulong off = 0; off < max_alloc_size; off += 1234) {
+    for (cl_ulong sz = max_alloc_size / 4; sz <= max_alloc_size; sz += max_alloc_size / 4) {
+        for (cl_ulong off = 0; off < max_alloc_size; off += 1234 + max_alloc_size / 3) {
             cl_buffer_region region;
             region.origin = off;
             region.size = sz;
@@ -71,8 +72,8 @@ void sub_buffer_check(void)
     }
 
 
-    for (cl_ulong sz = 64; sz < max_alloc_size; sz*=4) {
-        for (cl_ulong off = 0; off < max_alloc_size; off += 1234) {
+    for (cl_ulong sz = max_alloc_size / 4; sz <= max_alloc_size; sz += max_alloc_size / 4) {
+        for (cl_ulong off = 0; off < max_alloc_size; off += 1234 + max_alloc_size / 3) {
             cl_buffer_region region;
             region.origin = off;
             region.size = sz;
diff --git a/utests/utest.cpp b/utests/utest.cpp
index b914891..0a03d8b 100644
--- a/utests/utest.cpp
+++ b/utests/utest.cpp
@@ -48,15 +48,14 @@ void runSummaryAtExit(void) {
   // If case crashes, count it as fail, and accumulate finishrun
   if(UTest::retStatistics.finishrun != UTest::utestList->size()) {
     UTest::retStatistics.finishrun++;
-    UTest::retStatistics.failCount++;
+   // UTest::retStatistics.failCount++;
   }
   printf("\nsummary:\n----------\n");
   printf("  total: %zu\n",UTest::utestList->size());
-  printf("  run: %zu\n",UTest::retStatistics.finishrun);
+  printf("  run: %zu\n",UTest::retStatistics.actualrun);
   printf("  pass: %zu\n",UTest::retStatistics.passCount);
   printf("  fail: %zu\n",UTest::retStatistics.failCount);
-  printf("  pass rate: %f\n",1-(float)UTest::retStatistics.failCount/(float)UTest::utestList->size());
-
+  printf("  pass rate: %f\n", (UTest::retStatistics.actualrun)?((float)UTest::retStatistics.passCount/(float)UTest::retStatistics.actualrun):(float)0);
   releaseUTestList();
 }
 
@@ -84,6 +83,7 @@ void signalHandler( int signum )
   }
 
   printf("    Interrupt signal (%s) received.", name);
+  UTest::retStatistics.failCount++;
 
   exit(signum);
 }
@@ -128,7 +128,7 @@ void UTest::do_run(struct UTest utest){
   // Print function name
   printf("%s()", utest.name);
   fflush(stdout);
-
+  retStatistics.actualrun++;
   // Run one case in utestList, print result [SUCCESS] or [FAILED]
   (utest.fn)();
 }
diff --git a/utests/utest.hpp b/utests/utest.hpp
index 0dc611d..b028b64 100644
--- a/utests/utest.hpp
+++ b/utests/utest.hpp
@@ -37,6 +37,7 @@ struct RStatistics
   size_t passCount;
   size_t failCount;
   size_t finishrun;
+  size_t actualrun;
 };
 
 /*! Quick and dirty unit test system with registration */
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index 5da2752..510c41a 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -8,6 +8,91 @@ FLT_MAX_NEGA='-0x1.0p-126f'
 
 paraTypeList={'float':'%e','int':'%d','double':'%lf','uint':'%d','string':'%s'}
 
+Single_Precision_ULPs={'acos' : '4' ,
+                      'acospi' : '5' ,
+                      'asin' : '4' ,
+                      'asinpi' : '5' ,
+                      'atan' : '5' ,
+                      'atan2' : '6' ,
+                      'atanpi' : '5' ,
+                      'atan2pi' : '6' ,
+                      'acosh' : '4' ,
+                      'asinh' : '4' ,
+                      'atanh' : '5' ,
+                      'cbrt' : '2' ,
+                      'ceil' : '-1' ,
+                      'copysign' : '0' ,
+                      'cos' : '4' ,
+                      'cosh' : '4' ,
+                      'cospi' : '4' ,
+                      'erfc' : '16' ,
+                      'erf' : '16' ,
+                      'exp' : '3' ,
+                      'exp2' : '3' ,
+                      'exp10' : '3' ,
+                      'expm1' : '3' ,
+                      'fabs' : '0' ,
+                      'fdim' : '-1' ,
+                      'floor' : '-1' ,
+                      'fma' : '-1' ,
+                      'fmax' : '0' ,
+                      'fmin' : '0' ,
+                      'fmod' : '0' ,
+                      'fract' : '-1' ,
+                      'frexp' : '0' ,
+                      'hypot' : '4' ,
+                      'ilogb' : '0' ,
+                      'ldexp' : '-1' ,
+                      'log' : '3' ,
+                      'log2' : '3' ,
+                      'log10' : '3' ,
+                      'log1p' : '2' ,
+                      'logb' : '0' ,
+                      'maxmag' : '0' ,
+                      'minmag' : '0' ,
+                      'modf' : '0' ,
+                      'nan' : '0' ,
+                      'nextafter' : '0' ,
+                      'pow' : '16' ,
+                      'pown' : '16' ,
+                      'powr' : '16' ,
+                      'remainder' : '0' ,
+                      'remquo' : '0' ,
+                      'rint' : '-1' ,
+                      'rootn' : '16' ,
+                      'round' : '-1' ,
+                      'rsqrt' : '2' ,
+                      'sin' : '4' ,
+                      'sincos' : '4' ,
+                      'sinh' : '4' ,
+                      'sinpi' : '4' ,
+                      'sqrt' : '3' ,
+                      'tan' : '5' ,
+                      'tanh' : '5' ,
+                      'tanpi' : '6' ,
+                      'tgamma' : '16' ,
+                      'trunc' : '-1' ,
+                      'half_cos' : '8192' ,
+                      'half_divide' : '8192' ,
+                      'half_exp' : '8192' ,
+                      'half_exp2' : '8192' ,
+                      'half_exp10' : '8192' ,
+                      'half_log2' : '8192' ,
+                      'half_log10' : '8192' ,
+                      'half_powr' : '8192' ,
+                      'half_recip' : '8192' ,
+                      'half_rsqrt' : '8192' ,
+                      'half_sin' : '8192' ,
+                      'half_sqrt' : '8192' ,
+                      'half_tan' : '8192' }
+
+def Min_ulp(function):
+    if function in Single_Precision_ULPs.keys():
+         ulpValues = Single_Precision_ULPs[function]
+    else:
+         ulpValues = 1
+    return ulpValues
+
 
 def ulpUnit(ulpSize):
   return re.findall(r"([a-zA-Z_]+)",ulpSize)[0]
@@ -15,7 +100,7 @@ def ulpUnit(ulpSize):
 def ulpNum(ulpSize):
   return re.findall(r"([0-9]+)",ulpSize)[0]
 
-def udebug(ulpSize,returnType):
+def udebug(ulpSize,returnType,function):
   #ulpUnit=re.findall(r"([a-zA-Z_]+)",ulpSize)[0]
   #ulpNum=re.findall(r"([0-9]+)",ulpSize)[0]
   text='''
@@ -27,7 +112,7 @@ def udebug(ulpSize,returnType):
     if (env_strict == NULL || strcmp(env_strict, "0") == 0)
       ULPSIZE_FACTOR = 1000;
     else
-      ULPSIZE_FACTOR = 1;
+      ULPSIZE_FACTOR = %s;
     
     if (isinf(cpu_data[index])){
       INFORNAN="INF";
@@ -53,7 +138,7 @@ def udebug(ulpSize,returnType):
       else
         printf_c("%s expect:%s\\n", log, INFORNAN);
       }
-    else if (diff <= ULPSIZE){
+    else if ((ULPSIZE >= 0 && diff <= ULPSIZE) || (ULPSIZE < 0 && diff == 0)){
       printf("%s expect:%s\\n", log, ULPSIZE);
       }
     else
@@ -61,19 +146,22 @@ def udebug(ulpSize,returnType):
 #else
     if (isinf(cpu_data[index])){
       sprintf(log, "%s expect:%s\\n", log, INFORNAN);
-      OCL_ASSERTM(isinf(gpu_data[index]),log);
+      OCL_ASSERTM(isinf(gpu_data[index]) || !env_strict,log);
       }
     else if (isnan(cpu_data[index])){
       sprintf(log, "%s expect:%s\\n", log, INFORNAN);
-      OCL_ASSERTM(isnan(gpu_data[index]),log);
+      OCL_ASSERTM(isnan(gpu_data[index]) || !env_strict,log);
       }
     else{
       sprintf(log, "%s expect:%s\\n", log, ULPSIZE);
-      OCL_ASSERTM(fabs(gpu_data[index]-cpu_data[index]) <= ULPSIZE, log);
+      if (ULPSIZE < 0)
+            OCL_ASSERTM(gpu_data[index] == cpu_data[index], log);
+      else
+            OCL_ASSERTM(fabs(gpu_data[index]-cpu_data[index]) <= ULPSIZE, log);
       }
 #endif
   }
-}\n'''%(returnType,\
+}\n'''%(returnType,Min_ulp(function),\
         ulpUnit(ulpSize),ulpNum(ulpSize),\
         ulpNum(ulpSize), ulpNum(ulpSize),\
         paraTypeList['string'],paraTypeList['string'],\
@@ -317,7 +405,7 @@ static void %s_%s(void)
 
     self.cpplines += funcline
 
-    self.cpplines += [ udebug(self.ulp,self.retType(index)) ]
+    self.cpplines += [ udebug(self.ulp,self.retType(index),self.funcName) ]
     self.cpplines += [ "MAKE_UTEST_FROM_FUNCTION(%s_%s)"%(self.fileName,namesuffix) ]
 
   def genCL(self,index):
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index df0e508..591054e 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -401,7 +401,7 @@ cl_ocl_init(void)
 
 error:
   if (props)
-    delete props;
+    delete[] props;
   return status;
 }
 
@@ -680,3 +680,25 @@ int cl_INT_ULP(int int_number)
 {
   return 0;
 }
+
+int time_subtract(struct timeval *y, struct timeval *x, struct timeval *result)
+{
+  if ( x->tv_sec > y->tv_sec )
+    return   -1;
+
+  if ((x->tv_sec == y->tv_sec) && (x->tv_usec > y->tv_usec))
+    return   -1;
+
+  if ( result != NULL){
+    result->tv_sec = ( y->tv_sec - x->tv_sec );
+    result->tv_usec = ( y->tv_usec - x->tv_usec );
+
+    if (result->tv_usec < 0){
+      result->tv_sec --;
+      result->tv_usec += 1000000;
+    }
+  }
+
+  int msec = 1000.0*(y->tv_sec - x->tv_sec) + (y->tv_usec - x->tv_usec)/1000.0;
+  return msec;
+}
\ No newline at end of file
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 026eb1c..5d8e835 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -230,5 +230,8 @@ extern float cl_FLT_ULP(float float_number);
 /* Calculator ULP of each INT value */
 extern int cl_INT_ULP(int int_number);
 
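+/* Subtract time x from time y: returns y - x in milliseconds (or -1 if x is later than y) and, when result is non-NULL, stores the difference there. */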
+int time_subtract(struct timeval *y, struct timeval *x, struct timeval *result);
+
 #endif /* __UTEST_HELPER_HPP__ */
 
diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
index 75926b6..71a031f 100755
--- a/utests/utest_math_gen.py
+++ b/utests/utest_math_gen.py
@@ -462,7 +462,10 @@ static float pown(float x, int y){
   powr_output_type = ['float','float2','float4','float8','float16']
   powr_cpu_func='''
 static float powr(float x, int y){
-    return powf(x,y);
+    if (x<0)
+        return NAN;
+    else
+        return powf(x,y);
 } '''
   powrUtests = func('powr','powr',[powr_input_type1,powr_input_type2],powr_output_type,[powr_input_values1,powr_input_values2],'16 * FLT_ULP', powr_cpu_func)
   
diff --git a/utests/vload_bench.cpp b/utests/vload_bench.cpp
index 3765996..a7703fc 100644
--- a/utests/vload_bench.cpp
+++ b/utests/vload_bench.cpp
@@ -11,7 +11,7 @@ static double vload_bench(const char *kernelFunc, uint32_t N, uint32_t offset, b
   struct timeval start, end;
 
   // Setup kernel and buffers
-  std::string kernelName = kernelFunc + std::to_string(N);
+  std::string kernelName = kernelFunc + std::to_string((long long unsigned int)N);
   OCL_CALL (cl_kernel_init, "vload_bench.cl", kernelName.c_str(), SOURCE, NULL);
   //OCL_CREATE_KERNEL("compiler_array");
   buf_data[0] = (T*) malloc(sizeof(T) * n);
@@ -42,7 +42,7 @@ static double vload_bench(const char *kernelFunc, uint32_t N, uint32_t offset, b
     OCL_MAP_BUFFER(0);
     OCL_MAP_BUFFER(1);
     for (uint32_t i = 0; i < globals[0]; ++i) {
-      OCL_ASSERT(((T*)buf_data[0])[i + offset] == ((uint32_t*)buf_data[1])[i]);
+      OCL_ASSERT((uint32_t)(((T*)buf_data[0])[i + offset]) == ((uint32_t*)buf_data[1])[i]);
     }
     return 0;
   }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git